diff options
Diffstat (limited to 'fs')
184 files changed, 6992 insertions, 5111 deletions
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 17903b4..031dbe3 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -733,10 +733,7 @@ static void lkb_add_ordered(struct list_head *new, struct list_head *head, if (lkb->lkb_rqmode < mode) break; - if (!lkb) - list_add_tail(new, head); - else - __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue); + __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue); } /* add/remove lkb to rsb's grant/convert/wait queue */ diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 8b6e73c..b627285 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -215,6 +215,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode) if (!ast_type) { kref_get(&lkb->lkb_ref); list_add_tail(&lkb->lkb_astqueue, &proc->asts); + lkb->lkb_ast_first = type; wake_up_interruptible(&proc->wait); } if (type == AST_COMP && (ast_type & AST_COMP)) @@ -223,7 +224,6 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode) eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type); if (eol) { - lkb->lkb_ast_type &= ~AST_BAST; lkb->lkb_flags |= DLM_IFL_ENDOFLIFE; } @@ -706,7 +706,7 @@ static int device_close(struct inode *inode, struct file *file) } static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type, - int bmode, char __user *buf, size_t count) + int mode, char __user *buf, size_t count) { #ifdef CONFIG_COMPAT struct dlm_lock_result32 result32; @@ -733,7 +733,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type, if (type == AST_BAST) { result.user_astaddr = ua->bastaddr; result.user_astparam = ua->bastparam; - result.bast_mode = bmode; + result.bast_mode = mode; } else { result.user_astaddr = ua->castaddr; result.user_astparam = ua->castparam; @@ -801,7 +801,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, struct dlm_user_proc *proc = file->private_data; struct dlm_lkb *lkb; DECLARE_WAITQUEUE(wait, current); - int error, type=0, bmode=0, removed = 0; + int error = 0, removed; + int ret_type, ret_mode; + int bastmode, castmode, do_bast, do_cast; if (count == sizeof(struct dlm_device_version)) { error = copy_version_to_user(buf, count); @@ -820,6 +822,8 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, #endif return -EINVAL; + try_another: + /* do we really need this? can a read happen after a close? */ if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)) return -EINVAL; @@ -855,13 +859,55 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue); - if (lkb->lkb_ast_type & AST_COMP) { - lkb->lkb_ast_type &= ~AST_COMP; - type = AST_COMP; - } else if (lkb->lkb_ast_type & AST_BAST) { - lkb->lkb_ast_type &= ~AST_BAST; - type = AST_BAST; - bmode = lkb->lkb_bastmode; + removed = 0; + ret_type = 0; + ret_mode = 0; + do_bast = lkb->lkb_ast_type & AST_BAST; + do_cast = lkb->lkb_ast_type & AST_COMP; + bastmode = lkb->lkb_bastmode; + castmode = lkb->lkb_castmode; + + /* when both are queued figure out which to do first and + switch first so the other goes in the next read */ + + if (do_cast && do_bast) { + if (lkb->lkb_ast_first == AST_COMP) { + ret_type = AST_COMP; + ret_mode = castmode; + lkb->lkb_ast_type &= ~AST_COMP; + lkb->lkb_ast_first = AST_BAST; + } else { + ret_type = AST_BAST; + ret_mode = bastmode; + lkb->lkb_ast_type &= ~AST_BAST; + lkb->lkb_ast_first = AST_COMP; + } + } else { + ret_type = lkb->lkb_ast_first; + ret_mode = (ret_type == AST_COMP) ? castmode : bastmode; + lkb->lkb_ast_type &= ~ret_type; + lkb->lkb_ast_first = 0; + } + + /* if we're doing a bast but the bast is unnecessary, then + switch to do nothing or do a cast if that was needed next */ + + if ((ret_type == AST_BAST) && + dlm_modes_compat(bastmode, lkb->lkb_castmode_done)) { + ret_type = 0; + ret_mode = 0; + + if (do_cast) { + ret_type = AST_COMP; + ret_mode = castmode; + lkb->lkb_ast_type &= ~AST_COMP; + lkb->lkb_ast_first = 0; + } + } + + if (lkb->lkb_ast_first != lkb->lkb_ast_type) { + log_print("device_read %x ast_first %x ast_type %x", + lkb->lkb_id, lkb->lkb_ast_first, lkb->lkb_ast_type); } if (!lkb->lkb_ast_type) { @@ -870,15 +916,29 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, } spin_unlock(&proc->asts_spin); - error = copy_result_to_user(lkb->lkb_ua, - test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), - type, bmode, buf, count); + if (ret_type) { + error = copy_result_to_user(lkb->lkb_ua, + test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), + ret_type, ret_mode, buf, count); + + if (ret_type == AST_COMP) + lkb->lkb_castmode_done = castmode; + if (ret_type == AST_BAST) + lkb->lkb_bastmode_done = bastmode; + } /* removes reference for the proc->asts lists added by dlm_user_add_ast() and may result in the lkb being freed */ + if (removed) dlm_put_lkb(lkb); + /* the bast that was queued was eliminated (see unnecessary above), + leaving nothing to return */ + + if (!ret_type) + goto try_another; + return error; } @@ -614,9 +614,15 @@ int send_sigurg(struct fown_struct *fown) return ret; } -static DEFINE_RWLOCK(fasync_lock); +static DEFINE_SPINLOCK(fasync_lock); static struct kmem_cache *fasync_cache __read_mostly; +static void fasync_free_rcu(struct rcu_head *head) +{ + kmem_cache_free(fasync_cache, + container_of(head, struct fasync_struct, fa_rcu)); +} + /* * Remove a fasync entry. If successfully removed, return * positive and clear the FASYNC flag. If no entry exists, @@ -625,8 +631,6 @@ static struct kmem_cache *fasync_cache __read_mostly; * NOTE! It is very important that the FASYNC flag always * match the state "is the filp on a fasync list". * - * We always take the 'filp->f_lock', in since fasync_lock - * needs to be irq-safe. */ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) { @@ -634,17 +638,22 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) int result = 0; spin_lock(&filp->f_lock); - write_lock_irq(&fasync_lock); + spin_lock(&fasync_lock); for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { if (fa->fa_file != filp) continue; + + spin_lock_irq(&fa->fa_lock); + fa->fa_file = NULL; + spin_unlock_irq(&fa->fa_lock); + *fp = fa->fa_next; - kmem_cache_free(fasync_cache, fa); + call_rcu(&fa->fa_rcu, fasync_free_rcu); filp->f_flags &= ~FASYNC; result = 1; break; } - write_unlock_irq(&fasync_lock); + spin_unlock(&fasync_lock); spin_unlock(&filp->f_lock); return result; } @@ -666,25 +675,30 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa return -ENOMEM; spin_lock(&filp->f_lock); - write_lock_irq(&fasync_lock); + spin_lock(&fasync_lock); for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { if (fa->fa_file != filp) continue; + + spin_lock_irq(&fa->fa_lock); fa->fa_fd = fd; + spin_unlock_irq(&fa->fa_lock); + kmem_cache_free(fasync_cache, new); goto out; } + spin_lock_init(&new->fa_lock); new->magic = FASYNC_MAGIC; new->fa_file = filp; new->fa_fd = fd; new->fa_next = *fapp; - *fapp = new; + rcu_assign_pointer(*fapp, new); result = 1; filp->f_flags |= FASYNC; out: - write_unlock_irq(&fasync_lock); + spin_unlock(&fasync_lock); spin_unlock(&filp->f_lock); return result; } @@ -704,37 +718,41 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap EXPORT_SYMBOL(fasync_helper); -void __kill_fasync(struct fasync_struct *fa, int sig, int band) +/* + * rcu_read_lock() is held + */ +static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) { while (fa) { - struct fown_struct * fown; + struct fown_struct *fown; if (fa->magic != FASYNC_MAGIC) { printk(KERN_ERR "kill_fasync: bad magic number in " "fasync_struct!\n"); return; } - fown = &fa->fa_file->f_owner; - /* Don't send SIGURG to processes which have not set a - queued signum: SIGURG has its own default signalling - mechanism. */ - if (!(sig == SIGURG && fown->signum == 0)) - send_sigio(fown, fa->fa_fd, band); - fa = fa->fa_next; + spin_lock(&fa->fa_lock); + if (fa->fa_file) { + fown = &fa->fa_file->f_owner; + /* Don't send SIGURG to processes which have not set a + queued signum: SIGURG has its own default signalling + mechanism. */ + if (!(sig == SIGURG && fown->signum == 0)) + send_sigio(fown, fa->fa_fd, band); + } + spin_unlock(&fa->fa_lock); + fa = rcu_dereference(fa->fa_next); } } -EXPORT_SYMBOL(__kill_fasync); - void kill_fasync(struct fasync_struct **fp, int sig, int band) { /* First a quick test without locking: usually * the list is empty. */ if (*fp) { - read_lock(&fasync_lock); - /* reread *fp after obtaining the lock */ - __kill_fasync(*fp, sig, band); - read_unlock(&fasync_lock); + rcu_read_lock(); + kill_fasync_rcu(rcu_dereference(*fp), sig, band); + rcu_read_unlock(); } } EXPORT_SYMBOL(kill_fasync); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 0c1d0b8..a739a0a 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -418,6 +418,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping, static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) { struct buffer_head *dibh; + u64 dsize = i_size_read(&ip->i_inode); void *kaddr; int error; @@ -437,9 +438,10 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) return error; kaddr = kmap_atomic(page, KM_USER0); - memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), - ip->i_disksize); - memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize); + if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode))) + dsize = (dibh->b_size - sizeof(struct gfs2_dinode)); + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); + memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize); kunmap_atomic(kaddr, KM_USER0); flush_dcache_page(page); brelse(dibh); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 5e411d5..4a48c0f 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -71,11 +71,13 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, if (!PageUptodate(page)) { void *kaddr = kmap(page); + u64 dsize = i_size_read(inode); + + if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode))) + dsize = dibh->b_size - sizeof(struct gfs2_dinode); - memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), - ip->i_disksize); - memset(kaddr + ip->i_disksize, 0, - PAGE_CACHE_SIZE - ip->i_disksize); + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); + memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize); kunmap(page); SetPageUptodate(page); @@ -1038,13 +1040,14 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) goto out; if (gfs2_is_stuffed(ip)) { - ip->i_disksize = size; + u64 dsize = size + sizeof(struct gfs2_inode); ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); - gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size); + if (dsize > dibh->b_size) + dsize = dibh->b_size; + gfs2_buffer_clear_tail(dibh, dsize); error = 1; - } else { if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) error = gfs2_block_truncate_page(ip->i_inode.i_mapping); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 25fddc1..8295c5b 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1475,7 +1475,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name) inode = gfs2_inode_lookup(dir->i_sb, be16_to_cpu(dent->de_type), be64_to_cpu(dent->de_inum.no_addr), - be64_to_cpu(dent->de_inum.no_formal_ino), 0); + be64_to_cpu(dent->de_inum.no_formal_ino)); brelse(bh); return inode; } diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index c22c211..dfe237a 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -168,7 +168,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, if (error) goto fail; - inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0); + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto fail; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 454d4b4..ddcdbf4 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -855,6 +855,9 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder * gh->gh_flags = flags; gh->gh_iflags = 0; gh->gh_ip = (unsigned long)__builtin_return_address(0); + if (gh->gh_owner_pid) + put_pid(gh->gh_owner_pid); + gh->gh_owner_pid = get_pid(task_pid(current)); } /** diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 3aac46f..b5d7363 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -439,9 +439,6 @@ struct gfs2_args { struct gfs2_tune { spinlock_t gt_spin; - unsigned int gt_incore_log_blocks; - unsigned int gt_log_flush_secs; - unsigned int gt_logd_secs; unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ @@ -462,6 +459,7 @@ enum { SDF_SHUTDOWN = 2, SDF_NOBARRIERS = 3, SDF_NORECOVERY = 4, + SDF_DEMOTE = 5, }; #define GFS2_FSNAME_LEN 256 @@ -618,6 +616,7 @@ struct gfs2_sbd { unsigned int sd_log_commited_databuf; int sd_log_commited_revoke; + atomic_t sd_log_pinned; unsigned int sd_log_num_buf; unsigned int sd_log_num_revoke; unsigned int sd_log_num_rg; @@ -629,15 +628,17 @@ struct gfs2_sbd { struct list_head sd_log_le_databuf; struct list_head sd_log_le_ordered; + atomic_t sd_log_thresh1; + atomic_t sd_log_thresh2; atomic_t sd_log_blks_free; - struct mutex sd_log_reserve_mutex; + wait_queue_head_t sd_log_waitq; + wait_queue_head_t sd_logd_waitq; u64 sd_log_sequence; unsigned int sd_log_head; unsigned int sd_log_tail; int sd_log_idle; - unsigned long sd_log_flush_time; struct rw_semaphore sd_log_flush_lock; atomic_t sd_log_in_flight; wait_queue_head_t sd_log_flush_wait; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index b1bf269..51d8061 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -158,7 +158,6 @@ void gfs2_set_iop(struct inode *inode) * @sb: The super block * @no_addr: The inode number * @type: The type of the inode - * @skip_freeing: set this not return an inode if it is currently being freed. * * Returns: A VFS inode, or an error */ @@ -166,17 +165,14 @@ void gfs2_set_iop(struct inode *inode) struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, u64 no_addr, - u64 no_formal_ino, int skip_freeing) + u64 no_formal_ino) { struct inode *inode; struct gfs2_inode *ip; struct gfs2_glock *io_gl; int error; - if (skip_freeing) - inode = gfs2_iget_skip(sb, no_addr); - else - inode = gfs2_iget(sb, no_addr); + inode = gfs2_iget(sb, no_addr); ip = GFS2_I(inode); if (!inode) @@ -234,13 +230,100 @@ fail_glock: fail_iopen: gfs2_glock_put(io_gl); fail_put: - ip->i_gl->gl_object = NULL; + if (inode->i_state & I_NEW) + ip->i_gl->gl_object = NULL; gfs2_glock_put(ip->i_gl); fail: - iget_failed(inode); + if (inode->i_state & I_NEW) + iget_failed(inode); + else + iput(inode); return ERR_PTR(error); } +/** + * gfs2_unlinked_inode_lookup - Lookup an unlinked inode for reclamation + * @sb: The super block + * no_addr: The inode number + * @@inode: A pointer to the inode found, if any + * + * Returns: 0 and *inode if no errors occurred. If an error occurs, + * the resulting *inode may or may not be NULL. + */ + +int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr, + struct inode **inode) +{ + struct gfs2_sbd *sdp; + struct gfs2_inode *ip; + struct gfs2_glock *io_gl; + int error; + struct gfs2_holder gh; + + *inode = gfs2_iget_skip(sb, no_addr); + + if (!(*inode)) + return -ENOBUFS; + + if (!((*inode)->i_state & I_NEW)) + return -ENOBUFS; + + ip = GFS2_I(*inode); + sdp = GFS2_SB(*inode); + ip->i_no_formal_ino = -1; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); + if (unlikely(error)) + goto fail; + ip->i_gl->gl_object = ip; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); + if (unlikely(error)) + goto fail_put; + + set_bit(GIF_INVALID, &ip->i_flags); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT, + &ip->i_iopen_gh); + if (unlikely(error)) { + if (error == GLR_TRYFAILED) + error = 0; + goto fail_iopen; + } + ip->i_iopen_gh.gh_gl->gl_object = ip; + gfs2_glock_put(io_gl); + + (*inode)->i_mode = DT2IF(DT_UNKNOWN); + + /* + * We must read the inode in order to work out its type in + * this case. Note that this doesn't happen often as we normally + * know the type beforehand. This code path only occurs during + * unlinked inode recovery (where it is safe to do this glock, + * which is not true in the general case). + */ + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY, + &gh); + if (unlikely(error)) { + if (error == GLR_TRYFAILED) + error = 0; + goto fail_glock; + } + /* Inode is now uptodate */ + gfs2_glock_dq_uninit(&gh); + gfs2_set_iop(*inode); + + return 0; +fail_glock: + gfs2_glock_dq(&ip->i_iopen_gh); +fail_iopen: + gfs2_glock_put(io_gl); +fail_put: + ip->i_gl->gl_object = NULL; + gfs2_glock_put(ip->i_gl); +fail: + return error; +} + static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) { const struct gfs2_dinode *str = buf; @@ -862,7 +945,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, goto fail_gunlock2; inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, - inum.no_formal_ino, 0); + inum.no_formal_ino); if (IS_ERR(inode)) goto fail_gunlock2; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index c341aaf..e161461 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -83,8 +83,9 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip, extern void gfs2_set_iop(struct inode *inode); extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, - u64 no_addr, u64 no_formal_ino, - int skip_freeing); + u64 no_addr, u64 no_formal_ino); +extern int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr, + struct inode **inode); extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); extern int gfs2_inode_refresh(struct gfs2_inode *ip); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index e5bf4b5..b593f0e 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -168,12 +168,11 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl return list_empty(&ai->ai_ail1_list); } -static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags) +static void gfs2_ail1_start(struct gfs2_sbd *sdp) { struct list_head *head; u64 sync_gen; - struct list_head *first; - struct gfs2_ail *first_ai, *ai, *tmp; + struct gfs2_ail *ai; int done = 0; gfs2_log_lock(sdp); @@ -184,21 +183,9 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags) } sync_gen = sdp->sd_ail_sync_gen++; - first = head->prev; - first_ai = list_entry(first, struct gfs2_ail, ai_list); - first_ai->ai_sync_gen = sync_gen; - gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */ - - if (flags & DIO_ALL) - first = NULL; - while(!done) { - if (first && (head->prev != first || - gfs2_ail1_empty_one(sdp, first_ai, 0))) - break; - done = 1; - list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) { + list_for_each_entry_reverse(ai, head, ai_list) { if (ai->ai_sync_gen >= sync_gen) continue; ai->ai_sync_gen = sync_gen; @@ -290,58 +277,57 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) * flush time, so we ensure that we have just enough free blocks at all * times to avoid running out during a log flush. * + * We no longer flush the log here, instead we wake up logd to do that + * for us. To avoid the thundering herd and to ensure that we deal fairly + * with queued waiters, we use an exclusive wait. This means that when we + * get woken with enough journal space to get our reservation, we need to + * wake the next waiter on the list. + * * Returns: errno */ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) { - unsigned int try = 0; unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize); + unsigned wanted = blks + reserved_blks; + DEFINE_WAIT(wait); + int did_wait = 0; + unsigned int free_blocks; if (gfs2_assert_warn(sdp, blks) || gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) return -EINVAL; - - mutex_lock(&sdp->sd_log_reserve_mutex); - gfs2_log_lock(sdp); - while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) { - gfs2_log_unlock(sdp); - gfs2_ail1_empty(sdp, 0); - gfs2_log_flush(sdp, NULL); - - if (try++) - gfs2_ail1_start(sdp, 0); - gfs2_log_lock(sdp); +retry: + free_blocks = atomic_read(&sdp->sd_log_blks_free); + if (unlikely(free_blocks <= wanted)) { + do { + prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait, + TASK_UNINTERRUPTIBLE); + wake_up(&sdp->sd_logd_waitq); + did_wait = 1; + if (atomic_read(&sdp->sd_log_blks_free) <= wanted) + io_schedule(); + free_blocks = atomic_read(&sdp->sd_log_blks_free); + } while(free_blocks <= wanted); + finish_wait(&sdp->sd_log_waitq, &wait); } - atomic_sub(blks, &sdp->sd_log_blks_free); + if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks, + free_blocks - blks) != free_blocks) + goto retry; trace_gfs2_log_blocks(sdp, -blks); - gfs2_log_unlock(sdp); - mutex_unlock(&sdp->sd_log_reserve_mutex); + + /* + * If we waited, then so might others, wake them up _after_ we get + * our share of the log. + */ + if (unlikely(did_wait)) + wake_up(&sdp->sd_log_waitq); down_read(&sdp->sd_log_flush_lock); return 0; } -/** - * gfs2_log_release - Release a given number of log blocks - * @sdp: The GFS2 superblock - * @blks: The number of blocks - * - */ - -void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) -{ - - gfs2_log_lock(sdp); - atomic_add(blks, &sdp->sd_log_blks_free); - trace_gfs2_log_blocks(sdp, blks); - gfs2_assert_withdraw(sdp, - atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); - gfs2_log_unlock(sdp); - up_read(&sdp->sd_log_flush_lock); -} - static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) { struct gfs2_journal_extent *je; @@ -559,11 +545,10 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) ail2_empty(sdp, new_tail); - gfs2_log_lock(sdp); atomic_add(dist, &sdp->sd_log_blks_free); trace_gfs2_log_blocks(sdp, dist); - gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); - gfs2_log_unlock(sdp); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= + sdp->sd_jdesc->jd_blocks); sdp->sd_log_tail = new_tail; } @@ -615,6 +600,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); set_buffer_uptodate(bh); + fs_info(sdp, "barrier sync failed - disabling barriers\n"); set_bit(SDF_NOBARRIERS, &sdp->sd_flags); lock_buffer(bh); skip_barrier: @@ -822,6 +808,13 @@ static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) * @sdp: the filesystem * @tr: the transaction * + * We wake up gfs2_logd if the number of pinned blocks exceed thresh1 + * or the total number of used blocks (pinned blocks plus AIL blocks) + * is greater than thresh2. + * + * At mount time thresh1 is 1/3rd of journal size, thresh2 is 2/3rd of + * journal size. + * * Returns: errno */ @@ -832,10 +825,10 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) up_read(&sdp->sd_log_flush_lock); - gfs2_log_lock(sdp); - if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) - wake_up_process(sdp->sd_logd_process); - gfs2_log_unlock(sdp); + if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) || + ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) > + atomic_read(&sdp->sd_log_thresh2))) + wake_up(&sdp->sd_logd_waitq); } /** @@ -882,13 +875,23 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp) { gfs2_log_flush(sdp, NULL); for (;;) { - gfs2_ail1_start(sdp, DIO_ALL); + gfs2_ail1_start(sdp); if (gfs2_ail1_empty(sdp, DIO_ALL)) break; msleep(10); } } +static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) +{ + return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1)); +} + +static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) +{ + unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free); + return used_blocks >= atomic_read(&sdp->sd_log_thresh2); +} /** * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks @@ -901,28 +904,43 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp) int gfs2_logd(void *data) { struct gfs2_sbd *sdp = data; - unsigned long t; - int need_flush; + unsigned long t = 1; + DEFINE_WAIT(wait); + unsigned preflush; while (!kthread_should_stop()) { - /* Advance the log tail */ - t = sdp->sd_log_flush_time + - gfs2_tune_get(sdp, gt_log_flush_secs) * HZ; + preflush = atomic_read(&sdp->sd_log_pinned); + if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { + gfs2_ail1_empty(sdp, DIO_ALL); + gfs2_log_flush(sdp, NULL); + gfs2_ail1_empty(sdp, DIO_ALL); + } - gfs2_ail1_empty(sdp, DIO_ALL); - gfs2_log_lock(sdp); - need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks); - gfs2_log_unlock(sdp); - if (need_flush || time_after_eq(jiffies, t)) { + if (gfs2_ail_flush_reqd(sdp)) { + gfs2_ail1_start(sdp); + io_schedule(); + gfs2_ail1_empty(sdp, 0); gfs2_log_flush(sdp, NULL); - sdp->sd_log_flush_time = jiffies; + gfs2_ail1_empty(sdp, DIO_ALL); } + wake_up(&sdp->sd_log_waitq); t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; if (freezing(current)) refrigerator(); - schedule_timeout_interruptible(t); + + do { + prepare_to_wait(&sdp->sd_logd_waitq, &wait, + TASK_UNINTERRUPTIBLE); + if (!gfs2_ail_flush_reqd(sdp) && + !gfs2_jrnl_flush_reqd(sdp) && + !kthread_should_stop()) + t = schedule_timeout(t); + } while(t && !gfs2_ail_flush_reqd(sdp) && + !gfs2_jrnl_flush_reqd(sdp) && + !kthread_should_stop()); + finish_wait(&sdp->sd_logd_waitq, &wait); } return 0; diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 7c64510..eb570b4 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -51,7 +51,6 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, unsigned int ssize); int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); -void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); void gfs2_log_incr_head(struct gfs2_sbd *sdp); struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index adc260f..bf33f82 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -54,6 +54,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) if (bd->bd_ail) list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); get_bh(bh); + atomic_inc(&sdp->sd_log_pinned); trace_gfs2_pin(bd, 1); } @@ -94,6 +95,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, trace_gfs2_pin(bd, 0); gfs2_log_unlock(sdp); unlock_buffer(bh); + atomic_dec(&sdp->sd_log_pinned); } diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index a88fadc..fb2a5f9 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -94,7 +94,7 @@ static int __init init_gfs2_fs(void) if (!gfs2_glock_cachep) goto fail; - gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock (aspace)", + gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)", sizeof(struct gfs2_glock) + sizeof(struct address_space), 0, 0, gfs2_init_gl_aspace_once); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 0bb12c8..18176d0 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -34,7 +34,6 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) { - int err; struct buffer_head *bh, *head; int nr_underway = 0; int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ? @@ -86,11 +85,10 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb } while (bh != head); unlock_page(page); - err = 0; if (nr_underway == 0) end_page_writeback(page); - return err; + return 0; } const struct address_space_operations gfs2_meta_aops = { @@ -313,6 +311,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int struct gfs2_bufdata *bd = bh->b_private; if (test_clear_buffer_pinned(bh)) { + atomic_dec(&sdp->sd_log_pinned); list_del_init(&bd->bd_le.le_list); if (meta) { gfs2_assert_warn(sdp, sdp->sd_log_num_buf); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index c1309ed..3593b3a 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -57,8 +57,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt) { spin_lock_init(>->gt_spin); - gt->gt_incore_log_blocks = 1024; - gt->gt_logd_secs = 1; gt->gt_quota_simul_sync = 64; gt->gt_quota_warn_period = 10; gt->gt_quota_scale_num = 1; @@ -101,14 +99,15 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) spin_lock_init(&sdp->sd_trunc_lock); spin_lock_init(&sdp->sd_log_lock); - + atomic_set(&sdp->sd_log_pinned, 0); INIT_LIST_HEAD(&sdp->sd_log_le_buf); INIT_LIST_HEAD(&sdp->sd_log_le_revoke); INIT_LIST_HEAD(&sdp->sd_log_le_rg); INIT_LIST_HEAD(&sdp->sd_log_le_databuf); INIT_LIST_HEAD(&sdp->sd_log_le_ordered); - mutex_init(&sdp->sd_log_reserve_mutex); + init_waitqueue_head(&sdp->sd_log_waitq); + init_waitqueue_head(&sdp->sd_logd_waitq); INIT_LIST_HEAD(&sdp->sd_ail1_list); INIT_LIST_HEAD(&sdp->sd_ail2_list); @@ -487,7 +486,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr, struct dentry *dentry; struct inode *inode; - inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); + inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0); if (IS_ERR(inode)) { fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); return PTR_ERR(inode); @@ -733,6 +732,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) if (sdp->sd_args.ar_spectator) { sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0); atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); + atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5); + atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); } else { if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) { fs_err(sdp, "can't mount journal #%u\n", @@ -770,6 +771,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) goto fail_jinode_gh; } atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); + atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5); + atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); /* Map the extents for this journal's blocks */ map_journal_extents(sdp); @@ -951,8 +954,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo) if (undo) goto fail_quotad; - sdp->sd_log_flush_time = jiffies; - p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); error = IS_ERR(p); if (error) { @@ -1160,7 +1161,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent GFS2_BASIC_BLOCK_SHIFT; sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; - sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit; + sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit; sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum; if (sdp->sd_args.ar_statfs_quantum) { sdp->sd_tune.gt_statfs_slow = 0; @@ -1323,7 +1324,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags, memset(&args, 0, sizeof(args)); args.ar_quota = GFS2_QUOTA_DEFAULT; args.ar_data = GFS2_DATA_DEFAULT; - args.ar_commit = 60; + args.ar_commit = 30; args.ar_statfs_quantum = 30; args.ar_quota_quantum = 60; args.ar_errors = GFS2_ERRORS_DEFAULT; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 6dbcbad..d5f4661 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -637,15 +637,40 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, unsigned blocksize, iblock, pos; struct buffer_head *bh, *dibh; struct page *page; - void *kaddr; - struct gfs2_quota *qp; - s64 value; - int err = -EIO; + void *kaddr, *ptr; + struct gfs2_quota q, *qp; + int err, nbytes; u64 size; if (gfs2_is_stuffed(ip)) gfs2_unstuff_dinode(ip, NULL); - + + memset(&q, 0, sizeof(struct gfs2_quota)); + err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q)); + if (err < 0) + return err; + + err = -EIO; + qp = &q; + qp->qu_value = be64_to_cpu(qp->qu_value); + qp->qu_value += change; + qp->qu_value = cpu_to_be64(qp->qu_value); + qd->qd_qb.qb_value = qp->qu_value; + if (fdq) { + if (fdq->d_fieldmask & FS_DQ_BSOFT) { + qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit); + qd->qd_qb.qb_warn = qp->qu_warn; + } + if (fdq->d_fieldmask & FS_DQ_BHARD) { + qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit); + qd->qd_qb.qb_limit = qp->qu_limit; + } + } + + /* Write the quota into the quota file on disk */ + ptr = qp; + nbytes = sizeof(struct gfs2_quota); +get_a_page: page = grab_cache_page(mapping, index); if (!page) return -ENOMEM; @@ -667,7 +692,12 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, if (!buffer_mapped(bh)) { gfs2_block_map(inode, iblock, bh, 1); if (!buffer_mapped(bh)) - goto unlock; + goto unlock_out; + /* If it's a newly allocated disk block for quota, zero it */ + if (buffer_new(bh)) { + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + } } if (PageUptodate(page)) @@ -677,32 +707,34 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, ll_rw_block(READ_META, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) - goto unlock; + goto unlock_out; } gfs2_trans_add_bh(ip->i_gl, bh, 0); kaddr = kmap_atomic(page, KM_USER0); - qp = kaddr + offset; - value = (s64)be64_to_cpu(qp->qu_value) + change; - qp->qu_value = cpu_to_be64(value); - qd->qd_qb.qb_value = qp->qu_value; - if (fdq) { - if (fdq->d_fieldmask & FS_DQ_BSOFT) { - qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit); - qd->qd_qb.qb_warn = qp->qu_warn; - } - if (fdq->d_fieldmask & FS_DQ_BHARD) { - qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit); - qd->qd_qb.qb_limit = qp->qu_limit; - } - } + if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE) + nbytes = PAGE_CACHE_SIZE - offset; + memcpy(kaddr + offset, ptr, nbytes); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); + unlock_page(page); + page_cache_release(page); + /* If quota straddles page boundary, we need to update the rest of the + * quota at the beginning of the next page */ + if (offset != 0) { /* first page, offset is closer to PAGE_CACHE_SIZE */ + ptr = ptr + nbytes; + nbytes = sizeof(struct gfs2_quota) - nbytes; + offset = 0; + index++; + goto get_a_page; + } + + /* Update the disk inode timestamp and size (if extended) */ err = gfs2_meta_inode_buffer(ip, &dibh); if (err) - goto unlock; + goto out; size = loc + sizeof(struct gfs2_quota); if (size > inode->i_size) { @@ -715,7 +747,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, brelse(dibh); mark_inode_dirty(inode); -unlock: +out: + return err; +unlock_out: unlock_page(page); page_cache_release(page); return err; @@ -779,8 +813,10 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) * rgrp since it won't be allocated during the transaction */ al->al_requested = 1; - /* +1 in the end for block requested above for unstuffing */ - blocks = num_qd * data_blocks + RES_DINODE + num_qd + 1; + /* +3 in the end for unstuffing block, inode size update block + * and another block in case quota straddles page boundary and + * two blocks need to be updated instead of 1 */ + blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; if (nalloc) al->al_requested += nalloc * (data_blocks + ind_blocks); @@ -1418,10 +1454,18 @@ static int gfs2_quota_get_xstate(struct super_block *sb, memset(fqs, 0, sizeof(struct fs_quota_stat)); fqs->qs_version = FS_QSTAT_VERSION; - if (sdp->sd_args.ar_quota == GFS2_QUOTA_ON) - fqs->qs_flags = (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD); - else if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT) - fqs->qs_flags = (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT); + + switch (sdp->sd_args.ar_quota) { + case GFS2_QUOTA_ON: + fqs->qs_flags |= (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD); + /*FALLTHRU*/ + case GFS2_QUOTA_ACCOUNT: + fqs->qs_flags |= (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT); + break; + case GFS2_QUOTA_OFF: + break; + } + if (sdp->sd_quota_inode) { fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr; fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 503b842..8bce73e 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -948,13 +948,13 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes * @rgd: The rgrp * - * Returns: The inode, if one has been found + * Returns: 0 if no error + * The inode, if one has been found, in inode. */ -static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, - u64 skip) +static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, + u64 skip) { - struct inode *inode; u32 goal = 0, block; u64 no_addr; struct gfs2_sbd *sdp = rgd->rd_sbd; @@ -979,14 +979,11 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, if (no_addr == skip) continue; *last_unlinked = no_addr; - inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, - no_addr, -1, 1); - if (!IS_ERR(inode)) - return inode; + return no_addr; } rgd->rd_flags &= ~GFS2_RDF_CHECK; - return NULL; + return 0; } /** @@ -1067,11 +1064,12 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd) * Try to acquire rgrp in way which avoids contending with others. * * Returns: errno + * unlinked: the block address of an unlinked block to be reclaimed */ -static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) +static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, + u64 *last_unlinked) { - struct inode *inode = NULL; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd, *begin = NULL; struct gfs2_alloc *al = ip->i_alloc; @@ -1080,6 +1078,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) int loops = 0; int error, rg_locked; + *unlinked = 0; rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); while (rgd) { @@ -1096,19 +1095,24 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) case 0: if (try_rgrp_fit(rgd, al)) goto out; - if (rgd->rd_flags & GFS2_RDF_CHECK) - inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); + /* If the rg came in already locked, there's no + way we can recover from a failed try_rgrp_unlink + because that would require an iput which can only + happen after the rgrp is unlocked. */ + if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK) + *unlinked = try_rgrp_unlink(rgd, last_unlinked, + ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (inode) - return inode; + if (*unlinked) + return -EAGAIN; /* fall through */ case GLR_TRYFAILED: rgd = recent_rgrp_next(rgd); break; default: - return ERR_PTR(error); + return error; } } @@ -1130,12 +1134,13 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) case 0: if (try_rgrp_fit(rgd, al)) goto out; - if (rgd->rd_flags & GFS2_RDF_CHECK) - inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); + if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK) + *unlinked = try_rgrp_unlink(rgd, last_unlinked, + ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (inode) - return inode; + if (*unlinked) + return -EAGAIN; break; case GLR_TRYFAILED: @@ -1143,7 +1148,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) break; default: - return ERR_PTR(error); + return error; } rgd = gfs2_rgrpd_get_next(rgd); @@ -1152,7 +1157,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) if (rgd == begin) { if (++loops >= 3) - return ERR_PTR(-ENOSPC); + return -ENOSPC; if (!skipped) loops++; flags = 0; @@ -1172,7 +1177,7 @@ out: forward_rgrp_set(sdp, rgd); } - return NULL; + return 0; } /** @@ -1188,7 +1193,7 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) struct gfs2_alloc *al = ip->i_alloc; struct inode *inode; int error = 0; - u64 last_unlinked = NO_BLOCK; + u64 last_unlinked = NO_BLOCK, unlinked; if (gfs2_assert_warn(sdp, al->al_requested)) return -EINVAL; @@ -1204,14 +1209,19 @@ try_again: if (error) return error; - inode = get_local_rgrp(ip, &last_unlinked); - if (inode) { + error = get_local_rgrp(ip, &unlinked, &last_unlinked); + if (error) { if (ip != GFS2_I(sdp->sd_rindex)) gfs2_glock_dq_uninit(&al->al_ri_gh); - if (IS_ERR(inode)) - return PTR_ERR(inode); - iput(inode); + if (error != -EAGAIN) + return error; + error = gfs2_unlinked_inode_lookup(ip->i_inode.i_sb, + unlinked, &inode); + if (inode) + iput(inode); gfs2_log_flush(sdp, NULL); + if (error == GLR_TRYFAILED) + error = 0; goto try_again; } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 50aac60..4d1aad3 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1113,7 +1113,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) int error; spin_lock(>->gt_spin); - args.ar_commit = gt->gt_log_flush_secs; + args.ar_commit = gt->gt_logd_secs; args.ar_quota_quantum = gt->gt_quota_quantum; if (gt->gt_statfs_slow) args.ar_statfs_quantum = 0; @@ -1160,7 +1160,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) else clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); spin_lock(>->gt_spin); - gt->gt_log_flush_secs = args.ar_commit; + gt->gt_logd_secs = args.ar_commit; gt->gt_quota_quantum = args.ar_quota_quantum; if (args.ar_statfs_quantum) { gt->gt_statfs_slow = 0; @@ -1305,8 +1305,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) } if (args->ar_discard) seq_printf(s, ",discard"); - val = sdp->sd_tune.gt_log_flush_secs; - if (val != 60) + val = sdp->sd_tune.gt_logd_secs; + if (val != 30) seq_printf(s, ",commit=%d", val); val = sdp->sd_tune.gt_statfs_quantum; if (val != 30) @@ -1334,7 +1334,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) } if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) seq_printf(s, ",nobarrier"); - + if (test_bit(SDF_DEMOTE, &sdp->sd_flags)) + seq_printf(s, ",demote_interface_used"); return 0; } diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 54fd984..37f5393 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -232,6 +232,8 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len glops = gfs2_glops_list[gltype]; if (glops == NULL) return -EINVAL; + if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags)) + fs_info(sdp, "demote interface used\n"); rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl); if (rv) return rv; @@ -468,8 +470,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ } \ TUNE_ATTR_2(name, name##_store) -TUNE_ATTR(incore_log_blocks, 0); -TUNE_ATTR(log_flush_secs, 0); TUNE_ATTR(quota_warn_period, 0); TUNE_ATTR(quota_quantum, 0); TUNE_ATTR(max_readahead, 0); @@ -481,8 +481,6 @@ TUNE_ATTR(statfs_quantum, 1); TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); static struct attribute *tune_attrs[] = { - &tune_attr_incore_log_blocks.attr, - &tune_attr_log_flush_secs.attr, &tune_attr_quota_warn_period.attr, &tune_attr_quota_quantum.attr, &tune_attr_max_readahead.attr, diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 4ef0e9f..9ec73a8 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -23,6 +23,7 @@ #include "meta_io.h" #include "trans.h" #include "util.h" +#include "trace_gfs2.h" int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, unsigned int revokes) @@ -75,6 +76,23 @@ fail_holder_uninit: return error; } +/** + * gfs2_log_release - Release a given number of log blocks + * @sdp: The GFS2 superblock + * @blks: The number of blocks + * + */ + +static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) +{ + + atomic_add(blks, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, blks); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= + sdp->sd_jdesc->jd_blocks); + up_read(&sdp->sd_log_flush_lock); +} + void gfs2_trans_end(struct gfs2_sbd *sdp) { struct gfs2_trans *tr = current->journal_info; @@ -1205,8 +1205,6 @@ void generic_delete_inode(struct inode *inode) inodes_stat.nr_inodes--; spin_unlock(&inode_lock); - security_inode_delete(inode); - if (op->delete_inode) { void (*delete)(struct inode *) = op->delete_inode; /* Filesystems implementing their own diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c03d4dc..bc2ff59 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1889,7 +1889,7 @@ static struct kmem_cache *get_slab(size_t size) BUG_ON(i >= JBD2_MAX_SLABS); if (unlikely(i < 0)) i = 0; - BUG_ON(jbd2_slab[i] == 0); + BUG_ON(jbd2_slab[i] == NULL); return jbd2_slab[i]; } diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index 3ff50da..55f1dde 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -23,10 +23,9 @@ static int jffs2_garbage_collect_thread(void *); void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c) { - spin_lock(&c->erase_completion_lock); + assert_spin_locked(&c->erase_completion_lock); if (c->gc_task && jffs2_thread_should_wake(c)) send_sig(SIGHUP, c->gc_task, 1); - spin_unlock(&c->erase_completion_lock); } /* This must only ever be called when no GC thread is currently running */ diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index b47679b..6286ad9 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -103,9 +103,10 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, jffs2_erase_failed(c, jeb, bad_offset); } -void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) +int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) { struct jffs2_eraseblock *jeb; + int work_done = 0; mutex_lock(&c->erase_free_sem); @@ -121,6 +122,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) mutex_unlock(&c->erase_free_sem); jffs2_mark_erased_block(c, jeb); + work_done++; if (!--count) { D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n")); goto done; @@ -157,6 +159,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) mutex_unlock(&c->erase_free_sem); done: D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n")); + return work_done; } static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) @@ -165,10 +168,11 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); list_move_tail(&jeb->list, &c->erase_complete_list); + /* Wake the GC thread to mark them clean */ + jffs2_garbage_collect_trigger(c); spin_unlock(&c->erase_completion_lock); mutex_unlock(&c->erase_free_sem); - /* Ensure that kupdated calls us again to mark them clean */ - jffs2_erase_pending_trigger(c); + wake_up(&c->erase_wait); } static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset) @@ -487,9 +491,9 @@ filebad: refile: /* Stick it back on the list from whence it came and come back later */ - jffs2_erase_pending_trigger(c); mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); + jffs2_garbage_collect_trigger(c); list_move(&jeb->list, &c->erase_complete_list); spin_unlock(&c->erase_completion_lock); mutex_unlock(&c->erase_free_sem); diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 3451a81..86e0821 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -313,8 +313,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) case S_IFBLK: case S_IFCHR: /* Read the device numbers from the media */ - if (f->metadata->size != sizeof(jdev.old) && - f->metadata->size != sizeof(jdev.new)) { + if (f->metadata->size != sizeof(jdev.old_id) && + f->metadata->size != sizeof(jdev.new_id)) { printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size); goto error_io; } @@ -325,10 +325,10 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino); goto error; } - if (f->metadata->size == sizeof(jdev.old)) - rdev = old_decode_dev(je16_to_cpu(jdev.old)); + if (f->metadata->size == sizeof(jdev.old_id)) + rdev = old_decode_dev(je16_to_cpu(jdev.old_id)); else - rdev = new_decode_dev(je32_to_cpu(jdev.new)); + rdev = new_decode_dev(je32_to_cpu(jdev.new_id)); case S_IFSOCK: case S_IFIFO: diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c index 3b6f2fa..f5e96bd 100644 --- a/fs/jffs2/gc.c +++ b/fs/jffs2/gc.c @@ -214,6 +214,19 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) return ret; } + /* If there are any blocks which need erasing, erase them now */ + if (!list_empty(&c->erase_complete_list) || + !list_empty(&c->erase_pending_list)) { + spin_unlock(&c->erase_completion_lock); + D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n")); + if (jffs2_erase_pending_blocks(c, 1)) { + mutex_unlock(&c->alloc_sem); + return 0; + } + D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n")); + spin_lock(&c->erase_completion_lock); + } + /* First, work out which block we're garbage-collecting */ jeb = c->gcblock; @@ -222,7 +235,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) if (!jeb) { /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */ - if (!list_empty(&c->erase_pending_list)) { + if (c->nr_erasing_blocks) { spin_unlock(&c->erase_completion_lock); mutex_unlock(&c->alloc_sem); return -EAGAIN; @@ -435,7 +448,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) list_add_tail(&c->gcblock->list, &c->erase_pending_list); c->gcblock = NULL; c->nr_erasing_blocks++; - jffs2_erase_pending_trigger(c); + jffs2_garbage_collect_trigger(c); } spin_unlock(&c->erase_completion_lock); diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h index 507ed6e..a881a42 100644 --- a/fs/jffs2/nodelist.h +++ b/fs/jffs2/nodelist.h @@ -312,11 +312,11 @@ static inline int jffs2_blocks_use_vmalloc(struct jffs2_sb_info *c) static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev) { if (old_valid_dev(rdev)) { - jdev->old = cpu_to_je16(old_encode_dev(rdev)); - return sizeof(jdev->old); + jdev->old_id = cpu_to_je16(old_encode_dev(rdev)); + return sizeof(jdev->old_id); } else { - jdev->new = cpu_to_je32(new_encode_dev(rdev)); - return sizeof(jdev->new); + jdev->new_id = cpu_to_je32(new_encode_dev(rdev)); + return sizeof(jdev->new_id); } } @@ -464,7 +464,7 @@ int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb int jffs2_do_mount_fs(struct jffs2_sb_info *c); /* erase.c */ -void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count); +int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count); void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); #ifdef CONFIG_JFFS2_FS_WRITEBUFFER diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index 191359d..694aa5b 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -116,9 +116,21 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, ret = jffs2_garbage_collect_pass(c); - if (ret == -EAGAIN) - jffs2_erase_pending_blocks(c, 1); - else if (ret) + if (ret == -EAGAIN) { + spin_lock(&c->erase_completion_lock); + if (c->nr_erasing_blocks && + list_empty(&c->erase_pending_list) && + list_empty(&c->erase_complete_list)) { + DECLARE_WAITQUEUE(wait, current); + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&c->erase_wait, &wait); + D1(printk(KERN_DEBUG "%s waiting for erase to complete\n", __func__)); + spin_unlock(&c->erase_completion_lock); + + schedule(); + } else + spin_unlock(&c->erase_completion_lock); + } else if (ret) return ret; cond_resched(); @@ -217,7 +229,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c) ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list); list_move_tail(&ejeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; - jffs2_erase_pending_trigger(c); + jffs2_garbage_collect_trigger(c); D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n", ejeb->offset)); } @@ -469,7 +481,9 @@ struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c, void jffs2_complete_reservation(struct jffs2_sb_info *c) { D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n")); + spin_lock(&c->erase_completion_lock); jffs2_garbage_collect_trigger(c); + spin_unlock(&c->erase_completion_lock); mutex_unlock(&c->alloc_sem); } @@ -611,7 +625,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); list_add_tail(&jeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; - jffs2_erase_pending_trigger(c); + jffs2_garbage_collect_trigger(c); } else { /* Sometimes, however, we leave it elsewhere so it doesn't get immediately reused, and we spread the load a bit. */ @@ -732,6 +746,10 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c) int nr_very_dirty = 0; struct jffs2_eraseblock *jeb; + if (!list_empty(&c->erase_complete_list) || + !list_empty(&c->erase_pending_list)) + return 1; + if (c->unchecked_size) { D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n", c->unchecked_size, c->checked_ino)); diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index a7f03b7..035a767 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -140,8 +140,7 @@ void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c); #endif /* WRITEBUFFER */ -/* erase.c */ -static inline void jffs2_erase_pending_trigger(struct jffs2_sb_info *c) +static inline void jffs2_dirty_trigger(struct jffs2_sb_info *c) { OFNI_BS_2SFFJ(c)->s_dirt = 1; } diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 696686c..46f870d 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -260,7 +260,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) ret = -EIO; goto out; } - jffs2_erase_pending_trigger(c); + spin_lock(&c->erase_completion_lock); + jffs2_garbage_collect_trigger(c); + spin_unlock(&c->erase_completion_lock); } ret = 0; out: diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 9a80e8e..511e2d6 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -63,8 +63,6 @@ static void jffs2_write_super(struct super_block *sb) if (!(sb->s_flags & MS_RDONLY)) { D1(printk(KERN_DEBUG "jffs2_write_super()\n")); - jffs2_garbage_collect_trigger(c); - jffs2_erase_pending_blocks(c, 0); jffs2_flush_wbuf_gc(c, 0); } diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 5ef7bac..07ee154 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -84,7 +84,7 @@ static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino) struct jffs2_inodirty *new; /* Mark the superblock dirty so that kupdated will flush... */ - jffs2_erase_pending_trigger(c); + jffs2_dirty_trigger(c); if (jffs2_wbuf_pending_for_ino(c, ino)) return; @@ -121,7 +121,7 @@ static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c) D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); list_add_tail(&jeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; - jffs2_erase_pending_trigger(c); + jffs2_garbage_collect_trigger(c); } else { /* Sometimes, however, we leave it elsewhere so it doesn't get immediately reused, and we spread the load a bit. */ @@ -152,7 +152,7 @@ static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset)); list_add(&jeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; - jffs2_erase_pending_trigger(c); + jffs2_garbage_collect_trigger(c); } if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) { @@ -543,7 +543,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset)); list_move(&jeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; - jffs2_erase_pending_trigger(c); + jffs2_garbage_collect_trigger(c); } jffs2_dbg_acct_sanity_check_nolock(c, jeb); diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 9e2f6a7..c92ea3b 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -2438,7 +2438,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) /* check if this is a control page update for an allocation. * if so, update the leaf to reflect the new leaf value using - * dbSplit(); otherwise (deallocation), use dbJoin() to udpate + * dbSplit(); otherwise (deallocation), use dbJoin() to update * the leaf with the new value. in addition to updating the * leaf, dbSplit() will also split the binary buddy system of * the leaves, if required, and bubble new values within the @@ -547,6 +547,40 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, } /** + * simple_write_to_buffer - copy data from user space to the buffer + * @to: the buffer to write to + * @available: the size of the buffer + * @ppos: the current position in the buffer + * @from: the user space buffer to read from + * @count: the maximum number of bytes to read + * + * The simple_write_to_buffer() function reads up to @count bytes from the user + * space address starting at @from into the buffer @to at offset @ppos. + * + * On success, the number of bytes written is returned and the offset @ppos is + * advanced by this number, or negative value is returned on error. + **/ +ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, + const void __user *from, size_t count) +{ + loff_t pos = *ppos; + size_t res; + + if (pos < 0) + return -EINVAL; + if (pos >= available || !count) + return 0; + if (count > available - pos) + count = available - pos; + res = copy_from_user(to + pos, from, count); + if (res == count) + return -EFAULT; + count -= res; + *ppos = pos + count; + return count; +} + +/** * memory_read_from_buffer - copy data from the buffer * @to: the kernel space buffer to read to * @count: the maximum number of bytes to read @@ -864,6 +898,7 @@ EXPORT_SYMBOL(simple_statfs); EXPORT_SYMBOL(simple_sync_file); EXPORT_SYMBOL(simple_unlink); EXPORT_SYMBOL(simple_read_from_buffer); +EXPORT_SYMBOL(simple_write_to_buffer); EXPORT_SYMBOL(memory_read_from_buffer); EXPORT_SYMBOL(simple_transaction_set); EXPORT_SYMBOL(simple_transaction_get); diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 2396a85..72d1893 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -12,7 +12,7 @@ * Atomic dir operations * * Directory operations are by default not atomic. Dentries and Inodes are - * created/removed/altered in seperate operations. Therefore we need to do + * created/removed/altered in separate operations. Therefore we need to do * a small amount of journaling. * * Create, link, mkdir, mknod and symlink all share the same function to do diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index 93b55f3..1a9db84 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -707,7 +707,7 @@ static inline gc_level_t expand_level(u64 ino, level_t __level) u8 level = (__force u8)__level; if (ino == LOGFS_INO_MASTER) { - /* ifile has seperate areas */ + /* ifile has separate areas */ level += LOGFS_MAX_LEVELS; } return (__force gc_level_t)level; diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h index f674725..ae96051 100644 --- a/fs/logfs/logfs_abi.h +++ b/fs/logfs/logfs_abi.h @@ -50,9 +50,9 @@ static inline void check_##type(void) \ * 12 - gc recycled blocks, long-lived data * 13 - replacement blocks, short-lived data * - * Levels 1-11 are necessary for robust gc operations and help seperate + * Levels 1-11 are necessary for robust gc operations and help separate * short-lived metadata from longer-lived file data. In the future, - * file data should get seperated into several segments based on simple + * file data should get separated into several segments based on simple * heuristics. Old data recycled during gc operation is expected to be * long-lived. New data is of uncertain life expectancy. New data * used to replace older blocks in existing files is expected to be @@ -117,7 +117,7 @@ static inline void check_##type(void) \ #define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED) /* - * LogFS needs to seperate data into levels. Each level is defined as the + * LogFS needs to separate data into levels. Each level is defined as the * maximal possible distance from the master inode (inode of the inode file). * Data blocks reside on level 0, 1x indirect block on level 1, etc. * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11. @@ -204,7 +204,7 @@ SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE); * @ds_crc: crc32 of structure starting with the next field * @ds_ifile_levels: maximum number of levels for ifile * @ds_iblock_levels: maximum number of levels for regular files - * @ds_data_levels: number of seperate levels for data + * @ds_data_levels: number of separate levels for data * @pad0: reserved, must be 0 * @ds_feature_incompat: incompatible filesystem features * @ds_feature_ro_compat: read-only compatible filesystem features @@ -456,7 +456,7 @@ enum logfs_vim { * @vim: life expectancy of data * * "Areas" are segments currently being used for writing. There is at least - * one area per GC level. Several may be used to seperate long-living from + * one area per GC level. Several may be used to separate long-living from * short-living data. If an area with unknown vim is encountered, it can * simply be closed. * The write buffer immediately follow this header. diff --git a/fs/namespace.c b/fs/namespace.c index f20cb57..88058de 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -628,7 +628,6 @@ repeat: mnt->mnt_pinned = 0; spin_unlock(&vfsmount_lock); acct_auto_close_mnt(mnt); - security_sb_umount_close(mnt); goto repeat; } } @@ -1117,8 +1116,6 @@ static int do_umount(struct vfsmount *mnt, int flags) retval = 0; } spin_unlock(&vfsmount_lock); - if (retval) - security_sb_umount_busy(mnt); up_write(&namespace_sem); release_mounts(&umount_list); return retval; @@ -1435,17 +1432,10 @@ static int graft_tree(struct vfsmount *mnt, struct path *path) if (cant_mount(path->dentry)) goto out_unlock; - err = security_sb_check_sb(mnt, path); - if (err) - goto out_unlock; - - err = -ENOENT; if (!d_unlinked(path->dentry)) err = attach_recursive_mnt(mnt, path, NULL); out_unlock: mutex_unlock(&path->dentry->d_inode->i_mutex); - if (!err) - security_sb_post_addmount(mnt, path); return err; } @@ -1581,8 +1571,6 @@ static int do_remount(struct path *path, int flags, int mnt_flags, } up_write(&sb->s_umount); if (!err) { - security_sb_post_remount(path->mnt, flags, data); - spin_lock(&vfsmount_lock); touch_mnt_namespace(path->mnt->mnt_ns); spin_unlock(&vfsmount_lock); @@ -2277,7 +2265,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, touch_mnt_namespace(current->nsproxy->mnt_ns); spin_unlock(&vfsmount_lock); chroot_fs_refs(&root, &new); - security_sb_post_pivotroot(&root, &new); error = 0; path_put(&root_parent); path_put(&parent_path); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index acc9c49..7ec9b34 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -934,7 +934,6 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str } fsinfo.fattr = fattr; - nfs_fattr_init(fattr); error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); if (error < 0) goto out_error; @@ -1047,13 +1046,18 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, struct nfs_fh *mntfh) { struct nfs_server *server; - struct nfs_fattr fattr; + struct nfs_fattr *fattr; int error; server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); + error = -ENOMEM; + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto error; + /* Get a client representation */ error = nfs_init_server(server, data); if (error < 0) @@ -1064,7 +1068,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); /* Probe the root fh to retrieve its FSID */ - error = nfs_probe_fsinfo(server, mntfh, &fattr); + error = nfs_probe_fsinfo(server, mntfh, fattr); if (error < 0) goto error; if (server->nfs_client->rpc_ops->version == 3) { @@ -1077,14 +1081,14 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, server->namelen = NFS2_MAXNAMLEN; } - if (!(fattr.valid & NFS_ATTR_FATTR)) { - error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); + if (!(fattr->valid & NFS_ATTR_FATTR)) { + error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); if (error < 0) { dprintk("nfs_create_server: getattr error = %d\n", -error); goto error; } } - memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid)); + memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); dprintk("Server FSID: %llx:%llx\n", (unsigned long long) server->fsid.major, @@ -1096,9 +1100,11 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, spin_unlock(&nfs_client_lock); server->mount_time = jiffies; + nfs_free_fattr(fattr); return server; error: + nfs_free_fattr(fattr); nfs_free_server(server); return ERR_PTR(error); } @@ -1340,7 +1346,7 @@ error: struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, struct nfs_fh *mntfh) { - struct nfs_fattr fattr; + struct nfs_fattr *fattr; struct nfs_server *server; int error; @@ -1350,6 +1356,11 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, if (!server) return ERR_PTR(-ENOMEM); + error = -ENOMEM; + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto error; + /* set up the general RPC client */ error = nfs4_init_server(server, data); if (error < 0) @@ -1364,7 +1375,7 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, goto error; /* Probe the root fh to retrieve its FSID */ - error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path); + error = nfs4_get_rootfh(server, mntfh); if (error < 0) goto error; @@ -1375,7 +1386,7 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, nfs4_session_set_rwsize(server); - error = nfs_probe_fsinfo(server, mntfh, &fattr); + error = nfs_probe_fsinfo(server, mntfh, fattr); if (error < 0) goto error; @@ -1389,9 +1400,11 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, server->mount_time = jiffies; dprintk("<-- nfs4_create_server() = %p\n", server); + nfs_free_fattr(fattr); return server; error: + nfs_free_fattr(fattr); nfs_free_server(server); dprintk("<-- nfs4_create_server() = error %d\n", error); return ERR_PTR(error); @@ -1405,7 +1418,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, { struct nfs_client *parent_client; struct nfs_server *server, *parent_server; - struct nfs_fattr fattr; + struct nfs_fattr *fattr; int error; dprintk("--> nfs4_create_referral_server()\n"); @@ -1414,6 +1427,11 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, if (!server) return ERR_PTR(-ENOMEM); + error = -ENOMEM; + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto error; + parent_server = NFS_SB(data->sb); parent_client = parent_server->nfs_client; @@ -1443,12 +1461,12 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); /* Probe the root fh to retrieve its FSID and filehandle */ - error = nfs4_path_walk(server, mntfh, data->mnt_path); + error = nfs4_get_rootfh(server, mntfh); if (error < 0) goto error; /* probe the filesystem info for this server filesystem */ - error = nfs_probe_fsinfo(server, mntfh, &fattr); + error = nfs_probe_fsinfo(server, mntfh, fattr); if (error < 0) goto error; @@ -1466,10 +1484,12 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, server->mount_time = jiffies; + nfs_free_fattr(fattr); dprintk("<-- nfs_create_referral_server() = %p\n", server); return server; error: + nfs_free_fattr(fattr); nfs_free_server(server); dprintk("<-- nfs4_create_referral_server() = error %d\n", error); return ERR_PTR(error); @@ -1485,7 +1505,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, struct nfs_fattr *fattr) { struct nfs_server *server; - struct nfs_fattr fattr_fsinfo; + struct nfs_fattr *fattr_fsinfo; int error; dprintk("--> nfs_clone_server(,%llx:%llx,)\n", @@ -1496,6 +1516,11 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, if (!server) return ERR_PTR(-ENOMEM); + error = -ENOMEM; + fattr_fsinfo = nfs_alloc_fattr(); + if (fattr_fsinfo == NULL) + goto out_free_server; + /* Copy data from the source */ server->nfs_client = source->nfs_client; atomic_inc(&server->nfs_client->cl_count); @@ -1512,7 +1537,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, nfs_init_server_aclclient(server); /* probe the filesystem info for this server filesystem */ - error = nfs_probe_fsinfo(server, fh, &fattr_fsinfo); + error = nfs_probe_fsinfo(server, fh, fattr_fsinfo); if (error < 0) goto out_free_server; @@ -1534,10 +1559,12 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, server->mount_time = jiffies; + nfs_free_fattr(fattr_fsinfo); dprintk("<-- nfs_clone_server() = %p\n", server); return server; out_free_server: + nfs_free_fattr(fattr_fsinfo); nfs_free_server(server); dprintk("<-- nfs_clone_server() = error %d\n", error); return ERR_PTR(error); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index ea61d26..3016345 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -213,7 +213,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct struct nfs_delegation *freeme = NULL; int status = 0; - delegation = kmalloc(sizeof(*delegation), GFP_KERNEL); + delegation = kmalloc(sizeof(*delegation), GFP_NOFS); if (delegation == NULL) return -ENOMEM; memcpy(delegation->stateid.data, res->delegation.data, diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a7bb5c6..ee9a179 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -530,9 +530,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) nfs_readdir_descriptor_t my_desc, *desc = &my_desc; struct nfs_entry my_entry; - struct nfs_fh fh; - struct nfs_fattr fattr; - long res; + int res = -ENOMEM; dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", dentry->d_parent->d_name.name, dentry->d_name.name, @@ -554,9 +552,11 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) my_entry.cookie = my_entry.prev_cookie = 0; my_entry.eof = 0; - my_entry.fh = &fh; - my_entry.fattr = &fattr; - nfs_fattr_init(&fattr); + my_entry.fh = nfs_alloc_fhandle(); + my_entry.fattr = nfs_alloc_fattr(); + if (my_entry.fh == NULL || my_entry.fattr == NULL) + goto out_alloc_failed; + desc->entry = &my_entry; nfs_block_sillyrename(dentry); @@ -598,7 +598,10 @@ out: nfs_unblock_sillyrename(dentry); if (res > 0) res = 0; - dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n", +out_alloc_failed: + nfs_free_fattr(my_entry.fattr); + nfs_free_fhandle(my_entry.fh); + dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n", dentry->d_parent->d_name.name, dentry->d_name.name, res); return res; @@ -776,9 +779,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) struct inode *dir; struct inode *inode; struct dentry *parent; + struct nfs_fh *fhandle = NULL; + struct nfs_fattr *fattr = NULL; int error; - struct nfs_fh fhandle; - struct nfs_fattr fattr; parent = dget_parent(dentry); dir = parent->d_inode; @@ -811,14 +814,22 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) if (NFS_STALE(inode)) goto out_bad; - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); + error = -ENOMEM; + fhandle = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + if (fhandle == NULL || fattr == NULL) + goto out_error; + + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); if (error) goto out_bad; - if (nfs_compare_fh(NFS_FH(inode), &fhandle)) + if (nfs_compare_fh(NFS_FH(inode), fhandle)) goto out_bad; - if ((error = nfs_refresh_inode(inode, &fattr)) != 0) + if ((error = nfs_refresh_inode(inode, fattr)) != 0) goto out_bad; + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); out_set_verifier: nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_valid: @@ -842,11 +853,21 @@ out_zap_parent: shrink_dcache_parent(dentry); } d_drop(dentry); + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); dput(parent); dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", __func__, dentry->d_parent->d_name.name, dentry->d_name.name); return 0; +out_error: + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + dput(parent); + dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n", + __func__, dentry->d_parent->d_name.name, + dentry->d_name.name, error); + return error; } /* @@ -911,9 +932,9 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru struct dentry *res; struct dentry *parent; struct inode *inode = NULL; + struct nfs_fh *fhandle = NULL; + struct nfs_fattr *fattr = NULL; int error; - struct nfs_fh fhandle; - struct nfs_fattr fattr; dfprintk(VFS, "NFS: lookup(%s/%s)\n", dentry->d_parent->d_name.name, dentry->d_name.name); @@ -923,7 +944,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru if (dentry->d_name.len > NFS_SERVER(dir)->namelen) goto out; - res = ERR_PTR(-ENOMEM); dentry->d_op = NFS_PROTO(dir)->dentry_ops; /* @@ -936,17 +956,23 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru goto out; } + res = ERR_PTR(-ENOMEM); + fhandle = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + if (fhandle == NULL || fattr == NULL) + goto out; + parent = dentry->d_parent; /* Protect against concurrent sillydeletes */ nfs_block_sillyrename(parent); - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); if (error == -ENOENT) goto no_entry; if (error < 0) { res = ERR_PTR(error); goto out_unblock_sillyrename; } - inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); + inode = nfs_fhget(dentry->d_sb, fhandle, fattr); res = (struct dentry *)inode; if (IS_ERR(res)) goto out_unblock_sillyrename; @@ -962,6 +988,8 @@ no_entry: out_unblock_sillyrename: nfs_unblock_sillyrename(parent); out: + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); return res; } @@ -1669,28 +1697,33 @@ static void nfs_access_free_entry(struct nfs_access_entry *entry) smp_mb__after_atomic_dec(); } +static void nfs_access_free_list(struct list_head *head) +{ + struct nfs_access_entry *cache; + + while (!list_empty(head)) { + cache = list_entry(head->next, struct nfs_access_entry, lru); + list_del(&cache->lru); + nfs_access_free_entry(cache); + } +} + int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) { LIST_HEAD(head); struct nfs_inode *nfsi; struct nfs_access_entry *cache; -restart: + if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) + return (nr_to_scan == 0) ? 0 : -1; + spin_lock(&nfs_access_lru_lock); list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { - struct rw_semaphore *s_umount; struct inode *inode; if (nr_to_scan-- == 0) break; - s_umount = &nfsi->vfs_inode.i_sb->s_umount; - if (!down_read_trylock(s_umount)) - continue; - inode = igrab(&nfsi->vfs_inode); - if (inode == NULL) { - up_read(s_umount); - continue; - } + inode = &nfsi->vfs_inode; spin_lock(&inode->i_lock); if (list_empty(&nfsi->access_cache_entry_lru)) goto remove_lru_entry; @@ -1704,61 +1737,47 @@ restart: else { remove_lru_entry: list_del_init(&nfsi->access_cache_inode_lru); + smp_mb__before_clear_bit(); clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); + smp_mb__after_clear_bit(); } - spin_unlock(&inode->i_lock); - spin_unlock(&nfs_access_lru_lock); - iput(inode); - up_read(s_umount); - goto restart; } spin_unlock(&nfs_access_lru_lock); - while (!list_empty(&head)) { - cache = list_entry(head.next, struct nfs_access_entry, lru); - list_del(&cache->lru); - nfs_access_free_entry(cache); - } + nfs_access_free_list(&head); return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; } -static void __nfs_access_zap_cache(struct inode *inode) +static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head) { - struct nfs_inode *nfsi = NFS_I(inode); struct rb_root *root_node = &nfsi->access_cache; - struct rb_node *n, *dispose = NULL; + struct rb_node *n; struct nfs_access_entry *entry; /* Unhook entries from the cache */ while ((n = rb_first(root_node)) != NULL) { entry = rb_entry(n, struct nfs_access_entry, rb_node); rb_erase(n, root_node); - list_del(&entry->lru); - n->rb_left = dispose; - dispose = n; + list_move(&entry->lru, head); } nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; - spin_unlock(&inode->i_lock); - - /* Now kill them all! */ - while (dispose != NULL) { - n = dispose; - dispose = n->rb_left; - nfs_access_free_entry(rb_entry(n, struct nfs_access_entry, rb_node)); - } } void nfs_access_zap_cache(struct inode *inode) { + LIST_HEAD(head); + + if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0) + return; /* Remove from global LRU init */ - if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { - spin_lock(&nfs_access_lru_lock); + spin_lock(&nfs_access_lru_lock); + if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) list_del_init(&NFS_I(inode)->access_cache_inode_lru); - spin_unlock(&nfs_access_lru_lock); - } spin_lock(&inode->i_lock); - /* This will release the spinlock */ - __nfs_access_zap_cache(inode); + __nfs_access_zap_cache(NFS_I(inode), &head); + spin_unlock(&inode->i_lock); + spin_unlock(&nfs_access_lru_lock); + nfs_access_free_list(&head); } static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred) @@ -1809,8 +1828,8 @@ out_stale: nfs_access_free_entry(cache); return -ENOENT; out_zap: - /* This will release the spinlock */ - __nfs_access_zap_cache(inode); + spin_unlock(&inode->i_lock); + nfs_access_zap_cache(inode); return -ENOENT; } @@ -1865,9 +1884,11 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s smp_mb__after_atomic_inc(); /* Add inode to global LRU list */ - if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { + if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { spin_lock(&nfs_access_lru_lock); - list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list); + if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) + list_add_tail(&NFS_I(inode)->access_cache_inode_lru, + &nfs_access_lru_list); spin_unlock(&nfs_access_lru_lock); } } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 8d965bd..cac96bc 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -161,14 +161,17 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp) struct nfs_server *server = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); - if (server->flags & NFS_MOUNT_NOAC) - goto force_reval; + if (nfs_have_delegated_attributes(inode)) + goto out_noreval; + if (filp->f_flags & O_DIRECT) goto force_reval; - if (nfsi->npages != 0) - return 0; - if (!(nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) && !nfs_attribute_timeout(inode)) - return 0; + if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) + goto force_reval; + if (nfs_attribute_timeout(inode)) + goto force_reval; +out_noreval: + return 0; force_reval: return __nfs_revalidate_inode(server, inode); } diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index a6b16ed..ce153a6 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -467,7 +467,8 @@ int __nfs_readpages_from_fscache(struct nfs_open_context *ctx, struct list_head *pages, unsigned *nr_pages) { - int ret, npages = *nr_pages; + unsigned npages = *nr_pages; + int ret; dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", NFS_I(inode)->fscache, npages, inode); diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index b35d2a6..7428f7d 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -78,159 +78,94 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh) { struct nfs_server *server = NFS_SB(sb); struct nfs_fsinfo fsinfo; - struct nfs_fattr fattr; - struct dentry *mntroot; + struct dentry *ret; struct inode *inode; int error; /* get the actual root for this mount */ - fsinfo.fattr = &fattr; + fsinfo.fattr = nfs_alloc_fattr(); + if (fsinfo.fattr == NULL) + return ERR_PTR(-ENOMEM); error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); if (error < 0) { dprintk("nfs_get_root: getattr error = %d\n", -error); - return ERR_PTR(error); + ret = ERR_PTR(error); + goto out; } inode = nfs_fhget(sb, mntfh, fsinfo.fattr); if (IS_ERR(inode)) { dprintk("nfs_get_root: get root inode failed\n"); - return ERR_CAST(inode); + ret = ERR_CAST(inode); + goto out; } error = nfs_superblock_set_dummy_root(sb, inode); - if (error != 0) - return ERR_PTR(error); + if (error != 0) { + ret = ERR_PTR(error); + goto out; + } /* root dentries normally start off anonymous and get spliced in later * if the dentry tree reaches them; however if the dentry already * exists, we'll pick it up at this point and use it as the root */ - mntroot = d_obtain_alias(inode); - if (IS_ERR(mntroot)) { + ret = d_obtain_alias(inode); + if (IS_ERR(ret)) { dprintk("nfs_get_root: get root dentry failed\n"); - return mntroot; + goto out; } - security_d_instantiate(mntroot, inode); - - if (!mntroot->d_op) - mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops; + security_d_instantiate(ret, inode); - return mntroot; + if (ret->d_op == NULL) + ret->d_op = server->nfs_client->rpc_ops->dentry_ops; +out: + nfs_free_fattr(fsinfo.fattr); + return ret; } #ifdef CONFIG_NFS_V4 -/* - * Do a simple pathwalk from the root FH of the server to the nominated target - * of the mountpoint - * - give error on symlinks - * - give error on ".." occurring in the path - * - follow traversals - */ -int nfs4_path_walk(struct nfs_server *server, - struct nfs_fh *mntfh, - const char *path) +int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh) { struct nfs_fsinfo fsinfo; - struct nfs_fattr fattr; - struct nfs_fh lastfh; - struct qstr name; - int ret; + int ret = -ENOMEM; - dprintk("--> nfs4_path_walk(,,%s)\n", path); + dprintk("--> nfs4_get_rootfh()\n"); - fsinfo.fattr = &fattr; - nfs_fattr_init(&fattr); - - /* Eat leading slashes */ - while (*path == '/') - path++; + fsinfo.fattr = nfs_alloc_fattr(); + if (fsinfo.fattr == NULL) + goto out; /* Start by getting the root filehandle from the server */ ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); if (ret < 0) { - dprintk("nfs4_get_root: getroot error = %d\n", -ret); - return ret; + dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret); + goto out; } - if (!S_ISDIR(fattr.mode)) { - printk(KERN_ERR "nfs4_get_root:" + if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_MODE) + || !S_ISDIR(fsinfo.fattr->mode)) { + printk(KERN_ERR "nfs4_get_rootfh:" " getroot encountered non-directory\n"); - return -ENOTDIR; + ret = -ENOTDIR; + goto out; } - /* FIXME: It is quite valid for the server to return a referral here */ - if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) { - printk(KERN_ERR "nfs4_get_root:" + if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { + printk(KERN_ERR "nfs4_get_rootfh:" " getroot obtained referral\n"); - return -EREMOTE; - } - -next_component: - dprintk("Next: %s\n", path); - - /* extract the next bit of the path */ - if (!*path) - goto path_walk_complete; - - name.name = path; - while (*path && *path != '/') - path++; - name.len = path - (const char *) name.name; - - if (name.len > NFS4_MAXNAMLEN) - return -ENAMETOOLONG; - -eat_dot_dir: - while (*path == '/') - path++; - - if (path[0] == '.' && (path[1] == '/' || !path[1])) { - path += 2; - goto eat_dot_dir; - } - - /* FIXME: Why shouldn't the user be able to use ".." in the path? */ - if (path[0] == '.' && path[1] == '.' && (path[2] == '/' || !path[2]) - ) { - printk(KERN_ERR "nfs4_get_root:" - " Mount path contains reference to \"..\"\n"); - return -EINVAL; + ret = -EREMOTE; + goto out; } - /* lookup the next FH in the sequence */ - memcpy(&lastfh, mntfh, sizeof(lastfh)); - - dprintk("LookupFH: %*.*s [%s]\n", name.len, name.len, name.name, path); - - ret = server->nfs_client->rpc_ops->lookupfh(server, &lastfh, &name, - mntfh, &fattr); - if (ret < 0) { - dprintk("nfs4_get_root: getroot error = %d\n", -ret); - return ret; - } - - if (!S_ISDIR(fattr.mode)) { - printk(KERN_ERR "nfs4_get_root:" - " lookupfh encountered non-directory\n"); - return -ENOTDIR; - } - - /* FIXME: Referrals are quite valid here too */ - if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) { - printk(KERN_ERR "nfs4_get_root:" - " lookupfh obtained referral\n"); - return -EREMOTE; - } - - goto next_component; - -path_walk_complete: - memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid)); - dprintk("<-- nfs4_path_walk() = 0\n"); - return 0; + memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); +out: + nfs_free_fattr(fsinfo.fattr); + dprintk("<-- nfs4_get_rootfh() = %d\n", ret); + return ret; } /* @@ -239,8 +174,8 @@ path_walk_complete: struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) { struct nfs_server *server = NFS_SB(sb); - struct nfs_fattr fattr; - struct dentry *mntroot; + struct nfs_fattr *fattr = NULL; + struct dentry *ret; struct inode *inode; int error; @@ -254,40 +189,50 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) return ERR_PTR(error); } + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + return ERR_PTR(-ENOMEM);; + /* get the actual root for this mount */ - error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); + error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); if (error < 0) { dprintk("nfs_get_root: getattr error = %d\n", -error); - return ERR_PTR(error); + ret = ERR_PTR(error); + goto out; } - inode = nfs_fhget(sb, mntfh, &fattr); + inode = nfs_fhget(sb, mntfh, fattr); if (IS_ERR(inode)) { dprintk("nfs_get_root: get root inode failed\n"); - return ERR_CAST(inode); + ret = ERR_CAST(inode); + goto out; } error = nfs_superblock_set_dummy_root(sb, inode); - if (error != 0) - return ERR_PTR(error); + if (error != 0) { + ret = ERR_PTR(error); + goto out; + } /* root dentries normally start off anonymous and get spliced in later * if the dentry tree reaches them; however if the dentry already * exists, we'll pick it up at this point and use it as the root */ - mntroot = d_obtain_alias(inode); - if (IS_ERR(mntroot)) { + ret = d_obtain_alias(inode); + if (IS_ERR(ret)) { dprintk("nfs_get_root: get root dentry failed\n"); - return mntroot; + goto out; } - security_d_instantiate(mntroot, inode); + security_d_instantiate(ret, inode); - if (!mntroot->d_op) - mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops; + if (ret->d_op == NULL) + ret->d_op = server->nfs_client->rpc_ops->dentry_ops; +out: + nfs_free_fattr(fattr); dprintk("<-- nfs4_get_root()\n"); - return mntroot; + return ret; } #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 50a56ed..099b351 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -393,8 +393,8 @@ int nfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; - struct nfs_fattr fattr; - int error; + struct nfs_fattr *fattr; + int error = -ENOMEM; nfs_inc_stats(inode, NFSIOS_VFSSETATTR); @@ -417,14 +417,20 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) filemap_write_and_wait(inode->i_mapping); nfs_wb_all(inode); } + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out; /* * Return any delegations if we're going to change ACLs */ if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) nfs_inode_return_delegation(inode); - error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); + error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); if (error == 0) - nfs_refresh_inode(inode, &fattr); + nfs_refresh_inode(inode, fattr); + nfs_free_fattr(fattr); +out: return error; } @@ -682,7 +688,7 @@ int __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { int status = -ESTALE; - struct nfs_fattr fattr; + struct nfs_fattr *fattr = NULL; struct nfs_inode *nfsi = NFS_I(inode); dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", @@ -693,8 +699,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) if (NFS_STALE(inode)) goto out; + status = -ENOMEM; + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out; + nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); - status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); + status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr); if (status != 0) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", inode->i_sb->s_id, @@ -707,7 +718,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) goto out; } - status = nfs_refresh_inode(inode, &fattr); + status = nfs_refresh_inode(inode, fattr); if (status) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", inode->i_sb->s_id, @@ -723,6 +734,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) (long long)NFS_FILEID(inode)); out: + nfs_free_fattr(fattr); return status; } @@ -730,9 +742,14 @@ int nfs_attribute_timeout(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); + return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); +} + +static int nfs_attribute_cache_expired(struct inode *inode) +{ if (nfs_have_delegated_attributes(inode)) return 0; - return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); + return nfs_attribute_timeout(inode); } /** @@ -745,7 +762,7 @@ int nfs_attribute_timeout(struct inode *inode) int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) - && !nfs_attribute_timeout(inode)) + && !nfs_attribute_cache_expired(inode)) return NFS_STALE(inode) ? -ESTALE : 0; return __nfs_revalidate_inode(server, inode); } @@ -782,7 +799,8 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) int ret = 0; if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) - || nfs_attribute_timeout(inode) || NFS_STALE(inode)) { + || nfs_attribute_cache_expired(inode) + || NFS_STALE(inode)) { ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); if (ret < 0) goto out; @@ -916,6 +934,26 @@ void nfs_fattr_init(struct nfs_fattr *fattr) fattr->gencount = nfs_inc_attr_generation_counter(); } +struct nfs_fattr *nfs_alloc_fattr(void) +{ + struct nfs_fattr *fattr; + + fattr = kmalloc(sizeof(*fattr), GFP_NOFS); + if (fattr != NULL) + nfs_fattr_init(fattr); + return fattr; +} + +struct nfs_fh *nfs_alloc_fhandle(void) +{ + struct nfs_fh *fh; + + fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS); + if (fh != NULL) + fh->size = 0; + return fh; +} + /** * nfs_inode_attrs_need_update - check if the inode attributes need updating * @inode - pointer to inode diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 11f82f0..d8bd619 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -244,9 +244,7 @@ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *); #ifdef CONFIG_NFS_V4 extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *); -extern int nfs4_path_walk(struct nfs_server *server, - struct nfs_fh *mntfh, - const char *path); +extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); #endif /* read.c */ diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index 1d8d5c8..c583248 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -36,14 +36,14 @@ static inline void nfs_inc_stats(const struct inode *inode, static inline void nfs_add_server_stats(const struct nfs_server *server, enum nfs_stat_bytecounters stat, - unsigned long addend) + long addend) { this_cpu_add(server->io_stats->bytes[stat], addend); } static inline void nfs_add_stats(const struct inode *inode, enum nfs_stat_bytecounters stat, - unsigned long addend) + long addend) { nfs_add_server_stats(NFS_SERVER(inode), stat, addend); } @@ -51,7 +51,7 @@ static inline void nfs_add_stats(const struct inode *inode, #ifdef CONFIG_NFS_FSCACHE static inline void nfs_add_fscache_stats(struct inode *inode, enum nfs_stat_fscachecounters stat, - unsigned long addend) + long addend) { this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); } diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 7888cf3..db6aa36 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -105,8 +105,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) struct vfsmount *mnt; struct nfs_server *server = NFS_SERVER(dentry->d_inode); struct dentry *parent; - struct nfs_fh fh; - struct nfs_fattr fattr; + struct nfs_fh *fh = NULL; + struct nfs_fattr *fattr = NULL; int err; dprintk("--> nfs_follow_mountpoint()\n"); @@ -115,6 +115,12 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) if (IS_ROOT(dentry)) goto out_err; + err = -ENOMEM; + fh = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + if (fh == NULL || fattr == NULL) + goto out_err; + dprintk("%s: enter\n", __func__); dput(nd->path.dentry); nd->path.dentry = dget(dentry); @@ -123,16 +129,16 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) parent = dget_parent(nd->path.dentry); err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &nd->path.dentry->d_name, - &fh, &fattr); + fh, fattr); dput(parent); if (err != 0) goto out_err; - if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) + if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry); else - mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, &fh, - &fattr); + mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh, + fattr); err = PTR_ERR(mnt); if (IS_ERR(mnt)) goto out_err; @@ -151,6 +157,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) nd->path.dentry = dget(mnt->mnt_root); schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); out: + nfs_free_fattr(fattr); + nfs_free_fhandle(fh); dprintk("%s: done, returned %d\n", __func__, err); dprintk("<-- nfs_follow_mountpoint() = %d\n", err); diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index d150ae0..9f88c5f 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -185,7 +185,6 @@ static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl, struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) { struct nfs_server *server = NFS_SERVER(inode); - struct nfs_fattr fattr; struct page *pages[NFSACL_MAXPAGES] = { }; struct nfs3_getaclargs args = { .fh = NFS_FH(inode), @@ -193,7 +192,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) .pages = pages, }; struct nfs3_getaclres res = { - .fattr = &fattr, + 0 }; struct rpc_message msg = { .rpc_argp = &args, @@ -228,7 +227,10 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) dprintk("NFS call getacl\n"); msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; - nfs_fattr_init(&fattr); + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + return ERR_PTR(-ENOMEM); + status = rpc_call_sync(server->client_acl, &msg, 0); dprintk("NFS reply getacl: %d\n", status); @@ -238,7 +240,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) switch (status) { case 0: - status = nfs_refresh_inode(inode, &fattr); + status = nfs_refresh_inode(inode, res.fattr); break; case -EPFNOSUPPORT: case -EPROTONOSUPPORT: @@ -278,6 +280,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) getout: posix_acl_release(res.acl_access); posix_acl_release(res.acl_default); + nfs_free_fattr(res.fattr); if (status != 0) { posix_acl_release(acl); @@ -290,7 +293,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, struct posix_acl *dfacl) { struct nfs_server *server = NFS_SERVER(inode); - struct nfs_fattr fattr; + struct nfs_fattr *fattr; struct page *pages[NFSACL_MAXPAGES]; struct nfs3_setaclargs args = { .inode = inode, @@ -335,8 +338,13 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, } dprintk("NFS call setacl\n"); + status = -ENOMEM; + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out_freepages; + msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; - nfs_fattr_init(&fattr); + msg.rpc_resp = fattr; status = rpc_call_sync(server->client_acl, &msg, 0); nfs_access_zap_cache(inode); nfs_zap_acl_cache(inode); @@ -344,7 +352,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, switch (status) { case 0: - status = nfs_refresh_inode(inode, &fattr); + status = nfs_refresh_inode(inode, fattr); nfs3_cache_acls(inode, acl, dfacl); break; case -EPFNOSUPPORT: @@ -355,6 +363,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, case -ENOTSUPP: status = -EOPNOTSUPP; } + nfs_free_fattr(fattr); out_freepages: while (args.npages != 0) { args.npages--; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index e701002..fabb4f2 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -144,14 +144,12 @@ static int nfs3_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { - struct nfs_fattr dir_attr; struct nfs3_diropargs arg = { .fh = NFS_FH(dir), .name = name->name, .len = name->len }; struct nfs3_diropres res = { - .dir_attr = &dir_attr, .fh = fhandle, .fattr = fattr }; @@ -163,29 +161,30 @@ nfs3_proc_lookup(struct inode *dir, struct qstr *name, int status; dprintk("NFS call lookup %s\n", name->name); - nfs_fattr_init(&dir_attr); + res.dir_attr = nfs_alloc_fattr(); + if (res.dir_attr == NULL) + return -ENOMEM; + nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_refresh_inode(dir, &dir_attr); + nfs_refresh_inode(dir, res.dir_attr); if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) { msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; msg.rpc_argp = fhandle; msg.rpc_resp = fattr; status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); } + nfs_free_fattr(res.dir_attr); dprintk("NFS reply lookup: %d\n", status); return status; } static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) { - struct nfs_fattr fattr; struct nfs3_accessargs arg = { .fh = NFS_FH(inode), }; - struct nfs3_accessres res = { - .fattr = &fattr, - }; + struct nfs3_accessres res; struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], .rpc_argp = &arg, @@ -193,7 +192,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) .rpc_cred = entry->cred, }; int mode = entry->mask; - int status; + int status = -ENOMEM; dprintk("NFS call access\n"); @@ -210,9 +209,13 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) if (mode & MAY_EXEC) arg.access |= NFS3_ACCESS_EXECUTE; } - nfs_fattr_init(&fattr); + + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + goto out; + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); - nfs_refresh_inode(inode, &fattr); + nfs_refresh_inode(inode, res.fattr); if (status == 0) { entry->mask = 0; if (res.access & NFS3_ACCESS_READ) @@ -222,6 +225,8 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) entry->mask |= MAY_EXEC; } + nfs_free_fattr(res.fattr); +out: dprintk("NFS reply access: %d\n", status); return status; } @@ -229,7 +234,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) static int nfs3_proc_readlink(struct inode *inode, struct page *page, unsigned int pgbase, unsigned int pglen) { - struct nfs_fattr fattr; + struct nfs_fattr *fattr; struct nfs3_readlinkargs args = { .fh = NFS_FH(inode), .pgbase = pgbase, @@ -239,14 +244,19 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page, struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK], .rpc_argp = &args, - .rpc_resp = &fattr, }; - int status; + int status = -ENOMEM; dprintk("NFS call readlink\n"); - nfs_fattr_init(&fattr); + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + goto out; + msg.rpc_resp = fattr; + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); - nfs_refresh_inode(inode, &fattr); + nfs_refresh_inode(inode, fattr); + nfs_free_fattr(fattr); +out: dprintk("NFS reply readlink: %d\n", status); return status; } @@ -396,12 +406,17 @@ nfs3_proc_remove(struct inode *dir, struct qstr *name) .rpc_argp = &arg, .rpc_resp = &res, }; - int status; + int status = -ENOMEM; dprintk("NFS call remove %s\n", name->name); - nfs_fattr_init(&res.dir_attr); + res.dir_attr = nfs_alloc_fattr(); + if (res.dir_attr == NULL) + goto out; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_post_op_update_inode(dir, &res.dir_attr); + nfs_post_op_update_inode(dir, res.dir_attr); + nfs_free_fattr(res.dir_attr); +out: dprintk("NFS reply remove: %d\n", status); return status; } @@ -419,7 +434,7 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir) if (nfs3_async_handle_jukebox(task, dir)) return 0; res = task->tk_msg.rpc_resp; - nfs_post_op_update_inode(dir, &res->dir_attr); + nfs_post_op_update_inode(dir, res->dir_attr); return 1; } @@ -427,7 +442,6 @@ static int nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, struct inode *new_dir, struct qstr *new_name) { - struct nfs_fattr old_dir_attr, new_dir_attr; struct nfs3_renameargs arg = { .fromfh = NFS_FH(old_dir), .fromname = old_name->name, @@ -436,23 +450,27 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, .toname = new_name->name, .tolen = new_name->len }; - struct nfs3_renameres res = { - .fromattr = &old_dir_attr, - .toattr = &new_dir_attr - }; + struct nfs3_renameres res; struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], .rpc_argp = &arg, .rpc_resp = &res, }; - int status; + int status = -ENOMEM; dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); - nfs_fattr_init(&old_dir_attr); - nfs_fattr_init(&new_dir_attr); + + res.fromattr = nfs_alloc_fattr(); + res.toattr = nfs_alloc_fattr(); + if (res.fromattr == NULL || res.toattr == NULL) + goto out; + status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); - nfs_post_op_update_inode(old_dir, &old_dir_attr); - nfs_post_op_update_inode(new_dir, &new_dir_attr); + nfs_post_op_update_inode(old_dir, res.fromattr); + nfs_post_op_update_inode(new_dir, res.toattr); +out: + nfs_free_fattr(res.toattr); + nfs_free_fattr(res.fromattr); dprintk("NFS reply rename: %d\n", status); return status; } @@ -460,30 +478,32 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, static int nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) { - struct nfs_fattr dir_attr, fattr; struct nfs3_linkargs arg = { .fromfh = NFS_FH(inode), .tofh = NFS_FH(dir), .toname = name->name, .tolen = name->len }; - struct nfs3_linkres res = { - .dir_attr = &dir_attr, - .fattr = &fattr - }; + struct nfs3_linkres res; struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_LINK], .rpc_argp = &arg, .rpc_resp = &res, }; - int status; + int status = -ENOMEM; dprintk("NFS call link %s\n", name->name); - nfs_fattr_init(&dir_attr); - nfs_fattr_init(&fattr); + res.fattr = nfs_alloc_fattr(); + res.dir_attr = nfs_alloc_fattr(); + if (res.fattr == NULL || res.dir_attr == NULL) + goto out; + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); - nfs_post_op_update_inode(dir, &dir_attr); - nfs_post_op_update_inode(inode, &fattr); + nfs_post_op_update_inode(dir, res.dir_attr); + nfs_post_op_update_inode(inode, res.fattr); +out: + nfs_free_fattr(res.dir_attr); + nfs_free_fattr(res.fattr); dprintk("NFS reply link: %d\n", status); return status; } @@ -554,7 +574,7 @@ out: static int nfs3_proc_rmdir(struct inode *dir, struct qstr *name) { - struct nfs_fattr dir_attr; + struct nfs_fattr *dir_attr; struct nfs3_diropargs arg = { .fh = NFS_FH(dir), .name = name->name, @@ -563,14 +583,19 @@ nfs3_proc_rmdir(struct inode *dir, struct qstr *name) struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR], .rpc_argp = &arg, - .rpc_resp = &dir_attr, }; - int status; + int status = -ENOMEM; dprintk("NFS call rmdir %s\n", name->name); - nfs_fattr_init(&dir_attr); + dir_attr = nfs_alloc_fattr(); + if (dir_attr == NULL) + goto out; + + msg.rpc_resp = dir_attr; status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - nfs_post_op_update_inode(dir, &dir_attr); + nfs_post_op_update_inode(dir, dir_attr); + nfs_free_fattr(dir_attr); +out: dprintk("NFS reply rmdir: %d\n", status); return status; } @@ -589,7 +614,6 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, u64 cookie, struct page *page, unsigned int count, int plus) { struct inode *dir = dentry->d_inode; - struct nfs_fattr dir_attr; __be32 *verf = NFS_COOKIEVERF(dir); struct nfs3_readdirargs arg = { .fh = NFS_FH(dir), @@ -600,7 +624,6 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, .pages = &page }; struct nfs3_readdirres res = { - .dir_attr = &dir_attr, .verf = verf, .plus = plus }; @@ -610,7 +633,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, .rpc_resp = &res, .rpc_cred = cred }; - int status; + int status = -ENOMEM; if (plus) msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; @@ -618,12 +641,17 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, dprintk("NFS call readdir%s %d\n", plus? "plus" : "", (unsigned int) cookie); - nfs_fattr_init(&dir_attr); + res.dir_attr = nfs_alloc_fattr(); + if (res.dir_attr == NULL) + goto out; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_invalidate_atime(dir); + nfs_refresh_inode(dir, res.dir_attr); - nfs_refresh_inode(dir, &dir_attr); + nfs_free_fattr(res.dir_attr); +out: dprintk("NFS reply readdir: %d\n", status); return status; } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 56a86f6..75dcfc7 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -762,7 +762,7 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) static int nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) { - return nfs3_xdr_wccstat(req, p, &res->dir_attr); + return nfs3_xdr_wccstat(req, p, res->dir_attr); } /* diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index a187200..c538c61 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -206,14 +206,14 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t); /* nfs4proc.c */ -extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *); -extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); +extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); +extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); -extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait); +extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait); extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); @@ -286,7 +286,7 @@ extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); -extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter); +extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index f071d12..3c2a172 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -115,6 +115,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, char *page, char *page2, const struct nfs4_fs_location *location) { + const size_t addr_bufsize = sizeof(struct sockaddr_storage); struct vfsmount *mnt = ERR_PTR(-ENOENT); char *mnt_path; unsigned int maxbuflen; @@ -126,9 +127,12 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, mountdata->mnt_path = mnt_path; maxbuflen = mnt_path - 1 - page2; + mountdata->addr = kmalloc(addr_bufsize, GFP_KERNEL); + if (mountdata->addr == NULL) + return ERR_PTR(-ENOMEM); + for (s = 0; s < location->nservers; s++) { const struct nfs4_string *buf = &location->servers[s]; - struct sockaddr_storage addr; if (buf->len <= 0 || buf->len >= maxbuflen) continue; @@ -137,11 +141,10 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, continue; mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, - (struct sockaddr *)&addr, sizeof(addr)); + mountdata->addr, addr_bufsize); if (mountdata->addrlen == 0) continue; - mountdata->addr = (struct sockaddr *)&addr; rpc_set_port(mountdata->addr, NFS_PORT); memcpy(page2, buf->data, buf->len); @@ -156,6 +159,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, if (!IS_ERR(mnt)) break; } + kfree(mountdata->addr); return mnt; } @@ -221,8 +225,8 @@ out: /* * nfs_do_refmount - handle crossing a referral on server + * @mnt_parent - mountpoint of referral * @dentry - dentry of referral - * @nd - nameidata info * */ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 071fced..70015dd 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -70,6 +70,9 @@ static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinf static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); +static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, + struct nfs_fattr *fattr, struct iattr *sattr, + struct nfs4_state *state); /* Prevent leaks of NFSv4 errors into userland */ static int nfs4_map_errors(int err) @@ -714,17 +717,18 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p) static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, struct nfs4_state_owner *sp, fmode_t fmode, int flags, - const struct iattr *attrs) + const struct iattr *attrs, + gfp_t gfp_mask) { struct dentry *parent = dget_parent(path->dentry); struct inode *dir = parent->d_inode; struct nfs_server *server = NFS_SERVER(dir); struct nfs4_opendata *p; - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kzalloc(sizeof(*p), gfp_mask); if (p == NULL) goto err; - p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); + p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask); if (p->o_arg.seqid == NULL) goto err_free; path_get(path); @@ -1060,7 +1064,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context { struct nfs4_opendata *opendata; - opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL); + opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL, GFP_NOFS); if (opendata == NULL) return ERR_PTR(-ENOMEM); opendata->state = state; @@ -1648,7 +1652,7 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in if (path->dentry->d_inode != NULL) nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode); status = -ENOMEM; - opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr); + opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr, GFP_KERNEL); if (opendata == NULL) goto err_put_state_owner; @@ -1659,15 +1663,24 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in if (status != 0) goto err_opendata_put; - if (opendata->o_arg.open_flags & O_EXCL) - nfs4_exclusive_attrset(opendata, sattr); - state = nfs4_opendata_to_nfs4_state(opendata); status = PTR_ERR(state); if (IS_ERR(state)) goto err_opendata_put; if (server->caps & NFS_CAP_POSIX_LOCK) set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); + + if (opendata->o_arg.open_flags & O_EXCL) { + nfs4_exclusive_attrset(opendata, sattr); + + nfs_fattr_init(opendata->o_res.f_attr); + status = nfs4_do_setattr(state->inode, cred, + opendata->o_res.f_attr, sattr, + state); + if (status == 0) + nfs_setattr_update_inode(state->inode, sattr); + nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); + } nfs4_opendata_put(opendata); nfs4_put_state_owner(sp); *res = state; @@ -1914,7 +1927,7 @@ static const struct rpc_call_ops nfs4_close_ops = { * * NOTE: Caller must be holding the sp->so_owner semaphore! */ -int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) +int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait) { struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_closedata *calldata; @@ -1933,7 +1946,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) }; int status = -ENOMEM; - calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); + calldata = kzalloc(sizeof(*calldata), gfp_mask); if (calldata == NULL) goto out; calldata->inode = state->inode; @@ -1941,7 +1954,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) calldata->arg.fh = NFS_FH(state->inode); calldata->arg.stateid = &state->open_stateid; /* Serialization for the sequence id */ - calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); + calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask); if (calldata->arg.seqid == NULL) goto out_free_calldata; calldata->arg.fmode = 0; @@ -2404,14 +2417,12 @@ static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) { struct nfs_server *server = NFS_SERVER(inode); - struct nfs_fattr fattr; struct nfs4_accessargs args = { .fh = NFS_FH(inode), .bitmask = server->attr_bitmask, }; struct nfs4_accessres res = { .server = server, - .fattr = &fattr, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS], @@ -2438,7 +2449,11 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry if (mode & MAY_EXEC) args.access |= NFS4_ACCESS_EXECUTE; } - nfs_fattr_init(&fattr); + + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + return -ENOMEM; + status = nfs4_call_sync(server, &msg, &args, &res, 0); if (!status) { entry->mask = 0; @@ -2448,8 +2463,9 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry entry->mask |= MAY_WRITE; if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) entry->mask |= MAY_EXEC; - nfs_refresh_inode(inode, &fattr); + nfs_refresh_inode(inode, res.fattr); } + nfs_free_fattr(res.fattr); return status; } @@ -2562,13 +2578,6 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, } d_add(dentry, igrab(state->inode)); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - if (flags & O_EXCL) { - struct nfs_fattr fattr; - status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state); - if (status == 0) - nfs_setattr_update_inode(state->inode, sattr); - nfs_post_op_update_inode(state->inode, &fattr); - } if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) status = nfs4_intent_set_file(nd, &path, state, fmode); else @@ -2596,14 +2605,19 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name) .rpc_argp = &args, .rpc_resp = &res, }; - int status; + int status = -ENOMEM; + + res.dir_attr = nfs_alloc_fattr(); + if (res.dir_attr == NULL) + goto out; - nfs_fattr_init(&res.dir_attr); status = nfs4_call_sync(server, &msg, &args, &res, 1); if (status == 0) { update_changeattr(dir, &res.cinfo); - nfs_post_op_update_inode(dir, &res.dir_attr); + nfs_post_op_update_inode(dir, res.dir_attr); } + nfs_free_fattr(res.dir_attr); +out: return status; } @@ -2638,7 +2652,7 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) return 0; update_changeattr(dir, &res->cinfo); - nfs_post_op_update_inode(dir, &res->dir_attr); + nfs_post_op_update_inode(dir, res->dir_attr); return 1; } @@ -2653,29 +2667,31 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, .new_name = new_name, .bitmask = server->attr_bitmask, }; - struct nfs_fattr old_fattr, new_fattr; struct nfs4_rename_res res = { .server = server, - .old_fattr = &old_fattr, - .new_fattr = &new_fattr, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME], .rpc_argp = &arg, .rpc_resp = &res, }; - int status; + int status = -ENOMEM; - nfs_fattr_init(res.old_fattr); - nfs_fattr_init(res.new_fattr); - status = nfs4_call_sync(server, &msg, &arg, &res, 1); + res.old_fattr = nfs_alloc_fattr(); + res.new_fattr = nfs_alloc_fattr(); + if (res.old_fattr == NULL || res.new_fattr == NULL) + goto out; + status = nfs4_call_sync(server, &msg, &arg, &res, 1); if (!status) { update_changeattr(old_dir, &res.old_cinfo); nfs_post_op_update_inode(old_dir, res.old_fattr); update_changeattr(new_dir, &res.new_cinfo); nfs_post_op_update_inode(new_dir, res.new_fattr); } +out: + nfs_free_fattr(res.new_fattr); + nfs_free_fattr(res.old_fattr); return status; } @@ -2702,28 +2718,30 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr * .name = name, .bitmask = server->attr_bitmask, }; - struct nfs_fattr fattr, dir_attr; struct nfs4_link_res res = { .server = server, - .fattr = &fattr, - .dir_attr = &dir_attr, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK], .rpc_argp = &arg, .rpc_resp = &res, }; - int status; + int status = -ENOMEM; + + res.fattr = nfs_alloc_fattr(); + res.dir_attr = nfs_alloc_fattr(); + if (res.fattr == NULL || res.dir_attr == NULL) + goto out; - nfs_fattr_init(res.fattr); - nfs_fattr_init(res.dir_attr); status = nfs4_call_sync(server, &msg, &arg, &res, 1); if (!status) { update_changeattr(dir, &res.cinfo); nfs_post_op_update_inode(dir, res.dir_attr); nfs_post_op_update_inode(inode, res.fattr); } - +out: + nfs_free_fattr(res.dir_attr); + nfs_free_fattr(res.fattr); return status; } @@ -3146,23 +3164,31 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; } +struct nfs4_renewdata { + struct nfs_client *client; + unsigned long timestamp; +}; + /* * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special * standalone procedure for queueing an asynchronous RENEW. */ -static void nfs4_renew_release(void *data) +static void nfs4_renew_release(void *calldata) { - struct nfs_client *clp = data; + struct nfs4_renewdata *data = calldata; + struct nfs_client *clp = data->client; if (atomic_read(&clp->cl_count) > 1) nfs4_schedule_state_renewal(clp); nfs_put_client(clp); + kfree(data); } -static void nfs4_renew_done(struct rpc_task *task, void *data) +static void nfs4_renew_done(struct rpc_task *task, void *calldata) { - struct nfs_client *clp = data; - unsigned long timestamp = task->tk_start; + struct nfs4_renewdata *data = calldata; + struct nfs_client *clp = data->client; + unsigned long timestamp = data->timestamp; if (task->tk_status < 0) { /* Unless we're shutting down, schedule state recovery! */ @@ -3188,11 +3214,17 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) .rpc_argp = clp, .rpc_cred = cred, }; + struct nfs4_renewdata *data; if (!atomic_inc_not_zero(&clp->cl_count)) return -EIO; + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return -ENOMEM; + data->client = clp; + data->timestamp = jiffies; return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, - &nfs4_renew_ops, clp); + &nfs4_renew_ops, data); } int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) @@ -3494,7 +3526,9 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, return _nfs4_async_handle_error(task, server, server->nfs_client, state); } -int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred) +int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, + unsigned short port, struct rpc_cred *cred, + struct nfs4_setclientid_res *res) { nfs4_verifier sc_verifier; struct nfs4_setclientid setclientid = { @@ -3504,7 +3538,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], .rpc_argp = &setclientid, - .rpc_resp = clp, + .rpc_resp = res, .rpc_cred = cred, }; __be32 *p; @@ -3547,12 +3581,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po return status; } -static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) +static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, + struct nfs4_setclientid_res *arg, + struct rpc_cred *cred) { struct nfs_fsinfo fsinfo; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM], - .rpc_argp = clp, + .rpc_argp = arg, .rpc_resp = &fsinfo, .rpc_cred = cred, }; @@ -3570,12 +3606,14 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre return status; } -int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) +int nfs4_proc_setclientid_confirm(struct nfs_client *clp, + struct nfs4_setclientid_res *arg, + struct rpc_cred *cred) { long timeout = 0; int err; do { - err = _nfs4_proc_setclientid_confirm(clp, cred); + err = _nfs4_proc_setclientid_confirm(clp, arg, cred); switch (err) { case 0: return err; @@ -3667,7 +3705,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co }; int status = 0; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_NOFS); if (data == NULL) return -ENOMEM; data->args.fhandle = &data->fh; @@ -3823,7 +3861,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, struct nfs4_unlockdata *p; struct inode *inode = lsp->ls_state->inode; - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kzalloc(sizeof(*p), GFP_NOFS); if (p == NULL) return NULL; p->arg.fh = NFS_FH(inode); @@ -3961,7 +3999,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * if (test_bit(NFS_DELEGATED_STATE, &state->flags)) goto out; lsp = request->fl_u.nfs4_fl.owner; - seqid = nfs_alloc_seqid(&lsp->ls_seqid); + seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL); status = -ENOMEM; if (seqid == NULL) goto out; @@ -3989,22 +4027,23 @@ struct nfs4_lockdata { }; static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, - struct nfs_open_context *ctx, struct nfs4_lock_state *lsp) + struct nfs_open_context *ctx, struct nfs4_lock_state *lsp, + gfp_t gfp_mask) { struct nfs4_lockdata *p; struct inode *inode = lsp->ls_state->inode; struct nfs_server *server = NFS_SERVER(inode); - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kzalloc(sizeof(*p), gfp_mask); if (p == NULL) return NULL; p->arg.fh = NFS_FH(inode); p->arg.fl = &p->fl; - p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid); + p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask); if (p->arg.open_seqid == NULL) goto out_free; - p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid); + p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask); if (p->arg.lock_seqid == NULL) goto out_free_seqid; p->arg.lock_stateid = &lsp->ls_stateid; @@ -4158,7 +4197,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f dprintk("%s: begin!\n", __func__); data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), - fl->fl_u.nfs4_fl.owner); + fl->fl_u.nfs4_fl.owner, + recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS); if (data == NULL) return -ENOMEM; if (IS_SETLKW(cmd)) @@ -4647,7 +4687,7 @@ static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs, if (max_reqs != tbl->max_slots) { ret = -ENOMEM; new = kmalloc(max_reqs * sizeof(struct nfs4_slot), - GFP_KERNEL); + GFP_NOFS); if (!new) goto out; ret = 0; @@ -4712,7 +4752,7 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl, dprintk("--> %s: max_reqs=%u\n", __func__, max_slots); - slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL); + slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS); if (!slot) goto out; ret = 0; @@ -4761,7 +4801,7 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) struct nfs4_session *session; struct nfs4_slot_table *tbl; - session = kzalloc(sizeof(struct nfs4_session), GFP_KERNEL); + session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); if (!session) return NULL; @@ -5105,8 +5145,8 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, if (!atomic_inc_not_zero(&clp->cl_count)) return -EIO; - args = kzalloc(sizeof(*args), GFP_KERNEL); - res = kzalloc(sizeof(*res), GFP_KERNEL); + args = kzalloc(sizeof(*args), GFP_NOFS); + res = kzalloc(sizeof(*res), GFP_NOFS); if (!args || !res) { kfree(args); kfree(res); @@ -5207,7 +5247,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp) int status = -ENOMEM; dprintk("--> %s\n", __func__); - calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); + calldata = kzalloc(sizeof(*calldata), GFP_NOFS); if (calldata == NULL) goto out; calldata->clp = clp; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 6c5ed51..34acf59 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -62,6 +62,7 @@ static LIST_HEAD(nfs4_clientid_list); int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) { + struct nfs4_setclientid_res clid; unsigned short port; int status; @@ -69,11 +70,15 @@ int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) if (clp->cl_addr.ss_family == AF_INET6) port = nfs_callback_tcpport6; - status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred); - if (status == 0) - status = nfs4_proc_setclientid_confirm(clp, cred); - if (status == 0) - nfs4_schedule_state_renewal(clp); + status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid); + if (status != 0) + goto out; + status = nfs4_proc_setclientid_confirm(clp, &clid, cred); + if (status != 0) + goto out; + clp->cl_clientid = clid.clientid; + nfs4_schedule_state_renewal(clp); +out: return status; } @@ -361,7 +366,7 @@ nfs4_alloc_state_owner(void) { struct nfs4_state_owner *sp; - sp = kzalloc(sizeof(*sp),GFP_KERNEL); + sp = kzalloc(sizeof(*sp),GFP_NOFS); if (!sp) return NULL; spin_lock_init(&sp->so_lock); @@ -435,7 +440,7 @@ nfs4_alloc_open_state(void) { struct nfs4_state *state; - state = kzalloc(sizeof(*state), GFP_KERNEL); + state = kzalloc(sizeof(*state), GFP_NOFS); if (!state) return NULL; atomic_set(&state->count, 1); @@ -537,7 +542,8 @@ void nfs4_put_open_state(struct nfs4_state *state) /* * Close the current file. */ -static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fmode, int wait) +static void __nfs4_close(struct path *path, struct nfs4_state *state, + fmode_t fmode, gfp_t gfp_mask, int wait) { struct nfs4_state_owner *owner = state->owner; int call_close = 0; @@ -578,17 +584,17 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fm nfs4_put_open_state(state); nfs4_put_state_owner(owner); } else - nfs4_do_close(path, state, wait); + nfs4_do_close(path, state, gfp_mask, wait); } void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) { - __nfs4_close(path, state, fmode, 0); + __nfs4_close(path, state, fmode, GFP_NOFS, 0); } void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode) { - __nfs4_close(path, state, fmode, 1); + __nfs4_close(path, state, fmode, GFP_KERNEL, 1); } /* @@ -618,7 +624,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f struct nfs4_lock_state *lsp; struct nfs_client *clp = state->owner->so_client; - lsp = kzalloc(sizeof(*lsp), GFP_KERNEL); + lsp = kzalloc(sizeof(*lsp), GFP_NOFS); if (lsp == NULL) return NULL; rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue"); @@ -754,11 +760,11 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f nfs4_put_lock_state(lsp); } -struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter) +struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask) { struct nfs_seqid *new; - new = kmalloc(sizeof(*new), GFP_KERNEL); + new = kmalloc(sizeof(*new), gfp_mask); if (new != NULL) { new->sequence = counter; INIT_LIST_HEAD(&new->list); @@ -1347,7 +1353,7 @@ static int nfs4_recall_slot(struct nfs_client *clp) nfs4_begin_drain_session(clp); new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot), - GFP_KERNEL); + GFP_NOFS); if (!new) return -ENOMEM; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 38f3b58..6bdef28 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1504,14 +1504,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie hdr->replen += decode_setclientid_maxsz; } -static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr) +static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr) { __be32 *p; p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); - p = xdr_encode_hyper(p, client_state->cl_clientid); - xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); + p = xdr_encode_hyper(p, arg->clientid); + xdr_encode_opaque_fixed(p, arg->confirm.data, NFS4_VERIFIER_SIZE); hdr->nops++; hdr->replen += decode_setclientid_confirm_maxsz; } @@ -2324,7 +2324,7 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4 /* * a SETCLIENTID_CONFIRM request */ -static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp) +static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg) { struct xdr_stream xdr; struct compound_hdr hdr = { @@ -2334,7 +2334,7 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, req, &hdr); - encode_setclientid_confirm(&xdr, clp, &hdr); + encode_setclientid_confirm(&xdr, arg, &hdr); encode_putrootfh(&xdr, &hdr); encode_fsinfo(&xdr, lease_bitmap, &hdr); encode_nops(&hdr); @@ -4397,7 +4397,7 @@ out_overflow: return -EIO; } -static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) +static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res) { __be32 *p; uint32_t opnum; @@ -4417,8 +4417,8 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, &clp->cl_clientid); - memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE); + p = xdr_decode_hyper(p, &res->clientid); + memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE); } else if (nfserr == NFSERR_CLID_INUSE) { uint32_t len; @@ -4815,7 +4815,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem goto out; if ((status = decode_remove(&xdr, &res->cinfo)) != 0) goto out; - decode_getfattr(&xdr, &res->dir_attr, res->server, + decode_getfattr(&xdr, res->dir_attr, res->server, !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; @@ -5498,7 +5498,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy) * Decode SETCLIENTID response */ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, - struct nfs_client *clp) + struct nfs4_setclientid_res *res) { struct xdr_stream xdr; struct compound_hdr hdr; @@ -5507,7 +5507,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, xdr_init_decode(&xdr, &req->rq_rcv_buf, p); status = decode_compound_hdr(&xdr, &hdr); if (!status) - status = decode_setclientid(&xdr, clp); + status = decode_setclientid(&xdr, res); return status; } diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 8c55b27..6bd19d8 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -488,7 +488,6 @@ static int __init root_nfs_ports(void) */ static int __init root_nfs_get_handle(void) { - struct nfs_fh fh; struct sockaddr_in sin; unsigned int auth_flav_len = 0; struct nfs_mount_request request = { @@ -499,21 +498,24 @@ static int __init root_nfs_get_handle(void) NFS_MNT3_VERSION : NFS_MNT_VERSION, .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ? XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP, - .fh = &fh, .auth_flav_len = &auth_flav_len, }; - int status; + int status = -ENOMEM; + request.fh = nfs_alloc_fhandle(); + if (!request.fh) + goto out; set_sockaddr(&sin, servaddr, htons(mount_port)); status = nfs_mount(&request); if (status < 0) printk(KERN_ERR "Root-NFS: Server returned error %d " "while mounting %s\n", status, nfs_export_path); else { - nfs_data.root.size = fh.size; - memcpy(nfs_data.root.data, fh.data, fh.size); + nfs_data.root.size = request.fh->size; + memcpy(&nfs_data.root.data, request.fh->data, request.fh->size); } - + nfs_free_fhandle(request.fh); +out: return status; } diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 29d9d36..a3654e5 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -60,16 +60,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, { struct nfs_page *req; - for (;;) { - /* try to allocate the request struct */ - req = nfs_page_alloc(); - if (req != NULL) - break; - - if (fatal_signal_pending(current)) - return ERR_PTR(-ERESTARTSYS); - yield(); - } + /* try to allocate the request struct */ + req = nfs_page_alloc(); + if (req == NULL) + return ERR_PTR(-ENOMEM); /* Initialize the request struct. Initially, we assume a * long write-back delay. This will be adjusted in diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 0288be8..611bec2 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -224,35 +224,60 @@ static int nfs_proc_readlink(struct inode *inode, struct page *page, return status; } +struct nfs_createdata { + struct nfs_createargs arg; + struct nfs_diropok res; + struct nfs_fh fhandle; + struct nfs_fattr fattr; +}; + +static struct nfs_createdata *nfs_alloc_createdata(struct inode *dir, + struct dentry *dentry, struct iattr *sattr) +{ + struct nfs_createdata *data; + + data = kmalloc(sizeof(*data), GFP_KERNEL); + + if (data != NULL) { + data->arg.fh = NFS_FH(dir); + data->arg.name = dentry->d_name.name; + data->arg.len = dentry->d_name.len; + data->arg.sattr = sattr; + nfs_fattr_init(&data->fattr); + data->fhandle.size = 0; + data->res.fh = &data->fhandle; + data->res.fattr = &data->fattr; + } + return data; +}; + +static void nfs_free_createdata(const struct nfs_createdata *data) +{ + kfree(data); +} + static int nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags, struct nameidata *nd) { - struct nfs_fh fhandle; - struct nfs_fattr fattr; - struct nfs_createargs arg = { - .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len, - .sattr = sattr - }; - struct nfs_diropok res = { - .fh = &fhandle, - .fattr = &fattr - }; + struct nfs_createdata *data; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_CREATE], - .rpc_argp = &arg, - .rpc_resp = &res, }; - int status; + int status = -ENOMEM; - nfs_fattr_init(&fattr); dprintk("NFS call create %s\n", dentry->d_name.name); + data = nfs_alloc_createdata(dir, dentry, sattr); + if (data == NULL) + goto out; + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); if (status == 0) - status = nfs_instantiate(dentry, &fhandle, &fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + nfs_free_createdata(data); +out: dprintk("NFS reply create: %d\n", status); return status; } @@ -264,24 +289,12 @@ static int nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { - struct nfs_fh fhandle; - struct nfs_fattr fattr; - struct nfs_createargs arg = { - .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len, - .sattr = sattr - }; - struct nfs_diropok res = { - .fh = &fhandle, - .fattr = &fattr - }; + struct nfs_createdata *data; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_CREATE], - .rpc_argp = &arg, - .rpc_resp = &res, }; - int status, mode; + umode_t mode; + int status = -ENOMEM; dprintk("NFS call mknod %s\n", dentry->d_name.name); @@ -294,17 +307,24 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */ } - nfs_fattr_init(&fattr); + data = nfs_alloc_createdata(dir, dentry, sattr); + if (data == NULL) + goto out; + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); if (status == -EINVAL && S_ISFIFO(mode)) { sattr->ia_mode = mode; - nfs_fattr_init(&fattr); + nfs_fattr_init(data->res.fattr); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); } if (status == 0) - status = nfs_instantiate(dentry, &fhandle, &fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + nfs_free_createdata(data); +out: dprintk("NFS reply mknod: %d\n", status); return status; } @@ -398,8 +418,8 @@ static int nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, unsigned int len, struct iattr *sattr) { - struct nfs_fh fhandle; - struct nfs_fattr fattr; + struct nfs_fh *fh; + struct nfs_fattr *fattr; struct nfs_symlinkargs arg = { .fromfh = NFS_FH(dir), .fromname = dentry->d_name.name, @@ -412,12 +432,18 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, .rpc_proc = &nfs_procedures[NFSPROC_SYMLINK], .rpc_argp = &arg, }; - int status; + int status = -ENAMETOOLONG; + + dprintk("NFS call symlink %s\n", dentry->d_name.name); if (len > NFS2_MAXPATHLEN) - return -ENAMETOOLONG; + goto out; - dprintk("NFS call symlink %s\n", dentry->d_name.name); + fh = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + status = -ENOMEM; + if (fh == NULL || fattr == NULL) + goto out; status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); @@ -427,12 +453,12 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, * filehandle size to zero indicates to nfs_instantiate that it * should fill in the data with a LOOKUP call on the wire. */ - if (status == 0) { - nfs_fattr_init(&fattr); - fhandle.size = 0; - status = nfs_instantiate(dentry, &fhandle, &fattr); - } + if (status == 0) + status = nfs_instantiate(dentry, fh, fattr); + nfs_free_fattr(fattr); + nfs_free_fhandle(fh); +out: dprintk("NFS reply symlink: %d\n", status); return status; } @@ -440,31 +466,25 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, static int nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { - struct nfs_fh fhandle; - struct nfs_fattr fattr; - struct nfs_createargs arg = { - .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len, - .sattr = sattr - }; - struct nfs_diropok res = { - .fh = &fhandle, - .fattr = &fattr - }; + struct nfs_createdata *data; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_MKDIR], - .rpc_argp = &arg, - .rpc_resp = &res, }; - int status; + int status = -ENOMEM; dprintk("NFS call mkdir %s\n", dentry->d_name.name); - nfs_fattr_init(&fattr); + data = nfs_alloc_createdata(dir, dentry, sattr); + if (data == NULL) + goto out; + msg.rpc_argp = &data->arg; + msg.rpc_resp = &data->res; + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); if (status == 0) - status = nfs_instantiate(dentry, &fhandle, &fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + nfs_free_createdata(data); +out: dprintk("NFS reply mkdir: %d\n", status); return status; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index db9b360..6e2b06e 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -40,7 +40,7 @@ static mempool_t *nfs_rdata_mempool; struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) { - struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS); + struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL); if (p) { memset(p, 0, sizeof(*p)); @@ -50,7 +50,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) if (pagecount <= ARRAY_SIZE(p->page_array)) p->pagevec = p->page_array; else { - p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); + p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); if (!p->pagevec) { mempool_free(p, nfs_rdata_mempool); p = NULL; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index b4148fc..2f8b115 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -141,7 +141,6 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_resvport, "resvport" }, { Opt_noresvport, "noresvport" }, { Opt_fscache, "fsc" }, - { Opt_fscache_uniq, "fsc=%s" }, { Opt_nofscache, "nofsc" }, { Opt_port, "port=%s" }, @@ -171,6 +170,7 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_mountaddr, "mountaddr=%s" }, { Opt_lookupcache, "lookupcache=%s" }, + { Opt_fscache_uniq, "fsc=%s" }, { Opt_err, NULL } }; @@ -423,15 +423,19 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) unsigned char blockbits; unsigned long blockres; struct nfs_fh *fh = NFS_FH(dentry->d_inode); - struct nfs_fattr fattr; - struct nfs_fsstat res = { - .fattr = &fattr, - }; - int error; + struct nfs_fsstat res; + int error = -ENOMEM; + + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + goto out_err; error = server->nfs_client->rpc_ops->statfs(server, fh, &res); + + nfs_free_fattr(res.fattr); if (error < 0) goto out_err; + buf->f_type = NFS_SUPER_MAGIC; /* @@ -1046,14 +1050,6 @@ static int nfs_parse_mount_options(char *raw, kfree(mnt->fscache_uniq); mnt->fscache_uniq = NULL; break; - case Opt_fscache_uniq: - string = match_strdup(args); - if (!string) - goto out_nomem; - kfree(mnt->fscache_uniq); - mnt->fscache_uniq = string; - mnt->options |= NFS_OPTION_FSCACHE; - break; /* * options that take numeric values @@ -1384,6 +1380,14 @@ static int nfs_parse_mount_options(char *raw, return 0; }; break; + case Opt_fscache_uniq: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + kfree(mnt->fscache_uniq); + mnt->fscache_uniq = string; + mnt->options |= NFS_OPTION_FSCACHE; + break; /* * Special options @@ -2172,7 +2176,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, int error = -ENOMEM; data = nfs_alloc_parsed_mount_data(3); - mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); + mntfh = nfs_alloc_fhandle(); if (data == NULL || mntfh == NULL) goto out_free_fh; @@ -2247,7 +2251,7 @@ out: kfree(data->fscache_uniq); security_free_mnt_opts(&data->lsm_opts); out_free_fh: - kfree(mntfh); + nfs_free_fhandle(mntfh); kfree(data); return error; @@ -2556,7 +2560,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type, }; int error = -ENOMEM; - mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); + mntfh = nfs_alloc_fhandle(); if (data == NULL || mntfh == NULL) goto out_free_fh; @@ -2614,7 +2618,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type, out: security_free_mnt_opts(&data->lsm_opts); out_free_fh: - kfree(mntfh); + nfs_free_fhandle(mntfh); return error; out_free: @@ -2669,41 +2673,120 @@ out_freepage: free_page((unsigned long)page); } +struct nfs_referral_count { + struct list_head list; + const struct task_struct *task; + unsigned int referral_count; +}; + +static LIST_HEAD(nfs_referral_count_list); +static DEFINE_SPINLOCK(nfs_referral_count_list_lock); + +static struct nfs_referral_count *nfs_find_referral_count(void) +{ + struct nfs_referral_count *p; + + list_for_each_entry(p, &nfs_referral_count_list, list) { + if (p->task == current) + return p; + } + return NULL; +} + +#define NFS_MAX_NESTED_REFERRALS 2 + +static int nfs_referral_loop_protect(void) +{ + struct nfs_referral_count *p, *new; + int ret = -ENOMEM; + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + goto out; + new->task = current; + new->referral_count = 1; + + ret = 0; + spin_lock(&nfs_referral_count_list_lock); + p = nfs_find_referral_count(); + if (p != NULL) { + if (p->referral_count >= NFS_MAX_NESTED_REFERRALS) + ret = -ELOOP; + else + p->referral_count++; + } else { + list_add(&new->list, &nfs_referral_count_list); + new = NULL; + } + spin_unlock(&nfs_referral_count_list_lock); + kfree(new); +out: + return ret; +} + +static void nfs_referral_loop_unprotect(void) +{ + struct nfs_referral_count *p; + + spin_lock(&nfs_referral_count_list_lock); + p = nfs_find_referral_count(); + p->referral_count--; + if (p->referral_count == 0) + list_del(&p->list); + else + p = NULL; + spin_unlock(&nfs_referral_count_list_lock); + kfree(p); +} + static int nfs_follow_remote_path(struct vfsmount *root_mnt, const char *export_path, struct vfsmount *mnt_target) { + struct nameidata *nd = NULL; struct mnt_namespace *ns_private; - struct nameidata nd; struct super_block *s; int ret; + nd = kmalloc(sizeof(*nd), GFP_KERNEL); + if (nd == NULL) + return -ENOMEM; + ns_private = create_mnt_ns(root_mnt); ret = PTR_ERR(ns_private); if (IS_ERR(ns_private)) goto out_mntput; + ret = nfs_referral_loop_protect(); + if (ret != 0) + goto out_put_mnt_ns; + ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, - export_path, LOOKUP_FOLLOW, &nd); + export_path, LOOKUP_FOLLOW, nd); + nfs_referral_loop_unprotect(); put_mnt_ns(ns_private); if (ret != 0) goto out_err; - s = nd.path.mnt->mnt_sb; + s = nd->path.mnt->mnt_sb; atomic_inc(&s->s_active); mnt_target->mnt_sb = s; - mnt_target->mnt_root = dget(nd.path.dentry); + mnt_target->mnt_root = dget(nd->path.dentry); /* Correct the device pathname */ - nfs_fix_devname(&nd.path, mnt_target); + nfs_fix_devname(&nd->path, mnt_target); - path_put(&nd.path); + path_put(&nd->path); + kfree(nd); down_write(&s->s_umount); return 0; +out_put_mnt_ns: + put_mnt_ns(ns_private); out_mntput: mntput(root_mnt); out_err: + kfree(nd); return ret; } @@ -2874,17 +2957,21 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type, struct super_block *s; struct nfs_server *server; struct dentry *mntroot; - struct nfs_fh mntfh; + struct nfs_fh *mntfh; int (*compare_super)(struct super_block *, void *) = nfs_compare_super; struct nfs_sb_mountdata sb_mntdata = { .mntflags = flags, }; - int error; + int error = -ENOMEM; dprintk("--> nfs4_referral_get_sb()\n"); + mntfh = nfs_alloc_fhandle(); + if (mntfh == NULL) + goto out_err_nofh; + /* create a new volume representation */ - server = nfs4_create_referral_server(data, &mntfh); + server = nfs4_create_referral_server(data, mntfh); if (IS_ERR(server)) { error = PTR_ERR(server); goto out_err_noserver; @@ -2916,7 +3003,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type, nfs_fscache_get_super_cookie(s, NULL, data); } - mntroot = nfs4_get_root(s, &mntfh); + mntroot = nfs4_get_root(s, mntfh); if (IS_ERR(mntroot)) { error = PTR_ERR(mntroot); goto error_splat_super; @@ -2933,12 +3020,15 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type, security_sb_clone_mnt_opts(data->sb, s); + nfs_free_fhandle(mntfh); dprintk("<-- nfs4_referral_get_sb() = 0\n"); return 0; out_err_nosb: nfs_free_server(server); out_err_noserver: + nfs_free_fhandle(mntfh); +out_err_nofh: dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error); return error; @@ -2947,6 +3037,7 @@ error_splat_super: bdi_unregister(&server->backing_dev_info); error_splat_bdi: deactivate_locked_super(s); + nfs_free_fhandle(mntfh); dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); return error; } diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 6da3d3f..a2242af 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -23,6 +23,7 @@ struct nfs_unlinkdata { struct nfs_removeres res; struct inode *dir; struct rpc_cred *cred; + struct nfs_fattr dir_attr; }; /** @@ -169,7 +170,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n } nfs_sb_active(dir->i_sb); data->args.fh = NFS_FH(dir); - nfs_fattr_init(&data->res.dir_attr); + nfs_fattr_init(data->res.dir_attr); NFS_PROTO(dir)->unlink_setup(&msg, dir); @@ -259,6 +260,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry) goto out_free; } data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; + data->res.dir_attr = &data->dir_attr; status = -EBUSY; spin_lock(&dentry->d_lock); diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 872a5ef..c2a4f71 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -259,10 +259,9 @@ static struct cache_detail svc_expkey_cache = { .alloc = expkey_alloc, }; -static struct svc_expkey * -svc_expkey_lookup(struct svc_expkey *item) +static int +svc_expkey_hash(struct svc_expkey *item) { - struct cache_head *ch; int hash = item->ek_fsidtype; char * cp = (char*)item->ek_fsid; int len = key_len(item->ek_fsidtype); @@ -270,6 +269,14 @@ svc_expkey_lookup(struct svc_expkey *item) hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS); hash &= EXPKEY_HASHMASK; + return hash; +} + +static struct svc_expkey * +svc_expkey_lookup(struct svc_expkey *item) +{ + struct cache_head *ch; + int hash = svc_expkey_hash(item); ch = sunrpc_cache_lookup(&svc_expkey_cache, &item->h, hash); @@ -283,13 +290,7 @@ static struct svc_expkey * svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old) { struct cache_head *ch; - int hash = new->ek_fsidtype; - char * cp = (char*)new->ek_fsid; - int len = key_len(new->ek_fsidtype); - - hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); - hash ^= hash_ptr(new->ek_client, EXPKEY_HASHBITS); - hash &= EXPKEY_HASHMASK; + int hash = svc_expkey_hash(new); ch = sunrpc_cache_update(&svc_expkey_cache, &new->h, &old->h, hash); @@ -738,14 +739,22 @@ struct cache_detail svc_export_cache = { .alloc = svc_export_alloc, }; -static struct svc_export * -svc_export_lookup(struct svc_export *exp) +static int +svc_export_hash(struct svc_export *exp) { - struct cache_head *ch; int hash; + hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS); hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS); hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS); + return hash; +} + +static struct svc_export * +svc_export_lookup(struct svc_export *exp) +{ + struct cache_head *ch; + int hash = svc_export_hash(exp); ch = sunrpc_cache_lookup(&svc_export_cache, &exp->h, hash); @@ -759,10 +768,7 @@ static struct svc_export * svc_export_update(struct svc_export *new, struct svc_export *old) { struct cache_head *ch; - int hash; - hash = hash_ptr(old->ex_client, EXPORT_HASHBITS); - hash ^= hash_ptr(old->ex_path.dentry, EXPORT_HASHBITS); - hash ^= hash_ptr(old->ex_path.mnt, EXPORT_HASHBITS); + int hash = svc_export_hash(old); ch = sunrpc_cache_update(&svc_export_cache, &new->h, &old->h, @@ -1071,9 +1077,9 @@ exp_export(struct nfsctl_export *nxp) err = 0; finish: kfree(new.ex_pathname); - if (exp) + if (!IS_ERR_OR_NULL(exp)) exp_put(exp); - if (fsid_key && !IS_ERR(fsid_key)) + if (!IS_ERR_OR_NULL(fsid_key)) cache_put(&fsid_key->h, &svc_expkey_cache); path_put(&path); out_put_clp: diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 7e32bd3..eb78e7e 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -32,6 +32,7 @@ */ #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/svc_xprt.h> #include <linux/slab.h> #include "nfsd.h" #include "state.h" @@ -79,11 +80,6 @@ enum nfs_cb_opnum4 { cb_sequence_dec_sz + \ op_dec_sz) -struct nfs4_rpc_args { - void *args_op; - struct nfsd4_cb_sequence args_seq; -}; - /* * Generic encode routines from fs/nfs/nfs4xdr.c */ @@ -428,13 +424,19 @@ static struct rpc_procinfo nfs4_cb_procedures[] = { }; static struct rpc_version nfs_cb_version4 = { +/* + * Note on the callback rpc program version number: despite language in rfc + * 5661 section 18.36.3 requiring servers to use 4 in this field, the + * official xdr descriptions for both 4.0 and 4.1 specify version 1, and + * in practice that appears to be what implementations use. The section + * 18.36.3 language is expected to be fixed in an erratum. + */ .number = 1, .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), .procs = nfs4_cb_procedures }; static struct rpc_version * nfs_cb_version[] = { - NULL, &nfs_cb_version4, }; @@ -456,15 +458,14 @@ static struct rpc_program cb_program = { static int max_cb_time(void) { - return max(NFSD_LEASE_TIME/10, (time_t)1) * HZ; + return max(nfsd4_lease/10, (time_t)1) * HZ; } /* Reference counting, callback cleanup, etc., all look racy as heck. - * And why is cb_set an atomic? */ + * And why is cl_cb_set an atomic? */ -int setup_callback_client(struct nfs4_client *clp) +int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb) { - struct nfs4_cb_conn *cb = &clp->cl_cb_conn; struct rpc_timeout timeparms = { .to_initval = max_cb_time(), .to_retries = 0, @@ -476,7 +477,7 @@ int setup_callback_client(struct nfs4_client *clp) .timeout = &timeparms, .program = &cb_program, .prognumber = cb->cb_prog, - .version = nfs_cb_version[1]->number, + .version = 0, .authflavor = clp->cl_flavor, .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), .client_name = clp->cl_principal, @@ -486,7 +487,7 @@ int setup_callback_client(struct nfs4_client *clp) if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) return -EINVAL; if (cb->cb_minorversion) { - args.bc_xprt = clp->cl_cb_xprt; + args.bc_xprt = cb->cb_xprt; args.protocol = XPRT_TRANSPORT_BC_TCP; } /* Create RPC client */ @@ -496,7 +497,7 @@ int setup_callback_client(struct nfs4_client *clp) PTR_ERR(client)); return PTR_ERR(client); } - cb->cb_client = client; + nfsd4_set_callback_client(clp, client); return 0; } @@ -514,8 +515,7 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) if (task->tk_status) warn_no_callback_path(clp, task->tk_status); else - atomic_set(&clp->cl_cb_conn.cb_set, 1); - put_nfs4_client(clp); + atomic_set(&clp->cl_cb_set, 1); } static const struct rpc_call_ops nfsd4_cb_probe_ops = { @@ -537,7 +537,6 @@ int set_callback_cred(void) void do_probe_callback(struct nfs4_client *clp) { - struct nfs4_cb_conn *cb = &clp->cl_cb_conn; struct rpc_message msg = { .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], .rpc_argp = clp, @@ -545,34 +544,27 @@ void do_probe_callback(struct nfs4_client *clp) }; int status; - status = rpc_call_async(cb->cb_client, &msg, + status = rpc_call_async(clp->cl_cb_client, &msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, &nfsd4_cb_probe_ops, (void *)clp); - if (status) { + if (status) warn_no_callback_path(clp, status); - put_nfs4_client(clp); - } } /* * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... */ -void -nfsd4_probe_callback(struct nfs4_client *clp) +void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb) { int status; - BUG_ON(atomic_read(&clp->cl_cb_conn.cb_set)); + BUG_ON(atomic_read(&clp->cl_cb_set)); - status = setup_callback_client(clp); + status = setup_callback_client(clp, cb); if (status) { warn_no_callback_path(clp, status); return; } - - /* the task holds a reference to the nfs4_client struct */ - atomic_inc(&clp->cl_count); - do_probe_callback(clp); } @@ -658,18 +650,32 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) } } + static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) { struct nfs4_delegation *dp = calldata; struct nfs4_client *clp = dp->dl_client; + struct rpc_clnt *current_rpc_client = clp->cl_cb_client; nfsd4_cb_done(task, calldata); + if (current_rpc_client == NULL) { + /* We're shutting down; give up. */ + /* XXX: err, or is it ok just to fall through + * and rpc_restart_call? */ + return; + } + switch (task->tk_status) { case -EIO: /* Network partition? */ - atomic_set(&clp->cl_cb_conn.cb_set, 0); + atomic_set(&clp->cl_cb_set, 0); warn_no_callback_path(clp, task->tk_status); + if (current_rpc_client != task->tk_client) { + /* queue a callback on the new connection: */ + nfsd4_cb_recall(dp); + return; + } case -EBADHANDLE: case -NFS4ERR_BAD_STATEID: /* Race: client probably got cb_recall @@ -677,7 +683,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) break; default: /* success, or error we can't handle */ - goto done; + return; } if (dp->dl_retries--) { rpc_delay(task, 2*HZ); @@ -685,20 +691,16 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) rpc_restart_call(task); return; } else { - atomic_set(&clp->cl_cb_conn.cb_set, 0); + atomic_set(&clp->cl_cb_set, 0); warn_no_callback_path(clp, task->tk_status); } -done: - kfree(task->tk_msg.rpc_argp); } static void nfsd4_cb_recall_release(void *calldata) { struct nfs4_delegation *dp = calldata; - struct nfs4_client *clp = dp->dl_client; nfs4_put_delegation(dp); - put_nfs4_client(clp); } static const struct rpc_call_ops nfsd4_cb_recall_ops = { @@ -707,33 +709,75 @@ static const struct rpc_call_ops nfsd4_cb_recall_ops = { .rpc_release = nfsd4_cb_recall_release, }; +static struct workqueue_struct *callback_wq; + +int nfsd4_create_callback_queue(void) +{ + callback_wq = create_singlethread_workqueue("nfsd4_callbacks"); + if (!callback_wq) + return -ENOMEM; + return 0; +} + +void nfsd4_destroy_callback_queue(void) +{ + destroy_workqueue(callback_wq); +} + +/* must be called under the state lock */ +void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new) +{ + struct rpc_clnt *old = clp->cl_cb_client; + + clp->cl_cb_client = new; + /* + * After this, any work that saw the old value of cl_cb_client will + * be gone: + */ + flush_workqueue(callback_wq); + /* So we can safely shut it down: */ + if (old) + rpc_shutdown_client(old); +} + /* * called with dp->dl_count inc'ed. */ -void -nfsd4_cb_recall(struct nfs4_delegation *dp) +static void _nfsd4_cb_recall(struct nfs4_delegation *dp) { struct nfs4_client *clp = dp->dl_client; - struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client; - struct nfs4_rpc_args *args; + struct rpc_clnt *clnt = clp->cl_cb_client; + struct nfs4_rpc_args *args = &dp->dl_recall.cb_args; struct rpc_message msg = { .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], .rpc_cred = callback_cred }; - int status = -ENOMEM; + int status; + + if (clnt == NULL) + return; /* Client is shutting down; give up. */ - args = kzalloc(sizeof(*args), GFP_KERNEL); - if (!args) - goto out; args->args_op = dp; msg.rpc_argp = args; dp->dl_retries = 1; status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, &nfsd4_cb_recall_ops, dp); -out: - if (status) { - kfree(args); - put_nfs4_client(clp); + if (status) nfs4_put_delegation(dp); - } +} + +void nfsd4_do_callback_rpc(struct work_struct *w) +{ + /* XXX: for now, just send off delegation recall. */ + /* In future, generalize to handle any sort of callback. */ + struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work); + struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall); + + _nfsd4_cb_recall(dp); +} + + +void nfsd4_cb_recall(struct nfs4_delegation *dp) +{ + queue_work(callback_wq, &dp->dl_recall.cb_work); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 2ab9e85..59ec449 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -969,20 +969,36 @@ static struct nfsd4_operation nfsd4_ops[]; static const char *nfsd4_op_name(unsigned opnum); /* - * Enforce NFSv4.1 COMPOUND ordering rules. + * Enforce NFSv4.1 COMPOUND ordering rules: * - * TODO: - * - enforce NFS4ERR_NOT_ONLY_OP, - * - DESTROY_SESSION MUST be the final operation in the COMPOUND request. + * Also note, enforced elsewhere: + * - SEQUENCE other than as first op results in + * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().) + * - BIND_CONN_TO_SESSION must be the only op in its compound + * (Will be enforced in nfsd4_bind_conn_to_session().) + * - DESTROY_SESSION must be the final operation in a compound, if + * sessionid's in SEQUENCE and DESTROY_SESSION are the same. + * (Enforced in nfsd4_destroy_session().) */ -static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args) +static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args) { - if (args->minorversion && args->opcnt > 0) { - struct nfsd4_op *op = &args->ops[0]; - return (op->status == nfserr_op_illegal) || - (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP); - } - return true; + struct nfsd4_op *op = &args->ops[0]; + + /* These ordering requirements don't apply to NFSv4.0: */ + if (args->minorversion == 0) + return nfs_ok; + /* This is weird, but OK, not our problem: */ + if (args->opcnt == 0) + return nfs_ok; + if (op->status == nfserr_op_illegal) + return nfs_ok; + if (!(nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP)) + return nfserr_op_not_in_session; + if (op->opnum == OP_SEQUENCE) + return nfs_ok; + if (args->opcnt != 1) + return nfserr_not_only_op; + return nfs_ok; } /* @@ -1012,6 +1028,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, resp->rqstp = rqstp; resp->cstate.minorversion = args->minorversion; resp->cstate.replay_owner = NULL; + resp->cstate.session = NULL; fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); /* Use the deferral mechanism only for NFSv4.0 compounds */ @@ -1024,13 +1041,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, if (args->minorversion > nfsd_supported_minorversion) goto out; - if (!nfs41_op_ordering_ok(args)) { + status = nfs41_check_op_ordering(args); + if (status) { op = &args->ops[0]; - op->status = nfserr_sequence_pos; + op->status = status; goto encode_op; } - status = nfs_ok; while (!status && resp->opcnt < args->opcnt) { op = &args->ops[resp->opcnt++]; @@ -1295,6 +1312,11 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, .op_name = "OP_SEQUENCE", }, + [OP_RECLAIM_COMPLETE] = { + .op_func = (nfsd4op_func)nfsd4_reclaim_complete, + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_RECLAIM_COMPLETE", + }, }; static const char *nfsd4_op_name(unsigned opnum) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6a8feda..12f7109 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -45,8 +45,8 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC /* Globals */ -static time_t lease_time = 90; /* default lease time */ -static time_t user_lease_time = 90; +time_t nfsd4_lease = 90; /* default lease time */ +time_t nfsd4_grace = 90; static time_t boot_time; static u32 current_ownerid = 1; static u32 current_fileid = 1; @@ -190,7 +190,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f dp->dl_vfs_file = stp->st_vfs_file; dp->dl_type = type; dp->dl_ident = cb->cb_ident; - dp->dl_stateid.si_boot = get_seconds(); + dp->dl_stateid.si_boot = boot_time; dp->dl_stateid.si_stateownerid = current_delegid++; dp->dl_stateid.si_fileid = 0; dp->dl_stateid.si_generation = 0; @@ -199,6 +199,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f atomic_set(&dp->dl_count, 1); list_add(&dp->dl_perfile, &fp->fi_delegations); list_add(&dp->dl_perclnt, &clp->cl_delegations); + INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc); return dp; } @@ -249,6 +250,9 @@ unhash_delegation(struct nfs4_delegation *dp) * SETCLIENTID state */ +/* client_lock protects the client lru list and session hash table */ +static DEFINE_SPINLOCK(client_lock); + /* Hash tables for nfs4_clientid state */ #define CLIENT_HASH_BITS 4 #define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) @@ -367,7 +371,6 @@ static void release_openowner(struct nfs4_stateowner *sop) nfs4_put_stateowner(sop); } -static DEFINE_SPINLOCK(sessionid_lock); #define SESSION_HASH_SIZE 512 static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE]; @@ -565,10 +568,10 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, new->se_flags = cses->flags; kref_init(&new->se_ref); - spin_lock(&sessionid_lock); + spin_lock(&client_lock); list_add(&new->se_hash, &sessionid_hashtbl[idx]); list_add(&new->se_perclnt, &clp->cl_sessions); - spin_unlock(&sessionid_lock); + spin_unlock(&client_lock); status = nfs_ok; out: @@ -579,7 +582,7 @@ out_free: goto out; } -/* caller must hold sessionid_lock */ +/* caller must hold client_lock */ static struct nfsd4_session * find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid) { @@ -602,7 +605,7 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid) return NULL; } -/* caller must hold sessionid_lock */ +/* caller must hold client_lock */ static void unhash_session(struct nfsd4_session *ses) { @@ -610,15 +613,6 @@ unhash_session(struct nfsd4_session *ses) list_del(&ses->se_perclnt); } -static void -release_session(struct nfsd4_session *ses) -{ - spin_lock(&sessionid_lock); - unhash_session(ses); - spin_unlock(&sessionid_lock); - nfsd4_put_session(ses); -} - void free_session(struct kref *kref) { @@ -634,9 +628,18 @@ free_session(struct kref *kref) kfree(ses); } +/* must be called under the client_lock */ static inline void -renew_client(struct nfs4_client *clp) +renew_client_locked(struct nfs4_client *clp) { + if (is_client_expired(clp)) { + dprintk("%s: client (clientid %08x/%08x) already expired\n", + __func__, + clp->cl_clientid.cl_boot, + clp->cl_clientid.cl_id); + return; + } + /* * Move client to the end to the LRU list. */ @@ -647,6 +650,14 @@ renew_client(struct nfs4_client *clp) clp->cl_time = get_seconds(); } +static inline void +renew_client(struct nfs4_client *clp) +{ + spin_lock(&client_lock); + renew_client_locked(clp); + spin_unlock(&client_lock); +} + /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ static int STALE_CLIENTID(clientid_t *clid) @@ -680,27 +691,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) return clp; } -static void -shutdown_callback_client(struct nfs4_client *clp) -{ - struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client; - - if (clnt) { - /* - * Callback threads take a reference on the client, so there - * should be no outstanding callbacks at this point. - */ - clp->cl_cb_conn.cb_client = NULL; - rpc_shutdown_client(clnt); - } -} - static inline void free_client(struct nfs4_client *clp) { - shutdown_callback_client(clp); - if (clp->cl_cb_xprt) - svc_xprt_put(clp->cl_cb_xprt); if (clp->cl_cred.cr_group_info) put_group_info(clp->cl_cred.cr_group_info); kfree(clp->cl_principal); @@ -709,10 +702,34 @@ free_client(struct nfs4_client *clp) } void -put_nfs4_client(struct nfs4_client *clp) +release_session_client(struct nfsd4_session *session) { - if (atomic_dec_and_test(&clp->cl_count)) + struct nfs4_client *clp = session->se_client; + + if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock)) + return; + if (is_client_expired(clp)) { free_client(clp); + session->se_client = NULL; + } else + renew_client_locked(clp); + spin_unlock(&client_lock); + nfsd4_put_session(session); +} + +/* must be called under the client_lock */ +static inline void +unhash_client_locked(struct nfs4_client *clp) +{ + mark_client_expired(clp); + list_del(&clp->cl_lru); + while (!list_empty(&clp->cl_sessions)) { + struct nfsd4_session *ses; + ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, + se_perclnt); + unhash_session(ses); + nfsd4_put_session(ses); + } } static void @@ -722,9 +739,6 @@ expire_client(struct nfs4_client *clp) struct nfs4_delegation *dp; struct list_head reaplist; - dprintk("NFSD: expire_client cl_count %d\n", - atomic_read(&clp->cl_count)); - INIT_LIST_HEAD(&reaplist); spin_lock(&recall_lock); while (!list_empty(&clp->cl_delegations)) { @@ -740,20 +754,20 @@ expire_client(struct nfs4_client *clp) list_del_init(&dp->dl_recall_lru); unhash_delegation(dp); } - list_del(&clp->cl_idhash); - list_del(&clp->cl_strhash); - list_del(&clp->cl_lru); while (!list_empty(&clp->cl_openowners)) { sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); release_openowner(sop); } - while (!list_empty(&clp->cl_sessions)) { - struct nfsd4_session *ses; - ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, - se_perclnt); - release_session(ses); - } - put_nfs4_client(clp); + nfsd4_set_callback_client(clp, NULL); + if (clp->cl_cb_conn.cb_xprt) + svc_xprt_put(clp->cl_cb_conn.cb_xprt); + list_del(&clp->cl_idhash); + list_del(&clp->cl_strhash); + spin_lock(&client_lock); + unhash_client_locked(clp); + if (atomic_read(&clp->cl_refcount) == 0) + free_client(clp); + spin_unlock(&client_lock); } static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) @@ -839,14 +853,15 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, } memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); - atomic_set(&clp->cl_count, 1); - atomic_set(&clp->cl_cb_conn.cb_set, 0); + atomic_set(&clp->cl_refcount, 0); + atomic_set(&clp->cl_cb_set, 0); INIT_LIST_HEAD(&clp->cl_idhash); INIT_LIST_HEAD(&clp->cl_strhash); INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); INIT_LIST_HEAD(&clp->cl_sessions); INIT_LIST_HEAD(&clp->cl_lru); + clp->cl_time = get_seconds(); clear_bit(0, &clp->cl_cb_slot_busy); rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); copy_verf(clp, verf); @@ -877,8 +892,7 @@ add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval) list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]); idhashval = clientid_hashval(clp->cl_clientid.cl_id); list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]); - list_add_tail(&clp->cl_lru, &client_lru); - clp->cl_time = get_seconds(); + renew_client(clp); } static void @@ -888,10 +902,9 @@ move_to_confirmed(struct nfs4_client *clp) unsigned int strhashval; dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); - list_del_init(&clp->cl_strhash); list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); strhashval = clientstr_hashval(clp->cl_recdir); - list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); + list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); renew_client(clp); } @@ -1327,15 +1340,9 @@ nfsd4_create_session(struct svc_rqst *rqstp, cs_slot->sl_seqid++; /* from 0 to 1 */ move_to_confirmed(unconf); - /* - * We do not support RDMA or persistent sessions - */ - cr_ses->flags &= ~SESSION4_PERSIST; - cr_ses->flags &= ~SESSION4_RDMA; - if (cr_ses->flags & SESSION4_BACK_CHAN) { - unconf->cl_cb_xprt = rqstp->rq_xprt; - svc_xprt_get(unconf->cl_cb_xprt); + unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; + svc_xprt_get(rqstp->rq_xprt); rpc_copy_addr( (struct sockaddr *)&unconf->cl_cb_conn.cb_addr, sa); @@ -1344,7 +1351,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, cstate->minorversion; unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog; unconf->cl_cb_seq_nr = 1; - nfsd4_probe_callback(unconf); + nfsd4_probe_callback(unconf, &unconf->cl_cb_conn); } conf = unconf; } else { @@ -1352,6 +1359,12 @@ nfsd4_create_session(struct svc_rqst *rqstp, goto out; } + /* + * We do not support RDMA or persistent sessions + */ + cr_ses->flags &= ~SESSION4_PERSIST; + cr_ses->flags &= ~SESSION4_RDMA; + status = alloc_init_session(rqstp, conf, cr_ses); if (status) goto out; @@ -1369,6 +1382,21 @@ out: return status; } +static bool nfsd4_last_compound_op(struct svc_rqst *rqstp) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + + return argp->opcnt == resp->opcnt; +} + +static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) +{ + if (!session) + return 0; + return !memcmp(sid, &session->se_sessionid, sizeof(*sid)); +} + __be32 nfsd4_destroy_session(struct svc_rqst *r, struct nfsd4_compound_state *cstate, @@ -1384,19 +1412,25 @@ nfsd4_destroy_session(struct svc_rqst *r, * - Do we need to clear any callback info from previous session? */ + if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) { + if (!nfsd4_last_compound_op(r)) + return nfserr_not_only_op; + } dump_sessionid(__func__, &sessionid->sessionid); - spin_lock(&sessionid_lock); + spin_lock(&client_lock); ses = find_in_sessionid_hashtbl(&sessionid->sessionid); if (!ses) { - spin_unlock(&sessionid_lock); + spin_unlock(&client_lock); goto out; } unhash_session(ses); - spin_unlock(&sessionid_lock); + spin_unlock(&client_lock); + nfs4_lock_state(); /* wait for callbacks */ - shutdown_callback_client(ses->se_client); + nfsd4_set_callback_client(ses->se_client, NULL); + nfs4_unlock_state(); nfsd4_put_session(ses); status = nfs_ok; out: @@ -1417,7 +1451,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, if (resp->opcnt != 1) return nfserr_sequence_pos; - spin_lock(&sessionid_lock); + spin_lock(&client_lock); status = nfserr_badsession; session = find_in_sessionid_hashtbl(&seq->sessionid); if (!session) @@ -1456,23 +1490,47 @@ nfsd4_sequence(struct svc_rqst *rqstp, cstate->slot = slot; cstate->session = session; - /* Hold a session reference until done processing the compound: - * nfsd4_put_session called only if the cstate slot is set. - */ - nfsd4_get_session(session); out: - spin_unlock(&sessionid_lock); - /* Renew the clientid on success and on replay */ + /* Hold a session reference until done processing the compound. */ if (cstate->session) { - nfs4_lock_state(); - renew_client(session->se_client); - nfs4_unlock_state(); + nfsd4_get_session(cstate->session); + atomic_inc(&session->se_client->cl_refcount); } + spin_unlock(&client_lock); dprintk("%s: return %d\n", __func__, ntohl(status)); return status; } __be32 +nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) +{ + if (rc->rca_one_fs) { + if (!cstate->current_fh.fh_dentry) + return nfserr_nofilehandle; + /* + * We don't take advantage of the rca_one_fs case. + * That's OK, it's optional, we can safely ignore it. + */ + return nfs_ok; + } + nfs4_lock_state(); + if (is_client_expired(cstate->session->se_client)) { + nfs4_unlock_state(); + /* + * The following error isn't really legal. + * But we only get here if the client just explicitly + * destroyed the client. Surely it no longer cares what + * error it gets back on an operation for the dead + * client. + */ + return nfserr_stale_clientid; + } + nfsd4_create_clid_dir(cstate->session->se_client); + nfs4_unlock_state(); + return nfs_ok; +} + +__be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid *setclid) { @@ -1631,9 +1689,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) status = nfserr_clid_inuse; else { - /* XXX: We just turn off callbacks until we can handle - * change request correctly. */ - atomic_set(&conf->cl_cb_conn.cb_set, 0); + atomic_set(&conf->cl_cb_set, 0); + nfsd4_probe_callback(conf, &unconf->cl_cb_conn); expire_client(unconf); status = nfs_ok; @@ -1667,7 +1724,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, } move_to_confirmed(unconf); conf = unconf; - nfsd4_probe_callback(conf); + nfsd4_probe_callback(conf, &conf->cl_cb_conn); status = nfs_ok; } } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) @@ -1700,12 +1757,12 @@ alloc_init_file(struct inode *ino) INIT_LIST_HEAD(&fp->fi_hash); INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); - spin_lock(&recall_lock); - list_add(&fp->fi_hash, &file_hashtbl[hashval]); - spin_unlock(&recall_lock); fp->fi_inode = igrab(ino); fp->fi_id = current_fileid++; fp->fi_had_conflict = false; + spin_lock(&recall_lock); + list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); return fp; } return NULL; @@ -1827,7 +1884,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open * stp->st_stateowner = sop; get_nfs4_file(fp); stp->st_file = fp; - stp->st_stateid.si_boot = get_seconds(); + stp->st_stateid.si_boot = boot_time; stp->st_stateid.si_stateownerid = sop->so_id; stp->st_stateid.si_fileid = fp->fi_id; stp->st_stateid.si_generation = 0; @@ -2028,7 +2085,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl) * lock) we know the server hasn't removed the lease yet, we know * it's safe to take a reference: */ atomic_inc(&dp->dl_count); - atomic_inc(&dp->dl_client->cl_count); spin_lock(&recall_lock); list_add_tail(&dp->dl_recall_lru, &del_recall_lru); @@ -2347,7 +2403,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta { struct nfs4_delegation *dp; struct nfs4_stateowner *sop = stp->st_stateowner; - struct nfs4_cb_conn *cb = &sop->so_client->cl_cb_conn; + int cb_up = atomic_read(&sop->so_client->cl_cb_set); struct file_lock fl, *flp = &fl; int status, flag = 0; @@ -2355,7 +2411,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta open->op_recall = 0; switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_PREVIOUS: - if (!atomic_read(&cb->cb_set)) + if (!cb_up) open->op_recall = 1; flag = open->op_delegate_type; if (flag == NFS4_OPEN_DELEGATE_NONE) @@ -2366,7 +2422,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta * had the chance to reclaim theirs.... */ if (locks_in_grace()) goto out; - if (!atomic_read(&cb->cb_set) || !sop->so_confirmed) + if (!cb_up || !sop->so_confirmed) goto out; if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) flag = NFS4_OPEN_DELEGATE_WRITE; @@ -2483,10 +2539,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf } memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); - if (nfsd4_has_session(&resp->cstate)) { + if (nfsd4_has_session(&resp->cstate)) open->op_stateowner->so_confirmed = 1; - nfsd4_create_clid_dir(open->op_stateowner->so_client); - } /* * Attempt to hand out a delegation. No error return, because the @@ -2537,7 +2591,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, renew_client(clp); status = nfserr_cb_path_down; if (!list_empty(&clp->cl_delegations) - && !atomic_read(&clp->cl_cb_conn.cb_set)) + && !atomic_read(&clp->cl_cb_set)) goto out; status = nfs_ok; out: @@ -2554,6 +2608,12 @@ nfsd4_end_grace(void) dprintk("NFSD: end of grace period\n"); nfsd4_recdir_purge_old(); locks_end_grace(&nfsd4_manager); + /* + * Now that every NFSv4 client has had the chance to recover and + * to see the (possibly new, possibly shorter) lease time, we + * can safely set the next grace time to the current lease time: + */ + nfsd4_grace = nfsd4_lease; } static time_t @@ -2563,15 +2623,17 @@ nfs4_laundromat(void) struct nfs4_stateowner *sop; struct nfs4_delegation *dp; struct list_head *pos, *next, reaplist; - time_t cutoff = get_seconds() - NFSD_LEASE_TIME; - time_t t, clientid_val = NFSD_LEASE_TIME; - time_t u, test_val = NFSD_LEASE_TIME; + time_t cutoff = get_seconds() - nfsd4_lease; + time_t t, clientid_val = nfsd4_lease; + time_t u, test_val = nfsd4_lease; nfs4_lock_state(); dprintk("NFSD: laundromat service - starting\n"); if (locks_in_grace()) nfsd4_end_grace(); + INIT_LIST_HEAD(&reaplist); + spin_lock(&client_lock); list_for_each_safe(pos, next, &client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { @@ -2580,12 +2642,22 @@ nfs4_laundromat(void) clientid_val = t; break; } + if (atomic_read(&clp->cl_refcount)) { + dprintk("NFSD: client in use (clientid %08x)\n", + clp->cl_clientid.cl_id); + continue; + } + unhash_client_locked(clp); + list_add(&clp->cl_lru, &reaplist); + } + spin_unlock(&client_lock); + list_for_each_safe(pos, next, &reaplist) { + clp = list_entry(pos, struct nfs4_client, cl_lru); dprintk("NFSD: purging unused client (clientid %08x)\n", clp->cl_clientid.cl_id); nfsd4_remove_clid_dir(clp); expire_client(clp); } - INIT_LIST_HEAD(&reaplist); spin_lock(&recall_lock); list_for_each_safe(pos, next, &del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); @@ -2605,7 +2677,7 @@ nfs4_laundromat(void) list_del_init(&dp->dl_recall_lru); unhash_delegation(dp); } - test_val = NFSD_LEASE_TIME; + test_val = nfsd4_lease; list_for_each_safe(pos, next, &close_lru) { sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) { @@ -2661,39 +2733,11 @@ nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp) static int STALE_STATEID(stateid_t *stateid) { - if (time_after((unsigned long)boot_time, - (unsigned long)stateid->si_boot)) { - dprintk("NFSD: stale stateid " STATEID_FMT "!\n", - STATEID_VAL(stateid)); - return 1; - } - return 0; -} - -static int -EXPIRED_STATEID(stateid_t *stateid) -{ - if (time_before((unsigned long)boot_time, - ((unsigned long)stateid->si_boot)) && - time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) { - dprintk("NFSD: expired stateid " STATEID_FMT "!\n", - STATEID_VAL(stateid)); - return 1; - } - return 0; -} - -static __be32 -stateid_error_map(stateid_t *stateid) -{ - if (STALE_STATEID(stateid)) - return nfserr_stale_stateid; - if (EXPIRED_STATEID(stateid)) - return nfserr_expired; - - dprintk("NFSD: bad stateid " STATEID_FMT "!\n", + if (stateid->si_boot == boot_time) + return 0; + dprintk("NFSD: stale stateid " STATEID_FMT "!\n", STATEID_VAL(stateid)); - return nfserr_bad_stateid; + return 1; } static inline int @@ -2817,10 +2861,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, status = nfserr_bad_stateid; if (is_delegation_stateid(stateid)) { dp = find_delegation_stateid(ino, stateid); - if (!dp) { - status = stateid_error_map(stateid); + if (!dp) goto out; - } status = check_stateid_generation(stateid, &dp->dl_stateid, flags); if (status) @@ -2833,10 +2875,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, *filpp = dp->dl_vfs_file; } else { /* open or lock stateid */ stp = find_stateid(stateid, flags); - if (!stp) { - status = stateid_error_map(stateid); + if (!stp) goto out; - } if (nfs4_check_fh(current_fh, stp)) goto out; if (!stp->st_stateowner->so_confirmed) @@ -2908,7 +2948,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, */ sop = search_close_lru(stateid->si_stateownerid, flags); if (sop == NULL) - return stateid_error_map(stateid); + return nfserr_bad_stateid; *sopp = sop; goto check_replay; } @@ -3175,10 +3215,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!is_delegation_stateid(stateid)) goto out; dp = find_delegation_stateid(inode, stateid); - if (!dp) { - status = stateid_error_map(stateid); + if (!dp) goto out; - } status = check_stateid_generation(stateid, &dp->dl_stateid, flags); if (status) goto out; @@ -3404,7 +3442,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc stp->st_stateowner = sop; get_nfs4_file(fp); stp->st_file = fp; - stp->st_stateid.si_boot = get_seconds(); + stp->st_stateid.si_boot = boot_time; stp->st_stateid.si_stateownerid = sop->so_id; stp->st_stateid.si_fileid = fp->fi_id; stp->st_stateid.si_generation = 0; @@ -3976,12 +4014,6 @@ nfsd4_load_reboot_recovery_data(void) printk("NFSD: Failure reading reboot recovery data\n"); } -unsigned long -get_nfs4_grace_period(void) -{ - return max(user_lease_time, lease_time) * HZ; -} - /* * Since the lifetime of a delegation isn't limited to that of an open, a * client may quite reasonably hang on to a delegation as long as it has @@ -4008,20 +4040,27 @@ set_max_delegations(void) static int __nfs4_state_start(void) { - unsigned long grace_time; + int ret; boot_time = get_seconds(); - grace_time = get_nfs4_grace_period(); - lease_time = user_lease_time; locks_start_grace(&nfsd4_manager); printk(KERN_INFO "NFSD: starting %ld-second grace period\n", - grace_time/HZ); + nfsd4_grace); + ret = set_callback_cred(); + if (ret) + return -ENOMEM; laundry_wq = create_singlethread_workqueue("nfsd4"); if (laundry_wq == NULL) return -ENOMEM; - queue_delayed_work(laundry_wq, &laundromat_work, grace_time); + ret = nfsd4_create_callback_queue(); + if (ret) + goto out_free_laundry; + queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ); set_max_delegations(); - return set_callback_cred(); + return 0; +out_free_laundry: + destroy_workqueue(laundry_wq); + return ret; } int @@ -4039,12 +4078,6 @@ nfs4_state_start(void) return 0; } -time_t -nfs4_lease_time(void) -{ - return lease_time; -} - static void __nfs4_state_shutdown(void) { @@ -4089,6 +4122,7 @@ nfs4_state_shutdown(void) nfs4_lock_state(); nfs4_release_reclaim(); __nfs4_state_shutdown(); + nfsd4_destroy_callback_queue(); nfs4_unlock_state(); } @@ -4128,21 +4162,3 @@ nfs4_recoverydir(void) { return user_recovery_dirname; } - -/* - * Called when leasetime is changed. - * - * The only way the protocol gives us to handle on-the-fly lease changes is to - * simulate a reboot. Instead of doing that, we just wait till the next time - * we start to register any changes in lease time. If the administrator - * really wants to change the lease time *now*, they can go ahead and bring - * nfsd down and then back up again after changing the lease time. - * - * user_lease_time is protected by nfsd_mutex since it's only really accessed - * when nfsd is starting - */ -void -nfs4_reset_lease(time_t leasetime) -{ - user_lease_time = leasetime; -} diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 34ccf81..ac17a70 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1234,6 +1234,16 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, DECODE_TAIL; } +static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc) +{ + DECODE_HEAD; + + READ_BUF(4); + READ32(rc->rca_one_fs); + + DECODE_TAIL; +} + static __be32 nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) { @@ -1346,7 +1356,7 @@ static nfsd4_dec nfsd41_dec_ops[] = { [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, }; struct nfsd4_minorversion_ops { @@ -1900,7 +1910,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if (bmval0 & FATTR4_WORD0_LEASE_TIME) { if ((buflen -= 4) < 0) goto out_resource; - WRITE32(NFSD_LEASE_TIME); + WRITE32(nfsd4_lease); } if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { if ((buflen -= 4) < 0) @@ -3307,11 +3317,14 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo iov = &rqstp->rq_res.head[0]; iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; BUG_ON(iov->iov_len > PAGE_SIZE); - if (nfsd4_has_session(cs) && cs->status != nfserr_replay_cache) { - nfsd4_store_cache_entry(resp); - dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__); - resp->cstate.slot->sl_inuse = false; - nfsd4_put_session(resp->cstate.session); + if (nfsd4_has_session(cs)) { + if (cs->status != nfserr_replay_cache) { + nfsd4_store_cache_entry(resp); + dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__); + cs->slot->sl_inuse = false; + } + /* Renew the clientid on success and on replay */ + release_session_client(cs->session); } return 1; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index e359107..bc3194e 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -46,6 +46,7 @@ enum { */ #ifdef CONFIG_NFSD_V4 NFSD_Leasetime, + NFSD_Gracetime, NFSD_RecoveryDir, #endif }; @@ -70,6 +71,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size); static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); #ifdef CONFIG_NFSD_V4 static ssize_t write_leasetime(struct file *file, char *buf, size_t size); +static ssize_t write_gracetime(struct file *file, char *buf, size_t size); static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); #endif @@ -91,6 +93,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = { [NFSD_MaxBlkSize] = write_maxblksize, #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = write_leasetime, + [NFSD_Gracetime] = write_gracetime, [NFSD_RecoveryDir] = write_recoverydir, #endif }; @@ -1204,29 +1207,45 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) } #ifdef CONFIG_NFSD_V4 -extern time_t nfs4_leasetime(void); - -static ssize_t __write_leasetime(struct file *file, char *buf, size_t size) +static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time) { - /* if size > 10 seconds, call - * nfs4_reset_lease() then write out the new lease (seconds) as reply - */ char *mesg = buf; - int rv, lease; + int rv, i; if (size > 0) { if (nfsd_serv) return -EBUSY; - rv = get_int(&mesg, &lease); + rv = get_int(&mesg, &i); if (rv) return rv; - if (lease < 10 || lease > 3600) + /* + * Some sanity checking. We don't have a reason for + * these particular numbers, but problems with the + * extremes are: + * - Too short: the briefest network outage may + * cause clients to lose all their locks. Also, + * the frequent polling may be wasteful. + * - Too long: do you really want reboot recovery + * to take more than an hour? Or to make other + * clients wait an hour before being able to + * revoke a dead client's locks? + */ + if (i < 10 || i > 3600) return -EINVAL; - nfs4_reset_lease(lease); + *time = i; } - return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", - nfs4_lease_time()); + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time); +} + +static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __nfsd4_write_time(file, buf, size, time); + mutex_unlock(&nfsd_mutex); + return rv; } /** @@ -1252,12 +1271,22 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size) */ static ssize_t write_leasetime(struct file *file, char *buf, size_t size) { - ssize_t rv; + return nfsd4_write_time(file, buf, size, &nfsd4_lease); +} - mutex_lock(&nfsd_mutex); - rv = __write_leasetime(file, buf, size); - mutex_unlock(&nfsd_mutex); - return rv; +/** + * write_gracetime - Set or report current NFSv4 grace period time + * + * As above, but sets the time of the NFSv4 grace period. + * + * Note this should never be set to less than the *previous* + * lease-period time, but we don't try to enforce this. (In the common + * case (a new boot), we don't know what the previous lease time was + * anyway.) + */ +static ssize_t write_gracetime(struct file *file, char *buf, size_t size) +{ + return nfsd4_write_time(file, buf, size, &nfsd4_grace); } extern char *nfs4_recoverydir(void); @@ -1351,6 +1380,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, #endif /* last one */ {""} diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index e942a1a..7237776 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -82,7 +82,6 @@ int nfs4_state_init(void); void nfsd4_free_slabs(void); int nfs4_state_start(void); void nfs4_state_shutdown(void); -time_t nfs4_lease_time(void); void nfs4_reset_lease(time_t leasetime); int nfs4_reset_recoverydir(char *recdir); #else @@ -90,7 +89,6 @@ static inline int nfs4_state_init(void) { return 0; } static inline void nfsd4_free_slabs(void) { } static inline int nfs4_state_start(void) { return 0; } static inline void nfs4_state_shutdown(void) { } -static inline time_t nfs4_lease_time(void) { return 0; } static inline void nfs4_reset_lease(time_t leasetime) { } static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } #endif @@ -229,6 +227,9 @@ extern struct timeval nfssvc_boot; #ifdef CONFIG_NFSD_V4 +extern time_t nfsd4_lease; +extern time_t nfsd4_grace; + /* before processing a COMPOUND operation, we have to check that there * is enough space in the buffer for XDR encode to succeed. otherwise, * we might process an operation with side effects, and be unable to @@ -247,7 +248,6 @@ extern struct timeval nfssvc_boot; #define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ #define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ -#define NFSD_LEASE_TIME (nfs4_lease_time()) #define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */ /* diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 171699e..06b2a26 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -120,7 +120,7 @@ u32 nfsd_supported_minorversion; int nfsd_vers(int vers, enum vers_op change) { if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) - return -1; + return 0; switch(change) { case NFSD_SET: nfsd_versions[vers] = nfsd_version[vers]; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index fefeae2..006c842 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -70,6 +70,16 @@ struct nfsd4_cb_sequence { struct nfs4_client *cbs_clp; }; +struct nfs4_rpc_args { + void *args_op; + struct nfsd4_cb_sequence args_seq; +}; + +struct nfsd4_callback { + struct nfs4_rpc_args cb_args; + struct work_struct cb_work; +}; + struct nfs4_delegation { struct list_head dl_perfile; struct list_head dl_perclnt; @@ -86,6 +96,7 @@ struct nfs4_delegation { stateid_t dl_stateid; struct knfsd_fh dl_fh; int dl_retries; + struct nfsd4_callback dl_recall; }; /* client delegation callback info */ @@ -96,9 +107,7 @@ struct nfs4_cb_conn { u32 cb_prog; u32 cb_minorversion; u32 cb_ident; /* minorversion 0 only */ - /* RPC client info */ - atomic_t cb_set; /* successful CB_NULL call */ - struct rpc_clnt * cb_client; + struct svc_xprt *cb_xprt; /* minorversion 1 only */ }; /* Maximum number of slots per session. 160 is useful for long haul TCP */ @@ -157,7 +166,7 @@ struct nfsd4_session { struct list_head se_hash; /* hash by sessionid */ struct list_head se_perclnt; u32 se_flags; - struct nfs4_client *se_client; /* for expire_client */ + struct nfs4_client *se_client; struct nfs4_sessionid se_sessionid; struct nfsd4_channel_attrs se_fchannel; struct nfsd4_channel_attrs se_bchannel; @@ -212,25 +221,41 @@ struct nfs4_client { struct svc_cred cl_cred; /* setclientid principal */ clientid_t cl_clientid; /* generated by server */ nfs4_verifier cl_confirm; /* generated by server */ - struct nfs4_cb_conn cl_cb_conn; /* callback info */ - atomic_t cl_count; /* ref count */ u32 cl_firststate; /* recovery dir creation */ + /* for v4.0 and v4.1 callbacks: */ + struct nfs4_cb_conn cl_cb_conn; + struct rpc_clnt *cl_cb_client; + atomic_t cl_cb_set; + /* for nfs41 */ struct list_head cl_sessions; struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ u32 cl_exchange_flags; struct nfs4_sessionid cl_sessionid; + /* number of rpc's in progress over an associated session: */ + atomic_t cl_refcount; /* for nfs41 callbacks */ /* We currently support a single back channel with a single slot */ unsigned long cl_cb_slot_busy; u32 cl_cb_seq_nr; - struct svc_xprt *cl_cb_xprt; /* 4.1 callback transport */ struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ /* wait here for slots */ }; +static inline void +mark_client_expired(struct nfs4_client *clp) +{ + clp->cl_time = 0; +} + +static inline bool +is_client_expired(struct nfs4_client *clp) +{ + return clp->cl_time == 0; +} + /* struct nfs4_client_reset * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl * upon lease reset, or from upcall to state_daemon (to read in state @@ -377,11 +402,14 @@ extern void nfs4_lock_state(void); extern void nfs4_unlock_state(void); extern int nfs4_in_grace(void); extern __be32 nfs4_check_open_reclaim(clientid_t *clid); -extern void put_nfs4_client(struct nfs4_client *clp); extern void nfs4_free_stateowner(struct kref *kref); extern int set_callback_cred(void); -extern void nfsd4_probe_callback(struct nfs4_client *clp); +extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); +extern void nfsd4_do_callback_rpc(struct work_struct *); extern void nfsd4_cb_recall(struct nfs4_delegation *dp); +extern int nfsd4_create_callback_queue(void); +extern void nfsd4_destroy_callback_queue(void); +extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *); extern void nfs4_put_delegation(struct nfs4_delegation *dp); extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); extern void nfsd4_init_recdir(char *recdir_name); @@ -392,6 +420,7 @@ extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id); extern void nfsd4_recdir_purge_old(void); extern int nfsd4_create_clid_dir(struct nfs4_client *clp); extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); +extern void release_session_client(struct nfsd4_session *); static inline void nfs4_put_stateowner(struct nfs4_stateowner *so) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6dd5f19..23c06f7 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -724,7 +724,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, struct inode *inode; int flags = O_RDONLY|O_LARGEFILE; __be32 err; - int host_err; + int host_err = 0; validate_process_creds(); @@ -761,7 +761,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, * Check to see if there are any leases on this file. * This may block while leases are broken. */ - host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0)); + if (!(access & NFSD_MAY_NOT_BREAK_LEASE)) + host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0)); if (host_err == -EWOULDBLOCK) host_err = -ETIMEDOUT; if (host_err) /* NOMEM or WOULDBLOCK */ @@ -1169,7 +1170,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; } - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); + err = nfsd_open(rqstp, fhp, S_IFREG, + NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file); if (err) goto out; if (EX_ISSYNC(fhp->fh_export)) { diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 4b1de0a..217a62c 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -20,6 +20,7 @@ #define NFSD_MAY_OWNER_OVERRIDE 64 #define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ #define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 +#define NFSD_MAY_NOT_BREAK_LEASE 512 #define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) #define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index efa3377..4d476ff 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -381,6 +381,10 @@ struct nfsd4_destroy_session { struct nfs4_sessionid sessionid; }; +struct nfsd4_reclaim_complete { + u32 rca_one_fs; +}; + struct nfsd4_op { int opnum; __be32 status; @@ -421,6 +425,7 @@ struct nfsd4_op { struct nfsd4_create_session create_session; struct nfsd4_destroy_session destroy_session; struct nfsd4_sequence sequence; + struct nfsd4_reclaim_complete reclaim_complete; } u; struct nfs4_replay * replay; }; @@ -513,9 +518,8 @@ extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp); extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, struct nfsd4_sequence *seq); extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, - struct nfsd4_compound_state *, -struct nfsd4_exchange_id *); - extern __be32 nfsd4_create_session(struct svc_rqst *, + struct nfsd4_compound_state *, struct nfsd4_exchange_id *); +extern __be32 nfsd4_create_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_create_session *); extern __be32 nfsd4_sequence(struct svc_rqst *, @@ -524,6 +528,7 @@ extern __be32 nfsd4_sequence(struct svc_rqst *, extern __be32 nfsd4_destroy_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_session *); +__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *); extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, struct nfsd4_open *open); extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 7cfb87e..d7fd696 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -31,6 +31,11 @@ #include "alloc.h" +/** + * nilfs_palloc_groups_per_desc_block - get the number of groups that a group + * descriptor block can maintain + * @inode: inode of metadata file using this allocator + */ static inline unsigned long nilfs_palloc_groups_per_desc_block(const struct inode *inode) { @@ -38,12 +43,21 @@ nilfs_palloc_groups_per_desc_block(const struct inode *inode) sizeof(struct nilfs_palloc_group_desc); } +/** + * nilfs_palloc_groups_count - get maximum number of groups + * @inode: inode of metadata file using this allocator + */ static inline unsigned long nilfs_palloc_groups_count(const struct inode *inode) { return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */)); } +/** + * nilfs_palloc_init_blockgroup - initialize private variables for allocator + * @inode: inode of metadata file using this allocator + * @entry_size: size of the persistent object + */ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); @@ -69,6 +83,12 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) return 0; } +/** + * nilfs_palloc_group - get group number and offset from an entry number + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. inode number) + * @offset: pointer to store offset number in the group + */ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, unsigned long *offset) { @@ -78,6 +98,14 @@ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, return group; } +/** + * nilfs_palloc_desc_blkoff - get block offset of a group descriptor block + * @inode: inode of metadata file using this allocator + * @group: group number + * + * nilfs_palloc_desc_blkoff() returns block offset of the descriptor + * block which contains a descriptor of the specified group. + */ static unsigned long nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) { @@ -86,6 +114,14 @@ nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block; } +/** + * nilfs_palloc_bitmap_blkoff - get block offset of a bitmap block + * @inode: inode of metadata file using this allocator + * @group: group number + * + * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap + * block used to allocate/deallocate entries in the specified group. + */ static unsigned long nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) { @@ -95,6 +131,12 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) desc_offset * NILFS_MDT(inode)->mi_blocks_per_group; } +/** + * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group + * @inode: inode of metadata file using this allocator + * @group: group number + * @desc: pointer to descriptor structure for the group + */ static unsigned long nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group, const struct nilfs_palloc_group_desc *desc) @@ -107,6 +149,13 @@ nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group, return nfree; } +/** + * nilfs_palloc_group_desc_add_entries - adjust count of free entries + * @inode: inode of metadata file using this allocator + * @group: group number + * @desc: pointer to descriptor structure for the group + * @n: delta to be added + */ static void nilfs_palloc_group_desc_add_entries(struct inode *inode, unsigned long group, @@ -118,6 +167,11 @@ nilfs_palloc_group_desc_add_entries(struct inode *inode, spin_unlock(nilfs_mdt_bgl_lock(inode, group)); } +/** + * nilfs_palloc_entry_blkoff - get block offset of an entry block + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. inode number) + */ static unsigned long nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) { @@ -129,6 +183,12 @@ nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) group_offset / NILFS_MDT(inode)->mi_entries_per_block; } +/** + * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block + * @inode: inode of metadata file + * @bh: buffer head of the buffer to be initialized + * @kaddr: kernel address mapped for the page including the buffer + */ static void nilfs_palloc_desc_block_init(struct inode *inode, struct buffer_head *bh, void *kaddr) { @@ -179,6 +239,13 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff, return ret; } +/** + * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block + * @inode: inode of metadata file using this allocator + * @group: group number + * @create: create flag + * @bhp: pointer to store the resultant buffer head + */ static int nilfs_palloc_get_desc_block(struct inode *inode, unsigned long group, int create, struct buffer_head **bhp) @@ -191,6 +258,13 @@ static int nilfs_palloc_get_desc_block(struct inode *inode, bhp, &cache->prev_desc, &cache->lock); } +/** + * nilfs_palloc_get_bitmap_block - get buffer head of a bitmap block + * @inode: inode of metadata file using this allocator + * @group: group number + * @create: create flag + * @bhp: pointer to store the resultant buffer head + */ static int nilfs_palloc_get_bitmap_block(struct inode *inode, unsigned long group, int create, struct buffer_head **bhp) @@ -203,6 +277,13 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode, &cache->prev_bitmap, &cache->lock); } +/** + * nilfs_palloc_get_entry_block - get buffer head of an entry block + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. inode number) + * @create: create flag + * @bhp: pointer to store the resultant buffer head + */ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, int create, struct buffer_head **bhp) { @@ -214,6 +295,13 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, &cache->prev_entry, &cache->lock); } +/** + * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor + * @inode: inode of metadata file using this allocator + * @group: group number + * @bh: buffer head of the buffer storing the group descriptor block + * @kaddr: kernel address mapped for the page including the buffer + */ static struct nilfs_palloc_group_desc * nilfs_palloc_block_get_group_desc(const struct inode *inode, unsigned long group, @@ -223,6 +311,13 @@ nilfs_palloc_block_get_group_desc(const struct inode *inode, group % nilfs_palloc_groups_per_desc_block(inode); } +/** + * nilfs_palloc_block_get_entry - get kernel address of an entry + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry (e.g. inode number) + * @bh: buffer head of the buffer storing the entry block + * @kaddr: kernel address mapped for the page including the buffer + */ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, const struct buffer_head *bh, void *kaddr) { @@ -235,11 +330,19 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, entry_offset * NILFS_MDT(inode)->mi_entry_size; } +/** + * nilfs_palloc_find_available_slot - find available slot in a group + * @inode: inode of metadata file using this allocator + * @group: group number + * @target: offset number of an entry in the group (start point) + * @bitmap: bitmap of the group + * @bsize: size in bits + */ static int nilfs_palloc_find_available_slot(struct inode *inode, unsigned long group, unsigned long target, unsigned char *bitmap, - int bsize) /* size in bits */ + int bsize) { int curr, pos, end, i; @@ -277,6 +380,13 @@ static int nilfs_palloc_find_available_slot(struct inode *inode, return -ENOSPC; } +/** + * nilfs_palloc_rest_groups_in_desc_block - get the remaining number of groups + * in a group descriptor block + * @inode: inode of metadata file using this allocator + * @curr: current group number + * @max: maximum number of groups + */ static unsigned long nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, unsigned long curr, unsigned long max) @@ -287,6 +397,11 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, max - curr + 1); } +/** + * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the allocation + */ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req) { @@ -366,6 +481,11 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, return ret; } +/** + * nilfs_palloc_commit_alloc_entry - finish allocation of a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the allocation + */ void nilfs_palloc_commit_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req) { @@ -377,6 +497,11 @@ void nilfs_palloc_commit_alloc_entry(struct inode *inode, brelse(req->pr_desc_bh); } +/** + * nilfs_palloc_commit_free_entry - finish deallocating a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the removal + */ void nilfs_palloc_commit_free_entry(struct inode *inode, struct nilfs_palloc_req *req) { @@ -410,6 +535,11 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, brelse(req->pr_desc_bh); } +/** + * nilfs_palloc_abort_alloc_entry - cancel allocation of a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the allocation + */ void nilfs_palloc_abort_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req) { @@ -442,6 +572,11 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, req->pr_desc_bh = NULL; } +/** + * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the removal + */ int nilfs_palloc_prepare_free_entry(struct inode *inode, struct nilfs_palloc_req *req) { @@ -464,6 +599,11 @@ int nilfs_palloc_prepare_free_entry(struct inode *inode, return 0; } +/** + * nilfs_palloc_abort_free_entry - cancel deallocating a persistent object + * @inode: inode of metadata file using this allocator + * @req: nilfs_palloc_req structure exchanged for the removal + */ void nilfs_palloc_abort_free_entry(struct inode *inode, struct nilfs_palloc_req *req) { @@ -475,6 +615,12 @@ void nilfs_palloc_abort_free_entry(struct inode *inode, req->pr_desc_bh = NULL; } +/** + * nilfs_palloc_group_is_in - judge if an entry is in a group + * @inode: inode of metadata file using this allocator + * @group: group number + * @nr: serial number of the entry (e.g. inode number) + */ static int nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr) { @@ -485,6 +631,12 @@ nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr) return (nr >= first) && (nr <= last); } +/** + * nilfs_palloc_freev - deallocate a set of persistent objects + * @inode: inode of metadata file using this allocator + * @entry_nrs: array of entry numbers to be deallocated + * @nitems: number of entries stored in @entry_nrs + */ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) { struct buffer_head *desc_bh, *bitmap_bh; diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index 5cccf87..9af34a7 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -29,6 +29,13 @@ #include <linux/buffer_head.h> #include <linux/fs.h> +/** + * nilfs_palloc_entries_per_group - get the number of entries per group + * @inode: inode of metadata file using this allocator + * + * The number of entries per group is defined by the number of bits + * that a bitmap block can maintain. + */ static inline unsigned long nilfs_palloc_entries_per_group(const struct inode *inode) { diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 76c38e3..b27a342 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -31,63 +31,16 @@ #include "alloc.h" #include "dat.h" -/** - * struct nilfs_btree_path - A path on which B-tree operations are executed - * @bp_bh: buffer head of node block - * @bp_sib_bh: buffer head of sibling node block - * @bp_index: index of child node - * @bp_oldreq: ptr end request for old ptr - * @bp_newreq: ptr alloc request for new ptr - * @bp_op: rebalance operation - */ -struct nilfs_btree_path { - struct buffer_head *bp_bh; - struct buffer_head *bp_sib_bh; - int bp_index; - union nilfs_bmap_ptr_req bp_oldreq; - union nilfs_bmap_ptr_req bp_newreq; - struct nilfs_btnode_chkey_ctxt bp_ctxt; - void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *, - int, __u64 *, __u64 *); -}; - -/* - * B-tree path operations - */ - -static struct kmem_cache *nilfs_btree_path_cache; - -int __init nilfs_btree_path_cache_init(void) -{ - nilfs_btree_path_cache = - kmem_cache_create("nilfs2_btree_path_cache", - sizeof(struct nilfs_btree_path) * - NILFS_BTREE_LEVEL_MAX, 0, 0, NULL); - return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM; -} - -void nilfs_btree_path_cache_destroy(void) -{ - kmem_cache_destroy(nilfs_btree_path_cache); -} - -static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void) -{ - return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); -} - -static inline void nilfs_btree_free_path(struct nilfs_btree_path *path) +static struct nilfs_btree_path *nilfs_btree_alloc_path(void) { - kmem_cache_free(nilfs_btree_path_cache, path); -} + struct nilfs_btree_path *path; + int level = NILFS_BTREE_LEVEL_DATA; -static void nilfs_btree_init_path(struct nilfs_btree_path *path) -{ - int level; + path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); + if (path == NULL) + goto out; - for (level = NILFS_BTREE_LEVEL_DATA; - level < NILFS_BTREE_LEVEL_MAX; - level++) { + for (; level < NILFS_BTREE_LEVEL_MAX; level++) { path[level].bp_bh = NULL; path[level].bp_sib_bh = NULL; path[level].bp_index = 0; @@ -95,15 +48,19 @@ static void nilfs_btree_init_path(struct nilfs_btree_path *path) path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; path[level].bp_op = NULL; } + +out: + return path; } -static void nilfs_btree_release_path(struct nilfs_btree_path *path) +static void nilfs_btree_free_path(struct nilfs_btree_path *path) { - int level; + int level = NILFS_BTREE_LEVEL_DATA; - for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX; - level++) + for (; level < NILFS_BTREE_LEVEL_MAX; level++) brelse(path[level].bp_bh); + + kmem_cache_free(nilfs_btree_path_cache, path); } /* @@ -566,14 +523,12 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap, path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); if (ptrp != NULL) *ptrp = ptr; - nilfs_btree_release_path(path); nilfs_btree_free_path(path); return ret; @@ -594,7 +549,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(path); + ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); if (ret < 0) goto out; @@ -655,7 +610,6 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, *ptrp = ptr; ret = cnt; out: - nilfs_btree_release_path(path); nilfs_btree_free_path(path); return ret; } @@ -1123,7 +1077,6 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, NULL, NILFS_BTREE_LEVEL_NODE_MIN); @@ -1140,7 +1093,6 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); out: - nilfs_btree_release_path(path); nilfs_btree_free_path(path); return ret; } @@ -1456,7 +1408,7 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key) path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(path); + ret = nilfs_btree_do_lookup(btree, path, key, NULL, NILFS_BTREE_LEVEL_NODE_MIN); if (ret < 0) @@ -1473,7 +1425,6 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key) nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); out: - nilfs_btree_release_path(path); nilfs_btree_free_path(path); return ret; } @@ -1488,11 +1439,9 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); - nilfs_btree_release_path(path); nilfs_btree_free_path(path); return ret; @@ -1923,7 +1872,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(path); if (buffer_nilfs_node(bh)) { node = (struct nilfs_btree_node *)bh->b_data; @@ -1947,7 +1895,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, nilfs_btree_propagate_p(btree, path, level, bh); out: - nilfs_btree_release_path(path); nilfs_btree_free_path(path); return ret; @@ -2108,7 +2055,6 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap, path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(path); if (buffer_nilfs_node(*bh)) { node = (struct nilfs_btree_node *)(*bh)->b_data; @@ -2130,7 +2076,6 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap, nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); out: - nilfs_btree_release_path(path); nilfs_btree_free_path(path); return ret; @@ -2175,7 +2120,6 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); if (ret < 0) { @@ -2195,7 +2139,6 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) nilfs_bmap_set_dirty(&btree->bt_bmap); out: - nilfs_btree_release_path(path); nilfs_btree_free_path(path); return ret; } diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h index 4b82d84..af638d5 100644 --- a/fs/nilfs2/btree.h +++ b/fs/nilfs2/btree.h @@ -30,9 +30,6 @@ #include "btnode.h" #include "bmap.h" -struct nilfs_btree; -struct nilfs_btree_path; - /** * struct nilfs_btree - B-tree structure * @bt_bmap: bmap base structure @@ -41,6 +38,25 @@ struct nilfs_btree { struct nilfs_bmap bt_bmap; }; +/** + * struct nilfs_btree_path - A path on which B-tree operations are executed + * @bp_bh: buffer head of node block + * @bp_sib_bh: buffer head of sibling node block + * @bp_index: index of child node + * @bp_oldreq: ptr end request for old ptr + * @bp_newreq: ptr alloc request for new ptr + * @bp_op: rebalance operation + */ +struct nilfs_btree_path { + struct buffer_head *bp_bh; + struct buffer_head *bp_sib_bh; + int bp_index; + union nilfs_bmap_ptr_req bp_oldreq; + union nilfs_bmap_ptr_req bp_newreq; + struct nilfs_btnode_chkey_ctxt bp_ctxt; + void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *, + int, __u64 *, __u64 *); +}; #define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE #define NILFS_BTREE_ROOT_NCHILDREN_MAX \ @@ -57,6 +73,7 @@ struct nilfs_btree { #define NILFS_BTREE_KEY_MIN ((__u64)0) #define NILFS_BTREE_KEY_MAX (~(__u64)0) +extern struct kmem_cache *nilfs_btree_path_cache; int nilfs_btree_path_cache_init(void); void nilfs_btree_path_cache_destroy(void); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 0957b58..5e226d4 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -451,7 +451,7 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino, inode->i_op = &nilfs_special_inode_operations; init_special_inode( inode, inode->i_mode, - new_decode_dev(le64_to_cpu(raw_inode->i_device_code))); + huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); } nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh); brelse(bh); @@ -511,7 +511,7 @@ void nilfs_write_inode_common(struct inode *inode, nilfs_bmap_write(ii->i_bmap, raw_inode); else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) raw_inode->i_device_code = - cpu_to_le64(new_encode_dev(inode->i_rdev)); + cpu_to_le64(huge_encode_dev(inode->i_rdev)); /* When extending inode, nilfs->ns_inode_size should be checked for substitutions of appended fields */ } diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index ba43146..bae2a51 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -105,6 +105,8 @@ static void store_segsum_info(struct nilfs_segsum_info *ssi, ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize); ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi); + + /* need to verify ->ss_bytes field if read ->ss_cno */ } /** diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 17851f7..2e6a272 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -40,35 +40,10 @@ struct nilfs_write_info { sector_t blocknr; }; - static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, struct the_nilfs *nilfs); static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf); - -static struct kmem_cache *nilfs_segbuf_cachep; - -static void nilfs_segbuf_init_once(void *obj) -{ - memset(obj, 0, sizeof(struct nilfs_segment_buffer)); -} - -int __init nilfs_init_segbuf_cache(void) -{ - nilfs_segbuf_cachep = - kmem_cache_create("nilfs2_segbuf_cache", - sizeof(struct nilfs_segment_buffer), - 0, SLAB_RECLAIM_ACCOUNT, - nilfs_segbuf_init_once); - - return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0; -} - -void nilfs_destroy_segbuf_cache(void) -{ - kmem_cache_destroy(nilfs_segbuf_cachep); -} - struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb) { struct nilfs_segment_buffer *segbuf; @@ -81,6 +56,7 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb) INIT_LIST_HEAD(&segbuf->sb_list); INIT_LIST_HEAD(&segbuf->sb_segsum_buffers); INIT_LIST_HEAD(&segbuf->sb_payload_buffers); + segbuf->sb_super_root = NULL; init_completion(&segbuf->sb_bio_event); atomic_set(&segbuf->sb_err, 0); @@ -158,7 +134,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf, } int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags, - time_t ctime) + time_t ctime, __u64 cno) { int err; @@ -171,6 +147,7 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags, segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary); segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0; segbuf->sb_sum.ctime = ctime; + segbuf->sb_sum.cno = cno; return 0; } @@ -196,13 +173,14 @@ void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf) raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo); raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes); raw_sum->ss_pad = 0; + raw_sum->ss_cno = cpu_to_le64(segbuf->sb_sum.cno); } /* * CRC calculation routines */ -void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, - u32 seed) +static void +nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, u32 seed) { struct buffer_head *bh; struct nilfs_segment_summary *raw_sum; @@ -229,8 +207,8 @@ void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, raw_sum->ss_sumsum = cpu_to_le32(crc); } -void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, - u32 seed) +static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, + u32 seed) { struct buffer_head *bh; struct nilfs_segment_summary *raw_sum; @@ -256,6 +234,20 @@ void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, raw_sum->ss_datasum = cpu_to_le32(crc); } +static void +nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf, + u32 seed) +{ + struct nilfs_super_root *raw_sr; + u32 crc; + + raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data; + crc = crc32_le(seed, + (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum), + NILFS_SR_BYTES - sizeof(raw_sr->sr_sum)); + raw_sr->sr_sum = cpu_to_le32(crc); +} + static void nilfs_release_buffers(struct list_head *list) { struct buffer_head *bh, *n; @@ -282,6 +274,7 @@ static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf) { nilfs_release_buffers(&segbuf->sb_segsum_buffers); nilfs_release_buffers(&segbuf->sb_payload_buffers); + segbuf->sb_super_root = NULL; } /* @@ -334,6 +327,23 @@ int nilfs_wait_on_logs(struct list_head *logs) return ret; } +/** + * nilfs_add_checksums_on_logs - add checksums on the logs + * @logs: list of segment buffers storing target logs + * @seed: checksum seed value + */ +void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed) +{ + struct nilfs_segment_buffer *segbuf; + + list_for_each_entry(segbuf, logs, sb_list) { + if (segbuf->sb_super_root) + nilfs_segbuf_fill_in_super_root_crc(segbuf, seed); + nilfs_segbuf_fill_in_segsum_crc(segbuf, seed); + nilfs_segbuf_fill_in_data_crc(segbuf, seed); + } +} + /* * BIO operations */ diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h index 94dfd35..fdf1c3b 100644 --- a/fs/nilfs2/segbuf.h +++ b/fs/nilfs2/segbuf.h @@ -37,6 +37,7 @@ * @sumbytes: Byte count of segment summary * @nfileblk: Total number of file blocks * @seg_seq: Segment sequence number + * @cno: Checkpoint number * @ctime: Creation time * @next: Block number of the next full segment */ @@ -48,6 +49,7 @@ struct nilfs_segsum_info { unsigned long sumbytes; unsigned long nfileblk; u64 seg_seq; + __u64 cno; time_t ctime; sector_t next; }; @@ -76,6 +78,7 @@ struct nilfs_segsum_info { * @sb_rest_blocks: Number of residual blocks in the current segment * @sb_segsum_buffers: List of buffers for segment summaries * @sb_payload_buffers: List of buffers for segment payload + * @sb_super_root: Pointer to buffer storing a super root block (if exists) * @sb_nbio: Number of flying bio requests * @sb_err: I/O error status * @sb_bio_event: Completion event of log writing @@ -95,6 +98,7 @@ struct nilfs_segment_buffer { /* Buffers */ struct list_head sb_segsum_buffers; struct list_head sb_payload_buffers; /* including super root */ + struct buffer_head *sb_super_root; /* io status */ int sb_nbio; @@ -121,6 +125,7 @@ struct nilfs_segment_buffer { b_assoc_buffers)) #define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head) +extern struct kmem_cache *nilfs_segbuf_cachep; int __init nilfs_init_segbuf_cache(void); void nilfs_destroy_segbuf_cache(void); @@ -132,13 +137,11 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, struct nilfs_segment_buffer *prev); void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, struct the_nilfs *); -int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t); +int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64); int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *); int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, struct buffer_head **); void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *); -void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32); -void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32); static inline void nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf, @@ -171,6 +174,7 @@ void nilfs_truncate_logs(struct list_head *logs, struct nilfs_segment_buffer *last); int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs); int nilfs_wait_on_logs(struct list_head *logs); +void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed); static inline void nilfs_destroy_logs(struct list_head *logs) { diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 6a7dbd8..c920164 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -116,42 +116,6 @@ static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *, #define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a) #define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a) -/* - * Transaction - */ -static struct kmem_cache *nilfs_transaction_cachep; - -/** - * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info - * - * nilfs_init_transaction_cache() creates a slab cache for the struct - * nilfs_transaction_info. - * - * Return Value: On success, it returns 0. On error, one of the following - * negative error code is returned. - * - * %-ENOMEM - Insufficient memory available. - */ -int nilfs_init_transaction_cache(void) -{ - nilfs_transaction_cachep = - kmem_cache_create("nilfs2_transaction_cache", - sizeof(struct nilfs_transaction_info), - 0, SLAB_RECLAIM_ACCOUNT, NULL); - return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0; -} - -/** - * nilfs_destroy_transaction_cache - destroy the cache for transaction info - * - * nilfs_destroy_transaction_cache() frees the slab cache for the struct - * nilfs_transaction_info. - */ -void nilfs_destroy_transaction_cache(void) -{ - kmem_cache_destroy(nilfs_transaction_cachep); -} - static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti) { struct nilfs_transaction_info *cur_ti = current->journal_info; @@ -402,7 +366,8 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci) if (nilfs_doing_gc()) flags = NILFS_SS_GC; - err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime); + err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime, + sci->sc_sbi->s_nilfs->ns_cno); if (unlikely(err)) return err; @@ -435,7 +400,7 @@ static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci) return err; segbuf = sci->sc_curseg; } - err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root); + err = nilfs_segbuf_extend_payload(segbuf, &segbuf->sb_super_root); if (likely(!err)) segbuf->sb_sum.flags |= NILFS_SS_SR; return err; @@ -599,7 +564,7 @@ static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci, *vblocknr = binfo->bi_v.bi_vblocknr; } -struct nilfs_sc_operations nilfs_sc_file_ops = { +static struct nilfs_sc_operations nilfs_sc_file_ops = { .collect_data = nilfs_collect_file_data, .collect_node = nilfs_collect_file_node, .collect_bmap = nilfs_collect_file_bmap, @@ -649,7 +614,7 @@ static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci, *binfo_dat = binfo->bi_dat; } -struct nilfs_sc_operations nilfs_sc_dat_ops = { +static struct nilfs_sc_operations nilfs_sc_dat_ops = { .collect_data = nilfs_collect_dat_data, .collect_node = nilfs_collect_file_node, .collect_bmap = nilfs_collect_dat_bmap, @@ -657,7 +622,7 @@ struct nilfs_sc_operations nilfs_sc_dat_ops = { .write_node_binfo = nilfs_write_dat_node_binfo, }; -struct nilfs_sc_operations nilfs_sc_dsync_ops = { +static struct nilfs_sc_operations nilfs_sc_dsync_ops = { .collect_data = nilfs_collect_file_data, .collect_node = NULL, .collect_bmap = NULL, @@ -932,43 +897,16 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci, } } -/* - * CRC calculation routines - */ -static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed) -{ - struct nilfs_super_root *raw_sr = - (struct nilfs_super_root *)bh_sr->b_data; - u32 crc; - - crc = crc32_le(seed, - (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum), - NILFS_SR_BYTES - sizeof(raw_sr->sr_sum)); - raw_sr->sr_sum = cpu_to_le32(crc); -} - -static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci, - u32 seed) -{ - struct nilfs_segment_buffer *segbuf; - - if (sci->sc_super_root) - nilfs_fill_in_super_root_crc(sci->sc_super_root, seed); - - list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { - nilfs_segbuf_fill_in_segsum_crc(segbuf, seed); - nilfs_segbuf_fill_in_data_crc(segbuf, seed); - } -} - static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { - struct buffer_head *bh_sr = sci->sc_super_root; - struct nilfs_super_root *raw_sr = - (struct nilfs_super_root *)bh_sr->b_data; + struct buffer_head *bh_sr; + struct nilfs_super_root *raw_sr; unsigned isz = nilfs->ns_inode_size; + bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root; + raw_sr = (struct nilfs_super_root *)bh_sr->b_data; + raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES); raw_sr->sr_nongc_ctime = cpu_to_le64(nilfs_doing_gc() ? @@ -1491,7 +1429,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci, /* Collection retry loop */ for (;;) { - sci->sc_super_root = NULL; sci->sc_nblk_this_inc = 0; sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); @@ -1568,7 +1505,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci, ssp.offset = sizeof(struct nilfs_segment_summary); list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - if (bh == sci->sc_super_root) + if (bh == segbuf->sb_super_root) break; if (!finfo) { finfo = nilfs_segctor_map_segsum_entry( @@ -1729,7 +1666,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci, list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - if (bh == sci->sc_super_root) { + if (bh == segbuf->sb_super_root) { if (bh->b_page != bd_page) { lock_page(bd_page); clear_page_dirty_for_io(bd_page); @@ -1848,7 +1785,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err) } static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page, - struct buffer_head *bh_sr, int err) + int err) { struct nilfs_segment_buffer *segbuf; struct page *bd_page = NULL, *fs_page = NULL; @@ -1869,7 +1806,7 @@ static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page, list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - if (bh == bh_sr) { + if (bh == segbuf->sb_super_root) { if (bh->b_page != bd_page) { end_page_writeback(bd_page); bd_page = bh->b_page; @@ -1898,7 +1835,7 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci, list_splice_tail_init(&sci->sc_write_logs, &logs); ret = nilfs_wait_on_logs(&logs); - nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret ? : err); + nilfs_abort_logs(&logs, NULL, ret ? : err); list_splice_tail_init(&sci->sc_segbufs, &logs); nilfs_cancel_segusage(&logs, nilfs->ns_sufile); @@ -1914,7 +1851,6 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci, } nilfs_destroy_logs(&logs); - sci->sc_super_root = NULL; } static void nilfs_set_next_segment(struct the_nilfs *nilfs, @@ -1933,7 +1869,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) struct nilfs_segment_buffer *segbuf; struct page *bd_page = NULL, *fs_page = NULL; struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; - int update_sr = (sci->sc_super_root != NULL); + int update_sr = false; list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { struct buffer_head *bh; @@ -1964,11 +1900,12 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) set_buffer_uptodate(bh); clear_buffer_dirty(bh); clear_buffer_nilfs_volatile(bh); - if (bh == sci->sc_super_root) { + if (bh == segbuf->sb_super_root) { if (bh->b_page != bd_page) { end_page_writeback(bd_page); bd_page = bh->b_page; } + update_sr = true; break; } if (bh->b_page != fs_page) { @@ -2115,7 +2052,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) struct nilfs_sb_info *sbi = sci->sc_sbi; struct the_nilfs *nilfs = sbi->s_nilfs; struct page *failed_page; - int err, has_sr = 0; + int err; sci->sc_stage.scnt = NILFS_ST_INIT; @@ -2143,8 +2080,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) if (unlikely(err)) goto failed; - has_sr = (sci->sc_super_root != NULL); - /* Avoid empty segment */ if (sci->sc_stage.scnt == NILFS_ST_DONE && NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) { @@ -2159,7 +2094,8 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile); - if (has_sr) { + if (mode == SC_LSEG_SR && + sci->sc_stage.scnt >= NILFS_ST_CPFILE) { err = nilfs_segctor_fill_in_checkpoint(sci); if (unlikely(err)) goto failed_to_write; @@ -2171,11 +2107,12 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) /* Write partial segments */ err = nilfs_segctor_prepare_write(sci, &failed_page); if (err) { - nilfs_abort_logs(&sci->sc_segbufs, failed_page, - sci->sc_super_root, err); + nilfs_abort_logs(&sci->sc_segbufs, failed_page, err); goto failed_to_write; } - nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed); + + nilfs_add_checksums_on_logs(&sci->sc_segbufs, + nilfs->ns_crc_seed); err = nilfs_segctor_write(sci, nilfs); if (unlikely(err)) @@ -2196,8 +2133,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) } } while (sci->sc_stage.scnt != NILFS_ST_DONE); - sci->sc_super_root = NULL; - out: nilfs_segctor_check_out_files(sci, sbi); return err; @@ -2224,9 +2159,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci) { spin_lock(&sci->sc_state_lock); - if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) { - sci->sc_timer->expires = jiffies + sci->sc_interval; - add_timer(sci->sc_timer); + if (!(sci->sc_state & NILFS_SEGCTOR_COMMIT)) { + sci->sc_timer.expires = jiffies + sci->sc_interval; + add_timer(&sci->sc_timer); sci->sc_state |= NILFS_SEGCTOR_COMMIT; } spin_unlock(&sci->sc_state_lock); @@ -2431,9 +2366,7 @@ static void nilfs_segctor_accept(struct nilfs_sc_info *sci) spin_lock(&sci->sc_state_lock); sci->sc_seq_accepted = sci->sc_seq_request; spin_unlock(&sci->sc_state_lock); - - if (sci->sc_timer) - del_timer_sync(sci->sc_timer); + del_timer_sync(&sci->sc_timer); } /** @@ -2459,9 +2392,9 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err) sci->sc_flush_request &= ~FLUSH_DAT_BIT; /* re-enable timer if checkpoint creation was not done */ - if (sci->sc_timer && (sci->sc_state & NILFS_SEGCTOR_COMMIT) && - time_before(jiffies, sci->sc_timer->expires)) - add_timer(sci->sc_timer); + if ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && + time_before(jiffies, sci->sc_timer.expires)) + add_timer(&sci->sc_timer); } spin_unlock(&sci->sc_state_lock); } @@ -2640,13 +2573,10 @@ static int nilfs_segctor_thread(void *arg) { struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; - struct timer_list timer; int timeout = 0; - init_timer(&timer); - timer.data = (unsigned long)current; - timer.function = nilfs_construction_timeout; - sci->sc_timer = &timer; + sci->sc_timer.data = (unsigned long)current; + sci->sc_timer.function = nilfs_construction_timeout; /* start sync. */ sci->sc_task = current; @@ -2695,7 +2625,7 @@ static int nilfs_segctor_thread(void *arg) should_sleep = 0; else if (sci->sc_state & NILFS_SEGCTOR_COMMIT) should_sleep = time_before(jiffies, - sci->sc_timer->expires); + sci->sc_timer.expires); if (should_sleep) { spin_unlock(&sci->sc_state_lock); @@ -2704,7 +2634,7 @@ static int nilfs_segctor_thread(void *arg) } finish_wait(&sci->sc_wait_daemon, &wait); timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && - time_after_eq(jiffies, sci->sc_timer->expires)); + time_after_eq(jiffies, sci->sc_timer.expires)); if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs)) set_nilfs_discontinued(nilfs); @@ -2713,8 +2643,6 @@ static int nilfs_segctor_thread(void *arg) end_thread: spin_unlock(&sci->sc_state_lock); - del_timer_sync(sci->sc_timer); - sci->sc_timer = NULL; /* end sync. */ sci->sc_task = NULL; @@ -2750,13 +2678,6 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci) } } -static int nilfs_segctor_init(struct nilfs_sc_info *sci) -{ - sci->sc_seq_done = sci->sc_seq_request; - - return nilfs_segctor_start_thread(sci); -} - /* * Setup & clean-up functions */ @@ -2780,6 +2701,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi) INIT_LIST_HEAD(&sci->sc_write_logs); INIT_LIST_HEAD(&sci->sc_gc_inodes); INIT_LIST_HEAD(&sci->sc_copied_buffers); + init_timer(&sci->sc_timer); sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; @@ -2846,6 +2768,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) down_write(&sbi->s_nilfs->ns_segctor_sem); + del_timer_sync(&sci->sc_timer); kfree(sci); } @@ -2880,7 +2803,7 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi) return -ENOMEM; nilfs_attach_writer(nilfs, sbi); - err = nilfs_segctor_init(NILFS_SC(sbi)); + err = nilfs_segctor_start_thread(NILFS_SC(sbi)); if (err) { nilfs_detach_writer(nilfs, sbi); kfree(sbi->s_sc_info); diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 82dfd6a..dca1423 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -100,7 +100,6 @@ struct nilfs_segsum_pointer { * @sc_write_logs: List of segment buffers to hold logs under writing * @sc_segbuf_nblocks: Number of available blocks in segment buffers. * @sc_curseg: Current segment buffer - * @sc_super_root: Pointer to the super root buffer * @sc_stage: Collection stage * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary @@ -148,7 +147,6 @@ struct nilfs_sc_info { struct list_head sc_write_logs; unsigned long sc_segbuf_nblocks; struct nilfs_segment_buffer *sc_curseg; - struct buffer_head *sc_super_root; struct nilfs_cstage sc_stage; @@ -179,7 +177,7 @@ struct nilfs_sc_info { unsigned long sc_lseg_stime; /* in 1/HZ seconds */ unsigned long sc_watermark; - struct timer_list *sc_timer; + struct timer_list sc_timer; struct task_struct *sc_task; }; @@ -219,6 +217,8 @@ enum { */ #define NILFS_SC_DEFAULT_WATERMARK 3600 +/* super.c */ +extern struct kmem_cache *nilfs_transaction_cachep; /* segment.c */ extern int nilfs_init_transaction_cache(void); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 48145f5..03b34b7 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -67,6 +67,11 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " "(NILFS)"); MODULE_LICENSE("GPL"); +struct kmem_cache *nilfs_inode_cachep; +struct kmem_cache *nilfs_transaction_cachep; +struct kmem_cache *nilfs_segbuf_cachep; +struct kmem_cache *nilfs_btree_path_cache; + static int nilfs_remount(struct super_block *sb, int *flags, char *data); /** @@ -129,7 +134,6 @@ void nilfs_warning(struct super_block *sb, const char *function, va_end(args); } -static struct kmem_cache *nilfs_inode_cachep; struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs) { @@ -155,34 +159,6 @@ void nilfs_destroy_inode(struct inode *inode) kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); } -static void init_once(void *obj) -{ - struct nilfs_inode_info *ii = obj; - - INIT_LIST_HEAD(&ii->i_dirty); -#ifdef CONFIG_NILFS_XATTR - init_rwsem(&ii->xattr_sem); -#endif - nilfs_btnode_cache_init_once(&ii->i_btnode_cache); - ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union; - inode_init_once(&ii->vfs_inode); -} - -static int nilfs_init_inode_cache(void) -{ - nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", - sizeof(struct nilfs_inode_info), - 0, SLAB_RECLAIM_ACCOUNT, - init_once); - - return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0; -} - -static inline void nilfs_destroy_inode_cache(void) -{ - kmem_cache_destroy(nilfs_inode_cachep); -} - static void nilfs_clear_inode(struct inode *inode) { struct nilfs_inode_info *ii = NILFS_I(inode); @@ -266,8 +242,8 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb) int err; /* nilfs->sem must be locked by the caller. */ - if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) { - if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC) + if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { + if (sbp[1] && sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) nilfs_swap_super_block(nilfs); else { printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", @@ -470,10 +446,10 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) if (nilfs_test_opt(sbi, SNAPSHOT)) seq_printf(seq, ",cp=%llu", (unsigned long long int)sbi->s_snapshot_cno); - if (nilfs_test_opt(sbi, ERRORS_RO)) - seq_printf(seq, ",errors=remount-ro"); if (nilfs_test_opt(sbi, ERRORS_PANIC)) seq_printf(seq, ",errors=panic"); + if (nilfs_test_opt(sbi, ERRORS_CONT)) + seq_printf(seq, ",errors=continue"); if (nilfs_test_opt(sbi, STRICT_ORDER)) seq_printf(seq, ",order=strict"); if (nilfs_test_opt(sbi, NORECOVERY)) @@ -631,7 +607,7 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi, struct nilfs_super_block *sbp) { sbi->s_mount_opt = - NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER; + NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; } static int nilfs_setup_super(struct nilfs_sb_info *sbi) @@ -778,9 +754,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, goto failed_sbi; } cno = sbi->s_snapshot_cno; - } else - /* Read-only mount */ - sbi->s_snapshot_cno = cno; + } } err = nilfs_attach_checkpoint(sbi, cno); @@ -849,7 +823,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) struct the_nilfs *nilfs = sbi->s_nilfs; unsigned long old_sb_flags; struct nilfs_mount_options old_opts; - int err; + int was_snapshot, err; lock_kernel(); @@ -857,6 +831,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) old_sb_flags = sb->s_flags; old_opts.mount_opt = sbi->s_mount_opt; old_opts.snapshot_cno = sbi->s_snapshot_cno; + was_snapshot = nilfs_test_opt(sbi, SNAPSHOT); if (!parse_options(data, sb)) { err = -EINVAL; @@ -864,20 +839,32 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) } sb->s_flags = (sb->s_flags & ~MS_POSIXACL); - if ((*flags & MS_RDONLY) && - sbi->s_snapshot_cno != old_opts.snapshot_cno) { - printk(KERN_WARNING "NILFS (device %s): couldn't " - "remount to a different snapshot.\n", - sb->s_id); - err = -EINVAL; - goto restore_opts; + err = -EINVAL; + if (was_snapshot) { + if (!(*flags & MS_RDONLY)) { + printk(KERN_ERR "NILFS (device %s): cannot remount " + "snapshot read/write.\n", + sb->s_id); + goto restore_opts; + } else if (sbi->s_snapshot_cno != old_opts.snapshot_cno) { + printk(KERN_ERR "NILFS (device %s): cannot " + "remount to a different snapshot.\n", + sb->s_id); + goto restore_opts; + } + } else { + if (nilfs_test_opt(sbi, SNAPSHOT)) { + printk(KERN_ERR "NILFS (device %s): cannot change " + "a regular mount to a snapshot.\n", + sb->s_id); + goto restore_opts; + } } if (!nilfs_valid_fs(nilfs)) { printk(KERN_WARNING "NILFS (device %s): couldn't " "remount because the filesystem is in an " "incomplete recovery state.\n", sb->s_id); - err = -EINVAL; goto restore_opts; } @@ -888,9 +875,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) nilfs_detach_segment_constructor(sbi); sb->s_flags |= MS_RDONLY; - sbi->s_snapshot_cno = nilfs_last_cno(nilfs); - /* nilfs_set_opt(sbi, SNAPSHOT); */ - /* * Remounting a valid RW partition RDONLY, so set * the RDONLY flag and then mark the partition as valid again. @@ -909,24 +893,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) * store the current valid flag. (It may have been changed * by fsck since we originally mounted the partition.) */ - if (nilfs->ns_current && nilfs->ns_current != sbi) { - printk(KERN_WARNING "NILFS (device %s): couldn't " - "remount because an RW-mount exists.\n", - sb->s_id); - err = -EBUSY; - goto restore_opts; - } - if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) { - printk(KERN_WARNING "NILFS (device %s): couldn't " - "remount because the current RO-mount is not " - "the latest one.\n", - sb->s_id); - err = -EINVAL; - goto restore_opts; - } sb->s_flags &= ~MS_RDONLY; - nilfs_clear_opt(sbi, SNAPSHOT); - sbi->s_snapshot_cno = 0; err = nilfs_attach_segment_constructor(sbi); if (err) @@ -935,8 +902,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) down_write(&nilfs->ns_sem); nilfs_setup_super(sbi); up_write(&nilfs->ns_sem); - - nilfs->ns_current = sbi; } out: up_write(&nilfs->ns_super_sem); @@ -1022,10 +987,14 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, { struct nilfs_super_data sd; struct super_block *s; + fmode_t mode = FMODE_READ; struct the_nilfs *nilfs; int err, need_to_close = 1; - sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type); + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type); if (IS_ERR(sd.bdev)) return PTR_ERR(sd.bdev); @@ -1092,10 +1061,12 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, /* New superblock instance created */ s->s_flags = flags; + s->s_mode = mode; strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(sd.bdev)); - err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs); + err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0, + nilfs); if (err) goto cancel_new; @@ -1106,7 +1077,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, mutex_unlock(&nilfs->ns_mount_mutex); put_nilfs(nilfs); if (need_to_close) - close_bdev_exclusive(sd.bdev, flags); + close_bdev_exclusive(sd.bdev, mode); simple_set_mnt(mnt, s); return 0; @@ -1114,7 +1085,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, mutex_unlock(&nilfs->ns_mount_mutex); put_nilfs(nilfs); failed: - close_bdev_exclusive(sd.bdev, flags); + close_bdev_exclusive(sd.bdev, mode); return err; @@ -1124,7 +1095,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, put_nilfs(nilfs); deactivate_locked_super(s); /* - * deactivate_super() invokes close_bdev_exclusive(). + * deactivate_locked_super() invokes close_bdev_exclusive(). * We must finish all post-cleaning before this call; * put_nilfs() needs the block device. */ @@ -1139,54 +1110,93 @@ struct file_system_type nilfs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -static int __init init_nilfs_fs(void) +static void nilfs_inode_init_once(void *obj) { - int err; - - err = nilfs_init_inode_cache(); - if (err) - goto failed; + struct nilfs_inode_info *ii = obj; - err = nilfs_init_transaction_cache(); - if (err) - goto failed_inode_cache; + INIT_LIST_HEAD(&ii->i_dirty); +#ifdef CONFIG_NILFS_XATTR + init_rwsem(&ii->xattr_sem); +#endif + nilfs_btnode_cache_init_once(&ii->i_btnode_cache); + ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union; + inode_init_once(&ii->vfs_inode); +} - err = nilfs_init_segbuf_cache(); - if (err) - goto failed_transaction_cache; +static void nilfs_segbuf_init_once(void *obj) +{ + memset(obj, 0, sizeof(struct nilfs_segment_buffer)); +} - err = nilfs_btree_path_cache_init(); - if (err) - goto failed_segbuf_cache; +static void nilfs_destroy_cachep(void) +{ + if (nilfs_inode_cachep) + kmem_cache_destroy(nilfs_inode_cachep); + if (nilfs_transaction_cachep) + kmem_cache_destroy(nilfs_transaction_cachep); + if (nilfs_segbuf_cachep) + kmem_cache_destroy(nilfs_segbuf_cachep); + if (nilfs_btree_path_cache) + kmem_cache_destroy(nilfs_btree_path_cache); +} - err = register_filesystem(&nilfs_fs_type); - if (err) - goto failed_btree_path_cache; +static int __init nilfs_init_cachep(void) +{ + nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", + sizeof(struct nilfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once); + if (!nilfs_inode_cachep) + goto fail; + + nilfs_transaction_cachep = kmem_cache_create("nilfs2_transaction_cache", + sizeof(struct nilfs_transaction_info), 0, + SLAB_RECLAIM_ACCOUNT, NULL); + if (!nilfs_transaction_cachep) + goto fail; + + nilfs_segbuf_cachep = kmem_cache_create("nilfs2_segbuf_cache", + sizeof(struct nilfs_segment_buffer), 0, + SLAB_RECLAIM_ACCOUNT, nilfs_segbuf_init_once); + if (!nilfs_segbuf_cachep) + goto fail; + + nilfs_btree_path_cache = kmem_cache_create("nilfs2_btree_path_cache", + sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX, + 0, 0, NULL); + if (!nilfs_btree_path_cache) + goto fail; return 0; - failed_btree_path_cache: - nilfs_btree_path_cache_destroy(); +fail: + nilfs_destroy_cachep(); + return -ENOMEM; +} + +static int __init init_nilfs_fs(void) +{ + int err; - failed_segbuf_cache: - nilfs_destroy_segbuf_cache(); + err = nilfs_init_cachep(); + if (err) + goto fail; - failed_transaction_cache: - nilfs_destroy_transaction_cache(); + err = register_filesystem(&nilfs_fs_type); + if (err) + goto free_cachep; - failed_inode_cache: - nilfs_destroy_inode_cache(); + printk(KERN_INFO "NILFS version 2 loaded\n"); + return 0; - failed: +free_cachep: + nilfs_destroy_cachep(); +fail: return err; } static void __exit exit_nilfs_fs(void) { - nilfs_destroy_segbuf_cache(); - nilfs_destroy_transaction_cache(); - nilfs_destroy_inode_cache(); - nilfs_btree_path_cache_destroy(); + nilfs_destroy_cachep(); unregister_filesystem(&nilfs_fs_type); } diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 33871f7..a756168 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -486,11 +486,15 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, printk(KERN_WARNING "NILFS warning: unable to read secondary superblock\n"); + /* + * Compare two super blocks and set 1 in swp if the secondary + * super block is valid and newer. Otherwise, set 0 in swp. + */ valid[0] = nilfs_valid_sb(sbp[0]); valid[1] = nilfs_valid_sb(sbp[1]); - swp = valid[1] && - (!valid[0] || - le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime)); + swp = valid[1] && (!valid[0] || + le64_to_cpu(sbp[1]->s_last_cno) > + le64_to_cpu(sbp[0]->s_last_cno)); if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) { brelse(sbh[1]); diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 791c088..07d9fd8 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -29,6 +29,7 @@ ocfs2-objs := \ mmap.o \ namei.o \ refcounttree.o \ + reservations.o \ resize.o \ slot_map.o \ suballoc.o \ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 9f8bd91..215e12c 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1006,7 +1006,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle, int count, status, i; u16 suballoc_bit_start; u32 num_got; - u64 first_blkno; + u64 suballoc_loc, first_blkno; struct ocfs2_super *osb = OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); struct ocfs2_extent_block *eb; @@ -1015,10 +1015,10 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle, count = 0; while (count < wanted) { - status = ocfs2_claim_metadata(osb, - handle, + status = ocfs2_claim_metadata(handle, meta_ac, wanted - count, + &suballoc_loc, &suballoc_bit_start, &num_got, &first_blkno); @@ -1052,6 +1052,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle, eb->h_fs_generation = cpu_to_le32(osb->fs_generation); eb->h_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + eb->h_suballoc_loc = cpu_to_le64(suballoc_loc); eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); eb->h_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); @@ -1061,11 +1062,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle, /* We'll also be dirtied by the caller, so * this isn't absolutely necessary. */ - status = ocfs2_journal_dirty(handle, bhs[i]); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, bhs[i]); } count += num_got; @@ -1129,8 +1126,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle, goto out; } - status = ocfs2_extend_trans(handle, path_num_items(path) + - handle->h_buffer_credits); + status = ocfs2_extend_trans(handle, path_num_items(path)); if (status < 0) { mlog_errno(status); goto out; @@ -1270,12 +1266,7 @@ static int ocfs2_add_branch(handle_t *handle, if (!eb_el->l_tree_depth) new_last_eb_blk = le64_to_cpu(eb->h_blkno); - status = ocfs2_journal_dirty(handle, bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } - + ocfs2_journal_dirty(handle, bh); next_blkno = le64_to_cpu(eb->h_blkno); } @@ -1321,17 +1312,10 @@ static int ocfs2_add_branch(handle_t *handle, eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); - status = ocfs2_journal_dirty(handle, *last_eb_bh); - if (status < 0) - mlog_errno(status); - status = ocfs2_journal_dirty(handle, et->et_root_bh); - if (status < 0) - mlog_errno(status); - if (eb_bh) { - status = ocfs2_journal_dirty(handle, eb_bh); - if (status < 0) - mlog_errno(status); - } + ocfs2_journal_dirty(handle, *last_eb_bh); + ocfs2_journal_dirty(handle, et->et_root_bh); + if (eb_bh) + ocfs2_journal_dirty(handle, eb_bh); /* * Some callers want to track the rightmost leaf so pass it @@ -1399,11 +1383,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle, for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) eb_el->l_recs[i] = root_el->l_recs[i]; - status = ocfs2_journal_dirty(handle, new_eb_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, new_eb_bh); status = ocfs2_et_root_journal_access(handle, et, OCFS2_JOURNAL_ACCESS_WRITE); @@ -1428,11 +1408,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle, if (root_el->l_tree_depth == cpu_to_le16(1)) ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); - status = ocfs2_journal_dirty(handle, et->et_root_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, et->et_root_bh); *ret_new_eb_bh = new_eb_bh; new_eb_bh = NULL; @@ -2064,7 +2040,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle, struct ocfs2_path *right_path, int subtree_index) { - int ret, i, idx; + int i, idx; struct ocfs2_extent_list *el, *left_el, *right_el; struct ocfs2_extent_rec *left_rec, *right_rec; struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; @@ -2102,13 +2078,8 @@ static void ocfs2_complete_edge_insert(handle_t *handle, ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, right_el); - ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); - if (ret) - mlog_errno(ret); - - ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, left_path->p_node[i].bh); + ocfs2_journal_dirty(handle, right_path->p_node[i].bh); /* * Setup our list pointers now so that the current @@ -2132,9 +2103,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle, root_bh = left_path->p_node[subtree_index].bh; - ret = ocfs2_journal_dirty(handle, root_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, root_bh); } static int ocfs2_rotate_subtree_right(handle_t *handle, @@ -2207,11 +2176,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle, ocfs2_create_empty_extent(right_el); - ret = ocfs2_journal_dirty(handle, right_leaf_bh); - if (ret) { - mlog_errno(ret); - goto out; - } + ocfs2_journal_dirty(handle, right_leaf_bh); /* Do the copy now. */ i = le16_to_cpu(left_el->l_next_free_rec) - 1; @@ -2230,11 +2195,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle, memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); le16_add_cpu(&left_el->l_next_free_rec, 1); - ret = ocfs2_journal_dirty(handle, left_leaf_bh); - if (ret) { - mlog_errno(ret); - goto out; - } + ocfs2_journal_dirty(handle, left_leaf_bh); ocfs2_complete_edge_insert(handle, left_path, right_path, subtree_index); @@ -2249,8 +2210,8 @@ out: * * Will return zero if the path passed in is already the leftmost path. */ -static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, - struct ocfs2_path *path, u32 *cpos) +int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos) { int i, j, ret = 0; u64 blkno; @@ -2327,20 +2288,14 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, int op_credits, struct ocfs2_path *path) { - int ret; + int ret = 0; int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; - if (handle->h_buffer_credits < credits) { + if (handle->h_buffer_credits < credits) ret = ocfs2_extend_trans(handle, credits - handle->h_buffer_credits); - if (ret) - return ret; - if (unlikely(handle->h_buffer_credits < credits)) - return ocfs2_extend_trans(handle, credits); - } - - return 0; + return ret; } /* @@ -2584,8 +2539,7 @@ static int ocfs2_update_edge_lengths(handle_t *handle, * records for all the bh in the path. * So we have to allocate extra credits and access them. */ - ret = ocfs2_extend_trans(handle, - handle->h_buffer_credits + subtree_index); + ret = ocfs2_extend_trans(handle, subtree_index); if (ret) { mlog_errno(ret); goto out; @@ -2823,12 +2777,8 @@ static int ocfs2_rotate_subtree_left(handle_t *handle, ocfs2_remove_empty_extent(right_leaf_el); } - ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); - if (ret) - mlog_errno(ret); - ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); + ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); if (del_right_subtree) { ocfs2_unlink_subtree(handle, et, left_path, right_path, @@ -2851,9 +2801,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle, if (right_has_empty) ocfs2_remove_empty_extent(left_leaf_el); - ret = ocfs2_journal_dirty(handle, et_root_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, et_root_bh); *deleted = 1; } else @@ -2962,10 +2910,7 @@ static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle, } ocfs2_remove_empty_extent(el); - - ret = ocfs2_journal_dirty(handle, bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, bh); out: return ret; @@ -3506,15 +3451,9 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, ocfs2_cleanup_merge(el, index); - ret = ocfs2_journal_dirty(handle, bh); - if (ret) - mlog_errno(ret); - + ocfs2_journal_dirty(handle, bh); if (right_path) { - ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); - if (ret) - mlog_errno(ret); - + ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); ocfs2_complete_edge_insert(handle, left_path, right_path, subtree_index); } @@ -3683,14 +3622,9 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, ocfs2_cleanup_merge(el, index); - ret = ocfs2_journal_dirty(handle, bh); - if (ret) - mlog_errno(ret); - + ocfs2_journal_dirty(handle, bh); if (left_path) { - ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); /* * In the situation that the right_rec is empty and the extent @@ -4016,10 +3950,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos)); - ret = ocfs2_journal_dirty(handle, bh); - if (ret) - mlog_errno(ret); - + ocfs2_journal_dirty(handle, bh); } } @@ -4203,17 +4134,13 @@ static int ocfs2_insert_path(handle_t *handle, struct buffer_head *leaf_bh = path_leaf_bh(right_path); if (left_path) { - int credits = handle->h_buffer_credits; - /* * There's a chance that left_path got passed back to * us without being accounted for in the * journal. Extend our transaction here to be sure we * can change those blocks. */ - credits += left_path->p_tree_depth; - - ret = ocfs2_extend_trans(handle, credits); + ret = ocfs2_extend_trans(handle, left_path->p_tree_depth); if (ret < 0) { mlog_errno(ret); goto out; @@ -4251,17 +4178,13 @@ static int ocfs2_insert_path(handle_t *handle, * dirty this for us. */ if (left_path) - ret = ocfs2_journal_dirty(handle, - path_leaf_bh(left_path)); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, + path_leaf_bh(left_path)); } else ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path), insert); - ret = ocfs2_journal_dirty(handle, leaf_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, leaf_bh); if (left_path) { /* @@ -4384,9 +4307,7 @@ out_update_clusters: ocfs2_et_update_clusters(et, le16_to_cpu(insert_rec->e_leaf_clusters)); - ret = ocfs2_journal_dirty(handle, et->et_root_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, et->et_root_bh); out: ocfs2_free_path(left_path); @@ -4866,7 +4787,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, goto leave; } - status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, + status = __ocfs2_claim_clusters(handle, data_ac, 1, clusters_to_add, &bit_off, &num_bits); if (status < 0) { if (status != -ENOSPC) @@ -4895,11 +4816,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, goto leave; } - status = ocfs2_journal_dirty(handle, et->et_root_bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } + ocfs2_journal_dirty(handle, et->et_root_bh); clusters_to_add -= num_bits; *logical_offset += num_bits; @@ -5309,7 +5226,7 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et, int index, u32 new_range, struct ocfs2_alloc_context *meta_ac) { - int ret, depth, credits = handle->h_buffer_credits; + int ret, depth, credits; struct buffer_head *last_eb_bh = NULL; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *rightmost_el, *el; @@ -5340,8 +5257,8 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et, } else rightmost_el = path_leaf_el(path); - credits += path->p_tree_depth + - ocfs2_extend_meta_needed(et->et_root_el); + credits = path->p_tree_depth + + ocfs2_extend_meta_needed(et->et_root_el); ret = ocfs2_extend_trans(handle, credits); if (ret) { mlog_errno(ret); @@ -5671,19 +5588,97 @@ out: return ret; } +/* + * ocfs2_reserve_blocks_for_rec_trunc() would look basically the + * same as ocfs2_lock_alloctors(), except for it accepts a blocks + * number to reserve some extra blocks, and it only handles meta + * data allocations. + * + * Currently, only ocfs2_remove_btree_range() uses it for truncating + * and punching holes. + */ +static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 extents_to_split, + struct ocfs2_alloc_context **ac, + int extra_blocks) +{ + int ret = 0, num_free_extents; + unsigned int max_recs_needed = 2 * extents_to_split; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + *ac = NULL; + + num_free_extents = ocfs2_num_free_extents(osb, et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) + extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); + + if (extra_blocks) { + ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } + +out: + if (ret) { + if (*ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + } + + return ret; +} + int ocfs2_remove_btree_range(struct inode *inode, struct ocfs2_extent_tree *et, - u32 cpos, u32 phys_cpos, u32 len, - struct ocfs2_cached_dealloc_ctxt *dealloc) + u32 cpos, u32 phys_cpos, u32 len, int flags, + struct ocfs2_cached_dealloc_ctxt *dealloc, + u64 refcount_loc) { - int ret; + int ret, credits = 0, extra_blocks = 0; u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *tl_inode = osb->osb_tl_inode; handle_t *handle; struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_refcount_tree *ref_tree = NULL; + + if ((flags & OCFS2_EXT_REFCOUNTED) && len) { + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & + OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, + &ref_tree, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } - ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac); + ret = ocfs2_prepare_refcount_change_for_del(inode, + refcount_loc, + phys_blkno, + len, + &credits, + &extra_blocks); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac, + extra_blocks); if (ret) { mlog_errno(ret); return ret; @@ -5699,7 +5694,8 @@ int ocfs2_remove_btree_range(struct inode *inode, } } - handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); + handle = ocfs2_start_trans(osb, + ocfs2_remove_extent_credits(osb->sb) + credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); @@ -5724,15 +5720,22 @@ int ocfs2_remove_btree_range(struct inode *inode, ocfs2_et_update_clusters(et, -len); - ret = ocfs2_journal_dirty(handle, et->et_root_bh); - if (ret) { - mlog_errno(ret); - goto out_commit; - } + ocfs2_journal_dirty(handle, et->et_root_bh); - ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); - if (ret) - mlog_errno(ret); + if (phys_blkno) { + if (flags & OCFS2_EXT_REFCOUNTED) + ret = ocfs2_decrease_refcount(inode, handle, + ocfs2_blocks_to_clusters(osb->sb, + phys_blkno), + len, meta_ac, + dealloc, 1); + else + ret = ocfs2_truncate_log_append(osb, handle, + phys_blkno, len); + if (ret) + mlog_errno(ret); + + } out_commit: ocfs2_commit_trans(osb, handle); @@ -5742,6 +5745,9 @@ out: if (meta_ac) ocfs2_free_alloc_context(meta_ac); + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + return ret; } @@ -5850,11 +5856,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, } tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); - status = ocfs2_journal_dirty(handle, tl_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, tl_bh); bail: mlog_exit(status); @@ -5893,11 +5895,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, tl->tl_used = cpu_to_le16(i); - status = ocfs2_journal_dirty(handle, tl_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, tl_bh); /* TODO: Perhaps we can calculate the bulk of the * credits up front rather than extending like @@ -6298,6 +6296,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) */ struct ocfs2_cached_block_free { struct ocfs2_cached_block_free *free_next; + u64 free_bg; u64 free_blk; unsigned int free_bit; }; @@ -6344,8 +6343,11 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, } while (head) { - bg_blkno = ocfs2_which_suballoc_group(head->free_blk, - head->free_bit); + if (head->free_bg) + bg_blkno = head->free_bg; + else + bg_blkno = ocfs2_which_suballoc_group(head->free_blk, + head->free_bit); mlog(0, "Free bit: (bit %u, blkno %llu)\n", head->free_bit, (unsigned long long)head->free_blk); @@ -6393,7 +6395,7 @@ int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, int ret = 0; struct ocfs2_cached_block_free *item; - item = kmalloc(sizeof(*item), GFP_NOFS); + item = kzalloc(sizeof(*item), GFP_NOFS); if (item == NULL) { ret = -ENOMEM; mlog_errno(ret); @@ -6533,8 +6535,8 @@ ocfs2_find_per_slot_free_list(int type, } int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, - int type, int slot, u64 blkno, - unsigned int bit) + int type, int slot, u64 suballoc, + u64 blkno, unsigned int bit) { int ret; struct ocfs2_per_slot_free_list *fl; @@ -6547,7 +6549,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, goto out; } - item = kmalloc(sizeof(*item), GFP_NOFS); + item = kzalloc(sizeof(*item), GFP_NOFS); if (item == NULL) { ret = -ENOMEM; mlog_errno(ret); @@ -6557,6 +6559,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", type, slot, bit, (unsigned long long)blkno); + item->free_bg = suballoc; item->free_blk = blkno; item->free_bit = bit; item->free_next = fl->f_first; @@ -6573,433 +6576,11 @@ static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, { return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, le16_to_cpu(eb->h_suballoc_slot), + le64_to_cpu(eb->h_suballoc_loc), le64_to_cpu(eb->h_blkno), le16_to_cpu(eb->h_suballoc_bit)); } -/* This function will figure out whether the currently last extent - * block will be deleted, and if it will, what the new last extent - * block will be so we can update his h_next_leaf_blk field, as well - * as the dinodes i_last_eb_blk */ -static int ocfs2_find_new_last_ext_blk(struct inode *inode, - unsigned int clusters_to_del, - struct ocfs2_path *path, - struct buffer_head **new_last_eb) -{ - int next_free, ret = 0; - u32 cpos; - struct ocfs2_extent_rec *rec; - struct ocfs2_extent_block *eb; - struct ocfs2_extent_list *el; - struct buffer_head *bh = NULL; - - *new_last_eb = NULL; - - /* we have no tree, so of course, no last_eb. */ - if (!path->p_tree_depth) - goto out; - - /* trunc to zero special case - this makes tree_depth = 0 - * regardless of what it is. */ - if (OCFS2_I(inode)->ip_clusters == clusters_to_del) - goto out; - - el = path_leaf_el(path); - BUG_ON(!el->l_next_free_rec); - - /* - * Make sure that this extent list will actually be empty - * after we clear away the data. We can shortcut out if - * there's more than one non-empty extent in the - * list. Otherwise, a check of the remaining extent is - * necessary. - */ - next_free = le16_to_cpu(el->l_next_free_rec); - rec = NULL; - if (ocfs2_is_empty_extent(&el->l_recs[0])) { - if (next_free > 2) - goto out; - - /* We may have a valid extent in index 1, check it. */ - if (next_free == 2) - rec = &el->l_recs[1]; - - /* - * Fall through - no more nonempty extents, so we want - * to delete this leaf. - */ - } else { - if (next_free > 1) - goto out; - - rec = &el->l_recs[0]; - } - - if (rec) { - /* - * Check it we'll only be trimming off the end of this - * cluster. - */ - if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del) - goto out; - } - - ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); - if (ret) { - mlog_errno(ret); - goto out; - } - - ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh); - if (ret) { - mlog_errno(ret); - goto out; - } - - eb = (struct ocfs2_extent_block *) bh->b_data; - el = &eb->h_list; - - /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block(). - * Any corruption is a code bug. */ - BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); - - *new_last_eb = bh; - get_bh(*new_last_eb); - mlog(0, "returning block %llu, (cpos: %u)\n", - (unsigned long long)le64_to_cpu(eb->h_blkno), cpos); -out: - brelse(bh); - - return ret; -} - -/* - * Trim some clusters off the rightmost edge of a tree. Only called - * during truncate. - * - * The caller needs to: - * - start journaling of each path component. - * - compute and fully set up any new last ext block - */ -static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, - handle_t *handle, struct ocfs2_truncate_context *tc, - u32 clusters_to_del, u64 *delete_start, u8 *flags) -{ - int ret, i, index = path->p_tree_depth; - u32 new_edge = 0; - u64 deleted_eb = 0; - struct buffer_head *bh; - struct ocfs2_extent_list *el; - struct ocfs2_extent_rec *rec; - - *delete_start = 0; - *flags = 0; - - while (index >= 0) { - bh = path->p_node[index].bh; - el = path->p_node[index].el; - - mlog(0, "traveling tree (index = %d, block = %llu)\n", - index, (unsigned long long)bh->b_blocknr); - - BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); - - if (index != - (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) { - ocfs2_error(inode->i_sb, - "Inode %lu has invalid ext. block %llu", - inode->i_ino, - (unsigned long long)bh->b_blocknr); - ret = -EROFS; - goto out; - } - -find_tail_record: - i = le16_to_cpu(el->l_next_free_rec) - 1; - rec = &el->l_recs[i]; - - mlog(0, "Extent list before: record %d: (%u, %u, %llu), " - "next = %u\n", i, le32_to_cpu(rec->e_cpos), - ocfs2_rec_clusters(el, rec), - (unsigned long long)le64_to_cpu(rec->e_blkno), - le16_to_cpu(el->l_next_free_rec)); - - BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del); - - if (le16_to_cpu(el->l_tree_depth) == 0) { - /* - * If the leaf block contains a single empty - * extent and no records, we can just remove - * the block. - */ - if (i == 0 && ocfs2_is_empty_extent(rec)) { - memset(rec, 0, - sizeof(struct ocfs2_extent_rec)); - el->l_next_free_rec = cpu_to_le16(0); - - goto delete; - } - - /* - * Remove any empty extents by shifting things - * left. That should make life much easier on - * the code below. This condition is rare - * enough that we shouldn't see a performance - * hit. - */ - if (ocfs2_is_empty_extent(&el->l_recs[0])) { - le16_add_cpu(&el->l_next_free_rec, -1); - - for(i = 0; - i < le16_to_cpu(el->l_next_free_rec); i++) - el->l_recs[i] = el->l_recs[i + 1]; - - memset(&el->l_recs[i], 0, - sizeof(struct ocfs2_extent_rec)); - - /* - * We've modified our extent list. The - * simplest way to handle this change - * is to being the search from the - * start again. - */ - goto find_tail_record; - } - - le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del); - - /* - * We'll use "new_edge" on our way back up the - * tree to know what our rightmost cpos is. - */ - new_edge = le16_to_cpu(rec->e_leaf_clusters); - new_edge += le32_to_cpu(rec->e_cpos); - - /* - * The caller will use this to delete data blocks. - */ - *delete_start = le64_to_cpu(rec->e_blkno) - + ocfs2_clusters_to_blocks(inode->i_sb, - le16_to_cpu(rec->e_leaf_clusters)); - *flags = rec->e_flags; - - /* - * If it's now empty, remove this record. - */ - if (le16_to_cpu(rec->e_leaf_clusters) == 0) { - memset(rec, 0, - sizeof(struct ocfs2_extent_rec)); - le16_add_cpu(&el->l_next_free_rec, -1); - } - } else { - if (le64_to_cpu(rec->e_blkno) == deleted_eb) { - memset(rec, 0, - sizeof(struct ocfs2_extent_rec)); - le16_add_cpu(&el->l_next_free_rec, -1); - - goto delete; - } - - /* Can this actually happen? */ - if (le16_to_cpu(el->l_next_free_rec) == 0) - goto delete; - - /* - * We never actually deleted any clusters - * because our leaf was empty. There's no - * reason to adjust the rightmost edge then. - */ - if (new_edge == 0) - goto delete; - - rec->e_int_clusters = cpu_to_le32(new_edge); - le32_add_cpu(&rec->e_int_clusters, - -le32_to_cpu(rec->e_cpos)); - - /* - * A deleted child record should have been - * caught above. - */ - BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0); - } - -delete: - ret = ocfs2_journal_dirty(handle, bh); - if (ret) { - mlog_errno(ret); - goto out; - } - - mlog(0, "extent list container %llu, after: record %d: " - "(%u, %u, %llu), next = %u.\n", - (unsigned long long)bh->b_blocknr, i, - le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec), - (unsigned long long)le64_to_cpu(rec->e_blkno), - le16_to_cpu(el->l_next_free_rec)); - - /* - * We must be careful to only attempt delete of an - * extent block (and not the root inode block). - */ - if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) { - struct ocfs2_extent_block *eb = - (struct ocfs2_extent_block *)bh->b_data; - - /* - * Save this for use when processing the - * parent block. - */ - deleted_eb = le64_to_cpu(eb->h_blkno); - - mlog(0, "deleting this extent block.\n"); - - ocfs2_remove_from_cache(INODE_CACHE(inode), bh); - - BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0])); - BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); - BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); - - ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb); - /* An error here is not fatal. */ - if (ret < 0) - mlog_errno(ret); - } else { - deleted_eb = 0; - } - - index--; - } - - ret = 0; -out: - return ret; -} - -static int ocfs2_do_truncate(struct ocfs2_super *osb, - unsigned int clusters_to_del, - struct inode *inode, - struct buffer_head *fe_bh, - handle_t *handle, - struct ocfs2_truncate_context *tc, - struct ocfs2_path *path, - struct ocfs2_alloc_context *meta_ac) -{ - int status; - struct ocfs2_dinode *fe; - struct ocfs2_extent_block *last_eb = NULL; - struct ocfs2_extent_list *el; - struct buffer_head *last_eb_bh = NULL; - u64 delete_blk = 0; - u8 rec_flags; - - fe = (struct ocfs2_dinode *) fe_bh->b_data; - - status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del, - path, &last_eb_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - /* - * Each component will be touched, so we might as well journal - * here to avoid having to handle errors later. - */ - status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - if (last_eb_bh) { - status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - } - - el = &(fe->id2.i_list); - - /* - * Lower levels depend on this never happening, but it's best - * to check it up here before changing the tree. - */ - if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) { - ocfs2_error(inode->i_sb, - "Inode %lu has an empty extent record, depth %u\n", - inode->i_ino, le16_to_cpu(el->l_tree_depth)); - status = -EROFS; - goto bail; - } - - dquot_free_space_nodirty(inode, - ocfs2_clusters_to_bytes(osb->sb, clusters_to_del)); - spin_lock(&OCFS2_I(inode)->ip_lock); - OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - - clusters_to_del; - spin_unlock(&OCFS2_I(inode)->ip_lock); - le32_add_cpu(&fe->i_clusters, -clusters_to_del); - inode->i_blocks = ocfs2_inode_sector_count(inode); - - status = ocfs2_trim_tree(inode, path, handle, tc, - clusters_to_del, &delete_blk, &rec_flags); - if (status) { - mlog_errno(status); - goto bail; - } - - if (le32_to_cpu(fe->i_clusters) == 0) { - /* trunc to zero is a special case. */ - el->l_tree_depth = 0; - fe->i_last_eb_blk = 0; - } else if (last_eb) - fe->i_last_eb_blk = last_eb->h_blkno; - - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - if (last_eb) { - /* If there will be a new last extent block, then by - * definition, there cannot be any leaves to the right of - * him. */ - last_eb->h_next_leaf_blk = 0; - status = ocfs2_journal_dirty(handle, last_eb_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - - if (delete_blk) { - if (rec_flags & OCFS2_EXT_REFCOUNTED) - status = ocfs2_decrease_refcount(inode, handle, - ocfs2_blocks_to_clusters(osb->sb, - delete_blk), - clusters_to_del, meta_ac, - &tc->tc_dealloc, 1); - else - status = ocfs2_truncate_log_append(osb, handle, - delete_blk, - clusters_to_del); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - status = 0; -bail: - brelse(last_eb_bh); - mlog_exit(status); - return status; -} - static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) { set_buffer_uptodate(bh); @@ -7307,7 +6888,9 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, goto out_commit; did_quota = 1; - ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, + data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + + ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &num); if (ret) { mlog_errno(ret); @@ -7406,26 +6989,29 @@ out: */ int ocfs2_commit_truncate(struct ocfs2_super *osb, struct inode *inode, - struct buffer_head *fe_bh, - struct ocfs2_truncate_context *tc) + struct buffer_head *di_bh) { - int status, i, credits, tl_sem = 0; - u32 clusters_to_del, new_highest_cpos, range; + int status = 0, i, flags = 0; + u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff; u64 blkno = 0; struct ocfs2_extent_list *el; - handle_t *handle = NULL; - struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_extent_rec *rec; struct ocfs2_path *path = NULL; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; - struct ocfs2_alloc_context *meta_ac = NULL; - struct ocfs2_refcount_tree *ref_tree = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_list *root_el = &(di->id2.i_list); + u64 refcount_loc = le64_to_cpu(di->i_refcount_loc); + struct ocfs2_extent_tree et; + struct ocfs2_cached_dealloc_ctxt dealloc; mlog_entry_void(); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + ocfs2_init_dealloc_ctxt(&dealloc); + new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, i_size_read(inode)); - path = ocfs2_new_path(fe_bh, &di->id2.i_list, + path = ocfs2_new_path(di_bh, &di->id2.i_list, ocfs2_journal_access_di); if (!path) { status = -ENOMEM; @@ -7444,8 +7030,6 @@ start: goto bail; } - credits = 0; - /* * Truncate always works against the rightmost tree branch. */ @@ -7480,101 +7064,62 @@ start: } i = le16_to_cpu(el->l_next_free_rec) - 1; - range = le32_to_cpu(el->l_recs[i].e_cpos) + - ocfs2_rec_clusters(el, &el->l_recs[i]); - if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) { - clusters_to_del = 0; - } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { - clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); - blkno = le64_to_cpu(el->l_recs[i].e_blkno); + rec = &el->l_recs[i]; + flags = rec->e_flags; + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + + if (i == 0 && ocfs2_is_empty_extent(rec)) { + /* + * Lower levels depend on this never happening, but it's best + * to check it up here before changing the tree. + */ + if (root_el->l_tree_depth && rec->e_int_clusters == 0) { + ocfs2_error(inode->i_sb, "Inode %lu has an empty " + "extent record, depth %u\n", inode->i_ino, + le16_to_cpu(root_el->l_tree_depth)); + status = -EROFS; + goto bail; + } + trunc_cpos = le32_to_cpu(rec->e_cpos); + trunc_len = 0; + blkno = 0; + } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) { + /* + * Truncate entire record. + */ + trunc_cpos = le32_to_cpu(rec->e_cpos); + trunc_len = ocfs2_rec_clusters(el, rec); + blkno = le64_to_cpu(rec->e_blkno); } else if (range > new_highest_cpos) { - clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + - le32_to_cpu(el->l_recs[i].e_cpos)) - - new_highest_cpos; - blkno = le64_to_cpu(el->l_recs[i].e_blkno) + - ocfs2_clusters_to_blocks(inode->i_sb, - ocfs2_rec_clusters(el, &el->l_recs[i]) - - clusters_to_del); + /* + * Partial truncate. it also should be + * the last truncate we're doing. + */ + trunc_cpos = new_highest_cpos; + trunc_len = range - new_highest_cpos; + coff = new_highest_cpos - le32_to_cpu(rec->e_cpos); + blkno = le64_to_cpu(rec->e_blkno) + + ocfs2_clusters_to_blocks(inode->i_sb, coff); } else { + /* + * Truncate completed, leave happily. + */ status = 0; goto bail; } - mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", - clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); - - if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) { - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); - - status = ocfs2_lock_refcount_tree(osb, - le64_to_cpu(di->i_refcount_loc), - 1, &ref_tree, NULL); - if (status) { - mlog_errno(status); - goto bail; - } - - status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh, - blkno, - clusters_to_del, - &credits, - &meta_ac); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - - mutex_lock(&tl_inode->i_mutex); - tl_sem = 1; - /* ocfs2_truncate_log_needs_flush guarantees us at least one - * record is free for use. If there isn't any, we flush to get - * an empty truncate log. */ - if (ocfs2_truncate_log_needs_flush(osb)) { - status = __ocfs2_flush_truncate_log(osb); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } + phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno); - credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, - (struct ocfs2_dinode *)fe_bh->b_data, - el); - handle = ocfs2_start_trans(osb, credits); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - handle = NULL; - mlog_errno(status); - goto bail; - } - - status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle, - tc, path, meta_ac); + status = ocfs2_remove_btree_range(inode, &et, trunc_cpos, + phys_cpos, trunc_len, flags, &dealloc, + refcount_loc); if (status < 0) { mlog_errno(status); goto bail; } - mutex_unlock(&tl_inode->i_mutex); - tl_sem = 0; - - ocfs2_commit_trans(osb, handle); - handle = NULL; - ocfs2_reinit_path(path, 1); - if (meta_ac) { - ocfs2_free_alloc_context(meta_ac); - meta_ac = NULL; - } - - if (ref_tree) { - ocfs2_unlock_refcount_tree(osb, ref_tree, 1); - ref_tree = NULL; - } - /* * The check above will catch the case where we've truncated * away all allocation. @@ -7585,25 +7130,10 @@ bail: ocfs2_schedule_truncate_log_flush(osb, 1); - if (tl_sem) - mutex_unlock(&tl_inode->i_mutex); - - if (handle) - ocfs2_commit_trans(osb, handle); - - if (meta_ac) - ocfs2_free_alloc_context(meta_ac); - - if (ref_tree) - ocfs2_unlock_refcount_tree(osb, ref_tree, 1); - - ocfs2_run_deallocs(osb, &tc->tc_dealloc); + ocfs2_run_deallocs(osb, &dealloc); ocfs2_free_path(path); - /* This will drop the ext_alloc cluster lock for us */ - ocfs2_free_truncate_context(tc); - mlog_exit(status); return status; } diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 1db4359..55762b5 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -140,8 +140,9 @@ int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et, struct ocfs2_cached_dealloc_ctxt *dealloc); int ocfs2_remove_btree_range(struct inode *inode, struct ocfs2_extent_tree *et, - u32 cpos, u32 phys_cpos, u32 len, - struct ocfs2_cached_dealloc_ctxt *dealloc); + u32 cpos, u32 phys_cpos, u32 len, int flags, + struct ocfs2_cached_dealloc_ctxt *dealloc, + u64 refcount_loc); int ocfs2_num_free_extents(struct ocfs2_super *osb, struct ocfs2_extent_tree *et); @@ -209,7 +210,7 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c) int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, u64 blkno, unsigned int bit); int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, - int type, int slot, u64 blkno, + int type, int slot, u64 suballoc, u64 blkno, unsigned int bit); static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c) { @@ -233,8 +234,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, struct ocfs2_truncate_context **tc); int ocfs2_commit_truncate(struct ocfs2_super *osb, struct inode *inode, - struct buffer_head *fe_bh, - struct ocfs2_truncate_context *tc); + struct buffer_head *di_bh); int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, unsigned int start, unsigned int end, int trunc); @@ -319,6 +319,8 @@ int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, struct ocfs2_path *path); int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, struct ocfs2_path *path, u32 *cpos); +int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos); int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, struct ocfs2_path *left, struct ocfs2_path *right); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 21441dd..3623ca2 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1735,6 +1735,9 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, goto out; } + if (data_ac) + data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list, clusters_to_alloc); diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 3bb928a..c7fba39 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -116,6 +116,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(ERROR), define_mask(NOTICE), define_mask(KTHREAD), + define_mask(RESERVATIONS), }; static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 3dfddbe..fd96e2a 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -119,6 +119,7 @@ #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ #define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ +#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */ #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 73e743e..aa75ca3 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -583,6 +583,9 @@ static void o2net_state_change(struct sock *sk) o2net_sc_queue_work(sc, &sc->sc_connect_work); break; default: + printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT + " shutdown, state %d\n", + SC_NODEF_ARGS(sc), sk->sk_state); o2net_sc_queue_work(sc, &sc->sc_shutdown_work); break; } diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index efd77d0..f04ebcf 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1194,7 +1194,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, else de->inode = 0; dir->i_version++; - status = ocfs2_journal_dirty(handle, bh); + ocfs2_journal_dirty(handle, bh); goto bail; } i += le16_to_cpu(de->rec_len); @@ -1752,7 +1752,7 @@ int __ocfs2_add_entry(handle_t *handle, ocfs2_recalc_free_list(dir, handle, lookup); dir->i_version++; - status = ocfs2_journal_dirty(handle, insert_bh); + ocfs2_journal_dirty(handle, insert_bh); retval = 0; goto bail; } @@ -2297,12 +2297,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb, } ocfs2_fill_initial_dirents(inode, parent, data->id_data, size); - ocfs2_journal_dirty(handle, di_bh); - if (ret) { - mlog_errno(ret); - goto out; - } i_size_write(inode, size); inode->i_nlink = 2; @@ -2366,11 +2361,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, ocfs2_init_dir_trailer(inode, new_bh, size); } - status = ocfs2_journal_dirty(handle, new_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, new_bh); i_size_write(inode, inode->i_sb->s_blocksize); inode->i_nlink = 2; @@ -2404,15 +2395,15 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, int ret; struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; u16 dr_suballoc_bit; - u64 dr_blkno; + u64 suballoc_loc, dr_blkno; unsigned int num_bits; struct buffer_head *dx_root_bh = NULL; struct ocfs2_dx_root_block *dx_root; struct ocfs2_dir_block_trailer *trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); - ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit, - &num_bits, &dr_blkno); + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, + &dr_suballoc_bit, &num_bits, &dr_blkno); if (ret) { mlog_errno(ret); goto out; @@ -2440,6 +2431,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, memset(dx_root, 0, osb->sb->s_blocksize); strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc); dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); dx_root->dr_blkno = cpu_to_le64(dr_blkno); @@ -2458,10 +2450,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, dx_root->dr_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); } - - ret = ocfs2_journal_dirty(handle, dx_root_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, dx_root_bh); ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, OCFS2_JOURNAL_ACCESS_CREATE); @@ -2475,9 +2464,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); - ret = ocfs2_journal_dirty(handle, di_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, di_bh); *ret_dx_root_bh = dx_root_bh; dx_root_bh = NULL; @@ -2558,7 +2545,7 @@ static int __ocfs2_dx_dir_new_cluster(struct inode *dir, * chance of contiguousness as the directory grows in number * of entries. */ - ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num); + ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num); if (ret) { mlog_errno(ret); goto out; @@ -2991,7 +2978,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, * if we only get one now, that's enough to continue. The rest * will be claimed after the conversion to extents. */ - ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); + if (ocfs2_dir_resv_allowed(osb)) + data_ac->ac_resv = &oi->ip_la_data_resv; + ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len); if (ret) { mlog_errno(ret); goto out_commit; @@ -3034,11 +3023,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, ocfs2_init_dir_trailer(dir, dirdata_bh, i); } - ret = ocfs2_journal_dirty(handle, dirdata_bh); - if (ret) { - mlog_errno(ret); - goto out_commit; - } + ocfs2_journal_dirty(handle, dirdata_bh); if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { /* @@ -3104,11 +3089,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, */ dir->i_blocks = ocfs2_inode_sector_count(dir); - ret = ocfs2_journal_dirty(handle, di_bh); - if (ret) { - mlog_errno(ret); - goto out_commit; - } + ocfs2_journal_dirty(handle, di_bh); if (ocfs2_supports_indexed_dirs(osb)) { ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh, @@ -3138,7 +3119,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, * pass. Claim the 2nd cluster as a separate extent. */ if (alloc > len) { - ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, + ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len); if (ret) { mlog_errno(ret); @@ -3369,6 +3350,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, goto bail; } + if (ocfs2_dir_resv_allowed(osb)) + data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv; + credits = ocfs2_calc_extend_credits(sb, el, 1); } else { spin_unlock(&OCFS2_I(dir)->ip_lock); @@ -3423,11 +3407,7 @@ do_extend: } else { de->rec_len = cpu_to_le16(sb->s_blocksize); } - status = ocfs2_journal_dirty(handle, new_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, new_bh); dir_i_size += dir->i_sb->s_blocksize; i_size_write(dir, dir_i_size); @@ -3906,11 +3886,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp, dx_leaf_sort_swap); - ret = ocfs2_journal_dirty(handle, dx_leaf_bh); - if (ret) { - mlog_errno(ret); - goto out_commit; - } + ocfs2_journal_dirty(handle, dx_leaf_bh); ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash, &split_hash); @@ -4490,7 +4466,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir, blk = le64_to_cpu(dx_root->dr_blkno); bit = le16_to_cpu(dx_root->dr_suballoc_bit); - bg_blkno = ocfs2_which_suballoc_group(blk, bit); + if (dx_root->dr_suballoc_loc) + bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc); + else + bg_blkno = ocfs2_which_suballoc_group(blk, bit); ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh, bit, bg_blkno, 1); if (ret) @@ -4551,8 +4530,8 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh) p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno); - ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, - &dealloc); + ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0, + &dealloc, 0); if (ret) { mlog_errno(ret); goto out; diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 12d5eb7..f449991 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -88,7 +88,7 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) return 0; } -static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) { mlog_entry_void(); @@ -145,7 +145,7 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) } -static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) { mlog_entry_void(); @@ -451,7 +451,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, lock->ml.node, &status); if (ret < 0) - mlog_errno(ret); + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key, + lock->ml.node); else { if (status == DLM_RECOVERING) { mlog(ML_ERROR, "sent AST to node %u, it thinks this " diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 0102be3..4b6ae2c 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -37,7 +37,7 @@ #define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes #define DLM_THREAD_MS 200 // flush at least every 200 ms -#define DLM_HASH_SIZE_DEFAULT (1 << 14) +#define DLM_HASH_SIZE_DEFAULT (1 << 17) #if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE # define DLM_HASH_PAGES 1 #else @@ -904,6 +904,8 @@ void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); +void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); +void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_lock *lock); diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 90803b4..9f30491 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -390,7 +390,9 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED) dlm_error(ret); } else { - mlog_errno(tmpret); + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key, + res->owner); if (dlm_is_host_down(tmpret)) { /* instead of logging the same network error over * and over, sleep here and wait for the heartbeat diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 988c905..6b5a492 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -511,7 +511,7 @@ static void __dlm_print_nodes(struct dlm_ctxt *dlm) assert_spin_locked(&dlm->spinlock); - printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name); + printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name); while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1)) < O2NM_MAX_NODES) { @@ -534,7 +534,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, node = exit_msg->node_idx; - printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name); + printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name); spin_lock(&dlm->spinlock); clear_bit(node, dlm->domain_map); @@ -565,7 +565,9 @@ static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, &leave_msg, sizeof(leave_msg), node, NULL); - + if (status < 0) + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node); mlog(0, "status return %d from o2net_send_message\n", status); return status; @@ -904,7 +906,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, set_bit(assert->node_idx, dlm->domain_map); __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); - printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n", + printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", assert->node_idx, dlm->name); __dlm_print_nodes(dlm); @@ -962,7 +964,9 @@ static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm, &cancel_msg, sizeof(cancel_msg), node, NULL); if (status < 0) { - mlog_errno(status); + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", status, DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, + node); goto bail; } @@ -1029,10 +1033,11 @@ static int dlm_request_join(struct dlm_ctxt *dlm, byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES); status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, - sizeof(join_msg), node, - &join_resp); + sizeof(join_msg), node, &join_resp); if (status < 0 && status != -ENOPROTOOPT) { - mlog_errno(status); + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", status, DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, + node); goto bail; } dlm_query_join_wire_to_packet(join_resp, &packet); @@ -1103,7 +1108,9 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm, &assert_msg, sizeof(assert_msg), node, NULL); if (status < 0) - mlog_errno(status); + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, + node); return status; } @@ -1516,7 +1523,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, goto leave; } - dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL); + dlm->name = kstrdup(domain, GFP_KERNEL); if (dlm->name == NULL) { mlog_errno(-ENOMEM); kfree(dlm); @@ -1550,7 +1557,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, for (i = 0; i < DLM_HASH_BUCKETS; i++) INIT_HLIST_HEAD(dlm_master_hash(dlm, i)); - strcpy(dlm->name, domain); dlm->key = key; dlm->node_num = o2nm_this_node(); diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 73333777..69cf369 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -329,7 +329,9 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, BUG(); } } else { - mlog_errno(tmpret); + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key, + res->owner); if (dlm_is_host_down(tmpret)) { ret = DLM_RECOVERING; mlog(0, "node %u died so returning DLM_RECOVERING " @@ -429,7 +431,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, struct dlm_lock *lock; int kernel_allocated = 0; - lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS); + lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS); if (!lock) return NULL; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 9289b43..4a7506a 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -617,13 +617,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, { struct dlm_lock_resource *res = NULL; - res = (struct dlm_lock_resource *) - kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS); + res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS); if (!res) goto error; - res->lockname.name = (char *) - kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS); + res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS); if (!res->lockname.name) goto error; @@ -757,8 +755,7 @@ lookup: spin_unlock(&dlm->spinlock); mlog(0, "allocating a new resource\n"); /* nothing found and we need to allocate one. */ - alloc_mle = (struct dlm_master_list_entry *) - kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); + alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); if (!alloc_mle) goto leave; res = dlm_new_lockres(dlm, lockid, namelen); @@ -1542,8 +1539,7 @@ way_up_top: spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); - mle = (struct dlm_master_list_entry *) - kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); + mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); if (!mle) { response = DLM_MASTER_RESP_ERROR; mlog_errno(-ENOMEM); @@ -1666,7 +1662,9 @@ again: tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, &assert, sizeof(assert), to, &r); if (tmpret < 0) { - mlog(0, "assert_master returned %d!\n", tmpret); + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", tmpret, + DLM_ASSERT_MASTER_MSG, dlm->key, to); if (!dlm_is_host_down(tmpret)) { mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); BUG(); @@ -2205,7 +2203,9 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, &deref, sizeof(deref), res->owner, &r); if (ret < 0) - mlog_errno(ret); + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key, + res->owner); else if (r < 0) { /* BAD. other node says I did not have a ref. */ mlog(ML_ERROR,"while dropping ref on %s:%.*s " @@ -2452,8 +2452,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, goto leave; } - mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, - GFP_NOFS); + mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); if (!mle) { mlog_errno(ret); goto leave; @@ -2975,7 +2974,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, &migrate, sizeof(migrate), nodenum, &status); if (ret < 0) { - mlog(0, "migrate_request returned %d!\n", ret); + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG, + dlm->key, nodenum); if (!dlm_is_host_down(ret)) { mlog(ML_ERROR, "unhandled error=%d!\n", ret); BUG(); @@ -3033,8 +3034,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, hash = dlm_lockid_hash(name, namelen); /* preallocate.. if this fails, abort */ - mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, - GFP_NOFS); + mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); if (!mle) { ret = -ENOMEM; diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index b4f99de..f8b75ce 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -803,7 +803,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, /* negative status is handled by caller */ if (ret < 0) - mlog_errno(ret); + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG, + dlm->key, request_from); // return from here, then // sleep until all received or error @@ -955,10 +957,10 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, sizeof(done_msg), send_to, &tmpret); if (ret < 0) { + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG, + dlm->key, send_to); if (!dlm_is_host_down(ret)) { - mlog_errno(ret); - mlog(ML_ERROR, "%s: unknown error sending data-done " - "to %u\n", dlm->name, send_to); BUG(); } } else @@ -1126,7 +1128,9 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, if (ret < 0) { /* XXX: negative status is not handled. * this will end up killing this node. */ - mlog_errno(ret); + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG, + dlm->key, send_to); } else { /* might get an -ENOMEM back here */ ret = status; @@ -1642,7 +1646,9 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, &req, sizeof(req), nodenum, &status); /* XXX: negative status not handled properly here. */ if (ret < 0) - mlog_errno(ret); + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG, + dlm->key, nodenum); else { BUG_ON(status < 0); BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); @@ -2640,7 +2646,7 @@ retry: if (dlm_is_host_down(ret)) { /* node is down. not involved in recovery * so just keep going */ - mlog(0, "%s: node %u was down when sending " + mlog(ML_NOTICE, "%s: node %u was down when sending " "begin reco msg (%d)\n", dlm->name, nodenum, ret); ret = 0; } @@ -2660,11 +2666,12 @@ retry: } if (ret < 0) { struct dlm_lock_resource *res; + /* this is now a serious problem, possibly ENOMEM * in the network stack. must retry */ mlog_errno(ret); mlog(ML_ERROR, "begin reco of dlm %s to node %u " - " returned %d\n", dlm->name, nodenum, ret); + "returned %d\n", dlm->name, nodenum, ret); res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, DLM_RECOVERY_LOCK_NAME_LEN); if (res) { @@ -2789,7 +2796,9 @@ stage2: if (ret >= 0) ret = status; if (ret < 0) { - mlog_errno(ret); + mlog(ML_ERROR, "Error %d when sending message %u (key " + "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG, + dlm->key, nodenum); if (dlm_is_host_down(ret)) { /* this has no effect on this recovery * session, so set the status to zero to diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 11a6d1f..d4f73ca 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -309,6 +309,7 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm, * spinlock, and because we know that it is not migrating/ * recovering/in-progress, it is fine to reserve asts and * basts right before queueing them all throughout */ + assert_spin_locked(&dlm->ast_lock); assert_spin_locked(&res->spinlock); BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| DLM_LOCK_RES_RECOVERING| @@ -337,7 +338,7 @@ converting: /* queue the BAST if not already */ if (lock->ml.highest_blocked == LKM_IVMODE) { __dlm_lockres_reserve_ast(res); - dlm_queue_bast(dlm, lock); + __dlm_queue_bast(dlm, lock); } /* update the highest_blocked if needed */ if (lock->ml.highest_blocked < target->ml.convert_type) @@ -355,7 +356,7 @@ converting: can_grant = 0; if (lock->ml.highest_blocked == LKM_IVMODE) { __dlm_lockres_reserve_ast(res); - dlm_queue_bast(dlm, lock); + __dlm_queue_bast(dlm, lock); } if (lock->ml.highest_blocked < target->ml.convert_type) lock->ml.highest_blocked = @@ -383,7 +384,7 @@ converting: spin_unlock(&target->spinlock); __dlm_lockres_reserve_ast(res); - dlm_queue_ast(dlm, target); + __dlm_queue_ast(dlm, target); /* go back and check for more */ goto converting; } @@ -402,7 +403,7 @@ blocked: can_grant = 0; if (lock->ml.highest_blocked == LKM_IVMODE) { __dlm_lockres_reserve_ast(res); - dlm_queue_bast(dlm, lock); + __dlm_queue_bast(dlm, lock); } if (lock->ml.highest_blocked < target->ml.type) lock->ml.highest_blocked = target->ml.type; @@ -418,7 +419,7 @@ blocked: can_grant = 0; if (lock->ml.highest_blocked == LKM_IVMODE) { __dlm_lockres_reserve_ast(res); - dlm_queue_bast(dlm, lock); + __dlm_queue_bast(dlm, lock); } if (lock->ml.highest_blocked < target->ml.type) lock->ml.highest_blocked = target->ml.type; @@ -444,7 +445,7 @@ blocked: spin_unlock(&target->spinlock); __dlm_lockres_reserve_ast(res); - dlm_queue_ast(dlm, target); + __dlm_queue_ast(dlm, target); /* go back and check for more */ goto converting; } @@ -674,6 +675,7 @@ static int dlm_thread(void *data) /* lockres can be re-dirtied/re-added to the * dirty_list in this gap, but that is ok */ + spin_lock(&dlm->ast_lock); spin_lock(&res->spinlock); if (res->owner != dlm->node_num) { __dlm_print_one_lock_resource(res); @@ -694,6 +696,7 @@ static int dlm_thread(void *data) /* move it to the tail and keep going */ res->state &= ~DLM_LOCK_RES_DIRTY; spin_unlock(&res->spinlock); + spin_unlock(&dlm->ast_lock); mlog(0, "delaying list shuffling for in-" "progress lockres %.*s, state=%d\n", res->lockname.len, res->lockname.name, @@ -715,6 +718,7 @@ static int dlm_thread(void *data) dlm_shuffle_lists(dlm, res); res->state &= ~DLM_LOCK_RES_DIRTY; spin_unlock(&res->spinlock); + spin_unlock(&dlm->ast_lock); dlm_lockres_calc_usage(dlm, res); diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index b47c1b9..817287c 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -354,7 +354,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, mlog(0, "master was in-progress. retry\n"); ret = status; } else { - mlog_errno(tmpret); + mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " + "node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner); if (dlm_is_host_down(tmpret)) { /* NOTE: this seems strange, but it is what we want. * when the master goes down during a cancel or diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index a5fbd9c..f74f140 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -278,10 +278,7 @@ int ocfs2_update_inode_atime(struct inode *inode, inode->i_atime = CURRENT_TIME; di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); - - ret = ocfs2_journal_dirty(handle, bh); - if (ret < 0) - mlog_errno(ret); + ocfs2_journal_dirty(handle, bh); out_commit: ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); @@ -430,9 +427,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) - mlog_errno(status); + ocfs2_journal_dirty(handle, fe_bh); out_commit: ocfs2_commit_trans(osb, handle); @@ -449,7 +444,6 @@ static int ocfs2_truncate_file(struct inode *inode, int status = 0; struct ocfs2_dinode *fe = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_truncate_context *tc = NULL; mlog_entry("(inode = %llu, new_i_size = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -488,6 +482,9 @@ static int ocfs2_truncate_file(struct inode *inode, down_write(&OCFS2_I(inode)->ip_alloc_sem); + ocfs2_resv_discard(&osb->osb_la_resmap, + &OCFS2_I(inode)->ip_la_data_resv); + /* * The inode lock forced other nodes to sync and drop their * pages, which (correctly) happens even if we have a truncate @@ -517,13 +514,7 @@ static int ocfs2_truncate_file(struct inode *inode, goto bail_unlock_sem; } - status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); - if (status < 0) { - mlog_errno(status); - goto bail_unlock_sem; - } - - status = ocfs2_commit_truncate(osb, inode, di_bh, tc); + status = ocfs2_commit_truncate(osb, inode, di_bh); if (status < 0) { mlog_errno(status); goto bail_unlock_sem; @@ -666,11 +657,7 @@ restarted_transaction: goto leave; } - status = ocfs2_journal_dirty(handle, bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } + ocfs2_journal_dirty(handle, bh); spin_lock(&OCFS2_I(inode)->ip_lock); clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); @@ -1195,9 +1182,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode, di = (struct ocfs2_dinode *) bh->b_data; di->i_mode = cpu_to_le16(inode->i_mode); - ret = ocfs2_journal_dirty(handle, bh); - if (ret < 0) - mlog_errno(ret); + ocfs2_journal_dirty(handle, bh); out_trans: ocfs2_commit_trans(osb, handle); @@ -1434,16 +1419,90 @@ out: return ret; } +static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos) +{ + int i; + struct ocfs2_extent_rec *rec = NULL; + + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) < pos) + break; + } + + return i; +} + +/* + * Helper to calculate the punching pos and length in one run, we handle the + * following three cases in order: + * + * - remove the entire record + * - remove a partial record + * - no record needs to be removed (hole-punching completed) +*/ +static void ocfs2_calc_trunc_pos(struct inode *inode, + struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *rec, + u32 trunc_start, u32 *trunc_cpos, + u32 *trunc_len, u32 *trunc_end, + u64 *blkno, int *done) +{ + int ret = 0; + u32 coff, range; + + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + + if (le32_to_cpu(rec->e_cpos) >= trunc_start) { + *trunc_cpos = le32_to_cpu(rec->e_cpos); + /* + * Skip holes if any. + */ + if (range < *trunc_end) + *trunc_end = range; + *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos); + *blkno = le64_to_cpu(rec->e_blkno); + *trunc_end = le32_to_cpu(rec->e_cpos); + } else if (range > trunc_start) { + *trunc_cpos = trunc_start; + *trunc_len = *trunc_end - trunc_start; + coff = trunc_start - le32_to_cpu(rec->e_cpos); + *blkno = le64_to_cpu(rec->e_blkno) + + ocfs2_clusters_to_blocks(inode->i_sb, coff); + *trunc_end = trunc_start; + } else { + /* + * It may have two following possibilities: + * + * - last record has been removed + * - trunc_start was within a hole + * + * both two cases mean the completion of hole punching. + */ + ret = 1; + } + + *done = ret; +} + static int ocfs2_remove_inode_range(struct inode *inode, struct buffer_head *di_bh, u64 byte_start, u64 byte_len) { - int ret = 0; - u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size; + int ret = 0, flags = 0, done = 0, i; + u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos; + u32 cluster_in_el; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_cached_dealloc_ctxt dealloc; struct address_space *mapping = inode->i_mapping; struct ocfs2_extent_tree et; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el = NULL; + struct ocfs2_extent_rec *rec = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc); ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); ocfs2_init_dealloc_ctxt(&dealloc); @@ -1469,17 +1528,35 @@ static int ocfs2_remove_inode_range(struct inode *inode, goto out; } + /* + * For reflinks, we may need to CoW 2 clusters which might be + * partially zero'd later, if hole's start and end offset were + * within one cluster(means is not exactly aligned to clustersize). + */ + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { + + ret = ocfs2_cow_file_pos(inode, di_bh, byte_start); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len); + if (ret) { + mlog_errno(ret); + goto out; + } + } + trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); - trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits; - if (trunc_len >= trunc_start) - trunc_len -= trunc_start; - else - trunc_len = 0; + trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits; + cluster_in_el = trunc_end; - mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n", + mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)byte_start, - (unsigned long long)byte_len, trunc_start, trunc_len); + (unsigned long long)byte_len, trunc_start, trunc_end); ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); if (ret) { @@ -1487,31 +1564,79 @@ static int ocfs2_remove_inode_range(struct inode *inode, goto out; } - cpos = trunc_start; - while (trunc_len) { - ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, - &alloc_size, NULL); + path = ocfs2_new_path_from_et(&et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + while (trunc_end > trunc_start) { + + ret = ocfs2_find_path(INODE_CACHE(inode), path, + cluster_in_el); if (ret) { mlog_errno(ret); goto out; } - if (alloc_size > trunc_len) - alloc_size = trunc_len; + el = path_leaf_el(path); - /* Only do work for non-holes */ - if (phys_cpos != 0) { - ret = ocfs2_remove_btree_range(inode, &et, cpos, - phys_cpos, alloc_size, - &dealloc); + i = ocfs2_find_rec(el, trunc_end); + /* + * Need to go to previous extent block. + */ + if (i < 0) { + if (path->p_tree_depth == 0) + break; + + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, + path, + &cluster_in_el); if (ret) { mlog_errno(ret); goto out; } + + /* + * We've reached the leftmost extent block, + * it's safe to leave. + */ + if (cluster_in_el == 0) + break; + + /* + * The 'pos' searched for previous extent block is + * always one cluster less than actual trunc_end. + */ + trunc_end = cluster_in_el + 1; + + ocfs2_reinit_path(path, 1); + + continue; + + } else + rec = &el->l_recs[i]; + + ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos, + &trunc_len, &trunc_end, &blkno, &done); + if (done) + break; + + flags = rec->e_flags; + phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno); + + ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos, + phys_cpos, trunc_len, flags, + &dealloc, refcount_loc); + if (ret < 0) { + mlog_errno(ret); + goto out; } - cpos += alloc_size; - trunc_len -= alloc_size; + cluster_in_el = trunc_end; + + ocfs2_reinit_path(path, 1); } ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index af18988..abb0a95 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -376,6 +376,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, OCFS2_I(inode)->ip_last_used_slot = 0; OCFS2_I(inode)->ip_last_used_group = 0; + + if (S_ISDIR(inode->i_mode)) + ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv, + OCFS2_RESV_FLAG_DIR); mlog_exit_void(); } @@ -539,7 +543,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, struct buffer_head *fe_bh) { int status = 0; - struct ocfs2_truncate_context *tc = NULL; struct ocfs2_dinode *fe; handle_t *handle = NULL; @@ -582,13 +585,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, ocfs2_commit_trans(osb, handle); handle = NULL; - status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); - if (status < 0) { - mlog_errno(status); - goto out; - } - - status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); + status = ocfs2_commit_truncate(osb, inode, fe_bh); if (status < 0) { mlog_errno(status); goto out; @@ -659,12 +656,7 @@ static int ocfs2_remove_inode(struct inode *inode, di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); - - status = ocfs2_journal_dirty(handle, di_bh); - if (status < 0) { - mlog_errno(status); - goto bail_commit; - } + ocfs2_journal_dirty(handle, di_bh); ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); dquot_free_inode(inode); @@ -980,7 +972,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode, void ocfs2_delete_inode(struct inode *inode) { int wipe, status; - sigset_t blocked, oldset; + sigset_t oldset; struct buffer_head *di_bh = NULL; mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); @@ -1007,13 +999,7 @@ void ocfs2_delete_inode(struct inode *inode) * messaging paths may return us -ERESTARTSYS. Which would * cause us to exit early, resulting in inodes being orphaned * forever. */ - sigfillset(&blocked); - status = sigprocmask(SIG_BLOCK, &blocked, &oldset); - if (status < 0) { - mlog_errno(status); - ocfs2_cleanup_delete_inode(inode, 1); - goto bail; - } + ocfs2_block_signals(&oldset); /* * Synchronize us against ocfs2_get_dentry. We take this in @@ -1087,9 +1073,7 @@ bail_unlock_nfs_sync: ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0); bail_unblock: - status = sigprocmask(SIG_SETMASK, &oldset, NULL); - if (status < 0) - mlog_errno(status); + ocfs2_unblock_signals(&oldset); bail: clear_inode(inode); mlog_exit_void(); @@ -1123,6 +1107,10 @@ void ocfs2_clear_inode(struct inode *inode) ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); + ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap, + &oi->ip_la_data_resv); + ocfs2_resv_init_once(&oi->ip_la_data_resv); + /* We very well may get a clear_inode before all an inodes * metadata has hit disk. Of course, we can't drop any cluster * locks until the journal has finished with it. The only @@ -1298,13 +1286,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle, fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - status = ocfs2_journal_dirty(handle, bh); - if (status < 0) - mlog_errno(status); - - status = 0; + ocfs2_journal_dirty(handle, bh); leave: - mlog_exit(status); return status; } diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 0b28e19..9f5f5fc 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -70,6 +70,8 @@ struct ocfs2_inode_info /* Only valid if the inode is the dir. */ u32 ip_last_used_slot; u64 ip_last_used_group; + + struct ocfs2_alloc_reservation ip_la_data_resv; }; /* diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 9336c60..47878cf 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -402,9 +402,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb, } /* - * 'nblocks' is what you want to add to the current - * transaction. extend_trans will either extend the current handle by - * nblocks, or commit it and start a new one with nblocks credits. + * 'nblocks' is what you want to add to the current transaction. * * This might call jbd2_journal_restart() which will commit dirty buffers * and then restart the transaction. Before calling @@ -422,11 +420,15 @@ int ocfs2_commit_trans(struct ocfs2_super *osb, */ int ocfs2_extend_trans(handle_t *handle, int nblocks) { - int status; + int status, old_nblocks; BUG_ON(!handle); - BUG_ON(!nblocks); + BUG_ON(nblocks < 0); + + if (!nblocks) + return 0; + old_nblocks = handle->h_buffer_credits; mlog_entry_void(); mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); @@ -445,7 +447,8 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) mlog(0, "jbd2_journal_extend failed, trying " "jbd2_journal_restart\n"); - status = jbd2_journal_restart(handle, nblocks); + status = jbd2_journal_restart(handle, + old_nblocks + nblocks); if (status < 0) { mlog_errno(status); goto bail; @@ -734,8 +737,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, return __ocfs2_journal_access(handle, ci, bh, NULL, type); } -int ocfs2_journal_dirty(handle_t *handle, - struct buffer_head *bh) +void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh) { int status; @@ -743,13 +745,9 @@ int ocfs2_journal_dirty(handle_t *handle, (unsigned long long)bh->b_blocknr); status = jbd2_journal_dirty_metadata(handle, bh); - if (status < 0) - mlog(ML_ERROR, "Could not dirty metadata buffer. " - "(bh->b_blocknr=%llu)\n", - (unsigned long long)bh->b_blocknr); + BUG_ON(status); - mlog_exit(status); - return status; + mlog_exit_void(); } #define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 3f74e09..b5baaa8 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -325,8 +325,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, * <modify the bh> * ocfs2_journal_dirty(handle, bh); */ -int ocfs2_journal_dirty(handle_t *handle, - struct buffer_head *bh); +void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh); /* * Credit Macros: @@ -562,6 +561,18 @@ static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb, return blocks; } +/* + * Allocating a discontiguous block group requires the credits from + * ocfs2_calc_group_alloc_credits() as well as enough credits to fill + * the group descriptor's extent list. The caller already has started + * the transaction with ocfs2_calc_group_alloc_credits(). They extend + * it with these credits. + */ +static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb) +{ + return ocfs2_extent_recs_per_gd(sb); +} + static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, unsigned int clusters_to_del, struct ocfs2_dinode *fe, diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index c983715..3d74196 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -52,7 +52,8 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, struct ocfs2_dinode *alloc, - u32 numbits); + u32 *numbits, + struct ocfs2_alloc_reservation *resv); static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc); @@ -74,6 +75,144 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, struct inode *local_alloc_inode); +/* + * ocfs2_la_default_mb() - determine a default size, in megabytes of + * the local alloc. + * + * Generally, we'd like to pick as large a local alloc as + * possible. Performance on large workloads tends to scale + * proportionally to la size. In addition to that, the reservations + * code functions more efficiently as it can reserve more windows for + * write. + * + * Some things work against us when trying to choose a large local alloc: + * + * - We need to ensure our sizing is picked to leave enough space in + * group descriptors for other allocations (such as block groups, + * etc). Picking default sizes which are a multiple of 4 could help + * - block groups are allocated in 2mb and 4mb chunks. + * + * - Likewise, we don't want to starve other nodes of bits on small + * file systems. This can easily be taken care of by limiting our + * default to a reasonable size (256M) on larger cluster sizes. + * + * - Some file systems can't support very large sizes - 4k and 8k in + * particular are limited to less than 128 and 256 megabytes respectively. + * + * The following reference table shows group descriptor and local + * alloc maximums at various cluster sizes (4k blocksize) + * + * csize: 4K group: 126M la: 121M + * csize: 8K group: 252M la: 243M + * csize: 16K group: 504M la: 486M + * csize: 32K group: 1008M la: 972M + * csize: 64K group: 2016M la: 1944M + * csize: 128K group: 4032M la: 3888M + * csize: 256K group: 8064M la: 7776M + * csize: 512K group: 16128M la: 15552M + * csize: 1024K group: 32256M la: 31104M + */ +#define OCFS2_LA_MAX_DEFAULT_MB 256 +#define OCFS2_LA_OLD_DEFAULT 8 +unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb) +{ + unsigned int la_mb; + unsigned int gd_mb; + unsigned int megs_per_slot; + struct super_block *sb = osb->sb; + + gd_mb = ocfs2_clusters_to_megabytes(osb->sb, + 8 * ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat)); + + /* + * This takes care of files systems with very small group + * descriptors - 512 byte blocksize at cluster sizes lower + * than 16K and also 1k blocksize with 4k cluster size. + */ + if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192) + || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096)) + return OCFS2_LA_OLD_DEFAULT; + + /* + * Leave enough room for some block groups and make the final + * value we work from a multiple of 4. + */ + gd_mb -= 16; + gd_mb &= 0xFFFFFFFB; + + la_mb = gd_mb; + + /* + * Keep window sizes down to a reasonable default + */ + if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) { + /* + * Some clustersize / blocksize combinations will have + * given us a larger than OCFS2_LA_MAX_DEFAULT_MB + * default size, but get poor distribution when + * limited to exactly 256 megabytes. + * + * As an example, 16K clustersize at 4K blocksize + * gives us a cluster group size of 504M. Paring the + * local alloc size down to 256 however, would give us + * only one window and around 200MB left in the + * cluster group. Instead, find the first size below + * 256 which would give us an even distribution. + * + * Larger cluster group sizes actually work out pretty + * well when pared to 256, so we don't have to do this + * for any group that fits more than two + * OCFS2_LA_MAX_DEFAULT_MB windows. + */ + if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB)) + la_mb = 256; + else { + unsigned int gd_mult = gd_mb; + + while (gd_mult > 256) + gd_mult = gd_mult >> 1; + + la_mb = gd_mult; + } + } + + megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots; + megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot); + /* Too many nodes, too few disk clusters. */ + if (megs_per_slot < la_mb) + la_mb = megs_per_slot; + + return la_mb; +} + +void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb) +{ + struct super_block *sb = osb->sb; + unsigned int la_default_mb = ocfs2_la_default_mb(osb); + unsigned int la_max_mb; + + la_max_mb = ocfs2_clusters_to_megabytes(sb, + ocfs2_local_alloc_size(sb) * 8); + + mlog(0, "requested: %dM, max: %uM, default: %uM\n", + requested_mb, la_max_mb, la_default_mb); + + if (requested_mb == -1) { + /* No user request - use defaults */ + osb->local_alloc_default_bits = + ocfs2_megabytes_to_clusters(sb, la_default_mb); + } else if (requested_mb > la_max_mb) { + /* Request is too big, we give the maximum available */ + osb->local_alloc_default_bits = + ocfs2_megabytes_to_clusters(sb, la_max_mb); + } else { + osb->local_alloc_default_bits = + ocfs2_megabytes_to_clusters(sb, requested_mb); + } + + osb->local_alloc_bits = osb->local_alloc_default_bits; +} + static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) { return (osb->local_alloc_state == OCFS2_LA_THROTTLED || @@ -156,7 +295,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) osb->local_alloc_bits, (osb->bitmap_cpg - 1)); osb->local_alloc_bits = ocfs2_megabytes_to_clusters(osb->sb, - OCFS2_DEFAULT_LOCAL_ALLOC_SIZE); + ocfs2_la_default_mb(osb)); } /* read the alloc off disk */ @@ -262,6 +401,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) osb->local_alloc_state = OCFS2_LA_DISABLED; + ocfs2_resmap_uninit(&osb->osb_la_resmap); + main_bm_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); @@ -305,12 +446,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) } ocfs2_clear_local_alloc(alloc); - - status = ocfs2_journal_dirty(handle, bh); - if (status < 0) { - mlog_errno(status); - goto out_commit; - } + ocfs2_journal_dirty(handle, bh); brelse(bh); osb->local_alloc_bh = NULL; @@ -481,46 +617,6 @@ out: return status; } -/* Check to see if the local alloc window is within ac->ac_max_block */ -static int ocfs2_local_alloc_in_range(struct inode *inode, - struct ocfs2_alloc_context *ac, - u32 bits_wanted) -{ - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_dinode *alloc; - struct ocfs2_local_alloc *la; - int start; - u64 block_off; - - if (!ac->ac_max_block) - return 1; - - alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; - la = OCFS2_LOCAL_ALLOC(alloc); - - start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); - if (start == -1) { - mlog_errno(-ENOSPC); - return 0; - } - - /* - * Converting (bm_off + start + bits_wanted) to blocks gives us - * the blkno just past our actual allocation. This is perfect - * to compare with ac_max_block. - */ - block_off = ocfs2_clusters_to_blocks(inode->i_sb, - le32_to_cpu(la->la_bm_off) + - start + bits_wanted); - mlog(0, "Checking %llu against %llu\n", - (unsigned long long)block_off, - (unsigned long long)ac->ac_max_block); - if (block_off > ac->ac_max_block) - return 0; - - return 1; -} - /* * make sure we've got at least bits_wanted contiguous bits in the * local alloc. You lose them when you drop i_mutex. @@ -613,17 +709,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, mlog(0, "Calling in_range for max block %llu\n", (unsigned long long)ac->ac_max_block); - if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac, - bits_wanted)) { - /* - * The window is outside ac->ac_max_block. - * This errno tells the caller to keep localalloc enabled - * but to get the allocation from the main bitmap. - */ - status = -EFBIG; - goto bail; - } - ac->ac_inode = local_alloc_inode; /* We should never use localalloc from another slot */ ac->ac_alloc_slot = osb->slot_num; @@ -664,7 +749,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; la = OCFS2_LOCAL_ALLOC(alloc); - start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); + start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted, + ac->ac_resv); if (start == -1) { /* TODO: Shouldn't we just BUG here? */ status = -ENOSPC; @@ -674,8 +760,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, bitmap = la->la_bitmap; *bit_off = le32_to_cpu(la->la_bm_off) + start; - /* local alloc is always contiguous by nature -- we never - * delete bits from it! */ *num_bits = bits_wanted; status = ocfs2_journal_access_di(handle, @@ -687,18 +771,15 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, goto bail; } + ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start, + bits_wanted); + while(bits_wanted--) ocfs2_set_bit(start++, bitmap); le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits); + ocfs2_journal_dirty(handle, osb->local_alloc_bh); - status = ocfs2_journal_dirty(handle, osb->local_alloc_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - status = 0; bail: mlog_exit(status); return status; @@ -722,13 +803,17 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) } static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, - struct ocfs2_dinode *alloc, - u32 numbits) + struct ocfs2_dinode *alloc, + u32 *numbits, + struct ocfs2_alloc_reservation *resv) { int numfound, bitoff, left, startoff, lastzero; + int local_resv = 0; + struct ocfs2_alloc_reservation r; void *bitmap = NULL; + struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap; - mlog_entry("(numbits wanted = %u)\n", numbits); + mlog_entry("(numbits wanted = %u)\n", *numbits); if (!alloc->id1.bitmap1.i_total) { mlog(0, "No bits in my window!\n"); @@ -736,6 +821,30 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, goto bail; } + if (!resv) { + local_resv = 1; + ocfs2_resv_init_once(&r); + ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP); + resv = &r; + } + + numfound = *numbits; + if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) { + if (numfound < *numbits) + *numbits = numfound; + goto bail; + } + + /* + * Code error. While reservations are enabled, local + * allocation should _always_ go through them. + */ + BUG_ON(osb->osb_resv_level != 0); + + /* + * Reservations are disabled. Handle this the old way. + */ + bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; numfound = bitoff = startoff = 0; @@ -761,7 +870,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, startoff = bitoff+1; } /* we got everything we needed */ - if (numfound == numbits) { + if (numfound == *numbits) { /* mlog(0, "Found it all!\n"); */ break; } @@ -770,12 +879,15 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff, numfound); - if (numfound == numbits) + if (numfound == *numbits) bitoff = startoff - numfound; else bitoff = -1; bail: + if (local_resv) + ocfs2_resv_discard(resmap, resv); + mlog_exit(bitoff); return bitoff; } @@ -1049,7 +1161,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, /* we used the generic suballoc reserve function, but we set * everything up nicely, so there's no reason why we can't use * the more specific cluster api to claim bits. */ - status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits, + status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits, &cluster_off, &cluster_count); if (status == -ENOSPC) { retry_enospc: @@ -1063,7 +1175,7 @@ retry_enospc: goto bail; ac->ac_bits_wanted = osb->local_alloc_default_bits; - status = ocfs2_claim_clusters(osb, handle, ac, + status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits, &cluster_off, &cluster_count); @@ -1098,6 +1210,9 @@ retry_enospc: memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0, le16_to_cpu(la->la_size)); + ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count, + OCFS2_LOCAL_ALLOC(alloc)->la_bitmap); + mlog(0, "New window allocated:\n"); mlog(0, "window la_bm_off = %u\n", OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); @@ -1169,12 +1284,7 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, } ocfs2_clear_local_alloc(alloc); - - status = ocfs2_journal_dirty(handle, osb->local_alloc_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, osb->local_alloc_bh); status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, main_bm_inode, main_bm_bh); @@ -1192,7 +1302,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, atomic_inc(&osb->alloc_stats.moves); - status = 0; bail: if (handle) ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h index ac5ea9f8..1be9b58 100644 --- a/fs/ocfs2/localalloc.h +++ b/fs/ocfs2/localalloc.h @@ -30,6 +30,9 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb); void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb); +void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb); +unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb); + int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, int node_num, struct ocfs2_dinode **alloc_copy); diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 7898bd3..af2b8fe 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -41,44 +41,20 @@ #include "file.h" #include "inode.h" #include "mmap.h" +#include "super.h" -static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset) -{ - /* The best way to deal with signals in the vm path is - * to block them upfront, rather than allowing the - * locking paths to return -ERESTARTSYS. */ - sigfillset(blocked); - - /* We should technically never get a bad return value - * from sigprocmask */ - return sigprocmask(SIG_BLOCK, blocked, oldset); -} - -static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset) -{ - return sigprocmask(SIG_SETMASK, oldset, NULL); -} static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) { - sigset_t blocked, oldset; - int error, ret; + sigset_t oldset; + int ret; mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff); - error = ocfs2_vm_op_block_sigs(&blocked, &oldset); - if (error < 0) { - mlog_errno(error); - ret = VM_FAULT_SIGBUS; - goto out; - } - + ocfs2_block_signals(&oldset); ret = filemap_fault(area, vmf); + ocfs2_unblock_signals(&oldset); - error = ocfs2_vm_op_unblock_sigs(&oldset); - if (error < 0) - mlog_errno(error); -out: mlog_exit_ptr(vmf->page); return ret; } @@ -158,14 +134,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct buffer_head *di_bh = NULL; - sigset_t blocked, oldset; - int ret, ret2; + sigset_t oldset; + int ret; - ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); - if (ret < 0) { - mlog_errno(ret); - return ret; - } + ocfs2_block_signals(&oldset); /* * The cluster locks taken will block a truncate from another @@ -193,9 +165,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ocfs2_inode_unlock(inode, 1); out: - ret2 = ocfs2_vm_op_unblock_sigs(&oldset); - if (ret2 < 0) - mlog_errno(ret2); + ocfs2_unblock_signals(&oldset); if (ret) ret = VM_FAULT_SIGBUS; return ret; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 4cbb18f..db5dd3e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -239,6 +239,8 @@ static int ocfs2_mknod(struct inode *dir, }; int did_quota_inode = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; + sigset_t oldset; + int did_block_signals = 0; mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, (unsigned long)dev, dentry->d_name.len, @@ -350,6 +352,10 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } + /* Starting to change things, restart is no longer possible. */ + ocfs2_block_signals(&oldset); + did_block_signals = 1; + status = dquot_alloc_inode(inode); if (status) goto leave; @@ -384,11 +390,7 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } ocfs2_add_links_count(dirfe, 1); - status = ocfs2_journal_dirty(handle, parent_fe_bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } + ocfs2_journal_dirty(handle, parent_fe_bh); inc_nlink(dir); } @@ -439,6 +441,8 @@ leave: ocfs2_commit_trans(osb, handle); ocfs2_inode_unlock(dir, 1); + if (did_block_signals) + ocfs2_unblock_signals(&oldset); if (status == -ENOSPC) mlog(0, "Disk is full\n"); @@ -487,14 +491,15 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, int status = 0; struct ocfs2_dinode *fe = NULL; struct ocfs2_extent_list *fel; - u64 fe_blkno = 0; + u64 suballoc_loc, fe_blkno = 0; u16 suballoc_bit; u16 feat; *new_fe_bh = NULL; - status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh, - inode_ac, &suballoc_bit, &fe_blkno); + status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh, + inode_ac, &suballoc_loc, + &suballoc_bit, &fe_blkno); if (status < 0) { mlog_errno(status); goto leave; @@ -531,6 +536,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, fe->i_generation = cpu_to_le32(inode->i_generation); fe->i_fs_generation = cpu_to_le32(osb->fs_generation); fe->i_blkno = cpu_to_le64(fe_blkno); + fe->i_suballoc_loc = cpu_to_le64(suballoc_loc); fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); fe->i_uid = cpu_to_le32(inode->i_uid); @@ -567,11 +573,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); } - status = ocfs2_journal_dirty(handle, *new_fe_bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } + ocfs2_journal_dirty(handle, *new_fe_bh); ocfs2_populate_inode(inode, fe, 1); ocfs2_ci_set_new(osb, INODE_CACHE(inode)); @@ -637,6 +639,7 @@ static int ocfs2_link(struct dentry *old_dentry, struct ocfs2_dinode *fe = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_dir_lookup_result lookup = { NULL, }; + sigset_t oldset; mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, old_dentry->d_name.len, old_dentry->d_name.name, @@ -693,6 +696,9 @@ static int ocfs2_link(struct dentry *old_dentry, goto out_unlock_inode; } + /* Starting to change things, restart is no longer possible. */ + ocfs2_block_signals(&oldset); + err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (err < 0) { @@ -705,14 +711,7 @@ static int ocfs2_link(struct dentry *old_dentry, ocfs2_set_links_count(fe, inode->i_nlink); fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); - - err = ocfs2_journal_dirty(handle, fe_bh); - if (err < 0) { - ocfs2_add_links_count(fe, -1); - drop_nlink(inode); - mlog_errno(err); - goto out_commit; - } + ocfs2_journal_dirty(handle, fe_bh); err = ocfs2_add_entry(handle, dentry, inode, OCFS2_I(inode)->ip_blkno, @@ -736,6 +735,7 @@ static int ocfs2_link(struct dentry *old_dentry, out_commit: ocfs2_commit_trans(osb, handle); + ocfs2_unblock_signals(&oldset); out_unlock_inode: ocfs2_inode_unlock(inode, 1); @@ -909,12 +909,7 @@ static int ocfs2_unlink(struct inode *dir, drop_nlink(inode); drop_nlink(inode); ocfs2_set_links_count(fe, inode->i_nlink); - - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } + ocfs2_journal_dirty(handle, fe_bh); dir->i_ctime = dir->i_mtime = CURRENT_TIME; if (S_ISDIR(inode->i_mode)) @@ -1332,12 +1327,7 @@ static int ocfs2_rename(struct inode *old_dir, ocfs2_set_links_count(newfe, 0); else ocfs2_add_links_count(newfe, -1); - - status = ocfs2_journal_dirty(handle, newfe_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, newfe_bh); } else { /* if the name was not found in new_dir, add it now */ status = ocfs2_add_entry(handle, new_dentry, old_inode, @@ -1356,10 +1346,7 @@ static int ocfs2_rename(struct inode *old_dir, old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec); old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec); - - status = ocfs2_journal_dirty(handle, old_inode_bh); - if (status < 0) - mlog_errno(status); + ocfs2_journal_dirty(handle, old_inode_bh); } else mlog_errno(status); @@ -1431,7 +1418,7 @@ static int ocfs2_rename(struct inode *old_dir, OCFS2_JOURNAL_ACCESS_WRITE); fe = (struct ocfs2_dinode *) old_dir_bh->b_data; ocfs2_set_links_count(fe, old_dir->i_nlink); - status = ocfs2_journal_dirty(handle, old_dir_bh); + ocfs2_journal_dirty(handle, old_dir_bh); } } ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); @@ -1563,11 +1550,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, (bytes_left > sb->s_blocksize) ? sb->s_blocksize : bytes_left); - status = ocfs2_journal_dirty(handle, bhs[virtual]); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, bhs[virtual]); virtual++; p_blkno++; @@ -1611,6 +1594,8 @@ static int ocfs2_symlink(struct inode *dir, }; int did_quota = 0, did_quota_inode = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; + sigset_t oldset; + int did_block_signals = 0; mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); @@ -1706,6 +1691,10 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } + /* Starting to change things, restart is no longer possible. */ + ocfs2_block_signals(&oldset); + did_block_signals = 1; + status = dquot_alloc_inode(inode); if (status) goto bail; @@ -1814,6 +1803,8 @@ bail: ocfs2_commit_trans(osb, handle); ocfs2_inode_unlock(dir, 1); + if (did_block_signals) + ocfs2_unblock_signals(&oldset); brelse(new_fe_bh); brelse(parent_fe_bh); @@ -1961,12 +1952,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, if (S_ISDIR(inode->i_mode)) ocfs2_add_links_count(orphan_fe, 1); orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); - - status = ocfs2_journal_dirty(handle, orphan_dir_bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } + ocfs2_journal_dirty(handle, orphan_dir_bh); status = __ocfs2_add_entry(handle, orphan_dir_inode, name, OCFS2_ORPHAN_NAMELEN, inode, @@ -2065,12 +2051,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, if (S_ISDIR(inode->i_mode)) ocfs2_add_links_count(orphan_fe, -1); orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); - - status = ocfs2_journal_dirty(handle, orphan_dir_bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } + ocfs2_journal_dirty(handle, orphan_dir_bh); leave: ocfs2_free_dir_lookup_result(&lookup); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index adf5e2e..c67003b 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -47,6 +47,7 @@ /* For struct ocfs2_blockcheck_stats */ #include "blockcheck.h" +#include "reservations.h" /* Caching of metadata buffers */ @@ -341,6 +342,9 @@ struct ocfs2_super */ unsigned int local_alloc_bits; unsigned int local_alloc_default_bits; + /* osb_clusters_at_boot can become stale! Do not trust it to + * be up to date. */ + unsigned int osb_clusters_at_boot; enum ocfs2_local_alloc_state local_alloc_state; /* protected * by osb_lock */ @@ -349,6 +353,11 @@ struct ocfs2_super u64 la_last_gd; + struct ocfs2_reservation_map osb_la_resmap; + + unsigned int osb_resv_level; + unsigned int osb_dir_resv_level; + /* Next three fields are for local node slot recovery during * mount. */ int dirty; @@ -482,6 +491,13 @@ static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb) return 0; } +static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG) + return 1; + return 0; +} + static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) { if (ocfs2_supports_indexed_dirs(osb)) @@ -763,6 +779,12 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb, return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); } +static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb, + unsigned int clusters) +{ + return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits); +} + static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) { ext2_set_bit(bit, bitmap); diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index bb37218..33f1c9a 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -100,7 +100,8 @@ | OCFS2_FEATURE_INCOMPAT_XATTR \ | OCFS2_FEATURE_INCOMPAT_META_ECC \ | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ - | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) + | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ + | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) @@ -165,6 +166,9 @@ /* Refcount tree support */ #define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 +/* Discontigous block groups */ +#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 + /* * backup superblock flag is used to indicate that this volume * has backup superblocks. @@ -283,14 +287,6 @@ #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) /* - * Default local alloc size (in megabytes) - * - * The value chosen should be such that most allocations, including new - * block groups, use local alloc. - */ -#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8 - -/* * Inline extended attribute size (in bytes) * The value chosen should be aligned to 16 byte boundaries. */ @@ -512,7 +508,10 @@ struct ocfs2_extent_block block group */ __le32 h_fs_generation; /* Must match super block */ __le64 h_blkno; /* Offset on disk, in blocks */ -/*20*/ __le64 h_reserved3; +/*20*/ __le64 h_suballoc_loc; /* Suballocator block group this + eb belongs to. Only valid + if allocated from a + discontiguous block group */ __le64 h_next_leaf_blk; /* Offset on disk, in blocks, of next leaf header pointing to data */ @@ -679,7 +678,11 @@ struct ocfs2_dinode { /*80*/ struct ocfs2_block_check i_check; /* Error checking */ /*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ /*90*/ __le64 i_refcount_loc; - __le64 i_reserved2[4]; + __le64 i_suballoc_loc; /* Suballocator block group this + inode belongs to. Only valid + if allocated from a + discontiguous block group */ +/*A0*/ __le64 i_reserved2[3]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ @@ -814,7 +817,12 @@ struct ocfs2_dx_root_block { __le32 dr_reserved2; __le64 dr_free_blk; /* Pointer to head of free * unindexed block list. */ - __le64 dr_reserved3[15]; + __le64 dr_suballoc_loc; /* Suballocator block group + this root belongs to. + Only valid if allocated + from a discontiguous + block group */ + __le64 dr_reserved3[14]; union { struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 * bits for maximum space @@ -840,6 +848,13 @@ struct ocfs2_dx_leaf { }; /* + * Largest bitmap for a block (suballocator) group in bytes. This limit + * does not affect cluster groups (global allocator). Cluster group + * bitmaps run to the end of the block. + */ +#define OCFS2_MAX_BG_BITMAP_SIZE 256 + +/* * On disk allocator group structure for OCFS2 */ struct ocfs2_group_desc @@ -860,7 +875,29 @@ struct ocfs2_group_desc __le64 bg_blkno; /* Offset on disk, in blocks */ /*30*/ struct ocfs2_block_check bg_check; /* Error checking */ __le64 bg_reserved2; -/*40*/ __u8 bg_bitmap[0]; +/*40*/ union { + __u8 bg_bitmap[0]; + struct { + /* + * Block groups may be discontiguous when + * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set. + * The extents of a discontigous block group are + * stored in bg_list. It is a flat list. + * l_tree_depth must always be zero. A + * discontiguous group is signified by a non-zero + * bg_list->l_next_free_rec. Only block groups + * can be discontiguous; Cluster groups cannot. + * We've never made a block group with more than + * 2048 blocks (256 bytes of bg_bitmap). This + * codifies that limit so that we can fit bg_list. + * bg_size of a discontiguous block group will + * be 256 to match bg_bitmap_filler. + */ + __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE]; +/*140*/ struct ocfs2_extent_list bg_list; + }; + }; +/* Actual on-disk size is one block */ }; struct ocfs2_refcount_rec { @@ -905,7 +942,11 @@ struct ocfs2_refcount_block { /*40*/ __le32 rf_generation; /* generation number. all be the same * for the same refcount tree. */ __le32 rf_reserved0; - __le64 rf_reserved1[7]; + __le64 rf_suballoc_loc; /* Suballocator block group this + refcount block belongs to. Only + valid if allocated from a + discontiguous block group */ +/*50*/ __le64 rf_reserved1[6]; /*80*/ union { struct ocfs2_refcount_list rf_records; /* List of refcount records */ @@ -1017,7 +1058,10 @@ struct ocfs2_xattr_block { real xattr or a xattr tree. */ __le16 xb_reserved0; __le32 xb_reserved1; - __le64 xb_reserved2; + __le64 xb_suballoc_loc; /* Suballocator block group this + xattr block belongs to. Only + valid if allocated from a + discontiguous block group */ /*30*/ union { struct ocfs2_xattr_header xb_header; /* xattr header if this block contains xattr */ @@ -1254,6 +1298,16 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb) return size / sizeof(struct ocfs2_extent_rec); } +static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_group_desc, bg_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb) { int size; @@ -1284,13 +1338,23 @@ static inline u16 ocfs2_local_alloc_size(struct super_block *sb) return size; } -static inline int ocfs2_group_bitmap_size(struct super_block *sb) +static inline int ocfs2_group_bitmap_size(struct super_block *sb, + int suballocator, + u32 feature_incompat) { - int size; - - size = sb->s_blocksize - + int size = sb->s_blocksize - offsetof(struct ocfs2_group_desc, bg_bitmap); + /* + * The cluster allocator uses the entire block. Suballocators have + * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older + * code expects bg_size set to the maximum. Thus we must keep + * bg_size as-is unless discontig_bg is enabled. + */ + if (suballocator && + (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)) + size = OCFS2_MAX_BG_BITMAP_SIZE; + return size; } @@ -1402,23 +1466,43 @@ static inline int ocfs2_extent_recs_per_eb(int blocksize) return size / sizeof(struct ocfs2_extent_rec); } -static inline int ocfs2_local_alloc_size(int blocksize) +static inline int ocfs2_extent_recs_per_gd(int blocksize) { int size; size = blocksize - - offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); + offsetof(struct ocfs2_group_desc, bg_list.l_recs); - return size; + return size / sizeof(struct ocfs2_extent_rec); } -static inline int ocfs2_group_bitmap_size(int blocksize) +static inline int ocfs2_local_alloc_size(int blocksize) { int size; size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); + + return size; +} + +static inline int ocfs2_group_bitmap_size(int blocksize, + int suballocator, + uint32_t feature_incompat) +{ + int size = sb->s_blocksize - offsetof(struct ocfs2_group_desc, bg_bitmap); + /* + * The cluster allocator uses the entire block. Suballocators have + * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older + * code expects bg_size set to the maximum. Thus we must keep + * bg_size as-is unless discontig_bg is enabled. + */ + if (suballocator && + (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)) + size = OCFS2_MAX_BG_BITMAP_SIZE; + return size; } @@ -1491,5 +1575,19 @@ static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de, de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; } +static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd) +{ + if ((offsetof(struct ocfs2_group_desc, bg_bitmap) + + le16_to_cpu(gd->bg_size)) != + offsetof(struct ocfs2_group_desc, bg_list)) + return 0; + /* + * Only valid to check l_next_free_rec if + * bg_bitmap + bg_size == bg_list. + */ + if (!gd->bg_list.l_next_free_rec) + return 0; + return 1; +} #endif /* _OCFS2_FS_H */ diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index ab42a74..04ae76d 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -261,10 +261,8 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, brelse(bh); goto out; } - err = ocfs2_journal_dirty(handle, bh); + ocfs2_journal_dirty(handle, bh); brelse(bh); - if (err < 0) - goto out; out: if (err) { mutex_unlock(&gqinode->i_mutex); diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 9ad4930..884b641 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -119,12 +119,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh, lock_buffer(bh); modify(bh, private); unlock_buffer(bh); - status = ocfs2_journal_dirty(handle, bh); - if (status < 0) { - mlog_errno(status); - ocfs2_commit_trans(OCFS2_SB(sb), handle); - return status; - } + ocfs2_journal_dirty(handle, bh); + status = ocfs2_commit_trans(OCFS2_SB(sb), handle); if (status < 0) { mlog_errno(status); @@ -523,9 +519,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, ocfs2_clear_bit(bit, dchunk->dqc_bitmap); le32_add_cpu(&dchunk->dqc_free, 1); unlock_buffer(qbh); - status = ocfs2_journal_dirty(handle, qbh); - if (status < 0) - mlog_errno(status); + ocfs2_journal_dirty(handle, qbh); out_commit: mutex_unlock(&sb_dqopt(sb)->dqio_mutex); ocfs2_commit_trans(OCFS2_SB(sb), handle); @@ -631,9 +625,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, lock_buffer(bh); ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN); unlock_buffer(bh); - status = ocfs2_journal_dirty(handle, bh); - if (status < 0) - mlog_errno(status); + ocfs2_journal_dirty(handle, bh); out_trans: ocfs2_commit_trans(osb, handle); out_bh: @@ -1009,11 +1001,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - OCFS2_QBLK_RESERVED_SPACE); unlock_buffer(bh); - status = ocfs2_journal_dirty(handle, bh); - if (status < 0) { - mlog_errno(status); - goto out_trans; - } + ocfs2_journal_dirty(handle, bh); /* Initialize new block with structures */ down_read(&OCFS2_I(lqinode)->ip_alloc_sem); @@ -1040,11 +1028,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( lock_buffer(dbh); memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE); unlock_buffer(dbh); - status = ocfs2_journal_dirty(handle, dbh); - if (status < 0) { - mlog_errno(status); - goto out_trans; - } + ocfs2_journal_dirty(handle, dbh); /* Update local quotafile info */ oinfo->dqi_blocks += 2; @@ -1155,11 +1139,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( lock_buffer(bh); memset(bh->b_data, 0, sb->s_blocksize); unlock_buffer(bh); - status = ocfs2_journal_dirty(handle, bh); - if (status < 0) { - mlog_errno(status); - goto out_trans; - } + ocfs2_journal_dirty(handle, bh); + /* Update chunk header */ status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), chunk->qc_headerbh, @@ -1173,11 +1154,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( lock_buffer(chunk->qc_headerbh); le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb)); unlock_buffer(chunk->qc_headerbh); - status = ocfs2_journal_dirty(handle, chunk->qc_headerbh); - if (status < 0) { - mlog_errno(status); - goto out_trans; - } + ocfs2_journal_dirty(handle, chunk->qc_headerbh); + /* Update file header */ oinfo->dqi_blocks++; status = ocfs2_local_write_info(sb, type); @@ -1312,12 +1290,8 @@ static int ocfs2_local_release_dquot(struct dquot *dquot) ocfs2_clear_bit(offset, dchunk->dqc_bitmap); le32_add_cpu(&dchunk->dqc_free, 1); unlock_buffer(od->dq_chunk->qc_headerbh); - status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); - if (status < 0) { - mlog_errno(status); - goto out; - } - status = 0; + ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); + out: /* Clear the read bit so that next time someone uses this * dquot he reads fresh info from disk and allocates local diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 5cbcd0f..4793f36 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -570,7 +570,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; u16 suballoc_bit_start; u32 num_got; - u64 first_blkno; + u64 suballoc_loc, first_blkno; BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); @@ -596,7 +596,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, goto out_commit; } - ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, &suballoc_bit_start, &num_got, &first_blkno); if (ret) { @@ -626,6 +626,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, memset(rb, 0, inode->i_sb->s_blocksize); strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); rb->rf_blkno = cpu_to_le64(first_blkno); @@ -790,7 +791,10 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) if (le32_to_cpu(rb->rf_count) == 1) { blk = le64_to_cpu(rb->rf_blkno); bit = le16_to_cpu(rb->rf_suballoc_bit); - bg_blkno = ocfs2_which_suballoc_group(blk, bit); + if (rb->rf_suballoc_loc) + bg_blkno = le64_to_cpu(rb->rf_suballoc_loc); + else + bg_blkno = ocfs2_which_suballoc_group(blk, bit); alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, @@ -1268,9 +1272,7 @@ static int ocfs2_change_refcount_rec(handle_t *handle, } else if (merge) ocfs2_refcount_rec_merge(rb, index); - ret = ocfs2_journal_dirty(handle, ref_leaf_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, ref_leaf_bh); out: return ret; } @@ -1284,7 +1286,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle, int ret; u16 suballoc_bit_start; u32 num_got; - u64 blkno; + u64 suballoc_loc, blkno; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); struct buffer_head *new_bh = NULL; struct ocfs2_refcount_block *new_rb; @@ -1298,7 +1300,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle, goto out; } - ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, &suballoc_bit_start, &num_got, &blkno); if (ret) { @@ -1330,6 +1332,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle, new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); new_rb->rf_blkno = cpu_to_le64(blkno); new_rb->rf_cpos = cpu_to_le32(0); @@ -1524,7 +1527,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle, int ret; u16 suballoc_bit_start; u32 num_got, new_cpos; - u64 blkno; + u64 suballoc_loc, blkno; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); struct ocfs2_refcount_block *root_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; @@ -1548,7 +1551,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle, goto out; } - ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, + ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, &suballoc_bit_start, &num_got, &blkno); if (ret) { @@ -1576,6 +1579,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle, memset(new_rb, 0, sb->s_blocksize); strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); + new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); new_rb->rf_blkno = cpu_to_le64(blkno); @@ -1694,7 +1698,7 @@ static int ocfs2_adjust_refcount_rec(handle_t *handle, * 2 more credits, one for the leaf refcount block, one for * the extent block contains the extent rec. */ - ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2); + ret = ocfs2_extend_trans(handle, 2); if (ret < 0) { mlog_errno(ret); goto out; @@ -1802,11 +1806,7 @@ static int ocfs2_insert_refcount_rec(handle_t *handle, if (merge) ocfs2_refcount_rec_merge(rb, index); - ret = ocfs2_journal_dirty(handle, ref_leaf_bh); - if (ret) { - mlog_errno(ret); - goto out; - } + ocfs2_journal_dirty(handle, ref_leaf_bh); if (index == 0) { ret = ocfs2_adjust_refcount_rec(handle, ci, @@ -1977,9 +1977,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle, ocfs2_refcount_rec_merge(rb, index); } - ret = ocfs2_journal_dirty(handle, ref_leaf_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, ref_leaf_bh); out: brelse(new_bh); @@ -2112,6 +2110,7 @@ static int ocfs2_remove_refcount_extent(handle_t *handle, */ ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE, le16_to_cpu(rb->rf_suballoc_slot), + le64_to_cpu(rb->rf_suballoc_loc), le64_to_cpu(rb->rf_blkno), le16_to_cpu(rb->rf_suballoc_bit)); if (ret) { @@ -2516,20 +2515,19 @@ out: * * Normally the refcount blocks store these refcount should be * contiguous also, so that we can get the number easily. - * As for meta_ac, we will at most add split 2 refcount record and - * 2 more refcount block, so just check it in a rough way. + * We will at most add split 2 refcount records and 2 more + * refcount blocks, so just check it in a rough way. * * Caller must hold refcount tree lock. */ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, - struct buffer_head *di_bh, + u64 refcount_loc, u64 phys_blkno, u32 clusters, int *credits, - struct ocfs2_alloc_context **meta_ac) + int *ref_blocks) { - int ret, ref_blocks = 0; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + int ret; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *tree; @@ -2546,14 +2544,13 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), - le64_to_cpu(di->i_refcount_loc), &tree); + refcount_loc, &tree); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_read_refcount_block(&tree->rf_ci, - le64_to_cpu(di->i_refcount_loc), + ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc, &ref_root_bh); if (ret) { mlog_errno(ret); @@ -2564,21 +2561,14 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, &tree->rf_ci, ref_root_bh, start_cpos, clusters, - &ref_blocks, credits); + ref_blocks, credits); if (ret) { mlog_errno(ret); goto out; } - mlog(0, "reserve new metadata %d, credits = %d\n", - ref_blocks, *credits); - - if (ref_blocks) { - ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), - ref_blocks, meta_ac); - if (ret) - mlog_errno(ret); - } + mlog(0, "reserve new metadata %d blocks, credits = %d\n", + *ref_blocks, *credits); out: brelse(ref_root_bh); @@ -3040,11 +3030,7 @@ static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, } memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize); - ret = ocfs2_journal_dirty(handle, new_bh); - if (ret) { - mlog_errno(ret); - break; - } + ocfs2_journal_dirty(handle, new_bh); brelse(new_bh); brelse(old_bh); @@ -3282,7 +3268,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, } else { delete = 1; - ret = __ocfs2_claim_clusters(osb, handle, + ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, set_len, &new_bit, &new_len); diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index c1d19b1..9983ba1 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -47,11 +47,11 @@ int ocfs2_decrease_refcount(struct inode *inode, struct ocfs2_cached_dealloc_ctxt *dealloc, int delete); int ocfs2_prepare_refcount_change_for_del(struct inode *inode, - struct buffer_head *di_bh, + u64 refcount_loc, u64 phys_blkno, u32 clusters, int *credits, - struct ocfs2_alloc_context **meta_ac); + int *ref_blocks); int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos); diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c new file mode 100644 index 0000000..4065002 --- /dev/null +++ b/fs/ocfs2/reservations.c @@ -0,0 +1,847 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * reservations.c + * + * Allocation reservations implementation + * + * Some code borrowed from fs/ext3/balloc.c and is: + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * The rest is copyright (C) 2010 Novell. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/bitops.h> +#include <linux/list.h> + +#define MLOG_MASK_PREFIX ML_RESERVATIONS +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#ifdef CONFIG_OCFS2_DEBUG_FS +#define OCFS2_CHECK_RESERVATIONS +#endif + +DEFINE_SPINLOCK(resv_lock); + +#define OCFS2_MIN_RESV_WINDOW_BITS 8 +#define OCFS2_MAX_RESV_WINDOW_BITS 1024 + +int ocfs2_dir_resv_allowed(struct ocfs2_super *osb) +{ + return (osb->osb_resv_level && osb->osb_dir_resv_level); +} + +static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + struct ocfs2_super *osb = resmap->m_osb; + unsigned int bits; + + if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) { + /* 8, 16, 32, 64, 128, 256, 512, 1024 */ + bits = 4 << osb->osb_resv_level; + } else { + bits = 4 << osb->osb_dir_resv_level; + } + return bits; +} + +static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv) +{ + if (resv->r_len) + return resv->r_start + resv->r_len - 1; + return resv->r_start; +} + +static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv) +{ + return !!(resv->r_len == 0); +} + +static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap) +{ + if (resmap->m_osb->osb_resv_level == 0) + return 1; + return 0; +} + +static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap) +{ + struct ocfs2_super *osb = resmap->m_osb; + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + int i = 0; + + mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n", + osb->dev_str, resmap->m_bitmap_len); + + node = rb_first(&resmap->m_reservations); + while (node) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u" + "\tlast_len: %u\n", resv->r_start, + ocfs2_resv_end(resv), resv->r_len, resv->r_last_start, + resv->r_last_len); + + node = rb_next(node); + i++; + } + + mlog(ML_NOTICE, "%d reservations found. LRU follows\n", i); + + i = 0; + list_for_each_entry(resv, &resmap->m_lru, r_lru) { + mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t" + "last_start: %u\tlast_len: %u\n", i, resv->r_start, + ocfs2_resv_end(resv), resv->r_len, resv->r_last_start, + resv->r_last_len); + + i++; + } +} + +#ifdef OCFS2_CHECK_RESERVATIONS +static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap, + int i, + struct ocfs2_alloc_reservation *resv) +{ + char *disk_bitmap = resmap->m_disk_bitmap; + unsigned int start = resv->r_start; + unsigned int end = ocfs2_resv_end(resv); + + while (start <= end) { + if (ocfs2_test_bit(start, disk_bitmap)) { + mlog(ML_ERROR, + "reservation %d covers an allocated area " + "starting at bit %u!\n", i, start); + return 1; + } + + start++; + } + return 0; +} + +static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap) +{ + unsigned int off = 0; + int i = 0; + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + + node = rb_first(&resmap->m_reservations); + while (node) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + if (i > 0 && resv->r_start <= off) { + mlog(ML_ERROR, "reservation %d has bad start off!\n", + i); + goto bad; + } + + if (resv->r_len == 0) { + mlog(ML_ERROR, "reservation %d has no length!\n", + i); + goto bad; + } + + if (resv->r_start > ocfs2_resv_end(resv)) { + mlog(ML_ERROR, "reservation %d has invalid range!\n", + i); + goto bad; + } + + if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) { + mlog(ML_ERROR, "reservation %d extends past bitmap!\n", + i); + goto bad; + } + + if (ocfs2_validate_resmap_bits(resmap, i, resv)) + goto bad; + + off = ocfs2_resv_end(resv); + node = rb_next(node); + + i++; + } + return; + +bad: + ocfs2_dump_resv(resmap); + BUG(); +} +#else +static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap) +{ + +} +#endif + +void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv) +{ + memset(resv, 0, sizeof(*resv)); + INIT_LIST_HEAD(&resv->r_lru); +} + +void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv, + unsigned int flags) +{ + BUG_ON(flags & ~OCFS2_RESV_TYPES); + + resv->r_flags |= flags; +} + +int ocfs2_resmap_init(struct ocfs2_super *osb, + struct ocfs2_reservation_map *resmap) +{ + memset(resmap, 0, sizeof(*resmap)); + + resmap->m_osb = osb; + resmap->m_reservations = RB_ROOT; + /* m_bitmap_len is initialized to zero by the above memset. */ + INIT_LIST_HEAD(&resmap->m_lru); + + return 0; +} + +static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + assert_spin_locked(&resv_lock); + + if (!list_empty(&resv->r_lru)) + list_del_init(&resv->r_lru); + + list_add_tail(&resv->r_lru, &resmap->m_lru); +} + +static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv) +{ + resv->r_len = 0; + resv->r_start = 0; +} + +static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) { + list_del_init(&resv->r_lru); + rb_erase(&resv->r_node, &resmap->m_reservations); + resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE; + } +} + +static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + assert_spin_locked(&resv_lock); + + __ocfs2_resv_trunc(resv); + /* + * last_len and last_start no longer make sense if + * we're changing the range of our allocations. + */ + resv->r_last_len = resv->r_last_start = 0; + + ocfs2_resv_remove(resmap, resv); +} + +/* does nothing if 'resv' is null */ +void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv) +{ + if (resv) { + spin_lock(&resv_lock); + __ocfs2_resv_discard(resmap, resv); + spin_unlock(&resv_lock); + } +} + +static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap) +{ + struct rb_node *node; + struct ocfs2_alloc_reservation *resv; + + assert_spin_locked(&resv_lock); + + while ((node = rb_last(&resmap->m_reservations)) != NULL) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + __ocfs2_resv_discard(resmap, resv); + } +} + +void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, + unsigned int clen, char *disk_bitmap) +{ + if (ocfs2_resmap_disabled(resmap)) + return; + + spin_lock(&resv_lock); + + ocfs2_resmap_clear_all_resv(resmap); + resmap->m_bitmap_len = clen; + resmap->m_disk_bitmap = disk_bitmap; + + spin_unlock(&resv_lock); +} + +void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap) +{ + /* Does nothing for now. Keep this around for API symmetry */ +} + +static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *new) +{ + struct rb_root *root = &resmap->m_reservations; + struct rb_node *parent = NULL; + struct rb_node **p = &root->rb_node; + struct ocfs2_alloc_reservation *tmp; + + assert_spin_locked(&resv_lock); + + mlog(0, "Insert reservation start: %u len: %u\n", new->r_start, + new->r_len); + + while (*p) { + parent = *p; + + tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node); + + if (new->r_start < tmp->r_start) { + p = &(*p)->rb_left; + + /* + * This is a good place to check for + * overlapping reservations. + */ + BUG_ON(ocfs2_resv_end(new) >= tmp->r_start); + } else if (new->r_start > ocfs2_resv_end(tmp)) { + p = &(*p)->rb_right; + } else { + /* This should never happen! */ + mlog(ML_ERROR, "Duplicate reservation window!\n"); + BUG(); + } + } + + rb_link_node(&new->r_node, parent, p); + rb_insert_color(&new->r_node, root); + new->r_flags |= OCFS2_RESV_FLAG_INUSE; + + ocfs2_resv_mark_lru(resmap, new); + + ocfs2_check_resmap(resmap); +} + +/** + * ocfs2_find_resv_lhs() - find the window which contains goal + * @resmap: reservation map to search + * @goal: which bit to search for + * + * If a window containing that goal is not found, we return the window + * which comes before goal. Returns NULL on empty rbtree or no window + * before goal. + */ +static struct ocfs2_alloc_reservation * +ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal) +{ + struct ocfs2_alloc_reservation *resv = NULL; + struct ocfs2_alloc_reservation *prev_resv = NULL; + struct rb_node *node = resmap->m_reservations.rb_node; + + assert_spin_locked(&resv_lock); + + if (!node) + return NULL; + + node = rb_first(&resmap->m_reservations); + while (node) { + resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); + + if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal) + break; + + /* Check if we overshot the reservation just before goal? */ + if (resv->r_start > goal) { + resv = prev_resv; + break; + } + + prev_resv = resv; + node = rb_next(node); + } + + return resv; +} + +/* + * We are given a range within the bitmap, which corresponds to a gap + * inside the reservations tree (search_start, search_len). The range + * can be anything from the whole bitmap, to a gap between + * reservations. + * + * The start value of *rstart is insignificant. + * + * This function searches the bitmap range starting at search_start + * with length search_len for a set of contiguous free bits. We try + * to find up to 'wanted' bits, but can sometimes return less. + * + * Returns the length of allocation, 0 if no free bits are found. + * + * *cstart and *clen will also be populated with the result. + */ +static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap, + unsigned int wanted, + unsigned int search_start, + unsigned int search_len, + unsigned int *rstart, + unsigned int *rlen) +{ + void *bitmap = resmap->m_disk_bitmap; + unsigned int best_start, best_len = 0; + int offset, start, found; + + mlog(0, "Find %u bits within range (%u, len %u) resmap len: %u\n", + wanted, search_start, search_len, resmap->m_bitmap_len); + + found = best_start = best_len = 0; + + start = search_start; + while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len, + start)) != -1) { + /* Search reached end of the region */ + if (offset >= (search_start + search_len)) + break; + + if (offset == start) { + /* we found a zero */ + found++; + /* move start to the next bit to test */ + start++; + } else { + /* got a zero after some ones */ + found = 1; + start = offset + 1; + } + if (found > best_len) { + best_len = found; + best_start = start - found; + } + + if (found >= wanted) + break; + } + + if (best_len == 0) + return 0; + + if (best_len >= wanted) + best_len = wanted; + + *rlen = best_len; + *rstart = best_start; + + mlog(0, "Found start: %u len: %u\n", best_start, best_len); + + return *rlen; +} + +static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int goal, unsigned int wanted) +{ + struct rb_root *root = &resmap->m_reservations; + unsigned int gap_start, gap_end, gap_len; + struct ocfs2_alloc_reservation *prev_resv, *next_resv; + struct rb_node *prev, *next; + unsigned int cstart, clen; + unsigned int best_start = 0, best_len = 0; + + /* + * Nasty cases to consider: + * + * - rbtree is empty + * - our window should be first in all reservations + * - our window should be last in all reservations + * - need to make sure we don't go past end of bitmap + */ + + mlog(0, "resv start: %u resv end: %u goal: %u wanted: %u\n", + resv->r_start, ocfs2_resv_end(resv), goal, wanted); + + assert_spin_locked(&resv_lock); + + if (RB_EMPTY_ROOT(root)) { + /* + * Easiest case - empty tree. We can just take + * whatever window of free bits we want. + */ + + mlog(0, "Empty root\n"); + + clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal, + resmap->m_bitmap_len - goal, + &cstart, &clen); + + /* + * This should never happen - the local alloc window + * will always have free bits when we're called. + */ + BUG_ON(goal == 0 && clen == 0); + + if (clen == 0) + return; + + resv->r_start = cstart; + resv->r_len = clen; + + ocfs2_resv_insert(resmap, resv); + return; + } + + prev_resv = ocfs2_find_resv_lhs(resmap, goal); + + if (prev_resv == NULL) { + mlog(0, "Goal on LHS of leftmost window\n"); + + /* + * A NULL here means that the search code couldn't + * find a window that starts before goal. + * + * However, we can take the first window after goal, + * which is also by definition, the leftmost window in + * the entire tree. If we can find free bits in the + * gap between goal and the LHS window, then the + * reservation can safely be placed there. + * + * Otherwise we fall back to a linear search, checking + * the gaps in between windows for a place to + * allocate. + */ + + next = rb_first(root); + next_resv = rb_entry(next, struct ocfs2_alloc_reservation, + r_node); + + /* + * The search should never return such a window. (see + * comment above + */ + if (next_resv->r_start <= goal) { + mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n", + goal, next_resv->r_start, next_resv->r_len); + ocfs2_dump_resv(resmap); + BUG(); + } + + clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal, + next_resv->r_start - goal, + &cstart, &clen); + if (clen) { + best_len = clen; + best_start = cstart; + if (best_len == wanted) + goto out_insert; + } + + prev_resv = next_resv; + next_resv = NULL; + } + + prev = &prev_resv->r_node; + + /* Now we do a linear search for a window, starting at 'prev_rsv' */ + while (1) { + next = rb_next(prev); + if (next) { + mlog(0, "One more resv found in linear search\n"); + next_resv = rb_entry(next, + struct ocfs2_alloc_reservation, + r_node); + + gap_start = ocfs2_resv_end(prev_resv) + 1; + gap_end = next_resv->r_start - 1; + gap_len = gap_end - gap_start + 1; + } else { + mlog(0, "No next node\n"); + /* + * We're at the rightmost edge of the + * tree. See if a reservation between this + * window and the end of the bitmap will work. + */ + gap_start = ocfs2_resv_end(prev_resv) + 1; + gap_len = resmap->m_bitmap_len - gap_start; + gap_end = resmap->m_bitmap_len - 1; + } + + /* + * No need to check this gap if we have already found + * a larger region of free bits. + */ + if (gap_len <= best_len) + goto next_resv; + + clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start, + gap_len, &cstart, &clen); + if (clen == wanted) { + best_len = clen; + best_start = cstart; + goto out_insert; + } else if (clen > best_len) { + best_len = clen; + best_start = cstart; + } + +next_resv: + if (!next) + break; + + prev = next; + prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation, + r_node); + } + +out_insert: + if (best_len) { + resv->r_start = best_start; + resv->r_len = best_len; + ocfs2_resv_insert(resmap, resv); + } +} + +static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int wanted) +{ + struct ocfs2_alloc_reservation *lru_resv; + int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP); + unsigned int min_bits; + + if (!tmpwindow) + min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1; + else + min_bits = wanted; /* We at know the temp window will use all + * of these bits */ + + /* + * Take the first reservation off the LRU as our 'target'. We + * don't try to be smart about it. There might be a case for + * searching based on size but I don't have enough data to be + * sure. --Mark (3/16/2010) + */ + lru_resv = list_first_entry(&resmap->m_lru, + struct ocfs2_alloc_reservation, r_lru); + + mlog(0, "lru resv: start: %u len: %u end: %u\n", lru_resv->r_start, + lru_resv->r_len, ocfs2_resv_end(lru_resv)); + + /* + * Cannibalize (some or all) of the target reservation and + * feed it to the current window. + */ + if (lru_resv->r_len <= min_bits) { + /* + * Discard completely if size is less than or equal to a + * reasonable threshold - 50% of window bits for non temporary + * windows. + */ + resv->r_start = lru_resv->r_start; + resv->r_len = lru_resv->r_len; + + __ocfs2_resv_discard(resmap, lru_resv); + } else { + unsigned int shrink; + if (tmpwindow) + shrink = min_bits; + else + shrink = lru_resv->r_len / 2; + + lru_resv->r_len -= shrink; + + resv->r_start = ocfs2_resv_end(lru_resv) + 1; + resv->r_len = shrink; + } + + mlog(0, "Reservation now looks like: r_start: %u r_end: %u " + "r_len: %u r_last_start: %u r_last_len: %u\n", + resv->r_start, ocfs2_resv_end(resv), resv->r_len, + resv->r_last_start, resv->r_last_len); + + ocfs2_resv_insert(resmap, resv); +} + +static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int wanted) +{ + unsigned int goal = 0; + + BUG_ON(!ocfs2_resv_empty(resv)); + + /* + * Begin by trying to get a window as close to the previous + * one as possible. Using the most recent allocation as a + * start goal makes sense. + */ + if (resv->r_last_len) { + goal = resv->r_last_start + resv->r_last_len; + if (goal >= resmap->m_bitmap_len) + goal = 0; + } + + __ocfs2_resv_find_window(resmap, resv, goal, wanted); + + /* Search from last alloc didn't work, try once more from beginning. */ + if (ocfs2_resv_empty(resv) && goal != 0) + __ocfs2_resv_find_window(resmap, resv, 0, wanted); + + if (ocfs2_resv_empty(resv)) { + /* + * Still empty? Pull oldest one off the LRU, remove it from + * tree, put this one in it's place. + */ + ocfs2_cannibalize_resv(resmap, resv, wanted); + } + + BUG_ON(ocfs2_resv_empty(resv)); +} + +int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + int *cstart, int *clen) +{ + unsigned int wanted = *clen; + + if (resv == NULL || ocfs2_resmap_disabled(resmap)) + return -ENOSPC; + + spin_lock(&resv_lock); + + /* + * We don't want to over-allocate for temporary + * windows. Otherwise, we run the risk of fragmenting the + * allocation space. + */ + wanted = ocfs2_resv_window_bits(resmap, resv); + if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen) + wanted = *clen; + + if (ocfs2_resv_empty(resv)) { + mlog(0, "empty reservation, find new window\n"); + + /* + * Try to get a window here. If it works, we must fall + * through and test the bitmap . This avoids some + * ping-ponging of windows due to non-reserved space + * being allocation before we initialize a window for + * that inode. + */ + ocfs2_resv_find_window(resmap, resv, wanted); + } + + BUG_ON(ocfs2_resv_empty(resv)); + + *cstart = resv->r_start; + *clen = resv->r_len; + + spin_unlock(&resv_lock); + return 0; +} + +static void + ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + unsigned int start, unsigned int end) +{ + unsigned int rhs = 0; + unsigned int old_end = ocfs2_resv_end(resv); + + BUG_ON(start != resv->r_start || old_end < end); + + /* + * Completely used? We can remove it then. + */ + if (old_end == end) { + __ocfs2_resv_discard(resmap, resv); + return; + } + + rhs = old_end - end; + + /* + * This should have been trapped above. + */ + BUG_ON(rhs == 0); + + resv->r_start = end + 1; + resv->r_len = old_end - resv->r_start + 1; +} + +void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + u32 cstart, u32 clen) +{ + unsigned int cend = cstart + clen - 1; + + if (resmap == NULL || ocfs2_resmap_disabled(resmap)) + return; + + if (resv == NULL) + return; + + BUG_ON(cstart != resv->r_start); + + spin_lock(&resv_lock); + + mlog(0, "claim bits: cstart: %u cend: %u clen: %u r_start: %u " + "r_end: %u r_len: %u, r_last_start: %u r_last_len: %u\n", + cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv), + resv->r_len, resv->r_last_start, resv->r_last_len); + + BUG_ON(cstart < resv->r_start); + BUG_ON(cstart > ocfs2_resv_end(resv)); + BUG_ON(cend > ocfs2_resv_end(resv)); + + ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend); + resv->r_last_start = cstart; + resv->r_last_len = clen; + + /* + * May have been discarded above from + * ocfs2_adjust_resv_from_alloc(). + */ + if (!ocfs2_resv_empty(resv)) + ocfs2_resv_mark_lru(resmap, resv); + + mlog(0, "Reservation now looks like: r_start: %u r_end: %u " + "r_len: %u r_last_start: %u r_last_len: %u\n", + resv->r_start, ocfs2_resv_end(resv), resv->r_len, + resv->r_last_start, resv->r_last_len); + + ocfs2_check_resmap(resmap); + + spin_unlock(&resv_lock); +} diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h new file mode 100644 index 0000000..1e49cc2 --- /dev/null +++ b/fs/ocfs2/reservations.h @@ -0,0 +1,159 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * reservations.h + * + * Allocation reservations function prototypes and structures. + * + * Copyright (C) 2010 Novell. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_RESERVATIONS_H +#define OCFS2_RESERVATIONS_H + +#include <linux/rbtree.h> + +#define OCFS2_DEFAULT_RESV_LEVEL 2 +#define OCFS2_MAX_RESV_LEVEL 9 +#define OCFS2_MIN_RESV_LEVEL 0 + +struct ocfs2_alloc_reservation { + struct rb_node r_node; + + unsigned int r_start; /* Begining of current window */ + unsigned int r_len; /* Length of the window */ + + unsigned int r_last_len; /* Length of most recent alloc */ + unsigned int r_last_start; /* Start of most recent alloc */ + struct list_head r_lru; /* LRU list head */ + + unsigned int r_flags; +}; + +#define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of a btree */ +#define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be + * destroyed immedately after use */ +#define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed + * directory btree */ + +struct ocfs2_reservation_map { + struct rb_root m_reservations; + char *m_disk_bitmap; + + struct ocfs2_super *m_osb; + + /* The following are not initialized to meaningful values until a disk + * bitmap is provided. */ + u32 m_bitmap_len; /* Number of valid + * bits available */ + + struct list_head m_lru; /* LRU of reservations + * structures. */ + +}; + +void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv); + +#define OCFS2_RESV_TYPES (OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR) +void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv, + unsigned int flags); + +int ocfs2_dir_resv_allowed(struct ocfs2_super *osb); + +/** + * ocfs2_resv_discard() - truncate a reservation + * @resmap: + * @resv: the reservation to truncate. + * + * After this function is called, the reservation will be empty, and + * unlinked from the rbtree. + */ +void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv); + + +/** + * ocfs2_resmap_init() - Initialize fields of a reservations bitmap + * @resmap: struct ocfs2_reservation_map to initialize + * @obj: unused for now + * @ops: unused for now + * @max_bitmap_bytes: Maximum size of the bitmap (typically blocksize) + * + * Only possible return value other than '0' is -ENOMEM for failure to + * allocation mirror bitmap. + */ +int ocfs2_resmap_init(struct ocfs2_super *osb, + struct ocfs2_reservation_map *resmap); + +/** + * ocfs2_resmap_restart() - "restart" a reservation bitmap + * @resmap: reservations bitmap + * @clen: Number of valid bits in the bitmap + * @disk_bitmap: the disk bitmap this resmap should refer to. + * + * Re-initialize the parameters of a reservation bitmap. This is + * useful for local alloc window slides. + * + * This function will call ocfs2_trunc_resv against all existing + * reservations. A future version will recalculate existing + * reservations based on the new bitmap. + */ +void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, + unsigned int clen, char *disk_bitmap); + +/** + * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure + * @resmap: the struct ocfs2_reservation_map to uninitialize + */ +void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap); + +/** + * ocfs2_resmap_resv_bits() - Return still-valid reservation bits + * @resmap: reservations bitmap + * @resv: reservation to base search from + * @cstart: start of proposed allocation + * @clen: length (in clusters) of proposed allocation + * + * Using the reservation data from resv, this function will compare + * resmap and resmap->m_disk_bitmap to determine what part (if any) of + * the reservation window is still clear to use. If resv is empty, + * this function will try to allocate a window for it. + * + * On success, zero is returned and the valid allocation area is set in cstart + * and clen. + * + * Returns -ENOSPC if reservations are disabled. + */ +int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + int *cstart, int *clen); + +/** + * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used. + * @resmap: reservations bitmap + * @resv: optional reservation to recalulate based on new bitmap + * @cstart: start of allocation in clusters + * @clen: end of allocation in clusters. + * + * Tell the reservation code that bits were used to fulfill allocation in + * resmap. The bits don't have to have been part of any existing + * reservation. But we must always call this function when bits are claimed. + * Internally, the reservations code will use this information to mark the + * reservations bitmap. If resv is passed, it's next allocation window will be + * calculated. It also expects that 'cstart' is the same as we passed back + * from ocfs2_resmap_resv_bits(). + */ +void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap, + struct ocfs2_alloc_reservation *resv, + u32 cstart, u32 clen); + +#endif /* OCFS2_RESERVATIONS_H */ diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 3c3d673..dacd553 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -134,11 +134,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, le16_add_cpu(&group->bg_free_bits_count, -1 * backups); } - ret = ocfs2_journal_dirty(handle, group_bh); - if (ret < 0) { - mlog_errno(ret); - goto out_rollback; - } + ocfs2_journal_dirty(handle, group_bh); /* update the inode accordingly. */ ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh, @@ -319,7 +315,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != - ocfs2_group_bitmap_size(osb->sb) * 8) { + ocfs2_group_bitmap_size(osb->sb, 0, + osb->s_feature_incompat) * 8) { mlog(ML_ERROR, "The disk is too old and small. " "Force to do offline resize."); ret = -EINVAL; @@ -500,7 +497,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) fe = (struct ocfs2_dinode *)main_bm_bh->b_data; if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != - ocfs2_group_bitmap_size(osb->sb) * 8) { + ocfs2_group_bitmap_size(osb->sb, 0, + osb->s_feature_incompat) * 8) { mlog(ML_ERROR, "The disk is too old and small." " Force to do offline resize."); ret = -EINVAL; @@ -545,12 +543,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) group = (struct ocfs2_group_desc *)group_bh->b_data; group->bg_next_group = cr->c_blkno; - - ret = ocfs2_journal_dirty(handle, group_bh); - if (ret < 0) { - mlog_errno(ret); - goto out_commit; - } + ocfs2_journal_dirty(handle, group_bh); ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 19ba00f..f4c2a9e 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -53,6 +53,15 @@ #define OCFS2_MAX_TO_STEAL 1024 +struct ocfs2_suballoc_result { + u64 sr_bg_blkno; /* The bg we allocated from. Set + to 0 when a block group is + contiguous. */ + u64 sr_blkno; /* The first allocated block */ + unsigned int sr_bit_offset; /* The bit in the bg */ + unsigned int sr_bits; /* How many bits we claimed */ +}; + static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); @@ -60,6 +69,7 @@ static int ocfs2_block_group_fill(handle_t *handle, struct inode *alloc_inode, struct buffer_head *bg_bh, u64 group_blkno, + unsigned int group_clusters, u16 my_chain, struct ocfs2_chain_list *cl); static int ocfs2_block_group_alloc(struct ocfs2_super *osb, @@ -73,20 +83,17 @@ static int ocfs2_cluster_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, u64 max_block, - u16 *bit_off, u16 *bits_found); + struct ocfs2_suballoc_result *res); static int ocfs2_block_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, u64 max_block, - u16 *bit_off, u16 *bits_found); -static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, - struct ocfs2_alloc_context *ac, + struct ocfs2_suballoc_result *res); +static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, handle_t *handle, u32 bits_wanted, u32 min_bits, - u16 *bit_off, - unsigned int *num_bits, - u64 *bg_blkno); + struct ocfs2_suballoc_result *res); static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, int nr); static inline int ocfs2_block_group_set_bits(handle_t *handle, @@ -130,6 +137,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) } brelse(ac->ac_bh); ac->ac_bh = NULL; + ac->ac_resv = NULL; } void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) @@ -325,14 +333,38 @@ out: return rc; } +static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb, + struct ocfs2_group_desc *bg, + struct ocfs2_chain_list *cl, + u64 p_blkno, u32 clusters) +{ + struct ocfs2_extent_list *el = &bg->bg_list; + struct ocfs2_extent_rec *rec; + + BUG_ON(!ocfs2_supports_discontig_bg(osb)); + if (!el->l_next_free_rec) + el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb)); + rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)]; + rec->e_blkno = cpu_to_le64(p_blkno); + rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) / + le16_to_cpu(cl->cl_bpc)); + rec->e_leaf_clusters = cpu_to_le32(clusters); + le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc)); + le16_add_cpu(&bg->bg_free_bits_count, + clusters * le16_to_cpu(cl->cl_bpc)); + le16_add_cpu(&el->l_next_free_rec, 1); +} + static int ocfs2_block_group_fill(handle_t *handle, struct inode *alloc_inode, struct buffer_head *bg_bh, u64 group_blkno, + unsigned int group_clusters, u16 my_chain, struct ocfs2_chain_list *cl) { int status = 0; + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; struct super_block * sb = alloc_inode->i_sb; @@ -359,19 +391,23 @@ static int ocfs2_block_group_fill(handle_t *handle, memset(bg, 0, sb->s_blocksize); strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); - bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb)); - bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); + bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1, + osb->s_feature_incompat)); bg->bg_chain = cpu_to_le16(my_chain); bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); bg->bg_blkno = cpu_to_le64(group_blkno); + if (group_clusters == le16_to_cpu(cl->cl_cpg)) + bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); + else + ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno, + group_clusters); + /* set the 1st bit in the bitmap to account for the descriptor block */ ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); - status = ocfs2_journal_dirty(handle, bg_bh); - if (status < 0) - mlog_errno(status); + ocfs2_journal_dirty(handle, bg_bh); /* There is no need to zero out or otherwise initialize the * other blocks in a group - All valid FS metadata in a block @@ -397,6 +433,238 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) return best; } +static struct buffer_head * +ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_alloc_context *ac, + struct ocfs2_chain_list *cl) +{ + int status; + u32 bit_off, num_bits; + u64 bg_blkno; + struct buffer_head *bg_bh; + unsigned int alloc_rec = ocfs2_find_smallest_chain(cl); + + status = ocfs2_claim_clusters(handle, ac, + le16_to_cpu(cl->cl_cpg), &bit_off, + &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + /* setup the group */ + bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); + mlog(0, "new descriptor, record %u, at block %llu\n", + alloc_rec, (unsigned long long)bg_blkno); + + bg_bh = sb_getblk(osb->sb, bg_blkno); + if (!bg_bh) { + status = -EIO; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); + + status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh, + bg_blkno, num_bits, alloc_rec, cl); + if (status < 0) { + brelse(bg_bh); + mlog_errno(status); + } + +bail: + return status ? ERR_PTR(status) : bg_bh; +} + +static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + unsigned int min_bits, + u32 *bit_off, u32 *num_bits) +{ + int status = 0; + + while (min_bits) { + status = ocfs2_claim_clusters(handle, ac, min_bits, + bit_off, num_bits); + if (status != -ENOSPC) + break; + + min_bits >>= 1; + } + + return status; +} + +static int ocfs2_block_group_grow_discontig(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *bg_bh, + struct ocfs2_alloc_context *ac, + struct ocfs2_chain_list *cl, + unsigned int min_bits) +{ + int status; + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); + struct ocfs2_group_desc *bg = + (struct ocfs2_group_desc *)bg_bh->b_data; + unsigned int needed = le16_to_cpu(cl->cl_cpg) - + le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc); + u32 p_cpos, clusters; + u64 p_blkno; + struct ocfs2_extent_list *el = &bg->bg_list; + + status = ocfs2_journal_access_gd(handle, + INODE_CACHE(alloc_inode), + bg_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) < + le16_to_cpu(el->l_count))) { + if (min_bits > needed) + min_bits = needed; + status = ocfs2_block_group_claim_bits(osb, handle, ac, + min_bits, &p_cpos, + &clusters); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos); + ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno, + clusters); + + min_bits = clusters; + needed = le16_to_cpu(cl->cl_cpg) - + le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc); + } + + if (needed > 0) { + /* + * We have used up all the extent rec but can't fill up + * the cpg. So bail out. + */ + status = -ENOSPC; + goto bail; + } + + ocfs2_journal_dirty(handle, bg_bh); + +bail: + return status; +} + +static void ocfs2_bg_alloc_cleanup(handle_t *handle, + struct ocfs2_alloc_context *cluster_ac, + struct inode *alloc_inode, + struct buffer_head *bg_bh) +{ + int i, ret; + struct ocfs2_group_desc *bg; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + if (!bg_bh) + return; + + bg = (struct ocfs2_group_desc *)bg_bh->b_data; + el = &bg->bg_list; + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode, + cluster_ac->ac_bh, + le64_to_cpu(rec->e_blkno), + le32_to_cpu(rec->e_leaf_clusters)); + if (ret) + mlog_errno(ret); + /* Try all the clusters to free */ + } + + ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh); + brelse(bg_bh); +} + +static struct buffer_head * +ocfs2_block_group_alloc_discontig(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_alloc_context *ac, + struct ocfs2_chain_list *cl) +{ + int status; + u32 bit_off, num_bits; + u64 bg_blkno; + unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1; + struct buffer_head *bg_bh = NULL; + unsigned int alloc_rec = ocfs2_find_smallest_chain(cl); + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); + + if (!ocfs2_supports_discontig_bg(osb)) { + status = -ENOSPC; + goto bail; + } + + status = ocfs2_extend_trans(handle, + ocfs2_calc_bg_discontig_credits(osb->sb)); + if (status) { + mlog_errno(status); + goto bail; + } + + /* + * We're going to be grabbing from multiple cluster groups. + * We don't have enough credits to relink them all, and the + * cluster groups will be staying in cache for the duration of + * this operation. + */ + ac->ac_allow_chain_relink = 0; + + /* Claim the first region */ + status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits, + &bit_off, &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + min_bits = num_bits; + + /* setup the group */ + bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); + mlog(0, "new descriptor, record %u, at block %llu\n", + alloc_rec, (unsigned long long)bg_blkno); + + bg_bh = sb_getblk(osb->sb, bg_blkno); + if (!bg_bh) { + status = -EIO; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); + + status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh, + bg_blkno, num_bits, alloc_rec, cl); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_block_group_grow_discontig(handle, alloc_inode, + bg_bh, ac, cl, min_bits); + if (status) + mlog_errno(status); + +bail: + if (status) + ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh); + return status ? ERR_PTR(status) : bg_bh; +} + /* * We expect the block group allocator to already be locked. */ @@ -412,9 +680,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, struct ocfs2_chain_list *cl; struct ocfs2_alloc_context *ac = NULL; handle_t *handle = NULL; - u32 bit_off, num_bits; u16 alloc_rec; - u64 bg_blkno; struct buffer_head *bg_bh = NULL; struct ocfs2_group_desc *bg; @@ -447,44 +713,20 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, (unsigned long long)*last_alloc_group); ac->ac_last_group = *last_alloc_group; } - status = ocfs2_claim_clusters(osb, - handle, - ac, - le16_to_cpu(cl->cl_cpg), - &bit_off, - &num_bits); - if (status < 0) { + + bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode, + ac, cl); + if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC)) + bg_bh = ocfs2_block_group_alloc_discontig(handle, + alloc_inode, + ac, cl); + if (IS_ERR(bg_bh)) { + status = PTR_ERR(bg_bh); + bg_bh = NULL; if (status != -ENOSPC) mlog_errno(status); goto bail; } - - alloc_rec = ocfs2_find_smallest_chain(cl); - - /* setup the group */ - bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); - mlog(0, "new descriptor, record %u, at block %llu\n", - alloc_rec, (unsigned long long)bg_blkno); - - bg_bh = sb_getblk(osb->sb, bg_blkno); - if (!bg_bh) { - status = -EIO; - mlog_errno(status); - goto bail; - } - ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); - - status = ocfs2_block_group_fill(handle, - alloc_inode, - bg_bh, - bg_blkno, - alloc_rec, - cl); - if (status < 0) { - mlog_errno(status); - goto bail; - } - bg = (struct ocfs2_group_desc *) bg_bh->b_data; status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), @@ -494,10 +736,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, goto bail; } + alloc_rec = le16_to_cpu(bg->bg_chain); le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, le16_to_cpu(bg->bg_free_bits_count)); - le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); - cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno); + le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, + le16_to_cpu(bg->bg_bits)); + cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg->bg_blkno); if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) le16_add_cpu(&cl->cl_next_free_rec, 1); @@ -506,11 +750,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); - status = ocfs2_journal_dirty(handle, bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, bh); spin_lock(&OCFS2_I(alloc_inode)->ip_lock); OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); @@ -760,7 +1000,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, status = ocfs2_reserve_suballoc_bits(osb, (*ac), EXTENT_ALLOC_SYSTEM_INODE, (u32)osb->slot_num, NULL, - ALLOC_NEW_GROUP); + ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP); if (status >= 0) { @@ -946,11 +1186,7 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, status = ocfs2_reserve_local_alloc_bits(osb, bits_wanted, *ac); - if (status == -EFBIG) { - /* The local alloc window is outside ac_max_block. - * use the main bitmap. */ - status = -ENOSPC; - } else if ((status < 0) && (status != -ENOSPC)) { + if ((status < 0) && (status != -ENOSPC)) { mlog_errno(status); goto bail; } @@ -1033,8 +1269,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, struct buffer_head *bg_bh, unsigned int bits_wanted, unsigned int total_bits, - u16 *bit_off, - u16 *bits_found) + struct ocfs2_suballoc_result *res) { void *bitmap; u16 best_offset, best_size; @@ -1078,14 +1313,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, } } - /* XXX: I think the first clause is equivalent to the second - * - jlbec */ - if (found == bits_wanted) { - *bit_off = start - found; - *bits_found = found; - } else if (best_size) { - *bit_off = best_offset; - *bits_found = best_size; + if (best_size) { + res->sr_bit_offset = best_offset; + res->sr_bits = best_size; } else { status = -ENOSPC; /* No error log here -- see the comment above @@ -1129,16 +1359,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, } le16_add_cpu(&bg->bg_free_bits_count, -num_bits); - while(num_bits--) ocfs2_set_bit(bit_off++, bitmap); - status = ocfs2_journal_dirty(handle, - group_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, group_bh); bail: mlog_exit(status); @@ -1202,12 +1426,7 @@ static int ocfs2_relink_block_group(handle_t *handle, } prev_bg->bg_next_group = bg->bg_next_group; - - status = ocfs2_journal_dirty(handle, prev_bg_bh); - if (status < 0) { - mlog_errno(status); - goto out_rollback; - } + ocfs2_journal_dirty(handle, prev_bg_bh); status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -1217,12 +1436,7 @@ static int ocfs2_relink_block_group(handle_t *handle, } bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; - - status = ocfs2_journal_dirty(handle, bg_bh); - if (status < 0) { - mlog_errno(status); - goto out_rollback; - } + ocfs2_journal_dirty(handle, bg_bh); status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -1232,14 +1446,8 @@ static int ocfs2_relink_block_group(handle_t *handle, } fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; + ocfs2_journal_dirty(handle, fe_bh); - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) { - mlog_errno(status); - goto out_rollback; - } - - status = 0; out_rollback: if (status < 0) { fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); @@ -1263,14 +1471,13 @@ static int ocfs2_cluster_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, u64 max_block, - u16 *bit_off, u16 *bits_found) + struct ocfs2_suballoc_result *res) { int search = -ENOSPC; int ret; u64 blkoff; struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - u16 tmp_off, tmp_found; unsigned int max_bits, gd_cluster_off; BUG_ON(!ocfs2_is_cluster_bitmap(inode)); @@ -1297,15 +1504,15 @@ static int ocfs2_cluster_group_search(struct inode *inode, ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), group_bh, bits_wanted, - max_bits, - &tmp_off, &tmp_found); + max_bits, res); if (ret) return ret; if (max_block) { blkoff = ocfs2_clusters_to_blocks(inode->i_sb, gd_cluster_off + - tmp_off + tmp_found); + res->sr_bit_offset + + res->sr_bits); mlog(0, "Checking %llu against %llu\n", (unsigned long long)blkoff, (unsigned long long)max_block); @@ -1317,16 +1524,14 @@ static int ocfs2_cluster_group_search(struct inode *inode, * return success, but we still want to return * -ENOSPC unless it found the minimum number * of bits. */ - if (min_bits <= tmp_found) { - *bit_off = tmp_off; - *bits_found = tmp_found; + if (min_bits <= res->sr_bits) search = 0; /* success */ - } else if (tmp_found) { + else if (res->sr_bits) { /* * Don't show bits which we'll be returning * for allocation to the local alloc bitmap. */ - ocfs2_local_alloc_seen_free_bits(osb, tmp_found); + ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits); } } @@ -1337,7 +1542,7 @@ static int ocfs2_block_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, u64 max_block, - u16 *bit_off, u16 *bits_found) + struct ocfs2_suballoc_result *res) { int ret = -ENOSPC; u64 blkoff; @@ -1350,10 +1555,10 @@ static int ocfs2_block_group_search(struct inode *inode, ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), group_bh, bits_wanted, le16_to_cpu(bg->bg_bits), - bit_off, bits_found); + res); if (!ret && max_block) { - blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off + - *bits_found; + blkoff = le64_to_cpu(bg->bg_blkno) + + res->sr_bit_offset + res->sr_bits; mlog(0, "Checking %llu against %llu\n", (unsigned long long)blkoff, (unsigned long long)max_block); @@ -1386,33 +1591,76 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode, tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); - - ret = ocfs2_journal_dirty(handle, di_bh); - if (ret < 0) - mlog_errno(ret); + ocfs2_journal_dirty(handle, di_bh); out: return ret; } +static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, + struct ocfs2_extent_rec *rec, + struct ocfs2_chain_list *cl) +{ + unsigned int bpc = le16_to_cpu(cl->cl_bpc); + unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc; + unsigned int bitcount = le32_to_cpu(rec->e_leaf_clusters) * bpc; + + if (res->sr_bit_offset < bitoff) + return 0; + if (res->sr_bit_offset >= (bitoff + bitcount)) + return 0; + res->sr_blkno = le64_to_cpu(rec->e_blkno) + + (res->sr_bit_offset - bitoff); + if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount)) + res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset; + return 1; +} + +static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac, + struct ocfs2_group_desc *bg, + struct ocfs2_suballoc_result *res) +{ + int i; + u64 bg_blkno = res->sr_bg_blkno; /* Save off */ + struct ocfs2_extent_rec *rec; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; + struct ocfs2_chain_list *cl = &di->id2.i_chain; + + if (ocfs2_is_cluster_bitmap(ac->ac_inode)) { + res->sr_blkno = 0; + return; + } + + res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset; + res->sr_bg_blkno = 0; /* Clear it for contig block groups */ + if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) || + !bg->bg_list.l_next_free_rec) + return; + + for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) { + rec = &bg->bg_list.l_recs[i]; + if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) { + res->sr_bg_blkno = bg_blkno; /* Restore */ + break; + } + } +} + static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, handle_t *handle, u32 bits_wanted, u32 min_bits, - u16 *bit_off, - unsigned int *num_bits, - u64 gd_blkno, + struct ocfs2_suballoc_result *res, u16 *bits_left) { int ret; - u16 found; struct buffer_head *group_bh = NULL; struct ocfs2_group_desc *gd; struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; struct inode *alloc_inode = ac->ac_inode; - ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno, - &group_bh); + ret = ocfs2_read_group_descriptor(alloc_inode, di, + res->sr_bg_blkno, &group_bh); if (ret < 0) { mlog_errno(ret); return ret; @@ -1420,17 +1668,18 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, gd = (struct ocfs2_group_desc *) group_bh->b_data; ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, - ac->ac_max_block, bit_off, &found); + ac->ac_max_block, res); if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); goto out; } - *num_bits = found; + if (!ret) + ocfs2_bg_discontig_fix_result(ac, gd, res); ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, - *num_bits, + res->sr_bits, le16_to_cpu(gd->bg_chain)); if (ret < 0) { mlog_errno(ret); @@ -1438,7 +1687,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, } ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, - *bit_off, *num_bits); + res->sr_bit_offset, res->sr_bits); if (ret < 0) mlog_errno(ret); @@ -1454,13 +1703,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, handle_t *handle, u32 bits_wanted, u32 min_bits, - u16 *bit_off, - unsigned int *num_bits, - u64 *bg_blkno, + struct ocfs2_suballoc_result *res, u16 *bits_left) { int status; - u16 chain, tmp_bits; + u16 chain; u32 tmp_used; u64 next_group; struct inode *alloc_inode = ac->ac_inode; @@ -1489,8 +1736,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, * the 1st group with any empty bits. */ while ((status = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, - ac->ac_max_block, bit_off, - &tmp_bits)) == -ENOSPC) { + ac->ac_max_block, + res)) == -ENOSPC) { if (!bg->bg_next_group) break; @@ -1515,11 +1762,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, } mlog(0, "alloc succeeds: we give %u bits from block group %llu\n", - tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno)); + res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno)); - *num_bits = tmp_bits; + res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno); + + BUG_ON(res->sr_bits == 0); + if (!status) + ocfs2_bg_discontig_fix_result(ac, bg, res); - BUG_ON(*num_bits == 0); /* * Keep track of previous block descriptor read. When @@ -1536,7 +1786,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, */ if (ac->ac_allow_chain_relink && (prev_group_bh) && - (ocfs2_block_group_reasonably_empty(bg, *num_bits))) { + (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) { status = ocfs2_relink_block_group(handle, alloc_inode, ac->ac_bh, group_bh, prev_group_bh, chain); @@ -1558,31 +1808,24 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, } tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); - fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used); - le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits)); - - status = ocfs2_journal_dirty(handle, - ac->ac_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used); + le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits); + ocfs2_journal_dirty(handle, ac->ac_bh); status = ocfs2_block_group_set_bits(handle, alloc_inode, bg, group_bh, - *bit_off, - *num_bits); + res->sr_bit_offset, + res->sr_bits); if (status < 0) { mlog_errno(status); goto bail; } - mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits, + mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, (unsigned long long)le64_to_cpu(fe->i_blkno)); - *bg_blkno = le64_to_cpu(bg->bg_blkno); *bits_left = le16_to_cpu(bg->bg_free_bits_count); bail: brelse(group_bh); @@ -1593,19 +1836,15 @@ bail: } /* will give out up to bits_wanted contiguous bits. */ -static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, - struct ocfs2_alloc_context *ac, +static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, handle_t *handle, u32 bits_wanted, u32 min_bits, - u16 *bit_off, - unsigned int *num_bits, - u64 *bg_blkno) + struct ocfs2_suballoc_result *res) { int status; u16 victim, i; u16 bits_left = 0; - u64 hint_blkno = ac->ac_last_group; struct ocfs2_chain_list *cl; struct ocfs2_dinode *fe; @@ -1623,7 +1862,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, if (le32_to_cpu(fe->id1.bitmap1.i_used) >= le32_to_cpu(fe->id1.bitmap1.i_total)) { - ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used " + ocfs2_error(ac->ac_inode->i_sb, + "Chain allocator dinode %llu has %u used " "bits but only %u total.", (unsigned long long)le64_to_cpu(fe->i_blkno), le32_to_cpu(fe->id1.bitmap1.i_used), @@ -1632,22 +1872,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, goto bail; } - if (hint_blkno) { + res->sr_bg_blkno = ac->ac_last_group; + if (res->sr_bg_blkno) { /* Attempt to short-circuit the usual search mechanism * by jumping straight to the most recently used * allocation group. This helps us mantain some * contiguousness across allocations. */ status = ocfs2_search_one_group(ac, handle, bits_wanted, - min_bits, bit_off, num_bits, - hint_blkno, &bits_left); - if (!status) { - /* Be careful to update *bg_blkno here as the - * caller is expecting it to be filled in, and - * ocfs2_search_one_group() won't do that for - * us. */ - *bg_blkno = hint_blkno; + min_bits, res, &bits_left); + if (!status) goto set_hint; - } if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; @@ -1660,8 +1894,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, ac->ac_chain = victim; ac->ac_allow_chain_relink = 1; - status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off, - num_bits, bg_blkno, &bits_left); + status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, + res, &bits_left); if (!status) goto set_hint; if (status < 0 && status != -ENOSPC) { @@ -1685,8 +1919,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, ac->ac_chain = i; status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, - bit_off, num_bits, bg_blkno, - &bits_left); + res, &bits_left); if (!status) break; if (status < 0 && status != -ENOSPC) { @@ -1703,7 +1936,7 @@ set_hint: if (bits_left < min_bits) ac->ac_last_group = 0; else - ac->ac_last_group = *bg_blkno; + ac->ac_last_group = res->sr_bg_blkno; } bail: @@ -1711,37 +1944,37 @@ bail: return status; } -int ocfs2_claim_metadata(struct ocfs2_super *osb, - handle_t *handle, +int ocfs2_claim_metadata(handle_t *handle, struct ocfs2_alloc_context *ac, u32 bits_wanted, + u64 *suballoc_loc, u16 *suballoc_bit_start, unsigned int *num_bits, u64 *blkno_start) { int status; - u64 bg_blkno; + struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; BUG_ON(!ac); BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); BUG_ON(ac->ac_which != OCFS2_AC_USE_META); - status = ocfs2_claim_suballoc_bits(osb, - ac, + status = ocfs2_claim_suballoc_bits(ac, handle, bits_wanted, 1, - suballoc_bit_start, - num_bits, - &bg_blkno); + &res); if (status < 0) { mlog_errno(status); goto bail; } - atomic_inc(&osb->alloc_stats.bg_allocs); + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); - *blkno_start = bg_blkno + (u64) *suballoc_bit_start; - ac->ac_bits_given += (*num_bits); + *suballoc_loc = res.sr_bg_blkno; + *suballoc_bit_start = res.sr_bit_offset; + *blkno_start = res.sr_blkno; + ac->ac_bits_given += res.sr_bits; + *num_bits = res.sr_bits; status = 0; bail: mlog_exit(status); @@ -1749,10 +1982,10 @@ bail: } static void ocfs2_init_inode_ac_group(struct inode *dir, - struct buffer_head *parent_fe_bh, + struct buffer_head *parent_di_bh, struct ocfs2_alloc_context *ac) { - struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data; /* * Try to allocate inodes from some specific group. * @@ -1766,10 +1999,14 @@ static void ocfs2_init_inode_ac_group(struct inode *dir, if (OCFS2_I(dir)->ip_last_used_group && OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; - else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot) - ac->ac_last_group = ocfs2_which_suballoc_group( - le64_to_cpu(fe->i_blkno), - le16_to_cpu(fe->i_suballoc_bit)); + else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) { + if (di->i_suballoc_loc) + ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc); + else + ac->ac_last_group = ocfs2_which_suballoc_group( + le64_to_cpu(di->i_blkno), + le16_to_cpu(di->i_suballoc_bit)); + } } static inline void ocfs2_save_inode_ac_group(struct inode *dir, @@ -1779,17 +2016,16 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir, OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; } -int ocfs2_claim_new_inode(struct ocfs2_super *osb, - handle_t *handle, +int ocfs2_claim_new_inode(handle_t *handle, struct inode *dir, struct buffer_head *parent_fe_bh, struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, u16 *suballoc_bit, u64 *fe_blkno) { int status; - unsigned int num_bits; - u64 bg_blkno; + struct ocfs2_suballoc_result res; mlog_entry_void(); @@ -1800,23 +2036,22 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb, ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); - status = ocfs2_claim_suballoc_bits(osb, - ac, + status = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, - suballoc_bit, - &num_bits, - &bg_blkno); + &res); if (status < 0) { mlog_errno(status); goto bail; } - atomic_inc(&osb->alloc_stats.bg_allocs); + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); - BUG_ON(num_bits != 1); + BUG_ON(res.sr_bits != 1); - *fe_blkno = bg_blkno + (u64) (*suballoc_bit); + *suballoc_loc = res.sr_bg_blkno; + *suballoc_bit = res.sr_bit_offset; + *fe_blkno = res.sr_blkno; ac->ac_bits_given++; ocfs2_save_inode_ac_group(dir, ac); status = 0; @@ -1886,8 +2121,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode, * contig. allocation, set to '1' to indicate we can deal with extents * of any size. */ -int __ocfs2_claim_clusters(struct ocfs2_super *osb, - handle_t *handle, +int __ocfs2_claim_clusters(handle_t *handle, struct ocfs2_alloc_context *ac, u32 min_clusters, u32 max_clusters, @@ -1896,8 +2130,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb, { int status; unsigned int bits_wanted = max_clusters; - u64 bg_blkno = 0; - u16 bg_bit_off; + struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; + struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb); mlog_entry_void(); @@ -1907,6 +2141,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb, && ac->ac_which != OCFS2_AC_USE_MAIN); if (ac->ac_which == OCFS2_AC_USE_LOCAL) { + WARN_ON(min_clusters > 1); + status = ocfs2_claim_local_alloc_bits(osb, handle, ac, @@ -1929,20 +2165,19 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb, if (bits_wanted > (osb->bitmap_cpg - 1)) bits_wanted = osb->bitmap_cpg - 1; - status = ocfs2_claim_suballoc_bits(osb, - ac, + status = ocfs2_claim_suballoc_bits(ac, handle, bits_wanted, min_clusters, - &bg_bit_off, - num_clusters, - &bg_blkno); + &res); if (!status) { + BUG_ON(res.sr_blkno); /* cluster alloc can't set */ *cluster_start = ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, - bg_blkno, - bg_bit_off); + res.sr_bg_blkno, + res.sr_bit_offset); atomic_inc(&osb->alloc_stats.bitmap_data); + *num_clusters = res.sr_bits; } } if (status < 0) { @@ -1958,8 +2193,7 @@ bail: return status; } -int ocfs2_claim_clusters(struct ocfs2_super *osb, - handle_t *handle, +int ocfs2_claim_clusters(handle_t *handle, struct ocfs2_alloc_context *ac, u32 min_clusters, u32 *cluster_start, @@ -1967,7 +2201,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb, { unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; - return __ocfs2_claim_clusters(osb, handle, ac, min_clusters, + return __ocfs2_claim_clusters(handle, ac, min_clusters, bits_wanted, cluster_start, num_clusters); } @@ -2023,9 +2257,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, if (undo_fn) jbd_unlock_bh_state(group_bh); - status = ocfs2_journal_dirty(handle, group_bh); - if (status < 0) - mlog_errno(status); + ocfs2_journal_dirty(handle, group_bh); bail: return status; } @@ -2092,12 +2324,7 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, count); tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); - - status = ocfs2_journal_dirty(handle, alloc_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, alloc_bh); bail: brelse(group_bh); @@ -2126,6 +2353,8 @@ int ocfs2_free_dinode(handle_t *handle, u16 bit = le16_to_cpu(di->i_suballoc_bit); u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); + if (di->i_suballoc_loc) + bg_blkno = le64_to_cpu(di->i_suballoc_loc); return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, inode_alloc_bh, bit, bg_blkno, 1); } @@ -2395,7 +2624,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, struct buffer_head *alloc_bh, u64 blkno, u16 bit, int *res) { - struct ocfs2_dinode *alloc_fe; + struct ocfs2_dinode *alloc_di; struct ocfs2_group_desc *group; struct buffer_head *group_bh = NULL; u64 bg_blkno; @@ -2404,17 +2633,20 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno, (unsigned int)bit); - alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data; - if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) { + alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data; + if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) { mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", (unsigned int)bit, - ocfs2_bits_per_group(&alloc_fe->id2.i_chain)); + ocfs2_bits_per_group(&alloc_di->id2.i_chain)); status = -EINVAL; goto bail; } - bg_blkno = ocfs2_which_suballoc_group(blkno, bit); - status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno, + if (alloc_di->i_suballoc_loc) + bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc); + else + bg_blkno = ocfs2_which_suballoc_group(blkno, bit); + status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno, &group_bh); if (status < 0) { mlog(ML_ERROR, "read group %llu failed %d\n", diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index e0f46df..a017dd3 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -26,13 +26,14 @@ #ifndef _CHAINALLOC_H_ #define _CHAINALLOC_H_ +struct ocfs2_suballoc_result; typedef int (group_search_t)(struct inode *, struct buffer_head *, u32, /* bits_wanted */ u32, /* min_bits */ u64, /* max_block */ - u16 *, /* *bit_off */ - u16 *); /* *bits_found */ + struct ocfs2_suballoc_result *); + /* found bits */ struct ocfs2_alloc_context { struct inode *ac_inode; /* which bitmap are we allocating from? */ @@ -54,6 +55,8 @@ struct ocfs2_alloc_context { u64 ac_last_group; u64 ac_max_block; /* Highest block number to allocate. 0 is is the same as ~0 - unlimited */ + + struct ocfs2_alloc_reservation *ac_resv; }; void ocfs2_init_steal_slots(struct ocfs2_super *osb); @@ -80,22 +83,21 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb, u32 bits_wanted, struct ocfs2_alloc_context **ac); -int ocfs2_claim_metadata(struct ocfs2_super *osb, - handle_t *handle, +int ocfs2_claim_metadata(handle_t *handle, struct ocfs2_alloc_context *ac, u32 bits_wanted, + u64 *suballoc_loc, u16 *suballoc_bit_start, u32 *num_bits, u64 *blkno_start); -int ocfs2_claim_new_inode(struct ocfs2_super *osb, - handle_t *handle, +int ocfs2_claim_new_inode(handle_t *handle, struct inode *dir, struct buffer_head *parent_fe_bh, struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, u16 *suballoc_bit, u64 *fe_blkno); -int ocfs2_claim_clusters(struct ocfs2_super *osb, - handle_t *handle, +int ocfs2_claim_clusters(handle_t *handle, struct ocfs2_alloc_context *ac, u32 min_clusters, u32 *cluster_start, @@ -104,8 +106,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb, * Use this variant of ocfs2_claim_clusters to specify a maxiumum * number of clusters smaller than the allocation reserved. */ -int __ocfs2_claim_clusters(struct ocfs2_super *osb, - handle_t *handle, +int __ocfs2_claim_clusters(handle_t *handle, struct ocfs2_alloc_context *ac, u32 min_clusters, u32 max_clusters, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index dee0319..1c2c39f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -94,7 +94,9 @@ struct mount_options unsigned long mount_opt; unsigned int atime_quantum; signed short slot; - unsigned int localalloc_opt; + int localalloc_opt; + unsigned int resv_level; + int dir_resv_level; char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; }; @@ -176,6 +178,8 @@ enum { Opt_noacl, Opt_usrquota, Opt_grpquota, + Opt_resv_level, + Opt_dir_resv_level, Opt_err, }; @@ -202,6 +206,8 @@ static const match_table_t tokens = { {Opt_noacl, "noacl"}, {Opt_usrquota, "usrquota"}, {Opt_grpquota, "grpquota"}, + {Opt_resv_level, "resv_level=%u"}, + {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_err, NULL} }; @@ -1028,8 +1034,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; osb->osb_commit_interval = parsed_options.commit_interval; - osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); - osb->local_alloc_bits = osb->local_alloc_default_bits; + + ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt); + osb->osb_resv_level = parsed_options.resv_level; + osb->osb_dir_resv_level = parsed_options.resv_level; + if (parsed_options.dir_resv_level == -1) + osb->osb_dir_resv_level = parsed_options.resv_level; + else + osb->osb_dir_resv_level = parsed_options.dir_resv_level; status = ocfs2_verify_userspace_stack(osb, &parsed_options); if (status) @@ -1285,11 +1297,13 @@ static int ocfs2_parse_options(struct super_block *sb, options ? options : "(none)"); mopt->commit_interval = 0; - mopt->mount_opt = 0; + mopt->mount_opt = OCFS2_MOUNT_NOINTR; mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; mopt->slot = OCFS2_INVALID_SLOT; - mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; + mopt->localalloc_opt = -1; mopt->cluster_stack[0] = '\0'; + mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; + mopt->dir_resv_level = -1; if (!options) { status = 1; @@ -1380,7 +1394,7 @@ static int ocfs2_parse_options(struct super_block *sb, status = 0; goto bail; } - if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8)) + if (option >= 0) mopt->localalloc_opt = option; break; case Opt_localflocks: @@ -1433,6 +1447,28 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; break; + case Opt_resv_level: + if (is_remount) + break; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= OCFS2_MIN_RESV_LEVEL && + option < OCFS2_MAX_RESV_LEVEL) + mopt->resv_level = option; + break; + case Opt_dir_resv_level: + if (is_remount) + break; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= OCFS2_MIN_RESV_LEVEL && + option < OCFS2_MAX_RESV_LEVEL) + mopt->dir_resv_level = option; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1487,7 +1523,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) (unsigned) (osb->osb_commit_interval / HZ)); local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); - if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE) + if (local_alloc_megs != ocfs2_la_default_mb(osb)) seq_printf(s, ",localalloc=%d", local_alloc_megs); if (opts & OCFS2_MOUNT_LOCALFLOCKS) @@ -1514,6 +1550,12 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) else seq_printf(s, ",noacl"); + if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL) + seq_printf(s, ",resv_level=%d", osb->osb_resv_level); + + if (osb->osb_dir_resv_level != osb->osb_resv_level) + seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level); + return 0; } @@ -1688,6 +1730,8 @@ static void ocfs2_inode_init_once(void *data) oi->ip_blkno = 0ULL; oi->ip_clusters = 0; + ocfs2_resv_init_once(&oi->ip_la_data_resv); + ocfs2_lock_res_init_once(&oi->ip_rw_lockres); ocfs2_lock_res_init_once(&oi->ip_inode_lockres); ocfs2_lock_res_init_once(&oi->ip_open_lockres); @@ -2042,6 +2086,12 @@ static int ocfs2_initialize_super(struct super_block *sb, init_waitqueue_head(&osb->osb_mount_event); + status = ocfs2_resmap_init(osb, &osb->osb_la_resmap); + if (status) { + mlog_errno(status); + goto bail; + } + osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); if (!osb->vol_label) { mlog(ML_ERROR, "unable to alloc vol label\n"); @@ -2224,9 +2274,11 @@ static int ocfs2_initialize_super(struct super_block *sb, } osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; + osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters; iput(inode); - osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8; + osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0, + osb->s_feature_incompat) * 8; status = ocfs2_init_slot_info(osb); if (status < 0) { @@ -2509,5 +2561,25 @@ void __ocfs2_abort(struct super_block* sb, ocfs2_handle_error(sb); } +/* + * Void signal blockers, because in-kernel sigprocmask() only fails + * when SIG_* is wrong. + */ +void ocfs2_block_signals(sigset_t *oldset) +{ + int rc; + sigset_t blocked; + + sigfillset(&blocked); + rc = sigprocmask(SIG_BLOCK, &blocked, oldset); + BUG_ON(rc); +} + +void ocfs2_unblock_signals(sigset_t *oldset) +{ + int rc = sigprocmask(SIG_SETMASK, oldset, NULL); + BUG_ON(rc); +} + module_init(ocfs2_init); module_exit(ocfs2_exit); diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index 783f527..40c7de0 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -45,4 +45,11 @@ void __ocfs2_abort(struct super_block *sb, #define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) +/* + * Void signal blockers, because in-kernel sigprocmask() only fails + * when SIG_* is wrong. + */ +void ocfs2_block_signals(sigset_t *oldset); +void ocfs2_unblock_signals(sigset_t *oldset); + #endif /* OCFS2_SUPER_H */ diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 3e77730..98ee6c4 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -79,6 +79,7 @@ struct ocfs2_xattr_set_ctxt { struct ocfs2_alloc_context *meta_ac; struct ocfs2_alloc_context *data_ac; struct ocfs2_cached_dealloc_ctxt dealloc; + int set_abort; }; #define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) @@ -739,11 +740,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode, goto leave; } - status = ocfs2_journal_dirty(handle, vb->vb_bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } + ocfs2_journal_dirty(handle, vb->vb_bh); clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters; @@ -786,12 +783,7 @@ static int __ocfs2_remove_xattr_range(struct inode *inode, } le32_add_cpu(&vb->vb_xv->xr_clusters, -len); - - ret = ocfs2_journal_dirty(handle, vb->vb_bh); - if (ret) { - mlog_errno(ret); - goto out; - } + ocfs2_journal_dirty(handle, vb->vb_bh); if (ext_flags & OCFS2_EXT_REFCOUNTED) ret = ocfs2_decrease_refcount(inode, handle, @@ -1374,11 +1366,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode, memset(bh->b_data + cp_len, 0, blocksize - cp_len); - ret = ocfs2_journal_dirty(handle, bh); - if (ret < 0) { - mlog_errno(ret); - goto out; - } + ocfs2_journal_dirty(handle, bh); brelse(bh); bh = NULL; @@ -2148,15 +2136,19 @@ alloc_value: orig_clusters = ocfs2_xa_value_clusters(loc); rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt); if (rc < 0) { - /* - * If we tried to grow an existing external value, - * ocfs2_xa_cleanuP-value_truncate() is going to - * let it stand. We have to restore its original - * value size. - */ - loc->xl_entry->xe_value_size = orig_value_size; + ctxt->set_abort = 1; ocfs2_xa_cleanup_value_truncate(loc, "growing", orig_clusters); + /* + * If we were growing an existing value, + * ocfs2_xa_cleanup_value_truncate() won't remove + * the entry. We need to restore the original value + * size. + */ + if (loc->xl_entry) { + BUG_ON(!orig_value_size); + loc->xl_entry->xe_value_size = orig_value_size; + } mlog_errno(rc); } } @@ -2479,7 +2471,10 @@ static int ocfs2_xattr_free_block(struct inode *inode, xb = (struct ocfs2_xattr_block *)blk_bh->b_data; blk = le64_to_cpu(xb->xb_blkno); bit = le16_to_cpu(xb->xb_suballoc_bit); - bg_blkno = ocfs2_which_suballoc_group(blk, bit); + if (xb->xb_suballoc_loc) + bg_blkno = le64_to_cpu(xb->xb_suballoc_loc); + else + bg_blkno = ocfs2_which_suballoc_group(blk, bit); xb_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, @@ -2594,9 +2589,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); spin_unlock(&oi->ip_lock); - ret = ocfs2_journal_dirty(handle, di_bh); - if (ret < 0) - mlog_errno(ret); + ocfs2_journal_dirty(handle, di_bh); out_commit: ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out: @@ -2724,9 +2717,7 @@ static int ocfs2_xattr_ibody_init(struct inode *inode, di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); spin_unlock(&oi->ip_lock); - ret = ocfs2_journal_dirty(ctxt->handle, di_bh); - if (ret < 0) - mlog_errno(ret); + ocfs2_journal_dirty(ctxt->handle, di_bh); out: return ret; @@ -2846,9 +2837,8 @@ static int ocfs2_create_xattr_block(struct inode *inode, int ret; u16 suballoc_bit_start; u32 num_got; - u64 first_blkno; + u64 suballoc_loc, first_blkno; struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *new_bh = NULL; struct ocfs2_xattr_block *xblk; @@ -2859,9 +2849,9 @@ static int ocfs2_create_xattr_block(struct inode *inode, goto end; } - ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1, - &suballoc_bit_start, &num_got, - &first_blkno); + ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1, + &suballoc_loc, &suballoc_bit_start, + &num_got, &first_blkno); if (ret < 0) { mlog_errno(ret); goto end; @@ -2883,8 +2873,10 @@ static int ocfs2_create_xattr_block(struct inode *inode, memset(xblk, 0, inode->i_sb->s_blocksize); strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot); + xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc); xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); - xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); + xblk->xb_fs_generation = + cpu_to_le32(OCFS2_SB(inode->i_sb)->fs_generation); xblk->xb_blkno = cpu_to_le64(first_blkno); if (indexed) { struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; @@ -2956,7 +2948,7 @@ static int ocfs2_xattr_block_set(struct inode *inode, ret = ocfs2_xa_set(&loc, xi, ctxt); if (!ret) xs->here = loc.xl_entry; - else if (ret != -ENOSPC) + else if ((ret != -ENOSPC) || ctxt->set_abort) goto end; else { ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); @@ -3312,14 +3304,13 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, goto out; } - ret = ocfs2_extend_trans(ctxt->handle, credits + - ctxt->handle->h_buffer_credits); + ret = ocfs2_extend_trans(ctxt->handle, credits); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); - } else if (ret == -ENOSPC) { + } else if ((ret == -ENOSPC) && !ctxt->set_abort) { if (di->i_xattr_loc && !xbs->xattr_bh) { ret = ocfs2_xattr_block_find(inode, xi->xi_name_index, @@ -3343,8 +3334,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, goto out; } - ret = ocfs2_extend_trans(ctxt->handle, credits + - ctxt->handle->h_buffer_credits); + ret = ocfs2_extend_trans(ctxt->handle, credits); if (ret) { mlog_errno(ret); goto out; @@ -3378,8 +3368,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, goto out; } - ret = ocfs2_extend_trans(ctxt->handle, credits + - ctxt->handle->h_buffer_credits); + ret = ocfs2_extend_trans(ctxt->handle, credits); if (ret) { mlog_errno(ret); goto out; @@ -4249,7 +4238,6 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, u32 bit_off, len; u64 blkno; handle_t *handle = ctxt->handle; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_inode_info *oi = OCFS2_I(inode); struct buffer_head *xb_bh = xs->xattr_bh; struct ocfs2_xattr_block *xb = @@ -4277,7 +4265,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, goto out; } - ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, + ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1, 1, &bit_off, &len); if (ret) { mlog_errno(ret); @@ -4887,8 +4875,7 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, * We need to update the first bucket of the old extent and all * the buckets going to the new extent. */ - credits = ((num_buckets + 1) * blks_per_bucket) + - handle->h_buffer_credits; + credits = ((num_buckets + 1) * blks_per_bucket); ret = ocfs2_extend_trans(handle, credits); if (ret) { mlog_errno(ret); @@ -4958,7 +4945,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode, u32 *first_hash) { u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); - int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits; + int ret, credits = 2 * blk_per_bucket; BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); @@ -5099,7 +5086,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, goto leave; } - ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1, + ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1, clusters_to_add, &bit_off, &num_bits); if (ret < 0) { if (ret != -ENOSPC) @@ -5153,9 +5140,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, goto leave; } - ret = ocfs2_journal_dirty(handle, root_bh); - if (ret < 0) - mlog_errno(ret); + ocfs2_journal_dirty(handle, root_bh); leave: return ret; @@ -5200,8 +5185,7 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode, * existing bucket. Then we add the last existing bucket, the * new bucket, and the first bucket (3 * blk_per_bucket). */ - credits = (end_blk - target_blk) + (3 * blk_per_bucket) + - handle->h_buffer_credits; + credits = (end_blk - target_blk) + (3 * blk_per_bucket); ret = ocfs2_extend_trans(handle, credits); if (ret) { mlog_errno(ret); @@ -5477,12 +5461,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, } le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len); - - ret = ocfs2_journal_dirty(handle, root_bh); - if (ret) { - mlog_errno(ret); - goto out_commit; - } + ocfs2_journal_dirty(handle, root_bh); ret = ocfs2_truncate_log_append(osb, handle, blkno, len); if (ret) @@ -6935,7 +6914,7 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode, goto out; } - ret = ocfs2_claim_clusters(osb, handle, data_ac, + ret = ocfs2_claim_clusters(handle, data_ac, len, &p_cluster, &num_clusters); if (ret) { mlog_errno(ret); diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index c82af6a..b44bb83 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -3,7 +3,6 @@ * Copyright (C) 2006 Bob Copeland <me@bobcopeland.com> * Released under GPL v2. */ -#include <linux/version.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/slab.h> diff --git a/fs/proc/base.c b/fs/proc/base.c index 8418fcc..c7f9f23 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -730,6 +730,7 @@ out_no_task: static const struct file_operations proc_info_file_operations = { .read = proc_info_read, + .llseek = generic_file_llseek, }; static int proc_single_show(struct seq_file *m, void *v) @@ -987,6 +988,7 @@ out_no_task: static const struct file_operations proc_environ_operations = { .read = environ_read, + .llseek = generic_file_llseek, }; static ssize_t oom_adjust_read(struct file *file, char __user *buf, @@ -1060,6 +1062,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, static const struct file_operations proc_oom_adjust_operations = { .read = oom_adjust_read, .write = oom_adjust_write, + .llseek = generic_file_llseek, }; #ifdef CONFIG_AUDITSYSCALL @@ -1131,6 +1134,7 @@ out_free_page: static const struct file_operations proc_loginuid_operations = { .read = proc_loginuid_read, .write = proc_loginuid_write, + .llseek = generic_file_llseek, }; static ssize_t proc_sessionid_read(struct file * file, char __user * buf, @@ -1151,6 +1155,7 @@ static ssize_t proc_sessionid_read(struct file * file, char __user * buf, static const struct file_operations proc_sessionid_operations = { .read = proc_sessionid_read, + .llseek = generic_file_llseek, }; #endif @@ -1202,6 +1207,7 @@ static ssize_t proc_fault_inject_write(struct file * file, static const struct file_operations proc_fault_inject_operations = { .read = proc_fault_inject_read, .write = proc_fault_inject_write, + .llseek = generic_file_llseek, }; #endif @@ -1943,7 +1949,7 @@ static ssize_t proc_fdinfo_read(struct file *file, char __user *buf, } static const struct file_operations proc_fdinfo_file_operations = { - .open = nonseekable_open, + .open = nonseekable_open, .read = proc_fdinfo_read, }; @@ -2227,6 +2233,7 @@ out_no_task: static const struct file_operations proc_pid_attr_operations = { .read = proc_pid_attr_read, .write = proc_pid_attr_write, + .llseek = generic_file_llseek, }; static const struct pid_entry attr_dir_stuff[] = { @@ -2347,6 +2354,7 @@ static ssize_t proc_coredump_filter_write(struct file *file, static const struct file_operations proc_coredump_filter_operations = { .read = proc_coredump_filter_read, .write = proc_coredump_filter_write, + .llseek = generic_file_llseek, }; #endif diff --git a/fs/proc/inode.c b/fs/proc/inode.c index d35b232..aea8502 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -232,9 +232,9 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne if (rv == -ENOIOCTLCMD) rv = -EINVAL; } else if (ioctl) { - lock_kernel(); + WARN_ONCE(1, "Procfs ioctl handlers must use unlocked_ioctl, " + "%pf will be called without the Bkl held\n", ioctl); rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg); - unlock_kernel(); } pde_users_dec(pde); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 19979a2..c837a77 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -558,6 +558,7 @@ static int open_kcore(struct inode *inode, struct file *filp) static const struct file_operations proc_kcore_operations = { .read = read_kcore, .open = open_kcore, + .llseek = generic_file_llseek, }; #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index cfe90a4..bd4b5a7 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -53,6 +53,7 @@ static const struct file_operations proc_kmsg_operations = { .poll = kmsg_poll, .open = kmsg_open, .release = kmsg_release, + .llseek = generic_file_llseek, }; static int __init proc_kmsg_init(void) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 9fbc99e..91c817f 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -163,6 +163,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, static const struct file_operations proc_vmcore_operations = { .read = read_vmcore, + .llseek = generic_file_llseek, }; static struct vmcore* __init get_new_element(void) diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index c948534..f47cd21 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -214,7 +214,7 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts) return 0; } -static int ramfs_fill_super(struct super_block * sb, void * data, int silent) +int ramfs_fill_super(struct super_block *sb, void *data, int silent) { struct ramfs_fs_info *fsi; struct inode *inode = NULL; diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index e9d2935..4e321f7 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -46,9 +46,9 @@ struct bin_buffer { }; static int -fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) +fill_read(struct file *file, char *buffer, loff_t off, size_t count) { - struct sysfs_dirent *attr_sd = dentry->d_fsdata; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; int rc; @@ -59,7 +59,7 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) rc = -EIO; if (attr->read) - rc = attr->read(kobj, attr, buffer, off, count); + rc = attr->read(file, kobj, attr, buffer, off, count); sysfs_put_active(attr_sd); @@ -70,8 +70,7 @@ static ssize_t read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) { struct bin_buffer *bb = file->private_data; - struct dentry *dentry = file->f_path.dentry; - int size = dentry->d_inode->i_size; + int size = file->f_path.dentry->d_inode->i_size; loff_t offs = *off; int count = min_t(size_t, bytes, PAGE_SIZE); char *temp; @@ -92,7 +91,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) mutex_lock(&bb->mutex); - count = fill_read(dentry, bb->buffer, offs, count); + count = fill_read(file, bb->buffer, offs, count); if (count < 0) { mutex_unlock(&bb->mutex); goto out_free; @@ -117,9 +116,9 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) } static int -flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count) +flush_write(struct file *file, char *buffer, loff_t offset, size_t count) { - struct sysfs_dirent *attr_sd = dentry->d_fsdata; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; int rc; @@ -130,7 +129,7 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count) rc = -EIO; if (attr->write) - rc = attr->write(kobj, attr, buffer, offset, count); + rc = attr->write(file, kobj, attr, buffer, offset, count); sysfs_put_active(attr_sd); @@ -141,8 +140,7 @@ static ssize_t write(struct file *file, const char __user *userbuf, size_t bytes, loff_t *off) { struct bin_buffer *bb = file->private_data; - struct dentry *dentry = file->f_path.dentry; - int size = dentry->d_inode->i_size; + int size = file->f_path.dentry->d_inode->i_size; loff_t offs = *off; int count = min_t(size_t, bytes, PAGE_SIZE); char *temp; @@ -165,7 +163,7 @@ static ssize_t write(struct file *file, const char __user *userbuf, memcpy(bb->buffer, temp, count); - count = flush_write(dentry, bb->buffer, offs, count); + count = flush_write(file, bb->buffer, offs, count); mutex_unlock(&bb->mutex); if (count > 0) @@ -363,7 +361,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma) if (!attr->mmap) goto out_put; - rc = attr->mmap(kobj, attr, vma); + rc = attr->mmap(file, kobj, attr, vma); if (rc) goto out_put; @@ -501,7 +499,7 @@ int sysfs_create_bin_file(struct kobject *kobj, void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { - sysfs_hash_and_remove(kobj->sd, attr->attr.name); + sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_create_bin_file); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 5907178..7e54bac 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -380,7 +380,7 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) { struct sysfs_inode_attrs *ps_iattr; - if (sysfs_find_dirent(acxt->parent_sd, sd->s_name)) + if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name)) return -EEXIST; sd->s_parent = sysfs_get(acxt->parent_sd); @@ -533,13 +533,17 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) * Pointer to sysfs_dirent if found, NULL if not. */ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, + const void *ns, const unsigned char *name) { struct sysfs_dirent *sd; - for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) + for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) { + if (ns && sd->s_ns && (sd->s_ns != ns)) + continue; if (!strcmp(sd->s_name, name)) return sd; + } return NULL; } @@ -558,12 +562,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, * Pointer to sysfs_dirent if found, NULL if not. */ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, + const void *ns, const unsigned char *name) { struct sysfs_dirent *sd; mutex_lock(&sysfs_mutex); - sd = sysfs_find_dirent(parent_sd, name); + sd = sysfs_find_dirent(parent_sd, ns, name); sysfs_get(sd); mutex_unlock(&sysfs_mutex); @@ -572,7 +577,8 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, EXPORT_SYMBOL_GPL(sysfs_get_dirent); static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, - const char *name, struct sysfs_dirent **p_sd) + enum kobj_ns_type type, const void *ns, const char *name, + struct sysfs_dirent **p_sd) { umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; struct sysfs_addrm_cxt acxt; @@ -583,6 +589,9 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, sd = sysfs_new_dirent(name, mode, SYSFS_DIR); if (!sd) return -ENOMEM; + + sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT); + sd->s_ns = ns; sd->s_dir.kobj = kobj; /* link in */ @@ -601,7 +610,33 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, int sysfs_create_subdir(struct kobject *kobj, const char *name, struct sysfs_dirent **p_sd) { - return create_dir(kobj, kobj->sd, name, p_sd); + return create_dir(kobj, kobj->sd, + KOBJ_NS_TYPE_NONE, NULL, name, p_sd); +} + +/** + * sysfs_read_ns_type: return associated ns_type + * @kobj: the kobject being queried + * + * Each kobject can be tagged with exactly one namespace type + * (i.e. network or user). Return the ns_type associated with + * this object if any + */ +static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj) +{ + const struct kobj_ns_type_operations *ops; + enum kobj_ns_type type; + + ops = kobj_child_ns_ops(kobj); + if (!ops) + return KOBJ_NS_TYPE_NONE; + + type = ops->type; + BUG_ON(type <= KOBJ_NS_TYPE_NONE); + BUG_ON(type >= KOBJ_NS_TYPES); + BUG_ON(!kobj_ns_type_registered(type)); + + return type; } /** @@ -610,7 +645,9 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name, */ int sysfs_create_dir(struct kobject * kobj) { + enum kobj_ns_type type; struct sysfs_dirent *parent_sd, *sd; + const void *ns = NULL; int error = 0; BUG_ON(!kobj); @@ -620,7 +657,11 @@ int sysfs_create_dir(struct kobject * kobj) else parent_sd = &sysfs_root; - error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd); + if (sysfs_ns_type(parent_sd)) + ns = kobj->ktype->namespace(kobj); + type = sysfs_read_ns_type(kobj); + + error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd); if (!error) kobj->sd = sd; return error; @@ -630,13 +671,19 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct dentry *ret = NULL; - struct sysfs_dirent *parent_sd = dentry->d_parent->d_fsdata; + struct dentry *parent = dentry->d_parent; + struct sysfs_dirent *parent_sd = parent->d_fsdata; struct sysfs_dirent *sd; struct inode *inode; + enum kobj_ns_type type; + const void *ns; mutex_lock(&sysfs_mutex); - sd = sysfs_find_dirent(parent_sd, dentry->d_name.name); + type = sysfs_ns_type(parent_sd); + ns = sysfs_info(dir->i_sb)->ns[type]; + + sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name); /* no such entry */ if (!sd) { @@ -735,7 +782,8 @@ void sysfs_remove_dir(struct kobject * kobj) } int sysfs_rename(struct sysfs_dirent *sd, - struct sysfs_dirent *new_parent_sd, const char *new_name) + struct sysfs_dirent *new_parent_sd, const void *new_ns, + const char *new_name) { const char *dup_name = NULL; int error; @@ -743,12 +791,12 @@ int sysfs_rename(struct sysfs_dirent *sd, mutex_lock(&sysfs_mutex); error = 0; - if ((sd->s_parent == new_parent_sd) && + if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) && (strcmp(sd->s_name, new_name) == 0)) goto out; /* nothing to rename */ error = -EEXIST; - if (sysfs_find_dirent(new_parent_sd, new_name)) + if (sysfs_find_dirent(new_parent_sd, new_ns, new_name)) goto out; /* rename sysfs_dirent */ @@ -770,6 +818,7 @@ int sysfs_rename(struct sysfs_dirent *sd, sd->s_parent = new_parent_sd; sysfs_link_sibling(sd); } + sd->s_ns = new_ns; error = 0; out: @@ -780,19 +829,28 @@ int sysfs_rename(struct sysfs_dirent *sd, int sysfs_rename_dir(struct kobject *kobj, const char *new_name) { - return sysfs_rename(kobj->sd, kobj->sd->s_parent, new_name); + struct sysfs_dirent *parent_sd = kobj->sd->s_parent; + const void *new_ns = NULL; + + if (sysfs_ns_type(parent_sd)) + new_ns = kobj->ktype->namespace(kobj); + + return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name); } int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) { struct sysfs_dirent *sd = kobj->sd; struct sysfs_dirent *new_parent_sd; + const void *new_ns = NULL; BUG_ON(!sd->s_parent); + if (sysfs_ns_type(sd->s_parent)) + new_ns = kobj->ktype->namespace(kobj); new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root; - return sysfs_rename(sd, new_parent_sd, sd->s_name); + return sysfs_rename(sd, new_parent_sd, new_ns, sd->s_name); } /* Relationship between s_mode and the DT_xxx types */ @@ -807,32 +865,35 @@ static int sysfs_dir_release(struct inode *inode, struct file *filp) return 0; } -static struct sysfs_dirent *sysfs_dir_pos(struct sysfs_dirent *parent_sd, - ino_t ino, struct sysfs_dirent *pos) +static struct sysfs_dirent *sysfs_dir_pos(const void *ns, + struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) { if (pos) { int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) && pos->s_parent == parent_sd && ino == pos->s_ino; sysfs_put(pos); - if (valid) - return pos; + if (!valid) + pos = NULL; } - pos = NULL; - if ((ino > 1) && (ino < INT_MAX)) { + if (!pos && (ino > 1) && (ino < INT_MAX)) { pos = parent_sd->s_dir.children; while (pos && (ino > pos->s_ino)) pos = pos->s_sibling; } + while (pos && pos->s_ns && pos->s_ns != ns) + pos = pos->s_sibling; return pos; } -static struct sysfs_dirent *sysfs_dir_next_pos(struct sysfs_dirent *parent_sd, - ino_t ino, struct sysfs_dirent *pos) +static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, + struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) { - pos = sysfs_dir_pos(parent_sd, ino, pos); + pos = sysfs_dir_pos(ns, parent_sd, ino, pos); if (pos) pos = pos->s_sibling; + while (pos && pos->s_ns && pos->s_ns != ns) + pos = pos->s_sibling; return pos; } @@ -841,8 +902,13 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) struct dentry *dentry = filp->f_path.dentry; struct sysfs_dirent * parent_sd = dentry->d_fsdata; struct sysfs_dirent *pos = filp->private_data; + enum kobj_ns_type type; + const void *ns; ino_t ino; + type = sysfs_ns_type(parent_sd); + ns = sysfs_info(dentry->d_sb)->ns[type]; + if (filp->f_pos == 0) { ino = parent_sd->s_ino; if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) @@ -857,9 +923,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) filp->f_pos++; } mutex_lock(&sysfs_mutex); - for (pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos); + for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos); pos; - pos = sysfs_dir_next_pos(parent_sd, filp->f_pos, pos)) { + pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) { const char * name; unsigned int type; int len, ret; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index e222b25..1beaa73 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -478,9 +478,12 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr) mutex_lock(&sysfs_mutex); if (sd && dir) - sd = sysfs_find_dirent(sd, dir); + /* Only directories are tagged, so no need to pass + * a tag explicitly. + */ + sd = sysfs_find_dirent(sd, NULL, dir); if (sd && attr) - sd = sysfs_find_dirent(sd, attr); + sd = sysfs_find_dirent(sd, NULL, attr); if (sd) sysfs_notify_dirent(sd); @@ -569,7 +572,7 @@ int sysfs_add_file_to_group(struct kobject *kobj, int error; if (group) - dir_sd = sysfs_get_dirent(kobj->sd, group); + dir_sd = sysfs_get_dirent(kobj->sd, NULL, group); else dir_sd = sysfs_get(kobj->sd); @@ -599,7 +602,7 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) mutex_lock(&sysfs_mutex); rc = -ENOENT; - sd = sysfs_find_dirent(kobj->sd, attr->name); + sd = sysfs_find_dirent(kobj->sd, NULL, attr->name); if (!sd) goto out; @@ -624,7 +627,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) { - sysfs_hash_and_remove(kobj->sd, attr->name); + sysfs_hash_and_remove(kobj->sd, NULL, attr->name); } void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) @@ -646,11 +649,11 @@ void sysfs_remove_file_from_group(struct kobject *kobj, struct sysfs_dirent *dir_sd; if (group) - dir_sd = sysfs_get_dirent(kobj->sd, group); + dir_sd = sysfs_get_dirent(kobj->sd, NULL, group); else dir_sd = sysfs_get(kobj->sd); if (dir_sd) { - sysfs_hash_and_remove(dir_sd, attr->name); + sysfs_hash_and_remove(dir_sd, NULL, attr->name); sysfs_put(dir_sd); } } diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index fe61194..23c1e59 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -23,7 +23,7 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, int i; for (i = 0, attr = grp->attrs; *attr; i++, attr++) - sysfs_hash_and_remove(dir_sd, (*attr)->name); + sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); } static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, @@ -39,7 +39,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, * visibility. Do this by first removing then * re-adding (if required) the file */ if (update) - sysfs_hash_and_remove(dir_sd, (*attr)->name); + sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name); if (grp->is_visible) { mode = grp->is_visible(kobj, *attr, i); if (!mode) @@ -132,7 +132,7 @@ void sysfs_remove_group(struct kobject * kobj, struct sysfs_dirent *sd; if (grp->name) { - sd = sysfs_get_dirent(dir_sd, grp->name); + sd = sysfs_get_dirent(dir_sd, NULL, grp->name); if (!sd) { WARN(!sd, KERN_WARNING "sysfs group %p not found for " "kobject '%s'\n", grp, kobject_name(kobj)); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index a4a0a94..bbd77e9 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -324,7 +324,7 @@ void sysfs_delete_inode(struct inode *inode) sysfs_put(sd); } -int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) +int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name) { struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; @@ -334,7 +334,9 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) sysfs_addrm_start(&acxt, dir_sd); - sd = sysfs_find_dirent(dir_sd, name); + sd = sysfs_find_dirent(dir_sd, ns, name); + if (sd && (sd->s_ns != ns)) + sd = NULL; if (sd) sysfs_remove_one(&acxt, sd); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 7761378..281c0c9 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -35,7 +35,7 @@ static const struct super_operations sysfs_ops = { struct sysfs_dirent sysfs_root = { .s_name = "", .s_count = ATOMIC_INIT(1), - .s_flags = SYSFS_DIR, + .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT), .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, .s_ino = 1, }; @@ -72,18 +72,107 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) return 0; } +static int sysfs_test_super(struct super_block *sb, void *data) +{ + struct sysfs_super_info *sb_info = sysfs_info(sb); + struct sysfs_super_info *info = data; + enum kobj_ns_type type; + int found = 1; + + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { + if (sb_info->ns[type] != info->ns[type]) + found = 0; + } + return found; +} + +static int sysfs_set_super(struct super_block *sb, void *data) +{ + int error; + error = set_anon_super(sb, data); + if (!error) + sb->s_fs_info = data; + return error; +} + static int sysfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt); + struct sysfs_super_info *info; + enum kobj_ns_type type; + struct super_block *sb; + int error; + + error = -ENOMEM; + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + goto out; + + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) + info->ns[type] = kobj_ns_current(type); + + sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info); + if (IS_ERR(sb) || sb->s_fs_info != info) + kfree(info); + if (IS_ERR(sb)) { + error = PTR_ERR(sb); + goto out; + } + if (!sb->s_root) { + sb->s_flags = flags; + error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + if (error) { + deactivate_locked_super(sb); + goto out; + } + sb->s_flags |= MS_ACTIVE; + } + + simple_set_mnt(mnt, sb); + error = 0; +out: + return error; +} + +static void sysfs_kill_sb(struct super_block *sb) +{ + struct sysfs_super_info *info = sysfs_info(sb); + + /* Remove the superblock from fs_supers/s_instances + * so we can't find it, before freeing sysfs_super_info. + */ + kill_anon_super(sb); + kfree(info); } static struct file_system_type sysfs_fs_type = { .name = "sysfs", .get_sb = sysfs_get_sb, - .kill_sb = kill_anon_super, + .kill_sb = sysfs_kill_sb, }; +void sysfs_exit_ns(enum kobj_ns_type type, const void *ns) +{ + struct super_block *sb; + + mutex_lock(&sysfs_mutex); + spin_lock(&sb_lock); + list_for_each_entry(sb, &sysfs_fs_type.fs_supers, s_instances) { + struct sysfs_super_info *info = sysfs_info(sb); + /* + * If we see a superblock on the fs_supers/s_instances + * list the unmount has not completed and sb->s_fs_info + * points to a valid struct sysfs_super_info. + */ + /* Ignore superblocks with the wrong ns */ + if (info->ns[type] != ns) + continue; + info->ns[type] = NULL; + } + spin_unlock(&sb_lock); + mutex_unlock(&sysfs_mutex); +} + int __init sysfs_init(void) { int err = -ENOMEM; diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index b93ec51..f71246b 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -58,6 +58,8 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, if (!sd) goto out_put; + if (sysfs_ns_type(parent_sd)) + sd->s_ns = target->ktype->namespace(target); sd->s_symlink.target_sd = target_sd; target_sd = NULL; /* reference is now owned by the symlink */ @@ -107,6 +109,26 @@ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, } /** + * sysfs_delete_link - remove symlink in object's directory. + * @kobj: object we're acting for. + * @targ: object we're pointing to. + * @name: name of the symlink to remove. + * + * Unlike sysfs_remove_link sysfs_delete_link has enough information + * to successfully delete symlinks in tagged directories. + */ +void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, + const char *name) +{ + const void *ns = NULL; + spin_lock(&sysfs_assoc_lock); + if (targ->sd) + ns = targ->sd->s_ns; + spin_unlock(&sysfs_assoc_lock); + sysfs_hash_and_remove(kobj->sd, ns, name); +} + +/** * sysfs_remove_link - remove symlink in object's directory. * @kobj: object we're acting for. * @name: name of the symlink to remove. @@ -121,7 +143,7 @@ void sysfs_remove_link(struct kobject * kobj, const char * name) else parent_sd = kobj->sd; - sysfs_hash_and_remove(parent_sd, name); + sysfs_hash_and_remove(parent_sd, NULL, name); } /** @@ -137,6 +159,7 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ, const char *old, const char *new) { struct sysfs_dirent *parent_sd, *sd = NULL; + const void *old_ns = NULL, *new_ns = NULL; int result; if (!kobj) @@ -144,8 +167,11 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ, else parent_sd = kobj->sd; + if (targ->sd) + old_ns = targ->sd->s_ns; + result = -ENOENT; - sd = sysfs_get_dirent(parent_sd, old); + sd = sysfs_get_dirent(parent_sd, old_ns, old); if (!sd) goto out; @@ -155,7 +181,10 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ, if (sd->s_symlink.target_sd->s_dir.kobj != targ) goto out; - result = sysfs_rename(sd, parent_sd, new); + if (sysfs_ns_type(parent_sd)) + new_ns = targ->ktype->namespace(targ); + + result = sysfs_rename(sd, parent_sd, new_ns, new); out: sysfs_put(sd); @@ -261,3 +290,4 @@ const struct inode_operations sysfs_symlink_inode_operations = { EXPORT_SYMBOL_GPL(sysfs_create_link); EXPORT_SYMBOL_GPL(sysfs_remove_link); +EXPORT_SYMBOL_GPL(sysfs_rename_link); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 30f5a44..6a13105 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -58,6 +58,7 @@ struct sysfs_dirent { struct sysfs_dirent *s_sibling; const char *s_name; + const void *s_ns; /* namespace tag */ union { struct sysfs_elem_dir s_dir; struct sysfs_elem_symlink s_symlink; @@ -81,14 +82,27 @@ struct sysfs_dirent { #define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) #define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) -#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK -#define SYSFS_FLAG_REMOVED 0x0200 +/* identify any namespace tag on sysfs_dirents */ +#define SYSFS_NS_TYPE_MASK 0xff00 +#define SYSFS_NS_TYPE_SHIFT 8 + +#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK) +#define SYSFS_FLAG_REMOVED 0x020000 static inline unsigned int sysfs_type(struct sysfs_dirent *sd) { return sd->s_flags & SYSFS_TYPE_MASK; } +/* + * Return any namespace tags on this dirent. + * enum kobj_ns_type is defined in linux/kobject.h + */ +static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd) +{ + return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT; +} + #ifdef CONFIG_DEBUG_LOCK_ALLOC #define sysfs_dirent_init_lockdep(sd) \ do { \ @@ -114,6 +128,16 @@ struct sysfs_addrm_cxt { /* * mount.c */ + +/* + * Each sb is associated with a set of namespace tags (i.e. + * the network namespace of the task which mounted this sysfs + * instance). + */ +struct sysfs_super_info { + const void *ns[KOBJ_NS_TYPES]; +}; +#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info)) extern struct sysfs_dirent sysfs_root; extern struct kmem_cache *sysfs_dir_cachep; @@ -137,8 +161,10 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, + const void *ns, const unsigned char *name); struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, + const void *ns, const unsigned char *name); struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); @@ -149,7 +175,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name, void sysfs_remove_subdir(struct sysfs_dirent *sd); int sysfs_rename(struct sysfs_dirent *sd, - struct sysfs_dirent *new_parent_sd, const char *new_name); + struct sysfs_dirent *new_parent_sd, const void *ns, const char *new_name); static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) { @@ -179,7 +205,7 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); -int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); +int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name); int sysfs_inode_init(void); /* diff --git a/fs/timerfd.c b/fs/timerfd.c index 98158de..b86ab8e 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -110,31 +110,14 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, struct timerfd_ctx *ctx = file->private_data; ssize_t res; u64 ticks = 0; - DECLARE_WAITQUEUE(wait, current); if (count < sizeof(ticks)) return -EINVAL; spin_lock_irq(&ctx->wqh.lock); - res = -EAGAIN; - if (!ctx->ticks && !(file->f_flags & O_NONBLOCK)) { - __add_wait_queue(&ctx->wqh, &wait); - for (res = 0;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (ctx->ticks) { - res = 0; - break; - } - if (signal_pending(current)) { - res = -ERESTARTSYS; - break; - } - spin_unlock_irq(&ctx->wqh.lock); - schedule(); - spin_lock_irq(&ctx->wqh.lock); - } - __remove_wait_queue(&ctx->wqh, &wait); - __set_current_state(TASK_RUNNING); - } + if (file->f_flags & O_NONBLOCK) + res = -EAGAIN; + else + res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks); if (ctx->ticks) { ticks = ctx->ticks; if (ctx->expired && ctx->tintv.tv64) { diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 77d5cf4..bcf5a16 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -64,6 +64,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) if (!c->ro_media) { c->ro_media = 1; c->no_chk_data_crc = 0; + c->vfs_sb->s_flags |= MS_RDONLY; ubifs_warn("switched to read-only mode, error %d", err); dbg_dump_stack(); } diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 0f8b996..089eaca 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -45,6 +45,15 @@ #include <linux/pagevec.h> #include <linux/writeback.h> +/* + * Types of I/O for bmap clustering and I/O completion tracking. + */ +enum { + IO_READ, /* mapping for a read */ + IO_DELAY, /* mapping covers delalloc region */ + IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */ + IO_NEW /* just allocated */ +}; /* * Prime number of hash buckets since address is used as the key. @@ -103,8 +112,9 @@ xfs_count_page_state( STATIC struct block_device * xfs_find_bdev_for_inode( - struct xfs_inode *ip) + struct inode *inode) { + struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; if (XFS_IS_REALTIME_INODE(ip)) @@ -183,7 +193,7 @@ xfs_setfilesize( xfs_fsize_t isize; ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); - ASSERT(ioend->io_type != IOMAP_READ); + ASSERT(ioend->io_type != IO_READ); if (unlikely(ioend->io_error)) return 0; @@ -214,7 +224,7 @@ xfs_finish_ioend( if (atomic_dec_and_test(&ioend->io_remaining)) { struct workqueue_struct *wq; - wq = (ioend->io_type == IOMAP_UNWRITTEN) ? + wq = (ioend->io_type == IO_UNWRITTEN) ? xfsconvertd_workqueue : xfsdatad_workqueue; queue_work(wq, &ioend->io_work); if (wait) @@ -237,7 +247,7 @@ xfs_end_io( * For unwritten extents we need to issue transactions to convert a * range to normal written extens after the data I/O has finished. */ - if (ioend->io_type == IOMAP_UNWRITTEN && + if (ioend->io_type == IO_UNWRITTEN && likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { error = xfs_iomap_write_unwritten(ip, ioend->io_offset, @@ -250,7 +260,7 @@ xfs_end_io( * We might have to update the on-disk file size after extending * writes. */ - if (ioend->io_type != IOMAP_READ) { + if (ioend->io_type != IO_READ) { error = xfs_setfilesize(ioend); ASSERT(!error || error == EAGAIN); } @@ -309,21 +319,25 @@ xfs_map_blocks( struct inode *inode, loff_t offset, ssize_t count, - xfs_iomap_t *mapp, + struct xfs_bmbt_irec *imap, int flags) { int nmaps = 1; + int new = 0; - return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps); + return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); } STATIC int -xfs_iomap_valid( - xfs_iomap_t *iomapp, - loff_t offset) +xfs_imap_valid( + struct inode *inode, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) { - return offset >= iomapp->iomap_offset && - offset < iomapp->iomap_offset + iomapp->iomap_bsize; + offset >>= inode->i_blkbits; + + return offset >= imap->br_startoff && + offset < imap->br_startoff + imap->br_blockcount; } /* @@ -554,19 +568,23 @@ xfs_add_to_ioend( STATIC void xfs_map_buffer( + struct inode *inode, struct buffer_head *bh, - xfs_iomap_t *mp, - xfs_off_t offset, - uint block_bits) + struct xfs_bmbt_irec *imap, + xfs_off_t offset) { sector_t bn; + struct xfs_mount *m = XFS_I(inode)->i_mount; + xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff); + xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock); - ASSERT(mp->iomap_bn != IOMAP_DADDR_NULL); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); - bn = (mp->iomap_bn >> (block_bits - BBSHIFT)) + - ((offset - mp->iomap_offset) >> block_bits); + bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) + + ((offset - iomap_offset) >> inode->i_blkbits); - ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME)); + ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode))); bh->b_blocknr = bn; set_buffer_mapped(bh); @@ -574,17 +592,17 @@ xfs_map_buffer( STATIC void xfs_map_at_offset( + struct inode *inode, struct buffer_head *bh, - loff_t offset, - int block_bits, - xfs_iomap_t *iomapp) + struct xfs_bmbt_irec *imap, + xfs_off_t offset) { - ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); - ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY)); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); lock_buffer(bh); - xfs_map_buffer(bh, iomapp, offset, block_bits); - bh->b_bdev = iomapp->iomap_target->bt_bdev; + xfs_map_buffer(inode, bh, imap, offset); + bh->b_bdev = xfs_find_bdev_for_inode(inode); set_buffer_mapped(bh); clear_buffer_delay(bh); clear_buffer_unwritten(bh); @@ -713,11 +731,11 @@ xfs_is_delayed_page( bh = head = page_buffers(page); do { if (buffer_unwritten(bh)) - acceptable = (type == IOMAP_UNWRITTEN); + acceptable = (type == IO_UNWRITTEN); else if (buffer_delay(bh)) - acceptable = (type == IOMAP_DELAY); + acceptable = (type == IO_DELAY); else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable = (type == IOMAP_NEW); + acceptable = (type == IO_NEW); else break; } while ((bh = bh->b_this_page) != head); @@ -740,7 +758,7 @@ xfs_convert_page( struct inode *inode, struct page *page, loff_t tindex, - xfs_iomap_t *mp, + struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, struct writeback_control *wbc, int startio, @@ -750,7 +768,6 @@ xfs_convert_page( xfs_off_t end_offset; unsigned long p_offset; unsigned int type; - int bbits = inode->i_blkbits; int len, page_dirty; int count = 0, done = 0, uptodate = 1; xfs_off_t offset = page_offset(page); @@ -802,19 +819,19 @@ xfs_convert_page( if (buffer_unwritten(bh) || buffer_delay(bh)) { if (buffer_unwritten(bh)) - type = IOMAP_UNWRITTEN; + type = IO_UNWRITTEN; else - type = IOMAP_DELAY; + type = IO_DELAY; - if (!xfs_iomap_valid(mp, offset)) { + if (!xfs_imap_valid(inode, imap, offset)) { done = 1; continue; } - ASSERT(!(mp->iomap_flags & IOMAP_HOLE)); - ASSERT(!(mp->iomap_flags & IOMAP_DELAY)); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); - xfs_map_at_offset(bh, offset, bbits, mp); + xfs_map_at_offset(inode, bh, imap, offset); if (startio) { xfs_add_to_ioend(inode, bh, offset, type, ioendp, done); @@ -826,7 +843,7 @@ xfs_convert_page( page_dirty--; count++; } else { - type = IOMAP_NEW; + type = IO_NEW; if (buffer_mapped(bh) && all_bh && startio) { lock_buffer(bh); xfs_add_to_ioend(inode, bh, offset, @@ -866,7 +883,7 @@ STATIC void xfs_cluster_write( struct inode *inode, pgoff_t tindex, - xfs_iomap_t *iomapp, + struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, struct writeback_control *wbc, int startio, @@ -885,7 +902,7 @@ xfs_cluster_write( for (i = 0; i < pagevec_count(&pvec); i++) { done = xfs_convert_page(inode, pvec.pages[i], tindex++, - iomapp, ioendp, wbc, startio, all_bh); + imap, ioendp, wbc, startio, all_bh); if (done) break; } @@ -930,7 +947,7 @@ xfs_aops_discard_page( loff_t offset = page_offset(page); ssize_t len = 1 << inode->i_blkbits; - if (!xfs_is_delayed_page(page, IOMAP_DELAY)) + if (!xfs_is_delayed_page(page, IO_DELAY)) goto out_invalidate; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -1042,15 +1059,15 @@ xfs_page_state_convert( int unmapped) /* also implies page uptodate */ { struct buffer_head *bh, *head; - xfs_iomap_t iomap; + struct xfs_bmbt_irec imap; xfs_ioend_t *ioend = NULL, *iohead = NULL; loff_t offset; unsigned long p_offset = 0; unsigned int type; __uint64_t end_offset; - pgoff_t end_index, last_index, tlast; + pgoff_t end_index, last_index; ssize_t size, len; - int flags, err, iomap_valid = 0, uptodate = 1; + int flags, err, imap_valid = 0, uptodate = 1; int page_dirty, count = 0; int trylock = 0; int all_bh = unmapped; @@ -1097,7 +1114,7 @@ xfs_page_state_convert( bh = head = page_buffers(page); offset = page_offset(page); flags = BMAPI_READ; - type = IOMAP_NEW; + type = IO_NEW; /* TODO: cleanup count and page_dirty */ @@ -1111,12 +1128,12 @@ xfs_page_state_convert( * the iomap is actually still valid, but the ioend * isn't. shouldn't happen too often. */ - iomap_valid = 0; + imap_valid = 0; continue; } - if (iomap_valid) - iomap_valid = xfs_iomap_valid(&iomap, offset); + if (imap_valid) + imap_valid = xfs_imap_valid(inode, &imap, offset); /* * First case, map an unwritten extent and prepare for @@ -1137,20 +1154,20 @@ xfs_page_state_convert( * Make sure we don't use a read-only iomap */ if (flags == BMAPI_READ) - iomap_valid = 0; + imap_valid = 0; if (buffer_unwritten(bh)) { - type = IOMAP_UNWRITTEN; + type = IO_UNWRITTEN; flags = BMAPI_WRITE | BMAPI_IGNSTATE; } else if (buffer_delay(bh)) { - type = IOMAP_DELAY; + type = IO_DELAY; flags = BMAPI_ALLOCATE | trylock; } else { - type = IOMAP_NEW; + type = IO_NEW; flags = BMAPI_WRITE | BMAPI_MMAP; } - if (!iomap_valid) { + if (!imap_valid) { /* * if we didn't have a valid mapping then we * need to ensure that we put the new mapping @@ -1160,7 +1177,7 @@ xfs_page_state_convert( * for unwritten extent conversion. */ new_ioend = 1; - if (type == IOMAP_NEW) { + if (type == IO_NEW) { size = xfs_probe_cluster(inode, page, bh, head, 0); } else { @@ -1168,14 +1185,14 @@ xfs_page_state_convert( } err = xfs_map_blocks(inode, offset, size, - &iomap, flags); + &imap, flags); if (err) goto error; - iomap_valid = xfs_iomap_valid(&iomap, offset); + imap_valid = xfs_imap_valid(inode, &imap, + offset); } - if (iomap_valid) { - xfs_map_at_offset(bh, offset, - inode->i_blkbits, &iomap); + if (imap_valid) { + xfs_map_at_offset(inode, bh, &imap, offset); if (startio) { xfs_add_to_ioend(inode, bh, offset, type, &ioend, @@ -1194,40 +1211,41 @@ xfs_page_state_convert( * That means it must already have extents allocated * underneath it. Map the extent by reading it. */ - if (!iomap_valid || flags != BMAPI_READ) { + if (!imap_valid || flags != BMAPI_READ) { flags = BMAPI_READ; size = xfs_probe_cluster(inode, page, bh, head, 1); err = xfs_map_blocks(inode, offset, size, - &iomap, flags); + &imap, flags); if (err) goto error; - iomap_valid = xfs_iomap_valid(&iomap, offset); + imap_valid = xfs_imap_valid(inode, &imap, + offset); } /* - * We set the type to IOMAP_NEW in case we are doing a + * We set the type to IO_NEW in case we are doing a * small write at EOF that is extending the file but * without needing an allocation. We need to update the * file size on I/O completion in this case so it is * the same case as having just allocated a new extent * that we are writing into for the first time. */ - type = IOMAP_NEW; + type = IO_NEW; if (trylock_buffer(bh)) { ASSERT(buffer_mapped(bh)); - if (iomap_valid) + if (imap_valid) all_bh = 1; xfs_add_to_ioend(inode, bh, offset, type, - &ioend, !iomap_valid); + &ioend, !imap_valid); page_dirty--; count++; } else { - iomap_valid = 0; + imap_valid = 0; } } else if ((buffer_uptodate(bh) || PageUptodate(page)) && (unmapped || startio)) { - iomap_valid = 0; + imap_valid = 0; } if (!iohead) @@ -1241,12 +1259,23 @@ xfs_page_state_convert( if (startio) xfs_start_page_writeback(page, 1, count); - if (ioend && iomap_valid) { - offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >> - PAGE_CACHE_SHIFT; - tlast = min_t(pgoff_t, offset, last_index); - xfs_cluster_write(inode, page->index + 1, &iomap, &ioend, - wbc, startio, all_bh, tlast); + if (ioend && imap_valid) { + xfs_off_t end_index; + + end_index = imap.br_startoff + imap.br_blockcount; + + /* to bytes */ + end_index <<= inode->i_blkbits; + + /* to pages */ + end_index = (end_index - 1) >> PAGE_CACHE_SHIFT; + + /* check against file size */ + if (end_index > last_index) + end_index = last_index; + + xfs_cluster_write(inode, page->index + 1, &imap, &ioend, + wbc, startio, all_bh, end_index); } if (iohead) @@ -1448,10 +1477,11 @@ __xfs_get_blocks( int direct, bmapi_flags_t flags) { - xfs_iomap_t iomap; + struct xfs_bmbt_irec imap; xfs_off_t offset; ssize_t size; - int niomap = 1; + int nimap = 1; + int new = 0; int error; offset = (xfs_off_t)iblock << inode->i_blkbits; @@ -1462,22 +1492,21 @@ __xfs_get_blocks( return 0; error = xfs_iomap(XFS_I(inode), offset, size, - create ? flags : BMAPI_READ, &iomap, &niomap); + create ? flags : BMAPI_READ, &imap, &nimap, &new); if (error) return -error; - if (niomap == 0) + if (nimap == 0) return 0; - if (iomap.iomap_bn != IOMAP_DADDR_NULL) { + if (imap.br_startblock != HOLESTARTBLOCK && + imap.br_startblock != DELAYSTARTBLOCK) { /* * For unwritten extents do not report a disk address on * the read case (treat as if we're reading into a hole). */ - if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) { - xfs_map_buffer(bh_result, &iomap, offset, - inode->i_blkbits); - } - if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) { + if (create || !ISUNWRITTEN(&imap)) + xfs_map_buffer(inode, bh_result, &imap, offset); + if (create && ISUNWRITTEN(&imap)) { if (direct) bh_result->b_private = inode; set_buffer_unwritten(bh_result); @@ -1488,7 +1517,7 @@ __xfs_get_blocks( * If this is a realtime file, data may be on a different device. * to that pointed to from the buffer_head b_bdev currently. */ - bh_result->b_bdev = iomap.iomap_target->bt_bdev; + bh_result->b_bdev = xfs_find_bdev_for_inode(inode); /* * If we previously allocated a block out beyond eof and we are now @@ -1502,10 +1531,10 @@ __xfs_get_blocks( if (create && ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || (offset >= i_size_read(inode)) || - (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN)))) + (new || ISUNWRITTEN(&imap)))) set_buffer_new(bh_result); - if (iomap.iomap_flags & IOMAP_DELAY) { + if (imap.br_startblock == DELAYSTARTBLOCK) { BUG_ON(direct); if (create) { set_buffer_uptodate(bh_result); @@ -1514,11 +1543,23 @@ __xfs_get_blocks( } } + /* + * If this is O_DIRECT or the mpage code calling tell them how large + * the mapping is, so that we can avoid repeated get_blocks calls. + */ if (direct || size > (1 << inode->i_blkbits)) { - ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0); - offset = min_t(xfs_off_t, - iomap.iomap_bsize - iomap.iomap_delta, size); - bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset); + xfs_off_t mapping_size; + + mapping_size = imap.br_startoff + imap.br_blockcount - iblock; + mapping_size <<= inode->i_blkbits; + + ASSERT(mapping_size > 0); + if (mapping_size > size) + mapping_size = size; + if (mapping_size > LONG_MAX) + mapping_size = LONG_MAX; + + bh_result->b_size = mapping_size; } return 0; @@ -1576,7 +1617,7 @@ xfs_end_io_direct( */ ioend->io_offset = offset; ioend->io_size = size; - if (ioend->io_type == IOMAP_READ) { + if (ioend->io_type == IO_READ) { xfs_finish_ioend(ioend, 0); } else if (private && size > 0) { xfs_finish_ioend(ioend, is_sync_kiocb(iocb)); @@ -1587,7 +1628,7 @@ xfs_end_io_direct( * didn't map an unwritten extent so switch it's completion * handler. */ - ioend->io_type = IOMAP_NEW; + ioend->io_type = IO_NEW; xfs_finish_ioend(ioend, 0); } @@ -1612,10 +1653,10 @@ xfs_vm_direct_IO( struct block_device *bdev; ssize_t ret; - bdev = xfs_find_bdev_for_inode(XFS_I(inode)); + bdev = xfs_find_bdev_for_inode(inode); iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? - IOMAP_UNWRITTEN : IOMAP_READ); + IO_UNWRITTEN : IO_READ); ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, offset, nr_segs, diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 44c2b0e..f01de3c 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1007,25 +1007,20 @@ xfs_bwrite( struct xfs_mount *mp, struct xfs_buf *bp) { - int iowait = (bp->b_flags & XBF_ASYNC) == 0; - int error = 0; + int error; bp->b_strat = xfs_bdstrat_cb; bp->b_mount = mp; bp->b_flags |= XBF_WRITE; - if (!iowait) - bp->b_flags |= _XBF_RUN_QUEUES; + bp->b_flags &= ~(XBF_ASYNC | XBF_READ); xfs_buf_delwri_dequeue(bp); xfs_buf_iostrategy(bp); - if (iowait) { - error = xfs_buf_iowait(bp); - if (error) - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - xfs_buf_relse(bp); - } - + error = xfs_buf_iowait(bp); + if (error) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + xfs_buf_relse(bp); return error; } @@ -1614,7 +1609,8 @@ xfs_mapping_buftarg( STATIC int xfs_alloc_delwrite_queue( - xfs_buftarg_t *btp) + xfs_buftarg_t *btp, + const char *fsname) { int error = 0; @@ -1622,7 +1618,7 @@ xfs_alloc_delwrite_queue( INIT_LIST_HEAD(&btp->bt_delwrite_queue); spin_lock_init(&btp->bt_delwrite_lock); btp->bt_flags = 0; - btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd"); + btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); if (IS_ERR(btp->bt_task)) { error = PTR_ERR(btp->bt_task); goto out_error; @@ -1635,7 +1631,8 @@ out_error: xfs_buftarg_t * xfs_alloc_buftarg( struct block_device *bdev, - int external) + int external, + const char *fsname) { xfs_buftarg_t *btp; @@ -1647,7 +1644,7 @@ xfs_alloc_buftarg( goto error; if (xfs_mapping_buftarg(btp, bdev)) goto error; - if (xfs_alloc_delwrite_queue(btp)) + if (xfs_alloc_delwrite_queue(btp, fsname)) goto error; xfs_alloc_bufhash(btp, external); return btp; diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 386e736..5fbecef 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -390,7 +390,7 @@ static inline void xfs_buf_relse(xfs_buf_t *bp) /* * Handling of buftargs. */ -extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); +extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *); extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 42dd3bc..d8fb1b5 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -115,6 +115,8 @@ xfs_file_fsync( xfs_iflags_clear(ip, XFS_ITRUNCATED); + xfs_ioend_wait(ip); + /* * We always need to make sure that the required inode state is safe on * disk. The inode might be clean but we still might need to force the diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 7b26cc2..699b60c 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -527,6 +527,10 @@ xfs_attrmulti_by_handle( if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) return -XFS_ERROR(EFAULT); + /* overflow check */ + if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t)) + return -E2BIG; + dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); if (IS_ERR(dentry)) return PTR_ERR(dentry); diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 593c05b..9287135 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -420,6 +420,10 @@ xfs_compat_attrmulti_by_handle( sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) return -XFS_ERROR(EFAULT); + /* overflow check */ + if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t)) + return -E2BIG; + dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq); if (IS_ERR(dentry)) return PTR_ERR(dentry); diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index e65a793..9c8019c 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -673,7 +673,10 @@ xfs_vn_fiemap( bm.bmv_length = BTOBB(length); /* We add one because in getbmap world count includes the header */ - bm.bmv_count = fieinfo->fi_extents_max + 1; + bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM : + fieinfo->fi_extents_max + 1; + bm.bmv_count = min_t(__s32, bm.bmv_count, + (PAGE_SIZE * 16 / sizeof(struct getbmapx))); bm.bmv_iflags = BMV_IF_PREALLOC; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) bm.bmv_iflags |= BMV_IF_ATTRFORK; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 29f1edc..e900251 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -789,18 +789,18 @@ xfs_open_devices( * Setup xfs_mount buffer target pointers */ error = ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0); + mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname); if (!mp->m_ddev_targp) goto out_close_rtdev; if (rtdev) { - mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1); + mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname); if (!mp->m_rtdev_targp) goto out_free_ddev_targ; } if (logdev && logdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1); + mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname); if (!mp->m_logdev_targp) goto out_free_rtdev_targ; } else { @@ -902,7 +902,8 @@ xfsaild_start( struct xfs_ail *ailp) { ailp->xa_target = 0; - ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild"); + ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", + ailp->xa_mount->m_fsname); if (IS_ERR(ailp->xa_task)) return -PTR_ERR(ailp->xa_task); return 0; @@ -1092,6 +1093,7 @@ xfs_fs_write_inode( * the code will only flush the inode if it isn't already * being flushed. */ + xfs_ioend_wait(ip); xfs_ilock(ip, XFS_ILOCK_SHARED); if (ip->i_update_core) { error = xfs_log_inode(ip); diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index a427c63..3884e20 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -356,68 +356,23 @@ xfs_commit_dummy_trans( STATIC int xfs_sync_fsdata( - struct xfs_mount *mp, - int flags) + struct xfs_mount *mp) { struct xfs_buf *bp; - struct xfs_buf_log_item *bip; - int error = 0; /* - * If this is xfssyncd() then only sync the superblock if we can - * lock it without sleeping and it is not pinned. + * If the buffer is pinned then push on the log so we won't get stuck + * waiting in the write for someone, maybe ourselves, to flush the log. + * + * Even though we just pushed the log above, we did not have the + * superblock buffer locked at that point so it can become pinned in + * between there and here. */ - if (flags & SYNC_TRYLOCK) { - ASSERT(!(flags & SYNC_WAIT)); - - bp = xfs_getsb(mp, XBF_TRYLOCK); - if (!bp) - goto out; - - bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *); - if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp)) - goto out_brelse; - } else { - bp = xfs_getsb(mp, 0); - - /* - * If the buffer is pinned then push on the log so we won't - * get stuck waiting in the write for someone, maybe - * ourselves, to flush the log. - * - * Even though we just pushed the log above, we did not have - * the superblock buffer locked at that point so it can - * become pinned in between there and here. - */ - if (XFS_BUF_ISPINNED(bp)) - xfs_log_force(mp, 0); - } - - - if (flags & SYNC_WAIT) - XFS_BUF_UNASYNC(bp); - else - XFS_BUF_ASYNC(bp); - - error = xfs_bwrite(mp, bp); - if (error) - return error; - - /* - * If this is a data integrity sync make sure all pending buffers - * are flushed out for the log coverage check below. - */ - if (flags & SYNC_WAIT) - xfs_flush_buftarg(mp->m_ddev_targp, 1); - - if (xfs_log_need_covered(mp)) - error = xfs_commit_dummy_trans(mp, flags); - return error; + bp = xfs_getsb(mp, 0); + if (XFS_BUF_ISPINNED(bp)) + xfs_log_force(mp, 0); - out_brelse: - xfs_buf_relse(bp); - out: - return error; + return xfs_bwrite(mp, bp); } /* @@ -441,7 +396,7 @@ int xfs_quiesce_data( struct xfs_mount *mp) { - int error; + int error, error2 = 0; /* push non-blocking */ xfs_sync_data(mp, 0); @@ -452,13 +407,20 @@ xfs_quiesce_data( xfs_qm_sync(mp, SYNC_WAIT); /* write superblock and hoover up shutdown errors */ - error = xfs_sync_fsdata(mp, SYNC_WAIT); + error = xfs_sync_fsdata(mp); + + /* make sure all delwri buffers are written out */ + xfs_flush_buftarg(mp->m_ddev_targp, 1); + + /* mark the log as covered if needed */ + if (xfs_log_need_covered(mp)) + error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT); /* flush data-only devices */ if (mp->m_rtdev_targp) XFS_bflush(mp->m_rtdev_targp); - return error; + return error ? error : error2; } STATIC void @@ -581,9 +543,9 @@ xfs_flush_inodes( } /* - * Every sync period we need to unpin all items, reclaim inodes, sync - * quota and write out the superblock. We might need to cover the log - * to indicate it is idle. + * Every sync period we need to unpin all items, reclaim inodes and sync + * disk quotas. We might need to cover the log to indicate that the + * filesystem is idle. */ STATIC void xfs_sync_worker( @@ -597,7 +559,8 @@ xfs_sync_worker( xfs_reclaim_inodes(mp, 0); /* dgc: errors ignored here */ error = xfs_qm_sync(mp, SYNC_TRYLOCK); - error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); + if (xfs_log_need_covered(mp)) + error = xfs_commit_dummy_trans(mp, 0); } mp->m_sync_seq++; wake_up(&mp->m_wait_single_sync_task); @@ -660,7 +623,7 @@ xfs_syncd_init( mp->m_sync_work.w_syncer = xfs_sync_worker; mp->m_sync_work.w_mount = mp; mp->m_sync_work.w_completion = NULL; - mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd"); + mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname); if (IS_ERR(mp->m_sync_task)) return -PTR_ERR(mp->m_sync_task); return 0; diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c index 5a10760..207fa77 100644 --- a/fs/xfs/linux-2.6/xfs_trace.c +++ b/fs/xfs/linux-2.6/xfs_trace.c @@ -41,7 +41,6 @@ #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_attr.h" -#include "xfs_attr_sf.h" #include "xfs_attr_leaf.h" #include "xfs_log_priv.h" #include "xfs_buf_item.h" @@ -50,6 +49,9 @@ #include "xfs_aops.h" #include "quota/xfs_dquot_item.h" #include "quota/xfs_dquot.h" +#include "xfs_log_recover.h" +#include "xfs_buf_item.h" +#include "xfs_inode_item.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index fcaa62f..8a319cf 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -32,6 +32,10 @@ struct xfs_da_node_entry; struct xfs_dquot; struct xlog_ticket; struct log; +struct xlog_recover; +struct xlog_recover_item; +struct xfs_buf_log_format; +struct xfs_inode_log_format; DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -562,18 +566,21 @@ DECLARE_EVENT_CLASS(xfs_inode_class, __field(dev_t, dev) __field(xfs_ino_t, ino) __field(int, count) + __field(int, pincount) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->count = atomic_read(&VFS_I(ip)->i_count); + __entry->pincount = atomic_read(&ip->i_pincount); __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx count %d caller %pf", + TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->count, + __entry->pincount, (char *)__entry->caller_ip) ) @@ -583,6 +590,10 @@ DEFINE_EVENT(xfs_inode_class, name, \ TP_ARGS(ip, caller_ip)) DEFINE_INODE_EVENT(xfs_ihold); DEFINE_INODE_EVENT(xfs_irele); +DEFINE_INODE_EVENT(xfs_inode_pin); +DEFINE_INODE_EVENT(xfs_inode_unpin); +DEFINE_INODE_EVENT(xfs_inode_unpin_nowait); + /* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */ DEFINE_INODE_EVENT(xfs_inode); #define xfs_itrace_entry(ip) \ @@ -642,8 +653,6 @@ DEFINE_EVENT(xfs_dquot_class, name, \ TP_PROTO(struct xfs_dquot *dqp), \ TP_ARGS(dqp)) DEFINE_DQUOT_EVENT(xfs_dqadjust); -DEFINE_DQUOT_EVENT(xfs_dqshake_dirty); -DEFINE_DQUOT_EVENT(xfs_dqshake_unlink); DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink); @@ -658,7 +667,6 @@ DEFINE_DQUOT_EVENT(xfs_dqread_fail); DEFINE_DQUOT_EVENT(xfs_dqlookup_found); DEFINE_DQUOT_EVENT(xfs_dqlookup_want); DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist); -DEFINE_DQUOT_EVENT(xfs_dqlookup_move); DEFINE_DQUOT_EVENT(xfs_dqlookup_done); DEFINE_DQUOT_EVENT(xfs_dqget_hit); DEFINE_DQUOT_EVENT(xfs_dqget_miss); @@ -1495,6 +1503,140 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \ DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); +DECLARE_EVENT_CLASS(xfs_log_recover_item_class, + TP_PROTO(struct log *log, struct xlog_recover *trans, + struct xlog_recover_item *item, int pass), + TP_ARGS(log, trans, item, pass), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, item) + __field(xlog_tid_t, tid) + __field(int, type) + __field(int, pass) + __field(int, count) + __field(int, total) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->item = (unsigned long)item; + __entry->tid = trans->r_log_tid; + __entry->type = ITEM_TYPE(item); + __entry->pass = pass; + __entry->count = item->ri_cnt; + __entry->total = item->ri_total; + ), + TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s " + "item region count/total %d/%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tid, + __entry->pass, + (void *)__entry->item, + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __entry->count, + __entry->total) +) + +#define DEFINE_LOG_RECOVER_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_item_class, name, \ + TP_PROTO(struct log *log, struct xlog_recover *trans, \ + struct xlog_recover_item *item, int pass), \ + TP_ARGS(log, trans, item, pass)) + +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover); + +DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class, + TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), + TP_ARGS(log, buf_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(__int64_t, blkno) + __field(unsigned short, len) + __field(unsigned short, flags) + __field(unsigned short, size) + __field(unsigned int, map_size) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->blkno = buf_f->blf_blkno; + __entry->len = buf_f->blf_len; + __entry->flags = buf_f->blf_flags; + __entry->size = buf_f->blf_size; + __entry->map_size = buf_f->blf_map_size; + ), + TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, " + "map_size %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->blkno, + __entry->len, + __entry->flags, + __entry->size, + __entry->map_size) +) + +#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \ + TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \ + TP_ARGS(log, buf_f)) + +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf); + +DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class, + TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), + TP_ARGS(log, in_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned short, size) + __field(int, fields) + __field(unsigned short, asize) + __field(unsigned short, dsize) + __field(__int64_t, blkno) + __field(int, len) + __field(int, boffset) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->ino = in_f->ilf_ino; + __entry->size = in_f->ilf_size; + __entry->fields = in_f->ilf_fields; + __entry->asize = in_f->ilf_asize; + __entry->dsize = in_f->ilf_dsize; + __entry->blkno = in_f->ilf_blkno; + __entry->len = in_f->ilf_len; + __entry->boffset = in_f->ilf_boffset; + ), + TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, " + "dsize %d, blkno 0x%llx, len %d, boffset %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->fields, + __entry->asize, + __entry->dsize, + __entry->blkno, + __entry->len, + __entry->boffset) +) +#define DEFINE_LOG_RECOVER_INO_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \ + TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \ + TP_ARGS(log, in_f)) + +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 5f79dd7..b89ec5d 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -101,7 +101,7 @@ xfs_qm_dqinit( * No need to re-initialize these if this is a reclaimed dquot. */ if (brandnewdquot) { - dqp->dq_flnext = dqp->dq_flprev = dqp; + INIT_LIST_HEAD(&dqp->q_freelist); mutex_init(&dqp->q_qlock); init_waitqueue_head(&dqp->q_pinwait); @@ -119,20 +119,20 @@ xfs_qm_dqinit( * Only the q_core portion was zeroed in dqreclaim_one(). * So, we need to reset others. */ - dqp->q_nrefs = 0; - dqp->q_blkno = 0; - dqp->MPL_NEXT = dqp->HL_NEXT = NULL; - dqp->HL_PREVP = dqp->MPL_PREVP = NULL; - dqp->q_bufoffset = 0; - dqp->q_fileoffset = 0; - dqp->q_transp = NULL; - dqp->q_gdquot = NULL; - dqp->q_res_bcount = 0; - dqp->q_res_icount = 0; - dqp->q_res_rtbcount = 0; - atomic_set(&dqp->q_pincount, 0); - dqp->q_hash = NULL; - ASSERT(dqp->dq_flnext == dqp->dq_flprev); + dqp->q_nrefs = 0; + dqp->q_blkno = 0; + INIT_LIST_HEAD(&dqp->q_mplist); + INIT_LIST_HEAD(&dqp->q_hashlist); + dqp->q_bufoffset = 0; + dqp->q_fileoffset = 0; + dqp->q_transp = NULL; + dqp->q_gdquot = NULL; + dqp->q_res_bcount = 0; + dqp->q_res_icount = 0; + dqp->q_res_rtbcount = 0; + atomic_set(&dqp->q_pincount, 0); + dqp->q_hash = NULL; + ASSERT(list_empty(&dqp->q_freelist)); trace_xfs_dqreuse(dqp); } @@ -158,7 +158,7 @@ void xfs_qm_dqdestroy( xfs_dquot_t *dqp) { - ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp)); + ASSERT(list_empty(&dqp->q_freelist)); mutex_destroy(&dqp->q_qlock); sv_destroy(&dqp->q_pinwait); @@ -252,7 +252,7 @@ xfs_qm_adjust_dqtimers( (be64_to_cpu(d->d_bcount) >= be64_to_cpu(d->d_blk_hardlimit)))) { d->d_btimer = cpu_to_be32(get_seconds() + - XFS_QI_BTIMELIMIT(mp)); + mp->m_quotainfo->qi_btimelimit); } else { d->d_bwarns = 0; } @@ -275,7 +275,7 @@ xfs_qm_adjust_dqtimers( (be64_to_cpu(d->d_icount) >= be64_to_cpu(d->d_ino_hardlimit)))) { d->d_itimer = cpu_to_be32(get_seconds() + - XFS_QI_ITIMELIMIT(mp)); + mp->m_quotainfo->qi_itimelimit); } else { d->d_iwarns = 0; } @@ -298,7 +298,7 @@ xfs_qm_adjust_dqtimers( (be64_to_cpu(d->d_rtbcount) >= be64_to_cpu(d->d_rtb_hardlimit)))) { d->d_rtbtimer = cpu_to_be32(get_seconds() + - XFS_QI_RTBTIMELIMIT(mp)); + mp->m_quotainfo->qi_rtbtimelimit); } else { d->d_rtbwarns = 0; } @@ -325,6 +325,7 @@ xfs_qm_init_dquot_blk( uint type, xfs_buf_t *bp) { + struct xfs_quotainfo *q = mp->m_quotainfo; xfs_dqblk_t *d; int curid, i; @@ -337,16 +338,16 @@ xfs_qm_init_dquot_blk( /* * ID of the first dquot in the block - id's are zero based. */ - curid = id - (id % XFS_QM_DQPERBLK(mp)); + curid = id - (id % q->qi_dqperchunk); ASSERT(curid >= 0); - memset(d, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp))); - for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++) + memset(d, 0, BBTOB(q->qi_dqchunklen)); + for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) xfs_qm_dqinit_core(curid, type, d); xfs_trans_dquot_buf(tp, bp, (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : XFS_BLI_GDQUOT_BUF))); - xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1); + xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } @@ -419,7 +420,7 @@ xfs_qm_dqalloc( /* now we can just get the buffer (there's nothing to read yet) */ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno, - XFS_QI_DQCHUNKLEN(mp), + mp->m_quotainfo->qi_dqchunklen, 0); if (!bp || (error = XFS_BUF_GETERROR(bp))) goto error1; @@ -500,7 +501,8 @@ xfs_qm_dqtobp( */ if (dqp->q_blkno == (xfs_daddr_t) 0) { /* We use the id as an index */ - dqp->q_fileoffset = (xfs_fileoff_t)id / XFS_QM_DQPERBLK(mp); + dqp->q_fileoffset = (xfs_fileoff_t)id / + mp->m_quotainfo->qi_dqperchunk; nmaps = 1; quotip = XFS_DQ_TO_QIP(dqp); xfs_ilock(quotip, XFS_ILOCK_SHARED); @@ -529,7 +531,7 @@ xfs_qm_dqtobp( /* * offset of dquot in the (fixed sized) dquot chunk. */ - dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) * + dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * sizeof(xfs_dqblk_t); if (map.br_startblock == HOLESTARTBLOCK) { /* @@ -559,15 +561,13 @@ xfs_qm_dqtobp( * Read in the buffer, unless we've just done the allocation * (in which case we already have the buf). */ - if (! newdquot) { + if (!newdquot) { trace_xfs_dqtobp_read(dqp); - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - dqp->q_blkno, - XFS_QI_DQCHUNKLEN(mp), - 0, &bp))) { - return (error); - } + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0, &bp); if (error || !bp) return XFS_ERROR(error); } @@ -689,14 +689,14 @@ xfs_qm_idtodq( tp = NULL; if (flags & XFS_QMOPT_DQALLOC) { tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); - if ((error = xfs_trans_reserve(tp, - XFS_QM_DQALLOC_SPACE_RES(mp), - XFS_WRITE_LOG_RES(mp) + - BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 + - 128, - 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT))) { + error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), + XFS_WRITE_LOG_RES(mp) + + BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + + 128, + 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + if (error) { cancelflags = 0; goto error0; } @@ -751,7 +751,6 @@ xfs_qm_dqlookup( { xfs_dquot_t *dqp; uint flist_locked; - xfs_dquot_t *d; ASSERT(mutex_is_locked(&qh->qh_lock)); @@ -760,7 +759,7 @@ xfs_qm_dqlookup( /* * Traverse the hashchain looking for a match */ - for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) { + list_for_each_entry(dqp, &qh->qh_list, q_hashlist) { /* * We already have the hashlock. We don't need the * dqlock to look at the id field of the dquot, since the @@ -772,12 +771,12 @@ xfs_qm_dqlookup( /* * All in core dquots must be on the dqlist of mp */ - ASSERT(dqp->MPL_PREVP != NULL); + ASSERT(!list_empty(&dqp->q_mplist)); xfs_dqlock(dqp); if (dqp->q_nrefs == 0) { - ASSERT (XFS_DQ_IS_ON_FREELIST(dqp)); - if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { + ASSERT(!list_empty(&dqp->q_freelist)); + if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) { trace_xfs_dqlookup_want(dqp); /* @@ -787,7 +786,7 @@ xfs_qm_dqlookup( */ dqp->dq_flags |= XFS_DQ_WANT; xfs_dqunlock(dqp); - xfs_qm_freelist_lock(xfs_Gqm); + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); xfs_dqlock(dqp); dqp->dq_flags &= ~(XFS_DQ_WANT); } @@ -802,46 +801,28 @@ xfs_qm_dqlookup( if (flist_locked) { if (dqp->q_nrefs != 0) { - xfs_qm_freelist_unlock(xfs_Gqm); + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); flist_locked = B_FALSE; } else { - /* - * take it off the freelist - */ + /* take it off the freelist */ trace_xfs_dqlookup_freelist(dqp); - XQM_FREELIST_REMOVE(dqp); - /* xfs_qm_freelist_print(&(xfs_Gqm-> - qm_dqfreelist), - "after removal"); */ + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; } } - /* - * grab a reference - */ XFS_DQHOLD(dqp); if (flist_locked) - xfs_qm_freelist_unlock(xfs_Gqm); + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); /* * move the dquot to the front of the hashchain */ ASSERT(mutex_is_locked(&qh->qh_lock)); - if (dqp->HL_PREVP != &qh->qh_next) { - trace_xfs_dqlookup_move(dqp); - if ((d = dqp->HL_NEXT)) - d->HL_PREVP = dqp->HL_PREVP; - *(dqp->HL_PREVP) = d; - d = qh->qh_next; - d->HL_PREVP = &dqp->HL_NEXT; - dqp->HL_NEXT = d; - dqp->HL_PREVP = &qh->qh_next; - qh->qh_next = dqp; - } + list_move(&dqp->q_hashlist, &qh->qh_list); trace_xfs_dqlookup_done(dqp); *O_dqpp = dqp; - ASSERT(mutex_is_locked(&qh->qh_lock)); - return (0); + return 0; } } @@ -975,16 +956,17 @@ xfs_qm_dqget( */ if (ip) { xfs_ilock(ip, XFS_ILOCK_EXCL); - if (! XFS_IS_DQTYPE_ON(mp, type)) { - /* inode stays locked on return */ - xfs_qm_dqdestroy(dqp); - return XFS_ERROR(ESRCH); - } + /* * A dquot could be attached to this inode by now, since * we had dropped the ilock. */ if (type == XFS_DQ_USER) { + if (!XFS_IS_UQUOTA_ON(mp)) { + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return XFS_ERROR(ESRCH); + } if (ip->i_udquot) { xfs_qm_dqdestroy(dqp); dqp = ip->i_udquot; @@ -992,6 +974,11 @@ xfs_qm_dqget( goto dqret; } } else { + if (!XFS_IS_OQUOTA_ON(mp)) { + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return XFS_ERROR(ESRCH); + } if (ip->i_gdquot) { xfs_qm_dqdestroy(dqp); dqp = ip->i_gdquot; @@ -1033,13 +1020,14 @@ xfs_qm_dqget( */ ASSERT(mutex_is_locked(&h->qh_lock)); dqp->q_hash = h; - XQM_HASHLIST_INSERT(h, dqp); + list_add(&dqp->q_hashlist, &h->qh_list); + h->qh_version++; /* * Attach this dquot to this filesystem's list of all dquots, * kept inside the mount structure in m_quotainfo field */ - xfs_qm_mplist_lock(mp); + mutex_lock(&mp->m_quotainfo->qi_dqlist_lock); /* * We return a locked dquot to the caller, with a reference taken @@ -1047,9 +1035,9 @@ xfs_qm_dqget( xfs_dqlock(dqp); dqp->q_nrefs = 1; - XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp); - - xfs_qm_mplist_unlock(mp); + list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist); + mp->m_quotainfo->qi_dquots++; + mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); mutex_unlock(&h->qh_lock); dqret: ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -1086,10 +1074,10 @@ xfs_qm_dqput( * drop the dqlock and acquire the freelist and dqlock * in the right order; but try to get it out-of-order first */ - if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { + if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) { trace_xfs_dqput_wait(dqp); xfs_dqunlock(dqp); - xfs_qm_freelist_lock(xfs_Gqm); + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); xfs_dqlock(dqp); } @@ -1100,10 +1088,8 @@ xfs_qm_dqput( if (--dqp->q_nrefs == 0) { trace_xfs_dqput_free(dqp); - /* - * insert at end of the freelist. - */ - XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp); + list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist); + xfs_Gqm->qm_dqfrlist_cnt++; /* * If we just added a udquot to the freelist, then @@ -1118,10 +1104,6 @@ xfs_qm_dqput( xfs_dqlock(gdqp); dqp->q_gdquot = NULL; } - - /* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist), - "@@@@@++ Free list (after append) @@@@@+"); - */ } xfs_dqunlock(dqp); @@ -1133,7 +1115,7 @@ xfs_qm_dqput( break; dqp = gdqp; } - xfs_qm_freelist_unlock(xfs_Gqm); + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); } /* @@ -1386,10 +1368,10 @@ int xfs_qm_dqpurge( xfs_dquot_t *dqp) { - xfs_dqhash_t *thishash; + xfs_dqhash_t *qh = dqp->q_hash; xfs_mount_t *mp = dqp->q_mount; - ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); + ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock)); ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock)); xfs_dqlock(dqp); @@ -1407,7 +1389,7 @@ xfs_qm_dqpurge( return (1); } - ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); + ASSERT(!list_empty(&dqp->q_freelist)); /* * If we're turning off quotas, we have to make sure that, for @@ -1452,14 +1434,16 @@ xfs_qm_dqpurge( ASSERT(XFS_FORCED_SHUTDOWN(mp) || !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); - thishash = dqp->q_hash; - XQM_HASHLIST_REMOVE(thishash, dqp); - XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp); + list_del_init(&dqp->q_hashlist); + qh->qh_version++; + list_del_init(&dqp->q_mplist); + mp->m_quotainfo->qi_dqreclaims++; + mp->m_quotainfo->qi_dquots--; /* * XXX Move this to the front of the freelist, if we can get the * freelist lock. */ - ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); + ASSERT(!list_empty(&dqp->q_freelist)); dqp->q_mount = NULL; dqp->q_hash = NULL; @@ -1467,7 +1451,7 @@ xfs_qm_dqpurge( memset(&dqp->q_core, 0, sizeof(dqp->q_core)); xfs_dqfunlock(dqp); xfs_dqunlock(dqp); - mutex_unlock(&thishash->qh_lock); + mutex_unlock(&qh->qh_lock); return (0); } @@ -1517,6 +1501,7 @@ void xfs_qm_dqflock_pushbuf_wait( xfs_dquot_t *dqp) { + xfs_mount_t *mp = dqp->q_mount; xfs_buf_t *bp; /* @@ -1525,14 +1510,14 @@ xfs_qm_dqflock_pushbuf_wait( * out immediately. We'll be able to acquire * the flush lock when the I/O completes. */ - bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno, - XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK); + bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); if (!bp) goto out_lock; if (XFS_BUF_ISDELAYWRITE(bp)) { if (XFS_BUF_ISPINNED(bp)) - xfs_log_force(dqp->q_mount, 0); + xfs_log_force(mp, 0); xfs_buf_delwri_promote(bp); wake_up_process(bp->b_target->bt_task); } diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h index a0f7da5..5da3a23 100644 --- a/fs/xfs/quota/xfs_dquot.h +++ b/fs/xfs/quota/xfs_dquot.h @@ -33,40 +33,23 @@ * The hash chain headers (hash buckets) */ typedef struct xfs_dqhash { - struct xfs_dquot *qh_next; + struct list_head qh_list; struct mutex qh_lock; uint qh_version; /* ever increasing version */ uint qh_nelems; /* number of dquots on the list */ } xfs_dqhash_t; -typedef struct xfs_dqlink { - struct xfs_dquot *ql_next; /* forward link */ - struct xfs_dquot **ql_prevp; /* pointer to prev ql_next */ -} xfs_dqlink_t; - struct xfs_mount; struct xfs_trans; /* - * This is the marker which is designed to occupy the first few - * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers - * must come first. - * This serves as the marker ("sentinel") when we have to restart list - * iterations because of locking considerations. - */ -typedef struct xfs_dqmarker { - struct xfs_dquot*dqm_flnext; /* link to freelist: must be first */ - struct xfs_dquot*dqm_flprev; - xfs_dqlink_t dqm_mplist; /* link to mount's list of dquots */ - xfs_dqlink_t dqm_hashlist; /* link to the hash chain */ - uint dqm_flags; /* various flags (XFS_DQ_*) */ -} xfs_dqmarker_t; - -/* * The incore dquot structure */ typedef struct xfs_dquot { - xfs_dqmarker_t q_lists; /* list ptrs, q_flags (marker) */ + uint dq_flags; /* various flags (XFS_DQ_*) */ + struct list_head q_freelist; /* global free list of dquots */ + struct list_head q_mplist; /* mount's list of dquots */ + struct list_head q_hashlist; /* gloabl hash list of dquots */ xfs_dqhash_t *q_hash; /* the hashchain header */ struct xfs_mount*q_mount; /* filesystem this relates to */ struct xfs_trans*q_transp; /* trans this belongs to currently */ @@ -87,13 +70,6 @@ typedef struct xfs_dquot { wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ } xfs_dquot_t; - -#define dq_flnext q_lists.dqm_flnext -#define dq_flprev q_lists.dqm_flprev -#define dq_mplist q_lists.dqm_mplist -#define dq_hashlist q_lists.dqm_hashlist -#define dq_flags q_lists.dqm_flags - /* * Lock hierarchy for q_qlock: * XFS_QLOCK_NORMAL is the implicit default, @@ -127,7 +103,6 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp) } #define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) -#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp)) #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) #define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index 4e4ee9a..8d89a24 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -107,8 +107,7 @@ xfs_qm_dquot_logitem_pin( /* ARGSUSED */ STATIC void xfs_qm_dquot_logitem_unpin( - xfs_dq_logitem_t *logitem, - int stale) + xfs_dq_logitem_t *logitem) { xfs_dquot_t *dqp = logitem->qli_dquot; @@ -123,7 +122,7 @@ xfs_qm_dquot_logitem_unpin_remove( xfs_dq_logitem_t *logitem, xfs_trans_t *tp) { - xfs_qm_dquot_logitem_unpin(logitem, 0); + xfs_qm_dquot_logitem_unpin(logitem); } /* @@ -228,7 +227,7 @@ xfs_qm_dquot_logitem_pushbuf( } mp = dqp->q_mount; bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, - XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK); + mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); xfs_dqunlock(dqp); if (!bp) return; @@ -329,8 +328,7 @@ static struct xfs_item_ops xfs_dquot_item_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_dquot_logitem_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int)) - xfs_qm_dquot_logitem_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) xfs_qm_dquot_logitem_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*)) @@ -357,9 +355,8 @@ xfs_qm_dquot_logitem_init( xfs_dq_logitem_t *lp; lp = &dqp->q_logitem; - lp->qli_item.li_type = XFS_LI_DQUOT; - lp->qli_item.li_ops = &xfs_dquot_item_ops; - lp->qli_item.li_mountp = dqp->q_mount; + xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, + &xfs_dquot_item_ops); lp->qli_dquot = dqp; lp->qli_format.qlf_type = XFS_LI_DQUOT; lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id); @@ -426,7 +423,7 @@ xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf) */ /*ARGSUSED*/ STATIC void -xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf, int stale) +xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf) { return; } @@ -537,8 +534,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_qoff_logitem_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t* ,int)) - xfs_qm_qoff_logitem_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) xfs_qm_qoff_logitem_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, @@ -559,8 +555,7 @@ static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_qoff_logitem_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int)) - xfs_qm_qoff_logitem_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) xfs_qm_qoff_logitem_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, @@ -586,11 +581,8 @@ xfs_qm_qoff_logitem_init( qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP); - qf->qql_item.li_type = XFS_LI_QUOTAOFF; - if (start) - qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops; - else - qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops; + xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? + &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); qf->qql_item.li_mountp = mp; qf->qql_format.qf_type = XFS_LI_QUOTAOFF; qf->qql_format.qf_flags = flags; diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 417e61e..38e7641 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -67,9 +67,6 @@ static cred_t xfs_zerocr; STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); -STATIC void xfs_qm_freelist_init(xfs_frlist_t *); -STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *); - STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); STATIC int xfs_qm_shake(int, gfp_t); @@ -84,21 +81,25 @@ extern struct mutex qcheck_lock; #endif #ifdef QUOTADEBUG -#define XQM_LIST_PRINT(l, NXT, title) \ -{ \ - xfs_dquot_t *dqp; int i = 0; \ - cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ - for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \ - cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " \ - "bcnt = %d, icnt = %d, refs = %d", \ - ++i, (int) be32_to_cpu(dqp->q_core.d_id), \ - DQFLAGTO_TYPESTR(dqp), \ - (int) be64_to_cpu(dqp->q_core.d_bcount), \ - (int) be64_to_cpu(dqp->q_core.d_icount), \ - (int) dqp->q_nrefs); } \ +static void +xfs_qm_dquot_list_print( + struct xfs_mount *mp) +{ + xfs_dquot_t *dqp; + int i = 0; + + list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) { + cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " + "bcnt = %lld, icnt = %lld, refs = %d", + i++, be32_to_cpu(dqp->q_core.d_id), + DQFLAGTO_TYPESTR(dqp), + (long long)be64_to_cpu(dqp->q_core.d_bcount), + (long long)be64_to_cpu(dqp->q_core.d_icount), + dqp->q_nrefs); + } } #else -#define XQM_LIST_PRINT(l, NXT, title) do { } while (0) +static void xfs_qm_dquot_list_print(struct xfs_mount *mp) { } #endif /* @@ -144,7 +145,9 @@ xfs_Gqm_init(void) /* * Freelist of all dquots of all file systems */ - xfs_qm_freelist_init(&(xqm->qm_dqfreelist)); + INIT_LIST_HEAD(&xqm->qm_dqfrlist); + xqm->qm_dqfrlist_cnt = 0; + mutex_init(&xqm->qm_dqfrlist_lock); /* * dquot zone. we register our own low-memory callback. @@ -189,6 +192,7 @@ STATIC void xfs_qm_destroy( struct xfs_qm *xqm) { + struct xfs_dquot *dqp, *n; int hsize, i; ASSERT(xqm != NULL); @@ -204,7 +208,21 @@ xfs_qm_destroy( xqm->qm_usr_dqhtable = NULL; xqm->qm_grp_dqhtable = NULL; xqm->qm_dqhashmask = 0; - xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist)); + + /* frlist cleanup */ + mutex_lock(&xqm->qm_dqfrlist_lock); + list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) { + xfs_dqlock(dqp); +#ifdef QUOTADEBUG + cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp); +#endif + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + xfs_dqunlock(dqp); + xfs_qm_dqdestroy(dqp); + } + mutex_unlock(&xqm->qm_dqfrlist_lock); + mutex_destroy(&xqm->qm_dqfrlist_lock); #ifdef DEBUG mutex_destroy(&qcheck_lock); #endif @@ -256,7 +274,7 @@ STATIC void xfs_qm_rele_quotafs_ref( struct xfs_mount *mp) { - xfs_dquot_t *dqp, *nextdqp; + xfs_dquot_t *dqp, *n; ASSERT(xfs_Gqm); ASSERT(xfs_Gqm->qm_nrefs > 0); @@ -264,26 +282,24 @@ xfs_qm_rele_quotafs_ref( /* * Go thru the freelist and destroy all inactive dquots. */ - xfs_qm_freelist_lock(xfs_Gqm); + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; - dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) { + list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) { xfs_dqlock(dqp); - nextdqp = dqp->dq_flnext; if (dqp->dq_flags & XFS_DQ_INACTIVE) { ASSERT(dqp->q_mount == NULL); ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(dqp->HL_PREVP == NULL); - ASSERT(dqp->MPL_PREVP == NULL); - XQM_FREELIST_REMOVE(dqp); + ASSERT(list_empty(&dqp->q_hashlist)); + ASSERT(list_empty(&dqp->q_mplist)); + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; xfs_dqunlock(dqp); xfs_qm_dqdestroy(dqp); } else { xfs_dqunlock(dqp); } - dqp = nextdqp; } - xfs_qm_freelist_unlock(xfs_Gqm); + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); /* * Destroy the entire XQM. If somebody mounts with quotaon, this'll @@ -305,7 +321,7 @@ xfs_qm_unmount( struct xfs_mount *mp) { if (mp->m_quotainfo) { - xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); xfs_qm_destroy_quotainfo(mp); } } @@ -449,20 +465,21 @@ xfs_qm_unmount_quotas( */ STATIC int xfs_qm_dqflush_all( - xfs_mount_t *mp, - int sync_mode) + struct xfs_mount *mp, + int sync_mode) { - int recl; - xfs_dquot_t *dqp; - int niters; - int error; + struct xfs_quotainfo *q = mp->m_quotainfo; + int recl; + struct xfs_dquot *dqp; + int niters; + int error; - if (mp->m_quotainfo == NULL) + if (!q) return 0; niters = 0; again: - xfs_qm_mplist_lock(mp); - FOREACH_DQUOT_IN_MP(dqp, mp) { + mutex_lock(&q->qi_dqlist_lock); + list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { xfs_dqlock(dqp); if (! XFS_DQ_IS_DIRTY(dqp)) { xfs_dqunlock(dqp); @@ -470,7 +487,7 @@ again: } /* XXX a sentinel would be better */ - recl = XFS_QI_MPLRECLAIMS(mp); + recl = q->qi_dqreclaims; if (!xfs_dqflock_nowait(dqp)) { /* * If we can't grab the flush lock then check @@ -485,21 +502,21 @@ again: * Let go of the mplist lock. We don't want to hold it * across a disk write. */ - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); error = xfs_qm_dqflush(dqp, sync_mode); xfs_dqunlock(dqp); if (error) return error; - xfs_qm_mplist_lock(mp); - if (recl != XFS_QI_MPLRECLAIMS(mp)) { - xfs_qm_mplist_unlock(mp); + mutex_lock(&q->qi_dqlist_lock); + if (recl != q->qi_dqreclaims) { + mutex_unlock(&q->qi_dqlist_lock); /* XXX restart limit */ goto again; } } - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); /* return ! busy */ return 0; } @@ -509,15 +526,15 @@ again: */ STATIC void xfs_qm_detach_gdquots( - xfs_mount_t *mp) + struct xfs_mount *mp) { - xfs_dquot_t *dqp, *gdqp; - int nrecl; + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_dquot *dqp, *gdqp; + int nrecl; again: - ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); - dqp = XFS_QI_MPLNEXT(mp); - while (dqp) { + ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); + list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { xfs_dqlock(dqp); if ((gdqp = dqp->q_gdquot)) { xfs_dqlock(gdqp); @@ -530,15 +547,14 @@ xfs_qm_detach_gdquots( * Can't hold the mplist lock across a dqput. * XXXmust convert to marker based iterations here. */ - nrecl = XFS_QI_MPLRECLAIMS(mp); - xfs_qm_mplist_unlock(mp); + nrecl = q->qi_dqreclaims; + mutex_unlock(&q->qi_dqlist_lock); xfs_qm_dqput(gdqp); - xfs_qm_mplist_lock(mp); - if (nrecl != XFS_QI_MPLRECLAIMS(mp)) + mutex_lock(&q->qi_dqlist_lock); + if (nrecl != q->qi_dqreclaims) goto again; } - dqp = dqp->MPL_NEXT; } } @@ -550,23 +566,23 @@ xfs_qm_detach_gdquots( */ STATIC int xfs_qm_dqpurge_int( - xfs_mount_t *mp, - uint flags) /* QUOTAOFF/UMOUNTING/UQUOTA/PQUOTA/GQUOTA */ + struct xfs_mount *mp, + uint flags) { - xfs_dquot_t *dqp; - uint dqtype; - int nrecl; - xfs_dquot_t *nextdqp; - int nmisses; + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_dquot *dqp, *n; + uint dqtype; + int nrecl; + int nmisses; - if (mp->m_quotainfo == NULL) + if (!q) return 0; dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0; dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0; dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0; - xfs_qm_mplist_lock(mp); + mutex_lock(&q->qi_dqlist_lock); /* * In the first pass through all incore dquots of this filesystem, @@ -578,28 +594,25 @@ xfs_qm_dqpurge_int( again: nmisses = 0; - ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); + ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); /* * Try to get rid of all of the unwanted dquots. The idea is to * get them off mplist and hashlist, but leave them on freelist. */ - dqp = XFS_QI_MPLNEXT(mp); - while (dqp) { + list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) { /* * It's OK to look at the type without taking dqlock here. * We're holding the mplist lock here, and that's needed for * a dqreclaim. */ - if ((dqp->dq_flags & dqtype) == 0) { - dqp = dqp->MPL_NEXT; + if ((dqp->dq_flags & dqtype) == 0) continue; - } if (!mutex_trylock(&dqp->q_hash->qh_lock)) { - nrecl = XFS_QI_MPLRECLAIMS(mp); - xfs_qm_mplist_unlock(mp); + nrecl = q->qi_dqreclaims; + mutex_unlock(&q->qi_dqlist_lock); mutex_lock(&dqp->q_hash->qh_lock); - xfs_qm_mplist_lock(mp); + mutex_lock(&q->qi_dqlist_lock); /* * XXXTheoretically, we can get into a very long @@ -607,7 +620,7 @@ xfs_qm_dqpurge_int( * No one can be adding dquots to the mplist at * this point, but somebody might be taking things off. */ - if (nrecl != XFS_QI_MPLRECLAIMS(mp)) { + if (nrecl != q->qi_dqreclaims) { mutex_unlock(&dqp->q_hash->qh_lock); goto again; } @@ -617,11 +630,9 @@ xfs_qm_dqpurge_int( * Take the dquot off the mplist and hashlist. It may remain on * freelist in INACTIVE state. */ - nextdqp = dqp->MPL_NEXT; nmisses += xfs_qm_dqpurge(dqp); - dqp = nextdqp; } - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); return nmisses; } @@ -921,12 +932,13 @@ xfs_qm_dqdetach( int xfs_qm_sync( - xfs_mount_t *mp, - int flags) + struct xfs_mount *mp, + int flags) { - int recl, restarts; - xfs_dquot_t *dqp; - int error; + struct xfs_quotainfo *q = mp->m_quotainfo; + int recl, restarts; + struct xfs_dquot *dqp; + int error; if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return 0; @@ -934,18 +946,19 @@ xfs_qm_sync( restarts = 0; again: - xfs_qm_mplist_lock(mp); + mutex_lock(&q->qi_dqlist_lock); /* * dqpurge_all() also takes the mplist lock and iterate thru all dquots * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared * when we have the mplist lock, we know that dquots will be consistent * as long as we have it locked. */ - if (! XFS_IS_QUOTA_ON(mp)) { - xfs_qm_mplist_unlock(mp); + if (!XFS_IS_QUOTA_ON(mp)) { + mutex_unlock(&q->qi_dqlist_lock); return 0; } - FOREACH_DQUOT_IN_MP(dqp, mp) { + ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); + list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { /* * If this is vfs_sync calling, then skip the dquots that * don't 'seem' to be dirty. ie. don't acquire dqlock. @@ -969,7 +982,7 @@ xfs_qm_sync( } /* XXX a sentinel would be better */ - recl = XFS_QI_MPLRECLAIMS(mp); + recl = q->qi_dqreclaims; if (!xfs_dqflock_nowait(dqp)) { if (flags & SYNC_TRYLOCK) { xfs_dqunlock(dqp); @@ -989,7 +1002,7 @@ xfs_qm_sync( * Let go of the mplist lock. We don't want to hold it * across a disk write */ - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); error = xfs_qm_dqflush(dqp, flags); xfs_dqunlock(dqp); if (error && XFS_FORCED_SHUTDOWN(mp)) @@ -997,17 +1010,17 @@ xfs_qm_sync( else if (error) return error; - xfs_qm_mplist_lock(mp); - if (recl != XFS_QI_MPLRECLAIMS(mp)) { + mutex_lock(&q->qi_dqlist_lock); + if (recl != q->qi_dqreclaims) { if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS) break; - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); goto again; } } - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); return 0; } @@ -1052,8 +1065,9 @@ xfs_qm_init_quotainfo( return error; } - xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); - lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class); + INIT_LIST_HEAD(&qinf->qi_dqlist); + mutex_init(&qinf->qi_dqlist_lock); + lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class); qinf->qi_dqreclaims = 0; @@ -1150,7 +1164,8 @@ xfs_qm_destroy_quotainfo( */ xfs_qm_rele_quotafs_ref(mp); - xfs_qm_list_destroy(&qi->qi_dqlist); + ASSERT(list_empty(&qi->qi_dqlist)); + mutex_destroy(&qi->qi_dqlist_lock); if (qi->qi_uquotaip) { IRELE(qi->qi_uquotaip); @@ -1177,7 +1192,7 @@ xfs_qm_list_init( int n) { mutex_init(&list->qh_lock); - list->qh_next = NULL; + INIT_LIST_HEAD(&list->qh_list); list->qh_version = 0; list->qh_nelems = 0; } @@ -1316,9 +1331,6 @@ xfs_qm_qino_alloc( */ spin_lock(&mp->m_sb_lock); if (flags & XFS_QMOPT_SBVERSION) { -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) - unsigned oldv = mp->m_sb.sb_versionnum; -#endif ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) == @@ -1331,11 +1343,6 @@ xfs_qm_qino_alloc( /* qflags will get updated _after_ quotacheck */ mp->m_sb.sb_qflags = 0; -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) - cmn_err(CE_NOTE, - "Old superblock version %x, converting to %x.", - oldv, mp->m_sb.sb_versionnum); -#endif } if (flags & XFS_QMOPT_UQUOTA) mp->m_sb.sb_uquotino = (*ip)->i_ino; @@ -1371,10 +1378,10 @@ xfs_qm_reset_dqcounts( #ifdef DEBUG j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); do_div(j, sizeof(xfs_dqblk_t)); - ASSERT(XFS_QM_DQPERBLK(mp) == j); + ASSERT(mp->m_quotainfo->qi_dqperchunk == j); #endif ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp); - for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) { + for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { /* * Do a sanity check, and if needed, repair the dqblk. Don't * output any warnings because it's perfectly possible to @@ -1429,7 +1436,7 @@ xfs_qm_dqiter_bufs( while (blkcnt--) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), - (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp); + mp->m_quotainfo->qi_dqchunklen, 0, &bp); if (error) break; @@ -1439,7 +1446,7 @@ xfs_qm_dqiter_bufs( * goto the next block. */ bno++; - firstid += XFS_QM_DQPERBLK(mp); + firstid += mp->m_quotainfo->qi_dqperchunk; } return error; } @@ -1505,7 +1512,7 @@ xfs_qm_dqiterate( continue; firstid = (xfs_dqid_t) map[i].br_startoff * - XFS_QM_DQPERBLK(mp); + mp->m_quotainfo->qi_dqperchunk; /* * Do a read-ahead on the next extent. */ @@ -1516,7 +1523,7 @@ xfs_qm_dqiterate( while (rablkcnt--) { xfs_baread(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, rablkno), - (int)XFS_QI_DQCHUNKLEN(mp)); + mp->m_quotainfo->qi_dqchunklen); rablkno++; } } @@ -1576,8 +1583,10 @@ xfs_qm_quotacheck_dqadjust( /* * Set default limits, adjust timers (since we changed usages) + * + * There are no timers for the default values set in the root dquot. */ - if (! XFS_IS_SUSER_DQUOT(dqp)) { + if (dqp->q_core.d_id) { xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core); xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core); } @@ -1747,14 +1756,14 @@ xfs_qm_quotacheck( lastino = 0; flags = 0; - ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp)); + ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); /* * There should be no cached dquots. The (simplistic) quotacheck * algorithm doesn't like that. */ - ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0); + ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist)); cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); @@ -1763,15 +1772,19 @@ xfs_qm_quotacheck( * their counters to zero. We need a clean slate. * We don't log our changes till later. */ - if ((uip = XFS_QI_UQIP(mp))) { - if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA))) + uip = mp->m_quotainfo->qi_uquotaip; + if (uip) { + error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA); + if (error) goto error_return; flags |= XFS_UQUOTA_CHKD; } - if ((gip = XFS_QI_GQIP(mp))) { - if ((error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? - XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA))) + gip = mp->m_quotainfo->qi_gquotaip; + if (gip) { + error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? + XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); + if (error) goto error_return; flags |= XFS_OQUOTA_CHKD; } @@ -1804,7 +1817,7 @@ xfs_qm_quotacheck( * at this point (because we intentionally didn't in dqget_noattach). */ if (error) { - xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF); + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); goto error_return; } @@ -1825,7 +1838,7 @@ xfs_qm_quotacheck( mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD); mp->m_qflags |= flags; - XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++"); + xfs_qm_dquot_list_print(mp); error_return: if (error) { @@ -1920,59 +1933,53 @@ xfs_qm_init_quotainos( } } - XFS_QI_UQIP(mp) = uip; - XFS_QI_GQIP(mp) = gip; + mp->m_quotainfo->qi_uquotaip = uip; + mp->m_quotainfo->qi_gquotaip = gip; return 0; } + /* - * Traverse the freelist of dquots and attempt to reclaim a maximum of - * 'howmany' dquots. This operation races with dqlookup(), and attempts to - * favor the lookup function ... - * XXXsup merge this with qm_reclaim_one(). + * Just pop the least recently used dquot off the freelist and + * recycle it. The returned dquot is locked. */ -STATIC int -xfs_qm_shake_freelist( - int howmany) +STATIC xfs_dquot_t * +xfs_qm_dqreclaim_one(void) { - int nreclaimed; - xfs_dqhash_t *hash; - xfs_dquot_t *dqp, *nextdqp; + xfs_dquot_t *dqpout; + xfs_dquot_t *dqp; int restarts; - int nflushes; - - if (howmany <= 0) - return 0; - nreclaimed = 0; restarts = 0; - nflushes = 0; + dqpout = NULL; -#ifdef QUOTADEBUG - cmn_err(CE_DEBUG, "Shake free 0x%x", howmany); -#endif - /* lock order is : hashchainlock, freelistlock, mplistlock */ - tryagain: - xfs_qm_freelist_lock(xfs_Gqm); + /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ +startagain: + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; - ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) && - nreclaimed < howmany); ) { + list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { + struct xfs_mount *mp = dqp->q_mount; xfs_dqlock(dqp); /* * We are racing with dqlookup here. Naturally we don't - * want to reclaim a dquot that lookup wants. + * want to reclaim a dquot that lookup wants. We release the + * freelist lock and start over, so that lookup will grab + * both the dquot and the freelistlock. */ if (dqp->dq_flags & XFS_DQ_WANT) { + ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); + + trace_xfs_dqreclaim_want(dqp); + xfs_dqunlock(dqp); - xfs_qm_freelist_unlock(xfs_Gqm); + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return nreclaimed; + return NULL; XQM_STATS_INC(xqmstats.xs_qm_dqwants); - goto tryagain; + goto startagain; } /* @@ -1981,23 +1988,27 @@ xfs_qm_shake_freelist( * life easier. */ if (dqp->dq_flags & XFS_DQ_INACTIVE) { - ASSERT(dqp->q_mount == NULL); + ASSERT(mp == NULL); ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(dqp->HL_PREVP == NULL); - ASSERT(dqp->MPL_PREVP == NULL); + ASSERT(list_empty(&dqp->q_hashlist)); + ASSERT(list_empty(&dqp->q_mplist)); + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + xfs_dqunlock(dqp); + dqpout = dqp; XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); - nextdqp = dqp->dq_flnext; - goto off_freelist; + break; } - ASSERT(dqp->MPL_PREVP); + ASSERT(dqp->q_hash); + ASSERT(!list_empty(&dqp->q_mplist)); + /* * Try to grab the flush lock. If this dquot is in the process of * getting flushed to disk, we don't want to reclaim it. */ if (!xfs_dqflock_nowait(dqp)) { xfs_dqunlock(dqp); - dqp = dqp->dq_flnext; continue; } @@ -2010,21 +2021,21 @@ xfs_qm_shake_freelist( if (XFS_DQ_IS_DIRTY(dqp)) { int error; - trace_xfs_dqshake_dirty(dqp); + trace_xfs_dqreclaim_dirty(dqp); /* * We flush it delayed write, so don't bother - * releasing the mplock. + * releasing the freelist lock. */ error = xfs_qm_dqflush(dqp, 0); if (error) { - xfs_fs_cmn_err(CE_WARN, dqp->q_mount, - "xfs_qm_dqflush_all: dquot %p flush failed", dqp); + xfs_fs_cmn_err(CE_WARN, mp, + "xfs_qm_dqreclaim: dquot %p flush failed", dqp); } xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ - dqp = dqp->dq_flnext; continue; } + /* * We're trying to get the hashlock out of order. This races * with dqlookup; so, we giveup and goto the next dquot if @@ -2033,56 +2044,74 @@ xfs_qm_shake_freelist( * waiting for the freelist lock. */ if (!mutex_trylock(&dqp->q_hash->qh_lock)) { - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - dqp = dqp->dq_flnext; - continue; + restarts++; + goto dqfunlock; } + /* * This races with dquot allocation code as well as dqflush_all * and reclaim code. So, if we failed to grab the mplist lock, * giveup everything and start over. */ - hash = dqp->q_hash; - ASSERT(hash); - if (! xfs_qm_mplist_nowait(dqp->q_mount)) { - /* XXX put a sentinel so that we can come back here */ + if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { + restarts++; + mutex_unlock(&dqp->q_hash->qh_lock); xfs_dqfunlock(dqp); xfs_dqunlock(dqp); - mutex_unlock(&hash->qh_lock); - xfs_qm_freelist_unlock(xfs_Gqm); - if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return nreclaimed; - goto tryagain; + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS) + return NULL; + goto startagain; } - trace_xfs_dqshake_unlink(dqp); - -#ifdef QUOTADEBUG - cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n", - dqp, be32_to_cpu(dqp->q_core.d_id)); -#endif ASSERT(dqp->q_nrefs == 0); - nextdqp = dqp->dq_flnext; - XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); - XQM_HASHLIST_REMOVE(hash, dqp); + list_del_init(&dqp->q_mplist); + mp->m_quotainfo->qi_dquots--; + mp->m_quotainfo->qi_dqreclaims++; + list_del_init(&dqp->q_hashlist); + dqp->q_hash->qh_version++; + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + dqpout = dqp; + mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); + mutex_unlock(&dqp->q_hash->qh_lock); +dqfunlock: xfs_dqfunlock(dqp); - xfs_qm_mplist_unlock(dqp->q_mount); - mutex_unlock(&hash->qh_lock); - - off_freelist: - XQM_FREELIST_REMOVE(dqp); xfs_dqunlock(dqp); - nreclaimed++; - XQM_STATS_INC(xqmstats.xs_qm_dqshake_reclaims); + if (dqpout) + break; + if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) + return NULL; + } + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + return dqpout; +} + +/* + * Traverse the freelist of dquots and attempt to reclaim a maximum of + * 'howmany' dquots. This operation races with dqlookup(), and attempts to + * favor the lookup function ... + */ +STATIC int +xfs_qm_shake_freelist( + int howmany) +{ + int nreclaimed = 0; + xfs_dquot_t *dqp; + + if (howmany <= 0) + return 0; + + while (nreclaimed < howmany) { + dqp = xfs_qm_dqreclaim_one(); + if (!dqp) + return nreclaimed; xfs_qm_dqdestroy(dqp); - dqp = nextdqp; + nreclaimed++; } - xfs_qm_freelist_unlock(xfs_Gqm); return nreclaimed; } - /* * The kmem_shake interface is invoked when memory is running low. */ @@ -2097,7 +2126,7 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask) if (!xfs_Gqm) return 0; - nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */ + nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */ /* incore dquots in all f/s's */ ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree; @@ -2113,131 +2142,6 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask) } -/* - * Just pop the least recently used dquot off the freelist and - * recycle it. The returned dquot is locked. - */ -STATIC xfs_dquot_t * -xfs_qm_dqreclaim_one(void) -{ - xfs_dquot_t *dqpout; - xfs_dquot_t *dqp; - int restarts; - int nflushes; - - restarts = 0; - dqpout = NULL; - nflushes = 0; - - /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ - startagain: - xfs_qm_freelist_lock(xfs_Gqm); - - FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) { - xfs_dqlock(dqp); - - /* - * We are racing with dqlookup here. Naturally we don't - * want to reclaim a dquot that lookup wants. We release the - * freelist lock and start over, so that lookup will grab - * both the dquot and the freelistlock. - */ - if (dqp->dq_flags & XFS_DQ_WANT) { - ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); - - trace_xfs_dqreclaim_want(dqp); - - xfs_dqunlock(dqp); - xfs_qm_freelist_unlock(xfs_Gqm); - if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return NULL; - XQM_STATS_INC(xqmstats.xs_qm_dqwants); - goto startagain; - } - - /* - * If the dquot is inactive, we are assured that it is - * not on the mplist or the hashlist, and that makes our - * life easier. - */ - if (dqp->dq_flags & XFS_DQ_INACTIVE) { - ASSERT(dqp->q_mount == NULL); - ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(dqp->HL_PREVP == NULL); - ASSERT(dqp->MPL_PREVP == NULL); - XQM_FREELIST_REMOVE(dqp); - xfs_dqunlock(dqp); - dqpout = dqp; - XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); - break; - } - - ASSERT(dqp->q_hash); - ASSERT(dqp->MPL_PREVP); - - /* - * Try to grab the flush lock. If this dquot is in the process of - * getting flushed to disk, we don't want to reclaim it. - */ - if (!xfs_dqflock_nowait(dqp)) { - xfs_dqunlock(dqp); - continue; - } - - /* - * We have the flush lock so we know that this is not in the - * process of being flushed. So, if this is dirty, flush it - * DELWRI so that we don't get a freelist infested with - * dirty dquots. - */ - if (XFS_DQ_IS_DIRTY(dqp)) { - int error; - - trace_xfs_dqreclaim_dirty(dqp); - - /* - * We flush it delayed write, so don't bother - * releasing the freelist lock. - */ - error = xfs_qm_dqflush(dqp, 0); - if (error) { - xfs_fs_cmn_err(CE_WARN, dqp->q_mount, - "xfs_qm_dqreclaim: dquot %p flush failed", dqp); - } - xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ - continue; - } - - if (! xfs_qm_mplist_nowait(dqp->q_mount)) { - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - continue; - } - - if (!mutex_trylock(&dqp->q_hash->qh_lock)) - goto mplistunlock; - - trace_xfs_dqreclaim_unlink(dqp); - - ASSERT(dqp->q_nrefs == 0); - XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); - XQM_HASHLIST_REMOVE(dqp->q_hash, dqp); - XQM_FREELIST_REMOVE(dqp); - dqpout = dqp; - mutex_unlock(&dqp->q_hash->qh_lock); - mplistunlock: - xfs_qm_mplist_unlock(dqp->q_mount); - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - if (dqpout) - break; - } - - xfs_qm_freelist_unlock(xfs_Gqm); - return dqpout; -} - - /*------------------------------------------------------------------*/ /* @@ -2662,66 +2566,3 @@ xfs_qm_vop_create_dqattach( } } -/* ------------- list stuff -----------------*/ -STATIC void -xfs_qm_freelist_init(xfs_frlist_t *ql) -{ - ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql; - mutex_init(&ql->qh_lock); - ql->qh_version = 0; - ql->qh_nelems = 0; -} - -STATIC void -xfs_qm_freelist_destroy(xfs_frlist_t *ql) -{ - xfs_dquot_t *dqp, *nextdqp; - - mutex_lock(&ql->qh_lock); - for (dqp = ql->qh_next; - dqp != (xfs_dquot_t *)ql; ) { - xfs_dqlock(dqp); - nextdqp = dqp->dq_flnext; -#ifdef QUOTADEBUG - cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp); -#endif - XQM_FREELIST_REMOVE(dqp); - xfs_dqunlock(dqp); - xfs_qm_dqdestroy(dqp); - dqp = nextdqp; - } - mutex_unlock(&ql->qh_lock); - mutex_destroy(&ql->qh_lock); - - ASSERT(ql->qh_nelems == 0); -} - -STATIC void -xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq) -{ - dq->dq_flnext = ql->qh_next; - dq->dq_flprev = (xfs_dquot_t *)ql; - ql->qh_next = dq; - dq->dq_flnext->dq_flprev = dq; - xfs_Gqm->qm_dqfreelist.qh_nelems++; - xfs_Gqm->qm_dqfreelist.qh_version++; -} - -void -xfs_qm_freelist_unlink(xfs_dquot_t *dq) -{ - xfs_dquot_t *next = dq->dq_flnext; - xfs_dquot_t *prev = dq->dq_flprev; - - next->dq_flprev = prev; - prev->dq_flnext = next; - dq->dq_flnext = dq->dq_flprev = dq; - xfs_Gqm->qm_dqfreelist.qh_nelems--; - xfs_Gqm->qm_dqfreelist.qh_version++; -} - -void -xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq) -{ - xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq); -} diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index 495564b..c9446f1 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h @@ -72,17 +72,6 @@ extern kmem_zone_t *qm_dqtrxzone; #define XFS_QM_MAX_DQCLUSTER_LOGSZ 3 typedef xfs_dqhash_t xfs_dqlist_t; -/* - * The freelist head. The first two fields match the first two in the - * xfs_dquot_t structure (in xfs_dqmarker_t) - */ -typedef struct xfs_frlist { - struct xfs_dquot *qh_next; - struct xfs_dquot *qh_prev; - struct mutex qh_lock; - uint qh_version; - uint qh_nelems; -} xfs_frlist_t; /* * Quota Manager (global) structure. Lives only in core. @@ -91,7 +80,9 @@ typedef struct xfs_qm { xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */ xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */ uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */ - xfs_frlist_t qm_dqfreelist; /* freelist of dquots */ + struct list_head qm_dqfrlist; /* freelist of dquots */ + struct mutex qm_dqfrlist_lock; + int qm_dqfrlist_cnt; atomic_t qm_totaldquots; /* total incore dquots */ uint qm_nrefs; /* file systems with quota on */ int qm_dqfree_ratio;/* ratio of free to inuse dquots */ @@ -106,7 +97,9 @@ typedef struct xfs_qm { typedef struct xfs_quotainfo { xfs_inode_t *qi_uquotaip; /* user quota inode */ xfs_inode_t *qi_gquotaip; /* group quota inode */ - xfs_dqlist_t qi_dqlist; /* all dquots in filesys */ + struct list_head qi_dqlist; /* all dquots in filesys */ + struct mutex qi_dqlist_lock; + int qi_dquots; int qi_dqreclaims; /* a change here indicates a removal in the dqlist */ time_t qi_btimelimit; /* limit for blks timer */ @@ -175,10 +168,6 @@ extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); -/* list stuff */ -extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *); -extern void xfs_qm_freelist_unlink(xfs_dquot_t *); - #ifdef DEBUG extern int xfs_qm_internalqcheck(xfs_mount_t *); #else diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c index 83e7ea3..3d1fc79 100644 --- a/fs/xfs/quota/xfs_qm_stats.c +++ b/fs/xfs/quota/xfs_qm_stats.c @@ -55,7 +55,7 @@ static int xqm_proc_show(struct seq_file *m, void *v) ndquot, xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, - xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); + xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0); return 0; } diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 50bee07..26fa431 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -79,6 +79,7 @@ xfs_qm_scall_quotaoff( xfs_mount_t *mp, uint flags) { + struct xfs_quotainfo *q = mp->m_quotainfo; uint dqtype; int error; uint inactivate_flags; @@ -102,11 +103,8 @@ xfs_qm_scall_quotaoff( * critical thing. * If quotaoff, then we must be dealing with the root filesystem. */ - ASSERT(mp->m_quotainfo); - if (mp->m_quotainfo) - mutex_lock(&(XFS_QI_QOFFLOCK(mp))); - - ASSERT(mp->m_quotainfo); + ASSERT(q); + mutex_lock(&q->qi_quotaofflock); /* * If we're just turning off quota enforcement, change mp and go. @@ -117,7 +115,7 @@ xfs_qm_scall_quotaoff( spin_lock(&mp->m_sb_lock); mp->m_sb.sb_qflags = mp->m_qflags; spin_unlock(&mp->m_sb_lock); - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + mutex_unlock(&q->qi_quotaofflock); /* XXX what to do if error ? Revert back to old vals incore ? */ error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); @@ -150,10 +148,8 @@ xfs_qm_scall_quotaoff( * Nothing to do? Don't complain. This happens when we're just * turning off quota enforcement. */ - if ((mp->m_qflags & flags) == 0) { - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); - return (0); - } + if ((mp->m_qflags & flags) == 0) + goto out_unlock; /* * Write the LI_QUOTAOFF log record, and do SB changes atomically, @@ -162,7 +158,7 @@ xfs_qm_scall_quotaoff( */ error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); if (error) - goto out_error; + goto out_unlock; /* * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct @@ -204,7 +200,7 @@ xfs_qm_scall_quotaoff( * So, if we couldn't purge all the dquots from the filesystem, * we can't get rid of the incore data structures. */ - while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF))) + while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype))) delay(10 * nculprits); /* @@ -222,7 +218,7 @@ xfs_qm_scall_quotaoff( if (error) { /* We're screwed now. Shutdown is the only option. */ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - goto out_error; + goto out_unlock; } /* @@ -230,27 +226,26 @@ xfs_qm_scall_quotaoff( */ if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) || ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) { - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + mutex_unlock(&q->qi_quotaofflock); xfs_qm_destroy_quotainfo(mp); return (0); } /* - * Release our quotainode references, and vn_purge them, - * if we don't need them anymore. + * Release our quotainode references if we don't need them anymore. */ - if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) { - IRELE(XFS_QI_UQIP(mp)); - XFS_QI_UQIP(mp) = NULL; + if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) { + IRELE(q->qi_uquotaip); + q->qi_uquotaip = NULL; } - if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) { - IRELE(XFS_QI_GQIP(mp)); - XFS_QI_GQIP(mp) = NULL; + if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) { + IRELE(q->qi_gquotaip); + q->qi_gquotaip = NULL; } -out_error: - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); - return (error); +out_unlock: + mutex_unlock(&q->qi_quotaofflock); + return error; } int @@ -379,9 +374,9 @@ xfs_qm_scall_quotaon( /* * Switch on quota enforcement in core. */ - mutex_lock(&(XFS_QI_QOFFLOCK(mp))); + mutex_lock(&mp->m_quotainfo->qi_quotaofflock); mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + mutex_unlock(&mp->m_quotainfo->qi_quotaofflock); return (0); } @@ -392,11 +387,12 @@ xfs_qm_scall_quotaon( */ int xfs_qm_scall_getqstat( - xfs_mount_t *mp, - fs_quota_stat_t *out) + struct xfs_mount *mp, + struct fs_quota_stat *out) { - xfs_inode_t *uip, *gip; - boolean_t tempuqip, tempgqip; + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_inode *uip, *gip; + boolean_t tempuqip, tempgqip; uip = gip = NULL; tempuqip = tempgqip = B_FALSE; @@ -415,9 +411,9 @@ xfs_qm_scall_getqstat( out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; - if (mp->m_quotainfo) { - uip = mp->m_quotainfo->qi_uquotaip; - gip = mp->m_quotainfo->qi_gquotaip; + if (q) { + uip = q->qi_uquotaip; + gip = q->qi_gquotaip; } if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, @@ -441,15 +437,15 @@ xfs_qm_scall_getqstat( if (tempgqip) IRELE(gip); } - if (mp->m_quotainfo) { - out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp); - out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp); - out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp); - out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp); - out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp); - out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp); + if (q) { + out->qs_incoredqs = q->qi_dquots; + out->qs_btimelimit = q->qi_btimelimit; + out->qs_itimelimit = q->qi_itimelimit; + out->qs_rtbtimelimit = q->qi_rtbtimelimit; + out->qs_bwarnlimit = q->qi_bwarnlimit; + out->qs_iwarnlimit = q->qi_iwarnlimit; } - return (0); + return 0; } /* @@ -462,6 +458,7 @@ xfs_qm_scall_setqlim( uint type, fs_disk_quota_t *newlim) { + struct xfs_quotainfo *q = mp->m_quotainfo; xfs_disk_dquot_t *ddq; xfs_dquot_t *dqp; xfs_trans_t *tp; @@ -485,7 +482,7 @@ xfs_qm_scall_setqlim( * a quotaoff from happening). (XXXThis doesn't currently happen * because we take the vfslock before calling xfs_qm_sysent). */ - mutex_lock(&(XFS_QI_QOFFLOCK(mp))); + mutex_lock(&q->qi_quotaofflock); /* * Get the dquot (locked), and join it to the transaction. @@ -493,9 +490,8 @@ xfs_qm_scall_setqlim( */ if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { xfs_trans_cancel(tp, XFS_TRANS_ABORT); - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); ASSERT(error != ENOENT); - return (error); + goto out_unlock; } xfs_trans_dqjoin(tp, dqp); ddq = &dqp->q_core; @@ -513,8 +509,8 @@ xfs_qm_scall_setqlim( ddq->d_blk_hardlimit = cpu_to_be64(hard); ddq->d_blk_softlimit = cpu_to_be64(soft); if (id == 0) { - mp->m_quotainfo->qi_bhardlimit = hard; - mp->m_quotainfo->qi_bsoftlimit = soft; + q->qi_bhardlimit = hard; + q->qi_bsoftlimit = soft; } } else { qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); @@ -529,8 +525,8 @@ xfs_qm_scall_setqlim( ddq->d_rtb_hardlimit = cpu_to_be64(hard); ddq->d_rtb_softlimit = cpu_to_be64(soft); if (id == 0) { - mp->m_quotainfo->qi_rtbhardlimit = hard; - mp->m_quotainfo->qi_rtbsoftlimit = soft; + q->qi_rtbhardlimit = hard; + q->qi_rtbsoftlimit = soft; } } else { qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); @@ -546,8 +542,8 @@ xfs_qm_scall_setqlim( ddq->d_ino_hardlimit = cpu_to_be64(hard); ddq->d_ino_softlimit = cpu_to_be64(soft); if (id == 0) { - mp->m_quotainfo->qi_ihardlimit = hard; - mp->m_quotainfo->qi_isoftlimit = soft; + q->qi_ihardlimit = hard; + q->qi_isoftlimit = soft; } } else { qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); @@ -572,23 +568,23 @@ xfs_qm_scall_setqlim( * for warnings. */ if (newlim->d_fieldmask & FS_DQ_BTIMER) { - mp->m_quotainfo->qi_btimelimit = newlim->d_btimer; + q->qi_btimelimit = newlim->d_btimer; ddq->d_btimer = cpu_to_be32(newlim->d_btimer); } if (newlim->d_fieldmask & FS_DQ_ITIMER) { - mp->m_quotainfo->qi_itimelimit = newlim->d_itimer; + q->qi_itimelimit = newlim->d_itimer; ddq->d_itimer = cpu_to_be32(newlim->d_itimer); } if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { - mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer; + q->qi_rtbtimelimit = newlim->d_rtbtimer; ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); } if (newlim->d_fieldmask & FS_DQ_BWARNS) - mp->m_quotainfo->qi_bwarnlimit = newlim->d_bwarns; + q->qi_bwarnlimit = newlim->d_bwarns; if (newlim->d_fieldmask & FS_DQ_IWARNS) - mp->m_quotainfo->qi_iwarnlimit = newlim->d_iwarns; + q->qi_iwarnlimit = newlim->d_iwarns; if (newlim->d_fieldmask & FS_DQ_RTBWARNS) - mp->m_quotainfo->qi_rtbwarnlimit = newlim->d_rtbwarns; + q->qi_rtbwarnlimit = newlim->d_rtbwarns; } else { /* * If the user is now over quota, start the timelimit. @@ -605,8 +601,9 @@ xfs_qm_scall_setqlim( error = xfs_trans_commit(tp, 0); xfs_qm_dqprint(dqp); xfs_qm_dqrele(dqp); - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + out_unlock: + mutex_unlock(&q->qi_quotaofflock); return error; } @@ -853,7 +850,8 @@ xfs_dqrele_inode( int error; /* skip quota inodes */ - if (ip == XFS_QI_UQIP(ip->i_mount) || ip == XFS_QI_GQIP(ip->i_mount)) { + if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || + ip == ip->i_mount->m_quotainfo->qi_gquotaip) { ASSERT(ip->i_udquot == NULL); ASSERT(ip->i_gdquot == NULL); read_unlock(&pag->pag_ici_lock); @@ -931,7 +929,8 @@ struct mutex qcheck_lock; } typedef struct dqtest { - xfs_dqmarker_t q_lists; + uint dq_flags; /* various flags (XFS_DQ_*) */ + struct list_head q_hashlist; xfs_dqhash_t *q_hash; /* the hashchain header */ xfs_mount_t *q_mount; /* filesystem this relates to */ xfs_dqid_t d_id; /* user id or group id */ @@ -942,14 +941,9 @@ typedef struct dqtest { STATIC void xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp) { - xfs_dquot_t *d; - if (((d) = (h)->qh_next)) - (d)->HL_PREVP = &((dqp)->HL_NEXT); - (dqp)->HL_NEXT = d; - (dqp)->HL_PREVP = &((h)->qh_next); - (h)->qh_next = (xfs_dquot_t *)dqp; - (h)->qh_version++; - (h)->qh_nelems++; + list_add(&dqp->q_hashlist, &h->qh_list); + h->qh_version++; + h->qh_nelems++; } STATIC void xfs_qm_dqtest_print( @@ -1061,9 +1055,7 @@ xfs_qm_internalqcheck_dqget( xfs_dqhash_t *h; h = DQTEST_HASH(mp, id, type); - for (d = (xfs_dqtest_t *) h->qh_next; d != NULL; - d = (xfs_dqtest_t *) d->HL_NEXT) { - /* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */ + list_for_each_entry(d, &h->qh_list, q_hashlist) { if (d->d_id == id && mp == d->q_mount) { *O_dq = d; return (0); @@ -1074,6 +1066,7 @@ xfs_qm_internalqcheck_dqget( d->d_id = id; d->q_mount = mp; d->q_hash = h; + INIT_LIST_HEAD(&d->q_hashlist); xfs_qm_hashinsert(h, d); *O_dq = d; return (0); @@ -1180,8 +1173,6 @@ xfs_qm_internalqcheck( xfs_ino_t lastino; int done, count; int i; - xfs_dqtest_t *d, *e; - xfs_dqhash_t *h1; int error; lastino = 0; @@ -1221,19 +1212,18 @@ xfs_qm_internalqcheck( } cmn_err(CE_DEBUG, "Checking results against system dquots"); for (i = 0; i < qmtest_hashmask; i++) { - h1 = &qmtest_udqtab[i]; - for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { + xfs_dqtest_t *d, *n; + xfs_dqhash_t *h; + + h = &qmtest_udqtab[i]; + list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) { xfs_dqtest_cmp(d); - e = (xfs_dqtest_t *) d->HL_NEXT; kmem_free(d); - d = e; } - h1 = &qmtest_gdqtab[i]; - for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { + h = &qmtest_gdqtab[i]; + list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) { xfs_dqtest_cmp(d); - e = (xfs_dqtest_t *) d->HL_NEXT; kmem_free(d); - d = e; } } diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h index 8286b28..94a3d92 100644 --- a/fs/xfs/quota/xfs_quota_priv.h +++ b/fs/xfs/quota/xfs_quota_priv.h @@ -24,43 +24,6 @@ */ #define XFS_DQITER_MAP_SIZE 10 -/* Number of dquots that fit in to a dquot block */ -#define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk) - -#define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t)) - -#define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims) -#define XFS_QI_UQIP(mp) ((mp)->m_quotainfo->qi_uquotaip) -#define XFS_QI_GQIP(mp) ((mp)->m_quotainfo->qi_gquotaip) -#define XFS_QI_DQCHUNKLEN(mp) ((mp)->m_quotainfo->qi_dqchunklen) -#define XFS_QI_BTIMELIMIT(mp) ((mp)->m_quotainfo->qi_btimelimit) -#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit) -#define XFS_QI_ITIMELIMIT(mp) ((mp)->m_quotainfo->qi_itimelimit) -#define XFS_QI_BWARNLIMIT(mp) ((mp)->m_quotainfo->qi_bwarnlimit) -#define XFS_QI_RTBWARNLIMIT(mp) ((mp)->m_quotainfo->qi_rtbwarnlimit) -#define XFS_QI_IWARNLIMIT(mp) ((mp)->m_quotainfo->qi_iwarnlimit) -#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock) - -#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist) -#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next) -#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems) - -#define xfs_qm_mplist_lock(mp) \ - mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock)) -#define xfs_qm_mplist_nowait(mp) \ - mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock)) -#define xfs_qm_mplist_unlock(mp) \ - mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock)) -#define XFS_QM_IS_MPLIST_LOCKED(mp) \ - mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock)) - -#define xfs_qm_freelist_lock(qm) \ - mutex_lock(&((qm)->qm_dqfreelist.qh_lock)) -#define xfs_qm_freelist_lock_nowait(qm) \ - mutex_trylock(&((qm)->qm_dqfreelist.qh_lock)) -#define xfs_qm_freelist_unlock(qm) \ - mutex_unlock(&((qm)->qm_dqfreelist.qh_lock)) - /* * Hash into a bucket in the dquot hash table, based on <mp, id>. */ @@ -72,9 +35,6 @@ XFS_DQ_HASHVAL(mp, id)) : \ (xfs_Gqm->qm_grp_dqhtable + \ XFS_DQ_HASHVAL(mp, id))) -#define XFS_IS_DQTYPE_ON(mp, type) (type == XFS_DQ_USER ? \ - XFS_IS_UQUOTA_ON(mp) : \ - XFS_IS_OQUOTA_ON(mp)) #define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ !dqp->q_core.d_blk_hardlimit && \ !dqp->q_core.d_blk_softlimit && \ @@ -86,68 +46,6 @@ !dqp->q_core.d_rtbcount && \ !dqp->q_core.d_icount) -#define HL_PREVP dq_hashlist.ql_prevp -#define HL_NEXT dq_hashlist.ql_next -#define MPL_PREVP dq_mplist.ql_prevp -#define MPL_NEXT dq_mplist.ql_next - - -#define _LIST_REMOVE(h, dqp, PVP, NXT) \ - { \ - xfs_dquot_t *d; \ - if (((d) = (dqp)->NXT)) \ - (d)->PVP = (dqp)->PVP; \ - *((dqp)->PVP) = d; \ - (dqp)->NXT = NULL; \ - (dqp)->PVP = NULL; \ - (h)->qh_version++; \ - (h)->qh_nelems--; \ - } - -#define _LIST_INSERT(h, dqp, PVP, NXT) \ - { \ - xfs_dquot_t *d; \ - if (((d) = (h)->qh_next)) \ - (d)->PVP = &((dqp)->NXT); \ - (dqp)->NXT = d; \ - (dqp)->PVP = &((h)->qh_next); \ - (h)->qh_next = dqp; \ - (h)->qh_version++; \ - (h)->qh_nelems++; \ - } - -#define FOREACH_DQUOT_IN_MP(dqp, mp) \ - for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT) - -#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist) \ -for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \ - (dqp) = (dqp)->dq_flnext) - -#define XQM_HASHLIST_INSERT(h, dqp) \ - _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT) - -#define XQM_FREELIST_INSERT(h, dqp) \ - xfs_qm_freelist_append(h, dqp) - -#define XQM_MPLIST_INSERT(h, dqp) \ - _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT) - -#define XQM_HASHLIST_REMOVE(h, dqp) \ - _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT) -#define XQM_FREELIST_REMOVE(dqp) \ - xfs_qm_freelist_unlink(dqp) -#define XQM_MPLIST_REMOVE(h, dqp) \ - { _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \ - XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; } - -#define XFS_DQ_IS_LOGITEM_INITD(dqp) ((dqp)->q_logitem.qli_dquot == (dqp)) - -#define XFS_QM_DQP_TO_DQACCT(tp, dqp) (XFS_QM_ISUDQ(dqp) ? \ - (tp)->t_dqinfo->dqa_usrdquots : \ - (tp)->t_dqinfo->dqa_grpdquots) -#define XFS_IS_SUSER_DQUOT(dqp) \ - (!((dqp)->q_core.d_id)) - #define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c index c3ab75c..061d827 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -59,12 +59,11 @@ xfs_trans_dqjoin( xfs_trans_t *tp, xfs_dquot_t *dqp) { - xfs_dq_logitem_t *lp; + xfs_dq_logitem_t *lp = &dqp->q_logitem; - ASSERT(! XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); + ASSERT(dqp->q_transp != tp); ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(XFS_DQ_IS_LOGITEM_INITD(dqp)); - lp = &dqp->q_logitem; + ASSERT(lp->qli_dquot == dqp); /* * Get a log_item_desc to point at the new item. @@ -96,7 +95,7 @@ xfs_trans_log_dquot( { xfs_log_item_desc_t *lidp; - ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); + ASSERT(dqp->q_transp == tp); ASSERT(XFS_DQ_IS_LOCKED(dqp)); lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem)); @@ -198,16 +197,16 @@ xfs_trans_get_dqtrx( int i; xfs_dqtrx_t *qa; - for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { - qa = XFS_QM_DQP_TO_DQACCT(tp, dqp); + qa = XFS_QM_ISUDQ(dqp) ? + tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots; + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { if (qa[i].qt_dquot == NULL || - qa[i].qt_dquot == dqp) { - return (&qa[i]); - } + qa[i].qt_dquot == dqp) + return &qa[i]; } - return (NULL); + return NULL; } /* @@ -381,7 +380,7 @@ xfs_trans_apply_dquot_deltas( break; ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); + ASSERT(dqp->q_transp == tp); /* * adjust the actual number of blocks used @@ -639,7 +638,7 @@ xfs_trans_dqresv( softlimit = q->qi_bsoftlimit; timer = be32_to_cpu(dqp->q_core.d_btimer); warns = be16_to_cpu(dqp->q_core.d_bwarns); - warnlimit = XFS_QI_BWARNLIMIT(dqp->q_mount); + warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit; resbcountp = &dqp->q_res_bcount; } else { ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); @@ -651,7 +650,7 @@ xfs_trans_dqresv( softlimit = q->qi_rtbsoftlimit; timer = be32_to_cpu(dqp->q_core.d_rtbtimer); warns = be16_to_cpu(dqp->q_core.d_rtbwarns); - warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount); + warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit; resbcountp = &dqp->q_res_rtbcount; } @@ -691,7 +690,7 @@ xfs_trans_dqresv( count = be64_to_cpu(dqp->q_core.d_icount); timer = be32_to_cpu(dqp->q_core.d_itimer); warns = be16_to_cpu(dqp->q_core.d_iwarns); - warnlimit = XFS_QI_IWARNLIMIT(dqp->q_mount); + warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit; hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); if (!hardlimit) hardlimit = q->qi_ihardlimit; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 5c11e4d..99587de 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -3829,7 +3829,7 @@ xfs_bmap_add_attrfork( } if ((error = xfs_bmap_finish(&tp, &flist, &committed))) goto error2; - error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); ASSERT(ip->i_df.if_ext_max == XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); return error; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index f3c49e6..240340a 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -372,12 +372,12 @@ xfs_buf_item_pin( */ STATIC void xfs_buf_item_unpin( - xfs_buf_log_item_t *bip, - int stale) + xfs_buf_log_item_t *bip) { struct xfs_ail *ailp; xfs_buf_t *bp; int freed; + int stale = bip->bli_flags & XFS_BLI_STALE; bp = bip->bli_buf; ASSERT(bp != NULL); @@ -428,40 +428,34 @@ xfs_buf_item_unpin_remove( xfs_buf_log_item_t *bip, xfs_trans_t *tp) { - xfs_buf_t *bp; - xfs_log_item_desc_t *lidp; - int stale = 0; - - bp = bip->bli_buf; - /* - * will xfs_buf_item_unpin() call xfs_buf_item_relse()? - */ + /* will xfs_buf_item_unpin() call xfs_buf_item_relse()? */ if ((atomic_read(&bip->bli_refcount) == 1) && (bip->bli_flags & XFS_BLI_STALE)) { + /* + * yes -- We can safely do some work here and then call + * buf_item_unpin to do the rest because we are + * are holding the buffer locked so no one else will be + * able to bump up the refcount. We have to remove the + * log item from the transaction as we are about to release + * our reference to the buffer. If we don't, the unlock that + * occurs later in the xfs_trans_uncommit() will try to + * reference the buffer which we no longer have a hold on. + */ + struct xfs_log_item_desc *lidp; + ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0); trace_xfs_buf_item_unpin_stale(bip); - /* - * yes -- clear the xaction descriptor in-use flag - * and free the chunk if required. We can safely - * do some work here and then call buf_item_unpin - * to do the rest because if the if is true, then - * we are holding the buffer locked so no one else - * will be able to bump up the refcount. - */ - lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip); - stale = lidp->lid_flags & XFS_LID_BUF_STALE; + lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)bip); xfs_trans_free_item(tp, lidp); + /* - * Since the transaction no longer refers to the buffer, - * the buffer should no longer refer to the transaction. + * Since the transaction no longer refers to the buffer, the + * buffer should no longer refer to the transaction. */ - XFS_BUF_SET_FSPRIVATE2(bp, NULL); + XFS_BUF_SET_FSPRIVATE2(bip->bli_buf, NULL); } - - xfs_buf_item_unpin(bip, stale); - - return; + xfs_buf_item_unpin(bip); } /* @@ -675,7 +669,7 @@ static struct xfs_item_ops xfs_buf_item_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_buf_item_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_buf_item_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) xfs_buf_item_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock, @@ -733,10 +727,7 @@ xfs_buf_item_init( bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); - bip->bli_item.li_type = XFS_LI_BUF; - bip->bli_item.li_ops = &xfs_buf_item_ops; - bip->bli_item.li_mountp = mp; - bip->bli_item.li_ailp = mp->m_ail; + xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; xfs_buf_hold(bp); bip->bli_format.blf_type = XFS_LI_BUF; diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 217f34a..df44545 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -26,7 +26,7 @@ extern kmem_zone_t *xfs_buf_item_zone; * have been logged. * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. */ -typedef struct xfs_buf_log_format_t { +typedef struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ unsigned short blf_size; /* size of this item */ ushort blf_flags; /* misc state */ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 92d5cd5..ef96175 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -186,18 +186,18 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...) void xfs_error_report( - char *tag, - int level, - xfs_mount_t *mp, - char *fname, - int linenum, - inst_t *ra) + const char *tag, + int level, + struct xfs_mount *mp, + const char *filename, + int linenum, + inst_t *ra) { if (level <= xfs_error_level) { xfs_cmn_err(XFS_PTAG_ERROR_REPORT, CE_ALERT, mp, "XFS internal error %s at line %d of file %s. Caller 0x%p\n", - tag, linenum, fname, ra); + tag, linenum, filename, ra); xfs_stack_trace(); } @@ -205,15 +205,15 @@ xfs_error_report( void xfs_corruption_error( - char *tag, - int level, - xfs_mount_t *mp, - void *p, - char *fname, - int linenum, - inst_t *ra) + const char *tag, + int level, + struct xfs_mount *mp, + void *p, + const char *filename, + int linenum, + inst_t *ra) { if (level <= xfs_error_level) xfs_hex_dump(p, 16); - xfs_error_report(tag, level, mp, fname, linenum, ra); + xfs_error_report(tag, level, mp, filename, linenum, ra); } diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 0c93051..c2c1a07 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -29,10 +29,11 @@ extern int xfs_error_trap(int); struct xfs_mount; -extern void xfs_error_report(char *tag, int level, struct xfs_mount *mp, - char *fname, int linenum, inst_t *ra); -extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp, - void *p, char *fname, int linenum, inst_t *ra); +extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, + const char *filename, int linenum, inst_t *ra); +extern void xfs_corruption_error(const char *tag, int level, + struct xfs_mount *mp, void *p, const char *filename, + int linenum, inst_t *ra); #define XFS_ERROR_REPORT(e, lvl, mp) \ xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 6f35ed1..409fe81 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -106,7 +106,7 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip) */ /*ARGSUSED*/ STATIC void -xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) +xfs_efi_item_unpin(xfs_efi_log_item_t *efip) { struct xfs_ail *ailp = efip->efi_item.li_ailp; @@ -224,7 +224,7 @@ static struct xfs_item_ops xfs_efi_item_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_efi_item_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efi_item_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efi_item_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) xfs_efi_item_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock, @@ -259,10 +259,7 @@ xfs_efi_init(xfs_mount_t *mp, KM_SLEEP); } - efip->efi_item.li_type = XFS_LI_EFI; - efip->efi_item.li_ops = &xfs_efi_item_ops; - efip->efi_item.li_mountp = mp; - efip->efi_item.li_ailp = mp->m_ail; + xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); efip->efi_format.efi_nextents = nextents; efip->efi_format.efi_id = (__psint_t)(void*)efip; @@ -428,7 +425,7 @@ xfs_efd_item_pin(xfs_efd_log_item_t *efdp) */ /*ARGSUSED*/ STATIC void -xfs_efd_item_unpin(xfs_efd_log_item_t *efdp, int stale) +xfs_efd_item_unpin(xfs_efd_log_item_t *efdp) { return; } @@ -518,7 +515,7 @@ static struct xfs_item_ops xfs_efd_item_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_efd_item_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efd_item_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efd_item_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) xfs_efd_item_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock, @@ -554,10 +551,7 @@ xfs_efd_init(xfs_mount_t *mp, KM_SLEEP); } - efdp->efd_item.li_type = XFS_LI_EFD; - efdp->efd_item.li_ops = &xfs_efd_item_ops; - efdp->efd_item.li_mountp = mp; - efdp->efd_item.li_ailp = mp->m_ail; + xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops); efdp->efd_efip = efip; efdp->efd_format.efd_nextents = nextents; efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 0ffd564..8cd6e8d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2449,6 +2449,8 @@ xfs_iunpin_nowait( { ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + trace_xfs_inode_unpin_nowait(ip, _RET_IP_); + /* Give the log a push to start the unpinning I/O */ xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 7bfea85..cf8249a 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -543,6 +543,7 @@ xfs_inode_item_pin( { ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); + trace_xfs_inode_pin(iip->ili_inode, _RET_IP_); atomic_inc(&iip->ili_inode->i_pincount); } @@ -556,11 +557,11 @@ xfs_inode_item_pin( /* ARGSUSED */ STATIC void xfs_inode_item_unpin( - xfs_inode_log_item_t *iip, - int stale) + xfs_inode_log_item_t *iip) { struct xfs_inode *ip = iip->ili_inode; + trace_xfs_inode_unpin(ip, _RET_IP_); ASSERT(atomic_read(&ip->i_pincount) > 0); if (atomic_dec_and_test(&ip->i_pincount)) wake_up(&ip->i_ipin_wait); @@ -572,7 +573,7 @@ xfs_inode_item_unpin_remove( xfs_inode_log_item_t *iip, xfs_trans_t *tp) { - xfs_inode_item_unpin(iip, 0); + xfs_inode_item_unpin(iip); } /* @@ -838,7 +839,7 @@ static struct xfs_item_ops xfs_inode_item_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_inode_item_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_inode_item_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) xfs_inode_item_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock, @@ -865,17 +866,9 @@ xfs_inode_item_init( ASSERT(ip->i_itemp == NULL); iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); - iip->ili_item.li_type = XFS_LI_INODE; - iip->ili_item.li_ops = &xfs_inode_item_ops; - iip->ili_item.li_mountp = mp; - iip->ili_item.li_ailp = mp->m_ail; iip->ili_inode = ip; - - /* - We have zeroed memory. No need ... - iip->ili_extents_buf = NULL; - */ - + xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, + &xfs_inode_item_ops); iip->ili_format.ilf_type = XFS_LI_INODE; iip->ili_format.ilf_ino = ip->i_ino; iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 0b650399..ef14943 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -55,71 +55,33 @@ #define XFS_STRAT_WRITE_IMAPS 2 #define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP -STATIC int -xfs_imap_to_bmap( - xfs_inode_t *ip, - xfs_off_t offset, - xfs_bmbt_irec_t *imap, - xfs_iomap_t *iomapp, - int imaps, /* Number of imap entries */ - int iomaps, /* Number of iomap entries */ - int flags) -{ - xfs_mount_t *mp = ip->i_mount; - int pbm; - xfs_fsblock_t start_block; - - - for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) { - iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff); - iomapp->iomap_delta = offset - iomapp->iomap_offset; - iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount); - iomapp->iomap_flags = flags; - - if (XFS_IS_REALTIME_INODE(ip)) { - iomapp->iomap_flags |= IOMAP_REALTIME; - iomapp->iomap_target = mp->m_rtdev_targp; - } else { - iomapp->iomap_target = mp->m_ddev_targp; - } - start_block = imap->br_startblock; - if (start_block == HOLESTARTBLOCK) { - iomapp->iomap_bn = IOMAP_DADDR_NULL; - iomapp->iomap_flags |= IOMAP_HOLE; - } else if (start_block == DELAYSTARTBLOCK) { - iomapp->iomap_bn = IOMAP_DADDR_NULL; - iomapp->iomap_flags |= IOMAP_DELAY; - } else { - iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block); - if (ISUNWRITTEN(imap)) - iomapp->iomap_flags |= IOMAP_UNWRITTEN; - } - - offset += iomapp->iomap_bsize - iomapp->iomap_delta; - } - return pbm; /* Return the number filled */ -} +STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, + int, struct xfs_bmbt_irec *, int *); +STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int, + struct xfs_bmbt_irec *, int *); +STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *, int *); int xfs_iomap( - xfs_inode_t *ip, - xfs_off_t offset, - ssize_t count, - int flags, - xfs_iomap_t *iomapp, - int *niomaps) + struct xfs_inode *ip, + xfs_off_t offset, + ssize_t count, + int flags, + struct xfs_bmbt_irec *imap, + int *nimaps, + int *new) { - xfs_mount_t *mp = ip->i_mount; - xfs_fileoff_t offset_fsb, end_fsb; - int error = 0; - int lockmode = 0; - xfs_bmbt_irec_t imap; - int nimaps = 1; - int bmapi_flags = 0; - int iomap_flags = 0; + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int lockmode = 0; + int bmapi_flags = 0; ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); + *new = 0; + if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); @@ -160,8 +122,8 @@ xfs_iomap( error = xfs_bmapi(NULL, ip, offset_fsb, (xfs_filblks_t)(end_fsb - offset_fsb), - bmapi_flags, NULL, 0, &imap, - &nimaps, NULL, NULL); + bmapi_flags, NULL, 0, imap, + nimaps, NULL, NULL); if (error) goto out; @@ -169,46 +131,41 @@ xfs_iomap( switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) { case BMAPI_WRITE: /* If we found an extent, return it */ - if (nimaps && - (imap.br_startblock != HOLESTARTBLOCK) && - (imap.br_startblock != DELAYSTARTBLOCK)) { - trace_xfs_iomap_found(ip, offset, count, flags, &imap); + if (*nimaps && + (imap->br_startblock != HOLESTARTBLOCK) && + (imap->br_startblock != DELAYSTARTBLOCK)) { + trace_xfs_iomap_found(ip, offset, count, flags, imap); break; } if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) { error = xfs_iomap_write_direct(ip, offset, count, flags, - &imap, &nimaps, nimaps); + imap, nimaps); } else { error = xfs_iomap_write_delay(ip, offset, count, flags, - &imap, &nimaps); + imap, nimaps); } if (!error) { - trace_xfs_iomap_alloc(ip, offset, count, flags, &imap); + trace_xfs_iomap_alloc(ip, offset, count, flags, imap); } - iomap_flags = IOMAP_NEW; + *new = 1; break; case BMAPI_ALLOCATE: /* If we found an extent, return it */ xfs_iunlock(ip, lockmode); lockmode = 0; - if (nimaps && !isnullstartblock(imap.br_startblock)) { - trace_xfs_iomap_found(ip, offset, count, flags, &imap); + if (*nimaps && !isnullstartblock(imap->br_startblock)) { + trace_xfs_iomap_found(ip, offset, count, flags, imap); break; } error = xfs_iomap_write_allocate(ip, offset, count, - &imap, &nimaps); + imap, nimaps); break; } - if (nimaps) { - *niomaps = xfs_imap_to_bmap(ip, offset, &imap, - iomapp, nimaps, *niomaps, iomap_flags); - } else if (niomaps) { - *niomaps = 0; - } + ASSERT(*nimaps <= 1); out: if (lockmode) @@ -216,7 +173,6 @@ out: return XFS_ERROR(error); } - STATIC int xfs_iomap_eof_align_last_fsb( xfs_mount_t *mp, @@ -285,15 +241,14 @@ xfs_cmn_err_fsblock_zero( return EFSCORRUPTED; } -int +STATIC int xfs_iomap_write_direct( xfs_inode_t *ip, xfs_off_t offset, size_t count, int flags, xfs_bmbt_irec_t *ret_imap, - int *nmaps, - int found) + int *nmaps) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; @@ -330,7 +285,7 @@ xfs_iomap_write_direct( if (error) goto error_out; } else { - if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) + if (*nmaps && (ret_imap->br_startblock == HOLESTARTBLOCK)) last_fsb = MIN(last_fsb, (xfs_fileoff_t) ret_imap->br_blockcount + ret_imap->br_startoff); @@ -485,7 +440,7 @@ xfs_iomap_eof_want_preallocate( return 0; } -int +STATIC int xfs_iomap_write_delay( xfs_inode_t *ip, xfs_off_t offset, @@ -588,7 +543,7 @@ retry: * We no longer bother to look at the incoming map - all we have to * guarantee is that whatever we allocate fills the required range. */ -int +STATIC int xfs_iomap_write_allocate( xfs_inode_t *ip, xfs_off_t offset, diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 174f299..81ac4af 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -18,19 +18,6 @@ #ifndef __XFS_IOMAP_H__ #define __XFS_IOMAP_H__ -#define IOMAP_DADDR_NULL ((xfs_daddr_t) (-1LL)) - - -typedef enum { /* iomap_flags values */ - IOMAP_READ = 0, /* mapping for a read */ - IOMAP_HOLE = 0x02, /* mapping covers a hole */ - IOMAP_DELAY = 0x04, /* mapping covers delalloc region */ - IOMAP_REALTIME = 0x10, /* mapping on the realtime device */ - IOMAP_UNWRITTEN = 0x20, /* mapping covers allocated */ - /* but uninitialized file data */ - IOMAP_NEW = 0x40 /* just allocate */ -} iomap_flags_t; - typedef enum { /* base extent manipulation calls */ BMAPI_READ = (1 << 0), /* read extents */ @@ -52,43 +39,11 @@ typedef enum { { BMAPI_MMAP, "MMAP" }, \ { BMAPI_TRYLOCK, "TRYLOCK" } -/* - * xfs_iomap_t: File system I/O map - * - * The iomap_bn field is expressed in 512-byte blocks, and is where the - * mapping starts on disk. - * - * The iomap_offset, iomap_bsize and iomap_delta fields are in bytes. - * iomap_offset is the offset of the mapping in the file itself. - * iomap_bsize is the size of the mapping, iomap_delta is the - * desired data's offset into the mapping, given the offset supplied - * to the file I/O map routine. - * - * When a request is made to read beyond the logical end of the object, - * iomap_size may be set to 0, but iomap_offset and iomap_length should be set - * to the actual amount of underlying storage that has been allocated, if any. - */ - -typedef struct xfs_iomap { - xfs_daddr_t iomap_bn; /* first 512B blk of mapping */ - xfs_buftarg_t *iomap_target; - xfs_off_t iomap_offset; /* offset of mapping, bytes */ - xfs_off_t iomap_bsize; /* size of mapping, bytes */ - xfs_off_t iomap_delta; /* offset into mapping, bytes */ - iomap_flags_t iomap_flags; -} xfs_iomap_t; - struct xfs_inode; struct xfs_bmbt_irec; extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, - struct xfs_iomap *, int *); -extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, - int, struct xfs_bmbt_irec *, int *, int); -extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int, - struct xfs_bmbt_irec *, int *); -extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, - struct xfs_bmbt_irec *, int *); + struct xfs_bmbt_irec *, int *, int *); extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 2be0191..3038dd5 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -44,13 +44,8 @@ kmem_zone_t *xfs_log_ticket_zone; -#define xlog_write_adv_cnt(ptr, len, off, bytes) \ - { (ptr) += (bytes); \ - (len) -= (bytes); \ - (off) += (bytes);} - /* Local miscellaneous function prototypes */ -STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket, +STATIC int xlog_commit_record(struct log *log, struct xlog_ticket *ticket, xlog_in_core_t **, xfs_lsn_t *); STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, xfs_buftarg_t *log_target, @@ -59,11 +54,9 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); STATIC void xlog_dealloc_log(xlog_t *log); -STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[], - int nentries, struct xlog_ticket *tic, - xfs_lsn_t *start_lsn, - xlog_in_core_t **commit_iclog, - uint flags); +STATIC int xlog_write(struct log *log, struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, xfs_lsn_t *start_lsn, + xlog_in_core_t **commit_iclog, uint flags); /* local state machine functions */ STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); @@ -102,7 +95,7 @@ STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log, uint flags); #if defined(DEBUG) -STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr); +STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); STATIC void xlog_verify_grant_head(xlog_t *log, int equals); STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, int count, boolean_t syncing); @@ -258,7 +251,7 @@ xfs_log_done( * If we get an error, just continue and give back the log ticket. */ (((ticket->t_flags & XLOG_TIC_INITED) == 0) && - (xlog_commit_record(mp, ticket, iclog, &lsn)))) { + (xlog_commit_record(log, ticket, iclog, &lsn)))) { lsn = (xfs_lsn_t) -1; if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { flags |= XFS_LOG_REL_PERM_RESERV; @@ -516,18 +509,10 @@ xfs_log_unmount_write(xfs_mount_t *mp) #ifdef DEBUG xlog_in_core_t *first_iclog; #endif - xfs_log_iovec_t reg[1]; xlog_ticket_t *tic = NULL; xfs_lsn_t lsn; int error; - /* the data section must be 32 bit size aligned */ - struct { - __uint16_t magic; - __uint16_t pad1; - __uint32_t pad2; /* may as well make it 64 bits */ - } magic = { XLOG_UNMOUNT_TYPE, 0, 0 }; - /* * Don't write out unmount record on read-only mounts. * Or, if we are doing a forced umount (typically because of IO errors). @@ -549,16 +534,30 @@ xfs_log_unmount_write(xfs_mount_t *mp) } while (iclog != first_iclog); #endif if (! (XLOG_FORCED_SHUTDOWN(log))) { - reg[0].i_addr = (void*)&magic; - reg[0].i_len = sizeof(magic); - reg[0].i_type = XLOG_REG_TYPE_UNMOUNT; - error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); if (!error) { + /* the data section must be 32 bit size aligned */ + struct { + __uint16_t magic; + __uint16_t pad1; + __uint32_t pad2; /* may as well make it 64 bits */ + } magic = { + .magic = XLOG_UNMOUNT_TYPE, + }; + struct xfs_log_iovec reg = { + .i_addr = (void *)&magic, + .i_len = sizeof(magic), + .i_type = XLOG_REG_TYPE_UNMOUNT, + }; + struct xfs_log_vec vec = { + .lv_niovecs = 1, + .lv_iovecp = ®, + }; + /* remove inited flag */ - ((xlog_ticket_t *)tic)->t_flags = 0; - error = xlog_write(mp, reg, 1, tic, &lsn, + tic->t_flags = 0; + error = xlog_write(log, &vec, tic, &lsn, NULL, XLOG_UNMOUNT_TRANS); /* * At this point, we're umounting anyway, @@ -648,10 +647,26 @@ xfs_log_unmount(xfs_mount_t *mp) xlog_dealloc_log(mp->m_log); } +void +xfs_log_item_init( + struct xfs_mount *mp, + struct xfs_log_item *item, + int type, + struct xfs_item_ops *ops) +{ + item->li_mountp = mp; + item->li_ailp = mp->m_ail; + item->li_type = type; + item->li_ops = ops; +} + /* * Write region vectors to log. The write happens using the space reservation * of the ticket (tic). It is not a requirement that all writes for a given - * transaction occur with one call to xfs_log_write(). + * transaction occur with one call to xfs_log_write(). However, it is important + * to note that the transaction reservation code makes an assumption about the + * number of log headers a transaction requires that may be violated if you + * don't pass all the transaction vectors in one call.... */ int xfs_log_write( @@ -663,11 +678,15 @@ xfs_log_write( { struct log *log = mp->m_log; int error; + struct xfs_log_vec vec = { + .lv_niovecs = nentries, + .lv_iovecp = reg, + }; if (XLOG_FORCED_SHUTDOWN(log)) return XFS_ERROR(EIO); - error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0); + error = xlog_write(log, &vec, tic, start_lsn, NULL, 0); if (error) xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); return error; @@ -1020,6 +1039,7 @@ xlog_alloc_log(xfs_mount_t *mp, int i; int iclogsize; int error = ENOMEM; + uint log2_size = 0; log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); if (!log) { @@ -1045,29 +1065,30 @@ xlog_alloc_log(xfs_mount_t *mp, error = EFSCORRUPTED; if (xfs_sb_version_hassector(&mp->m_sb)) { - log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT; - if (log->l_sectbb_log < 0 || - log->l_sectbb_log > mp->m_sectbb_log) { - xlog_warn("XFS: Log sector size (0x%x) out of range.", - log->l_sectbb_log); + log2_size = mp->m_sb.sb_logsectlog; + if (log2_size < BBSHIFT) { + xlog_warn("XFS: Log sector size too small " + "(0x%x < 0x%x)", log2_size, BBSHIFT); goto out_free_log; } - /* for larger sector sizes, must have v2 or external log */ - if (log->l_sectbb_log != 0 && - (log->l_logBBstart != 0 && - !xfs_sb_version_haslogv2(&mp->m_sb))) { - xlog_warn("XFS: log sector size (0x%x) invalid " - "for configuration.", log->l_sectbb_log); + log2_size -= BBSHIFT; + if (log2_size > mp->m_sectbb_log) { + xlog_warn("XFS: Log sector size too large " + "(0x%x > 0x%x)", log2_size, mp->m_sectbb_log); goto out_free_log; } - if (mp->m_sb.sb_logsectlog < BBSHIFT) { - xlog_warn("XFS: Log sector log (0x%x) too small.", - mp->m_sb.sb_logsectlog); + + /* for larger sector sizes, must have v2 or external log */ + if (log2_size && log->l_logBBstart > 0 && + !xfs_sb_version_haslogv2(&mp->m_sb)) { + + xlog_warn("XFS: log sector size (0x%x) invalid " + "for configuration.", log2_size); goto out_free_log; } } - log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1; + log->l_sectBBsize = 1 << log2_size; xlog_get_iclog_buffer_size(mp, log); @@ -1174,26 +1195,31 @@ out: * ticket. Return the lsn of the commit record. */ STATIC int -xlog_commit_record(xfs_mount_t *mp, - xlog_ticket_t *ticket, - xlog_in_core_t **iclog, - xfs_lsn_t *commitlsnp) +xlog_commit_record( + struct log *log, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + xfs_lsn_t *commitlsnp) { - int error; - xfs_log_iovec_t reg[1]; - - reg[0].i_addr = NULL; - reg[0].i_len = 0; - reg[0].i_type = XLOG_REG_TYPE_COMMIT; + struct xfs_mount *mp = log->l_mp; + int error; + struct xfs_log_iovec reg = { + .i_addr = NULL, + .i_len = 0, + .i_type = XLOG_REG_TYPE_COMMIT, + }; + struct xfs_log_vec vec = { + .lv_niovecs = 1, + .lv_iovecp = ®, + }; ASSERT_ALWAYS(iclog); - if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp, - iclog, XLOG_COMMIT_TRANS))) { + error = xlog_write(log, &vec, ticket, commitlsnp, iclog, + XLOG_COMMIT_TRANS); + if (error) xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - } return error; -} /* xlog_commit_record */ - +} /* * Push on the buffer cache code if we ever use more than 75% of the on-disk @@ -1614,6 +1640,192 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) } /* + * Calculate the potential space needed by the log vector. Each region gets + * its own xlog_op_header_t and may need to be double word aligned. + */ +static int +xlog_write_calc_vec_length( + struct xlog_ticket *ticket, + struct xfs_log_vec *log_vector) +{ + struct xfs_log_vec *lv; + int headers = 0; + int len = 0; + int i; + + /* acct for start rec of xact */ + if (ticket->t_flags & XLOG_TIC_INITED) + headers++; + + for (lv = log_vector; lv; lv = lv->lv_next) { + headers += lv->lv_niovecs; + + for (i = 0; i < lv->lv_niovecs; i++) { + struct xfs_log_iovec *vecp = &lv->lv_iovecp[i]; + + len += vecp->i_len; + xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type); + } + } + + ticket->t_res_num_ophdrs += headers; + len += headers * sizeof(struct xlog_op_header); + + return len; +} + +/* + * If first write for transaction, insert start record We can't be trying to + * commit if we are inited. We can't have any "partial_copy" if we are inited. + */ +static int +xlog_write_start_rec( + struct xlog_op_header *ophdr, + struct xlog_ticket *ticket) +{ + if (!(ticket->t_flags & XLOG_TIC_INITED)) + return 0; + + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = ticket->t_clientid; + ophdr->oh_len = 0; + ophdr->oh_flags = XLOG_START_TRANS; + ophdr->oh_res2 = 0; + + ticket->t_flags &= ~XLOG_TIC_INITED; + + return sizeof(struct xlog_op_header); +} + +static xlog_op_header_t * +xlog_write_setup_ophdr( + struct log *log, + struct xlog_op_header *ophdr, + struct xlog_ticket *ticket, + uint flags) +{ + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = ticket->t_clientid; + ophdr->oh_res2 = 0; + + /* are we copying a commit or unmount record? */ + ophdr->oh_flags = flags; + + /* + * We've seen logs corrupted with bad transaction client ids. This + * makes sure that XFS doesn't generate them on. Turn this into an EIO + * and shut down the filesystem. + */ + switch (ophdr->oh_clientid) { + case XFS_TRANSACTION: + case XFS_VOLUME: + case XFS_LOG: + break; + default: + xfs_fs_cmn_err(CE_WARN, log->l_mp, + "Bad XFS transaction clientid 0x%x in ticket 0x%p", + ophdr->oh_clientid, ticket); + return NULL; + } + + return ophdr; +} + +/* + * Set up the parameters of the region copy into the log. This has + * to handle region write split across multiple log buffers - this + * state is kept external to this function so that this code can + * can be written in an obvious, self documenting manner. + */ +static int +xlog_write_setup_copy( + struct xlog_ticket *ticket, + struct xlog_op_header *ophdr, + int space_available, + int space_required, + int *copy_off, + int *copy_len, + int *last_was_partial_copy, + int *bytes_consumed) +{ + int still_to_copy; + + still_to_copy = space_required - *bytes_consumed; + *copy_off = *bytes_consumed; + + if (still_to_copy <= space_available) { + /* write of region completes here */ + *copy_len = still_to_copy; + ophdr->oh_len = cpu_to_be32(*copy_len); + if (*last_was_partial_copy) + ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); + *last_was_partial_copy = 0; + *bytes_consumed = 0; + return 0; + } + + /* partial write of region, needs extra log op header reservation */ + *copy_len = space_available; + ophdr->oh_len = cpu_to_be32(*copy_len); + ophdr->oh_flags |= XLOG_CONTINUE_TRANS; + if (*last_was_partial_copy) + ophdr->oh_flags |= XLOG_WAS_CONT_TRANS; + *bytes_consumed += *copy_len; + (*last_was_partial_copy)++; + + /* account for new log op header */ + ticket->t_curr_res -= sizeof(struct xlog_op_header); + ticket->t_res_num_ophdrs++; + + return sizeof(struct xlog_op_header); +} + +static int +xlog_write_copy_finish( + struct log *log, + struct xlog_in_core *iclog, + uint flags, + int *record_cnt, + int *data_cnt, + int *partial_copy, + int *partial_copy_len, + int log_offset, + struct xlog_in_core **commit_iclog) +{ + if (*partial_copy) { + /* + * This iclog has already been marked WANT_SYNC by + * xlog_state_get_iclog_space. + */ + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + return xlog_state_release_iclog(log, iclog); + } + + *partial_copy = 0; + *partial_copy_len = 0; + + if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { + /* no more space in this iclog - push it. */ + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + + spin_lock(&log->l_icloglock); + xlog_state_want_sync(log, iclog); + spin_unlock(&log->l_icloglock); + + if (!commit_iclog) + return xlog_state_release_iclog(log, iclog); + ASSERT(flags & XLOG_COMMIT_TRANS); + *commit_iclog = iclog; + } + + return 0; +} + +/* * Write some region out to in-core log * * This will be called when writing externally provided regions or when @@ -1655,209 +1867,157 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) */ STATIC int xlog_write( - struct xfs_mount *mp, - struct xfs_log_iovec reg[], - int nentries, + struct log *log, + struct xfs_log_vec *log_vector, struct xlog_ticket *ticket, xfs_lsn_t *start_lsn, struct xlog_in_core **commit_iclog, uint flags) { - xlog_t *log = mp->m_log; - xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */ - xlog_op_header_t *logop_head; /* ptr to log operation header */ - __psint_t ptr; /* copy address into data region */ - int len; /* # xlog_write() bytes 2 still copy */ - int index; /* region index currently copying */ - int log_offset; /* offset (from 0) into data region */ - int start_rec_copy; /* # bytes to copy for start record */ - int partial_copy; /* did we split a region? */ - int partial_copy_len;/* # bytes copied if split region */ - int need_copy; /* # bytes need to memcpy this region */ - int copy_len; /* # bytes actually memcpy'ing */ - int copy_off; /* # bytes from entry start */ - int contwr; /* continued write of in-core log? */ - int error; - int record_cnt = 0, data_cnt = 0; - - partial_copy_len = partial_copy = 0; - - /* Calculate potential maximum space. Each region gets its own - * xlog_op_header_t and may need to be double word aligned. - */ - len = 0; - if (ticket->t_flags & XLOG_TIC_INITED) { /* acct for start rec of xact */ - len += sizeof(xlog_op_header_t); - ticket->t_res_num_ophdrs++; - } + struct xlog_in_core *iclog = NULL; + struct xfs_log_iovec *vecp; + struct xfs_log_vec *lv; + int len; + int index; + int partial_copy = 0; + int partial_copy_len = 0; + int contwr = 0; + int record_cnt = 0; + int data_cnt = 0; + int error; - for (index = 0; index < nentries; index++) { - len += sizeof(xlog_op_header_t); /* each region gets >= 1 */ - ticket->t_res_num_ophdrs++; - len += reg[index].i_len; - xlog_tic_add_region(ticket, reg[index].i_len, reg[index].i_type); - } - contwr = *start_lsn = 0; + *start_lsn = 0; - if (ticket->t_curr_res < len) { - xlog_print_tic_res(mp, ticket); + len = xlog_write_calc_vec_length(ticket, log_vector); + if (ticket->t_curr_res < len) { + xlog_print_tic_res(log->l_mp, ticket); #ifdef DEBUG - xlog_panic( - "xfs_log_write: reservation ran out. Need to up reservation"); + xlog_panic( + "xfs_log_write: reservation ran out. Need to up reservation"); #else - /* Customer configurable panic */ - xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, - "xfs_log_write: reservation ran out. Need to up reservation"); - /* If we did not panic, shutdown the filesystem */ - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + /* Customer configurable panic */ + xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, log->l_mp, + "xfs_log_write: reservation ran out. Need to up reservation"); + + /* If we did not panic, shutdown the filesystem */ + xfs_force_shutdown(log->l_mp, SHUTDOWN_CORRUPT_INCORE); #endif - } else + } + ticket->t_curr_res -= len; - for (index = 0; index < nentries; ) { - if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket, - &contwr, &log_offset))) - return error; + index = 0; + lv = log_vector; + vecp = lv->lv_iovecp; + while (lv && index < lv->lv_niovecs) { + void *ptr; + int log_offset; - ASSERT(log_offset <= iclog->ic_size - 1); - ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset); + error = xlog_state_get_iclog_space(log, len, &iclog, ticket, + &contwr, &log_offset); + if (error) + return error; - /* start_lsn is the first lsn written to. That's all we need. */ - if (! *start_lsn) - *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + ASSERT(log_offset <= iclog->ic_size - 1); + ptr = iclog->ic_datap + log_offset; - /* This loop writes out as many regions as can fit in the amount - * of space which was allocated by xlog_state_get_iclog_space(). - */ - while (index < nentries) { - ASSERT(reg[index].i_len % sizeof(__int32_t) == 0); - ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0); - start_rec_copy = 0; - - /* If first write for transaction, insert start record. - * We can't be trying to commit if we are inited. We can't - * have any "partial_copy" if we are inited. - */ - if (ticket->t_flags & XLOG_TIC_INITED) { - logop_head = (xlog_op_header_t *)ptr; - logop_head->oh_tid = cpu_to_be32(ticket->t_tid); - logop_head->oh_clientid = ticket->t_clientid; - logop_head->oh_len = 0; - logop_head->oh_flags = XLOG_START_TRANS; - logop_head->oh_res2 = 0; - ticket->t_flags &= ~XLOG_TIC_INITED; /* clear bit */ - record_cnt++; - - start_rec_copy = sizeof(xlog_op_header_t); - xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy); - } + /* start_lsn is the first lsn written to. That's all we need. */ + if (!*start_lsn) + *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); - /* Copy log operation header directly into data section */ - logop_head = (xlog_op_header_t *)ptr; - logop_head->oh_tid = cpu_to_be32(ticket->t_tid); - logop_head->oh_clientid = ticket->t_clientid; - logop_head->oh_res2 = 0; + /* + * This loop writes out as many regions as can fit in the amount + * of space which was allocated by xlog_state_get_iclog_space(). + */ + while (lv && index < lv->lv_niovecs) { + struct xfs_log_iovec *reg = &vecp[index]; + struct xlog_op_header *ophdr; + int start_rec_copy; + int copy_len; + int copy_off; + + ASSERT(reg->i_len % sizeof(__int32_t) == 0); + ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0); + + start_rec_copy = xlog_write_start_rec(ptr, ticket); + if (start_rec_copy) { + record_cnt++; + xlog_write_adv_cnt(&ptr, &len, &log_offset, + start_rec_copy); + } - /* header copied directly */ - xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t)); + ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); + if (!ophdr) + return XFS_ERROR(EIO); - /* are we copying a commit or unmount record? */ - logop_head->oh_flags = flags; + xlog_write_adv_cnt(&ptr, &len, &log_offset, + sizeof(struct xlog_op_header)); + + len += xlog_write_setup_copy(ticket, ophdr, + iclog->ic_size-log_offset, + reg->i_len, + ©_off, ©_len, + &partial_copy, + &partial_copy_len); + xlog_verify_dest_ptr(log, ptr); + + /* copy region */ + ASSERT(copy_len >= 0); + memcpy(ptr, reg->i_addr + copy_off, copy_len); + xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); + + copy_len += start_rec_copy + sizeof(xlog_op_header_t); + record_cnt++; + data_cnt += contwr ? copy_len : 0; + + error = xlog_write_copy_finish(log, iclog, flags, + &record_cnt, &data_cnt, + &partial_copy, + &partial_copy_len, + log_offset, + commit_iclog); + if (error) + return error; - /* - * We've seen logs corrupted with bad transaction client - * ids. This makes sure that XFS doesn't generate them on. - * Turn this into an EIO and shut down the filesystem. - */ - switch (logop_head->oh_clientid) { - case XFS_TRANSACTION: - case XFS_VOLUME: - case XFS_LOG: - break; - default: - xfs_fs_cmn_err(CE_WARN, mp, - "Bad XFS transaction clientid 0x%x in ticket 0x%p", - logop_head->oh_clientid, ticket); - return XFS_ERROR(EIO); - } + /* + * if we had a partial copy, we need to get more iclog + * space but we don't want to increment the region + * index because there is still more is this region to + * write. + * + * If we completed writing this region, and we flushed + * the iclog (indicated by resetting of the record + * count), then we also need to get more log space. If + * this was the last record, though, we are done and + * can just return. + */ + if (partial_copy) + break; - /* Partial write last time? => (partial_copy != 0) - * need_copy is the amount we'd like to copy if everything could - * fit in the current memcpy. - */ - need_copy = reg[index].i_len - partial_copy_len; - - copy_off = partial_copy_len; - if (need_copy <= iclog->ic_size - log_offset) { /*complete write */ - copy_len = need_copy; - logop_head->oh_len = cpu_to_be32(copy_len); - if (partial_copy) - logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); - partial_copy_len = partial_copy = 0; - } else { /* partial write */ - copy_len = iclog->ic_size - log_offset; - logop_head->oh_len = cpu_to_be32(copy_len); - logop_head->oh_flags |= XLOG_CONTINUE_TRANS; - if (partial_copy) - logop_head->oh_flags |= XLOG_WAS_CONT_TRANS; - partial_copy_len += copy_len; - partial_copy++; - len += sizeof(xlog_op_header_t); /* from splitting of region */ - /* account for new log op header */ - ticket->t_curr_res -= sizeof(xlog_op_header_t); - ticket->t_res_num_ophdrs++; - } - xlog_verify_dest_ptr(log, ptr); - - /* copy region */ - ASSERT(copy_len >= 0); - memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len); - xlog_write_adv_cnt(ptr, len, log_offset, copy_len); - - /* make copy_len total bytes copied, including headers */ - copy_len += start_rec_copy + sizeof(xlog_op_header_t); - record_cnt++; - data_cnt += contwr ? copy_len : 0; - if (partial_copy) { /* copied partial region */ - /* already marked WANT_SYNC by xlog_state_get_iclog_space */ - xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - record_cnt = data_cnt = 0; - if ((error = xlog_state_release_iclog(log, iclog))) - return error; - break; /* don't increment index */ - } else { /* copied entire region */ - index++; - partial_copy_len = partial_copy = 0; - - if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { - xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - record_cnt = data_cnt = 0; - spin_lock(&log->l_icloglock); - xlog_state_want_sync(log, iclog); - spin_unlock(&log->l_icloglock); - if (commit_iclog) { - ASSERT(flags & XLOG_COMMIT_TRANS); - *commit_iclog = iclog; - } else if ((error = xlog_state_release_iclog(log, iclog))) - return error; - if (index == nentries) - return 0; /* we are done */ - else - break; + if (++index == lv->lv_niovecs) { + lv = lv->lv_next; + index = 0; + if (lv) + vecp = lv->lv_iovecp; + } + if (record_cnt == 0) { + if (!lv) + return 0; + break; + } } - } /* if (partial_copy) */ - } /* while (index < nentries) */ - } /* for (index = 0; index < nentries; ) */ - ASSERT(len == 0); + } + + ASSERT(len == 0); + + xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); + if (!commit_iclog) + return xlog_state_release_iclog(log, iclog); - xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - if (commit_iclog) { ASSERT(flags & XLOG_COMMIT_TRANS); *commit_iclog = iclog; return 0; - } - return xlog_state_release_iclog(log, iclog); -} /* xlog_write */ +} /***************************************************************************** @@ -3157,14 +3317,16 @@ xfs_log_ticket_get( * Allocate and initialise a new log ticket. */ STATIC xlog_ticket_t * -xlog_ticket_alloc(xlog_t *log, - int unit_bytes, - int cnt, - char client, - uint xflags) +xlog_ticket_alloc( + struct log *log, + int unit_bytes, + int cnt, + char client, + uint xflags) { - xlog_ticket_t *tic; + struct xlog_ticket *tic; uint num_headers; + int iclog_space; tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); if (!tic) @@ -3208,16 +3370,40 @@ xlog_ticket_alloc(xlog_t *log, /* for start-rec */ unit_bytes += sizeof(xlog_op_header_t); - /* for LR headers */ - num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log); + /* + * for LR headers - the space for data in an iclog is the size minus + * the space used for the headers. If we use the iclog size, then we + * undercalculate the number of headers required. + * + * Furthermore - the addition of op headers for split-recs might + * increase the space required enough to require more log and op + * headers, so take that into account too. + * + * IMPORTANT: This reservation makes the assumption that if this + * transaction is the first in an iclog and hence has the LR headers + * accounted to it, then the remaining space in the iclog is + * exclusively for this transaction. i.e. if the transaction is larger + * than the iclog, it will be the only thing in that iclog. + * Fundamentally, this means we must pass the entire log vector to + * xlog_write to guarantee this. + */ + iclog_space = log->l_iclog_size - log->l_iclog_hsize; + num_headers = howmany(unit_bytes, iclog_space); + + /* for split-recs - ophdrs added when data split over LRs */ + unit_bytes += sizeof(xlog_op_header_t) * num_headers; + + /* add extra header reservations if we overrun */ + while (!num_headers || + howmany(unit_bytes, iclog_space) > num_headers) { + unit_bytes += sizeof(xlog_op_header_t); + num_headers++; + } unit_bytes += log->l_iclog_hsize * num_headers; /* for commit-rec LR header - note: padding will subsume the ophdr */ unit_bytes += log->l_iclog_hsize; - /* for split-recs - ophdrs added when data split over LRs */ - unit_bytes += sizeof(xlog_op_header_t) * num_headers; - /* for roundoff padding for transaction data and one for commit record */ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && log->l_mp->m_sb.sb_logsunit > 1) { @@ -3233,13 +3419,13 @@ xlog_ticket_alloc(xlog_t *log, tic->t_curr_res = unit_bytes; tic->t_cnt = cnt; tic->t_ocnt = cnt; - tic->t_tid = (xlog_tid_t)((__psint_t)tic & 0xffffffff); + tic->t_tid = random32(); tic->t_clientid = client; tic->t_flags = XLOG_TIC_INITED; tic->t_trans_type = 0; if (xflags & XFS_LOG_PERM_RESERV) tic->t_flags |= XLOG_TIC_PERM_RESERV; - sv_init(&(tic->t_wait), SV_DEFAULT, "logtick"); + sv_init(&tic->t_wait, SV_DEFAULT, "logtick"); xlog_tic_reset_res(tic); @@ -3260,20 +3446,22 @@ xlog_ticket_alloc(xlog_t *log, * part of the log in case we trash the log structure. */ void -xlog_verify_dest_ptr(xlog_t *log, - __psint_t ptr) +xlog_verify_dest_ptr( + struct log *log, + char *ptr) { int i; int good_ptr = 0; - for (i=0; i < log->l_iclog_bufs; i++) { - if (ptr >= (__psint_t)log->l_iclog_bak[i] && - ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size) + for (i = 0; i < log->l_iclog_bufs; i++) { + if (ptr >= log->l_iclog_bak[i] && + ptr <= log->l_iclog_bak[i] + log->l_iclog_size) good_ptr++; } - if (! good_ptr) + + if (!good_ptr) xlog_panic("xlog_verify_dest_ptr: invalid ptr"); -} /* xlog_verify_dest_ptr */ +} STATIC void xlog_verify_grant_head(xlog_t *log, int equals) diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 97a24c7..229d1f3 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -110,6 +110,12 @@ typedef struct xfs_log_iovec { uint i_type; /* type of region */ } xfs_log_iovec_t; +struct xfs_log_vec { + struct xfs_log_vec *lv_next; /* next lv in build list */ + int lv_niovecs; /* number of iovecs in lv */ + struct xfs_log_iovec *lv_iovecp; /* iovec array */ +}; + /* * Structure used to pass callback function and the function's argument * to the log manager. @@ -126,6 +132,13 @@ typedef struct xfs_log_callback { struct xfs_mount; struct xlog_in_core; struct xlog_ticket; +struct xfs_log_item; +struct xfs_item_ops; + +void xfs_log_item_init(struct xfs_mount *mp, + struct xfs_log_item *item, + int type, + struct xfs_item_ops *ops); xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index fd02a18..9cf6951 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -396,9 +396,7 @@ typedef struct log { struct xfs_buf_cancel **l_buf_cancel_table; int l_iclog_hsize; /* size of iclog header */ int l_iclog_heads; /* # of iclog header sectors */ - uint l_sectbb_log; /* log2 of sector size in BBs */ - uint l_sectbb_mask; /* sector size (in BBs) - * alignment mask */ + uint l_sectBBsize; /* sector size in BBs (2^n) */ int l_iclog_size; /* size of log in bytes */ int l_iclog_size_log; /* log power size of log */ int l_iclog_bufs; /* number of iclog buffers */ @@ -449,6 +447,14 @@ extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); extern kmem_zone_t *xfs_log_ticket_zone; +static inline void +xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) +{ + *ptr += bytes; + *len -= bytes; + *off += bytes; +} + /* * Unmount record type is used as a pseudo transaction type for the ticket. * It's value must be outside the range of XFS_TRANS_* values. diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 22e6efd..0de08e36 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -56,33 +56,61 @@ STATIC void xlog_recover_check_summary(xlog_t *); #define xlog_recover_check_summary(log) #endif - /* * Sector aligned buffer routines for buffer create/read/write/access */ -#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \ - ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \ - ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) -#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) +/* + * Verify the given count of basic blocks is valid number of blocks + * to specify for an operation involving the given XFS log buffer. + * Returns nonzero if the count is valid, 0 otherwise. + */ +static inline int +xlog_buf_bbcount_valid( + xlog_t *log, + int bbcount) +{ + return bbcount > 0 && bbcount <= log->l_logBBsize; +} + +/* + * Allocate a buffer to hold log data. The buffer needs to be able + * to map to a range of nbblks basic blocks at any valid (basic + * block) offset within the log. + */ STATIC xfs_buf_t * xlog_get_bp( xlog_t *log, int nbblks) { - if (nbblks <= 0 || nbblks > log->l_logBBsize) { - xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); - XFS_ERROR_REPORT("xlog_get_bp(1)", - XFS_ERRLEVEL_HIGH, log->l_mp); + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xlog_warn("XFS: Invalid block length (0x%x) given for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return NULL; } - if (log->l_sectbb_log) { - if (nbblks > 1) - nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); - nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); - } + /* + * We do log I/O in units of log sectors (a power-of-2 + * multiple of the basic block size), so we round up the + * requested size to acommodate the basic blocks required + * for complete log sectors. + * + * In addition, the buffer may be used for a non-sector- + * aligned block offset, in which case an I/O of the + * requested size could extend beyond the end of the + * buffer. If the requested size is only 1 basic block it + * will never straddle a sector boundary, so this won't be + * an issue. Nor will this be a problem if the log I/O is + * done in basic blocks (sector size 1). But otherwise we + * extend the buffer by one extra log sector to ensure + * there's space to accomodate this possiblility. + */ + if (nbblks > 1 && log->l_sectBBsize > 1) + nbblks += log->l_sectBBsize; + nbblks = round_up(nbblks, log->l_sectBBsize); + return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); } @@ -93,6 +121,10 @@ xlog_put_bp( xfs_buf_free(bp); } +/* + * Return the address of the start of the given block number's data + * in a log buffer. The buffer covers a log sector-aligned region. + */ STATIC xfs_caddr_t xlog_align( xlog_t *log, @@ -100,14 +132,14 @@ xlog_align( int nbblks, xfs_buf_t *bp) { + xfs_daddr_t offset; xfs_caddr_t ptr; - if (!log->l_sectbb_log) - return XFS_BUF_PTR(bp); + offset = blk_no & ((xfs_daddr_t) log->l_sectBBsize - 1); + ptr = XFS_BUF_PTR(bp) + BBTOB(offset); + + ASSERT(ptr + BBTOB(nbblks) <= XFS_BUF_PTR(bp) + XFS_BUF_SIZE(bp)); - ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask); - ASSERT(XFS_BUF_SIZE(bp) >= - BBTOB(nbblks + (blk_no & log->l_sectbb_mask))); return ptr; } @@ -124,21 +156,18 @@ xlog_bread_noalign( { int error; - if (nbblks <= 0 || nbblks > log->l_logBBsize) { - xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); - XFS_ERROR_REPORT("xlog_bread(1)", - XFS_ERRLEVEL_HIGH, log->l_mp); + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xlog_warn("XFS: Invalid block length (0x%x) given for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return EFSCORRUPTED; } - if (log->l_sectbb_log) { - blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); - nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); - } + blk_no = round_down(blk_no, log->l_sectBBsize); + nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); - ASSERT(bp); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_READ(bp); @@ -186,17 +215,15 @@ xlog_bwrite( { int error; - if (nbblks <= 0 || nbblks > log->l_logBBsize) { - xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); - XFS_ERROR_REPORT("xlog_bwrite(1)", - XFS_ERRLEVEL_HIGH, log->l_mp); + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xlog_warn("XFS: Invalid block length (0x%x) given for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return EFSCORRUPTED; } - if (log->l_sectbb_log) { - blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); - nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); - } + blk_no = round_down(blk_no, log->l_sectBBsize); + nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); @@ -327,39 +354,38 @@ xlog_find_cycle_start( { xfs_caddr_t offset; xfs_daddr_t mid_blk; + xfs_daddr_t end_blk; uint mid_cycle; int error; - mid_blk = BLK_AVG(first_blk, *last_blk); - while (mid_blk != first_blk && mid_blk != *last_blk) { + end_blk = *last_blk; + mid_blk = BLK_AVG(first_blk, end_blk); + while (mid_blk != first_blk && mid_blk != end_blk) { error = xlog_bread(log, mid_blk, 1, bp, &offset); if (error) return error; mid_cycle = xlog_get_cycle(offset); - if (mid_cycle == cycle) { - *last_blk = mid_blk; - /* last_half_cycle == mid_cycle */ - } else { - first_blk = mid_blk; - /* first_half_cycle == mid_cycle */ - } - mid_blk = BLK_AVG(first_blk, *last_blk); + if (mid_cycle == cycle) + end_blk = mid_blk; /* last_half_cycle == mid_cycle */ + else + first_blk = mid_blk; /* first_half_cycle == mid_cycle */ + mid_blk = BLK_AVG(first_blk, end_blk); } - ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) || - (mid_blk == *last_blk && mid_blk-1 == first_blk)); + ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) || + (mid_blk == end_blk && mid_blk-1 == first_blk)); + + *last_blk = end_blk; return 0; } /* - * Check that the range of blocks does not contain the cycle number - * given. The scan needs to occur from front to back and the ptr into the - * region must be updated since a later routine will need to perform another - * test. If the region is completely good, we end up returning the same - * last block number. - * - * Set blkno to -1 if we encounter no errors. This is an invalid block number - * since we don't ever expect logs to get this large. + * Check that a range of blocks does not contain stop_on_cycle_no. + * Fill in *new_blk with the block offset where such a block is + * found, or with -1 (an invalid block number) if there is no such + * block in the range. The scan needs to occur from front to back + * and the pointer into the region must be updated since a later + * routine will need to perform another test. */ STATIC int xlog_find_verify_cycle( @@ -376,12 +402,16 @@ xlog_find_verify_cycle( xfs_caddr_t buf = NULL; int error = 0; + /* + * Greedily allocate a buffer big enough to handle the full + * range of basic blocks we'll be examining. If that fails, + * try a smaller size. We need to be able to read at least + * a log sector, or we're out of luck. + */ bufblks = 1 << ffs(nbblks); - while (!(bp = xlog_get_bp(log, bufblks))) { - /* can't get enough memory to do everything in one big buffer */ bufblks >>= 1; - if (bufblks <= log->l_sectbb_log) + if (bufblks < log->l_sectBBsize) return ENOMEM; } @@ -629,7 +659,7 @@ xlog_find_head( * In this case we want to find the first block with cycle * number matching last_half_cycle. We expect the log to be * some variation on - * x + 1 ... | x ... + * x + 1 ... | x ... | x * The first block with cycle number x (last_half_cycle) will * be where the new head belongs. First we do a binary search * for the first occurrence of last_half_cycle. The binary @@ -639,11 +669,13 @@ xlog_find_head( * the log, then we look for occurrences of last_half_cycle - 1 * at the end of the log. The cases we're looking for look * like - * x + 1 ... | x | x + 1 | x ... - * ^ binary search stopped here + * v binary search stopped here + * x + 1 ... | x | x + 1 | x ... | x + * ^ but we want to locate this spot * or - * x + 1 ... | x ... | x - 1 | x * <---------> less than scan distance + * x + 1 ... | x ... | x - 1 | x + * ^ we want to locate this spot */ stop_on_cycle = last_half_cycle; if ((error = xlog_find_cycle_start(log, bp, first_blk, @@ -699,16 +731,16 @@ xlog_find_head( * certainly not the head of the log. By searching for * last_half_cycle-1 we accomplish that. */ - start_blk = log_bbnum - num_scan_bblks + head_blk; ASSERT(head_blk <= INT_MAX && - (xfs_daddr_t) num_scan_bblks - head_blk >= 0); + (xfs_daddr_t) num_scan_bblks >= head_blk); + start_blk = log_bbnum - (num_scan_bblks - head_blk); if ((error = xlog_find_verify_cycle(log, start_blk, num_scan_bblks - (int)head_blk, (stop_on_cycle - 1), &new_blk))) goto bp_err; if (new_blk != -1) { head_blk = new_blk; - goto bad_blk; + goto validate_head; } /* @@ -726,7 +758,7 @@ xlog_find_head( head_blk = new_blk; } - bad_blk: +validate_head: /* * Now we need to make sure head_blk is not pointing to a block in * the middle of a log record. @@ -748,7 +780,7 @@ xlog_find_head( if ((error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0)) == -1) { /* We hit the beginning of the log during our search */ - start_blk = log_bbnum - num_scan_bblks + head_blk; + start_blk = log_bbnum - (num_scan_bblks - head_blk); new_blk = log_bbnum; ASSERT(start_blk <= INT_MAX && (xfs_daddr_t) log_bbnum-start_blk >= 0); @@ -833,12 +865,12 @@ xlog_find_tail( if (*head_blk == 0) { /* special case */ error = xlog_bread(log, 0, 1, bp, &offset); if (error) - goto bread_err; + goto done; if (xlog_get_cycle(offset) == 0) { *tail_blk = 0; /* leave all other log inited values alone */ - goto exit; + goto done; } } @@ -849,7 +881,7 @@ xlog_find_tail( for (i = (int)(*head_blk) - 1; i >= 0; i--) { error = xlog_bread(log, i, 1, bp, &offset); if (error) - goto bread_err; + goto done; if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { found = 1; @@ -866,7 +898,7 @@ xlog_find_tail( for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { error = xlog_bread(log, i, 1, bp, &offset); if (error) - goto bread_err; + goto done; if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { @@ -941,7 +973,7 @@ xlog_find_tail( umount_data_blk = (i + hblks) % log->l_logBBsize; error = xlog_bread(log, umount_data_blk, 1, bp, &offset); if (error) - goto bread_err; + goto done; op_head = (xlog_op_header_t *)offset; if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { @@ -987,12 +1019,10 @@ xlog_find_tail( * But... if the -device- itself is readonly, just skip this. * We can't recover this device anyway, so it won't matter. */ - if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { + if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) error = xlog_clear_stale_blocks(log, tail_lsn); - } -bread_err: -exit: +done: xlog_put_bp(bp); if (error) @@ -1152,16 +1182,22 @@ xlog_write_log_records( xfs_caddr_t offset; xfs_buf_t *bp; int balign, ealign; - int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); + int sectbb = log->l_sectBBsize; int end_block = start_block + blocks; int bufblks; int error = 0; int i, j = 0; + /* + * Greedily allocate a buffer big enough to handle the full + * range of basic blocks to be written. If that fails, try + * a smaller size. We need to be able to write at least a + * log sector, or we're out of luck. + */ bufblks = 1 << ffs(blocks); while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; - if (bufblks <= log->l_sectbb_log) + if (bufblks < sectbb) return ENOMEM; } @@ -1169,7 +1205,7 @@ xlog_write_log_records( * the buffer in the starting sector not covered by the first * write below. */ - balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); + balign = round_down(start_block, sectbb); if (balign != start_block) { error = xlog_bread_noalign(log, start_block, 1, bp); if (error) @@ -1188,7 +1224,7 @@ xlog_write_log_records( * the buffer in the final sector not covered by the write. * If this is the same sector as the above read, skip it. */ - ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block); + ealign = round_down(end_block, sectbb); if (j == 0 && (start_block + endcount > ealign)) { offset = XFS_BUF_PTR(bp); balign = BBTOB(ealign - start_block); @@ -1408,6 +1444,7 @@ xlog_recover_add_item( STATIC int xlog_recover_add_to_cont_trans( + struct log *log, xlog_recover_t *trans, xfs_caddr_t dp, int len) @@ -1434,6 +1471,7 @@ xlog_recover_add_to_cont_trans( memcpy(&ptr[old_len], dp, len); /* d, s, l */ item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; + trace_xfs_log_recover_item_add_cont(log, trans, item, 0); return 0; } @@ -1452,6 +1490,7 @@ xlog_recover_add_to_cont_trans( */ STATIC int xlog_recover_add_to_trans( + struct log *log, xlog_recover_t *trans, xfs_caddr_t dp, int len) @@ -1510,6 +1549,7 @@ xlog_recover_add_to_trans( item->ri_buf[item->ri_cnt].i_addr = ptr; item->ri_buf[item->ri_cnt].i_len = len; item->ri_cnt++; + trace_xfs_log_recover_item_add(log, trans, item, 0); return 0; } @@ -1521,7 +1561,9 @@ xlog_recover_add_to_trans( */ STATIC int xlog_recover_reorder_trans( - xlog_recover_t *trans) + struct log *log, + xlog_recover_t *trans, + int pass) { xlog_recover_item_t *item, *n; LIST_HEAD(sort_list); @@ -1535,6 +1577,8 @@ xlog_recover_reorder_trans( switch (ITEM_TYPE(item)) { case XFS_LI_BUF: if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) { + trace_xfs_log_recover_item_reorder_head(log, + trans, item, pass); list_move(&item->ri_list, &trans->r_itemq); break; } @@ -1543,6 +1587,8 @@ xlog_recover_reorder_trans( case XFS_LI_QUOTAOFF: case XFS_LI_EFD: case XFS_LI_EFI: + trace_xfs_log_recover_item_reorder_tail(log, + trans, item, pass); list_move_tail(&item->ri_list, &trans->r_itemq); break; default: @@ -1592,8 +1638,10 @@ xlog_recover_do_buffer_pass1( /* * If this isn't a cancel buffer item, then just return. */ - if (!(flags & XFS_BLI_CANCEL)) + if (!(flags & XFS_BLI_CANCEL)) { + trace_xfs_log_recover_buf_not_cancel(log, buf_f); return; + } /* * Insert an xfs_buf_cancel record into the hash table of @@ -1627,6 +1675,7 @@ xlog_recover_do_buffer_pass1( while (nextp != NULL) { if (nextp->bc_blkno == blkno && nextp->bc_len == len) { nextp->bc_refcount++; + trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); return; } prevp = nextp; @@ -1640,6 +1689,7 @@ xlog_recover_do_buffer_pass1( bcp->bc_refcount = 1; bcp->bc_next = NULL; prevp->bc_next = bcp; + trace_xfs_log_recover_buf_cancel_add(log, buf_f); } /* @@ -1779,6 +1829,8 @@ xlog_recover_do_inode_buffer( unsigned int *data_map = NULL; unsigned int map_size = 0; + trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); + switch (buf_f->blf_type) { case XFS_LI_BUF: data_map = buf_f->blf_data_map; @@ -1874,6 +1926,7 @@ xlog_recover_do_inode_buffer( /*ARGSUSED*/ STATIC void xlog_recover_do_reg_buffer( + struct xfs_mount *mp, xlog_recover_item_t *item, xfs_buf_t *bp, xfs_buf_log_format_t *buf_f) @@ -1885,6 +1938,8 @@ xlog_recover_do_reg_buffer( unsigned int map_size = 0; int error; + trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); + switch (buf_f->blf_type) { case XFS_LI_BUF: data_map = buf_f->blf_data_map; @@ -2083,6 +2138,8 @@ xlog_recover_do_dquot_buffer( { uint type; + trace_xfs_log_recover_buf_dquot_buf(log, buf_f); + /* * Filesystems are required to send in quota flags at mount time. */ @@ -2103,7 +2160,7 @@ xlog_recover_do_dquot_buffer( if (log->l_quotaoffs_flag & type) return; - xlog_recover_do_reg_buffer(item, bp, buf_f); + xlog_recover_do_reg_buffer(mp, item, bp, buf_f); } /* @@ -2164,9 +2221,11 @@ xlog_recover_do_buffer_trans( */ cancel = xlog_recover_do_buffer_pass2(log, buf_f); if (cancel) { + trace_xfs_log_recover_buf_cancel(log, buf_f); return 0; } } + trace_xfs_log_recover_buf_recover(log, buf_f); switch (buf_f->blf_type) { case XFS_LI_BUF: blkno = buf_f->blf_blkno; @@ -2204,7 +2263,7 @@ xlog_recover_do_buffer_trans( (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); } else { - xlog_recover_do_reg_buffer(item, bp, buf_f); + xlog_recover_do_reg_buffer(mp, item, bp, buf_f); } if (error) return XFS_ERROR(error); @@ -2284,8 +2343,10 @@ xlog_recover_do_inode_trans( if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, in_f->ilf_len, 0)) { error = 0; + trace_xfs_log_recover_inode_cancel(log, in_f); goto error; } + trace_xfs_log_recover_inode_recover(log, in_f); bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, XBF_LOCK); @@ -2337,6 +2398,7 @@ xlog_recover_do_inode_trans( /* do nothing */ } else { xfs_buf_relse(bp); + trace_xfs_log_recover_inode_skip(log, in_f); error = 0; goto error; } @@ -2758,11 +2820,12 @@ xlog_recover_do_trans( int error = 0; xlog_recover_item_t *item; - error = xlog_recover_reorder_trans(trans); + error = xlog_recover_reorder_trans(log, trans, pass); if (error) return error; list_for_each_entry(item, &trans->r_itemq, ri_list) { + trace_xfs_log_recover_item_recover(log, trans, item, pass); switch (ITEM_TYPE(item)) { case XFS_LI_BUF: error = xlog_recover_do_buffer_trans(log, item, pass); @@ -2919,8 +2982,9 @@ xlog_recover_process_data( error = xlog_recover_unmount_trans(trans); break; case XLOG_WAS_CONT_TRANS: - error = xlog_recover_add_to_cont_trans(trans, - dp, be32_to_cpu(ohead->oh_len)); + error = xlog_recover_add_to_cont_trans(log, + trans, dp, + be32_to_cpu(ohead->oh_len)); break; case XLOG_START_TRANS: xlog_warn( @@ -2930,7 +2994,7 @@ xlog_recover_process_data( break; case 0: case XLOG_CONTINUE_TRANS: - error = xlog_recover_add_to_trans(trans, + error = xlog_recover_add_to_trans(log, trans, dp, be32_to_cpu(ohead->oh_len)); break; default: @@ -3331,42 +3395,6 @@ xlog_pack_data( } } -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) -STATIC void -xlog_unpack_data_checksum( - xlog_rec_header_t *rhead, - xfs_caddr_t dp, - xlog_t *log) -{ - __be32 *up = (__be32 *)dp; - uint chksum = 0; - int i; - - /* divide length by 4 to get # words */ - for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) { - chksum ^= be32_to_cpu(*up); - up++; - } - if (chksum != be32_to_cpu(rhead->h_chksum)) { - if (rhead->h_chksum || - ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) { - cmn_err(CE_DEBUG, - "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n", - be32_to_cpu(rhead->h_chksum), chksum); - cmn_err(CE_DEBUG, -"XFS: Disregard message if filesystem was created with non-DEBUG kernel"); - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - cmn_err(CE_DEBUG, - "XFS: LogR this is a LogV2 filesystem\n"); - } - log->l_flags |= XLOG_CHKSUM_MISMATCH; - } - } -} -#else -#define xlog_unpack_data_checksum(rhead, dp, log) -#endif - STATIC void xlog_unpack_data( xlog_rec_header_t *rhead, @@ -3390,8 +3418,6 @@ xlog_unpack_data( dp += BBSIZE; } } - - xlog_unpack_data_checksum(rhead, dp, log); } STATIC int @@ -3490,7 +3516,7 @@ xlog_do_recovery_pass( hblks = 1; } } else { - ASSERT(log->l_sectbb_log == 0); + ASSERT(log->l_sectBBsize == 1); hblks = 1; hbp = xlog_get_bp(log, 1); h_size = XLOG_BIG_RECORD_BSIZE; @@ -3946,10 +3972,6 @@ xlog_recover_check_summary( xfs_agf_t *agfp; xfs_buf_t *agfbp; xfs_buf_t *agibp; - xfs_buf_t *sbbp; -#ifdef XFS_LOUD_RECOVERY - xfs_sb_t *sbp; -#endif xfs_agnumber_t agno; __uint64_t freeblks; __uint64_t itotal; @@ -3984,30 +4006,5 @@ xlog_recover_check_summary( xfs_buf_relse(agibp); } } - - sbbp = xfs_getsb(mp, 0); -#ifdef XFS_LOUD_RECOVERY - sbp = &mp->m_sb; - xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp)); - cmn_err(CE_NOTE, - "xlog_recover_check_summary: sb_icount %Lu itotal %Lu", - sbp->sb_icount, itotal); - cmn_err(CE_NOTE, - "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu", - sbp->sb_ifree, ifree); - cmn_err(CE_NOTE, - "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu", - sbp->sb_fdblocks, freeblks); -#if 0 - /* - * This is turned off until I account for the allocation - * btree blocks which live in free space. - */ - ASSERT(sbp->sb_icount == itotal); - ASSERT(sbp->sb_ifree == ifree); - ASSERT(sbp->sb_fdblocks == freeblks); -#endif -#endif - xfs_buf_relse(sbbp); } #endif /* DEBUG */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index e79b56b..d7bf38c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1405,13 +1405,6 @@ xfs_mountfs( xfs_qm_mount_quotas(mp); } -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) - if (XFS_IS_QUOTA_ON(mp)) - xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on"); - else - xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on"); -#endif - /* * Now we are mounted, reserve a small amount of unused space for * privileged transactions. This is needed so that transaction diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index fdcab3f..e0e64b1 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -201,9 +201,6 @@ typedef struct xfs_qoff_logformat { #define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ #define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ #define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ -#define XFS_QMOPT_QUOTAOFF 0x0000080 /* quotas are being turned off */ -#define XFS_QMOPT_UMOUNTING 0x0000100 /* filesys is being unmounted */ -#define XFS_QMOPT_DOLOG 0x0000200 /* log buf changes (in quotacheck) */ #define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ #define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index f73e358..be578ec 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -45,23 +45,12 @@ #include "xfs_trans_space.h" #include "xfs_inode_item.h" - -STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *); -STATIC uint xfs_trans_count_vecs(xfs_trans_t *); -STATIC void xfs_trans_fill_vecs(xfs_trans_t *, xfs_log_iovec_t *); -STATIC void xfs_trans_uncommit(xfs_trans_t *, uint); -STATIC void xfs_trans_committed(xfs_trans_t *, int); -STATIC void xfs_trans_chunk_committed(xfs_log_item_chunk_t *, xfs_lsn_t, int); -STATIC void xfs_trans_free(xfs_trans_t *); - kmem_zone_t *xfs_trans_zone; - /* * Reservation functions here avoid a huge stack in xfs_trans_init * due to register overflow from temporaries in the calculations. */ - STATIC uint xfs_calc_write_reservation(xfs_mount_t *mp) { @@ -261,6 +250,19 @@ _xfs_trans_alloc( } /* + * Free the transaction structure. If there is more clean up + * to do when the structure is freed, add it here. + */ +STATIC void +xfs_trans_free( + xfs_trans_t *tp) +{ + atomic_dec(&tp->t_mountp->m_active_trans); + xfs_trans_free_dqinfo(tp); + kmem_zone_free(xfs_trans_zone, tp); +} + +/* * This is called to create a new transaction which will share the * permanent log reservation of the given transaction. The remaining * unused block and rt extent reservations are also inherited. This @@ -764,94 +766,278 @@ xfs_trans_unreserve_and_mod_sb( } } +/* + * Total up the number of log iovecs needed to commit this + * transaction. The transaction itself needs one for the + * transaction header. Ask each dirty item in turn how many + * it needs to get the total. + */ +static uint +xfs_trans_count_vecs( + struct xfs_trans *tp) +{ + int nvecs; + xfs_log_item_desc_t *lidp; + + nvecs = 1; + lidp = xfs_trans_first_item(tp); + ASSERT(lidp != NULL); + + /* In the non-debug case we need to start bailing out if we + * didn't find a log_item here, return zero and let trans_commit + * deal with it. + */ + if (lidp == NULL) + return 0; + + while (lidp != NULL) { + /* + * Skip items which aren't dirty in this transaction. + */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) { + lidp = xfs_trans_next_item(tp, lidp); + continue; + } + lidp->lid_size = IOP_SIZE(lidp->lid_item); + nvecs += lidp->lid_size; + lidp = xfs_trans_next_item(tp, lidp); + } + + return nvecs; +} /* - * xfs_trans_commit + * Fill in the vector with pointers to data to be logged + * by this transaction. The transaction header takes + * the first vector, and then each dirty item takes the + * number of vectors it indicated it needed in xfs_trans_count_vecs(). * - * Commit the given transaction to the log a/synchronously. + * As each item fills in the entries it needs, also pin the item + * so that it cannot be flushed out until the log write completes. + */ +static void +xfs_trans_fill_vecs( + struct xfs_trans *tp, + struct xfs_log_iovec *log_vector) +{ + xfs_log_item_desc_t *lidp; + struct xfs_log_iovec *vecp; + uint nitems; + + /* + * Skip over the entry for the transaction header, we'll + * fill that in at the end. + */ + vecp = log_vector + 1; + + nitems = 0; + lidp = xfs_trans_first_item(tp); + ASSERT(lidp); + while (lidp) { + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) { + lidp = xfs_trans_next_item(tp, lidp); + continue; + } + + /* + * The item may be marked dirty but not log anything. This can + * be used to get called when a transaction is committed. + */ + if (lidp->lid_size) + nitems++; + IOP_FORMAT(lidp->lid_item, vecp); + vecp += lidp->lid_size; + IOP_PIN(lidp->lid_item); + lidp = xfs_trans_next_item(tp, lidp); + } + + /* + * Now that we've counted the number of items in this transaction, fill + * in the transaction header. Note that the transaction header does not + * have a log item. + */ + tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC; + tp->t_header.th_type = tp->t_type; + tp->t_header.th_num_items = nitems; + log_vector->i_addr = (xfs_caddr_t)&tp->t_header; + log_vector->i_len = sizeof(xfs_trans_header_t); + log_vector->i_type = XLOG_REG_TYPE_TRANSHDR; +} + +/* + * The committed item processing consists of calling the committed routine of + * each logged item, updating the item's position in the AIL if necessary, and + * unpinning each item. If the committed routine returns -1, then do nothing + * further with the item because it may have been freed. * - * XFS disk error handling mechanism is not based on a typical - * transaction abort mechanism. Logically after the filesystem - * gets marked 'SHUTDOWN', we can't let any new transactions - * be durable - ie. committed to disk - because some metadata might - * be inconsistent. In such cases, this returns an error, and the - * caller may assume that all locked objects joined to the transaction - * have already been unlocked as if the commit had succeeded. - * Do not reference the transaction structure after this call. + * Since items are unlocked when they are copied to the incore log, it is + * possible for two transactions to be completing and manipulating the same + * item simultaneously. The AIL lock will protect the lsn field of each item. + * The value of this field can never go backwards. + * + * We unpin the items after repositioning them in the AIL, because otherwise + * they could be immediately flushed and we'd have to race with the flusher + * trying to pull the item from the AIL as we add it. */ - /*ARGSUSED*/ -int -_xfs_trans_commit( - xfs_trans_t *tp, - uint flags, - int *log_flushed) +static void +xfs_trans_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t commit_lsn, + int aborted) { - xfs_log_iovec_t *log_vector; - int nvec; - xfs_mount_t *mp; - xfs_lsn_t commit_lsn; - /* REFERENCED */ - int error; - int log_flags; - int sync; -#define XFS_TRANS_LOGVEC_COUNT 16 - xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT]; - struct xlog_in_core *commit_iclog; - int shutdown; + xfs_lsn_t item_lsn; + struct xfs_ail *ailp; - commit_lsn = -1; + if (aborted) + lip->li_flags |= XFS_LI_ABORTED; + item_lsn = IOP_COMMITTED(lip, commit_lsn); + + /* If the committed routine returns -1, item has been freed. */ + if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) + return; /* - * Determine whether this commit is releasing a permanent - * log reservation or not. + * If the returned lsn is greater than what it contained before, update + * the location of the item in the AIL. If it is not, then do nothing. + * Items can never move backwards in the AIL. + * + * While the new lsn should usually be greater, it is possible that a + * later transaction completing simultaneously with an earlier one + * using the same item could complete first with a higher lsn. This + * would cause the earlier transaction to fail the test below. */ - if (flags & XFS_TRANS_RELEASE_LOG_RES) { - ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - log_flags = XFS_LOG_REL_PERM_RESERV; + ailp = lip->li_ailp; + spin_lock(&ailp->xa_lock); + if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { + /* + * This will set the item's lsn to item_lsn and update the + * position of the item in the AIL. + * + * xfs_trans_ail_update() drops the AIL lock. + */ + xfs_trans_ail_update(ailp, lip, item_lsn); } else { - log_flags = 0; + spin_unlock(&ailp->xa_lock); } - mp = tp->t_mountp; /* - * If there is nothing to be logged by the transaction, - * then unlock all of the items associated with the - * transaction and free the transaction structure. - * Also make sure to return any reserved blocks to - * the free pool. + * Now that we've repositioned the item in the AIL, unpin it so it can + * be flushed. Pass information about buffer stale state down from the + * log item flags, if anyone else stales the buffer we do not want to + * pay any attention to it. */ -shut_us_down: - shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0; - if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) { - xfs_trans_unreserve_and_mod_sb(tp); + IOP_UNPIN(lip); +} + +/* Clear all the per-AG busy list items listed in this transaction */ +static void +xfs_trans_clear_busy_extents( + struct xfs_trans *tp) +{ + xfs_log_busy_chunk_t *lbcp; + xfs_log_busy_slot_t *lbsp; + int i; + + for (lbcp = &tp->t_busy; lbcp != NULL; lbcp = lbcp->lbc_next) { + i = 0; + for (lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) { + if (XFS_LBC_ISFREE(lbcp, i)) + continue; + xfs_alloc_clear_busy(tp, lbsp->lbc_ag, lbsp->lbc_idx); + } + } + xfs_trans_free_busy(tp); +} + +/* + * This is typically called by the LM when a transaction has been fully + * committed to disk. It needs to unpin the items which have + * been logged by the transaction and update their positions + * in the AIL if necessary. + * + * This also gets called when the transactions didn't get written out + * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then. + */ +STATIC void +xfs_trans_committed( + struct xfs_trans *tp, + int abortflag) +{ + xfs_log_item_desc_t *lidp; + xfs_log_item_chunk_t *licp; + xfs_log_item_chunk_t *next_licp; + + /* Call the transaction's completion callback if there is one. */ + if (tp->t_callback != NULL) + tp->t_callback(tp, tp->t_callarg); + + for (lidp = xfs_trans_first_item(tp); + lidp != NULL; + lidp = xfs_trans_next_item(tp, lidp)) { + xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag); + } + + /* free the item chunks, ignoring the embedded chunk */ + for (licp = tp->t_items.lic_next; licp != NULL; licp = next_licp) { + next_licp = licp->lic_next; + kmem_free(licp); + } + + xfs_trans_clear_busy_extents(tp); + xfs_trans_free(tp); +} + +/* + * Called from the trans_commit code when we notice that + * the filesystem is in the middle of a forced shutdown. + */ +STATIC void +xfs_trans_uncommit( + struct xfs_trans *tp, + uint flags) +{ + xfs_log_item_desc_t *lidp; + + for (lidp = xfs_trans_first_item(tp); + lidp != NULL; + lidp = xfs_trans_next_item(tp, lidp)) { /* - * It is indeed possible for the transaction to be - * not dirty but the dqinfo portion to be. All that - * means is that we have some (non-persistent) quota - * reservations that need to be unreserved. + * Unpin all but those that aren't dirty. */ - xfs_trans_unreserve_and_mod_dquots(tp); - if (tp->t_ticket) { - commit_lsn = xfs_log_done(mp, tp->t_ticket, - NULL, log_flags); - if (commit_lsn == -1 && !shutdown) - shutdown = XFS_ERROR(EIO); - } - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0); - xfs_trans_free_busy(tp); - xfs_trans_free(tp); - XFS_STATS_INC(xs_trans_empty); - return (shutdown); + if (lidp->lid_flags & XFS_LID_DIRTY) + IOP_UNPIN_REMOVE(lidp->lid_item, tp); } - ASSERT(tp->t_ticket != NULL); - /* - * If we need to update the superblock, then do it now. - */ - if (tp->t_flags & XFS_TRANS_SB_DIRTY) - xfs_trans_apply_sb_deltas(tp); - xfs_trans_apply_dquot_deltas(tp); + xfs_trans_unreserve_and_mod_sb(tp); + xfs_trans_unreserve_and_mod_dquots(tp); + + xfs_trans_free_items(tp, flags); + xfs_trans_free_busy(tp); + xfs_trans_free(tp); +} + +/* + * Format the transaction direct to the iclog. This isolates the physical + * transaction commit operation from the logical operation and hence allows + * other methods to be introduced without affecting the existing commit path. + */ +static int +xfs_trans_commit_iclog( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_lsn_t *commit_lsn, + int flags) +{ + int shutdown; + int error; + int log_flags = 0; + struct xlog_in_core *commit_iclog; +#define XFS_TRANS_LOGVEC_COUNT 16 + struct xfs_log_iovec log_vector_fast[XFS_TRANS_LOGVEC_COUNT]; + struct xfs_log_iovec *log_vector; + uint nvec; + /* * Ask each log item how many log_vector entries it will @@ -861,8 +1047,7 @@ shut_us_down: */ nvec = xfs_trans_count_vecs(tp); if (nvec == 0) { - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - goto shut_us_down; + return ENOMEM; /* triggers a shutdown! */ } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) { log_vector = log_vector_fast; } else { @@ -877,6 +1062,9 @@ shut_us_down: */ xfs_trans_fill_vecs(tp, log_vector); + if (flags & XFS_TRANS_RELEASE_LOG_RES) + log_flags = XFS_LOG_REL_PERM_RESERV; + error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn)); /* @@ -884,18 +1072,17 @@ shut_us_down: * at any time after this call. However, all the items associated * with the transaction are still locked and pinned in memory. */ - commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); + *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); - tp->t_commit_lsn = commit_lsn; - if (nvec > XFS_TRANS_LOGVEC_COUNT) { + tp->t_commit_lsn = *commit_lsn; + if (nvec > XFS_TRANS_LOGVEC_COUNT) kmem_free(log_vector); - } /* * If we got a log write error. Unpin the logitems that we * had pinned, clean up, free trans structure, and return error. */ - if (error || commit_lsn == -1) { + if (error || *commit_lsn == -1) { current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT); return XFS_ERROR(EIO); @@ -909,8 +1096,6 @@ shut_us_down: */ xfs_trans_unreserve_and_mod_sb(tp); - sync = tp->t_flags & XFS_TRANS_SYNC; - /* * Tell the LM to call the transaction completion routine * when the log write with LSN commit_lsn completes (e.g. @@ -953,7 +1138,7 @@ shut_us_down: * the commit lsn of this transaction for dependency tracking * purposes. */ - xfs_trans_unlock_items(tp, commit_lsn); + xfs_trans_unlock_items(tp, *commit_lsn); /* * If we detected a log error earlier, finish committing @@ -973,156 +1158,114 @@ shut_us_down: * and the items are released we can finally allow the iclog to * go to disk. */ - error = xfs_log_release_iclog(mp, commit_iclog); - - /* - * If the transaction needs to be synchronous, then force the - * log out now and wait for it. - */ - if (sync) { - if (!error) { - error = _xfs_log_force_lsn(mp, commit_lsn, - XFS_LOG_SYNC, log_flushed); - } - XFS_STATS_INC(xs_trans_sync); - } else { - XFS_STATS_INC(xs_trans_async); - } - - return (error); + return xfs_log_release_iclog(mp, commit_iclog); } /* - * Total up the number of log iovecs needed to commit this - * transaction. The transaction itself needs one for the - * transaction header. Ask each dirty item in turn how many - * it needs to get the total. + * xfs_trans_commit + * + * Commit the given transaction to the log a/synchronously. + * + * XFS disk error handling mechanism is not based on a typical + * transaction abort mechanism. Logically after the filesystem + * gets marked 'SHUTDOWN', we can't let any new transactions + * be durable - ie. committed to disk - because some metadata might + * be inconsistent. In such cases, this returns an error, and the + * caller may assume that all locked objects joined to the transaction + * have already been unlocked as if the commit had succeeded. + * Do not reference the transaction structure after this call. */ -STATIC uint -xfs_trans_count_vecs( - xfs_trans_t *tp) +int +_xfs_trans_commit( + struct xfs_trans *tp, + uint flags, + int *log_flushed) { - int nvecs; - xfs_log_item_desc_t *lidp; + struct xfs_mount *mp = tp->t_mountp; + xfs_lsn_t commit_lsn = -1; + int error = 0; + int log_flags = 0; + int sync = tp->t_flags & XFS_TRANS_SYNC; - nvecs = 1; - lidp = xfs_trans_first_item(tp); - ASSERT(lidp != NULL); - - /* In the non-debug case we need to start bailing out if we - * didn't find a log_item here, return zero and let trans_commit - * deal with it. + /* + * Determine whether this commit is releasing a permanent + * log reservation or not. */ - if (lidp == NULL) - return 0; - - while (lidp != NULL) { - /* - * Skip items which aren't dirty in this transaction. - */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) { - lidp = xfs_trans_next_item(tp, lidp); - continue; - } - lidp->lid_size = IOP_SIZE(lidp->lid_item); - nvecs += lidp->lid_size; - lidp = xfs_trans_next_item(tp, lidp); + if (flags & XFS_TRANS_RELEASE_LOG_RES) { + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + log_flags = XFS_LOG_REL_PERM_RESERV; } - return nvecs; -} - -/* - * Called from the trans_commit code when we notice that - * the filesystem is in the middle of a forced shutdown. - */ -STATIC void -xfs_trans_uncommit( - xfs_trans_t *tp, - uint flags) -{ - xfs_log_item_desc_t *lidp; + /* + * If there is nothing to be logged by the transaction, + * then unlock all of the items associated with the + * transaction and free the transaction structure. + * Also make sure to return any reserved blocks to + * the free pool. + */ + if (!(tp->t_flags & XFS_TRANS_DIRTY)) + goto out_unreserve; - for (lidp = xfs_trans_first_item(tp); - lidp != NULL; - lidp = xfs_trans_next_item(tp, lidp)) { - /* - * Unpin all but those that aren't dirty. - */ - if (lidp->lid_flags & XFS_LID_DIRTY) - IOP_UNPIN_REMOVE(lidp->lid_item, tp); + if (XFS_FORCED_SHUTDOWN(mp)) { + error = XFS_ERROR(EIO); + goto out_unreserve; } - xfs_trans_unreserve_and_mod_sb(tp); - xfs_trans_unreserve_and_mod_dquots(tp); + ASSERT(tp->t_ticket != NULL); - xfs_trans_free_items(tp, flags); - xfs_trans_free_busy(tp); - xfs_trans_free(tp); -} + /* + * If we need to update the superblock, then do it now. + */ + if (tp->t_flags & XFS_TRANS_SB_DIRTY) + xfs_trans_apply_sb_deltas(tp); + xfs_trans_apply_dquot_deltas(tp); -/* - * Fill in the vector with pointers to data to be logged - * by this transaction. The transaction header takes - * the first vector, and then each dirty item takes the - * number of vectors it indicated it needed in xfs_trans_count_vecs(). - * - * As each item fills in the entries it needs, also pin the item - * so that it cannot be flushed out until the log write completes. - */ -STATIC void -xfs_trans_fill_vecs( - xfs_trans_t *tp, - xfs_log_iovec_t *log_vector) -{ - xfs_log_item_desc_t *lidp; - xfs_log_iovec_t *vecp; - uint nitems; + error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags); + if (error == ENOMEM) { + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + error = XFS_ERROR(EIO); + goto out_unreserve; + } /* - * Skip over the entry for the transaction header, we'll - * fill that in at the end. + * If the transaction needs to be synchronous, then force the + * log out now and wait for it. */ - vecp = log_vector + 1; /* pointer arithmetic */ - - nitems = 0; - lidp = xfs_trans_first_item(tp); - ASSERT(lidp != NULL); - while (lidp != NULL) { - /* - * Skip items which aren't dirty in this transaction. - */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) { - lidp = xfs_trans_next_item(tp, lidp); - continue; - } - /* - * The item may be marked dirty but not log anything. - * This can be used to get called when a transaction - * is committed. - */ - if (lidp->lid_size) { - nitems++; + if (sync) { + if (!error) { + error = _xfs_log_force_lsn(mp, commit_lsn, + XFS_LOG_SYNC, log_flushed); } - IOP_FORMAT(lidp->lid_item, vecp); - vecp += lidp->lid_size; /* pointer arithmetic */ - IOP_PIN(lidp->lid_item); - lidp = xfs_trans_next_item(tp, lidp); + XFS_STATS_INC(xs_trans_sync); + } else { + XFS_STATS_INC(xs_trans_async); } + return error; + +out_unreserve: + xfs_trans_unreserve_and_mod_sb(tp); + /* - * Now that we've counted the number of items in this - * transaction, fill in the transaction header. + * It is indeed possible for the transaction to be not dirty but + * the dqinfo portion to be. All that means is that we have some + * (non-persistent) quota reservations that need to be unreserved. */ - tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC; - tp->t_header.th_type = tp->t_type; - tp->t_header.th_num_items = nitems; - log_vector->i_addr = (xfs_caddr_t)&tp->t_header; - log_vector->i_len = sizeof(xfs_trans_header_t); - log_vector->i_type = XLOG_REG_TYPE_TRANSHDR; -} + xfs_trans_unreserve_and_mod_dquots(tp); + if (tp->t_ticket) { + commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + if (commit_lsn == -1 && !error) + error = XFS_ERROR(EIO); + } + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + xfs_trans_free_items(tp, error ? XFS_TRANS_ABORT : 0); + xfs_trans_free_busy(tp); + xfs_trans_free(tp); + XFS_STATS_INC(xs_trans_empty); + return error; +} /* * Unlock all of the transaction's items and free the transaction. @@ -1200,20 +1343,6 @@ xfs_trans_cancel( xfs_trans_free(tp); } - -/* - * Free the transaction structure. If there is more clean up - * to do when the structure is freed, add it here. - */ -STATIC void -xfs_trans_free( - xfs_trans_t *tp) -{ - atomic_dec(&tp->t_mountp->m_active_trans); - xfs_trans_free_dqinfo(tp); - kmem_zone_free(xfs_trans_zone, tp); -} - /* * Roll from one trans in the sequence of PERMANENT transactions to * the next: permanent transactions are only flushed out when @@ -1283,174 +1412,3 @@ xfs_trans_roll( xfs_trans_ihold(trans, dp); return 0; } - -/* - * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item(). - * - * This is typically called by the LM when a transaction has been fully - * committed to disk. It needs to unpin the items which have - * been logged by the transaction and update their positions - * in the AIL if necessary. - * This also gets called when the transactions didn't get written out - * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then. - * - * Call xfs_trans_chunk_committed() to process the items in - * each chunk. - */ -STATIC void -xfs_trans_committed( - xfs_trans_t *tp, - int abortflag) -{ - xfs_log_item_chunk_t *licp; - xfs_log_item_chunk_t *next_licp; - xfs_log_busy_chunk_t *lbcp; - xfs_log_busy_slot_t *lbsp; - int i; - - /* - * Call the transaction's completion callback if there - * is one. - */ - if (tp->t_callback != NULL) { - tp->t_callback(tp, tp->t_callarg); - } - - /* - * Special case the chunk embedded in the transaction. - */ - licp = &(tp->t_items); - if (!(xfs_lic_are_all_free(licp))) { - xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); - } - - /* - * Process the items in each chunk in turn. - */ - licp = licp->lic_next; - while (licp != NULL) { - ASSERT(!xfs_lic_are_all_free(licp)); - xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); - next_licp = licp->lic_next; - kmem_free(licp); - licp = next_licp; - } - - /* - * Clear all the per-AG busy list items listed in this transaction - */ - lbcp = &tp->t_busy; - while (lbcp != NULL) { - for (i = 0, lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) { - if (!XFS_LBC_ISFREE(lbcp, i)) { - xfs_alloc_clear_busy(tp, lbsp->lbc_ag, - lbsp->lbc_idx); - } - } - lbcp = lbcp->lbc_next; - } - xfs_trans_free_busy(tp); - - /* - * That's it for the transaction structure. Free it. - */ - xfs_trans_free(tp); -} - -/* - * This is called to perform the commit processing for each - * item described by the given chunk. - * - * The commit processing consists of unlocking items which were - * held locked with the SYNC_UNLOCK attribute, calling the committed - * routine of each logged item, updating the item's position in the AIL - * if necessary, and unpinning each item. If the committed routine - * returns -1, then do nothing further with the item because it - * may have been freed. - * - * Since items are unlocked when they are copied to the incore - * log, it is possible for two transactions to be completing - * and manipulating the same item simultaneously. The AIL lock - * will protect the lsn field of each item. The value of this - * field can never go backwards. - * - * We unpin the items after repositioning them in the AIL, because - * otherwise they could be immediately flushed and we'd have to race - * with the flusher trying to pull the item from the AIL as we add it. - */ -STATIC void -xfs_trans_chunk_committed( - xfs_log_item_chunk_t *licp, - xfs_lsn_t lsn, - int aborted) -{ - xfs_log_item_desc_t *lidp; - xfs_log_item_t *lip; - xfs_lsn_t item_lsn; - int i; - - lidp = licp->lic_descs; - for (i = 0; i < licp->lic_unused; i++, lidp++) { - struct xfs_ail *ailp; - - if (xfs_lic_isfree(licp, i)) { - continue; - } - - lip = lidp->lid_item; - if (aborted) - lip->li_flags |= XFS_LI_ABORTED; - - /* - * Send in the ABORTED flag to the COMMITTED routine - * so that it knows whether the transaction was aborted - * or not. - */ - item_lsn = IOP_COMMITTED(lip, lsn); - - /* - * If the committed routine returns -1, make - * no more references to the item. - */ - if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) { - continue; - } - - /* - * If the returned lsn is greater than what it - * contained before, update the location of the - * item in the AIL. If it is not, then do nothing. - * Items can never move backwards in the AIL. - * - * While the new lsn should usually be greater, it - * is possible that a later transaction completing - * simultaneously with an earlier one using the - * same item could complete first with a higher lsn. - * This would cause the earlier transaction to fail - * the test below. - */ - ailp = lip->li_ailp; - spin_lock(&ailp->xa_lock); - if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { - /* - * This will set the item's lsn to item_lsn - * and update the position of the item in - * the AIL. - * - * xfs_trans_ail_update() drops the AIL lock. - */ - xfs_trans_ail_update(ailp, lip, item_lsn); - } else { - spin_unlock(&ailp->xa_lock); - } - - /* - * Now that we've repositioned the item in the AIL, - * unpin it so it can be flushed. Pass information - * about buffer stale state down from the log item - * flags, if anyone else stales the buffer we do not - * want to pay any attention to it. - */ - IOP_UNPIN(lip, lidp->lid_flags & XFS_LID_BUF_STALE); - } -} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 79c8bab..c62beee 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -49,6 +49,15 @@ typedef struct xfs_trans_header { #define XFS_LI_DQUOT 0x123d #define XFS_LI_QUOTAOFF 0x123e +#define XFS_LI_TYPE_DESC \ + { XFS_LI_EFI, "XFS_LI_EFI" }, \ + { XFS_LI_EFD, "XFS_LI_EFD" }, \ + { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \ + { XFS_LI_INODE, "XFS_LI_INODE" }, \ + { XFS_LI_BUF, "XFS_LI_BUF" }, \ + { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \ + { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" } + /* * Transaction types. Used to distinguish types of buffers. */ @@ -159,7 +168,6 @@ typedef struct xfs_log_item_desc { #define XFS_LID_DIRTY 0x1 #define XFS_LID_PINNED 0x2 -#define XFS_LID_BUF_STALE 0x8 /* * This structure is used to maintain a chunk list of log_item_desc @@ -833,7 +841,7 @@ typedef struct xfs_item_ops { uint (*iop_size)(xfs_log_item_t *); void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); void (*iop_pin)(xfs_log_item_t *); - void (*iop_unpin)(xfs_log_item_t *, int); + void (*iop_unpin)(xfs_log_item_t *); void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *); uint (*iop_trylock)(xfs_log_item_t *); void (*iop_unlock)(xfs_log_item_t *); @@ -846,7 +854,7 @@ typedef struct xfs_item_ops { #define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) #define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) #define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) -#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags) +#define IOP_UNPIN(ip) (*(ip)->li_ops->iop_unpin)(ip) #define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp) #define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) #define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index fb58636..9cd8090 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -40,11 +40,51 @@ #include "xfs_rw.h" #include "xfs_trace.h" +/* + * Check to see if a buffer matching the given parameters is already + * a part of the given transaction. + */ +STATIC struct xfs_buf * +xfs_trans_buf_item_match( + struct xfs_trans *tp, + struct xfs_buftarg *target, + xfs_daddr_t blkno, + int len) +{ + xfs_log_item_chunk_t *licp; + xfs_log_item_desc_t *lidp; + xfs_buf_log_item_t *blip; + int i; -STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *, - xfs_daddr_t, int); -STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *, - xfs_daddr_t, int); + len = BBTOB(len); + for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) { + if (xfs_lic_are_all_free(licp)) { + ASSERT(licp == &tp->t_items); + ASSERT(licp->lic_next == NULL); + return NULL; + } + + for (i = 0; i < licp->lic_unused; i++) { + /* + * Skip unoccupied slots. + */ + if (xfs_lic_isfree(licp, i)) + continue; + + lidp = xfs_lic_slot(licp, i); + blip = (xfs_buf_log_item_t *)lidp->lid_item; + if (blip->bli_item.li_type != XFS_LI_BUF) + continue; + + if (XFS_BUF_TARGET(blip->bli_buf) == target && + XFS_BUF_ADDR(blip->bli_buf) == blkno && + XFS_BUF_COUNT(blip->bli_buf) == len) + return blip->bli_buf; + } + } + + return NULL; +} /* * Add the locked buffer to the transaction. @@ -112,14 +152,6 @@ xfs_trans_bjoin( * within the transaction, just increment its lock recursion count * and return a pointer to it. * - * Use the fast path function xfs_trans_buf_item_match() or the buffer - * cache routine incore_match() to find the buffer - * if it is already owned by this transaction. - * - * If we don't already own the buffer, use get_buf() to get it. - * If it doesn't yet have an associated xfs_buf_log_item structure, - * then allocate one and add the item to this transaction. - * * If the transaction pointer is NULL, make this just a normal * get_buf() call. */ @@ -149,11 +181,7 @@ xfs_trans_get_buf(xfs_trans_t *tp, * have it locked. In this case we just increment the lock * recursion count and return the buffer to the caller. */ - if (tp->t_items.lic_next == NULL) { - bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); - } else { - bp = xfs_trans_buf_item_match_all(tp, target_dev, blkno, len); - } + bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); if (bp != NULL) { ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) @@ -259,14 +287,6 @@ int xfs_error_mod = 33; * within the transaction and already read in, just increment its * lock recursion count and return a pointer to it. * - * Use the fast path function xfs_trans_buf_item_match() or the buffer - * cache routine incore_match() to find the buffer - * if it is already owned by this transaction. - * - * If we don't already own the buffer, use read_buf() to get it. - * If it doesn't yet have an associated xfs_buf_log_item structure, - * then allocate one and add the item to this transaction. - * * If the transaction pointer is NULL, make this just a normal * read_buf() call. */ @@ -328,11 +348,7 @@ xfs_trans_read_buf( * If the buffer is not yet read in, then we read it in, increment * the lock recursion count, and return it to the caller. */ - if (tp->t_items.lic_next == NULL) { - bp = xfs_trans_buf_item_match(tp, target, blkno, len); - } else { - bp = xfs_trans_buf_item_match_all(tp, target, blkno, len); - } + bp = xfs_trans_buf_item_match(tp, target, blkno, len); if (bp != NULL) { ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); @@ -696,7 +712,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, tp->t_flags |= XFS_TRANS_DIRTY; lidp->lid_flags |= XFS_LID_DIRTY; - lidp->lid_flags &= ~XFS_LID_BUF_STALE; bip->bli_flags |= XFS_BLI_LOGGED; xfs_buf_item_log(bip, first, last); } @@ -782,7 +797,7 @@ xfs_trans_binval( bip->bli_format.blf_flags |= XFS_BLI_CANCEL; memset((char *)(bip->bli_format.blf_data_map), 0, (bip->bli_format.blf_map_size * sizeof(uint))); - lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE; + lidp->lid_flags |= XFS_LID_DIRTY; tp->t_flags |= XFS_TRANS_DIRTY; } @@ -902,111 +917,3 @@ xfs_trans_dquot_buf( bip->bli_format.blf_flags |= type; } - -/* - * Check to see if a buffer matching the given parameters is already - * a part of the given transaction. Only check the first, embedded - * chunk, since we don't want to spend all day scanning large transactions. - */ -STATIC xfs_buf_t * -xfs_trans_buf_item_match( - xfs_trans_t *tp, - xfs_buftarg_t *target, - xfs_daddr_t blkno, - int len) -{ - xfs_log_item_chunk_t *licp; - xfs_log_item_desc_t *lidp; - xfs_buf_log_item_t *blip; - xfs_buf_t *bp; - int i; - - bp = NULL; - len = BBTOB(len); - licp = &tp->t_items; - if (!xfs_lic_are_all_free(licp)) { - for (i = 0; i < licp->lic_unused; i++) { - /* - * Skip unoccupied slots. - */ - if (xfs_lic_isfree(licp, i)) { - continue; - } - - lidp = xfs_lic_slot(licp, i); - blip = (xfs_buf_log_item_t *)lidp->lid_item; - if (blip->bli_item.li_type != XFS_LI_BUF) { - continue; - } - - bp = blip->bli_buf; - if ((XFS_BUF_TARGET(bp) == target) && - (XFS_BUF_ADDR(bp) == blkno) && - (XFS_BUF_COUNT(bp) == len)) { - /* - * We found it. Break out and - * return the pointer to the buffer. - */ - break; - } else { - bp = NULL; - } - } - } - return bp; -} - -/* - * Check to see if a buffer matching the given parameters is already - * a part of the given transaction. Check all the chunks, we - * want to be thorough. - */ -STATIC xfs_buf_t * -xfs_trans_buf_item_match_all( - xfs_trans_t *tp, - xfs_buftarg_t *target, - xfs_daddr_t blkno, - int len) -{ - xfs_log_item_chunk_t *licp; - xfs_log_item_desc_t *lidp; - xfs_buf_log_item_t *blip; - xfs_buf_t *bp; - int i; - - bp = NULL; - len = BBTOB(len); - for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) { - if (xfs_lic_are_all_free(licp)) { - ASSERT(licp == &tp->t_items); - ASSERT(licp->lic_next == NULL); - return NULL; - } - for (i = 0; i < licp->lic_unused; i++) { - /* - * Skip unoccupied slots. - */ - if (xfs_lic_isfree(licp, i)) { - continue; - } - - lidp = xfs_lic_slot(licp, i); - blip = (xfs_buf_log_item_t *)lidp->lid_item; - if (blip->bli_item.li_type != XFS_LI_BUF) { - continue; - } - - bp = blip->bli_buf; - if ((XFS_BUF_TARGET(bp) == target) && - (XFS_BUF_ADDR(bp) == blkno) && - (XFS_BUF_COUNT(bp) == len)) { - /* - * We found it. Break out and - * return the pointer to the buffer. - */ - return bp; - } - } - } - return NULL; -} |