diff options
Diffstat (limited to 'fs/ocfs2')
-rw-r--r-- | fs/ocfs2/alloc.c | 2446 | ||||
-rw-r--r-- | fs/ocfs2/alloc.h | 3 | ||||
-rw-r--r-- | fs/ocfs2/dir.c | 6 | ||||
-rw-r--r-- | fs/ocfs2/file.c | 23 | ||||
-rw-r--r-- | fs/ocfs2/file.h | 1 | ||||
-rw-r--r-- | fs/ocfs2/namei.c | 5 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2.h | 7 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2_fs.h | 8 |
8 files changed, 2002 insertions, 497 deletions
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f27e5378..a966968 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -47,62 +47,230 @@ #include "buffer_head_io.h" -static int ocfs2_extent_contig(struct inode *inode, - struct ocfs2_extent_rec *ext, - u64 blkno); +static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); -static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - int wanted, - struct ocfs2_alloc_context *meta_ac, - struct buffer_head *bhs[]); +/* + * Structures which describe a path through a btree, and functions to + * manipulate them. + * + * The idea here is to be as generic as possible with the tree + * manipulation code. + */ +struct ocfs2_path_item { + struct buffer_head *bh; + struct ocfs2_extent_list *el; +}; -static int ocfs2_add_branch(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - struct buffer_head *eb_bh, - struct buffer_head *last_eb_bh, - struct ocfs2_alloc_context *meta_ac); +#define OCFS2_MAX_PATH_DEPTH 5 -static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - struct ocfs2_alloc_context *meta_ac, - struct buffer_head **ret_new_eb_bh); +struct ocfs2_path { + int p_tree_depth; + struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; +}; -static int ocfs2_do_insert_extent(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - u64 blkno, - u32 new_clusters); +#define path_root_bh(_path) ((_path)->p_node[0].bh) +#define path_root_el(_path) ((_path)->p_node[0].el) +#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) +#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) +#define path_num_items(_path) ((_path)->p_tree_depth + 1) -static int ocfs2_find_branch_target(struct ocfs2_super *osb, - struct inode *inode, - struct buffer_head *fe_bh, - struct buffer_head **target_bh); +/* + * Reset the actual path elements so that we can re-use the structure + * to build another path. Generally, this involves freeing the buffer + * heads. + */ +static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) +{ + int i, start = 0, depth = 0; + struct ocfs2_path_item *node; -static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, - struct inode *inode, - struct ocfs2_dinode *fe, - unsigned int new_i_clusters, - struct buffer_head *old_last_eb, - struct buffer_head **new_last_eb); + if (keep_root) + start = 1; -static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); + for(i = start; i < path_num_items(path); i++) { + node = &path->p_node[i]; -static int ocfs2_extent_contig(struct inode *inode, - struct ocfs2_extent_rec *ext, - u64 blkno) + brelse(node->bh); + node->bh = NULL; + node->el = NULL; + } + + /* + * Tree depth may change during truncate, or insert. If we're + * keeping the root extent list, then make sure that our path + * structure reflects the proper depth. + */ + if (keep_root) + depth = le16_to_cpu(path_root_el(path)->l_tree_depth); + + path->p_tree_depth = depth; +} + +static void ocfs2_free_path(struct ocfs2_path *path) +{ + if (path) { + ocfs2_reinit_path(path, 0); + kfree(path); + } +} + +/* + * Make the *dest path the same as src and re-initialize src path to + * have a root only. + */ +static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src) +{ + int i; + + BUG_ON(path_root_bh(dest) != path_root_bh(src)); + + for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { + brelse(dest->p_node[i].bh); + + dest->p_node[i].bh = src->p_node[i].bh; + dest->p_node[i].el = src->p_node[i].el; + + src->p_node[i].bh = NULL; + src->p_node[i].el = NULL; + } +} + +/* + * Insert an extent block at given index. + * + * This will not take an additional reference on eb_bh. + */ +static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index, + struct buffer_head *eb_bh) +{ + struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data; + + /* + * Right now, no root bh is an extent block, so this helps + * catch code errors with dinode trees. The assertion can be + * safely removed if we ever need to insert extent block + * structures at the root. + */ + BUG_ON(index == 0); + + path->p_node[index].bh = eb_bh; + path->p_node[index].el = &eb->h_list; +} + +static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, + struct ocfs2_extent_list *root_el) +{ + struct ocfs2_path *path; + + BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH); + + path = kzalloc(sizeof(*path), GFP_NOFS); + if (path) { + path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth); + get_bh(root_bh); + path_root_bh(path) = root_bh; + path_root_el(path) = root_el; + } + + return path; +} + +/* + * Allocate and initialize a new path based on a disk inode tree. + */ +static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_list *el = &di->id2.i_list; + + return ocfs2_new_path(di_bh, el); +} + +/* + * Convenience function to journal all components in a path. + */ +static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, + struct ocfs2_path *path) +{ + int i, ret = 0; + + if (!path) + goto out; + + for(i = 0; i < path_num_items(path); i++) { + ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + +out: + return ret; +} + +enum ocfs2_contig_type { + CONTIG_NONE = 0, + CONTIG_LEFT, + CONTIG_RIGHT +}; + +static int ocfs2_block_extent_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + u64 blkno) { return blkno == (le64_to_cpu(ext->e_blkno) + - ocfs2_clusters_to_blocks(inode->i_sb, + ocfs2_clusters_to_blocks(sb, le32_to_cpu(ext->e_clusters))); } +static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, + struct ocfs2_extent_rec *right) +{ + return (le32_to_cpu(left->e_cpos) + le32_to_cpu(left->e_clusters) == + le32_to_cpu(right->e_cpos)); +} + +static enum ocfs2_contig_type + ocfs2_extent_contig(struct inode *inode, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec) +{ + u64 blkno = le64_to_cpu(insert_rec->e_blkno); + + if (ocfs2_extents_adjacent(ext, insert_rec) && + ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) + return CONTIG_RIGHT; + + blkno = le64_to_cpu(ext->e_blkno); + if (ocfs2_extents_adjacent(insert_rec, ext) && + ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno)) + return CONTIG_LEFT; + + return CONTIG_NONE; +} + +/* + * NOTE: We can have pretty much any combination of contiguousness and + * appending. + * + * The usefulness of APPEND_TAIL is more in that it lets us know that + * we'll have to update the path to that leaf. + */ +enum ocfs2_append_type { + APPEND_NONE = 0, + APPEND_TAIL, +}; + +struct ocfs2_insert_type { + enum ocfs2_append_type ins_appending; + enum ocfs2_contig_type ins_contig; + int ins_contig_index; + int ins_free_records; + int ins_tree_depth; +}; + /* * How many free extents have we got before we need more meta data? */ @@ -242,6 +410,28 @@ bail: } /* + * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth(). + * + * Returns the sum of the rightmost extent rec logical offset and + * cluster count. + * + * ocfs2_add_branch() uses this to determine what logical cluster + * value should be populated into the leftmost new branch records. + * + * ocfs2_shift_tree_depth() uses this to determine the # clusters + * value for the new topmost tree record. + */ +static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) +{ + int i; + + i = le16_to_cpu(el->l_next_free_rec) - 1; + + return le32_to_cpu(el->l_recs[i].e_cpos) + + le32_to_cpu(el->l_recs[i].e_clusters); +} + +/* * Add an entire tree branch to our inode. eb_bh is the extent block * to start at, if we don't want to start the branch at the dinode * structure. @@ -268,6 +458,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *eb_el; struct ocfs2_extent_list *el; + u32 new_cpos; mlog_entry_void(); @@ -302,6 +493,9 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, goto bail; } + eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; + new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); + /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be * linked with the rest of the tree. * conversly, new_eb_bhs[0] is the new bottommost leaf. @@ -330,7 +524,11 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, eb->h_next_leaf_blk = 0; eb_el->l_tree_depth = cpu_to_le16(i); eb_el->l_next_free_rec = cpu_to_le16(1); - eb_el->l_recs[0].e_cpos = fe->i_clusters; + /* + * This actually counts as an empty extent as + * c_clusters == 0 + */ + eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos); eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); eb_el->l_recs[0].e_clusters = cpu_to_le32(0); if (!eb_el->l_tree_depth) @@ -376,7 +574,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, * either be on the fe, or the extent block passed in. */ i = le16_to_cpu(el->l_next_free_rec); el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); - el->l_recs[i].e_cpos = fe->i_clusters; + el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); el->l_recs[i].e_clusters = 0; le16_add_cpu(&el->l_next_free_rec, 1); @@ -425,6 +623,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, struct buffer_head **ret_new_eb_bh) { int status, i; + u32 new_clusters; struct buffer_head *new_eb_bh = NULL; struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; @@ -480,11 +679,13 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, goto bail; } + new_clusters = ocfs2_sum_rightmost_rec(eb_el); + /* update fe now */ le16_add_cpu(&fe_el->l_tree_depth, 1); fe_el->l_recs[0].e_cpos = 0; fe_el->l_recs[0].e_blkno = eb->h_blkno; - fe_el->l_recs[0].e_clusters = fe->i_clusters; + fe_el->l_recs[0].e_clusters = cpu_to_le32(new_clusters); for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { fe_el->l_recs[i].e_cpos = 0; fe_el->l_recs[i].e_clusters = 0; @@ -515,199 +716,6 @@ bail: } /* - * Expects the tree to already have room in the rightmost leaf for the - * extent. Updates all the extent blocks (and the dinode) on the way - * down. - */ -static int ocfs2_do_insert_extent(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - u64 start_blk, - u32 new_clusters) -{ - int status, i, num_bhs = 0; - u64 next_blkno; - u16 next_free; - struct buffer_head **eb_bhs = NULL; - struct ocfs2_dinode *fe; - struct ocfs2_extent_block *eb; - struct ocfs2_extent_list *el; - - mlog_entry_void(); - - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - fe = (struct ocfs2_dinode *) fe_bh->b_data; - el = &fe->id2.i_list; - if (el->l_tree_depth) { - /* This is another operation where we want to be - * careful about our tree updates. An error here means - * none of the previous changes we made should roll - * forward. As a result, we have to record the buffers - * for this part of the tree in an array and reserve a - * journal write to them before making any changes. */ - num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth); - eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *), - GFP_KERNEL); - if (!eb_bhs) { - status = -ENOMEM; - mlog_errno(status); - goto bail; - } - - i = 0; - while(el->l_tree_depth) { - next_free = le16_to_cpu(el->l_next_free_rec); - if (next_free == 0) { - ocfs2_error(inode->i_sb, - "Dinode %llu has a bad extent list", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = -EIO; - goto bail; - } - next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno); - - BUG_ON(i >= num_bhs); - status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i], - OCFS2_BH_CACHED, inode); - if (status < 0) { - mlog_errno(status); - goto bail; - } - eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, - eb); - status = -EIO; - goto bail; - } - - status = ocfs2_journal_access(handle, inode, eb_bhs[i], - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - el = &eb->h_list; - i++; - /* When we leave this loop, eb_bhs[num_bhs - 1] will - * hold the bottom-most leaf extent block. */ - } - BUG_ON(el->l_tree_depth); - - el = &fe->id2.i_list; - /* If we have tree depth, then the fe update is - * trivial, and we want to switch el out for the - * bottom-most leaf in order to update it with the - * actual extent data below. */ - next_free = le16_to_cpu(el->l_next_free_rec); - if (next_free == 0) { - ocfs2_error(inode->i_sb, - "Dinode %llu has a bad extent list", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = -EIO; - goto bail; - } - le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, - new_clusters); - /* (num_bhs - 1) to avoid the leaf */ - for(i = 0; i < (num_bhs - 1); i++) { - eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; - el = &eb->h_list; - - /* finally, make our actual change to the - * intermediate extent blocks. */ - next_free = le16_to_cpu(el->l_next_free_rec); - le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, - new_clusters); - - status = ocfs2_journal_dirty(handle, eb_bhs[i]); - if (status < 0) - mlog_errno(status); - } - BUG_ON(i != (num_bhs - 1)); - /* note that the leaf block wasn't touched in - * the loop above */ - eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data; - el = &eb->h_list; - BUG_ON(el->l_tree_depth); - } - - /* yay, we can finally add the actual extent now! */ - i = le16_to_cpu(el->l_next_free_rec) - 1; - if (le16_to_cpu(el->l_next_free_rec) && - ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) { - le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters); - } else if (le16_to_cpu(el->l_next_free_rec) && - (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) { - /* having an empty extent at eof is legal. */ - if (el->l_recs[i].e_cpos != fe->i_clusters) { - ocfs2_error(inode->i_sb, - "Dinode %llu trailing extent is bad: " - "cpos (%u) != number of clusters (%u)", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - le32_to_cpu(el->l_recs[i].e_cpos), - le32_to_cpu(fe->i_clusters)); - status = -EIO; - goto bail; - } - el->l_recs[i].e_blkno = cpu_to_le64(start_blk); - el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); - } else { - /* No contiguous record, or no empty record at eof, so - * we add a new one. */ - - BUG_ON(le16_to_cpu(el->l_next_free_rec) >= - le16_to_cpu(el->l_count)); - i = le16_to_cpu(el->l_next_free_rec); - - el->l_recs[i].e_blkno = cpu_to_le64(start_blk); - el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); - el->l_recs[i].e_cpos = fe->i_clusters; - le16_add_cpu(&el->l_next_free_rec, 1); - } - - /* - * extent_map errors are not fatal, so they are ignored outside - * of flushing the thing. - */ - status = ocfs2_extent_map_append(inode, &el->l_recs[i], - new_clusters); - if (status) { - mlog_errno(status); - ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters)); - } - - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) - mlog_errno(status); - if (fe->id2.i_list.l_tree_depth) { - status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]); - if (status < 0) - mlog_errno(status); - } - - status = 0; -bail: - if (eb_bhs) { - for (i = 0; i < num_bhs; i++) - if (eb_bhs[i]) - brelse(eb_bhs[i]); - kfree(eb_bhs); - } - - mlog_exit(status); - return status; -} - -/* * Should only be called when there is no space left in any of the * leaf nodes. What we want to do is find the lowest tree depth * non-leaf extent block with room for new records. There are three @@ -807,53 +815,1523 @@ bail: return status; } -/* the caller needs to update fe->i_clusters */ -int ocfs2_insert_extent(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - u64 start_blk, - u32 new_clusters, - struct ocfs2_alloc_context *meta_ac) +static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) { - int status, i, shift; - struct buffer_head *last_eb_bh = NULL; + return !rec->e_clusters; +} + +/* + * This function will discard the rightmost extent record. + */ +static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + int count = le16_to_cpu(el->l_count); + unsigned int num_bytes; + + BUG_ON(!next_free); + /* This will cause us to go off the end of our extent list. */ + BUG_ON(next_free >= count); + + num_bytes = sizeof(struct ocfs2_extent_rec) * next_free; + + memmove(&el->l_recs[1], &el->l_recs[0], num_bytes); +} + +static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *insert_rec) +{ + int i, insert_index, next_free, has_empty, num_bytes; + u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos); + struct ocfs2_extent_rec *rec; + + next_free = le16_to_cpu(el->l_next_free_rec); + has_empty = ocfs2_is_empty_extent(&el->l_recs[0]); + + BUG_ON(!next_free); + + /* The tree code before us didn't allow enough room in the leaf. */ + if (el->l_next_free_rec == el->l_count && !has_empty) + BUG(); + + /* + * The easiest way to approach this is to just remove the + * empty extent and temporarily decrement next_free. + */ + if (has_empty) { + /* + * If next_free was 1 (only an empty extent), this + * loop won't execute, which is fine. We still want + * the decrement above to happen. + */ + for(i = 0; i < (next_free - 1); i++) + el->l_recs[i] = el->l_recs[i+1]; + + next_free--; + } + + /* + * Figure out what the new record index should be. + */ + for(i = 0; i < next_free; i++) { + rec = &el->l_recs[i]; + + if (insert_cpos < le32_to_cpu(rec->e_cpos)) + break; + } + insert_index = i; + + mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n", + insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count)); + + BUG_ON(insert_index < 0); + BUG_ON(insert_index >= le16_to_cpu(el->l_count)); + BUG_ON(insert_index > next_free); + + /* + * No need to memmove if we're just adding to the tail. + */ + if (insert_index != next_free) { + BUG_ON(next_free >= le16_to_cpu(el->l_count)); + + num_bytes = next_free - insert_index; + num_bytes *= sizeof(struct ocfs2_extent_rec); + memmove(&el->l_recs[insert_index + 1], + &el->l_recs[insert_index], + num_bytes); + } + + /* + * Either we had an empty extent, and need to re-increment or + * there was no empty extent on a non full rightmost leaf node, + * in which case we still need to increment. + */ + next_free++; + el->l_next_free_rec = cpu_to_le16(next_free); + /* + * Make sure none of the math above just messed up our tree. + */ + BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)); + + el->l_recs[insert_index] = *insert_rec; + +} + +/* + * Create an empty extent record . + * + * l_next_free_rec may be updated. + * + * If an empty extent already exists do nothing. + */ +static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + + if (next_free == 0) + goto set_and_inc; + + if (ocfs2_is_empty_extent(&el->l_recs[0])) + return; + + mlog_bug_on_msg(el->l_count == el->l_next_free_rec, + "Asked to create an empty extent in a full list:\n" + "count = %u, tree depth = %u", + le16_to_cpu(el->l_count), + le16_to_cpu(el->l_tree_depth)); + + ocfs2_shift_records_right(el); + +set_and_inc: + le16_add_cpu(&el->l_next_free_rec, 1); + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); +} + +/* + * For a rotation which involves two leaf nodes, the "root node" is + * the lowest level tree node which contains a path to both leafs. This + * resulting set of information can be used to form a complete "subtree" + * + * This function is passed two full paths from the dinode down to a + * pair of adjacent leaves. It's task is to figure out which path + * index contains the subtree root - this can be the root index itself + * in a worst-case rotation. + * + * The array index of the subtree root is passed back. + */ +static int ocfs2_find_subtree_root(struct inode *inode, + struct ocfs2_path *left, + struct ocfs2_path *right) +{ + int i = 0; + + /* + * Check that the caller passed in two paths from the same tree. + */ + BUG_ON(path_root_bh(left) != path_root_bh(right)); + + do { + i++; + + /* + * The caller didn't pass two adjacent paths. + */ + mlog_bug_on_msg(i > left->p_tree_depth, + "Inode %lu, left depth %u, right depth %u\n" + "left leaf blk %llu, right leaf blk %llu\n", + inode->i_ino, left->p_tree_depth, + right->p_tree_depth, + (unsigned long long)path_leaf_bh(left)->b_blocknr, + (unsigned long long)path_leaf_bh(right)->b_blocknr); + } while (left->p_node[i].bh->b_blocknr == + right->p_node[i].bh->b_blocknr); + + return i - 1; +} + +typedef void (path_insert_t)(void *, struct buffer_head *); + +/* + * Traverse a btree path in search of cpos, starting at root_el. + * + * This code can be called with a cpos larger than the tree, in which + * case it will return the rightmost path. + */ +static int __ocfs2_find_path(struct inode *inode, + struct ocfs2_extent_list *root_el, u32 cpos, + path_insert_t *func, void *data) +{ + int i, ret = 0; + u32 range; + u64 blkno; struct buffer_head *bh = NULL; - struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; - struct ocfs2_extent_list *el; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + struct ocfs2_inode_info *oi = OCFS2_I(inode); - mlog_entry_void(); + el = root_el; + while (el->l_tree_depth) { + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ocfs2_error(inode->i_sb, + "Inode %llu has empty extent list at " + "depth %u\n", + (unsigned long long)oi->ip_blkno, + le16_to_cpu(el->l_tree_depth)); + ret = -EROFS; + goto out; - mlog(0, "add %u clusters starting at block %llu to inode %llu\n", - new_clusters, (unsigned long long)start_blk, - (unsigned long long)OCFS2_I(inode)->ip_blkno); + } - fe = (struct ocfs2_dinode *) fe_bh->b_data; - el = &fe->id2.i_list; + for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) { + rec = &el->l_recs[i]; + + /* + * In the case that cpos is off the allocation + * tree, this should just wind up returning the + * rightmost record. + */ + range = le32_to_cpu(rec->e_cpos) + + le32_to_cpu(rec->e_clusters); + if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) + break; + } - if (el->l_tree_depth) { - /* jump to end of tree */ - status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), - &last_eb_bh, OCFS2_BH_CACHED, inode); - if (status < 0) { - mlog_exit(status); - goto bail; + blkno = le64_to_cpu(el->l_recs[i].e_blkno); + if (blkno == 0) { + ocfs2_error(inode->i_sb, + "Inode %llu has bad blkno in extent list " + "at depth %u (index %d)\n", + (unsigned long long)oi->ip_blkno, + le16_to_cpu(el->l_tree_depth), i); + ret = -EROFS; + goto out; } - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + + brelse(bh); + bh = NULL; + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, + &bh, OCFS2_BH_CACHED, inode); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) bh->b_data; el = &eb->h_list; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + ret = -EIO; + goto out; + } + + if (le16_to_cpu(el->l_next_free_rec) > + le16_to_cpu(el->l_count)) { + ocfs2_error(inode->i_sb, + "Inode %llu has bad count in extent list " + "at block %llu (next free=%u, count=%u)\n", + (unsigned long long)oi->ip_blkno, + (unsigned long long)bh->b_blocknr, + le16_to_cpu(el->l_next_free_rec), + le16_to_cpu(el->l_count)); + ret = -EROFS; + goto out; + } + + if (func) + func(data, bh); + } + +out: + /* + * Catch any trailing bh that the loop didn't handle. + */ + brelse(bh); + + return ret; +} + +/* + * Given an initialized path (that is, it has a valid root extent + * list), this function will traverse the btree in search of the path + * which would contain cpos. + * + * The path traveled is recorded in the path structure. + * + * Note that this will not do any comparisons on leaf node extent + * records, so it will work fine in the case that we just added a tree + * branch. + */ +struct find_path_data { + int index; + struct ocfs2_path *path; +}; +static void find_path_ins(void *data, struct buffer_head *bh) +{ + struct find_path_data *fp = data; + + get_bh(bh); + ocfs2_path_insert_eb(fp->path, fp->index, bh); + fp->index++; +} +static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, + u32 cpos) +{ + struct find_path_data data; + + data.index = 1; + data.path = path; + return __ocfs2_find_path(inode, path_root_el(path), cpos, + find_path_ins, &data); +} + +static void find_leaf_ins(void *data, struct buffer_head *bh) +{ + struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data; + struct ocfs2_extent_list *el = &eb->h_list; + struct buffer_head **ret = data; + + /* We want to retain only the leaf block. */ + if (le16_to_cpu(el->l_tree_depth) == 0) { + get_bh(bh); + *ret = bh; + } +} +/* + * Find the leaf block in the tree which would contain cpos. No + * checking of the actual leaf is done. + * + * Some paths want to call this instead of allocating a path structure + * and calling ocfs2_find_path(). + * + * This function doesn't handle non btree extent lists. + */ +static int ocfs2_find_leaf(struct inode *inode, + struct ocfs2_extent_list *root_el, u32 cpos, + struct buffer_head **leaf_bh) +{ + int ret; + struct buffer_head *bh = NULL; + + ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + *leaf_bh = bh; +out: + return ret; +} + +/* + * Adjust the adjacent records (left_rec, right_rec) involved in a rotation. + * + * Basically, we've moved stuff around at the bottom of the tree and + * we need to fix up the extent records above the changes to reflect + * the new changes. + * + * left_rec: the record on the left. + * left_child_el: is the child list pointed to by left_rec + * right_rec: the record to the right of left_rec + * right_child_el: is the child list pointed to by right_rec + * + * By definition, this only works on interior nodes. + */ +static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, + struct ocfs2_extent_list *left_child_el, + struct ocfs2_extent_rec *right_rec, + struct ocfs2_extent_list *right_child_el) +{ + u32 left_clusters, right_end; + + /* + * Interior nodes never have holes. Their cpos is the cpos of + * the leftmost record in their child list. Their cluster + * count covers the full theoretical range of their child list + * - the range between their cpos and the cpos of the record + * immediately to their right. + */ + left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); + left_clusters -= le32_to_cpu(left_rec->e_cpos); + left_rec->e_clusters = cpu_to_le32(left_clusters); + + /* + * Calculate the rightmost cluster count boundary before + * moving cpos - we will need to adjust e_clusters after + * updating e_cpos to keep the same highest cluster count. + */ + right_end = le32_to_cpu(right_rec->e_cpos); + right_end += le32_to_cpu(right_rec->e_clusters); + + right_rec->e_cpos = left_rec->e_cpos; + le32_add_cpu(&right_rec->e_cpos, left_clusters); + + right_end -= le32_to_cpu(right_rec->e_cpos); + right_rec->e_clusters = cpu_to_le32(right_end); +} + +/* + * Adjust the adjacent root node records involved in a + * rotation. left_el_blkno is passed in as a key so that we can easily + * find it's index in the root list. + */ +static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, + struct ocfs2_extent_list *left_el, + struct ocfs2_extent_list *right_el, + u64 left_el_blkno) +{ + int i; + + BUG_ON(le16_to_cpu(root_el->l_tree_depth) <= + le16_to_cpu(left_el->l_tree_depth)); + + for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) { + if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno) + break; + } + + /* + * The path walking code should have never returned a root and + * two paths which are not adjacent. + */ + BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); + + ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, + &root_el->l_recs[i + 1], right_el); +} + +/* + * We've changed a leaf block (in right_path) and need to reflect that + * change back up the subtree. + * + * This happens in multiple places: + * - When we've moved an extent record from the left path leaf to the right + * path leaf to make room for an empty extent in the left path leaf. + * - When our insert into the right path leaf is at the leftmost edge + * and requires an update of the path immediately to it's left. This + * can occur at the end of some types of rotation and appending inserts. + */ +static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index) +{ + int ret, i, idx; + struct ocfs2_extent_list *el, *left_el, *right_el; + struct ocfs2_extent_rec *left_rec, *right_rec; + struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; + + /* + * Update the counts and position values within all the + * interior nodes to reflect the leaf rotation we just did. + * + * The root node is handled below the loop. + * + * We begin the loop with right_el and left_el pointing to the + * leaf lists and work our way up. + * + * NOTE: within this loop, left_el and right_el always refer + * to the *child* lists. + */ + left_el = path_leaf_el(left_path); + right_el = path_leaf_el(right_path); + for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) { + mlog(0, "Adjust records at index %u\n", i); + + /* + * One nice property of knowing that all of these + * nodes are below the root is that we only deal with + * the leftmost right node record and the rightmost + * left node record. + */ + el = left_path->p_node[i].el; + idx = le16_to_cpu(left_el->l_next_free_rec) - 1; + left_rec = &el->l_recs[idx]; + + el = right_path->p_node[i].el; + right_rec = &el->l_recs[0]; + + ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, + right_el); + + ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); + if (ret) + mlog_errno(ret); + + ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh); + if (ret) + mlog_errno(ret); + + /* + * Setup our list pointers now so that the current + * parents become children in the next iteration. + */ + left_el = left_path->p_node[i].el; + right_el = right_path->p_node[i].el; + } + + /* + * At the root node, adjust the two adjacent records which + * begin our path to the leaves. + */ + + el = left_path->p_node[subtree_index].el; + left_el = left_path->p_node[subtree_index + 1].el; + right_el = right_path->p_node[subtree_index + 1].el; + + ocfs2_adjust_root_records(el, left_el, right_el, + left_path->p_node[subtree_index + 1].bh->b_blocknr); + + root_bh = left_path->p_node[subtree_index].bh; + + ret = ocfs2_journal_dirty(handle, root_bh); + if (ret) + mlog_errno(ret); +} + +static int ocfs2_rotate_subtree_right(struct inode *inode, + handle_t *handle, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index) +{ + int ret, i; + struct buffer_head *right_leaf_bh; + struct buffer_head *left_leaf_bh = NULL; + struct buffer_head *root_bh; + struct ocfs2_extent_list *right_el, *left_el; + struct ocfs2_extent_rec move_rec; + + left_leaf_bh = path_leaf_bh(left_path); + left_el = path_leaf_el(left_path); + + if (left_el->l_next_free_rec != left_el->l_count) { + ocfs2_error(inode->i_sb, + "Inode %llu has non-full interior leaf node %llu" + "(next free = %u)", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)left_leaf_bh->b_blocknr, + le16_to_cpu(left_el->l_next_free_rec)); + return -EROFS; + } + + /* + * This extent block may already have an empty record, so we + * return early if so. + */ + if (ocfs2_is_empty_extent(&left_el->l_recs[0])) + return 0; + + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + ret = ocfs2_journal_access(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + for(i = subtree_index + 1; i < path_num_items(right_path); i++) { + ret = ocfs2_journal_access(handle, inode, + right_path->p_node[i].bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, + left_path->p_node[i].bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + right_leaf_bh = path_leaf_bh(right_path); + right_el = path_leaf_el(right_path); + + /* This is a code error, not a disk corruption. */ + mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " + "because rightmost leaf block %llu is empty\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)right_leaf_bh->b_blocknr); + + ocfs2_create_empty_extent(right_el); + + ret = ocfs2_journal_dirty(handle, right_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Do the copy now. */ + i = le16_to_cpu(left_el->l_next_free_rec) - 1; + move_rec = left_el->l_recs[i]; + right_el->l_recs[0] = move_rec; + + /* + * Clear out the record we just copied and shift everything + * over, leaving an empty extent in the left leaf. + * + * We temporarily subtract from next_free_rec so that the + * shift will lose the tail record (which is now defunct). + */ + le16_add_cpu(&left_el->l_next_free_rec, -1); + ocfs2_shift_records_right(left_el); + memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + le16_add_cpu(&left_el->l_next_free_rec, 1); + + ret = ocfs2_journal_dirty(handle, left_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_complete_edge_insert(inode, handle, left_path, right_path, + subtree_index); + +out: + return ret; +} + +/* + * Given a full path, determine what cpos value would return us a path + * containing the leaf immediately to the left of the current one. + * + * Will return zero if the path passed in is already the leftmost path. + */ +static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos) +{ + int i, j, ret = 0; + u64 blkno; + struct ocfs2_extent_list *el; + + *cpos = 0; + + blkno = path_leaf_bh(path)->b_blocknr; + + /* Start at the tree node just above the leaf and work our way up. */ + i = path->p_tree_depth - 1; + while (i >= 0) { + el = path->p_node[i].el; + + /* + * Find the extent record just before the one in our + * path. + */ + for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { + if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { + if (j == 0) { + if (i == 0) { + /* + * We've determined that the + * path specified is already + * the leftmost one - return a + * cpos of zero. + */ + goto out; + } + /* + * The leftmost record points to our + * leaf - we need to travel up the + * tree one level. + */ + goto next_node; + } + + *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos); + *cpos = *cpos + le32_to_cpu(el->l_recs[j - 1].e_clusters) - 1; + goto out; + } + } + + /* + * If we got here, we never found a valid node where + * the tree indicated one should be. + */ + ocfs2_error(sb, + "Invalid extent tree at extent block %llu\n", + (unsigned long long)blkno); + ret = -EROFS; + goto out; + +next_node: + blkno = path->p_node[i].bh->b_blocknr; + i--; + } + +out: + return ret; +} + +static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, + struct ocfs2_path *path) +{ + int credits = (path->p_tree_depth - subtree_depth) * 2 + 1; + + if (handle->h_buffer_credits < credits) + return ocfs2_extend_trans(handle, credits); + + return 0; +} + +/* + * Trap the case where we're inserting into the theoretical range past + * the _actual_ left leaf range. Otherwise, we'll rotate a record + * whose cpos is less than ours into the right leaf. + * + * It's only necessary to look at the rightmost record of the left + * leaf because the logic that calls us should ensure that the + * theoretical ranges in the path components above the leaves are + * correct. + */ +static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, + u32 insert_cpos) +{ + struct ocfs2_extent_list *left_el; + struct ocfs2_extent_rec *rec; + int next_free; + + left_el = path_leaf_el(left_path); + next_free = le16_to_cpu(left_el->l_next_free_rec); + rec = &left_el->l_recs[next_free - 1]; + + if (insert_cpos > le32_to_cpu(rec->e_cpos)) + return 1; + return 0; +} + +/* + * Rotate all the records in a btree right one record, starting at insert_cpos. + * + * The path to the rightmost leaf should be passed in. + * + * The array is assumed to be large enough to hold an entire path (tree depth). + * + * Upon succesful return from this function: + * + * - The 'right_path' array will contain a path to the leaf block + * whose range contains e_cpos. + * - That leaf block will have a single empty extent in list index 0. + * - In the case that the rotation requires a post-insert update, + * *ret_left_path will contain a valid path which can be passed to + * ocfs2_insert_path(). + */ +static int ocfs2_rotate_tree_right(struct inode *inode, + handle_t *handle, + u32 insert_cpos, + struct ocfs2_path *right_path, + struct ocfs2_path **ret_left_path) +{ + int ret, start; + u32 cpos; + struct ocfs2_path *left_path = NULL; + + *ret_left_path = NULL; + + left_path = ocfs2_new_path(path_root_bh(right_path), + path_root_el(right_path)); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos); + + /* + * What we want to do here is: + * + * 1) Start with the rightmost path. + * + * 2) Determine a path to the leaf block directly to the left + * of that leaf. + * + * 3) Determine the 'subtree root' - the lowest level tree node + * which contains a path to both leaves. + * + * 4) Rotate the subtree. + * + * 5) Find the next subtree by considering the left path to be + * the new right path. + * + * The check at the top of this while loop also accepts + * insert_cpos == cpos because cpos is only a _theoretical_ + * value to get us the left path - insert_cpos might very well + * be filling that hole. + * + * Stop at a cpos of '0' because we either started at the + * leftmost branch (i.e., a tree with one branch and a + * rotation inside of it), or we've gone as far as we can in + * rotating subtrees. + */ + while (cpos && insert_cpos <= cpos) { + mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n", + insert_cpos, cpos); + + ret = ocfs2_find_path(inode, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog_bug_on_msg(path_leaf_bh(left_path) == + path_leaf_bh(right_path), + "Inode %lu: error during insert of %u " + "(left path cpos %u) results in two identical " + "paths ending at %llu\n", + inode->i_ino, insert_cpos, cpos, + (unsigned long long) + path_leaf_bh(left_path)->b_blocknr); + + if (ocfs2_rotate_requires_path_adjustment(left_path, + insert_cpos)) { + mlog(0, "Path adjustment required\n"); + + /* + * We've rotated the tree as much as we + * should. The rest is up to + * ocfs2_insert_path() to complete, after the + * record insertion. We indicate this + * situation by returning the left path. + * + * The reason we don't adjust the records here + * before the record insert is that an error + * later might break the rule where a parent + * record e_cpos will reflect the actual + * e_cpos of the 1st nonempty record of the + * child list. + */ + *ret_left_path = left_path; + goto out_ret_path; + } + + start = ocfs2_find_subtree_root(inode, left_path, right_path); + + mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", + start, + (unsigned long long) right_path->p_node[start].bh->b_blocknr, + right_path->p_tree_depth); + + ret = ocfs2_extend_rotate_transaction(handle, start, + right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_rotate_subtree_right(inode, handle, left_path, + right_path, start); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * There is no need to re-read the next right path + * as we know that it'll be our current left + * path. Optimize by copying values instead. + */ + ocfs2_mv_path(right_path, left_path); + + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, + &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + } + +out: + ocfs2_free_path(left_path); + +out_ret_path: + return ret; +} + +/* + * Do the final bits of extent record insertion at the target leaf + * list. If this leaf is part of an allocation tree, it is assumed + * that the tree above has been prepared. + */ +static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, + struct ocfs2_extent_list *el, + struct ocfs2_insert_type *insert, + struct inode *inode) +{ + int i = insert->ins_contig_index; + unsigned int range; + struct ocfs2_extent_rec *rec; + + BUG_ON(el->l_tree_depth); + + /* + * Contiguous insert - either left or right. + */ + if (insert->ins_contig != CONTIG_NONE) { + rec = &el->l_recs[i]; + if (insert->ins_contig == CONTIG_LEFT) { + rec->e_blkno = insert_rec->e_blkno; + rec->e_cpos = insert_rec->e_cpos; + } + le32_add_cpu(&rec->e_clusters, + le32_to_cpu(insert_rec->e_clusters)); + return; + } + + /* + * Handle insert into an empty leaf. + */ + if (le16_to_cpu(el->l_next_free_rec) == 0 || + ((le16_to_cpu(el->l_next_free_rec) == 1) && + ocfs2_is_empty_extent(&el->l_recs[0]))) { + el->l_recs[0] = *insert_rec; + el->l_next_free_rec = cpu_to_le16(1); + return; + } + + /* + * Appending insert. + */ + if (insert->ins_appending == APPEND_TAIL) { + i = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[i]; + range = le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters); + BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range); + + mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= + le16_to_cpu(el->l_count), + "inode %lu, depth %u, count %u, next free %u, " + "rec.cpos %u, rec.clusters %u, " + "insert.cpos %u, insert.clusters %u\n", + inode->i_ino, + le16_to_cpu(el->l_tree_depth), + le16_to_cpu(el->l_count), + le16_to_cpu(el->l_next_free_rec), + le32_to_cpu(el->l_recs[i].e_cpos), + le32_to_cpu(el->l_recs[i].e_clusters), + le32_to_cpu(insert_rec->e_cpos), + le32_to_cpu(insert_rec->e_clusters)); + i++; + el->l_recs[i] = *insert_rec; + le16_add_cpu(&el->l_next_free_rec, 1); + return; + } + + /* + * Ok, we have to rotate. + * + * At this point, it is safe to assume that inserting into an + * empty leaf and appending to a leaf have both been handled + * above. + * + * This leaf needs to have space, either by the empty 1st + * extent record, or by virtue of an l_next_rec < l_count. + */ + ocfs2_rotate_leaf(el, insert_rec); +} + +static inline void ocfs2_update_dinode_clusters(struct inode *inode, + struct ocfs2_dinode *di, + u32 clusters) +{ + le32_add_cpu(&di->i_clusters, clusters); + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); + spin_unlock(&OCFS2_I(inode)->ip_lock); +} + +static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_path *right_path, + struct ocfs2_path **ret_left_path) +{ + int ret, i, next_free; + struct buffer_head *bh; + struct ocfs2_extent_list *el; + struct ocfs2_path *left_path = NULL; + + *ret_left_path = NULL; + + /* + * If our appending insert is at the leftmost edge of a leaf, + * then we might need to update the rightmost records of the + * neighboring path. + */ + el = path_leaf_el(right_path); + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0 || + (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { + u32 left_cpos; + + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, + &left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "Append may need a left path update. cpos: %u, " + "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos), + left_cpos); + + /* + * No need to worry if the append is already in the + * leftmost leaf. + */ + if (left_cpos) { + left_path = ocfs2_new_path(path_root_bh(right_path), + path_root_el(right_path)); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(inode, left_path, left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * ocfs2_insert_path() will pass the left_path to the + * journal for us. + */ + } + } + + ret = ocfs2_journal_access_path(inode, handle, right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_root_el(right_path); + bh = path_root_bh(right_path); + i = 0; + while (1) { + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0) { + ocfs2_error(inode->i_sb, + "Dinode %llu has a bad extent list", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + ret = -EIO; + goto out; + } + + el->l_recs[next_free - 1].e_clusters = insert_rec->e_cpos; + le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, + le32_to_cpu(insert_rec->e_clusters)); + le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, + -le32_to_cpu(el->l_recs[next_free - 1].e_cpos)); + + ret = ocfs2_journal_dirty(handle, bh); + if (ret) + mlog_errno(ret); + + if (++i >= right_path->p_tree_depth) + break; + + bh = right_path->p_node[i].bh; + el = right_path->p_node[i].el; + } + + *ret_left_path = left_path; + ret = 0; +out: + if (ret != 0) + ocfs2_free_path(left_path); + + return ret; +} + +/* + * This function only does inserts on an allocation b-tree. For dinode + * lists, ocfs2_insert_at_leaf() is called directly. + * + * right_path is the path we want to do the actual insert + * in. left_path should only be passed in if we need to update that + * portion of the tree after an edge insert. + */ +static int ocfs2_insert_path(struct inode *inode, + handle_t *handle, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_insert_type *insert) +{ + int ret, subtree_index; + struct buffer_head *leaf_bh = path_leaf_bh(right_path); + struct ocfs2_extent_list *el; + + /* + * Pass both paths to the journal. The majority of inserts + * will be touching all components anyway. + */ + ret = ocfs2_journal_access_path(inode, handle, right_path); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (left_path) { + int credits = handle->h_buffer_credits; + + /* + * There's a chance that left_path got passed back to + * us without being accounted for in the + * journal. Extend our transaction here to be sure we + * can change those blocks. + */ + credits += left_path->p_tree_depth; + + ret = ocfs2_extend_trans(handle, credits); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(inode, handle, left_path); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + el = path_leaf_el(right_path); + + ocfs2_insert_at_leaf(insert_rec, el, insert, inode); + ret = ocfs2_journal_dirty(handle, leaf_bh); + if (ret) + mlog_errno(ret); + + if (left_path) { + /* + * The rotate code has indicated that we need to fix + * up portions of the tree after the insert. + * + * XXX: Should we extend the transaction here? + */ + subtree_index = ocfs2_find_subtree_root(inode, left_path, + right_path); + ocfs2_complete_edge_insert(inode, handle, left_path, + right_path, subtree_index); + } + + ret = 0; +out: + return ret; +} + +static int ocfs2_do_insert_extent(struct inode *inode, + handle_t *handle, + struct buffer_head *di_bh, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_insert_type *type) +{ + int ret, rotate = 0; + u32 cpos; + struct ocfs2_path *right_path = NULL; + struct ocfs2_path *left_path = NULL; + struct ocfs2_dinode *di; + struct ocfs2_extent_list *el; + + di = (struct ocfs2_dinode *) di_bh->b_data; + el = &di->id2.i_list; + + ret = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (le16_to_cpu(el->l_tree_depth) == 0) { + ocfs2_insert_at_leaf(insert_rec, el, type, inode); + goto out_update_clusters; + } + + right_path = ocfs2_new_inode_path(di_bh); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + /* + * Determine the path to start with. Rotations need the + * rightmost path, everything else can go directly to the + * target leaf. + */ + cpos = le32_to_cpu(insert_rec->e_cpos); + if (type->ins_appending == APPEND_NONE && + type->ins_contig == CONTIG_NONE) { + rotate = 1; + cpos = UINT_MAX; + } + + ret = ocfs2_find_path(inode, right_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Rotations and appends need special treatment - they modify + * parts of the tree's above them. + * + * Both might pass back a path immediate to the left of the + * one being inserted to. This will be cause + * ocfs2_insert_path() to modify the rightmost records of + * left_path to account for an edge insert. + * + * XXX: When modifying this code, keep in mind that an insert + * can wind up skipping both of these two special cases... + */ + if (rotate) { + ret = ocfs2_rotate_tree_right(inode, handle, + le32_to_cpu(insert_rec->e_cpos), + right_path, &left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + } else if (type->ins_appending == APPEND_TAIL + && type->ins_contig != CONTIG_LEFT) { + ret = ocfs2_append_rec_to_path(inode, handle, insert_rec, + right_path, &left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_insert_path(inode, handle, left_path, right_path, + insert_rec, type); + if (ret) { + mlog_errno(ret); + goto out; + } + +out_update_clusters: + ocfs2_update_dinode_clusters(inode, di, + le32_to_cpu(insert_rec->e_clusters)); + + ret = ocfs2_journal_dirty(handle, di_bh); + if (ret) + mlog_errno(ret); + +out: + ocfs2_free_path(left_path); + ocfs2_free_path(right_path); + + return ret; +} + +static void ocfs2_figure_contig_type(struct inode *inode, + struct ocfs2_insert_type *insert, + struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *insert_rec) +{ + int i; + enum ocfs2_contig_type contig_type = CONTIG_NONE; + + for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + contig_type = ocfs2_extent_contig(inode, &el->l_recs[i], + insert_rec); + if (contig_type != CONTIG_NONE) { + insert->ins_contig_index = i; + break; + } + } + insert->ins_contig = contig_type; +} + +/* + * This should only be called against the righmost leaf extent list. + * + * ocfs2_figure_appending_type() will figure out whether we'll have to + * insert at the tail of the rightmost leaf. + * + * This should also work against the dinode list for tree's with 0 + * depth. If we consider the dinode list to be the rightmost leaf node + * then the logic here makes sense. + */ +static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, + struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *insert_rec) +{ + int i; + u32 cpos = le32_to_cpu(insert_rec->e_cpos); + struct ocfs2_extent_rec *rec; + + insert->ins_appending = APPEND_NONE; + + BUG_ON(el->l_tree_depth); + + if (!el->l_next_free_rec) + goto set_tail_append; + + if (ocfs2_is_empty_extent(&el->l_recs[0])) { + /* Were all records empty? */ + if (le16_to_cpu(el->l_next_free_rec) == 1) + goto set_tail_append; } - /* Can we allocate without adding/shifting tree bits? */ i = le16_to_cpu(el->l_next_free_rec) - 1; - if (le16_to_cpu(el->l_next_free_rec) == 0 - || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count)) - || le32_to_cpu(el->l_recs[i].e_clusters) == 0 - || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) - goto out_add; + rec = &el->l_recs[i]; + + if (cpos >= (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters))) + goto set_tail_append; - mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing " - "tree now.\n"); + return; + +set_tail_append: + insert->ins_appending = APPEND_TAIL; +} + +/* + * Helper function called at the begining of an insert. + * + * This computes a few things that are commonly used in the process of + * inserting into the btree: + * - Whether the new extent is contiguous with an existing one. + * - The current tree depth. + * - Whether the insert is an appending one. + * - The total # of free records in the tree. + * + * All of the information is stored on the ocfs2_insert_type + * structure. + */ +static int ocfs2_figure_insert_type(struct inode *inode, + struct buffer_head *di_bh, + struct buffer_head **last_eb_bh, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_insert_type *insert) +{ + int ret; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct ocfs2_path *path = NULL; + struct buffer_head *bh = NULL; + + el = &di->id2.i_list; + insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); + + if (el->l_tree_depth) { + /* + * If we have tree depth, we read in the + * rightmost extent block ahead of time as + * ocfs2_figure_insert_type() and ocfs2_add_branch() + * may want it later. + */ + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), + le64_to_cpu(di->i_last_eb_blk), &bh, + OCFS2_BH_CACHED, inode); + if (ret) { + mlog_exit(ret); + goto out; + } + eb = (struct ocfs2_extent_block *) bh->b_data; + el = &eb->h_list; + } + + /* + * Unless we have a contiguous insert, we'll need to know if + * there is room left in our allocation tree for another + * extent record. + * + * XXX: This test is simplistic, we can search for empty + * extent records too. + */ + insert->ins_free_records = le16_to_cpu(el->l_count) - + le16_to_cpu(el->l_next_free_rec); + + if (!insert->ins_tree_depth) { + ocfs2_figure_contig_type(inode, insert, el, insert_rec); + ocfs2_figure_appending_type(insert, el, insert_rec); + return 0; + } + + path = ocfs2_new_inode_path(di_bh); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + /* + * In the case that we're inserting past what the tree + * currently accounts for, ocfs2_find_path() will return for + * us the rightmost tree path. This is accounted for below in + * the appending code. + */ + ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos)); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + /* + * Now that we have the path, there's two things we want to determine: + * 1) Contiguousness (also set contig_index if this is so) + * + * 2) Are we doing an append? We can trivially break this up + * into two types of appends: simple record append, or a + * rotate inside the tail leaf. + */ + ocfs2_figure_contig_type(inode, insert, el, insert_rec); + + /* + * The insert code isn't quite ready to deal with all cases of + * left contiguousness. Specifically, if it's an insert into + * the 1st record in a leaf, it will require the adjustment of + * e_clusters on the last record of the path directly to it's + * left. For now, just catch that case and fool the layers + * above us. This works just fine for tree_depth == 0, which + * is why we allow that above. + */ + if (insert->ins_contig == CONTIG_LEFT && + insert->ins_contig_index == 0) + insert->ins_contig = CONTIG_NONE; + + /* + * Ok, so we can simply compare against last_eb to figure out + * whether the path doesn't exist. This will only happen in + * the case that we're doing a tail append, so maybe we can + * take advantage of that information somehow. + */ + if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) { + /* + * Ok, ocfs2_find_path() returned us the rightmost + * tree path. This might be an appending insert. There are + * two cases: + * 1) We're doing a true append at the tail: + * -This might even be off the end of the leaf + * 2) We're "appending" by rotating in the tail + */ + ocfs2_figure_appending_type(insert, el, insert_rec); + } + +out: + ocfs2_free_path(path); + + if (ret == 0) + *last_eb_bh = bh; + else + brelse(bh); + return ret; +} + +/* + * Insert an extent into an inode btree. + * + * The caller needs to update fe->i_clusters + */ +int ocfs2_insert_extent(struct ocfs2_super *osb, + handle_t *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u32 cpos, + u64 start_blk, + u32 new_clusters, + struct ocfs2_alloc_context *meta_ac) +{ + int status, shift; + struct buffer_head *last_eb_bh = NULL; + struct buffer_head *bh = NULL; + struct ocfs2_insert_type insert = {0, }; + struct ocfs2_extent_rec rec; + + mlog(0, "add %u clusters at position %u to inode %llu\n", + new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); + + mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && + (OCFS2_I(inode)->ip_clusters != cpos), + "Device %s, asking for sparse allocation: inode %llu, " + "cpos %u, clusters %u\n", + osb->dev_str, + (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, + OCFS2_I(inode)->ip_clusters); + + rec.e_cpos = cpu_to_le32(cpos); + rec.e_blkno = cpu_to_le64(start_blk); + rec.e_clusters = cpu_to_le32(new_clusters); + + status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, + &insert); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + mlog(0, "Insert.appending: %u, Insert.Contig: %u, " + "Insert.contig_index: %d, Insert.free_records: %d, " + "Insert.tree_depth: %d\n", + insert.ins_appending, insert.ins_contig, insert.ins_contig_index, + insert.ins_free_records, insert.ins_tree_depth); + + /* + * Avoid growing the tree unless we're out of records and the + * insert type requres one. + */ + if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records) + goto out_add; shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh); if (shift < 0) { @@ -866,13 +2344,9 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, * and didn't find room for any more extents - we need to add * another tree level */ if (shift) { - /* if we hit a leaf, we'd better be empty :) */ - BUG_ON(le16_to_cpu(el->l_next_free_rec) != - le16_to_cpu(el->l_count)); BUG_ON(bh); - mlog(0, "ocfs2_allocate_extent: need to shift tree depth " - "(current = %u)\n", - le16_to_cpu(fe->id2.i_list.l_tree_depth)); + mlog(0, "need to shift tree depth " + "(current = %d)\n", insert.ins_tree_depth); /* ocfs2_shift_tree_depth will return us a buffer with * the new extent block (so we can pass that to @@ -883,15 +2357,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, mlog_errno(status); goto bail; } + insert.ins_tree_depth++; /* Special case: we have room now if we shifted from * tree_depth 0 */ - if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1)) + if (insert.ins_tree_depth == 1) goto out_add; } /* call ocfs2_add_branch to add the final part of the tree with * the new data. */ - mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh); + mlog(0, "add branch. bh = %p\n", bh); status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, meta_ac); if (status < 0) { @@ -900,9 +2375,8 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, } out_add: - /* Finally, we can add clusters. */ - status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh, - start_blk, new_clusters); + /* Finally, we can add clusters. This might rotate the tree for us. */ + status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); if (status < 0) mlog_errno(status); @@ -1447,140 +2921,141 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) * block will be deleted, and if it will, what the new last extent * block will be so we can update his h_next_leaf_blk field, as well * as the dinodes i_last_eb_blk */ -static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, - struct inode *inode, - struct ocfs2_dinode *fe, +static int ocfs2_find_new_last_ext_blk(struct inode *inode, u32 new_i_clusters, - struct buffer_head *old_last_eb, + struct ocfs2_path *path, struct buffer_head **new_last_eb) { - int i, status = 0; - u64 block = 0; + int ret = 0; + u32 cpos; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; struct buffer_head *bh = NULL; *new_last_eb = NULL; - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); - status = -EIO; - goto bail; - } - /* we have no tree, so of course, no last_eb. */ - if (!fe->id2.i_list.l_tree_depth) - goto bail; + if (!path->p_tree_depth) + goto out; /* trunc to zero special case - this makes tree_depth = 0 * regardless of what it is. */ if (!new_i_clusters) - goto bail; + goto out; - eb = (struct ocfs2_extent_block *) old_last_eb->b_data; - el = &(eb->h_list); + el = path_leaf_el(path); BUG_ON(!el->l_next_free_rec); /* Make sure that this guy will actually be empty after we * clear away the data. */ - if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters) - goto bail; + if (ocfs2_is_empty_extent(&el->l_recs[0])) { + if (le16_to_cpu(el->l_next_free_rec) > 1 && + le32_to_cpu(el->l_recs[1].e_cpos) < new_i_clusters) + goto out; + } else if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters) + goto out; - /* Ok, at this point, we know that last_eb will definitely - * change, so lets traverse the tree and find the second to - * last extent block. */ - el = &(fe->id2.i_list); - /* go down the tree, */ - do { - for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) { - if (le32_to_cpu(el->l_recs[i].e_cpos) < - new_i_clusters) { - block = le64_to_cpu(el->l_recs[i].e_blkno); - break; - } - } - BUG_ON(i < 0); + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } - if (bh) { - brelse(bh); - bh = NULL; - } + ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } - status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED, - inode); - if (status < 0) { - mlog_errno(status); - goto bail; - } - eb = (struct ocfs2_extent_block *) bh->b_data; - el = &eb->h_list; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - status = -EIO; - goto bail; - } - } while (el->l_tree_depth); + eb = (struct ocfs2_extent_block *) bh->b_data; + el = &eb->h_list; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + ret = -EROFS; + goto out; + } *new_last_eb = bh; get_bh(*new_last_eb); - mlog(0, "returning block %llu\n", - (unsigned long long)le64_to_cpu(eb->h_blkno)); -bail: - if (bh) - brelse(bh); + mlog(0, "returning block %llu, (cpos: %u)\n", + (unsigned long long)le64_to_cpu(eb->h_blkno), cpos); +out: + brelse(bh); - return status; + return ret; } static int ocfs2_do_truncate(struct ocfs2_super *osb, unsigned int clusters_to_del, struct inode *inode, struct buffer_head *fe_bh, - struct buffer_head *old_last_eb_bh, handle_t *handle, - struct ocfs2_truncate_context *tc) + struct ocfs2_truncate_context *tc, + struct ocfs2_path *path) { - int status, i, depth; + int status, i, index; struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; struct ocfs2_extent_block *last_eb = NULL; struct ocfs2_extent_list *el; struct buffer_head *eb_bh = NULL; struct buffer_head *last_eb_bh = NULL; - u64 next_eb = 0; u64 delete_blk = 0; fe = (struct ocfs2_dinode *) fe_bh->b_data; - status = ocfs2_find_new_last_ext_blk(osb, - inode, - fe, + status = ocfs2_find_new_last_ext_blk(inode, le32_to_cpu(fe->i_clusters) - - clusters_to_del, - old_last_eb_bh, - &last_eb_bh); + clusters_to_del, + path, &last_eb_bh); if (status < 0) { mlog_errno(status); goto bail; } - if (last_eb_bh) + + /* + * Each component will be touched, so we might as well journal + * here to avoid having to handle errors later. + */ + for (i = 0; i < path_num_items(path); i++) { + status = ocfs2_journal_access(handle, inode, + path->p_node[i].bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + if (last_eb_bh) { + status = ocfs2_journal_access(handle, inode, last_eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + } - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); + el = &(fe->id2.i_list); + + /* + * Lower levels depend on this never happening, but it's best + * to check it up here before changing the tree. + */ + if (el->l_tree_depth && ocfs2_is_empty_extent(&el->l_recs[0])) { + ocfs2_error(inode->i_sb, + "Inode %lu has an empty extent record, depth %u\n", + inode->i_ino, le16_to_cpu(el->l_tree_depth)); goto bail; } - el = &(fe->id2.i_list); spin_lock(&OCFS2_I(inode)->ip_lock); OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - clusters_to_del; spin_unlock(&OCFS2_I(inode)->ip_lock); le32_add_cpu(&fe->i_clusters, -clusters_to_del); - fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec); - fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec); i = le16_to_cpu(el->l_next_free_rec) - 1; @@ -1589,26 +3064,34 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, /* tree depth zero, we can just delete the clusters, otherwise * we need to record the offset of the next level extent block * as we may overwrite it. */ - if (!el->l_tree_depth) + if (!el->l_tree_depth) { delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) + ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(el->l_recs[i].e_clusters)); - else - next_eb = le64_to_cpu(el->l_recs[i].e_blkno); - if (!el->l_recs[i].e_clusters) { - /* if we deleted the whole extent record, then clear - * out the other fields and update the extent - * list. For depth > 0 trees, we've already recorded - * the extent block in 'next_eb' */ - el->l_recs[i].e_cpos = 0; - el->l_recs[i].e_blkno = 0; - BUG_ON(!el->l_next_free_rec); - le16_add_cpu(&el->l_next_free_rec, -1); + if (!el->l_recs[i].e_clusters) { + /* if we deleted the whole extent record, then clear + * out the other fields and update the extent + * list. + */ + el->l_recs[i].e_cpos = 0; + el->l_recs[i].e_blkno = 0; + BUG_ON(!el->l_next_free_rec); + le16_add_cpu(&el->l_next_free_rec, -1); + + /* + * The leftmost record might be an empty extent - + * delete it here too. + */ + if (i == 1 && ocfs2_is_empty_extent(&el->l_recs[0])) { + el->l_recs[0].e_cpos = 0; + el->l_recs[0].e_blkno = 0; + el->l_next_free_rec = 0; + } + } } - depth = le16_to_cpu(el->l_tree_depth); - if (!fe->i_clusters) { + if (le32_to_cpu(fe->i_clusters) == 0) { /* trunc to zero is a special case. */ el->l_tree_depth = 0; fe->i_last_eb_blk = 0; @@ -1625,12 +3108,6 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, /* If there will be a new last extent block, then by * definition, there cannot be any leaves to the right of * him. */ - status = ocfs2_journal_access(handle, inode, last_eb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto bail; - } last_eb->h_next_leaf_blk = 0; status = ocfs2_journal_dirty(handle, last_eb_bh); if (status < 0) { @@ -1639,33 +3116,25 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, } } + index = 1; /* if our tree depth > 0, update all the tree blocks below us. */ - while (depth) { - mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n", - depth, (unsigned long long)next_eb); - status = ocfs2_read_block(osb, next_eb, &eb_bh, - OCFS2_BH_CACHED, inode); - if (status < 0) { - mlog_errno(status); - goto bail; - } + while (index <= path->p_tree_depth) { + eb_bh = path->p_node[index].bh; eb = (struct ocfs2_extent_block *)eb_bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - status = -EIO; - goto bail; - } - el = &(eb->h_list); + el = path->p_node[index].el; - status = ocfs2_journal_access(handle, inode, eb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto bail; - } + mlog(0, "traveling tree (index = %d, extent block: %llu)\n", + index, (unsigned long long)eb_bh->b_blocknr); BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); - BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1)); + if (index != + (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) { + ocfs2_error(inode->i_sb, + "Inode %lu has invalid ext. block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + goto bail; + } i = le16_to_cpu(el->l_next_free_rec) - 1; @@ -1680,7 +3149,6 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); - next_eb = le64_to_cpu(el->l_recs[i].e_blkno); /* bottom-most block requires us to delete data.*/ if (!el->l_tree_depth) delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) @@ -1692,6 +3160,12 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, BUG_ON(!el->l_next_free_rec); le16_add_cpu(&el->l_next_free_rec, -1); } + if (i == 1 && ocfs2_is_empty_extent(&el->l_recs[0])) { + el->l_recs[0].e_cpos = 0; + el->l_recs[0].e_blkno = 0; + el->l_next_free_rec = 0; + } + mlog(0, "extent block %llu, after: record %d: " "(%u, %u, %llu), next = %u\n", (unsigned long long)le64_to_cpu(eb->h_blkno), i, @@ -1714,6 +3188,22 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, BUG_ON(el->l_recs[0].e_clusters); BUG_ON(el->l_recs[0].e_cpos); BUG_ON(el->l_recs[0].e_blkno); + + /* + * We need to remove this extent block from + * the list above it. + * + * Since we've passed it already in this loop, + * no need to worry about journaling. + */ + el = path->p_node[index - 1].el; + i = le16_to_cpu(el->l_next_free_rec) - 1; + BUG_ON(i < 0); + el->l_recs[i].e_cpos = 0; + el->l_recs[i].e_clusters = 0; + el->l_recs[i].e_blkno = 0; + le16_add_cpu(&el->l_next_free_rec, -1); + if (eb->h_suballoc_slot == 0) { /* * This code only understands how to @@ -1736,9 +3226,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, } } } - brelse(eb_bh); - eb_bh = NULL; - depth--; + index++; } BUG_ON(!delete_blk); @@ -1750,10 +3238,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, } status = 0; bail: - if (!status) - ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters)); - else - ocfs2_extent_map_drop(inode, 0); + mlog_exit(status); return status; } @@ -1770,82 +3255,70 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, struct ocfs2_truncate_context *tc) { int status, i, credits, tl_sem = 0; - u32 clusters_to_del, target_i_clusters; - u64 last_eb = 0; - struct ocfs2_dinode *fe; - struct ocfs2_extent_block *eb; + u32 clusters_to_del, new_highest_cpos, range; struct ocfs2_extent_list *el; - struct buffer_head *last_eb_bh; handle_t *handle = NULL; struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_path *path = NULL; mlog_entry_void(); down_write(&OCFS2_I(inode)->ip_alloc_sem); - target_i_clusters = ocfs2_clusters_for_bytes(osb->sb, + new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, i_size_read(inode)); - last_eb_bh = tc->tc_last_eb_bh; - tc->tc_last_eb_bh = NULL; - - fe = (struct ocfs2_dinode *) fe_bh->b_data; - - if (fe->id2.i_list.l_tree_depth) { - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - el = &eb->h_list; - } else - el = &fe->id2.i_list; - last_eb = le64_to_cpu(fe->i_last_eb_blk); + path = ocfs2_new_inode_path(fe_bh); + if (!path) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } start: - mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, " - "last_eb = %llu, fe->i_last_eb_blk = %llu, " - "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n", - le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb, - (unsigned long long)le64_to_cpu(fe->i_last_eb_blk), - le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh); - - if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) { - mlog(0, "last_eb changed!\n"); - BUG_ON(!fe->id2.i_list.l_tree_depth); - last_eb = le64_to_cpu(fe->i_last_eb_blk); - /* i_last_eb_blk may have changed, read it if - * necessary. We don't have to worry about the - * truncate to zero case here (where there becomes no - * last_eb) because we never loop back after our work - * is done. */ - if (last_eb_bh) { - brelse(last_eb_bh); - last_eb_bh = NULL; - } - - status = ocfs2_read_block(osb, last_eb, - &last_eb_bh, OCFS2_BH_CACHED, - inode); - if (status < 0) { - mlog_errno(status); - goto bail; - } - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - status = -EIO; - goto bail; - } - el = &(eb->h_list); + /* + * Truncate always works against the rightmost tree branch. + */ + status = ocfs2_find_path(inode, path, UINT_MAX); + if (status) { + mlog_errno(status); + goto bail; } - /* by now, el will point to the extent list on the bottom most - * portion of this tree. */ + mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n", + OCFS2_I(inode)->ip_clusters, path->p_tree_depth); + + /* + * By now, el will point to the extent list on the bottom most + * portion of this tree. Only the tail record is considered in + * each pass. + * + * We handle the following cases, in order: + * - empty extent: delete the remaining branch + * - remove the entire record + * - remove a partial record + * - no record needs to be removed (truncate has completed) + */ + el = path_leaf_el(path); i = le16_to_cpu(el->l_next_free_rec) - 1; - if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters) + range = le32_to_cpu(el->l_recs[i].e_cpos) + + le32_to_cpu(el->l_recs[i].e_clusters); + if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) { + clusters_to_del = 0; + } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters); - else + } else if (range > new_highest_cpos) { clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) + le32_to_cpu(el->l_recs[i].e_cpos)) - - target_i_clusters; + new_highest_cpos; + } else { + status = 0; + goto bail; + } - mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del); + mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", + clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); + + BUG_ON(clusters_to_del == 0); mutex_lock(&tl_inode->i_mutex); tl_sem = 1; @@ -1861,7 +3334,8 @@ start: } credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, - fe, el); + (struct ocfs2_dinode *)fe_bh->b_data, + el); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); @@ -1870,13 +3344,8 @@ start: goto bail; } - inode->i_ctime = inode->i_mtime = CURRENT_TIME; - status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); - if (status < 0) - mlog_errno(status); - - status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, - last_eb_bh, handle, tc); + status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle, + tc, path); if (status < 0) { mlog_errno(status); goto bail; @@ -1888,8 +3357,12 @@ start: ocfs2_commit_trans(osb, handle); handle = NULL; - BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters); - if (le32_to_cpu(fe->i_clusters) > target_i_clusters) + ocfs2_reinit_path(path, 1); + + /* + * Only loop if we still have allocation. + */ + if (OCFS2_I(inode)->ip_clusters) goto start; bail: up_write(&OCFS2_I(inode)->ip_alloc_sem); @@ -1902,8 +3375,7 @@ bail: if (handle) ocfs2_commit_trans(osb, handle); - if (last_eb_bh) - brelse(last_eb_bh); + ocfs2_free_path(path); /* This will drop the ext_alloc cluster lock for us */ ocfs2_free_truncate_context(tc); @@ -1912,7 +3384,6 @@ bail: return status; } - /* * Expects the inode to already be locked. This will figure out which * inodes need to be locked and will put them on the returned truncate @@ -1923,7 +3394,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, struct buffer_head *fe_bh, struct ocfs2_truncate_context **tc) { - int status, metadata_delete; + int status, metadata_delete, i; unsigned int new_i_clusters; struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; @@ -1944,7 +3415,8 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, "%llu\n", fe->i_clusters, new_i_clusters, (unsigned long long)fe->i_size); - if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) { + if (!ocfs2_sparse_alloc(osb) && + le32_to_cpu(fe->i_clusters) <= new_i_clusters) { ocfs2_error(inode->i_sb, "Dinode %llu has cluster count " "%u and size %llu whereas struct inode has " "cluster count %u and size %llu which caused an " @@ -1986,7 +3458,15 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, goto bail; } el = &(eb->h_list); - if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters) + + i = 0; + if (ocfs2_is_empty_extent(&el->l_recs[0])) + i = 1; + /* + * XXX: Should we check that next_free_rec contains + * the extent? + */ + if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters) metadata_delete = 1; } diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 0b82e80..b0880fd 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -31,7 +31,8 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, struct buffer_head *fe_bh, - u64 blkno, + u32 cpos, + u64 start_blk, u32 new_clusters, struct ocfs2_alloc_context *meta_ac); int ocfs2_num_free_extents(struct ocfs2_super *osb, diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 66821e1..5d211c5 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -365,8 +365,10 @@ int ocfs2_do_extend_dir(struct super_block *sb, spin_unlock(&OCFS2_I(dir)->ip_lock); if (extend) { - status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1, - parent_fe_bh, handle, + u32 offset = OCFS2_I(dir)->ip_clusters; + + status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset, + 1, parent_fe_bh, handle, data_ac, meta_ac, NULL); BUG_ON(status == -EAGAIN); if (status < 0) { diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 25e36fb..8c97fa1 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -397,6 +397,7 @@ bail: */ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, struct inode *inode, + u32 *logical_offset, u32 clusters_to_add, struct buffer_head *fe_bh, handle_t *handle, @@ -460,18 +461,14 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, block = ocfs2_clusters_to_blocks(osb->sb, bit_off); mlog(0, "Allocating %u clusters at block %u for inode %llu\n", num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, - num_bits, meta_ac); + status = ocfs2_insert_extent(osb, handle, inode, fe_bh, + *logical_offset, block, num_bits, + meta_ac); if (status < 0) { mlog_errno(status); goto leave; } - le32_add_cpu(&fe->i_clusters, num_bits); - spin_lock(&OCFS2_I(inode)->ip_lock); - OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); - spin_unlock(&OCFS2_I(inode)->ip_lock); - status = ocfs2_journal_dirty(handle, fe_bh); if (status < 0) { mlog_errno(status); @@ -479,6 +476,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, } clusters_to_add -= num_bits; + *logical_offset += num_bits; if (clusters_to_add) { mlog(0, "need to alloc once more, clusters = %u, wanted = " @@ -501,7 +499,7 @@ static int ocfs2_extend_allocation(struct inode *inode, int restart_func = 0; int drop_alloc_sem = 0; int credits, num_free_extents; - u32 prev_clusters; + u32 prev_clusters, logical_start; struct buffer_head *bh = NULL; struct ocfs2_dinode *fe = NULL; handle_t *handle = NULL; @@ -512,6 +510,12 @@ static int ocfs2_extend_allocation(struct inode *inode, mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); + /* + * This function only exists for file systems which don't + * support holes. + */ + BUG_ON(ocfs2_sparse_alloc(osb)); + status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, OCFS2_BH_CACHED, inode); if (status < 0) { @@ -526,6 +530,8 @@ static int ocfs2_extend_allocation(struct inode *inode, goto leave; } + logical_start = OCFS2_I(inode)->ip_clusters; + restart_all: BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); @@ -590,6 +596,7 @@ restarted_transaction: status = ocfs2_do_extend_allocation(osb, inode, + &logical_start, clusters_to_add, bh, handle, diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index cc973f0..e2f6551 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -39,6 +39,7 @@ enum ocfs2_alloc_restarted { }; int ocfs2_do_extend_allocation(struct ocfs2_super *osb, struct inode *inode, + u32 *cluster_start, u32 clusters_to_add, struct buffer_head *fe_bh, handle_t *handle, diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index a93c15f..d65fef4 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1671,8 +1671,11 @@ static int ocfs2_symlink(struct inode *dir, inode->i_rdev = 0; newsize = l - 1; if (l > ocfs2_fast_symlink_chars(sb)) { + u32 offset = 0; + inode->i_op = &ocfs2_symlink_inode_operations; - status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh, + status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, + new_fe_bh, handle, data_ac, NULL, NULL); if (status < 0) { diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index db8e77c..fe7e1ec 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -303,6 +303,13 @@ static inline int ocfs2_should_order_data(struct inode *inode) return 1; } +static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) + return 1; + return 0; +} + /* set / clear functions because cluster events can make these happen * in parallel so we want the transitions to be atomic. this also * means that any future flags osb_flags must be protected by spinlock diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index a476b63..f010197 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -86,7 +86,8 @@ OCFS2_SB(sb)->s_feature_incompat &= ~(mask) #define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB -#define OCFS2_FEATURE_INCOMPAT_SUPP OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT +#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ + | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) #define OCFS2_FEATURE_RO_COMPAT_SUPP 0 /* @@ -311,7 +312,10 @@ struct ocfs2_extent_list { /*00*/ __le16 l_tree_depth; /* Extent tree depth from this point. 0 means data extents hang directly off this - header (a leaf) */ + header (a leaf) + NOTE: The high 8 bits cannot be + used - tree_depth is never that big. + */ __le16 l_count; /* Number of extent records */ __le16 l_next_free_rec; /* Next unused extent slot */ __le16 l_reserved1; |