aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2')
-rw-r--r--fs/ocfs2/acl.c36
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/aops.c9
-rw-r--r--fs/ocfs2/aops.h3
-rw-r--r--fs/ocfs2/blockcheck.c4
-rw-r--r--fs/ocfs2/cluster/heartbeat.c532
-rw-r--r--fs/ocfs2/cluster/heartbeat.h4
-rw-r--r--fs/ocfs2/cluster/masklog.h3
-rw-r--r--fs/ocfs2/cluster/nodemanager.c5
-rw-r--r--fs/ocfs2/cluster/ocfs2_nodemanager.h6
-rw-r--r--fs/ocfs2/cluster/tcp.c24
-rw-r--r--fs/ocfs2/dcache.c33
-rw-r--r--fs/ocfs2/dcache.h1
-rw-r--r--fs/ocfs2/dir.c24
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h30
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c21
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c401
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c49
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c22
-rw-r--r--fs/ocfs2/dlm/dlmthread.c114
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c1
-rw-r--r--fs/ocfs2/dlmglue.c8
-rw-r--r--fs/ocfs2/dlmglue.h1
-rw-r--r--fs/ocfs2/file.c87
-rw-r--r--fs/ocfs2/inode.c7
-rw-r--r--fs/ocfs2/inode.h12
-rw-r--r--fs/ocfs2/ioctl.c356
-rw-r--r--fs/ocfs2/journal.c9
-rw-r--r--fs/ocfs2/journal.h3
-rw-r--r--fs/ocfs2/mmap.c15
-rw-r--r--fs/ocfs2/namei.c305
-rw-r--r--fs/ocfs2/ocfs2.h63
-rw-r--r--fs/ocfs2/ocfs2_fs.h83
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h103
-rw-r--r--fs/ocfs2/refcounttree.c73
-rw-r--r--fs/ocfs2/refcounttree.h7
-rw-r--r--fs/ocfs2/reservations.c22
-rw-r--r--fs/ocfs2/slot_map.c2
-rw-r--r--fs/ocfs2/stack_o2cb.c2
-rw-r--r--fs/ocfs2/stack_user.c4
-rw-r--r--fs/ocfs2/suballoc.c239
-rw-r--r--fs/ocfs2/suballoc.h21
-rw-r--r--fs/ocfs2/super.c170
-rw-r--r--fs/ocfs2/symlink.c2
-rw-r--r--fs/ocfs2/sysfile.c60
-rw-r--r--fs/ocfs2/xattr.c6
46 files changed, 2563 insertions, 421 deletions
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index da70229..3919150 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -209,7 +209,10 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
}
inode->i_mode = new_mode;
+ inode->i_ctime = CURRENT_TIME;
di->i_mode = cpu_to_le16(inode->i_mode);
+ di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
ocfs2_journal_dirty(handle, di_bh);
@@ -290,12 +293,30 @@ static int ocfs2_set_acl(handle_t *handle,
int ocfs2_check_acl(struct inode *inode, int mask)
{
- struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *di_bh = NULL;
+ struct posix_acl *acl;
+ int ret = -EAGAIN;
- if (IS_ERR(acl))
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+ return ret;
+
+ ret = ocfs2_read_inode_block(inode, &di_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, di_bh);
+
+ brelse(di_bh);
+
+ if (IS_ERR(acl)) {
+ mlog_errno(PTR_ERR(acl));
return PTR_ERR(acl);
+ }
if (acl) {
- int ret = posix_acl_permission(inode, acl, mask);
+ ret = posix_acl_permission(inode, acl, mask);
posix_acl_release(acl);
return ret;
}
@@ -344,7 +365,7 @@ int ocfs2_init_acl(handle_t *handle,
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct posix_acl *acl = NULL;
- int ret = 0;
+ int ret = 0, ret2;
mode_t mode;
if (!S_ISLNK(inode->i_mode)) {
@@ -381,7 +402,12 @@ int ocfs2_init_acl(handle_t *handle,
mode = inode->i_mode;
ret = posix_acl_create_masq(clone, &mode);
if (ret >= 0) {
- ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+ ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+ if (ret2) {
+ mlog_errno(ret2);
+ ret = ret2;
+ goto cleanup;
+ }
if (ret > 0) {
ret = ocfs2_set_acl(handle, inode,
di_bh, ACL_TYPE_ACCESS,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 215e12c..592fae5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6672,7 +6672,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
last_page_bytes = PAGE_ALIGN(end);
index = start >> PAGE_CACHE_SHIFT;
do {
- pages[numpages] = grab_cache_page(mapping, index);
+ pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
if (!pages[numpages]) {
ret = -ENOMEM;
mlog_errno(ret);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0de69c9..5cfeee1 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -883,8 +883,8 @@ struct ocfs2_write_ctxt {
* out in so that future reads from that region will get
* zero's.
*/
- struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
unsigned int w_num_pages;
+ struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
struct page *w_target_page;
/*
@@ -1642,7 +1642,8 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
return ret;
}
-int ocfs2_write_begin_nolock(struct address_space *mapping,
+int ocfs2_write_begin_nolock(struct file *filp,
+ struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page)
@@ -1692,7 +1693,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
mlog_errno(ret);
goto out;
} else if (ret == 1) {
- ret = ocfs2_refcount_cow(inode, di_bh,
+ ret = ocfs2_refcount_cow(inode, filp, di_bh,
wc->w_cpos, wc->w_clen, UINT_MAX);
if (ret) {
mlog_errno(ret);
@@ -1854,7 +1855,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
+ ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
fsdata, di_bh, NULL);
if (ret) {
mlog_errno(ret);
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index c48e93f..7606f66 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -48,7 +48,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
-int ocfs2_write_begin_nolock(struct address_space *mapping,
+int ocfs2_write_begin_nolock(struct file *filp,
+ struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page);
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index ec6d123..c7ee03c 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -439,7 +439,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
ocfs2_blockcheck_inc_failure(stats);
mlog(ML_ERROR,
- "CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
+ "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n",
(unsigned int)check.bc_crc32e, (unsigned int)crc);
/* Ok, try ECC fixups */
@@ -453,7 +453,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
goto out;
}
- mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+ mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n",
(unsigned int)check.bc_crc32e, (unsigned int)crc);
rc = -EIO;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 41d5f1f..52c7557 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -62,10 +62,51 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
static LIST_HEAD(o2hb_node_events);
static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
+/*
+ * In global heartbeat, we maintain a series of region bitmaps.
+ * - o2hb_region_bitmap allows us to limit the region number to max region.
+ * - o2hb_live_region_bitmap tracks live regions (seen steady iterations).
+ * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes
+ * heartbeat on it.
+ * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts.
+ */
+static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+
+#define O2HB_DB_TYPE_LIVENODES 0
+#define O2HB_DB_TYPE_LIVEREGIONS 1
+#define O2HB_DB_TYPE_QUORUMREGIONS 2
+#define O2HB_DB_TYPE_FAILEDREGIONS 3
+#define O2HB_DB_TYPE_REGION_LIVENODES 4
+#define O2HB_DB_TYPE_REGION_NUMBER 5
+#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6
+struct o2hb_debug_buf {
+ int db_type;
+ int db_size;
+ int db_len;
+ void *db_data;
+};
+
+static struct o2hb_debug_buf *o2hb_db_livenodes;
+static struct o2hb_debug_buf *o2hb_db_liveregions;
+static struct o2hb_debug_buf *o2hb_db_quorumregions;
+static struct o2hb_debug_buf *o2hb_db_failedregions;
+
#define O2HB_DEBUG_DIR "o2hb"
#define O2HB_DEBUG_LIVENODES "livenodes"
+#define O2HB_DEBUG_LIVEREGIONS "live_regions"
+#define O2HB_DEBUG_QUORUMREGIONS "quorum_regions"
+#define O2HB_DEBUG_FAILEDREGIONS "failed_regions"
+#define O2HB_DEBUG_REGION_NUMBER "num"
+#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms"
+
static struct dentry *o2hb_debug_dir;
static struct dentry *o2hb_debug_livenodes;
+static struct dentry *o2hb_debug_liveregions;
+static struct dentry *o2hb_debug_quorumregions;
+static struct dentry *o2hb_debug_failedregions;
static LIST_HEAD(o2hb_all_regions);
@@ -77,7 +118,19 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
#define O2HB_DEFAULT_BLOCK_BITS 9
+enum o2hb_heartbeat_modes {
+ O2HB_HEARTBEAT_LOCAL = 0,
+ O2HB_HEARTBEAT_GLOBAL,
+ O2HB_HEARTBEAT_NUM_MODES,
+};
+
+char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
+ "local", /* O2HB_HEARTBEAT_LOCAL */
+ "global", /* O2HB_HEARTBEAT_GLOBAL */
+};
+
unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
+unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
/* Only sets a new threshold if there are no active regions.
*
@@ -94,6 +147,22 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
}
}
+static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode)
+{
+ int ret = -1;
+
+ if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) {
+ spin_lock(&o2hb_live_lock);
+ if (list_empty(&o2hb_all_regions)) {
+ o2hb_heartbeat_mode = hb_mode;
+ ret = 0;
+ }
+ spin_unlock(&o2hb_live_lock);
+ }
+
+ return ret;
+}
+
struct o2hb_node_event {
struct list_head hn_item;
enum o2hb_callback_type hn_event_type;
@@ -135,6 +204,18 @@ struct o2hb_region {
struct block_device *hr_bdev;
struct o2hb_disk_slot *hr_slots;
+ /* live node map of this region */
+ unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned int hr_region_num;
+
+ struct dentry *hr_debug_dir;
+ struct dentry *hr_debug_livenodes;
+ struct dentry *hr_debug_regnum;
+ struct dentry *hr_debug_elapsed_time;
+ struct o2hb_debug_buf *hr_db_livenodes;
+ struct o2hb_debug_buf *hr_db_regnum;
+ struct o2hb_debug_buf *hr_db_elapsed_time;
+
/* let the person setting up hb wait for it to return until it
* has reached a 'steady' state. This will be fixed when we have
* a more complete api that doesn't lead to this sort of fragility. */
@@ -163,8 +244,19 @@ struct o2hb_bio_wait_ctxt {
int wc_error;
};
+static int o2hb_pop_count(void *map, int count)
+{
+ int i = -1, pop = 0;
+
+ while ((i = find_next_bit(map, count, i + 1)) < count)
+ pop++;
+ return pop;
+}
+
static void o2hb_write_timeout(struct work_struct *work)
{
+ int failed, quorum;
+ unsigned long flags;
struct o2hb_region *reg =
container_of(work, struct o2hb_region,
hr_write_timeout_work.work);
@@ -172,6 +264,28 @@ static void o2hb_write_timeout(struct work_struct *work)
mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
"milliseconds\n", reg->hr_dev_name,
jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
+
+ if (o2hb_global_heartbeat_active()) {
+ spin_lock_irqsave(&o2hb_live_lock, flags);
+ if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
+ failed = o2hb_pop_count(&o2hb_failed_region_bitmap,
+ O2NM_MAX_REGIONS);
+ quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap,
+ O2NM_MAX_REGIONS);
+ spin_unlock_irqrestore(&o2hb_live_lock, flags);
+
+ mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
+ quorum, failed);
+
+ /*
+ * Fence if the number of failed regions >= half the number
+ * of quorum regions
+ */
+ if ((failed << 1) < quorum)
+ return;
+ }
+
o2quo_disk_timeout();
}
@@ -180,6 +294,11 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
O2HB_MAX_WRITE_TIMEOUT_MS);
+ if (o2hb_global_heartbeat_active()) {
+ spin_lock(&o2hb_live_lock);
+ clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
+ spin_unlock(&o2hb_live_lock);
+ }
cancel_delayed_work(&reg->hr_write_timeout_work);
reg->hr_last_timeout_start = jiffies;
schedule_delayed_work(&reg->hr_write_timeout_work,
@@ -513,6 +632,8 @@ static void o2hb_queue_node_event(struct o2hb_node_event *event,
{
assert_spin_locked(&o2hb_live_lock);
+ BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB));
+
event->hn_event_type = type;
event->hn_node = node;
event->hn_node_num = node_num;
@@ -554,6 +675,35 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
o2nm_node_put(node);
}
+static void o2hb_set_quorum_device(struct o2hb_region *reg,
+ struct o2hb_disk_slot *slot)
+{
+ assert_spin_locked(&o2hb_live_lock);
+
+ if (!o2hb_global_heartbeat_active())
+ return;
+
+ if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ return;
+
+ /*
+ * A region can be added to the quorum only when it sees all
+ * live nodes heartbeat on it. In other words, the region has been
+ * added to all nodes.
+ */
+ if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
+ sizeof(o2hb_live_node_bitmap)))
+ return;
+
+ if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD)
+ return;
+
+ printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n",
+ config_item_name(&reg->hr_item));
+
+ set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+}
+
static int o2hb_check_slot(struct o2hb_region *reg,
struct o2hb_disk_slot *slot)
{
@@ -565,14 +715,22 @@ static int o2hb_check_slot(struct o2hb_region *reg,
u64 cputime;
unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
unsigned int slot_dead_ms;
+ int tmp;
memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
- /* Is this correct? Do we assume that the node doesn't exist
- * if we're not configured for him? */
+ /*
+ * If a node is no longer configured but is still in the livemap, we
+ * may need to clear that bit from the livemap.
+ */
node = o2nm_get_node_by_num(slot->ds_node_num);
- if (!node)
- return 0;
+ if (!node) {
+ spin_lock(&o2hb_live_lock);
+ tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+ spin_unlock(&o2hb_live_lock);
+ if (!tmp)
+ return 0;
+ }
if (!o2hb_verify_crc(reg, hb_block)) {
/* all paths from here will drop o2hb_live_lock for
@@ -639,8 +797,12 @@ fire_callbacks:
mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n",
slot->ds_node_num, (long long)slot->ds_last_generation);
+ set_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
+
/* first on the list generates a callback */
if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+ mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes "
+ "bitmap\n", slot->ds_node_num);
set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
@@ -684,13 +846,18 @@ fire_callbacks:
mlog(ML_HEARTBEAT, "Node %d left my region\n",
slot->ds_node_num);
+ clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
+
/* last off the live_slot generates a callback */
list_del_init(&slot->ds_live_item);
if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+ mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live "
+ "nodes bitmap\n", slot->ds_node_num);
clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
- o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
- slot->ds_node_num);
+ /* node can be null */
+ o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB,
+ node, slot->ds_node_num);
changed = 1;
}
@@ -706,11 +873,14 @@ fire_callbacks:
slot->ds_equal_samples = 0;
}
out:
+ o2hb_set_quorum_device(reg, slot);
+
spin_unlock(&o2hb_live_lock);
o2hb_run_event_list(&event);
- o2nm_node_put(node);
+ if (node)
+ o2nm_node_put(node);
return changed;
}
@@ -737,6 +907,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
{
int i, ret, highest_node, change = 0;
unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
struct o2hb_bio_wait_ctxt write_wc;
ret = o2nm_configured_node_map(configured_nodes,
@@ -746,6 +917,17 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
return ret;
}
+ /*
+ * If a node is not configured but is in the livemap, we still need
+ * to read the slot so as to be able to remove it from the livemap.
+ */
+ o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+ i = -1;
+ while ((i = find_next_bit(live_node_bitmap,
+ O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+ set_bit(i, configured_nodes);
+ }
+
highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
if (highest_node >= O2NM_MAX_NODES) {
mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
@@ -917,21 +1099,59 @@ static int o2hb_thread(void *data)
#ifdef CONFIG_DEBUG_FS
static int o2hb_debug_open(struct inode *inode, struct file *file)
{
+ struct o2hb_debug_buf *db = inode->i_private;
+ struct o2hb_region *reg;
unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
char *buf = NULL;
int i = -1;
int out = 0;
+ /* max_nodes should be the largest bitmap we pass here */
+ BUG_ON(sizeof(map) < db->db_size);
+
buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!buf)
goto bail;
- o2hb_fill_node_map(map, sizeof(map));
+ switch (db->db_type) {
+ case O2HB_DB_TYPE_LIVENODES:
+ case O2HB_DB_TYPE_LIVEREGIONS:
+ case O2HB_DB_TYPE_QUORUMREGIONS:
+ case O2HB_DB_TYPE_FAILEDREGIONS:
+ spin_lock(&o2hb_live_lock);
+ memcpy(map, db->db_data, db->db_size);
+ spin_unlock(&o2hb_live_lock);
+ break;
+
+ case O2HB_DB_TYPE_REGION_LIVENODES:
+ spin_lock(&o2hb_live_lock);
+ reg = (struct o2hb_region *)db->db_data;
+ memcpy(map, reg->hr_live_node_bitmap, db->db_size);
+ spin_unlock(&o2hb_live_lock);
+ break;
+
+ case O2HB_DB_TYPE_REGION_NUMBER:
+ reg = (struct o2hb_region *)db->db_data;
+ out += snprintf(buf + out, PAGE_SIZE - out, "%d\n",
+ reg->hr_region_num);
+ goto done;
+
+ case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
+ reg = (struct o2hb_region *)db->db_data;
+ out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
+ jiffies_to_msecs(jiffies -
+ reg->hr_last_timeout_start));
+ goto done;
+
+ default:
+ goto done;
+ }
- while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+ while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len)
out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
out += snprintf(buf + out, PAGE_SIZE - out, "\n");
+done:
i_size_write(inode, out);
file->private_data = buf;
@@ -978,10 +1198,104 @@ static const struct file_operations o2hb_debug_fops = {
void o2hb_exit(void)
{
- if (o2hb_debug_livenodes)
- debugfs_remove(o2hb_debug_livenodes);
- if (o2hb_debug_dir)
- debugfs_remove(o2hb_debug_dir);
+ kfree(o2hb_db_livenodes);
+ kfree(o2hb_db_liveregions);
+ kfree(o2hb_db_quorumregions);
+ kfree(o2hb_db_failedregions);
+ debugfs_remove(o2hb_debug_failedregions);
+ debugfs_remove(o2hb_debug_quorumregions);
+ debugfs_remove(o2hb_debug_liveregions);
+ debugfs_remove(o2hb_debug_livenodes);
+ debugfs_remove(o2hb_debug_dir);
+}
+
+static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
+ struct o2hb_debug_buf **db, int db_len,
+ int type, int size, int len, void *data)
+{
+ *db = kmalloc(db_len, GFP_KERNEL);
+ if (!*db)
+ return NULL;
+
+ (*db)->db_type = type;
+ (*db)->db_size = size;
+ (*db)->db_len = len;
+ (*db)->db_data = data;
+
+ return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db,
+ &o2hb_debug_fops);
+}
+
+static int o2hb_debug_init(void)
+{
+ int ret = -ENOMEM;
+
+ o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
+ if (!o2hb_debug_dir) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES,
+ o2hb_debug_dir,
+ &o2hb_db_livenodes,
+ sizeof(*o2hb_db_livenodes),
+ O2HB_DB_TYPE_LIVENODES,
+ sizeof(o2hb_live_node_bitmap),
+ O2NM_MAX_NODES,
+ o2hb_live_node_bitmap);
+ if (!o2hb_debug_livenodes) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS,
+ o2hb_debug_dir,
+ &o2hb_db_liveregions,
+ sizeof(*o2hb_db_liveregions),
+ O2HB_DB_TYPE_LIVEREGIONS,
+ sizeof(o2hb_live_region_bitmap),
+ O2NM_MAX_REGIONS,
+ o2hb_live_region_bitmap);
+ if (!o2hb_debug_liveregions) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ o2hb_debug_quorumregions =
+ o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS,
+ o2hb_debug_dir,
+ &o2hb_db_quorumregions,
+ sizeof(*o2hb_db_quorumregions),
+ O2HB_DB_TYPE_QUORUMREGIONS,
+ sizeof(o2hb_quorum_region_bitmap),
+ O2NM_MAX_REGIONS,
+ o2hb_quorum_region_bitmap);
+ if (!o2hb_debug_quorumregions) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ o2hb_debug_failedregions =
+ o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS,
+ o2hb_debug_dir,
+ &o2hb_db_failedregions,
+ sizeof(*o2hb_db_failedregions),
+ O2HB_DB_TYPE_FAILEDREGIONS,
+ sizeof(o2hb_failed_region_bitmap),
+ O2NM_MAX_REGIONS,
+ o2hb_failed_region_bitmap);
+ if (!o2hb_debug_failedregions) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ ret = 0;
+bail:
+ if (ret)
+ o2hb_exit();
+
+ return ret;
}
int o2hb_init(void)
@@ -997,24 +1311,12 @@ int o2hb_init(void)
INIT_LIST_HEAD(&o2hb_node_events);
memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+ memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
+ memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
+ memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
+ memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
- o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
- if (!o2hb_debug_dir) {
- mlog_errno(-ENOMEM);
- return -ENOMEM;
- }
-
- o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES,
- S_IFREG|S_IRUSR,
- o2hb_debug_dir, NULL,
- &o2hb_debug_fops);
- if (!o2hb_debug_livenodes) {
- mlog_errno(-ENOMEM);
- debugfs_remove(o2hb_debug_dir);
- return -ENOMEM;
- }
-
- return 0;
+ return o2hb_debug_init();
}
/* if we're already in a callback then we're already serialized by the sem */
@@ -1078,6 +1380,13 @@ static void o2hb_region_release(struct config_item *item)
if (reg->hr_slots)
kfree(reg->hr_slots);
+ kfree(reg->hr_db_regnum);
+ kfree(reg->hr_db_livenodes);
+ debugfs_remove(reg->hr_debug_livenodes);
+ debugfs_remove(reg->hr_debug_regnum);
+ debugfs_remove(reg->hr_debug_elapsed_time);
+ debugfs_remove(reg->hr_debug_dir);
+
spin_lock(&o2hb_live_lock);
list_del(&reg->hr_all_item);
spin_unlock(&o2hb_live_lock);
@@ -1441,6 +1750,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
/* Ok, we were woken. Make sure it wasn't by drop_item() */
spin_lock(&o2hb_live_lock);
hb_task = reg->hr_task;
+ if (o2hb_global_heartbeat_active())
+ set_bit(reg->hr_region_num, o2hb_live_region_bitmap);
spin_unlock(&o2hb_live_lock);
if (hb_task)
@@ -1448,6 +1759,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
else
ret = -EIO;
+ if (hb_task && o2hb_global_heartbeat_active())
+ printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n",
+ config_item_name(&reg->hr_item));
+
out:
if (filp)
fput(filp);
@@ -1586,21 +1901,94 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
: NULL;
}
+static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
+{
+ int ret = -ENOMEM;
+
+ reg->hr_debug_dir =
+ debugfs_create_dir(config_item_name(&reg->hr_item), dir);
+ if (!reg->hr_debug_dir) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ reg->hr_debug_livenodes =
+ o2hb_debug_create(O2HB_DEBUG_LIVENODES,
+ reg->hr_debug_dir,
+ &(reg->hr_db_livenodes),
+ sizeof(*(reg->hr_db_livenodes)),
+ O2HB_DB_TYPE_REGION_LIVENODES,
+ sizeof(reg->hr_live_node_bitmap),
+ O2NM_MAX_NODES, reg);
+ if (!reg->hr_debug_livenodes) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ reg->hr_debug_regnum =
+ o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER,
+ reg->hr_debug_dir,
+ &(reg->hr_db_regnum),
+ sizeof(*(reg->hr_db_regnum)),
+ O2HB_DB_TYPE_REGION_NUMBER,
+ 0, O2NM_MAX_NODES, reg);
+ if (!reg->hr_debug_regnum) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ reg->hr_debug_elapsed_time =
+ o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME,
+ reg->hr_debug_dir,
+ &(reg->hr_db_elapsed_time),
+ sizeof(*(reg->hr_db_elapsed_time)),
+ O2HB_DB_TYPE_REGION_ELAPSED_TIME,
+ 0, 0, reg);
+ if (!reg->hr_debug_elapsed_time) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ ret = 0;
+bail:
+ return ret;
+}
+
static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
const char *name)
{
struct o2hb_region *reg = NULL;
+ int ret;
reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
if (reg == NULL)
return ERR_PTR(-ENOMEM);
- config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+ if (strlen(name) > O2HB_MAX_REGION_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
spin_lock(&o2hb_live_lock);
+ reg->hr_region_num = 0;
+ if (o2hb_global_heartbeat_active()) {
+ reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap,
+ O2NM_MAX_REGIONS);
+ if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
+ spin_unlock(&o2hb_live_lock);
+ return ERR_PTR(-EFBIG);
+ }
+ set_bit(reg->hr_region_num, o2hb_region_bitmap);
+ }
list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
spin_unlock(&o2hb_live_lock);
+ config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+
+ ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
+ if (ret) {
+ config_item_put(&reg->hr_item);
+ return ERR_PTR(ret);
+ }
+
return &reg->hr_item;
}
@@ -1612,6 +2000,10 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
/* stop the thread when the user removes the region dir */
spin_lock(&o2hb_live_lock);
+ if (o2hb_global_heartbeat_active()) {
+ clear_bit(reg->hr_region_num, o2hb_region_bitmap);
+ clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
+ }
hb_task = reg->hr_task;
reg->hr_task = NULL;
spin_unlock(&o2hb_live_lock);
@@ -1628,6 +2020,9 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
wake_up(&o2hb_steady_queue);
}
+ if (o2hb_global_heartbeat_active())
+ printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
+ config_item_name(&reg->hr_item));
config_item_put(item);
}
@@ -1688,6 +2083,41 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group
return count;
}
+static
+ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
+ char *page)
+{
+ return sprintf(page, "%s\n",
+ o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]);
+}
+
+static
+ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
+ const char *page, size_t count)
+{
+ unsigned int i;
+ int ret;
+ size_t len;
+
+ len = (page[count - 1] == '\n') ? count - 1 : count;
+ if (!len)
+ return -EINVAL;
+
+ for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) {
+ if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
+ continue;
+
+ ret = o2hb_global_hearbeat_mode_set(i);
+ if (!ret)
+ printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
+ o2hb_heartbeat_mode_desc[i]);
+ return count;
+ }
+
+ return -EINVAL;
+
+}
+
static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
.attr = { .ca_owner = THIS_MODULE,
.ca_name = "dead_threshold",
@@ -1696,8 +2126,17 @@ static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold
.store = o2hb_heartbeat_group_threshold_store,
};
+static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "mode",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .show = o2hb_heartbeat_group_mode_show,
+ .store = o2hb_heartbeat_group_mode_store,
+};
+
static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
&o2hb_heartbeat_group_attr_threshold.attr,
+ &o2hb_heartbeat_group_attr_mode.attr,
NULL,
};
@@ -1963,3 +2402,34 @@ void o2hb_stop_all_regions(void)
spin_unlock(&o2hb_live_lock);
}
EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);
+
+int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
+{
+ struct o2hb_region *reg;
+ int numregs = 0;
+ char *p;
+
+ spin_lock(&o2hb_live_lock);
+
+ p = region_uuids;
+ list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+ mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
+ if (numregs < max_regions) {
+ memcpy(p, config_item_name(&reg->hr_item),
+ O2HB_MAX_REGION_NAME_LEN);
+ p += O2HB_MAX_REGION_NAME_LEN;
+ }
+ numregs++;
+ }
+
+ spin_unlock(&o2hb_live_lock);
+
+ return numregs;
+}
+EXPORT_SYMBOL_GPL(o2hb_get_all_regions);
+
+int o2hb_global_heartbeat_active(void)
+{
+ return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL);
+}
+EXPORT_SYMBOL(o2hb_global_heartbeat_active);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 2f16492..00ad8e8 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -31,6 +31,8 @@
#define O2HB_REGION_TIMEOUT_MS 2000
+#define O2HB_MAX_REGION_NAME_LEN 32
+
/* number of changes to be seen as live */
#define O2HB_LIVE_THRESHOLD 2
/* number of equal samples to be seen as dead */
@@ -81,5 +83,7 @@ int o2hb_check_node_heartbeating(u8 node_num);
int o2hb_check_node_heartbeating_from_callback(u8 node_num);
int o2hb_check_local_node_heartbeating(void);
void o2hb_stop_all_regions(void);
+int o2hb_get_all_regions(char *region_uuids, u8 numregions);
+int o2hb_global_heartbeat_active(void);
#endif /* O2CLUSTER_HEARTBEAT_H */
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index fd96e2a..ea2ed9f 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -119,7 +119,8 @@
#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */
-#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
+#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
+#define ML_CLUSTER 0x0000001000000000ULL /* cluster stack */
#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index ed0c9f3..bb24064 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -711,6 +711,8 @@ static struct config_item *o2nm_node_group_make_item(struct config_group *group,
config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
spin_lock_init(&node->nd_lock);
+ mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name);
+
return &node->nd_item;
}
@@ -744,6 +746,9 @@ static void o2nm_node_group_drop_item(struct config_group *group,
}
write_unlock(&cluster->cl_nodes_lock);
+ mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n",
+ config_item_name(&node->nd_item));
+
config_item_put(item);
}
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
index 5b9854b..49b5943 100644
--- a/fs/ocfs2/cluster/ocfs2_nodemanager.h
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -36,4 +36,10 @@
/* host name, group name, cluster name all 64 bytes */
#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN
+/*
+ * Maximum number of global heartbeat regions allowed.
+ * **CAUTION** Changing this number will break dlm compatibility.
+ */
+#define O2NM_MAX_REGIONS 32
+
#endif /* _OCFS2_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index aa75ca3..9aa426e 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -977,7 +977,7 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
size_t caller_veclen, u8 target_node, int *status)
{
- int ret;
+ int ret = 0;
struct o2net_msg *msg = NULL;
size_t veclen, caller_bytes = 0;
struct kvec *vec = NULL;
@@ -1696,6 +1696,9 @@ static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
{
o2quo_hb_down(node_num);
+ if (!node)
+ return;
+
if (node_num != o2nm_this_node())
o2net_disconnect_node(node);
@@ -1709,6 +1712,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
o2quo_hb_up(node_num);
+ BUG_ON(!node);
+
/* ensure an immediate connect attempt */
nn->nn_last_connect_attempt = jiffies -
(msecs_to_jiffies(o2net_reconnect_delay()) + 1);
@@ -1759,6 +1764,7 @@ static int o2net_accept_one(struct socket *sock)
struct sockaddr_in sin;
struct socket *new_sock = NULL;
struct o2nm_node *node = NULL;
+ struct o2nm_node *local_node = NULL;
struct o2net_sock_container *sc = NULL;
struct o2net_node *nn;
@@ -1796,11 +1802,15 @@ static int o2net_accept_one(struct socket *sock)
goto out;
}
- if (o2nm_this_node() > node->nd_num) {
- mlog(ML_NOTICE, "unexpected connect attempted from a lower "
- "numbered node '%s' at " "%pI4:%d with num %u\n",
- node->nd_name, &sin.sin_addr.s_addr,
- ntohs(sin.sin_port), node->nd_num);
+ if (o2nm_this_node() >= node->nd_num) {
+ local_node = o2nm_get_node_by_num(o2nm_this_node());
+ mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' ("
+ "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n",
+ local_node->nd_name, local_node->nd_num,
+ &(local_node->nd_ipv4_address),
+ ntohs(local_node->nd_ipv4_port),
+ node->nd_name, node->nd_num, &sin.sin_addr.s_addr,
+ ntohs(sin.sin_port));
ret = -EINVAL;
goto out;
}
@@ -1857,6 +1867,8 @@ out:
sock_release(new_sock);
if (node)
o2nm_node_put(node);
+ if (local_node)
+ o2nm_node_put(local_node);
if (sc)
sc_put(sc);
return ret;
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b4957c7..edaded4 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -40,6 +40,14 @@
#include "inode.h"
#include "super.h"
+void ocfs2_dentry_attach_gen(struct dentry *dentry)
+{
+ unsigned long gen =
+ OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
+ BUG_ON(dentry->d_inode);
+ dentry->d_fsdata = (void *)gen;
+}
+
static int ocfs2_dentry_revalidate(struct dentry *dentry,
struct nameidata *nd)
@@ -51,11 +59,20 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
mlog_entry("(0x%p, '%.*s')\n", dentry,
dentry->d_name.len, dentry->d_name.name);
- /* Never trust a negative dentry - force a new lookup. */
+ /* For a negative dentry -
+ * check the generation number of the parent and compare with the
+ * one stored in the inode.
+ */
if (inode == NULL) {
- mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
- dentry->d_name.name);
- goto bail;
+ unsigned long gen = (unsigned long) dentry->d_fsdata;
+ unsigned long pgen =
+ OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
+ mlog(0, "negative dentry: %.*s parent gen: %lu "
+ "dentry gen: %lu\n",
+ dentry->d_name.len, dentry->d_name.name, pgen, gen);
+ if (gen != pgen)
+ goto bail;
+ goto valid;
}
BUG_ON(!osb);
@@ -96,6 +113,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
goto bail;
}
+valid:
ret = 1;
bail:
@@ -227,6 +245,12 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
if (!inode)
return 0;
+ if (!dentry->d_inode && dentry->d_fsdata) {
+ /* Converting a negative dentry to positive
+ Clear dentry->d_fsdata */
+ dentry->d_fsdata = dl = NULL;
+ }
+
if (dl) {
mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
" \"%.*s\": old parent: %llu, new: %llu\n",
@@ -452,6 +476,7 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
out:
iput(inode);
+ ocfs2_dentry_attach_gen(dentry);
}
/*
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index f5dd178..b79eff7 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -64,5 +64,6 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
struct inode *old_dir, struct inode *new_dir);
extern spinlock_t dentry_attach_lock;
+void ocfs2_dentry_attach_gen(struct dentry *dentry);
#endif /* OCFS2_DCACHE_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f04ebcf..c49f6de 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3931,6 +3931,15 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
goto out_commit;
}
+ cpos = split_hash;
+ ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
+ data_ac, meta_ac, new_dx_leaves,
+ num_dx_leaves);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
for (i = 0; i < num_dx_leaves; i++) {
ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
orig_dx_leaves[i],
@@ -3939,15 +3948,14 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
mlog_errno(ret);
goto out_commit;
}
- }
- cpos = split_hash;
- ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
- data_ac, meta_ac, new_dx_leaves,
- num_dx_leaves);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
+ ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
+ new_dx_leaves[i],
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
}
ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 4b6ae2c..b36d0bf 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -445,7 +445,9 @@ enum {
DLM_LOCK_REQUEST_MSG, /* 515 */
DLM_RECO_DATA_DONE_MSG, /* 516 */
DLM_BEGIN_RECO_MSG, /* 517 */
- DLM_FINALIZE_RECO_MSG /* 518 */
+ DLM_FINALIZE_RECO_MSG, /* 518 */
+ DLM_QUERY_REGION, /* 519 */
+ DLM_QUERY_NODEINFO, /* 520 */
};
struct dlm_reco_node_data
@@ -727,6 +729,31 @@ struct dlm_cancel_join
u8 domain[O2NM_MAX_NAME_LEN];
};
+struct dlm_query_region {
+ u8 qr_node;
+ u8 qr_numregions;
+ u8 qr_namelen;
+ u8 pad1;
+ u8 qr_domain[O2NM_MAX_NAME_LEN];
+ u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS];
+};
+
+struct dlm_node_info {
+ u8 ni_nodenum;
+ u8 pad1;
+ u16 ni_ipv4_port;
+ u32 ni_ipv4_address;
+};
+
+struct dlm_query_nodeinfo {
+ u8 qn_nodenum;
+ u8 qn_numnodes;
+ u8 qn_namelen;
+ u8 pad1;
+ u8 qn_domain[O2NM_MAX_NAME_LEN];
+ struct dlm_node_info qn_nodes[O2NM_MAX_NODES];
+};
+
struct dlm_exit_domain
{
u8 node_idx;
@@ -1030,6 +1057,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res);
void dlm_clean_master_list(struct dlm_ctxt *dlm,
u8 dead_node);
+void dlm_force_free_mles(struct dlm_ctxt *dlm);
int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
int __dlm_lockres_has_locks(struct dlm_lock_resource *res);
int __dlm_lockres_unused(struct dlm_lock_resource *res);
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 5efdd37..272ec86 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -493,7 +493,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
struct hlist_head *bucket;
struct hlist_node *list;
int i, out = 0;
- unsigned long total = 0, longest = 0, bktcnt;
+ unsigned long total = 0, longest = 0, bucket_count = 0;
out += snprintf(db->buf + out, db->len - out,
"Dumping MLEs for Domain: %s\n", dlm->name);
@@ -505,13 +505,13 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
mle = hlist_entry(list, struct dlm_master_list_entry,
master_hash_node);
++total;
- ++bktcnt;
+ ++bucket_count;
if (db->len - out < 200)
continue;
out += dump_mle(mle, db->buf + out, db->len - out);
}
- longest = max(longest, bktcnt);
- bktcnt = 0;
+ longest = max(longest, bucket_count);
+ bucket_count = 0;
}
spin_unlock(&dlm->master_lock);
@@ -636,8 +636,14 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
spin_lock(&dlm->track_lock);
if (oldres)
track_list = &oldres->tracking;
- else
+ else {
track_list = &dlm->tracking_list;
+ if (list_empty(track_list)) {
+ dl = NULL;
+ spin_unlock(&dlm->track_lock);
+ goto bail;
+ }
+ }
list_for_each_entry(res, track_list, tracking) {
if (&res->tracking == &dlm->tracking_list)
@@ -660,6 +666,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
} else
dl = NULL;
+bail:
/* passed to seq_show */
return dl;
}
@@ -775,7 +782,9 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
/* Domain: xxxxxxxxxx Key: 0xdfbac769 */
out += snprintf(db->buf + out, db->len - out,
- "Domain: %s Key: 0x%08x\n", dlm->name, dlm->key);
+ "Domain: %s Key: 0x%08x Protocol: %d.%d\n",
+ dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor);
/* Thread Pid: xxx Node: xxx State: xxxxx */
out += snprintf(db->buf + out, db->len - out,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 153abb5..58a93b9 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -128,10 +128,14 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
* will have a negotiated version with the same major number and a minor
* number equal or smaller. The dlm_ctxt->dlm_locking_proto field should
* be used to determine what a running domain is actually using.
+ *
+ * New in version 1.1:
+ * - Message DLM_QUERY_REGION added to support global heartbeat
+ * - Message DLM_QUERY_NODEINFO added to allow online node removes
*/
static const struct dlm_protocol_version dlm_protocol = {
.pv_major = 1,
- .pv_minor = 0,
+ .pv_minor = 1,
};
#define DLM_DOMAIN_BACKOFF_MS 200
@@ -142,6 +146,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
+static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
+ void *data, void **ret_data);
static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
static int dlm_protocol_compare(struct dlm_protocol_version *existing,
@@ -693,6 +699,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
dlm_mark_domain_leaving(dlm);
dlm_leave_domain(dlm);
+ dlm_force_free_mles(dlm);
dlm_complete_dlm_shutdown(dlm);
}
dlm_put(dlm);
@@ -920,6 +927,370 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
return 0;
}
+static int dlm_match_regions(struct dlm_ctxt *dlm,
+ struct dlm_query_region *qr)
+{
+ char *local = NULL, *remote = qr->qr_regions;
+ char *l, *r;
+ int localnr, i, j, foundit;
+ int status = 0;
+
+ if (!o2hb_global_heartbeat_active()) {
+ if (qr->qr_numregions) {
+ mlog(ML_ERROR, "Domain %s: Joining node %d has global "
+ "heartbeat enabled but local node %d does not\n",
+ qr->qr_domain, qr->qr_node, dlm->node_num);
+ status = -EINVAL;
+ }
+ goto bail;
+ }
+
+ if (o2hb_global_heartbeat_active() && !qr->qr_numregions) {
+ mlog(ML_ERROR, "Domain %s: Local node %d has global "
+ "heartbeat enabled but joining node %d does not\n",
+ qr->qr_domain, dlm->node_num, qr->qr_node);
+ status = -EINVAL;
+ goto bail;
+ }
+
+ r = remote;
+ for (i = 0; i < qr->qr_numregions; ++i) {
+ mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, r);
+ r += O2HB_MAX_REGION_NAME_LEN;
+ }
+
+ local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
+ if (!local) {
+ status = -ENOMEM;
+ goto bail;
+ }
+
+ localnr = o2hb_get_all_regions(local, O2NM_MAX_REGIONS);
+
+ /* compare local regions with remote */
+ l = local;
+ for (i = 0; i < localnr; ++i) {
+ foundit = 0;
+ r = remote;
+ for (j = 0; j <= qr->qr_numregions; ++j) {
+ if (!memcmp(l, r, O2HB_MAX_REGION_NAME_LEN)) {
+ foundit = 1;
+ break;
+ }
+ r += O2HB_MAX_REGION_NAME_LEN;
+ }
+ if (!foundit) {
+ status = -EINVAL;
+ mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
+ "in local node %d but not in joining node %d\n",
+ qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, l,
+ dlm->node_num, qr->qr_node);
+ goto bail;
+ }
+ l += O2HB_MAX_REGION_NAME_LEN;
+ }
+
+ /* compare remote with local regions */
+ r = remote;
+ for (i = 0; i < qr->qr_numregions; ++i) {
+ foundit = 0;
+ l = local;
+ for (j = 0; j < localnr; ++j) {
+ if (!memcmp(r, l, O2HB_MAX_REGION_NAME_LEN)) {
+ foundit = 1;
+ break;
+ }
+ l += O2HB_MAX_REGION_NAME_LEN;
+ }
+ if (!foundit) {
+ status = -EINVAL;
+ mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
+ "in joining node %d but not in local node %d\n",
+ qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, r,
+ qr->qr_node, dlm->node_num);
+ goto bail;
+ }
+ r += O2HB_MAX_REGION_NAME_LEN;
+ }
+
+bail:
+ kfree(local);
+
+ return status;
+}
+
+static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map)
+{
+ struct dlm_query_region *qr = NULL;
+ int status, ret = 0, i;
+ char *p;
+
+ if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
+ goto bail;
+
+ qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL);
+ if (!qr) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ qr->qr_node = dlm->node_num;
+ qr->qr_namelen = strlen(dlm->name);
+ memcpy(qr->qr_domain, dlm->name, qr->qr_namelen);
+ /* if local hb, the numregions will be zero */
+ if (o2hb_global_heartbeat_active())
+ qr->qr_numregions = o2hb_get_all_regions(qr->qr_regions,
+ O2NM_MAX_REGIONS);
+
+ p = qr->qr_regions;
+ for (i = 0; i < qr->qr_numregions; ++i, p += O2HB_MAX_REGION_NAME_LEN)
+ mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, p);
+
+ i = -1;
+ while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
+ i + 1)) < O2NM_MAX_NODES) {
+ if (i == dlm->node_num)
+ continue;
+
+ mlog(0, "Sending regions to node %d\n", i);
+
+ ret = o2net_send_message(DLM_QUERY_REGION, DLM_MOD_KEY, qr,
+ sizeof(struct dlm_query_region),
+ i, &status);
+ if (ret >= 0)
+ ret = status;
+ if (ret) {
+ mlog(ML_ERROR, "Region mismatch %d, node %d\n",
+ ret, i);
+ break;
+ }
+ }
+
+bail:
+ kfree(qr);
+ return ret;
+}
+
+static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
+ void *data, void **ret_data)
+{
+ struct dlm_query_region *qr;
+ struct dlm_ctxt *dlm = NULL;
+ int status = 0;
+ int locked = 0;
+
+ qr = (struct dlm_query_region *) msg->buf;
+
+ mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node,
+ qr->qr_domain);
+
+ status = -EINVAL;
+
+ spin_lock(&dlm_domain_lock);
+ dlm = __dlm_lookup_domain_full(qr->qr_domain, qr->qr_namelen);
+ if (!dlm) {
+ mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+ "before join domain\n", qr->qr_node, qr->qr_domain);
+ goto bail;
+ }
+
+ spin_lock(&dlm->spinlock);
+ locked = 1;
+ if (dlm->joining_node != qr->qr_node) {
+ mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+ "but joining node is %d\n", qr->qr_node, qr->qr_domain,
+ dlm->joining_node);
+ goto bail;
+ }
+
+ /* Support for global heartbeat was added in 1.1 */
+ if (dlm->dlm_locking_proto.pv_major == 1 &&
+ dlm->dlm_locking_proto.pv_minor == 0) {
+ mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+ "but active dlm protocol is %d.%d\n", qr->qr_node,
+ qr->qr_domain, dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor);
+ goto bail;
+ }
+
+ status = dlm_match_regions(dlm, qr);
+
+bail:
+ if (locked)
+ spin_unlock(&dlm->spinlock);
+ spin_unlock(&dlm_domain_lock);
+
+ return status;
+}
+
+static int dlm_match_nodes(struct dlm_ctxt *dlm, struct dlm_query_nodeinfo *qn)
+{
+ struct o2nm_node *local;
+ struct dlm_node_info *remote;
+ int i, j;
+ int status = 0;
+
+ for (j = 0; j < qn->qn_numnodes; ++j)
+ mlog(0, "Node %3d, %pI4:%u\n", qn->qn_nodes[j].ni_nodenum,
+ &(qn->qn_nodes[j].ni_ipv4_address),
+ ntohs(qn->qn_nodes[j].ni_ipv4_port));
+
+ for (i = 0; i < O2NM_MAX_NODES && !status; ++i) {
+ local = o2nm_get_node_by_num(i);
+ remote = NULL;
+ for (j = 0; j < qn->qn_numnodes; ++j) {
+ if (qn->qn_nodes[j].ni_nodenum == i) {
+ remote = &(qn->qn_nodes[j]);
+ break;
+ }
+ }
+
+ if (!local && !remote)
+ continue;
+
+ if ((local && !remote) || (!local && remote))
+ status = -EINVAL;
+
+ if (!status &&
+ ((remote->ni_nodenum != local->nd_num) ||
+ (remote->ni_ipv4_port != local->nd_ipv4_port) ||
+ (remote->ni_ipv4_address != local->nd_ipv4_address)))
+ status = -EINVAL;
+
+ if (status) {
+ if (remote && !local)
+ mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
+ "registered in joining node %d but not in "
+ "local node %d\n", qn->qn_domain,
+ remote->ni_nodenum,
+ &(remote->ni_ipv4_address),
+ ntohs(remote->ni_ipv4_port),
+ qn->qn_nodenum, dlm->node_num);
+ if (local && !remote)
+ mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
+ "registered in local node %d but not in "
+ "joining node %d\n", qn->qn_domain,
+ local->nd_num, &(local->nd_ipv4_address),
+ ntohs(local->nd_ipv4_port),
+ dlm->node_num, qn->qn_nodenum);
+ BUG_ON((!local && !remote));
+ }
+
+ if (local)
+ o2nm_node_put(local);
+ }
+
+ return status;
+}
+
+static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map)
+{
+ struct dlm_query_nodeinfo *qn = NULL;
+ struct o2nm_node *node;
+ int ret = 0, status, count, i;
+
+ if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
+ goto bail;
+
+ qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL);
+ if (!qn) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ for (i = 0, count = 0; i < O2NM_MAX_NODES; ++i) {
+ node = o2nm_get_node_by_num(i);
+ if (!node)
+ continue;
+ qn->qn_nodes[count].ni_nodenum = node->nd_num;
+ qn->qn_nodes[count].ni_ipv4_port = node->nd_ipv4_port;
+ qn->qn_nodes[count].ni_ipv4_address = node->nd_ipv4_address;
+ mlog(0, "Node %3d, %pI4:%u\n", node->nd_num,
+ &(node->nd_ipv4_address), ntohs(node->nd_ipv4_port));
+ ++count;
+ o2nm_node_put(node);
+ }
+
+ qn->qn_nodenum = dlm->node_num;
+ qn->qn_numnodes = count;
+ qn->qn_namelen = strlen(dlm->name);
+ memcpy(qn->qn_domain, dlm->name, qn->qn_namelen);
+
+ i = -1;
+ while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
+ i + 1)) < O2NM_MAX_NODES) {
+ if (i == dlm->node_num)
+ continue;
+
+ mlog(0, "Sending nodeinfo to node %d\n", i);
+
+ ret = o2net_send_message(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
+ qn, sizeof(struct dlm_query_nodeinfo),
+ i, &status);
+ if (ret >= 0)
+ ret = status;
+ if (ret) {
+ mlog(ML_ERROR, "node mismatch %d, node %d\n", ret, i);
+ break;
+ }
+ }
+
+bail:
+ kfree(qn);
+ return ret;
+}
+
+static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len,
+ void *data, void **ret_data)
+{
+ struct dlm_query_nodeinfo *qn;
+ struct dlm_ctxt *dlm = NULL;
+ int locked = 0, status = -EINVAL;
+
+ qn = (struct dlm_query_nodeinfo *) msg->buf;
+
+ mlog(0, "Node %u queries nodes on domain %s\n", qn->qn_nodenum,
+ qn->qn_domain);
+
+ spin_lock(&dlm_domain_lock);
+ dlm = __dlm_lookup_domain_full(qn->qn_domain, qn->qn_namelen);
+ if (!dlm) {
+ mlog(ML_ERROR, "Node %d queried nodes on domain %s before "
+ "join domain\n", qn->qn_nodenum, qn->qn_domain);
+ goto bail;
+ }
+
+ spin_lock(&dlm->spinlock);
+ locked = 1;
+ if (dlm->joining_node != qn->qn_nodenum) {
+ mlog(ML_ERROR, "Node %d queried nodes on domain %s but "
+ "joining node is %d\n", qn->qn_nodenum, qn->qn_domain,
+ dlm->joining_node);
+ goto bail;
+ }
+
+ /* Support for node query was added in 1.1 */
+ if (dlm->dlm_locking_proto.pv_major == 1 &&
+ dlm->dlm_locking_proto.pv_minor == 0) {
+ mlog(ML_ERROR, "Node %d queried nodes on domain %s "
+ "but active dlm protocol is %d.%d\n", qn->qn_nodenum,
+ qn->qn_domain, dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor);
+ goto bail;
+ }
+
+ status = dlm_match_nodes(dlm, qn);
+
+bail:
+ if (locked)
+ spin_unlock(&dlm->spinlock);
+ spin_unlock(&dlm_domain_lock);
+
+ return status;
+}
+
static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data)
{
@@ -1240,6 +1611,20 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
set_bit(dlm->node_num, dlm->domain_map);
spin_unlock(&dlm->spinlock);
+ /* Support for global heartbeat and node info was added in 1.1 */
+ if (dlm_protocol.pv_major > 1 || dlm_protocol.pv_minor > 0) {
+ status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ status = dlm_send_regions(dlm, ctxt->yes_resp_map);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
/* Joined state *must* be set before the joining node
@@ -1806,7 +2191,21 @@ static int dlm_register_net_handlers(void)
sizeof(struct dlm_cancel_join),
dlm_cancel_join_handler,
NULL, NULL, &dlm_join_handlers);
+ if (status)
+ goto bail;
+
+ status = o2net_register_handler(DLM_QUERY_REGION, DLM_MOD_KEY,
+ sizeof(struct dlm_query_region),
+ dlm_query_region_handler,
+ NULL, NULL, &dlm_join_handlers);
+ if (status)
+ goto bail;
+
+ status = o2net_register_handler(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
+ sizeof(struct dlm_query_nodeinfo),
+ dlm_query_nodeinfo_handler,
+ NULL, NULL, &dlm_join_handlers);
bail:
if (status < 0)
dlm_unregister_net_handlers();
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 94b97fc..f564b0e 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -511,8 +511,6 @@ static void dlm_lockres_release(struct kref *kref)
atomic_dec(&dlm->res_cur_count);
- dlm_put(dlm);
-
if (!hlist_unhashed(&res->hash_node) ||
!list_empty(&res->granted) ||
!list_empty(&res->converting) ||
@@ -585,8 +583,6 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
res->migration_pending = 0;
res->inflight_locks = 0;
- /* put in dlm_lockres_release */
- dlm_grab(dlm);
res->dlm = dlm;
kref_init(&res->refs);
@@ -3050,8 +3046,6 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
/* check for pre-existing lock */
spin_lock(&dlm->spinlock);
res = __dlm_lookup_lockres(dlm, name, namelen, hash);
- spin_lock(&dlm->master_lock);
-
if (res) {
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -3069,14 +3063,15 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
spin_unlock(&res->spinlock);
}
+ spin_lock(&dlm->master_lock);
/* ignore status. only nonzero status would BUG. */
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
name, namelen,
migrate->new_master,
migrate->master);
-unlock:
spin_unlock(&dlm->master_lock);
+unlock:
spin_unlock(&dlm->spinlock);
if (oldmle) {
@@ -3438,3 +3433,43 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
wake_up(&res->wq);
wake_up(&dlm->migration_wq);
}
+
+void dlm_force_free_mles(struct dlm_ctxt *dlm)
+{
+ int i;
+ struct hlist_head *bucket;
+ struct dlm_master_list_entry *mle;
+ struct hlist_node *tmp, *list;
+
+ /*
+ * We notified all other nodes that we are exiting the domain and
+ * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still
+ * around we force free them and wake any processes that are waiting
+ * on the mles
+ */
+ spin_lock(&dlm->spinlock);
+ spin_lock(&dlm->master_lock);
+
+ BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
+ BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
+
+ for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+ bucket = dlm_master_hash(dlm, i);
+ hlist_for_each_safe(list, tmp, bucket) {
+ mle = hlist_entry(list, struct dlm_master_list_entry,
+ master_hash_node);
+ if (mle->type != DLM_MLE_BLOCK) {
+ mlog(ML_ERROR, "bad mle: %p\n", mle);
+ dlm_print_one_mle(mle);
+ }
+ atomic_set(&mle->woken, 1);
+ wake_up(&mle->wq);
+
+ __dlm_unlink_mle(dlm, mle);
+ __dlm_mle_detach_hb_events(dlm, mle);
+ __dlm_put_mle(mle);
+ }
+ }
+ spin_unlock(&dlm->master_lock);
+ spin_unlock(&dlm->spinlock);
+}
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 9dfaac7..aaaffbc 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1997,6 +1997,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
struct list_head *queue;
struct dlm_lock *lock, *next;
+ assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&res->spinlock);
res->state |= DLM_LOCK_RES_RECOVERING;
if (!list_empty(&res->recovering)) {
mlog(0,
@@ -2326,19 +2328,15 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
/* zero the lvb if necessary */
dlm_revalidate_lvb(dlm, res, dead_node);
if (res->owner == dead_node) {
- if (res->state & DLM_LOCK_RES_DROPPING_REF)
- mlog(0, "%s:%.*s: owned by "
- "dead node %u, this node was "
- "dropping its ref when it died. "
- "continue, dropping the flag.\n",
- dlm->name, res->lockname.len,
- res->lockname.name, dead_node);
-
- /* the wake_up for this will happen when the
- * RECOVERING flag is dropped later */
- res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+ if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+ mlog(ML_NOTICE, "Ignore %.*s for "
+ "recovery as it is being freed\n",
+ res->lockname.len,
+ res->lockname.name);
+ } else
+ dlm_move_lockres_to_recovery_list(dlm,
+ res);
- dlm_move_lockres_to_recovery_list(dlm, res);
} else if (res->owner == dlm->node_num) {
dlm_free_dead_locks(dlm, res, dead_node);
__dlm_lockres_calc_usage(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index d4f73ca..2211acf 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -92,19 +92,27 @@ int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
* truly ready to be freed. */
int __dlm_lockres_unused(struct dlm_lock_resource *res)
{
- if (!__dlm_lockres_has_locks(res) &&
- (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) {
- /* try not to scan the bitmap unless the first two
- * conditions are already true */
- int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
- if (bit >= O2NM_MAX_NODES) {
- /* since the bit for dlm->node_num is not
- * set, inflight_locks better be zero */
- BUG_ON(res->inflight_locks != 0);
- return 1;
- }
- }
- return 0;
+ int bit;
+
+ if (__dlm_lockres_has_locks(res))
+ return 0;
+
+ if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
+ return 0;
+
+ if (res->state & DLM_LOCK_RES_RECOVERING)
+ return 0;
+
+ bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+ if (bit < O2NM_MAX_NODES)
+ return 0;
+
+ /*
+ * since the bit for dlm->node_num is not set, inflight_locks better
+ * be zero
+ */
+ BUG_ON(res->inflight_locks != 0);
+ return 1;
}
@@ -152,45 +160,25 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
spin_unlock(&dlm->spinlock);
}
-static int dlm_purge_lockres(struct dlm_ctxt *dlm,
+static void dlm_purge_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
int master;
int ret = 0;
- spin_lock(&res->spinlock);
- if (!__dlm_lockres_unused(res)) {
- mlog(0, "%s:%.*s: tried to purge but not unused\n",
- dlm->name, res->lockname.len, res->lockname.name);
- __dlm_print_one_lock_resource(res);
- spin_unlock(&res->spinlock);
- BUG();
- }
-
- if (res->state & DLM_LOCK_RES_MIGRATING) {
- mlog(0, "%s:%.*s: Delay dropref as this lockres is "
- "being remastered\n", dlm->name, res->lockname.len,
- res->lockname.name);
- /* Re-add the lockres to the end of the purge list */
- if (!list_empty(&res->purge)) {
- list_del_init(&res->purge);
- list_add_tail(&res->purge, &dlm->purge_list);
- }
- spin_unlock(&res->spinlock);
- return 0;
- }
+ assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&res->spinlock);
master = (res->owner == dlm->node_num);
- if (!master)
- res->state |= DLM_LOCK_RES_DROPPING_REF;
- spin_unlock(&res->spinlock);
mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
res->lockname.name, master);
if (!master) {
+ res->state |= DLM_LOCK_RES_DROPPING_REF;
/* drop spinlock... retake below */
+ spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
spin_lock(&res->spinlock);
@@ -208,31 +196,35 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
dlm->name, res->lockname.len, res->lockname.name, ret);
spin_lock(&dlm->spinlock);
+ spin_lock(&res->spinlock);
}
- spin_lock(&res->spinlock);
if (!list_empty(&res->purge)) {
mlog(0, "removing lockres %.*s:%p from purgelist, "
"master = %d\n", res->lockname.len, res->lockname.name,
res, master);
list_del_init(&res->purge);
- spin_unlock(&res->spinlock);
dlm_lockres_put(res);
dlm->purge_count--;
- } else
- spin_unlock(&res->spinlock);
+ }
+
+ if (!__dlm_lockres_unused(res)) {
+ mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ BUG();
+ }
__dlm_unhash_lockres(res);
/* lockres is not in the hash now. drop the flag and wake up
* any processes waiting in dlm_get_lock_resource. */
if (!master) {
- spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_DROPPING_REF;
spin_unlock(&res->spinlock);
wake_up(&res->wq);
- }
- return 0;
+ } else
+ spin_unlock(&res->spinlock);
}
static void dlm_run_purge_list(struct dlm_ctxt *dlm,
@@ -251,17 +243,7 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
lockres = list_entry(dlm->purge_list.next,
struct dlm_lock_resource, purge);
- /* Status of the lockres *might* change so double
- * check. If the lockres is unused, holding the dlm
- * spinlock will prevent people from getting and more
- * refs on it -- there's no need to keep the lockres
- * spinlock. */
spin_lock(&lockres->spinlock);
- unused = __dlm_lockres_unused(lockres);
- spin_unlock(&lockres->spinlock);
-
- if (!unused)
- continue;
purge_jiffies = lockres->last_used +
msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
@@ -273,15 +255,29 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
* in tail order, we can stop at the first
* unpurgable resource -- anyone added after
* him will have a greater last_used value */
+ spin_unlock(&lockres->spinlock);
break;
}
+ /* Status of the lockres *might* change so double
+ * check. If the lockres is unused, holding the dlm
+ * spinlock will prevent people from getting and more
+ * refs on it. */
+ unused = __dlm_lockres_unused(lockres);
+ if (!unused ||
+ (lockres->state & DLM_LOCK_RES_MIGRATING)) {
+ mlog(0, "lockres %s:%.*s: is in use or "
+ "being remastered, used %d, state %d\n",
+ dlm->name, lockres->lockname.len,
+ lockres->lockname.name, !unused, lockres->state);
+ list_move_tail(&dlm->purge_list, &lockres->purge);
+ spin_unlock(&lockres->spinlock);
+ continue;
+ }
+
dlm_lockres_get(lockres);
- /* This may drop and reacquire the dlm spinlock if it
- * has to do migration. */
- if (dlm_purge_lockres(dlm, lockres))
- BUG();
+ dlm_purge_lockres(dlm, lockres);
dlm_lockres_put(lockres);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index c2903b8..a7ebd9d 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -612,6 +612,7 @@ static const struct file_operations dlmfs_file_operations = {
.poll = dlmfs_file_poll,
.read = dlmfs_file_read,
.write = dlmfs_file_write,
+ .llseek = default_llseek,
};
static const struct inode_operations dlmfs_dir_inode_operations = {
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 5e02a89..e8d94d7 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3635,10 +3635,18 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
{
struct inode *inode;
struct address_space *mapping;
+ struct ocfs2_inode_info *oi;
inode = ocfs2_lock_res_inode(lockres);
mapping = inode->i_mapping;
+ if (S_ISDIR(inode->i_mode)) {
+ oi = OCFS2_I(inode);
+ oi->ip_dir_lock_gen++;
+ mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
+ goto out;
+ }
+
if (!S_ISREG(inode->i_mode))
goto out;
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index d1ce48e..1d596d8 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -84,6 +84,7 @@ enum {
OI_LS_PARENT,
OI_LS_RENAME1,
OI_LS_RENAME2,
+ OI_LS_REFLINK_TARGET,
};
int ocfs2_dlm_init(struct ocfs2_super *osb);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 81296b4..1ca6867 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -36,6 +36,7 @@
#include <linux/writeback.h>
#include <linux/falloc.h>
#include <linux/quotaops.h>
+#include <linux/blkdev.h>
#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
@@ -63,12 +64,6 @@
#include "buffer_head_io.h"
-static int ocfs2_sync_inode(struct inode *inode)
-{
- filemap_fdatawrite(inode->i_mapping);
- return sync_mapping_buffers(inode->i_mapping);
-}
-
static int ocfs2_init_file_private(struct inode *inode, struct file *file)
{
struct ocfs2_file_private *fp;
@@ -179,19 +174,22 @@ static int ocfs2_sync_file(struct file *file, int datasync)
{
int err = 0;
journal_t *journal;
- struct dentry *dentry = file->f_path.dentry;
struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
- dentry->d_name.len, dentry->d_name.name);
-
- err = ocfs2_sync_inode(dentry->d_inode);
- if (err)
- goto bail;
+ mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync,
+ file->f_path.dentry, file->f_path.dentry->d_name.len,
+ file->f_path.dentry->d_name.name);
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+ if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
+ /*
+ * We still have to flush drive's caches to get data to the
+ * platter
+ */
+ if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
+ blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
goto bail;
+ }
journal = osb->journal->j_journal;
err = jbd2_journal_force_commit(journal);
@@ -361,7 +359,7 @@ static int ocfs2_cow_file_pos(struct inode *inode,
if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
goto out;
- return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
+ return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
out:
return status;
@@ -774,7 +772,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
BUG_ON(abs_from & (inode->i_blkbits - 1));
- page = grab_cache_page(mapping, index);
+ page = find_or_create_page(mapping, index, GFP_NOFS);
if (!page) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -904,8 +902,8 @@ static int ocfs2_zero_extend_get_range(struct inode *inode,
zero_clusters = last_cpos - zero_cpos;
if (needs_cow) {
- rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
- UINT_MAX);
+ rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
+ zero_clusters, UINT_MAX);
if (rc) {
mlog_errno(rc);
goto out;
@@ -2053,6 +2051,7 @@ out:
}
static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
+ struct file *file,
loff_t pos, size_t count,
int *meta_level)
{
@@ -2070,7 +2069,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
*meta_level = 1;
- ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
+ ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
if (ret)
mlog_errno(ret);
out:
@@ -2078,7 +2077,7 @@ out:
return ret;
}
-static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
+static int ocfs2_prepare_inode_for_write(struct file *file,
loff_t *ppos,
size_t count,
int appending,
@@ -2086,6 +2085,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
int *has_refcount)
{
int ret = 0, meta_level = 0;
+ struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
loff_t saved_pos, end;
@@ -2141,6 +2141,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
meta_level = -1;
ret = ocfs2_prepare_inode_for_refcount(inode,
+ file,
saved_pos,
count,
&meta_level);
@@ -2223,6 +2224,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_path.dentry->d_inode;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int full_coherency = !(osb->s_mount_opt &
+ OCFS2_MOUNT_COHERENCY_BUFFERED);
mlog_entry("(0x%p, %u, '%.*s')\n", file,
(unsigned int)nr_segs,
@@ -2246,16 +2249,39 @@ relock:
have_alloc_sem = 1;
}
- /* concurrent O_DIRECT writes are allowed */
- rw_level = !direct_io;
+ /*
+ * Concurrent O_DIRECT writes are allowed with
+ * mount_option "coherency=buffered".
+ */
+ rw_level = (!direct_io || full_coherency);
+
ret = ocfs2_rw_lock(inode, rw_level);
if (ret < 0) {
mlog_errno(ret);
goto out_sems;
}
+ /*
+ * O_DIRECT writes with "coherency=full" need to take EX cluster
+ * inode_lock to guarantee coherency.
+ */
+ if (direct_io && full_coherency) {
+ /*
+ * We need to take and drop the inode lock to force
+ * other nodes to drop their caches. Buffered I/O
+ * already does this in write_begin().
+ */
+ ret = ocfs2_inode_lock(inode, NULL, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_sems;
+ }
+
+ ocfs2_inode_unlock(inode, 1);
+ }
+
can_do_direct = direct_io;
- ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
+ ret = ocfs2_prepare_inode_for_write(file, ppos,
iocb->ki_left, appending,
&can_do_direct, &has_refcount);
if (ret < 0) {
@@ -2303,17 +2329,6 @@ relock:
written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
ppos, count, ocount);
if (written < 0) {
- /*
- * direct write may have instantiated a few
- * blocks outside i_size. Trim these off again.
- * Don't need i_size_read because we hold i_mutex.
- *
- * XXX(truncate): this looks buggy because ocfs2 did not
- * actually implement ->truncate. Take a look at
- * the new truncate sequence and update this accordingly
- */
- if (*ppos + count > inode->i_size)
- truncate_setsize(inode, inode->i_size);
ret = written;
goto out_dio;
}
@@ -2329,7 +2344,7 @@ out_dio:
BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
- ((file->f_flags & O_DIRECT) && has_refcount)) {
+ ((file->f_flags & O_DIRECT) && !direct_io)) {
ret = filemap_fdatawrite_range(file->f_mapping, pos,
pos + count - 1);
if (ret < 0)
@@ -2385,7 +2400,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
{
int ret;
- ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos,
+ ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
sd->total_len, 0, NULL, NULL);
if (ret < 0) {
mlog_errno(ret);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 0492464..f935fd6 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -335,6 +335,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
else
inode->i_fop = &ocfs2_dops_no_plocks;
i_size_write(inode, le64_to_cpu(fe->i_size));
+ OCFS2_I(inode)->ip_dir_lock_gen = 1;
break;
case S_IFLNK:
if (ocfs2_inode_is_fast_symlink(inode))
@@ -488,7 +489,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
OCFS2_BH_IGNORE_CACHE);
} else {
status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
- if (!status)
+ /*
+ * If buffer is in jbd, then its checksum may not have been
+ * computed as yet.
+ */
+ if (!status && !buffer_jbd(bh))
status = ocfs2_validate_inode_block(osb->sb, bh);
}
if (status < 0) {
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 6de5a86..1c508b1 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -46,30 +46,28 @@ struct ocfs2_inode_info
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
u32 ip_open_count;
- u32 ip_clusters;
struct list_head ip_io_markers;
+ u32 ip_clusters;
+ u16 ip_dyn_features;
struct mutex ip_io_mutex;
-
u32 ip_flags; /* see below */
u32 ip_attr; /* inode attributes */
- u16 ip_dyn_features;
/* protected by recovery_lock. */
struct inode *ip_next_orphan;
- u32 ip_dir_start_lookup;
-
struct ocfs2_caching_info ip_metadata_cache;
-
struct ocfs2_extent_map ip_extent_map;
-
struct inode vfs_inode;
struct jbd2_inode ip_jinode;
+ u32 ip_dir_start_lookup;
+
/* Only valid if the inode is the dir. */
u32 ip_last_used_slot;
u64 ip_last_used_group;
+ u32 ip_dir_lock_gen;
struct ocfs2_alloc_reservation ip_la_data_resv;
};
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7d9d9c1..7a48681 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -26,6 +26,26 @@
#include <linux/ext2_fs.h>
+#define o2info_from_user(a, b) \
+ copy_from_user(&(a), (b), sizeof(a))
+#define o2info_to_user(a, b) \
+ copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
+
+/*
+ * This call is void because we are already reporting an error that may
+ * be -EFAULT. The error will be returned from the ioctl(2) call. It's
+ * just a best-effort to tell userspace that this request caused the error.
+ */
+static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
+ struct ocfs2_info_request __user *req)
+{
+ kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
+ (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
+}
+
+#define o2info_set_request_error(a, b) \
+ __o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
+
static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
{
int status;
@@ -109,6 +129,328 @@ bail:
return status;
}
+int ocfs2_info_handle_blocksize(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_blocksize oib;
+
+ if (o2info_from_user(oib, req))
+ goto bail;
+
+ oib.ib_blocksize = inode->i_sb->s_blocksize;
+ oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+ if (o2info_to_user(oib, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(oib, req);
+
+ return status;
+}
+
+int ocfs2_info_handle_clustersize(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_clustersize oic;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oic, req))
+ goto bail;
+
+ oic.ic_clustersize = osb->s_clustersize;
+ oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+ if (o2info_to_user(oic, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(oic, req);
+
+ return status;
+}
+
+int ocfs2_info_handle_maxslots(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_maxslots oim;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oim, req))
+ goto bail;
+
+ oim.im_max_slots = osb->max_slots;
+ oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+ if (o2info_to_user(oim, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(oim, req);
+
+ return status;
+}
+
+int ocfs2_info_handle_label(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_label oil;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oil, req))
+ goto bail;
+
+ memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
+ oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+ if (o2info_to_user(oil, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(oil, req);
+
+ return status;
+}
+
+int ocfs2_info_handle_uuid(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_uuid oiu;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oiu, req))
+ goto bail;
+
+ memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
+ oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+ if (o2info_to_user(oiu, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(oiu, req);
+
+ return status;
+}
+
+int ocfs2_info_handle_fs_features(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_fs_features oif;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oif, req))
+ goto bail;
+
+ oif.if_compat_features = osb->s_feature_compat;
+ oif.if_incompat_features = osb->s_feature_incompat;
+ oif.if_ro_compat_features = osb->s_feature_ro_compat;
+ oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+ if (o2info_to_user(oif, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(oif, req);
+
+ return status;
+}
+
+int ocfs2_info_handle_journal_size(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_journal_size oij;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oij, req))
+ goto bail;
+
+ oij.ij_journal_size = osb->journal->j_inode->i_size;
+
+ oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+ if (o2info_to_user(oij, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(oij, req);
+
+ return status;
+}
+
+int ocfs2_info_handle_unknown(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_request oir;
+
+ if (o2info_from_user(oir, req))
+ goto bail;
+
+ oir.ir_flags &= ~OCFS2_INFO_FL_FILLED;
+
+ if (o2info_to_user(oir, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(oir, req);
+
+ return status;
+}
+
+/*
+ * Validate and distinguish OCFS2_IOC_INFO requests.
+ *
+ * - validate the magic number.
+ * - distinguish different requests.
+ * - validate size of different requests.
+ */
+int ocfs2_info_handle_request(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_request oir;
+
+ if (o2info_from_user(oir, req))
+ goto bail;
+
+ status = -EINVAL;
+ if (oir.ir_magic != OCFS2_INFO_MAGIC)
+ goto bail;
+
+ switch (oir.ir_code) {
+ case OCFS2_INFO_BLOCKSIZE:
+ if (oir.ir_size == sizeof(struct ocfs2_info_blocksize))
+ status = ocfs2_info_handle_blocksize(inode, req);
+ break;
+ case OCFS2_INFO_CLUSTERSIZE:
+ if (oir.ir_size == sizeof(struct ocfs2_info_clustersize))
+ status = ocfs2_info_handle_clustersize(inode, req);
+ break;
+ case OCFS2_INFO_MAXSLOTS:
+ if (oir.ir_size == sizeof(struct ocfs2_info_maxslots))
+ status = ocfs2_info_handle_maxslots(inode, req);
+ break;
+ case OCFS2_INFO_LABEL:
+ if (oir.ir_size == sizeof(struct ocfs2_info_label))
+ status = ocfs2_info_handle_label(inode, req);
+ break;
+ case OCFS2_INFO_UUID:
+ if (oir.ir_size == sizeof(struct ocfs2_info_uuid))
+ status = ocfs2_info_handle_uuid(inode, req);
+ break;
+ case OCFS2_INFO_FS_FEATURES:
+ if (oir.ir_size == sizeof(struct ocfs2_info_fs_features))
+ status = ocfs2_info_handle_fs_features(inode, req);
+ break;
+ case OCFS2_INFO_JOURNAL_SIZE:
+ if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
+ status = ocfs2_info_handle_journal_size(inode, req);
+ break;
+ default:
+ status = ocfs2_info_handle_unknown(inode, req);
+ break;
+ }
+
+bail:
+ return status;
+}
+
+int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
+ u64 *req_addr, int compat_flag)
+{
+ int status = -EFAULT;
+ u64 __user *bp = NULL;
+
+ if (compat_flag) {
+#ifdef CONFIG_COMPAT
+ /*
+ * pointer bp stores the base address of a pointers array,
+ * which collects all addresses of separate request.
+ */
+ bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests);
+#else
+ BUG();
+#endif
+ } else
+ bp = (u64 __user *)(unsigned long)(info->oi_requests);
+
+ if (o2info_from_user(*req_addr, bp + idx))
+ goto bail;
+
+ status = 0;
+bail:
+ return status;
+}
+
+/*
+ * OCFS2_IOC_INFO handles an array of requests passed from userspace.
+ *
+ * ocfs2_info_handle() recevies a large info aggregation, grab and
+ * validate the request count from header, then break it into small
+ * pieces, later specific handlers can handle them one by one.
+ *
+ * Idea here is to make each separate request small enough to ensure
+ * a better backward&forward compatibility, since a small piece of
+ * request will be less likely to be broken if disk layout get changed.
+ */
+int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
+ int compat_flag)
+{
+ int i, status = 0;
+ u64 req_addr;
+ struct ocfs2_info_request __user *reqp;
+
+ if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) ||
+ (!info->oi_requests)) {
+ status = -EINVAL;
+ goto bail;
+ }
+
+ for (i = 0; i < info->oi_count; i++) {
+
+ status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag);
+ if (status)
+ break;
+
+ reqp = (struct ocfs2_info_request *)(unsigned long)req_addr;
+ if (!reqp) {
+ status = -EINVAL;
+ goto bail;
+ }
+
+ status = ocfs2_info_handle_request(inode, reqp);
+ if (status)
+ break;
+ }
+
+bail:
+ return status;
+}
+
long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = filp->f_path.dentry->d_inode;
@@ -120,6 +462,7 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
struct reflink_arguments args;
const char *old_path, *new_path;
bool preserve;
+ struct ocfs2_info info;
switch (cmd) {
case OCFS2_IOC_GETFLAGS:
@@ -174,6 +517,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
preserve = (args.preserve != 0);
return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
+ case OCFS2_IOC_INFO:
+ if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
+ sizeof(struct ocfs2_info)))
+ return -EFAULT;
+
+ return ocfs2_info_handle(inode, &info, 0);
default:
return -ENOTTY;
}
@@ -185,6 +534,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
bool preserve;
struct reflink_arguments args;
struct inode *inode = file->f_path.dentry->d_inode;
+ struct ocfs2_info info;
switch (cmd) {
case OCFS2_IOC32_GETFLAGS:
@@ -209,6 +559,12 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
compat_ptr(args.new_path), preserve);
+ case OCFS2_IOC_INFO:
+ if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
+ sizeof(struct ocfs2_info)))
+ return -EFAULT;
+
+ return ocfs2_info_handle(inode, &info, 1);
default:
return -ENOIOCTLCMD;
}
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9b57c03..faa2303 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -301,7 +301,6 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
{
int status = 0;
unsigned int flushed;
- unsigned long old_id;
struct ocfs2_journal *journal = NULL;
mlog_entry_void();
@@ -326,7 +325,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
goto finally;
}
- old_id = ocfs2_inc_trans_id(journal);
+ ocfs2_inc_trans_id(journal);
flushed = atomic_read(&journal->j_num_trans);
atomic_set(&journal->j_num_trans, 0);
@@ -342,9 +341,6 @@ finally:
return status;
}
-/* pass it NULL and it will allocate a new handle object for you. If
- * you pass it a handle however, it may still return error, in which
- * case it has free'd the passed handle for you. */
handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
{
journal_t *journal = osb->journal->j_journal;
@@ -1888,6 +1884,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
os = &osb->osb_orphan_scan;
+ mlog(0, "Begin orphan scan\n");
+
if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
goto out;
@@ -1920,6 +1918,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
unlock:
ocfs2_orphan_scan_unlock(osb, seqno);
out:
+ mlog(0, "Orphan scan completed\n");
return;
}
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index b5baaa8..43e56b9 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -67,11 +67,12 @@ struct ocfs2_journal {
struct buffer_head *j_bh; /* Journal disk inode block */
atomic_t j_num_trans; /* Number of transactions
* currently in the system. */
+ spinlock_t j_lock;
unsigned long j_trans_id;
struct rw_semaphore j_trans_barrier;
wait_queue_head_t j_checkpointed;
- spinlock_t j_lock;
+ /* both fields protected by j_lock*/
struct list_head j_la_cleanups;
struct work_struct j_recovery_work;
};
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index af2b8fe..7e32db9 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
return ret;
}
-static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
+static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
struct page *page)
{
int ret;
+ struct inode *inode = file->f_path.dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
loff_t pos = page_offset(page);
unsigned int len = PAGE_CACHE_SIZE;
@@ -74,9 +75,11 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
/*
* Another node might have truncated while we were waiting on
* cluster locks.
+ * We don't check size == 0 before the shift. This is borrowed
+ * from do_generic_file_read.
*/
- last_index = size >> PAGE_CACHE_SHIFT;
- if (page->index > last_index) {
+ last_index = (size - 1) >> PAGE_CACHE_SHIFT;
+ if (unlikely(!size || page->index > last_index)) {
ret = -EINVAL;
goto out;
}
@@ -107,9 +110,9 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
* because the "write" would invalidate their data.
*/
if (page->index == last_index)
- len = size & ~PAGE_CACHE_MASK;
+ len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
- ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
+ ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
&fsdata, di_bh, page);
if (ret) {
if (ret != -ENOSPC)
@@ -157,7 +160,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = __ocfs2_page_mkwrite(inode, di_bh, page);
+ ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page);
up_write(&OCFS2_I(inode)->ip_alloc_sem);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index f171b51..e7bde21 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -171,7 +171,8 @@ bail_add:
ret = ERR_PTR(status);
goto bail_unlock;
}
- }
+ } else
+ ocfs2_dentry_attach_gen(dentry);
bail_unlock:
/* Don't drop the cluster lock until *after* the d_add --
@@ -472,32 +473,23 @@ leave:
return status;
}
-static int ocfs2_mknod_locked(struct ocfs2_super *osb,
- struct inode *dir,
- struct inode *inode,
- dev_t dev,
- struct buffer_head **new_fe_bh,
- struct buffer_head *parent_fe_bh,
- handle_t *handle,
- struct ocfs2_alloc_context *inode_ac)
+static int __ocfs2_mknod_locked(struct inode *dir,
+ struct inode *inode,
+ dev_t dev,
+ struct buffer_head **new_fe_bh,
+ struct buffer_head *parent_fe_bh,
+ handle_t *handle,
+ struct ocfs2_alloc_context *inode_ac,
+ u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit)
{
int status = 0;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct ocfs2_dinode *fe = NULL;
struct ocfs2_extent_list *fel;
- u64 suballoc_loc, fe_blkno = 0;
- u16 suballoc_bit;
u16 feat;
*new_fe_bh = NULL;
- status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
- inode_ac, &suballoc_loc,
- &suballoc_bit, &fe_blkno);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
/* populate as many fields early on as possible - many of
* these are used by the support functions here and in
* callers. */
@@ -591,6 +583,34 @@ leave:
return status;
}
+static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+ struct inode *dir,
+ struct inode *inode,
+ dev_t dev,
+ struct buffer_head **new_fe_bh,
+ struct buffer_head *parent_fe_bh,
+ handle_t *handle,
+ struct ocfs2_alloc_context *inode_ac)
+{
+ int status = 0;
+ u64 suballoc_loc, fe_blkno = 0;
+ u16 suballoc_bit;
+
+ *new_fe_bh = NULL;
+
+ status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
+ inode_ac, &suballoc_loc,
+ &suballoc_bit, &fe_blkno);
+ if (status < 0) {
+ mlog_errno(status);
+ return status;
+ }
+
+ return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
+ parent_fe_bh, handle, inode_ac,
+ fe_blkno, suballoc_loc, suballoc_bit);
+}
+
static int ocfs2_mkdir(struct inode *dir,
struct dentry *dentry,
int mode)
@@ -1852,61 +1872,117 @@ bail:
return status;
}
-static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
- struct inode **ret_orphan_dir,
- u64 blkno,
- char *name,
- struct ocfs2_dir_lookup_result *lookup)
+static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb,
+ struct inode **ret_orphan_dir,
+ struct buffer_head **ret_orphan_dir_bh)
{
struct inode *orphan_dir_inode;
struct buffer_head *orphan_dir_bh = NULL;
- int status = 0;
-
- status = ocfs2_blkno_stringify(blkno, name);
- if (status < 0) {
- mlog_errno(status);
- return status;
- }
+ int ret = 0;
orphan_dir_inode = ocfs2_get_system_file_inode(osb,
ORPHAN_DIR_SYSTEM_INODE,
osb->slot_num);
if (!orphan_dir_inode) {
- status = -ENOENT;
- mlog_errno(status);
- return status;
+ ret = -ENOENT;
+ mlog_errno(ret);
+ return ret;
}
mutex_lock(&orphan_dir_inode->i_mutex);
- status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
+ ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+ if (ret < 0) {
+ mutex_unlock(&orphan_dir_inode->i_mutex);
+ iput(orphan_dir_inode);
+
+ mlog_errno(ret);
+ return ret;
}
- status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
- orphan_dir_bh, name,
- OCFS2_ORPHAN_NAMELEN, lookup);
- if (status < 0) {
- ocfs2_inode_unlock(orphan_dir_inode, 1);
+ *ret_orphan_dir = orphan_dir_inode;
+ *ret_orphan_dir_bh = orphan_dir_bh;
- mlog_errno(status);
- goto leave;
+ return 0;
+}
+
+static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
+ struct buffer_head *orphan_dir_bh,
+ u64 blkno,
+ char *name,
+ struct ocfs2_dir_lookup_result *lookup)
+{
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
+
+ ret = ocfs2_blkno_stringify(blkno, name);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
+ orphan_dir_bh, name,
+ OCFS2_ORPHAN_NAMELEN, lookup);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for
+ * insertion of an orphan.
+ * @osb: ocfs2 file system
+ * @ret_orphan_dir: Orphan dir inode - returned locked!
+ * @blkno: Actual block number of the inode to be inserted into orphan dir.
+ * @lookup: dir lookup result, to be passed back into functions like
+ * ocfs2_orphan_add
+ *
+ * Returns zero on success and the ret_orphan_dir, name and lookup
+ * fields will be populated.
+ *
+ * Returns non-zero on failure.
+ */
+static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
+ struct inode **ret_orphan_dir,
+ u64 blkno,
+ char *name,
+ struct ocfs2_dir_lookup_result *lookup)
+{
+ struct inode *orphan_dir_inode = NULL;
+ struct buffer_head *orphan_dir_bh = NULL;
+ int ret = 0;
+
+ ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode,
+ &orphan_dir_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
+ blkno, name, lookup);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
}
*ret_orphan_dir = orphan_dir_inode;
-leave:
- if (status) {
+out:
+ brelse(orphan_dir_bh);
+
+ if (ret) {
+ ocfs2_inode_unlock(orphan_dir_inode, 1);
mutex_unlock(&orphan_dir_inode->i_mutex);
iput(orphan_dir_inode);
}
- brelse(orphan_dir_bh);
-
- mlog_exit(status);
- return status;
+ mlog_exit(ret);
+ return ret;
}
static int ocfs2_orphan_add(struct ocfs2_super *osb,
@@ -2053,6 +2129,99 @@ leave:
return status;
}
+/**
+ * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to recieve a newly
+ * allocated file. This is different from the typical 'add to orphan dir'
+ * operation in that the inode does not yet exist. This is a problem because
+ * the orphan dir stringifies the inode block number to come up with it's
+ * dirent. Obviously if the inode does not yet exist we have a chicken and egg
+ * problem. This function works around it by calling deeper into the orphan
+ * and suballoc code than other callers. Use this only by necessity.
+ * @dir: The directory which this inode will ultimately wind up under - not the
+ * orphan dir!
+ * @dir_bh: buffer_head the @dir inode block
+ * @orphan_name: string of length (CFS2_ORPHAN_NAMELEN + 1). Will be filled
+ * with the string to be used for orphan dirent. Pass back to the orphan dir
+ * code.
+ * @ret_orphan_dir: orphan dir inode returned to be passed back into orphan
+ * dir code.
+ * @ret_di_blkno: block number where the new inode will be allocated.
+ * @orphan_insert: Dir insert context to be passed back into orphan dir code.
+ * @ret_inode_ac: Inode alloc context to be passed back to the allocator.
+ *
+ * Returns zero on success and the ret_orphan_dir, name and lookup
+ * fields will be populated.
+ *
+ * Returns non-zero on failure.
+ */
+static int ocfs2_prep_new_orphaned_file(struct inode *dir,
+ struct buffer_head *dir_bh,
+ char *orphan_name,
+ struct inode **ret_orphan_dir,
+ u64 *ret_di_blkno,
+ struct ocfs2_dir_lookup_result *orphan_insert,
+ struct ocfs2_alloc_context **ret_inode_ac)
+{
+ int ret;
+ u64 di_blkno;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ struct inode *orphan_dir = NULL;
+ struct buffer_head *orphan_dir_bh = NULL;
+ struct ocfs2_alloc_context *inode_ac = NULL;
+
+ ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir, &orphan_dir_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ /* reserve an inode spot */
+ ret = ocfs2_reserve_new_inode(osb, &inode_ac);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_new_inode_loc(dir, dir_bh, inode_ac,
+ &di_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
+ di_blkno, orphan_name, orphan_insert);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ if (ret == 0) {
+ *ret_orphan_dir = orphan_dir;
+ *ret_di_blkno = di_blkno;
+ *ret_inode_ac = inode_ac;
+ /*
+ * orphan_name and orphan_insert are already up to
+ * date via prepare_orphan_dir
+ */
+ } else {
+ /* Unroll reserve_new_inode* */
+ if (inode_ac)
+ ocfs2_free_alloc_context(inode_ac);
+
+ /* Unroll orphan dir locking */
+ mutex_unlock(&orphan_dir->i_mutex);
+ ocfs2_inode_unlock(orphan_dir, 1);
+ iput(orphan_dir);
+ }
+
+ brelse(orphan_dir_bh);
+
+ return 0;
+}
+
int ocfs2_create_inode_in_orphan(struct inode *dir,
int mode,
struct inode **new_inode)
@@ -2068,6 +2237,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
struct buffer_head *new_di_bh = NULL;
struct ocfs2_alloc_context *inode_ac = NULL;
struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+ u64 uninitialized_var(di_blkno), suballoc_loc;
+ u16 suballoc_bit;
status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
if (status < 0) {
@@ -2076,20 +2247,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
return status;
}
- /*
- * We give the orphan dir the root blkno to fake an orphan name,
- * and allocate enough space for our insertion.
- */
- status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
- osb->root_blkno,
- orphan_name, &orphan_insert);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
- /* reserve an inode spot */
- status = ocfs2_reserve_new_inode(osb, &inode_ac);
+ status = ocfs2_prep_new_orphaned_file(dir, parent_di_bh,
+ orphan_name, &orphan_dir,
+ &di_blkno, &orphan_insert, &inode_ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -2116,17 +2276,20 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
goto leave;
did_quota_inode = 1;
- inode->i_nlink = 0;
- /* do the real work now. */
- status = ocfs2_mknod_locked(osb, dir, inode,
- 0, &new_di_bh, parent_di_bh, handle,
- inode_ac);
+ status = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac,
+ &suballoc_loc,
+ &suballoc_bit, di_blkno);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name);
+ inode->i_nlink = 0;
+ /* do the real work now. */
+ status = __ocfs2_mknod_locked(dir, inode,
+ 0, &new_di_bh, parent_di_bh, handle,
+ inode_ac, di_blkno, suballoc_loc,
+ suballoc_bit);
if (status < 0) {
mlog_errno(status);
goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c67003b..d840821 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -150,26 +150,33 @@ typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
struct ocfs2_lock_res {
void *l_priv;
struct ocfs2_lock_res_ops *l_ops;
- spinlock_t l_lock;
+
struct list_head l_blocked_list;
struct list_head l_mask_waiters;
- enum ocfs2_lock_type l_type;
unsigned long l_flags;
char l_name[OCFS2_LOCK_ID_MAX_LEN];
- int l_level;
unsigned int l_ro_holders;
unsigned int l_ex_holders;
- struct ocfs2_dlm_lksb l_lksb;
+ unsigned char l_level;
+
+ /* Data packed - type enum ocfs2_lock_type */
+ unsigned char l_type;
/* used from AST/BAST funcs. */
- enum ocfs2_ast_action l_action;
- enum ocfs2_unlock_action l_unlock_action;
- int l_requested;
- int l_blocking;
+ /* Data packed - enum type ocfs2_ast_action */
+ unsigned char l_action;
+ /* Data packed - enum type ocfs2_unlock_action */
+ unsigned char l_unlock_action;
+ unsigned char l_requested;
+ unsigned char l_blocking;
unsigned int l_pending_gen;
+ spinlock_t l_lock;
+
+ struct ocfs2_dlm_lksb l_lksb;
+
wait_queue_head_t l_event;
struct list_head l_debug_list;
@@ -243,7 +250,7 @@ enum ocfs2_local_alloc_state
enum ocfs2_mount_options
{
- OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */
+ OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */
OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */
OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
@@ -256,6 +263,10 @@ enum ocfs2_mount_options
control lists */
OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
+ OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT
+ writes */
+ OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
+ OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
};
#define OCFS2_OSB_SOFT_RO 0x0001
@@ -277,7 +288,8 @@ struct ocfs2_super
struct super_block *sb;
struct inode *root_inode;
struct inode *sys_root_inode;
- struct inode *system_inodes[NUM_SYSTEM_INODES];
+ struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES];
+ struct inode **local_system_inodes;
struct ocfs2_slot_info *slot_info;
@@ -368,6 +380,8 @@ struct ocfs2_super
struct ocfs2_alloc_stats alloc_stats;
char dev_str[20]; /* "major,minor" of the device */
+ u8 osb_stackflags;
+
char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
struct ocfs2_cluster_connection *cconn;
struct ocfs2_lock_res osb_super_lockres;
@@ -601,10 +615,35 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
return ret;
}
-static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
+static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb)
{
return (osb->s_feature_incompat &
- OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK);
+ (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK |
+ OCFS2_FEATURE_INCOMPAT_CLUSTERINFO));
+}
+
+static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
+{
+ if (ocfs2_clusterinfo_valid(osb) &&
+ memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
+ OCFS2_STACK_LABEL_LEN))
+ return 1;
+ return 0;
+}
+
+static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb)
+{
+ if (ocfs2_clusterinfo_valid(osb) &&
+ !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
+ OCFS2_STACK_LABEL_LEN))
+ return 1;
+ return 0;
+}
+
+static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb)
+{
+ return ocfs2_o2cb_stack(osb) &&
+ (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT);
}
static inline int ocfs2_mount_local(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 33f1c9a..c2e4f82 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -101,7 +101,8 @@
| OCFS2_FEATURE_INCOMPAT_META_ECC \
| OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
| OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
- | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
+ | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \
+ | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
| OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
| OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -170,6 +171,13 @@
#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
/*
+ * Incompat bit to indicate useable clusterinfo with stackflags for all
+ * cluster stacks (userspace adnd o2cb). If this bit is set,
+ * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set.
+ */
+#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000
+
+/*
* backup superblock flag is used to indicate that this volume
* has backup superblocks.
*/
@@ -235,18 +243,31 @@
#define OCFS2_HAS_REFCOUNT_FL (0x0010)
/* Inode attributes, keep in sync with EXT2 */
-#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */
-#define OCFS2_UNRM_FL (0x00000002) /* Undelete */
-#define OCFS2_COMPR_FL (0x00000004) /* Compress file */
-#define OCFS2_SYNC_FL (0x00000008) /* Synchronous updates */
-#define OCFS2_IMMUTABLE_FL (0x00000010) /* Immutable file */
-#define OCFS2_APPEND_FL (0x00000020) /* writes to file may only append */
-#define OCFS2_NODUMP_FL (0x00000040) /* do not dump file */
-#define OCFS2_NOATIME_FL (0x00000080) /* do not update atime */
-#define OCFS2_DIRSYNC_FL (0x00010000) /* dirsync behaviour (directories only) */
-
-#define OCFS2_FL_VISIBLE (0x000100FF) /* User visible flags */
-#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */
+#define OCFS2_SECRM_FL FS_SECRM_FL /* Secure deletion */
+#define OCFS2_UNRM_FL FS_UNRM_FL /* Undelete */
+#define OCFS2_COMPR_FL FS_COMPR_FL /* Compress file */
+#define OCFS2_SYNC_FL FS_SYNC_FL /* Synchronous updates */
+#define OCFS2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */
+#define OCFS2_APPEND_FL FS_APPEND_FL /* writes to file may only append */
+#define OCFS2_NODUMP_FL FS_NODUMP_FL /* do not dump file */
+#define OCFS2_NOATIME_FL FS_NOATIME_FL /* do not update atime */
+/* Reserved for compression usage... */
+#define OCFS2_DIRTY_FL FS_DIRTY_FL
+#define OCFS2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */
+#define OCFS2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */
+#define OCFS2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */
+/* End compression flags --- maybe not all used */
+#define OCFS2_BTREE_FL FS_BTREE_FL /* btree format dir */
+#define OCFS2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */
+#define OCFS2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */
+#define OCFS2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */
+#define OCFS2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */
+#define OCFS2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */
+#define OCFS2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/
+#define OCFS2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */
+
+#define OCFS2_FL_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */
+#define OCFS2_FL_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */
/*
* Extent record flags (e_node.leaf.flags)
@@ -279,10 +300,13 @@
#define OCFS2_VOL_UUID_LEN 16
#define OCFS2_MAX_VOL_LABEL_LEN 64
-/* The alternate, userspace stack fields */
+/* The cluster stack fields */
#define OCFS2_STACK_LABEL_LEN 4
#define OCFS2_CLUSTER_NAME_LEN 16
+/* Classic (historically speaking) cluster stack */
+#define OCFS2_CLASSIC_CLUSTER_STACK "o2cb"
+
/* Journal limits (in bytes) */
#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
@@ -292,6 +316,11 @@
*/
#define OCFS2_MIN_XATTR_INLINE_SIZE 256
+/*
+ * Cluster info flags (ocfs2_cluster_info.ci_stackflags)
+ */
+#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT (0x01)
+
struct ocfs2_system_inode_info {
char *si_name;
int si_iflags;
@@ -309,6 +338,7 @@ enum {
USER_QUOTA_SYSTEM_INODE,
GROUP_QUOTA_SYSTEM_INODE,
#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
+#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE
ORPHAN_DIR_SYSTEM_INODE,
EXTENT_ALLOC_SYSTEM_INODE,
INODE_ALLOC_SYSTEM_INODE,
@@ -317,8 +347,12 @@ enum {
TRUNCATE_LOG_SYSTEM_INODE,
LOCAL_USER_QUOTA_SYSTEM_INODE,
LOCAL_GROUP_QUOTA_SYSTEM_INODE,
+#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
NUM_SYSTEM_INODES
};
+#define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE
+#define NUM_LOCAL_SYSTEM_INODES \
+ (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
/* Global system inodes (single copy) */
@@ -347,6 +381,7 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
/* Parameter passed from mount.ocfs2 to module */
#define OCFS2_HB_NONE "heartbeat=none"
#define OCFS2_HB_LOCAL "heartbeat=local"
+#define OCFS2_HB_GLOBAL "heartbeat=global"
/*
* OCFS2 directory file types. Only the low 3 bits are used. The
@@ -553,9 +588,21 @@ struct ocfs2_slot_map_extended {
*/
};
+/*
+ * ci_stackflags is only valid if the incompat bit
+ * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set.
+ */
struct ocfs2_cluster_info {
/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN];
- __le32 ci_reserved;
+ union {
+ __le32 ci_reserved;
+ struct {
+ __u8 ci_stackflags;
+ __u8 ci_reserved1;
+ __u8 ci_reserved2;
+ __u8 ci_reserved3;
+ };
+ };
/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN];
/*18*/
};
@@ -592,9 +639,9 @@ struct ocfs2_super_block {
* group header */
/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
-/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
- stack. Only valid
- with INCOMPAT flag. */
+/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Only valid if either
+ userspace or clusterinfo
+ INCOMPAT flag set. */
/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
for this fs*/
__le16 s_reserved0;
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index 2d3420a..b46f39b 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -23,10 +23,10 @@
/*
* ioctl commands
*/
-#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
-#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long)
-#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int)
-#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
+#define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS
+#define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS
+#define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS
+#define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS
/*
* Space reservation / allocation / free ioctls and argument structure
@@ -76,4 +76,99 @@ struct reflink_arguments {
};
#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
+/* Following definitions dedicated for ocfs2_info_request ioctls. */
+#define OCFS2_INFO_MAX_REQUEST (50)
+#define OCFS2_TEXT_UUID_LEN (OCFS2_VOL_UUID_LEN * 2)
+
+/* Magic number of all requests */
+#define OCFS2_INFO_MAGIC (0x4F32494E)
+
+/*
+ * Always try to separate info request into small pieces to
+ * guarantee the backward&forward compatibility.
+ */
+struct ocfs2_info {
+ __u64 oi_requests; /* Array of __u64 pointers to requests */
+ __u32 oi_count; /* Number of requests in info_requests */
+ __u32 oi_pad;
+};
+
+struct ocfs2_info_request {
+/*00*/ __u32 ir_magic; /* Magic number */
+ __u32 ir_code; /* Info request code */
+ __u32 ir_size; /* Size of request */
+ __u32 ir_flags; /* Request flags */
+/*10*/ /* Request specific fields */
+};
+
+struct ocfs2_info_clustersize {
+ struct ocfs2_info_request ic_req;
+ __u32 ic_clustersize;
+ __u32 ic_pad;
+};
+
+struct ocfs2_info_blocksize {
+ struct ocfs2_info_request ib_req;
+ __u32 ib_blocksize;
+ __u32 ib_pad;
+};
+
+struct ocfs2_info_maxslots {
+ struct ocfs2_info_request im_req;
+ __u32 im_max_slots;
+ __u32 im_pad;
+};
+
+struct ocfs2_info_label {
+ struct ocfs2_info_request il_req;
+ __u8 il_label[OCFS2_MAX_VOL_LABEL_LEN];
+} __attribute__ ((packed));
+
+struct ocfs2_info_uuid {
+ struct ocfs2_info_request iu_req;
+ __u8 iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1];
+} __attribute__ ((packed));
+
+struct ocfs2_info_fs_features {
+ struct ocfs2_info_request if_req;
+ __u32 if_compat_features;
+ __u32 if_incompat_features;
+ __u32 if_ro_compat_features;
+ __u32 if_pad;
+};
+
+struct ocfs2_info_journal_size {
+ struct ocfs2_info_request ij_req;
+ __u64 ij_journal_size;
+};
+
+/* Codes for ocfs2_info_request */
+enum ocfs2_info_type {
+ OCFS2_INFO_CLUSTERSIZE = 1,
+ OCFS2_INFO_BLOCKSIZE,
+ OCFS2_INFO_MAXSLOTS,
+ OCFS2_INFO_LABEL,
+ OCFS2_INFO_UUID,
+ OCFS2_INFO_FS_FEATURES,
+ OCFS2_INFO_JOURNAL_SIZE,
+ OCFS2_INFO_NUM_TYPES
+};
+
+/* Flags for struct ocfs2_info_request */
+/* Filled by the caller */
+#define OCFS2_INFO_FL_NON_COHERENT (0x00000001) /* Cluster coherency not
+ required. This is a hint.
+ It is up to ocfs2 whether
+ the request can be fulfilled
+ without locking. */
+/* Filled by ocfs2 */
+#define OCFS2_INFO_FL_FILLED (0x40000000) /* Filesystem understood
+ this request and
+ filled in the answer */
+
+#define OCFS2_INFO_FL_ERROR (0x80000000) /* Error happened during
+ request handling. */
+
+#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
+
#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3ac5aa7..b5f9160 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -49,6 +49,7 @@
struct ocfs2_cow_context {
struct inode *inode;
+ struct file *file;
u32 cow_start;
u32 cow_len;
struct ocfs2_extent_tree data_et;
@@ -2436,16 +2437,26 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
le32_to_cpu(rec.r_clusters)) - cpos;
/*
- * If the refcount rec already exist, cool. We just need
- * to check whether there is a split. Otherwise we just need
- * to increase the refcount.
- * If we will insert one, increases recs_add.
- *
* We record all the records which will be inserted to the
* same refcount block, so that we can tell exactly whether
* we need a new refcount block or not.
+ *
+ * If we will insert a new one, this is easy and only happens
+ * during adding refcounted flag to the extent, so we don't
+ * have a chance of spliting. We just need one record.
+ *
+ * If the refcount rec already exists, that would be a little
+ * complicated. we may have to:
+ * 1) split at the beginning if the start pos isn't aligned.
+ * we need 1 more record in this case.
+ * 2) split int the end if the end pos isn't aligned.
+ * we need 1 more record in this case.
+ * 3) split in the middle because of file system fragmentation.
+ * we need 2 more records in this case(we can't detect this
+ * beforehand, so always think of the worst case).
*/
if (rec.r_refcount) {
+ recs_add += 2;
/* Check whether we need a split at the beginning. */
if (cpos == start_cpos &&
cpos != le64_to_cpu(rec.r_cpos))
@@ -2922,13 +2933,16 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
struct page *page;
pgoff_t page_index;
- unsigned int from, to;
+ unsigned int from, to, readahead_pages;
loff_t offset, end, map_end;
struct address_space *mapping = context->inode->i_mapping;
mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
new_cluster, new_len, cpos);
+ readahead_pages =
+ (ocfs2_cow_contig_clusters(sb) <<
+ OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
/*
@@ -2950,7 +2964,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
if (map_end & (PAGE_CACHE_SIZE - 1))
to = map_end & (PAGE_CACHE_SIZE - 1);
- page = grab_cache_page(mapping, page_index);
+ page = find_or_create_page(mapping, page_index, GFP_NOFS);
/*
* In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
@@ -2959,6 +2973,14 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
BUG_ON(PageDirty(page));
+ if (PageReadahead(page) && context->file) {
+ page_cache_async_readahead(mapping,
+ &context->file->f_ra,
+ context->file,
+ page, page_index,
+ readahead_pages);
+ }
+
if (!PageUptodate(page)) {
ret = block_read_full_page(page, ocfs2_get_block);
if (ret) {
@@ -3169,7 +3191,8 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
if (map_end > end)
map_end = end;
- page = grab_cache_page(context->inode->i_mapping, page_index);
+ page = find_or_create_page(context->inode->i_mapping,
+ page_index, GFP_NOFS);
BUG_ON(!page);
wait_on_page_writeback(page);
@@ -3398,12 +3421,35 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
return ret;
}
+static void ocfs2_readahead_for_cow(struct inode *inode,
+ struct file *file,
+ u32 start, u32 len)
+{
+ struct address_space *mapping;
+ pgoff_t index;
+ unsigned long num_pages;
+ int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+
+ if (!file)
+ return;
+
+ mapping = file->f_mapping;
+ num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT;
+ if (!num_pages)
+ num_pages = 1;
+
+ index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT;
+ page_cache_sync_readahead(mapping, &file->f_ra, file,
+ index, num_pages);
+}
+
/*
* Starting at cpos, try to CoW write_len clusters. Don't CoW
* past max_cpos. This will stop when it runs into a hole or an
* unrefcounted extent.
*/
static int ocfs2_refcount_cow_hunk(struct inode *inode,
+ struct file *file,
struct buffer_head *di_bh,
u32 cpos, u32 write_len, u32 max_cpos)
{
@@ -3432,6 +3478,8 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
BUG_ON(cow_len == 0);
+ ocfs2_readahead_for_cow(inode, file, cow_start, cow_len);
+
context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
if (!context) {
ret = -ENOMEM;
@@ -3453,6 +3501,7 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
context->ref_root_bh = ref_root_bh;
context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
context->get_clusters = ocfs2_di_get_clusters;
+ context->file = file;
ocfs2_init_dinode_extent_tree(&context->data_et,
INODE_CACHE(inode), di_bh);
@@ -3481,6 +3530,7 @@ out:
* clusters between cpos and cpos+write_len are safe to modify.
*/
int ocfs2_refcount_cow(struct inode *inode,
+ struct file *file,
struct buffer_head *di_bh,
u32 cpos, u32 write_len, u32 max_cpos)
{
@@ -3500,7 +3550,7 @@ int ocfs2_refcount_cow(struct inode *inode,
num_clusters = write_len;
if (ext_flags & OCFS2_EXT_REFCOUNTED) {
- ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
+ ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos,
num_clusters, max_cpos);
if (ret) {
mlog_errno(ret);
@@ -4190,8 +4240,9 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
goto out;
}
- mutex_lock(&new_inode->i_mutex);
- ret = ocfs2_inode_lock(new_inode, &new_bh, 1);
+ mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
+ ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
+ OI_LS_REFLINK_TARGET);
if (ret) {
mlog_errno(ret);
goto out_unlock;
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 9983ba1..c8ce46f7 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -21,14 +21,14 @@ struct ocfs2_refcount_tree {
struct rb_node rf_node;
u64 rf_blkno;
u32 rf_generation;
+ struct kref rf_getcnt;
struct rw_semaphore rf_sem;
struct ocfs2_lock_res rf_lockres;
- struct kref rf_getcnt;
int rf_removed;
/* the following 4 fields are used by caching_info. */
- struct ocfs2_caching_info rf_ci;
spinlock_t rf_lock;
+ struct ocfs2_caching_info rf_ci;
struct mutex rf_io_mutex;
struct super_block *rf_sb;
};
@@ -52,7 +52,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
u32 clusters,
int *credits,
int *ref_blocks);
-int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_refcount_cow(struct inode *inode,
+ struct file *filep, struct buffer_head *di_bh,
u32 cpos, u32 write_len, u32 max_cpos);
typedef int (ocfs2_post_refcount_func)(struct inode *inode,
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index d8b6e42..3e78db3 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -732,25 +732,23 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
struct ocfs2_alloc_reservation *resv,
int *cstart, int *clen)
{
- unsigned int wanted = *clen;
-
if (resv == NULL || ocfs2_resmap_disabled(resmap))
return -ENOSPC;
spin_lock(&resv_lock);
- /*
- * We don't want to over-allocate for temporary
- * windows. Otherwise, we run the risk of fragmenting the
- * allocation space.
- */
- wanted = ocfs2_resv_window_bits(resmap, resv);
- if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
- wanted = *clen;
-
if (ocfs2_resv_empty(resv)) {
- mlog(0, "empty reservation, find new window\n");
+ /*
+ * We don't want to over-allocate for temporary
+ * windows. Otherwise, we run the risk of fragmenting the
+ * allocation space.
+ */
+ unsigned int wanted = ocfs2_resv_window_bits(resmap, resv);
+ if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
+ wanted = *clen;
+
+ mlog(0, "empty reservation, find new window\n");
/*
* Try to get a window here. If it works, we must fall
* through and test the bitmap . This avoids some
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bfbd7e9..ab4e017 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -357,7 +357,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
{
int status = 0;
u64 blkno;
- unsigned long long blocks, bytes;
+ unsigned long long blocks, bytes = 0;
unsigned int i;
struct buffer_head *bh;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 0d3049f..19965b0 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -283,6 +283,8 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
/* for now we only have one cluster/node, make sure we see it
* in the heartbeat universe */
if (!o2hb_check_local_node_heartbeating()) {
+ if (o2hb_global_heartbeat_active())
+ mlog(ML_ERROR, "Global heartbeat not started\n");
rc = -EINVAL;
goto out;
}
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 2dc57bc..252e7c8 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -22,7 +22,6 @@
#include <linux/miscdevice.h>
#include <linux/mutex.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
#include <linux/reboot.h>
#include <asm/uaccess.h>
@@ -612,12 +611,10 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
return -ENOMEM;
p->op_this_node = -1;
- lock_kernel();
mutex_lock(&ocfs2_control_lock);
file->private_data = p;
list_add(&p->op_list, &ocfs2_control_private_list);
mutex_unlock(&ocfs2_control_lock);
- unlock_kernel();
return 0;
}
@@ -628,6 +625,7 @@ static const struct file_operations ocfs2_control_fops = {
.read = ocfs2_control_read,
.write = ocfs2_control_write,
.owner = THIS_MODULE,
+ .llseek = default_llseek,
};
static struct miscdevice ocfs2_control_device = {
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index a8e6a95..5fed60d 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -57,11 +57,28 @@ struct ocfs2_suballoc_result {
u64 sr_bg_blkno; /* The bg we allocated from. Set
to 0 when a block group is
contiguous. */
+ u64 sr_bg_stable_blkno; /*
+ * Doesn't change, always
+ * set to target block
+ * group descriptor
+ * block.
+ */
u64 sr_blkno; /* The first allocated block */
unsigned int sr_bit_offset; /* The bit in the bg */
unsigned int sr_bits; /* How many bits we claimed */
};
+static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
+{
+ if (res->sr_blkno == 0)
+ return 0;
+
+ if (res->sr_bg_blkno)
+ return res->sr_bg_blkno;
+
+ return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
+}
+
static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -138,6 +155,10 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
brelse(ac->ac_bh);
ac->ac_bh = NULL;
ac->ac_resv = NULL;
+ if (ac->ac_find_loc_priv) {
+ kfree(ac->ac_find_loc_priv);
+ ac->ac_find_loc_priv = NULL;
+ }
}
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -336,7 +357,7 @@ out:
static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
struct ocfs2_group_desc *bg,
struct ocfs2_chain_list *cl,
- u64 p_blkno, u32 clusters)
+ u64 p_blkno, unsigned int clusters)
{
struct ocfs2_extent_list *el = &bg->bg_list;
struct ocfs2_extent_rec *rec;
@@ -348,7 +369,7 @@ static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
rec->e_blkno = cpu_to_le64(p_blkno);
rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
le16_to_cpu(cl->cl_bpc));
- rec->e_leaf_clusters = cpu_to_le32(clusters);
+ rec->e_leaf_clusters = cpu_to_le16(clusters);
le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
le16_add_cpu(&bg->bg_free_bits_count,
clusters * le16_to_cpu(cl->cl_bpc));
@@ -1359,6 +1380,14 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
}
le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
+ if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+ ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+ " count %u but claims %u are freed. num_bits %d",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count), num_bits);
+ return -EROFS;
+ }
while(num_bits--)
ocfs2_set_bit(bit_off++, bitmap);
@@ -1678,6 +1707,15 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
if (!ret)
ocfs2_bg_discontig_fix_result(ac, gd, res);
+ /*
+ * sr_bg_blkno might have been changed by
+ * ocfs2_bg_discontig_fix_result
+ */
+ res->sr_bg_stable_blkno = group_bh->b_blocknr;
+
+ if (ac->ac_find_loc_only)
+ goto out_loc_only;
+
ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
res->sr_bits,
le16_to_cpu(gd->bg_chain));
@@ -1691,6 +1729,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
if (ret < 0)
mlog_errno(ret);
+out_loc_only:
*bits_left = le16_to_cpu(gd->bg_free_bits_count);
out:
@@ -1708,7 +1747,6 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
{
int status;
u16 chain;
- u32 tmp_used;
u64 next_group;
struct inode *alloc_inode = ac->ac_inode;
struct buffer_head *group_bh = NULL;
@@ -1770,6 +1808,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
if (!status)
ocfs2_bg_discontig_fix_result(ac, bg, res);
+ /*
+ * sr_bg_blkno might have been changed by
+ * ocfs2_bg_discontig_fix_result
+ */
+ res->sr_bg_stable_blkno = group_bh->b_blocknr;
/*
* Keep track of previous block descriptor read. When
@@ -1796,22 +1839,17 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
}
}
- /* Ok, claim our bits now: set the info on dinode, chainlist
- * and then the group */
- status = ocfs2_journal_access_di(handle,
- INODE_CACHE(alloc_inode),
- ac->ac_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
+ if (ac->ac_find_loc_only)
+ goto out_loc_only;
+
+ status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
+ ac->ac_bh, res->sr_bits,
+ chain);
+ if (status) {
mlog_errno(status);
goto bail;
}
- tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
- fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used);
- le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits);
- ocfs2_journal_dirty(handle, ac->ac_bh);
-
status = ocfs2_block_group_set_bits(handle,
alloc_inode,
bg,
@@ -1826,6 +1864,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
(unsigned long long)le64_to_cpu(fe->i_blkno));
+out_loc_only:
*bits_left = le16_to_cpu(bg->bg_free_bits_count);
bail:
brelse(group_bh);
@@ -1845,6 +1884,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
int status;
u16 victim, i;
u16 bits_left = 0;
+ u64 hint = ac->ac_last_group;
struct ocfs2_chain_list *cl;
struct ocfs2_dinode *fe;
@@ -1872,7 +1912,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
goto bail;
}
- res->sr_bg_blkno = ac->ac_last_group;
+ res->sr_bg_blkno = hint;
if (res->sr_bg_blkno) {
/* Attempt to short-circuit the usual search mechanism
* by jumping straight to the most recently used
@@ -1896,8 +1936,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
res, &bits_left);
- if (!status)
+ if (!status) {
+ hint = ocfs2_group_from_res(res);
goto set_hint;
+ }
if (status < 0 && status != -ENOSPC) {
mlog_errno(status);
goto bail;
@@ -1920,8 +1962,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
ac->ac_chain = i;
status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
res, &bits_left);
- if (!status)
+ if (!status) {
+ hint = ocfs2_group_from_res(res);
break;
+ }
if (status < 0 && status != -ENOSPC) {
mlog_errno(status);
goto bail;
@@ -1936,7 +1980,7 @@ set_hint:
if (bits_left < min_bits)
ac->ac_last_group = 0;
else
- ac->ac_last_group = res->sr_bg_blkno;
+ ac->ac_last_group = hint;
}
bail:
@@ -2016,6 +2060,136 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir,
OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
}
+int ocfs2_find_new_inode_loc(struct inode *dir,
+ struct buffer_head *parent_fe_bh,
+ struct ocfs2_alloc_context *ac,
+ u64 *fe_blkno)
+{
+ int ret;
+ handle_t *handle = NULL;
+ struct ocfs2_suballoc_result *res;
+
+ BUG_ON(!ac);
+ BUG_ON(ac->ac_bits_given != 0);
+ BUG_ON(ac->ac_bits_wanted != 1);
+ BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
+
+ res = kzalloc(sizeof(*res), GFP_NOFS);
+ if (res == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
+
+ /*
+ * The handle started here is for chain relink. Alternatively,
+ * we could just disable relink for these calls.
+ */
+ handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * This will instruct ocfs2_claim_suballoc_bits and
+ * ocfs2_search_one_group to search but save actual allocation
+ * for later.
+ */
+ ac->ac_find_loc_only = 1;
+
+ ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ac->ac_find_loc_priv = res;
+ *fe_blkno = res->sr_blkno;
+
+out:
+ if (handle)
+ ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
+
+ if (ret)
+ kfree(res);
+
+ return ret;
+}
+
+int ocfs2_claim_new_inode_at_loc(handle_t *handle,
+ struct inode *dir,
+ struct ocfs2_alloc_context *ac,
+ u64 *suballoc_loc,
+ u16 *suballoc_bit,
+ u64 di_blkno)
+{
+ int ret;
+ u16 chain;
+ struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
+ struct buffer_head *bg_bh = NULL;
+ struct ocfs2_group_desc *bg;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
+
+ /*
+ * Since di_blkno is being passed back in, we check for any
+ * inconsistencies which may have happened between
+ * calls. These are code bugs as di_blkno is not expected to
+ * change once returned from ocfs2_find_new_inode_loc()
+ */
+ BUG_ON(res->sr_blkno != di_blkno);
+
+ ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
+ res->sr_bg_stable_blkno, &bg_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+ chain = le16_to_cpu(bg->bg_chain);
+
+ ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
+ ac->ac_bh, res->sr_bits,
+ chain);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_block_group_set_bits(handle,
+ ac->ac_inode,
+ bg,
+ bg_bh,
+ res->sr_bit_offset,
+ res->sr_bits);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
+ (unsigned long long)di_blkno);
+
+ atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
+
+ BUG_ON(res->sr_bits != 1);
+
+ *suballoc_loc = res->sr_bg_blkno;
+ *suballoc_bit = res->sr_bit_offset;
+ ac->ac_bits_given++;
+ ocfs2_save_inode_ac_group(dir, ac);
+
+out:
+ brelse(bg_bh);
+
+ return ret;
+}
+
int ocfs2_claim_new_inode(handle_t *handle,
struct inode *dir,
struct buffer_head *parent_fe_bh,
@@ -2253,6 +2427,14 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
(unsigned long *) undo_bg->bg_bitmap);
}
le16_add_cpu(&bg->bg_free_bits_count, num_bits);
+ if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+ ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+ " count %u but claims %u are freed. num_bits %d",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count), num_bits);
+ return -EROFS;
+ }
if (undo_fn)
jbd_unlock_bh_state(group_bh);
@@ -2567,7 +2749,8 @@ out:
* suballoc_bit.
*/
static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
- u16 *suballoc_slot, u16 *suballoc_bit)
+ u16 *suballoc_slot, u64 *group_blkno,
+ u16 *suballoc_bit)
{
int status;
struct buffer_head *inode_bh = NULL;
@@ -2604,6 +2787,8 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
*suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
if (suballoc_bit)
*suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
+ if (group_blkno)
+ *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
bail:
brelse(inode_bh);
@@ -2621,7 +2806,8 @@ bail:
*/
static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
struct inode *suballoc,
- struct buffer_head *alloc_bh, u64 blkno,
+ struct buffer_head *alloc_bh,
+ u64 group_blkno, u64 blkno,
u16 bit, int *res)
{
struct ocfs2_dinode *alloc_di;
@@ -2642,10 +2828,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
goto bail;
}
- if (alloc_di->i_suballoc_loc)
- bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc);
- else
- bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
+ bg_blkno = group_blkno ? group_blkno :
+ ocfs2_which_suballoc_group(blkno, bit);
status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
&group_bh);
if (status < 0) {
@@ -2680,6 +2864,7 @@ bail:
int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
{
int status;
+ u64 group_blkno = 0;
u16 suballoc_bit = 0, suballoc_slot = 0;
struct inode *inode_alloc_inode;
struct buffer_head *alloc_bh = NULL;
@@ -2687,7 +2872,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
mlog_entry("blkno: %llu", (unsigned long long)blkno);
status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
- &suballoc_bit);
+ &group_blkno, &suballoc_bit);
if (status < 0) {
mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
goto bail;
@@ -2715,7 +2900,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
}
status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
- blkno, suballoc_bit, res);
+ group_blkno, blkno, suballoc_bit, res);
if (status < 0)
mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index a017dd3..b8afabf 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -56,6 +56,9 @@ struct ocfs2_alloc_context {
u64 ac_max_block; /* Highest block number to allocate. 0 is
is the same as ~0 - unlimited */
+ int ac_find_loc_only; /* hack for reflink operation ordering */
+ struct ocfs2_suballoc_result *ac_find_loc_priv; /* */
+
struct ocfs2_alloc_reservation *ac_resv;
};
@@ -197,4 +200,22 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
struct ocfs2_alloc_context **meta_ac);
int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
+
+
+
+/*
+ * The following two interfaces are for ocfs2_create_inode_in_orphan().
+ */
+int ocfs2_find_new_inode_loc(struct inode *dir,
+ struct buffer_head *parent_fe_bh,
+ struct ocfs2_alloc_context *ac,
+ u64 *fe_blkno);
+
+int ocfs2_claim_new_inode_at_loc(handle_t *handle,
+ struct inode *dir,
+ struct ocfs2_alloc_context *ac,
+ u64 *suballoc_loc,
+ u16 *suballoc_bit,
+ u64 di_blkno);
+
#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index fa1be1b..56f0cb3 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -162,6 +162,7 @@ enum {
Opt_nointr,
Opt_hb_none,
Opt_hb_local,
+ Opt_hb_global,
Opt_data_ordered,
Opt_data_writeback,
Opt_atime_quantum,
@@ -177,6 +178,8 @@ enum {
Opt_noacl,
Opt_usrquota,
Opt_grpquota,
+ Opt_coherency_buffered,
+ Opt_coherency_full,
Opt_resv_level,
Opt_dir_resv_level,
Opt_err,
@@ -190,6 +193,7 @@ static const match_table_t tokens = {
{Opt_nointr, "nointr"},
{Opt_hb_none, OCFS2_HB_NONE},
{Opt_hb_local, OCFS2_HB_LOCAL},
+ {Opt_hb_global, OCFS2_HB_GLOBAL},
{Opt_data_ordered, "data=ordered"},
{Opt_data_writeback, "data=writeback"},
{Opt_atime_quantum, "atime_quantum=%u"},
@@ -205,6 +209,8 @@ static const match_table_t tokens = {
{Opt_noacl, "noacl"},
{Opt_usrquota, "usrquota"},
{Opt_grpquota, "grpquota"},
+ {Opt_coherency_buffered, "coherency=buffered"},
+ {Opt_coherency_full, "coherency=full"},
{Opt_resv_level, "resv_level=%u"},
{Opt_dir_resv_level, "dir_resv_level=%u"},
{Opt_err, NULL}
@@ -514,11 +520,11 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
mlog_entry_void();
- for (i = 0; i < NUM_SYSTEM_INODES; i++) {
- inode = osb->system_inodes[i];
+ for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
+ inode = osb->global_system_inodes[i];
if (inode) {
iput(inode);
- osb->system_inodes[i] = NULL;
+ osb->global_system_inodes[i] = NULL;
}
}
@@ -534,6 +540,20 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
osb->root_inode = NULL;
}
+ if (!osb->local_system_inodes)
+ goto out;
+
+ for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
+ if (osb->local_system_inodes[i]) {
+ iput(osb->local_system_inodes[i]);
+ osb->local_system_inodes[i] = NULL;
+ }
+ }
+
+ kfree(osb->local_system_inodes);
+ osb->local_system_inodes = NULL;
+
+out:
mlog_exit(0);
}
@@ -608,8 +628,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
int ret = 0;
struct mount_options parsed_options;
struct ocfs2_super *osb = OCFS2_SB(sb);
-
- lock_kernel();
+ u32 tmp;
if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
!ocfs2_check_set_options(sb, &parsed_options)) {
@@ -617,8 +636,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
goto out;
}
- if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) !=
- (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
+ tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
+ OCFS2_MOUNT_HB_NONE;
+ if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
ret = -EINVAL;
mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
goto out;
@@ -717,7 +737,6 @@ unlock_osb:
MS_POSIXACL : 0);
}
out:
- unlock_kernel();
return ret;
}
@@ -809,23 +828,29 @@ bail:
static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
{
- if (ocfs2_mount_local(osb)) {
- if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+ u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL;
+
+ if (osb->s_mount_opt & hb_enabled) {
+ if (ocfs2_mount_local(osb)) {
mlog(ML_ERROR, "Cannot heartbeat on a locally "
"mounted device.\n");
return -EINVAL;
}
- }
-
- if (ocfs2_userspace_stack(osb)) {
- if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+ if (ocfs2_userspace_stack(osb)) {
mlog(ML_ERROR, "Userspace stack expected, but "
"o2cb heartbeat arguments passed to mount\n");
return -EINVAL;
}
+ if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) &&
+ !ocfs2_cluster_o2cb_global_heartbeat(osb)) ||
+ ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) &&
+ ocfs2_cluster_o2cb_global_heartbeat(osb))) {
+ mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n");
+ return -EINVAL;
+ }
}
- if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
+ if (!(osb->s_mount_opt & hb_enabled)) {
if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
!ocfs2_userspace_stack(osb)) {
mlog(ML_ERROR, "Heartbeat has to be started to mount "
@@ -1291,6 +1316,7 @@ static int ocfs2_parse_options(struct super_block *sb,
{
int status;
char *p;
+ u32 tmp;
mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
options ? options : "(none)");
@@ -1322,7 +1348,10 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL;
break;
case Opt_hb_none:
- mopt->mount_opt &= ~OCFS2_MOUNT_HB_LOCAL;
+ mopt->mount_opt |= OCFS2_MOUNT_HB_NONE;
+ break;
+ case Opt_hb_global:
+ mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL;
break;
case Opt_barrier:
if (match_int(&args[0], &option)) {
@@ -1438,6 +1467,12 @@ static int ocfs2_parse_options(struct super_block *sb,
case Opt_grpquota:
mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
break;
+ case Opt_coherency_buffered:
+ mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
+ break;
+ case Opt_coherency_full:
+ mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
+ break;
case Opt_acl:
mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
@@ -1477,6 +1512,15 @@ static int ocfs2_parse_options(struct super_block *sb,
}
}
+ /* Ensure only one heartbeat mode */
+ tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
+ OCFS2_MOUNT_HB_NONE);
+ if (hweight32(tmp) != 1) {
+ mlog(ML_ERROR, "Invalid heartbeat mount options\n");
+ status = 0;
+ goto bail;
+ }
+
status = 1;
bail:
@@ -1490,10 +1534,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
unsigned long opts = osb->s_mount_opt;
unsigned int local_alloc_megs;
- if (opts & OCFS2_MOUNT_HB_LOCAL)
- seq_printf(s, ",_netdev,heartbeat=local");
- else
- seq_printf(s, ",heartbeat=none");
+ if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) {
+ seq_printf(s, ",_netdev");
+ if (opts & OCFS2_MOUNT_HB_LOCAL)
+ seq_printf(s, ",%s", OCFS2_HB_LOCAL);
+ else
+ seq_printf(s, ",%s", OCFS2_HB_GLOBAL);
+ } else
+ seq_printf(s, ",%s", OCFS2_HB_NONE);
if (opts & OCFS2_MOUNT_NOINTR)
seq_printf(s, ",nointr");
@@ -1536,6 +1584,11 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
if (opts & OCFS2_MOUNT_GRPQUOTA)
seq_printf(s, ",grpquota");
+ if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED)
+ seq_printf(s, ",coherency=buffered");
+ else
+ seq_printf(s, ",coherency=full");
+
if (opts & OCFS2_MOUNT_NOUSERXATTR)
seq_printf(s, ",nouser_xattr");
else
@@ -1640,13 +1693,9 @@ static void ocfs2_put_super(struct super_block *sb)
{
mlog_entry("(0x%p)\n", sb);
- lock_kernel();
-
ocfs2_sync_blockdev(sb);
ocfs2_dismount_volume(sb, 0);
- unlock_kernel();
-
mlog_exit_void();
}
@@ -1990,6 +2039,36 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
return 0;
}
+/* Make sure entire volume is addressable by our journal. Requires
+ osb_clusters_at_boot to be valid and for the journal to have been
+ initialized by ocfs2_journal_init(). */
+static int ocfs2_journal_addressable(struct ocfs2_super *osb)
+{
+ int status = 0;
+ u64 max_block =
+ ocfs2_clusters_to_blocks(osb->sb,
+ osb->osb_clusters_at_boot) - 1;
+
+ /* 32-bit block number is always OK. */
+ if (max_block <= (u32)~0ULL)
+ goto out;
+
+ /* Volume is "huge", so see if our journal is new enough to
+ support it. */
+ if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb,
+ OCFS2_FEATURE_COMPAT_JBD2_SB) &&
+ jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_64BIT))) {
+ mlog(ML_ERROR, "The journal cannot address the entire volume. "
+ "Enable the 'block64' journal option with tunefs.ocfs2");
+ status = -EFBIG;
+ goto out;
+ }
+
+ out:
+ return status;
+}
+
static int ocfs2_initialize_super(struct super_block *sb,
struct buffer_head *bh,
int sector_size,
@@ -2002,6 +2081,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
struct ocfs2_journal *journal;
__le32 uuid_net_key;
struct ocfs2_super *osb;
+ u64 total_blocks;
mlog_entry_void();
@@ -2060,6 +2140,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+ osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
+ if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
+ mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
+ osb->max_slots);
+ status = -EINVAL;
+ goto bail;
+ }
+ mlog(0, "max_slots for this device: %u\n", osb->max_slots);
+
ocfs2_orphan_scan_init(osb);
status = ocfs2_recovery_init(osb);
@@ -2098,15 +2187,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
- osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
- if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
- mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
- osb->max_slots);
- status = -EINVAL;
- goto bail;
- }
- mlog(0, "max_slots for this device: %u\n", osb->max_slots);
-
osb->slot_recovery_generations =
kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
GFP_KERNEL);
@@ -2149,7 +2229,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
- if (ocfs2_userspace_stack(osb)) {
+ if (ocfs2_clusterinfo_valid(osb)) {
+ osb->osb_stackflags =
+ OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
memcpy(osb->osb_cluster_stack,
OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
OCFS2_STACK_LABEL_LEN);
@@ -2214,11 +2296,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
- if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1)
- > (u32)~0UL) {
- mlog(ML_ERROR, "Volume might try to write to blocks beyond "
- "what jbd can address in 32 bits.\n");
- status = -EINVAL;
+ total_blocks = ocfs2_clusters_to_blocks(osb->sb,
+ le32_to_cpu(di->i_clusters));
+
+ status = generic_check_addressable(osb->sb->s_blocksize_bits,
+ total_blocks);
+ if (status) {
+ mlog(ML_ERROR, "Volume too large "
+ "to mount safely on this system");
+ status = -EFBIG;
goto bail;
}
@@ -2380,6 +2466,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
goto finally;
}
+ /* Now that journal has been initialized, check to make sure
+ entire volume is addressable. */
+ status = ocfs2_journal_addressable(osb);
+ if (status)
+ goto finally;
+
/* If the journal was unmounted cleanly then we don't want to
* recover anything. Otherwise, journal_load will do that
* dirty work for us :) */
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 32499d21..9975457 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -128,7 +128,7 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry,
}
/* Fast symlinks can't be large */
- len = strlen(target);
+ len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb));
link = kzalloc(len + 1, GFP_NOFS);
if (!link) {
status = -ENOMEM;
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index bfe7190..902efb2 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -44,11 +44,6 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
int type,
u32 slot);
-static inline int is_global_system_inode(int type);
-static inline int is_in_system_inode_array(struct ocfs2_super *osb,
- int type,
- u32 slot);
-
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES];
#endif
@@ -59,11 +54,52 @@ static inline int is_global_system_inode(int type)
type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
}
-static inline int is_in_system_inode_array(struct ocfs2_super *osb,
- int type,
- u32 slot)
+static struct inode **get_local_system_inode(struct ocfs2_super *osb,
+ int type,
+ u32 slot)
{
- return slot == osb->slot_num || is_global_system_inode(type);
+ int index;
+ struct inode **local_system_inodes, **free = NULL;
+
+ BUG_ON(slot == OCFS2_INVALID_SLOT);
+ BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE ||
+ type > OCFS2_LAST_LOCAL_SYSTEM_INODE);
+
+ spin_lock(&osb->osb_lock);
+ local_system_inodes = osb->local_system_inodes;
+ spin_unlock(&osb->osb_lock);
+
+ if (unlikely(!local_system_inodes)) {
+ local_system_inodes = kzalloc(sizeof(struct inode *) *
+ NUM_LOCAL_SYSTEM_INODES *
+ osb->max_slots,
+ GFP_NOFS);
+ if (!local_system_inodes) {
+ mlog_errno(-ENOMEM);
+ /*
+ * return NULL here so that ocfs2_get_sytem_file_inodes
+ * will try to create an inode and use it. We will try
+ * to initialize local_system_inodes next time.
+ */
+ return NULL;
+ }
+
+ spin_lock(&osb->osb_lock);
+ if (osb->local_system_inodes) {
+ /* Someone has initialized it for us. */
+ free = local_system_inodes;
+ local_system_inodes = osb->local_system_inodes;
+ } else
+ osb->local_system_inodes = local_system_inodes;
+ spin_unlock(&osb->osb_lock);
+ if (unlikely(free))
+ kfree(free);
+ }
+
+ index = (slot * NUM_LOCAL_SYSTEM_INODES) +
+ (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE);
+
+ return &local_system_inodes[index];
}
struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
@@ -74,8 +110,10 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
struct inode **arr = NULL;
/* avoid the lookup if cached in local system file array */
- if (is_in_system_inode_array(osb, type, slot))
- arr = &(osb->system_inodes[type]);
+ if (is_global_system_inode(type)) {
+ arr = &(osb->global_system_inodes[type]);
+ } else
+ arr = get_local_system_inode(osb, type, slot);
if (arr && ((inode = *arr) != NULL)) {
/* get a ref in addition to the array ref */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d03469f..67cd439 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1286,13 +1286,11 @@ int ocfs2_xattr_get_nolock(struct inode *inode,
xis.inode_bh = xbs.inode_bh = di_bh;
di = (struct ocfs2_dinode *)di_bh->b_data;
- down_read(&oi->ip_xattr_sem);
ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
buffer_size, &xis);
if (ret == -ENODATA && di->i_xattr_loc)
ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
buffer_size, &xbs);
- up_read(&oi->ip_xattr_sem);
return ret;
}
@@ -1316,8 +1314,10 @@ static int ocfs2_xattr_get(struct inode *inode,
mlog_errno(ret);
return ret;
}
+ down_read(&OCFS2_I(inode)->ip_xattr_sem);
ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
name, buffer, buffer_size);
+ up_read(&OCFS2_I(inode)->ip_xattr_sem);
ocfs2_inode_unlock(inode, 0);
@@ -7081,7 +7081,7 @@ static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args,
goto out;
}
- if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED))
+ if (!indexed)
ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh);
else
ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh);