From 7945619794314414a5c44df11fca4d3f2a3389cf Mon Sep 17 00:00:00 2001
From: Jody McIntyre <scjody@steamballoon.com>
Date: Mon, 7 Nov 2005 06:29:39 -0500
Subject: sbp2_command_orb_lock must be held when accessing the _orb_inuse
 list. Fixes an oops in sbp2util_find_command_for_SCpnt after sbp2scsi_abort:
 https://bugzilla.novell.com/show_bug.cgi?id=113734

Signed-off-by: Jody McIntyre <scjody@steamballoon.com>
Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
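---
Review note (not part of the commit message): with this change
sbp2scsi_complete_all_commands() holds sbp2_command_orb_lock with IRQs
disabled across its whole loop, including the Current_done() callback.
Please confirm that nothing called from that loop (the part of the loop
body not shown in these hunks, and the done callback itself) takes
sbp2_command_orb_lock again; spinlocks are not recursive, so that would
deadlock.
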
 drivers/ieee1394/sbp2.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/ieee1394/sbp2.c b/drivers/ieee1394/sbp2.c
index 12cec7c..f7e18cc 100644
--- a/drivers/ieee1394/sbp2.c
+++ b/drivers/ieee1394/sbp2.c
@@ -2350,6 +2350,7 @@ static int sbp2_handle_status_write(struct hpsb_host *host, int nodeid, int dest
 	struct scsi_cmnd *SCpnt = NULL;
 	u32 scsi_status = SBP2_SCSI_STATUS_GOOD;
 	struct sbp2_command_info *command;
+	unsigned long flags;
 
 	SBP2_DEBUG("sbp2_handle_status_write");
 
@@ -2451,9 +2452,11 @@ static int sbp2_handle_status_write(struct hpsb_host *host, int nodeid, int dest
 		 * null out last orb so that next time around we write directly to the orb pointer...
 		 * Quick start saves one 1394 bus transaction.
 		 */
+		spin_lock_irqsave(&scsi_id->sbp2_command_orb_lock, flags);
 		if (list_empty(&scsi_id->sbp2_command_orb_inuse)) {
 			scsi_id->last_orb = NULL;
 		}
+		spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags);
 
 	} else {
 
@@ -2563,9 +2566,11 @@ static void sbp2scsi_complete_all_commands(struct scsi_id_instance_data *scsi_id
 	struct sbp2scsi_host_info *hi = scsi_id->hi;
 	struct list_head *lh;
 	struct sbp2_command_info *command;
+	unsigned long flags;
 
 	SBP2_DEBUG("sbp2scsi_complete_all_commands");
 
+	spin_lock_irqsave(&scsi_id->sbp2_command_orb_lock, flags);
 	while (!list_empty(&scsi_id->sbp2_command_orb_inuse)) {
 		SBP2_DEBUG("Found pending command to complete");
 		lh = scsi_id->sbp2_command_orb_inuse.next;
@@ -2582,6 +2587,7 @@ static void sbp2scsi_complete_all_commands(struct scsi_id_instance_data *scsi_id
 			command->Current_done(command->Current_SCpnt);
 		}
 	}
+	spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags);
 
 	return;
 }
-- 
cgit v1.1


From 365c786f0be44ee92e018773cb0bc4b19080b6aa Mon Sep 17 00:00:00 2001
From: Ben Collins <bcollins@debian.org>
Date: Mon, 7 Nov 2005 06:31:24 -0500
Subject: sbp2: Merge TYPE_RBC and 10byte removal patch from scsi maintainers.
 Added more cleanups to remove unused code.

Signed-off-by: Ben Collins <bcollins@debian.org>
Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/sbp2.c | 151 +-----------------------------------------------
 drivers/ieee1394/sbp2.h |   1 -
 2 files changed, 2 insertions(+), 150 deletions(-)

diff --git a/drivers/ieee1394/sbp2.c b/drivers/ieee1394/sbp2.c
index f7e18cc..d53c8cf 100644
--- a/drivers/ieee1394/sbp2.c
+++ b/drivers/ieee1394/sbp2.c
@@ -1089,16 +1089,6 @@ static int sbp2_handle_physdma_read(struct hpsb_host *host, int nodeid, quadlet_
  **************************************/
 
 /*
- * This function determines if we should convert scsi commands for a particular sbp2 device type
- */
-static __inline__ int sbp2_command_conversion_device_type(u8 device_type)
-{
-	return (((device_type == TYPE_DISK) ||
-		 (device_type == TYPE_RBC) ||
-		 (device_type == TYPE_ROM)) ? 1:0);
-}
-
-/*
  * This function queries the device for the maximum concurrent logins it
  * supports.
  */
@@ -2106,11 +2096,6 @@ static int sbp2_send_command(struct scsi_id_instance_data *scsi_id,
 	sbp2_create_command_orb(scsi_id, command, cmd, SCpnt->use_sg,
 				request_bufflen, SCpnt->request_buffer,
 				SCpnt->sc_data_direction);
-	/*
-	 * Update our cdb if necessary (to handle sbp2 RBC command set
-	 * differences). This is where the command set hacks go!   =)
-	 */
-	sbp2_check_sbp2_command(scsi_id, command->command_orb.cdb);
 
 	sbp2util_packet_dump(&command->command_orb, sizeof(struct sbp2_command_orb),
 			     "sbp2 command orb", command->command_orb_dma);
@@ -2130,110 +2115,6 @@ static int sbp2_send_command(struct scsi_id_instance_data *scsi_id,
 
 
 /*
- * This function deals with command set differences between Linux scsi
- * command set and sbp2 RBC command set.
- */
-static void sbp2_check_sbp2_command(struct scsi_id_instance_data *scsi_id, unchar *cmd)
-{
-	unchar new_cmd[16];
-	u8 device_type = SBP2_DEVICE_TYPE (scsi_id->sbp2_device_type_and_lun);
-
-	SBP2_DEBUG("sbp2_check_sbp2_command");
-
-	switch (*cmd) {
-
-		case READ_6:
-
-			if (sbp2_command_conversion_device_type(device_type)) {
-
-				SBP2_DEBUG("Convert READ_6 to READ_10");
-
-				/*
-				 * Need to turn read_6 into read_10
-				 */
-				new_cmd[0] = 0x28;
-				new_cmd[1] = (cmd[1] & 0xe0);
-				new_cmd[2] = 0x0;
-				new_cmd[3] = (cmd[1] & 0x1f);
-				new_cmd[4] = cmd[2];
-				new_cmd[5] = cmd[3];
-				new_cmd[6] = 0x0;
-				new_cmd[7] = 0x0;
-				new_cmd[8] = cmd[4];
-				new_cmd[9] = cmd[5];
-
-				memcpy(cmd, new_cmd, 10);
-
-			}
-
-			break;
-
-		case WRITE_6:
-
-			if (sbp2_command_conversion_device_type(device_type)) {
-
-				SBP2_DEBUG("Convert WRITE_6 to WRITE_10");
-
-				/*
-				 * Need to turn write_6 into write_10
-				 */
-				new_cmd[0] = 0x2a;
-				new_cmd[1] = (cmd[1] & 0xe0);
-				new_cmd[2] = 0x0;
-				new_cmd[3] = (cmd[1] & 0x1f);
-				new_cmd[4] = cmd[2];
-				new_cmd[5] = cmd[3];
-				new_cmd[6] = 0x0;
-				new_cmd[7] = 0x0;
-				new_cmd[8] = cmd[4];
-				new_cmd[9] = cmd[5];
-
-				memcpy(cmd, new_cmd, 10);
-
-			}
-
-			break;
-
-		case MODE_SENSE:
-
-			if (sbp2_command_conversion_device_type(device_type)) {
-
-				SBP2_DEBUG("Convert MODE_SENSE_6 to MODE_SENSE_10");
-
-				/*
-				 * Need to turn mode_sense_6 into mode_sense_10
-				 */
-				new_cmd[0] = 0x5a;
-				new_cmd[1] = cmd[1];
-				new_cmd[2] = cmd[2];
-				new_cmd[3] = 0x0;
-				new_cmd[4] = 0x0;
-				new_cmd[5] = 0x0;
-				new_cmd[6] = 0x0;
-				new_cmd[7] = 0x0;
-				new_cmd[8] = cmd[4];
-				new_cmd[9] = cmd[5];
-
-				memcpy(cmd, new_cmd, 10);
-
-			}
-
-			break;
-
-		case MODE_SELECT:
-
-			/*
-			 * TODO. Probably need to change mode select to 10 byte version
-			 */
-
-		default:
-			break;
-	}
-
-	return;
-}
-
-/*
  * Translates SBP-2 status into SCSI sense data for check conditions
  */
 static unsigned int sbp2_status_to_sense_data(unchar *sbp2_status, unchar *sense_data)
@@ -2271,7 +2152,6 @@ static void sbp2_check_sbp2_response(struct scsi_id_instance_data *scsi_id,
 				     struct scsi_cmnd *SCpnt)
 {
 	u8 *scsi_buf = SCpnt->request_buffer;
-	u8 device_type = SBP2_DEVICE_TYPE (scsi_id->sbp2_device_type_and_lun);
 
 	SBP2_DEBUG("sbp2_check_sbp2_response");
 
@@ -2296,14 +2176,6 @@ static void sbp2_check_sbp2_response(struct scsi_id_instance_data *scsi_id,
 			}
 
 			/*
-			 * Check for Simple Direct Access Device and change it to TYPE_DISK
-			 */
-			if ((scsi_buf[0] & 0x1f) == TYPE_RBC) {
-				SBP2_DEBUG("Changing TYPE_RBC to TYPE_DISK");
-				scsi_buf[0] &= 0xe0;
-			}
-
-			/*
 			 * Fix ansi revision and response data format
 			 */
 			scsi_buf[2] |= 2;
@@ -2311,27 +2183,6 @@ static void sbp2_check_sbp2_response(struct scsi_id_instance_data *scsi_id,
 
 			break;
 
-		case MODE_SENSE:
-
-			if (sbp2_command_conversion_device_type(device_type)) {
-
-				SBP2_DEBUG("Modify mode sense response (10 byte version)");
-
-				scsi_buf[0] = scsi_buf[1];	/* Mode data length */
-				scsi_buf[1] = scsi_buf[2];	/* Medium type */
-				scsi_buf[2] = scsi_buf[3];	/* Device specific parameter */
-				scsi_buf[3] = scsi_buf[7];	/* Block descriptor length */
-				memcpy(scsi_buf + 4, scsi_buf + 8, scsi_buf[0]);
-			}
-
-			break;
-
-		case MODE_SELECT:
-
-			/*
-			 * TODO. Probably need to change mode select to 10 byte version
-			 */
-
 		default:
 			break;
 	}
@@ -2713,6 +2564,8 @@ static int sbp2scsi_slave_alloc(struct scsi_device *sdev)
 static int sbp2scsi_slave_configure(struct scsi_device *sdev)
 {
 	blk_queue_dma_alignment(sdev->request_queue, (512 - 1));
+	sdev->use_10_for_rw = 1;
+	sdev->use_10_for_ms = 1;
 	return 0;
 }
 
diff --git a/drivers/ieee1394/sbp2.h b/drivers/ieee1394/sbp2.h
index cd425be..cb111d7 100644
--- a/drivers/ieee1394/sbp2.h
+++ b/drivers/ieee1394/sbp2.h
@@ -469,7 +469,6 @@ static int sbp2_send_command(struct scsi_id_instance_data *scsi_id,
 			     struct scsi_cmnd *SCpnt,
 			     void (*done)(struct scsi_cmnd *));
 static unsigned int sbp2_status_to_sense_data(unchar *sbp2_status, unchar *sense_data);
-static void sbp2_check_sbp2_command(struct scsi_id_instance_data *scsi_id, unchar *cmd);
 static void sbp2_check_sbp2_response(struct scsi_id_instance_data *scsi_id,
 				     struct scsi_cmnd *SCpnt);
 static void sbp2_parse_unit_directory(struct scsi_id_instance_data *scsi_id,
-- 
cgit v1.1


From e309fc6d71d61bb0f049ab6d0da10c845da9513f Mon Sep 17 00:00:00 2001
From: Ben Collins <bcollins@debian.org>
Date: Mon, 7 Nov 2005 06:31:34 -0500
Subject: sbp2: Remove our tracking of device type, since we no longer need to
 worry about it. Depends on patch "ieee1394: remove sbp2's TYPE_RBC and 10byte
 handling".

Signed-off-by: Ben Collins <bcollins@debian.org>
Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/sbp2.c | 34 ++++++----------------------------
 drivers/ieee1394/sbp2.h |  7 +------
 2 files changed, 7 insertions(+), 34 deletions(-)

diff --git a/drivers/ieee1394/sbp2.c b/drivers/ieee1394/sbp2.c
index d53c8cf..747dbd1 100644
--- a/drivers/ieee1394/sbp2.c
+++ b/drivers/ieee1394/sbp2.c
@@ -735,7 +735,7 @@ static struct scsi_id_instance_data *sbp2_alloc_device(struct unit_directory *ud
 	INIT_LIST_HEAD(&scsi_id->sbp2_command_orb_completed);
 	INIT_LIST_HEAD(&scsi_id->scsi_list);
 	spin_lock_init(&scsi_id->sbp2_command_orb_lock);
-	scsi_id->sbp2_device_type_and_lun = SBP2_DEVICE_TYPE_LUN_UNINITIALIZED;
+	scsi_id->sbp2_lun = 0;
 
 	ud->device.driver_data = scsi_id;
 
@@ -1110,11 +1110,7 @@ static int sbp2_query_logins(struct scsi_id_instance_data *scsi_id)
 
 	scsi_id->query_logins_orb->lun_misc = ORB_SET_FUNCTION(SBP2_QUERY_LOGINS_REQUEST);
 	scsi_id->query_logins_orb->lun_misc |= ORB_SET_NOTIFY(1);
-	if (scsi_id->sbp2_device_type_and_lun != SBP2_DEVICE_TYPE_LUN_UNINITIALIZED) {
-		scsi_id->query_logins_orb->lun_misc |= ORB_SET_LUN(scsi_id->sbp2_device_type_and_lun);
-		SBP2_DEBUG("sbp2_query_logins: set lun to %d",
-			   ORB_SET_LUN(scsi_id->sbp2_device_type_and_lun));
-	}
+	scsi_id->query_logins_orb->lun_misc |= ORB_SET_LUN(scsi_id->sbp2_lun);
 	SBP2_DEBUG("sbp2_query_logins: lun_misc initialized");
 
 	scsi_id->query_logins_orb->reserved_resp_length =
@@ -1223,12 +1219,7 @@ static int sbp2_login_device(struct scsi_id_instance_data *scsi_id)
 	scsi_id->login_orb->lun_misc |= ORB_SET_RECONNECT(0);	/* One second reconnect time */
 	scsi_id->login_orb->lun_misc |= ORB_SET_EXCLUSIVE(exclusive_login);	/* Exclusive access to device */
 	scsi_id->login_orb->lun_misc |= ORB_SET_NOTIFY(1);	/* Notify us of login complete */
-	/* Set the lun if we were able to pull it from the device's unit directory */
-	if (scsi_id->sbp2_device_type_and_lun != SBP2_DEVICE_TYPE_LUN_UNINITIALIZED) {
-		scsi_id->login_orb->lun_misc |= ORB_SET_LUN(scsi_id->sbp2_device_type_and_lun);
-		SBP2_DEBUG("sbp2_query_logins: set lun to %d",
-			   ORB_SET_LUN(scsi_id->sbp2_device_type_and_lun));
-	}
+	scsi_id->login_orb->lun_misc |= ORB_SET_LUN(scsi_id->sbp2_lun);
 	SBP2_DEBUG("sbp2_login_device: lun_misc initialized");
 
 	scsi_id->login_orb->passwd_resp_lengths =
@@ -1543,7 +1534,7 @@ static void sbp2_parse_unit_directory(struct scsi_id_instance_data *scsi_id,
 				SBP2_DEBUG("sbp2_management_agent_addr = %x",
 					   (unsigned int) management_agent_addr);
 			} else if (kv->key.type == CSR1212_KV_TYPE_IMMEDIATE) {
-				scsi_id->sbp2_device_type_and_lun = kv->value.immediate;
+				scsi_id->sbp2_lun = ORB_SET_LUN(kv->value.immediate);
 			}
 			break;
 
@@ -1636,7 +1627,7 @@ static void sbp2_parse_unit_directory(struct scsi_id_instance_data *scsi_id,
 		scsi_id->sbp2_firmware_revision = firmware_revision;
 		scsi_id->workarounds = workarounds;
 		if (ud->flags & UNIT_DIRECTORY_HAS_LUN)
-			scsi_id->sbp2_device_type_and_lun = ud->lun;
+			scsi_id->sbp2_lun = ORB_SET_LUN(ud->lun);
 	}
 }
 
@@ -2158,16 +2149,6 @@ static void sbp2_check_sbp2_response(struct scsi_id_instance_data *scsi_id,
 	switch (SCpnt->cmnd[0]) {
 
 		case INQUIRY:
-
-			/*
-			 * If scsi_id->sbp2_device_type_and_lun is uninitialized, then fill 
-			 * this information in from the inquiry response data. Lun is set to zero.
-			 */
-			if (scsi_id->sbp2_device_type_and_lun == SBP2_DEVICE_TYPE_LUN_UNINITIALIZED) {
-				SBP2_DEBUG("Creating sbp2_device_type_and_lun from scsi inquiry data");
-				scsi_id->sbp2_device_type_and_lun = (scsi_buf[0] & 0x1f) << 16;
-			}
-
 			/*
 			 * Make sure data length is ok. Minimum length is 36 bytes
 			 */
@@ -2665,10 +2646,7 @@ static ssize_t sbp2_sysfs_ieee1394_id_show(struct device *dev, struct device_att
 	if (!(scsi_id = (struct scsi_id_instance_data *)sdev->host->hostdata[0]))
 		return 0;
 
-	if (scsi_id->sbp2_device_type_and_lun == SBP2_DEVICE_TYPE_LUN_UNINITIALIZED)
-		lun = 0;
-	else
-		lun = ORB_SET_LUN(scsi_id->sbp2_device_type_and_lun);
+	lun = ORB_SET_LUN(scsi_id->sbp2_lun);
 
 	return sprintf(buf, "%016Lx:%d:%d\n", (unsigned long long)scsi_id->ne->guid,
 		       scsi_id->ud->id, lun);
diff --git a/drivers/ieee1394/sbp2.h b/drivers/ieee1394/sbp2.h
index cb111d7..890be13 100644
--- a/drivers/ieee1394/sbp2.h
+++ b/drivers/ieee1394/sbp2.h
@@ -229,9 +229,6 @@ struct sbp2_status_block {
 #define SBP2_DEVICE_TYPE_AND_LUN_KEY				0x14
 #define SBP2_FIRMWARE_REVISION_KEY				0x3c
 
-#define SBP2_DEVICE_TYPE(q)					(((q) >> 16) & 0x1f)
-#define SBP2_DEVICE_LUN(q)					((q) & 0xffff)
-
 #define SBP2_AGENT_STATE_OFFSET					0x00ULL
 #define SBP2_AGENT_RESET_OFFSET					0x04ULL
 #define SBP2_ORB_POINTER_OFFSET					0x08ULL
@@ -256,8 +253,6 @@ struct sbp2_status_block {
  */
 #define SBP2_128KB_BROKEN_FIRMWARE				0xa0b800
 
-#define SBP2_DEVICE_TYPE_LUN_UNINITIALIZED			0xffffffff
-
 /*
  * SCSI specific stuff
  */
@@ -379,7 +374,7 @@ struct scsi_id_instance_data {
 	u32 sbp2_command_set_spec_id;
 	u32 sbp2_command_set;
 	u32 sbp2_unit_characteristics;
-	u32 sbp2_device_type_and_lun;
+	u32 sbp2_lun;
 	u32 sbp2_firmware_revision;
 
 	/*
-- 
cgit v1.1


From a237f35fdd81d85037dccdacd2e94028227b59fb Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Mon, 7 Nov 2005 06:31:39 -0500
Subject: sbp2, ohci1394 cleanups: sbp2: various code formatting cleanups;
 ohci1394: remove form feed characters

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/ohci1394.c |   5 -
 drivers/ieee1394/sbp2.c     | 433 ++++++++++++++++++++++----------------------
 drivers/ieee1394/sbp2.h     |  15 +-
 3 files changed, 224 insertions(+), 229 deletions(-)

diff --git a/drivers/ieee1394/ohci1394.c b/drivers/ieee1394/ohci1394.c
index 4cf9b8f..dcb5776 100644
--- a/drivers/ieee1394/ohci1394.c
+++ b/drivers/ieee1394/ohci1394.c
@@ -3201,8 +3201,6 @@ static struct hpsb_host_driver ohci1394_driver = {
 	.hw_csr_reg =		ohci_hw_csr_reg,
 };
 
-
-
 /***********************************
  * PCI Driver Interface functions  *
  ***********************************/
@@ -3606,8 +3604,6 @@ static struct pci_driver ohci1394_pci_driver = {
 	.suspend =	ohci1394_pci_suspend,
 };
 
-
-
 /***********************************
  * OHCI1394 Video Interface        *
  ***********************************/
@@ -3714,7 +3710,6 @@ EXPORT_SYMBOL(ohci1394_init_iso_tasklet);
 EXPORT_SYMBOL(ohci1394_register_iso_tasklet);
 EXPORT_SYMBOL(ohci1394_unregister_iso_tasklet);
 
-
 /***********************************
  * General module initialization   *
  ***********************************/
diff --git a/drivers/ieee1394/sbp2.c b/drivers/ieee1394/sbp2.c
index 747dbd1..073ede9 100644
--- a/drivers/ieee1394/sbp2.c
+++ b/drivers/ieee1394/sbp2.c
@@ -151,18 +151,15 @@ static int force_inquiry_hack;
 module_param(force_inquiry_hack, int, 0444);
 MODULE_PARM_DESC(force_inquiry_hack, "Force SCSI inquiry hack (default = 0)");
 
-
 /*
  * Export information about protocols/devices supported by this driver.
  */
 static struct ieee1394_device_id sbp2_id_table[] = {
 	{
-		.match_flags =IEEE1394_MATCH_SPECIFIER_ID |
-		              IEEE1394_MATCH_VERSION,
-		.specifier_id = SBP2_UNIT_SPEC_ID_ENTRY & 0xffffff,
-		.version =    SBP2_SW_VERSION_ENTRY & 0xffffff
-	},
-	{ }
+	 .match_flags = IEEE1394_MATCH_SPECIFIER_ID | IEEE1394_MATCH_VERSION,
+	 .specifier_id = SBP2_UNIT_SPEC_ID_ENTRY & 0xffffff,
+	 .version = SBP2_SW_VERSION_ENTRY & 0xffffff},
+	{}
 };
 
 MODULE_DEVICE_TABLE(ieee1394, sbp2_id_table);
@@ -221,7 +218,6 @@ static u32 global_outstanding_dmas = 0;
 
 #define SBP2_ERR(fmt, args...)		HPSB_ERR("sbp2: "fmt, ## args)
 
-
 /*
  * Globals
  */
@@ -254,8 +250,8 @@ static struct hpsb_address_ops sbp2_ops = {
 
 #ifdef CONFIG_IEEE1394_SBP2_PHYS_DMA
 static struct hpsb_address_ops sbp2_physdma_ops = {
-        .read = sbp2_handle_physdma_read,
-        .write = sbp2_handle_physdma_write,
+	.read = sbp2_handle_physdma_read,
+	.write = sbp2_handle_physdma_write,
 };
 #endif
 
@@ -287,7 +283,6 @@ static u32 sbp2_broken_inquiry_list[] = {
  * General utility functions
  **************************************/
 
-
 #ifndef __BIG_ENDIAN
 /*
  * Converts a buffer from be32 to cpu byte ordering. Length is in bytes.
@@ -324,7 +319,8 @@ static __inline__ void sbp2util_cpu_to_be32_buffer(void *buffer, int length)
 /*
  * Debug packet dump routine. Length is in bytes.
  */
-static void sbp2util_packet_dump(void *buffer, int length, char *dump_name, u32 dump_phys_addr)
+static void sbp2util_packet_dump(void *buffer, int length, char *dump_name,
+				 u32 dump_phys_addr)
 {
 	int i;
 	unsigned char *dump = buffer;
@@ -345,7 +341,7 @@ static void sbp2util_packet_dump(void *buffer, int length, char *dump_name, u32
 			printk("  ");
 		if ((i & 0xf) == 0)
 			printk("\n   ");
-		printk("%02x ", (int) dump[i]);
+		printk("%02x ", (int)dump[i]);
 	}
 	printk("\n");
 
@@ -364,9 +360,9 @@ static int sbp2util_down_timeout(atomic_t *done, int timeout)
 
 	for (i = timeout; (i > 0 && atomic_read(done) == 0); i-= HZ/10) {
 		if (msleep_interruptible(100))	/* 100ms */
-			return(1);
+			return 1;
 	}
-	return ((i > 0) ? 0:1);
+	return (i > 0) ? 0 : 1;
 }
 
 /* Free's an allocated packet */
@@ -380,21 +376,22 @@ static void sbp2_free_packet(struct hpsb_packet *packet)
  * subaction and returns immediately. Can be used from interrupts.
  */
 static int sbp2util_node_write_no_wait(struct node_entry *ne, u64 addr,
-				quadlet_t *buffer, size_t length)
+				       quadlet_t *buffer, size_t length)
 {
 	struct hpsb_packet *packet;
 
 	packet = hpsb_make_writepacket(ne->host, ne->nodeid,
 				       addr, buffer, length);
-        if (!packet)
-                return -ENOMEM;
+	if (!packet)
+		return -ENOMEM;
 
-	hpsb_set_packet_complete_task(packet, (void (*)(void*))sbp2_free_packet,
+	hpsb_set_packet_complete_task(packet,
+				      (void (*)(void *))sbp2_free_packet,
 				      packet);
 
 	hpsb_node_fill_packet(ne, packet);
 
-        if (hpsb_send_packet(packet) < 0) {
+	if (hpsb_send_packet(packet) < 0) {
 		sbp2_free_packet(packet);
 		return -EIO;
 	}
@@ -420,19 +417,21 @@ static int sbp2util_create_command_orb_pool(struct scsi_id_instance_data *scsi_i
 		command = (struct sbp2_command_info *)
 		    kmalloc(sizeof(struct sbp2_command_info), GFP_ATOMIC);
 		if (!command) {
-			spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags);
-			return(-ENOMEM);
+			spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock,
+					       flags);
+			return -ENOMEM;
 		}
 		memset(command, '\0', sizeof(struct sbp2_command_info));
 		command->command_orb_dma =
-			pci_map_single (hi->host->pdev, &command->command_orb,
-					sizeof(struct sbp2_command_orb),
-					PCI_DMA_BIDIRECTIONAL);
+		    pci_map_single(hi->host->pdev, &command->command_orb,
+				   sizeof(struct sbp2_command_orb),
+				   PCI_DMA_BIDIRECTIONAL);
 		SBP2_DMA_ALLOC("single command orb DMA");
 		command->sge_dma =
-			pci_map_single (hi->host->pdev, &command->scatter_gather_element,
-					sizeof(command->scatter_gather_element),
-					PCI_DMA_BIDIRECTIONAL);
+		    pci_map_single(hi->host->pdev,
+				   &command->scatter_gather_element,
+				   sizeof(command->scatter_gather_element),
+				   PCI_DMA_BIDIRECTIONAL);
 		SBP2_DMA_ALLOC("scatter_gather_element");
 		INIT_LIST_HEAD(&command->list);
 		list_add_tail(&command->list, &scsi_id->sbp2_command_orb_completed);
@@ -488,7 +487,7 @@ static struct sbp2_command_info *sbp2util_find_command_for_orb(
 		list_for_each_entry(command, &scsi_id->sbp2_command_orb_inuse, list) {
 			if (command->command_orb_dma == orb) {
 				spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags);
-				return (command);
+				return command;
 			}
 		}
 	}
@@ -496,7 +495,7 @@ static struct sbp2_command_info *sbp2util_find_command_for_orb(
 
 	SBP2_ORB_DEBUG("could not match command orb %x", (unsigned int)orb);
 
-	return(NULL);
+	return NULL;
 }
 
 /*
@@ -513,12 +512,12 @@ static struct sbp2_command_info *sbp2util_find_command_for_SCpnt(struct scsi_id_
 		list_for_each_entry(command, &scsi_id->sbp2_command_orb_inuse, list) {
 			if (command->Current_SCpnt == SCpnt) {
 				spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags);
-				return (command);
+				return command;
 			}
 		}
 	}
 	spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags);
-	return(NULL);
+	return NULL;
 }
 
 /*
@@ -545,7 +544,7 @@ static struct sbp2_command_info *sbp2util_allocate_command_orb(
 		SBP2_ERR("sbp2util_allocate_command_orb - No orbs available!");
 	}
 	spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags);
-	return (command);
+	return command;
 }
 
 /* Free our DMA's */
@@ -587,7 +586,8 @@ static void sbp2util_free_command_dma(struct sbp2_command_info *command)
 /*
  * This function moves a command to the completed orb list.
  */
-static void sbp2util_mark_command_completed(struct scsi_id_instance_data *scsi_id, struct sbp2_command_info *command)
+static void sbp2util_mark_command_completed(struct scsi_id_instance_data *scsi_id,
+					    struct sbp2_command_info *command)
 {
 	unsigned long flags;
 
@@ -606,8 +606,6 @@ static inline int sbp2util_node_is_available(struct scsi_id_instance_data *scsi_
 	return scsi_id && scsi_id->ne && !scsi_id->ne->in_limbo;
 }
 
-
-
 /*********************************************
  * IEEE-1394 core driver stack related section
  *********************************************/
@@ -627,14 +625,14 @@ static int sbp2_probe(struct device *dev)
 	if (ud->flags & UNIT_DIRECTORY_HAS_LUN_DIRECTORY)
 		return -ENODEV;
 
-        scsi_id = sbp2_alloc_device(ud);
+	scsi_id = sbp2_alloc_device(ud);
 
-        if (!scsi_id)
-                return -ENOMEM;
+	if (!scsi_id)
+		return -ENOMEM;
 
-        sbp2_parse_unit_directory(scsi_id, ud);
+	sbp2_parse_unit_directory(scsi_id, ud);
 
-        return sbp2_start_device(scsi_id);
+	return sbp2_start_device(scsi_id);
 }
 
 static int sbp2_remove(struct device *dev)
@@ -769,7 +767,7 @@ static struct scsi_id_instance_data *sbp2_alloc_device(struct unit_directory *ud
 
 	/* Register our host with the SCSI stack. */
 	scsi_host = scsi_host_alloc(&scsi_driver_template,
-				    sizeof (unsigned long));
+				    sizeof(unsigned long));
 	if (!scsi_host) {
 		SBP2_ERR("failed to register scsi host");
 		goto failed_alloc;
@@ -790,7 +788,6 @@ failed_alloc:
 	return NULL;
 }
 
-
 static void sbp2_host_reset(struct hpsb_host *host)
 {
 	struct sbp2scsi_host_info *hi;
@@ -804,7 +801,6 @@ static void sbp2_host_reset(struct hpsb_host *host)
 	}
 }
 
-
 /*
  * This function is where we first pull the node unique ids, and then
  * allocate memory and register a SBP-2 device.
@@ -818,7 +814,8 @@ static int sbp2_start_device(struct scsi_id_instance_data *scsi_id)
 
 	/* Login FIFO DMA */
 	scsi_id->login_response =
-		pci_alloc_consistent(hi->host->pdev, sizeof(struct sbp2_login_response),
+		pci_alloc_consistent(hi->host->pdev,
+				     sizeof(struct sbp2_login_response),
 				     &scsi_id->login_response_dma);
 	if (!scsi_id->login_response)
 		goto alloc_fail;
@@ -826,7 +823,8 @@ static int sbp2_start_device(struct scsi_id_instance_data *scsi_id)
 
 	/* Query logins ORB DMA */
 	scsi_id->query_logins_orb =
-		pci_alloc_consistent(hi->host->pdev, sizeof(struct sbp2_query_logins_orb),
+		pci_alloc_consistent(hi->host->pdev,
+				     sizeof(struct sbp2_query_logins_orb),
 				     &scsi_id->query_logins_orb_dma);
 	if (!scsi_id->query_logins_orb)
 		goto alloc_fail;
@@ -834,7 +832,8 @@ static int sbp2_start_device(struct scsi_id_instance_data *scsi_id)
 
 	/* Query logins response DMA */
 	scsi_id->query_logins_response =
-		pci_alloc_consistent(hi->host->pdev, sizeof(struct sbp2_query_logins_response),
+		pci_alloc_consistent(hi->host->pdev,
+				     sizeof(struct sbp2_query_logins_response),
 				     &scsi_id->query_logins_response_dma);
 	if (!scsi_id->query_logins_response)
 		goto alloc_fail;
@@ -842,7 +841,8 @@ static int sbp2_start_device(struct scsi_id_instance_data *scsi_id)
 
 	/* Reconnect ORB DMA */
 	scsi_id->reconnect_orb =
-		pci_alloc_consistent(hi->host->pdev, sizeof(struct sbp2_reconnect_orb),
+		pci_alloc_consistent(hi->host->pdev,
+				     sizeof(struct sbp2_reconnect_orb),
 				     &scsi_id->reconnect_orb_dma);
 	if (!scsi_id->reconnect_orb)
 		goto alloc_fail;
@@ -850,7 +850,8 @@ static int sbp2_start_device(struct scsi_id_instance_data *scsi_id)
 
 	/* Logout ORB DMA */
 	scsi_id->logout_orb =
-		pci_alloc_consistent(hi->host->pdev, sizeof(struct sbp2_logout_orb),
+		pci_alloc_consistent(hi->host->pdev,
+				     sizeof(struct sbp2_logout_orb),
 				     &scsi_id->logout_orb_dma);
 	if (!scsi_id->logout_orb)
 		goto alloc_fail;
@@ -858,7 +859,8 @@ static int sbp2_start_device(struct scsi_id_instance_data *scsi_id)
 
 	/* Login ORB DMA */
 	scsi_id->login_orb =
-		pci_alloc_consistent(hi->host->pdev, sizeof(struct sbp2_login_orb),
+		pci_alloc_consistent(hi->host->pdev,
+				     sizeof(struct sbp2_login_orb),
 				     &scsi_id->login_orb_dma);
 	if (!scsi_id->login_orb) {
 alloc_fail:
@@ -880,25 +882,25 @@ alloc_fail:
 
 		if (scsi_id->logout_orb) {
 			pci_free_consistent(hi->host->pdev,
-					sizeof(struct sbp2_logout_orb),
-					scsi_id->logout_orb,
-					scsi_id->logout_orb_dma);
+					    sizeof(struct sbp2_logout_orb),
+					    scsi_id->logout_orb,
+					    scsi_id->logout_orb_dma);
 			SBP2_DMA_FREE("logout ORB DMA");
 		}
 
 		if (scsi_id->reconnect_orb) {
 			pci_free_consistent(hi->host->pdev,
-					sizeof(struct sbp2_reconnect_orb),
-					scsi_id->reconnect_orb,
-					scsi_id->reconnect_orb_dma);
+					    sizeof(struct sbp2_reconnect_orb),
+					    scsi_id->reconnect_orb,
+					    scsi_id->reconnect_orb_dma);
 			SBP2_DMA_FREE("reconnect ORB DMA");
 		}
 
 		if (scsi_id->login_response) {
 			pci_free_consistent(hi->host->pdev,
-					sizeof(struct sbp2_login_response),
-					scsi_id->login_response,
-					scsi_id->login_response_dma);
+					    sizeof(struct sbp2_login_response),
+					    scsi_id->login_response,
+					    scsi_id->login_response_dma);
 			SBP2_DMA_FREE("login FIFO DMA");
 		}
 
@@ -906,7 +908,7 @@ alloc_fail:
 
 		kfree(scsi_id);
 
-		SBP2_ERR ("Could not allocate memory for scsi_id");
+		SBP2_ERR("Could not allocate memory for scsi_id");
 
 		return -ENOMEM;
 	}
@@ -935,7 +937,7 @@ alloc_fail:
 		sbp2_remove_device(scsi_id);
 		return -EINTR;
 	}
-	
+
 	/*
 	 * Login to the sbp-2 device
 	 */
@@ -1054,36 +1056,39 @@ static void sbp2_remove_device(struct scsi_id_instance_data *scsi_id)
  * This function deals with physical dma write requests (for adapters that do not support
  * physical dma in hardware). Mostly just here for debugging...
  */
-static int sbp2_handle_physdma_write(struct hpsb_host *host, int nodeid, int destid, quadlet_t *data,
-                                     u64 addr, size_t length, u16 flags)
+static int sbp2_handle_physdma_write(struct hpsb_host *host, int nodeid,
+				     int destid, quadlet_t *data, u64 addr,
+				     size_t length, u16 flags)
 {
 
-        /*
-         * Manually put the data in the right place.
-         */
-        memcpy(bus_to_virt((u32)addr), data, length);
-	sbp2util_packet_dump(data, length, "sbp2 phys dma write by device", (u32)addr);
-        return(RCODE_COMPLETE);
+	/*
+	 * Manually put the data in the right place.
+	 */
+	memcpy(bus_to_virt((u32) addr), data, length);
+	sbp2util_packet_dump(data, length, "sbp2 phys dma write by device",
+			     (u32) addr);
+	return RCODE_COMPLETE;
 }
 
 /*
  * This function deals with physical dma read requests (for adapters that do not support
  * physical dma in hardware). Mostly just here for debugging...
  */
-static int sbp2_handle_physdma_read(struct hpsb_host *host, int nodeid, quadlet_t *data,
-                                    u64 addr, size_t length, u16 flags)
+static int sbp2_handle_physdma_read(struct hpsb_host *host, int nodeid,
+				    quadlet_t *data, u64 addr, size_t length,
+				    u16 flags)
 {
 
-        /*
-         * Grab data from memory and send a read response.
-         */
-        memcpy(data, bus_to_virt((u32)addr), length);
-	sbp2util_packet_dump(data, length, "sbp2 phys dma read by device", (u32)addr);
-        return(RCODE_COMPLETE);
+	/*
+	 * Grab data from memory and send a read response.
+	 */
+	memcpy(data, bus_to_virt((u32) addr), length);
+	sbp2util_packet_dump(data, length, "sbp2 phys dma read by device",
+			     (u32) addr);
+	return RCODE_COMPLETE;
 }
 #endif
 
-
 /**************************************
  * SBP-2 protocol related section
  **************************************/
@@ -1147,12 +1152,12 @@ static int sbp2_query_logins(struct scsi_id_instance_data *scsi_id)
 
 	if (sbp2util_down_timeout(&scsi_id->sbp2_login_complete, 2*HZ)) {
 		SBP2_INFO("Error querying logins to SBP-2 device - timed out");
-		return(-EIO);
+		return -EIO;
 	}
 
 	if (scsi_id->status_block.ORB_offset_lo != scsi_id->query_logins_orb_dma) {
 		SBP2_INFO("Error querying logins to SBP-2 device - timed out");
-		return(-EIO);
+		return -EIO;
 	}
 
 	if (STATUS_GET_RESP(scsi_id->status_block.ORB_offset_hi_misc) ||
@@ -1160,7 +1165,7 @@ static int sbp2_query_logins(struct scsi_id_instance_data *scsi_id)
 	    STATUS_GET_SBP_STATUS(scsi_id->status_block.ORB_offset_hi_misc)) {
 
 		SBP2_INFO("Error querying logins to SBP-2 device - timed out");
-		return(-EIO);
+		return -EIO;
 	}
 
 	sbp2util_cpu_to_be32_buffer(scsi_id->query_logins_response, sizeof(struct sbp2_query_logins_response));
@@ -1177,7 +1182,7 @@ static int sbp2_query_logins(struct scsi_id_instance_data *scsi_id)
 	SBP2_DEBUG("Number of active logins: %d", active_logins);
 
 	if (active_logins >= max_logins) {
-		return(-EIO);
+		return -EIO;
 	}
 
 	return 0;
@@ -1196,13 +1201,13 @@ static int sbp2_login_device(struct scsi_id_instance_data *scsi_id)
 
 	if (!scsi_id->login_orb) {
 		SBP2_DEBUG("sbp2_login_device: login_orb not alloc'd!");
-		return(-EIO);
+		return -EIO;
 	}
 
 	if (!exclusive_login) {
 		if (sbp2_query_logins(scsi_id)) {
 			SBP2_INFO("Device does not support any more concurrent logins");
-			return(-EIO);
+			return -EIO;
 		}
 	}
 
@@ -1269,7 +1274,7 @@ static int sbp2_login_device(struct scsi_id_instance_data *scsi_id)
 	 */
 	if (sbp2util_down_timeout(&scsi_id->sbp2_login_complete, 20*HZ)) {
 		SBP2_ERR("Error logging into SBP-2 device - login timed-out");
-		return(-EIO);
+		return -EIO;
 	}
 
 	/*
@@ -1277,7 +1282,7 @@ static int sbp2_login_device(struct scsi_id_instance_data *scsi_id)
 	 */
 	if (scsi_id->status_block.ORB_offset_lo != scsi_id->login_orb_dma) {
 		SBP2_ERR("Error logging into SBP-2 device - login timed-out");
-		return(-EIO);
+		return -EIO;
 	}
 
 	/*
@@ -1288,7 +1293,7 @@ static int sbp2_login_device(struct scsi_id_instance_data *scsi_id)
 	    STATUS_GET_SBP_STATUS(scsi_id->status_block.ORB_offset_hi_misc)) {
 
 		SBP2_ERR("Error logging into SBP-2 device - login failed");
-		return(-EIO);
+		return -EIO;
 	}
 
 	/*
@@ -1312,7 +1317,7 @@ static int sbp2_login_device(struct scsi_id_instance_data *scsi_id)
 
 	SBP2_INFO("Logged into SBP-2 device");
 
-	return(0);
+	return 0;
 
 }
 
@@ -1366,8 +1371,7 @@ static int sbp2_logout_device(struct scsi_id_instance_data *scsi_id)
 	atomic_set(&scsi_id->sbp2_login_complete, 0);
 
 	error = hpsb_node_write(scsi_id->ne,
-	                            scsi_id->sbp2_management_agent_addr,
-	                            data, 8);
+				scsi_id->sbp2_management_agent_addr, data, 8);
 	if (error)
 		return error;
 
@@ -1377,7 +1381,7 @@ static int sbp2_logout_device(struct scsi_id_instance_data *scsi_id)
 
 	SBP2_INFO("Logged out of SBP-2 device");
 
-	return(0);
+	return 0;
 
 }
 
@@ -1437,8 +1441,7 @@ static int sbp2_reconnect_device(struct scsi_id_instance_data *scsi_id)
 	atomic_set(&scsi_id->sbp2_login_complete, 0);
 
 	error = hpsb_node_write(scsi_id->ne,
-	                            scsi_id->sbp2_management_agent_addr,
-	                            data, 8);
+				scsi_id->sbp2_management_agent_addr, data, 8);
 	if (error)
 		return error;
 
@@ -1447,7 +1450,7 @@ static int sbp2_reconnect_device(struct scsi_id_instance_data *scsi_id)
 	 */
 	if (sbp2util_down_timeout(&scsi_id->sbp2_login_complete, HZ)) {
 		SBP2_ERR("Error reconnecting to SBP-2 device - reconnect timed-out");
-		return(-EIO);
+		return -EIO;
 	}
 
 	/*
@@ -1455,7 +1458,7 @@ static int sbp2_reconnect_device(struct scsi_id_instance_data *scsi_id)
 	 */
 	if (scsi_id->status_block.ORB_offset_lo != scsi_id->reconnect_orb_dma) {
 		SBP2_ERR("Error reconnecting to SBP-2 device - reconnect timed-out");
-		return(-EIO);
+		return -EIO;
 	}
 
 	/*
@@ -1466,12 +1469,12 @@ static int sbp2_reconnect_device(struct scsi_id_instance_data *scsi_id)
 	    STATUS_GET_SBP_STATUS(scsi_id->status_block.ORB_offset_hi_misc)) {
 
 		SBP2_ERR("Error reconnecting to SBP-2 device - reconnect failed");
-		return(-EIO);
+		return -EIO;
 	}
 
 	HPSB_DEBUG("Reconnected to SBP-2 device");
 
-	return(0);
+	return 0;
 
 }
 
@@ -1494,10 +1497,9 @@ static int sbp2_set_busy_timeout(struct scsi_id_instance_data *scsi_id)
 		SBP2_ERR("sbp2_set_busy_timeout error");
 	}
 
-	return(0);
+	return 0;
 }
 
-
 /*
  * This function is called to parse sbp2 device's config rom unit
  * directory. Used to determine things like sbp2 management agent offset,
@@ -1510,7 +1512,7 @@ static void sbp2_parse_unit_directory(struct scsi_id_instance_data *scsi_id,
 	struct csr1212_dentry *dentry;
 	u64 management_agent_addr;
 	u32 command_set_spec_id, command_set, unit_characteristics,
-		firmware_revision, workarounds;
+	    firmware_revision, workarounds;
 	int i;
 
 	SBP2_DEBUG("sbp2_parse_unit_directory");
@@ -1528,13 +1530,14 @@ static void sbp2_parse_unit_directory(struct scsi_id_instance_data *scsi_id,
 			if (kv->key.type == CSR1212_KV_TYPE_CSR_OFFSET) {
 				/* Save off the management agent address */
 				management_agent_addr =
-					CSR1212_REGISTER_SPACE_BASE +
-					(kv->value.csr_offset << 2);
+				    CSR1212_REGISTER_SPACE_BASE +
+				    (kv->value.csr_offset << 2);
 
 				SBP2_DEBUG("sbp2_management_agent_addr = %x",
-					   (unsigned int) management_agent_addr);
+					   (unsigned int)management_agent_addr);
 			} else if (kv->key.type == CSR1212_KV_TYPE_IMMEDIATE) {
-				scsi_id->sbp2_lun = ORB_SET_LUN(kv->value.immediate);
+				scsi_id->sbp2_lun =
+				    ORB_SET_LUN(kv->value.immediate);
 			}
 			break;
 
@@ -1542,14 +1545,14 @@ static void sbp2_parse_unit_directory(struct scsi_id_instance_data *scsi_id,
 			/* Command spec organization */
 			command_set_spec_id = kv->value.immediate;
 			SBP2_DEBUG("sbp2_command_set_spec_id = %x",
-				   (unsigned int) command_set_spec_id);
+				   (unsigned int)command_set_spec_id);
 			break;
 
 		case SBP2_COMMAND_SET_KEY:
 			/* Command set used by sbp2 device */
 			command_set = kv->value.immediate;
 			SBP2_DEBUG("sbp2_command_set = %x",
-				   (unsigned int) command_set);
+				   (unsigned int)command_set);
 			break;
 
 		case SBP2_UNIT_CHARACTERISTICS_KEY:
@@ -1559,7 +1562,7 @@ static void sbp2_parse_unit_directory(struct scsi_id_instance_data *scsi_id,
 			 */
 			unit_characteristics = kv->value.immediate;
 			SBP2_DEBUG("sbp2_unit_characteristics = %x",
-				   (unsigned int) unit_characteristics);
+				   (unsigned int)unit_characteristics);
 			break;
 
 		case SBP2_FIRMWARE_REVISION_KEY:
@@ -1567,9 +1570,10 @@ static void sbp2_parse_unit_directory(struct scsi_id_instance_data *scsi_id,
 			firmware_revision = kv->value.immediate;
 			if (force_inquiry_hack)
 				SBP2_INFO("sbp2_firmware_revision = %x",
-				   (unsigned int) firmware_revision);
-			else	SBP2_DEBUG("sbp2_firmware_revision = %x",
-				   (unsigned int) firmware_revision);
+					  (unsigned int)firmware_revision);
+			else
+				SBP2_DEBUG("sbp2_firmware_revision = %x",
+					   (unsigned int)firmware_revision);
 			break;
 
 		default:
@@ -1647,8 +1651,9 @@ static int sbp2_max_speed_and_size(struct scsi_id_instance_data *scsi_id)
 	SBP2_DEBUG("sbp2_max_speed_and_size");
 
 	/* Initial setting comes from the hosts speed map */
-	scsi_id->speed_code = hi->host->speed_map[NODEID_TO_NODE(hi->host->node_id) * 64
-						  + NODEID_TO_NODE(scsi_id->ne->nodeid)];
+	scsi_id->speed_code =
+	    hi->host->speed_map[NODEID_TO_NODE(hi->host->node_id) * 64 +
+				NODEID_TO_NODE(scsi_id->ne->nodeid)];
 
 	/* Bump down our speed if the user requested it */
 	if (scsi_id->speed_code > max_speed) {
@@ -1659,15 +1664,16 @@ static int sbp2_max_speed_and_size(struct scsi_id_instance_data *scsi_id)
 
 	/* Payload size is the lesser of what our speed supports and what
 	 * our host supports.  */
-	scsi_id->max_payload_size = min(sbp2_speedto_max_payload[scsi_id->speed_code],
-					(u8)(hi->host->csr.max_rec - 1));
+	scsi_id->max_payload_size =
+	    min(sbp2_speedto_max_payload[scsi_id->speed_code],
+		(u8) (hi->host->csr.max_rec - 1));
 
 	HPSB_DEBUG("Node " NODE_BUS_FMT ": Max speed [%s] - Max payload [%u]",
 		   NODE_BUS_ARGS(hi->host, scsi_id->ne->nodeid),
 		   hpsb_speedto_str[scsi_id->speed_code],
-		   1 << ((u32)scsi_id->max_payload_size + 2));
+		   1 << ((u32) scsi_id->max_payload_size + 2));
 
-	return(0);
+	return 0;
 }
 
 /*
@@ -1702,7 +1708,7 @@ static int sbp2_agent_reset(struct scsi_id_instance_data *scsi_id, int wait)
 	 */
 	scsi_id->last_orb = NULL;
 
-	return(0);
+	return 0;
 }
 
 /*
@@ -1716,10 +1722,9 @@ static int sbp2_create_command_orb(struct scsi_id_instance_data *scsi_id,
 				   unsigned int scsi_request_bufflen,
 				   void *scsi_request_buffer,
 				   enum dma_data_direction dma_dir)
-
 {
 	struct sbp2scsi_host_info *hi = scsi_id->hi;
-	struct scatterlist *sgpnt = (struct scatterlist *) scsi_request_buffer;
+	struct scatterlist *sgpnt = (struct scatterlist *)scsi_request_buffer;
 	struct sbp2_command_orb *command_orb = &command->command_orb;
 	struct sbp2_unrestricted_page_table *scatter_gather_element =
 		&command->scatter_gather_element[0];
@@ -1739,30 +1744,30 @@ static int sbp2_create_command_orb(struct scsi_id_instance_data *scsi_id,
 	command_orb->next_ORB_lo = 0x0;
 	command_orb->misc = ORB_SET_MAX_PAYLOAD(scsi_id->max_payload_size);
 	command_orb->misc |= ORB_SET_SPEED(scsi_id->speed_code);
-	command_orb->misc |= ORB_SET_NOTIFY(1);		/* Notify us when complete */
+	command_orb->misc |= ORB_SET_NOTIFY(1);	/* Notify us when complete */
 
 	/*
 	 * Get the direction of the transfer. If the direction is unknown, then use our
 	 * goofy table as a back-up.
 	 */
 	switch (dma_dir) {
-		case DMA_NONE:
-			orb_direction = ORB_DIRECTION_NO_DATA_TRANSFER;
-			break;
-		case DMA_TO_DEVICE:
-			orb_direction = ORB_DIRECTION_WRITE_TO_MEDIA;
-			break;
-		case DMA_FROM_DEVICE:
-			orb_direction = ORB_DIRECTION_READ_FROM_MEDIA;
-			break;
-		case DMA_BIDIRECTIONAL:
-		default:
-			SBP2_ERR("SCSI data transfer direction not specified. "
-				 "Update the SBP2 direction table in sbp2.h if "
-				 "necessary for your application");
-			__scsi_print_command(scsi_cmd);
-			orb_direction = sbp2scsi_direction_table[*scsi_cmd];
-			break;
+	case DMA_NONE:
+		orb_direction = ORB_DIRECTION_NO_DATA_TRANSFER;
+		break;
+	case DMA_TO_DEVICE:
+		orb_direction = ORB_DIRECTION_WRITE_TO_MEDIA;
+		break;
+	case DMA_FROM_DEVICE:
+		orb_direction = ORB_DIRECTION_READ_FROM_MEDIA;
+		break;
+	case DMA_BIDIRECTIONAL:
+	default:
+		SBP2_ERR("SCSI data transfer direction not specified. "
+			 "Update the SBP2 direction table in sbp2.h if "
+			 "necessary for your application");
+		__scsi_print_command(scsi_cmd);
+		orb_direction = sbp2scsi_direction_table[*scsi_cmd];
+		break;
 	}
 
 	/*
@@ -1865,9 +1870,9 @@ static int sbp2_create_command_orb(struct scsi_id_instance_data *scsi_id,
 		command->dma_dir = dma_dir;
 		command->dma_size = scsi_request_bufflen;
 		command->dma_type = CMD_DMA_SINGLE;
-		command->cmd_dma = pci_map_single (hi->host->pdev, scsi_request_buffer,
-						   command->dma_size,
-						   command->dma_dir);
+		command->cmd_dma =
+		    pci_map_single(hi->host->pdev, scsi_request_buffer,
+				   command->dma_size, command->dma_dir);
 		SBP2_DMA_ALLOC("single bulk");
 
 		/*
@@ -1954,7 +1959,7 @@ static int sbp2_create_command_orb(struct scsi_id_instance_data *scsi_id,
 	memset(command_orb->cdb, 0, 12);
 	memcpy(command_orb->cdb, scsi_cmd, COMMAND_SIZE(*scsi_cmd));
 
-	return(0);
+	return 0;
 }
 
 /*
@@ -1970,7 +1975,7 @@ static int sbp2_link_orb_command(struct scsi_id_instance_data *scsi_id,
 
 	outstanding_orb_incr;
 	SBP2_ORB_DEBUG("sending command orb %p, total orbs = %x",
-			command_orb, global_outstanding_command_orbs);
+		       command_orb, global_outstanding_command_orbs);
 
 	pci_dma_sync_single_for_device(hi->host->pdev, command->command_orb_dma,
 				       sizeof(struct sbp2_command_orb),
@@ -2015,10 +2020,11 @@ static int sbp2_link_orb_command(struct scsi_id_instance_data *scsi_id,
 		 * both by the sbp2 device and us.
 		 */
 		scsi_id->last_orb->next_ORB_lo =
-			cpu_to_be32(command->command_orb_dma);
+		    cpu_to_be32(command->command_orb_dma);
 		/* Tells hardware that this pointer is valid */
 		scsi_id->last_orb->next_ORB_hi = 0x0;
-		pci_dma_sync_single_for_device(hi->host->pdev, scsi_id->last_orb_dma,
+		pci_dma_sync_single_for_device(hi->host->pdev,
+					       scsi_id->last_orb_dma,
 					       sizeof(struct sbp2_command_orb),
 					       PCI_DMA_BIDIRECTIONAL);
 
@@ -2032,14 +2038,14 @@ static int sbp2_link_orb_command(struct scsi_id_instance_data *scsi_id,
 
 		if (sbp2util_node_write_no_wait(ne, addr, &data, 4) < 0) {
 			SBP2_ERR("sbp2util_node_write_no_wait failed");
-			return(-EIO);
+			return -EIO;
 		}
 
 		scsi_id->last_orb = command_orb;
 		scsi_id->last_orb_dma = command->command_orb_dma;
 
 	}
-       	return(0);
+	return 0;
 }
 
 /*
@@ -2066,7 +2072,7 @@ static int sbp2_send_command(struct scsi_id_instance_data *scsi_id,
 	 */
 	command = sbp2util_allocate_command_orb(scsi_id, SCpnt, done);
 	if (!command) {
-		return(-EIO);
+		return -EIO;
 	}
 
 	/*
@@ -2101,10 +2107,9 @@ static int sbp2_send_command(struct scsi_id_instance_data *scsi_id,
 	 */
 	sbp2_link_orb_command(scsi_id, command);
 
-	return(0);
+	return 0;
 }
 
-
 /*
  * Translates SBP-2 status into SCSI sense data for check conditions
  */
@@ -2132,14 +2137,14 @@ static unsigned int sbp2_status_to_sense_data(unchar *sbp2_status, unchar *sense
 	sense_data[14] = sbp2_status[20];
 	sense_data[15] = sbp2_status[21];
 
-	return(sbp2_status[8] & 0x3f);	/* return scsi status */
+	return sbp2_status[8] & 0x3f;	/* return scsi status */
 }
 
 /*
  * This function is called after a command is completed, in order to do any necessary SBP-2
  * response data translations for the SCSI stack
  */
-static void sbp2_check_sbp2_response(struct scsi_id_instance_data *scsi_id, 
+static void sbp2_check_sbp2_response(struct scsi_id_instance_data *scsi_id,
 				     struct scsi_cmnd *SCpnt)
 {
 	u8 *scsi_buf = SCpnt->request_buffer;
@@ -2148,24 +2153,24 @@ static void sbp2_check_sbp2_response(struct scsi_id_instance_data *scsi_id,
 
 	switch (SCpnt->cmnd[0]) {
 
-		case INQUIRY:
-			/*
-			 * Make sure data length is ok. Minimum length is 36 bytes
-			 */
-			if (scsi_buf[4] == 0) {
-				scsi_buf[4] = 36 - 5;
-			}
+	case INQUIRY:
+		/*
+		 * Make sure data length is ok. Minimum length is 36 bytes
+		 */
+		if (scsi_buf[4] == 0) {
+			scsi_buf[4] = 36 - 5;
+		}
 
-			/*
-			 * Fix ansi revision and response data format
-			 */
-			scsi_buf[2] |= 2;
-			scsi_buf[3] = (scsi_buf[3] & 0xf0) | 2;
+		/*
+		 * Fix ansi revision and response data format
+		 */
+		scsi_buf[2] |= 2;
+		scsi_buf[3] = (scsi_buf[3] & 0xf0) | 2;
 
-			break;
+		break;
 
-		default:
-			break;
+	default:
+		break;
 	}
 	return;
 }
@@ -2190,14 +2195,14 @@ static int sbp2_handle_status_write(struct hpsb_host *host, int nodeid, int dest
 
 	if (!host) {
 		SBP2_ERR("host is NULL - this is bad!");
-		return(RCODE_ADDRESS_ERROR);
+		return RCODE_ADDRESS_ERROR;
 	}
 
 	hi = hpsb_get_hostinfo(&sbp2_highlevel, host);
 
 	if (!hi) {
 		SBP2_ERR("host info is NULL - this is bad!");
-		return(RCODE_ADDRESS_ERROR);
+		return RCODE_ADDRESS_ERROR;
 	}
 
 	/*
@@ -2214,7 +2219,7 @@ static int sbp2_handle_status_write(struct hpsb_host *host, int nodeid, int dest
 
 	if (!scsi_id) {
 		SBP2_ERR("scsi_id is NULL - device is gone?");
-		return(RCODE_ADDRESS_ERROR);
+		return RCODE_ADDRESS_ERROR;
 	}
 
 	/*
@@ -2312,10 +2317,9 @@ static int sbp2_handle_status_write(struct hpsb_host *host, int nodeid, int dest
 		SBP2_ORB_DEBUG("command orb completed");
 	}
 
-	return(RCODE_COMPLETE);
+	return RCODE_COMPLETE;
 }
 
-
 /**************************************
  * SCSI interface related section
  **************************************/
@@ -2448,55 +2452,56 @@ static void sbp2scsi_complete_command(struct scsi_id_instance_data *scsi_id,
 	 * complete the command, just let it get retried at the end of the
 	 * bus reset.
 	 */
-	if (!hpsb_node_entry_valid(scsi_id->ne) && (scsi_status != SBP2_SCSI_STATUS_GOOD)) {
+	if (!hpsb_node_entry_valid(scsi_id->ne)
+	    && (scsi_status != SBP2_SCSI_STATUS_GOOD)) {
 		SBP2_ERR("Bus reset in progress - retry command later");
 		return;
 	}
- 
+
 	/*
 	 * Switch on scsi status
 	 */
 	switch (scsi_status) {
-		case SBP2_SCSI_STATUS_GOOD:
-			SCpnt->result = DID_OK;
-			break;
+	case SBP2_SCSI_STATUS_GOOD:
+		SCpnt->result = DID_OK;
+		break;
 
-		case SBP2_SCSI_STATUS_BUSY:
-			SBP2_ERR("SBP2_SCSI_STATUS_BUSY");
-			SCpnt->result = DID_BUS_BUSY << 16;
-			break;
+	case SBP2_SCSI_STATUS_BUSY:
+		SBP2_ERR("SBP2_SCSI_STATUS_BUSY");
+		SCpnt->result = DID_BUS_BUSY << 16;
+		break;
 
-		case SBP2_SCSI_STATUS_CHECK_CONDITION:
-			SBP2_DEBUG("SBP2_SCSI_STATUS_CHECK_CONDITION");
-			SCpnt->result = CHECK_CONDITION << 1;
+	case SBP2_SCSI_STATUS_CHECK_CONDITION:
+		SBP2_DEBUG("SBP2_SCSI_STATUS_CHECK_CONDITION");
+		SCpnt->result = CHECK_CONDITION << 1;
 
-			/*
-			 * Debug stuff
-			 */
+		/*
+		 * Debug stuff
+		 */
 #if CONFIG_IEEE1394_SBP2_DEBUG >= 1
-			scsi_print_command(SCpnt);
-			scsi_print_sense("bh", SCpnt);
+		scsi_print_command(SCpnt);
+		scsi_print_sense("bh", SCpnt);
 #endif
 
-			break;
+		break;
 
-		case SBP2_SCSI_STATUS_SELECTION_TIMEOUT:
-			SBP2_ERR("SBP2_SCSI_STATUS_SELECTION_TIMEOUT");
-			SCpnt->result = DID_NO_CONNECT << 16;
-			scsi_print_command(SCpnt);
-			break;
+	case SBP2_SCSI_STATUS_SELECTION_TIMEOUT:
+		SBP2_ERR("SBP2_SCSI_STATUS_SELECTION_TIMEOUT");
+		SCpnt->result = DID_NO_CONNECT << 16;
+		scsi_print_command(SCpnt);
+		break;
 
-		case SBP2_SCSI_STATUS_CONDITION_MET:
-		case SBP2_SCSI_STATUS_RESERVATION_CONFLICT:
-		case SBP2_SCSI_STATUS_COMMAND_TERMINATED:
-			SBP2_ERR("Bad SCSI status = %x", scsi_status);
-			SCpnt->result = DID_ERROR << 16;
-			scsi_print_command(SCpnt);
-			break;
+	case SBP2_SCSI_STATUS_CONDITION_MET:
+	case SBP2_SCSI_STATUS_RESERVATION_CONFLICT:
+	case SBP2_SCSI_STATUS_COMMAND_TERMINATED:
+		SBP2_ERR("Bad SCSI status = %x", scsi_status);
+		SCpnt->result = DID_ERROR << 16;
+		scsi_print_command(SCpnt);
+		break;
 
-		default:
-			SBP2_ERR("Unsupported SCSI status = %x", scsi_status);
-			SCpnt->result = DID_ERROR << 16;
+	default:
+		SBP2_ERR("Unsupported SCSI status = %x", scsi_status);
+		SCpnt->result = DID_ERROR << 16;
 	}
 
 	/*
@@ -2510,7 +2515,8 @@ static void sbp2scsi_complete_command(struct scsi_id_instance_data *scsi_id,
 	 * If a bus reset is in progress and there was an error, complete
 	 * the command as busy so that it will get retried.
 	 */
-	if (!hpsb_node_entry_valid(scsi_id->ne) && (scsi_status != SBP2_SCSI_STATUS_GOOD)) {
+	if (!hpsb_node_entry_valid(scsi_id->ne)
+	    && (scsi_status != SBP2_SCSI_STATUS_GOOD)) {
 		SBP2_ERR("Completing command with busy (bus reset)");
 		SCpnt->result = DID_BUS_BUSY << 16;
 	}
@@ -2531,17 +2537,15 @@ static void sbp2scsi_complete_command(struct scsi_id_instance_data *scsi_id,
 	/*
 	 * Tell scsi stack that we're done with this command
 	 */
-	done (SCpnt);
+	done(SCpnt);
 }
 
-
 static int sbp2scsi_slave_alloc(struct scsi_device *sdev)
 {
 	((struct scsi_id_instance_data *)sdev->host->hostdata[0])->sdev = sdev;
 	return 0;
 }
 
-
 static int sbp2scsi_slave_configure(struct scsi_device *sdev)
 {
 	blk_queue_dma_alignment(sdev->request_queue, (512 - 1));
@@ -2550,14 +2554,12 @@ static int sbp2scsi_slave_configure(struct scsi_device *sdev)
 	return 0;
 }
 
-
 static void sbp2scsi_slave_destroy(struct scsi_device *sdev)
 {
 	((struct scsi_id_instance_data *)sdev->host->hostdata[0])->sdev = NULL;
 	return;
 }
 
-
 /*
  * Called by scsi stack when something has really gone wrong.  Usually
  * called when a command has timed-out for some reason.
@@ -2603,7 +2605,7 @@ static int sbp2scsi_abort(struct scsi_cmnd *SCpnt)
 		sbp2scsi_complete_all_commands(scsi_id, DID_BUS_BUSY);
 	}
 
-	return(SUCCESS);
+	return SUCCESS;
 }
 
 /*
@@ -2629,12 +2631,14 @@ static int sbp2scsi_reset(struct scsi_cmnd *SCpnt)
 	return SUCCESS;
 }
 
-static const char *sbp2scsi_info (struct Scsi_Host *host)
+static const char *sbp2scsi_info(struct Scsi_Host *host)
 {
-        return "SCSI emulation for IEEE-1394 SBP-2 Devices";
+	return "SCSI emulation for IEEE-1394 SBP-2 Devices";
 }
 
-static ssize_t sbp2_sysfs_ieee1394_id_show(struct device *dev, struct device_attribute *attr, char *buf)
+static ssize_t sbp2_sysfs_ieee1394_id_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
 {
 	struct scsi_device *sdev;
 	struct scsi_id_instance_data *scsi_id;
@@ -2705,7 +2709,6 @@ static int sbp2_module_init(void)
 	/* Set max sectors (module load option). Default is 255 sectors. */
 	scsi_driver_template.max_sectors = max_sectors;
 
-
 	/* Register our high level driver with 1394 stack */
 	hpsb_register_highlevel(&sbp2_highlevel);
 
diff --git a/drivers/ieee1394/sbp2.h b/drivers/ieee1394/sbp2.h
index 890be13..abc647b 100644
--- a/drivers/ieee1394/sbp2.h
+++ b/drivers/ieee1394/sbp2.h
@@ -119,8 +119,8 @@ struct sbp2_query_logins_response {
 struct sbp2_reconnect_orb {
 	u32 reserved1;
 	u32 reserved2;
-        u32 reserved3;
-        u32 reserved4;
+	u32 reserved3;
+	u32 reserved4;
 	u32 login_ID_misc;
 	u32 reserved5;
 	u32 status_FIFO_hi;
@@ -130,8 +130,8 @@ struct sbp2_reconnect_orb {
 struct sbp2_logout_orb {
 	u32 reserved1;
 	u32 reserved2;
-        u32 reserved3;
-        u32 reserved4;
+	u32 reserved3;
+	u32 reserved4;
 	u32 login_ID_misc;
 	u32 reserved5;
 	u32 status_FIFO_hi;
@@ -188,7 +188,7 @@ struct sbp2_unrestricted_page_table {
 struct sbp2_status_block {
 	u32 ORB_offset_hi_misc;
 	u32 ORB_offset_lo;
-        u8 command_set_dependent[24];
+	u8 command_set_dependent[24];
 };
 
 /*
@@ -211,7 +211,7 @@ struct sbp2_status_block {
  * specified for write posting, where the ohci controller will
  * automatically send an ack_complete when the status is written by the
  * sbp2 device... saving a split transaction.   =)
- */ 
+ */
 #define SBP2_STATUS_FIFO_ADDRESS				0xfffe00000000ULL
 #define SBP2_STATUS_FIFO_ADDRESS_HI                             0xfffe
 #define SBP2_STATUS_FIFO_ADDRESS_LO                             0x0
@@ -333,10 +333,8 @@ struct sbp2_command_info {
 #define SBP2_BREAKAGE_128K_MAX_TRANSFER		0x1
 #define SBP2_BREAKAGE_INQUIRY_HACK		0x2
 
-
 struct sbp2scsi_host_info;
 
-
 /*
  * Information needed on a per scsi id basis (one for each sbp2 device)
  */
@@ -406,7 +404,6 @@ struct scsi_id_instance_data {
 	u32 workarounds;
 };
 
-
 /* Sbp2 host data structure (one per IEEE1394 host) */
 struct sbp2scsi_host_info {
 	struct hpsb_host *host;		/* IEEE1394 host */
-- 
cgit v1.1


From 7afa1467761f06bd9649efd66a4a6b3ff9f29a1f Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Mon, 7 Nov 2005 06:31:42 -0500
Subject: Remove version strings from eth1394, ohci1394, sbp2. Their version
 information is not trustworthy.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/eth1394.c  | 8 --------
 drivers/ieee1394/ohci1394.c | 8 --------
 drivers/ieee1394/sbp2.c     | 5 -----
 3 files changed, 21 deletions(-)

diff --git a/drivers/ieee1394/eth1394.c b/drivers/ieee1394/eth1394.c
index c9e92d8..6984a92 100644
--- a/drivers/ieee1394/eth1394.c
+++ b/drivers/ieee1394/eth1394.c
@@ -88,9 +88,6 @@
 	printk(KERN_ERR "%s:%s[%d]: " fmt "\n", driver_name, __FUNCTION__, __LINE__, ## args)
 #define TRACE() printk(KERN_ERR "%s:%s[%d] ---- TRACE\n", driver_name, __FUNCTION__, __LINE__)
 
-static char version[] __devinitdata =
-	"$Rev: 1312 $ Ben Collins <bcollins@debian.org>";
-
 struct fragment_info {
 	struct list_head list;
 	int offset;
@@ -566,7 +563,6 @@ static void ether1394_add_host (struct hpsb_host *host)
 	struct eth1394_host_info *hi = NULL;
 	struct net_device *dev = NULL;
 	struct eth1394_priv *priv;
-	static int version_printed = 0;
 	u64 fifo_addr;
 
 	if (!(host->config_roms & HPSB_CONFIG_ROM_ENTRY_IP1394))
@@ -581,9 +577,6 @@ static void ether1394_add_host (struct hpsb_host *host)
 	if (fifo_addr == ~0ULL)
 		goto out;
 
-	if (version_printed++ == 0)
-		ETH1394_PRINT_G (KERN_INFO, "%s\n", version);
-
 	/* We should really have our own alloc_hpsbdev() function in
 	 * net_init.c instead of calling the one for ethernet then hijacking
 	 * it for ourselves.  That way we'd be a real networking device. */
@@ -1768,7 +1761,6 @@ fail:
 static void ether1394_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
 {
 	strcpy (info->driver, driver_name);
-	strcpy (info->version, "$Rev: 1312 $");
 	/* FIXME XXX provide sane businfo */
 	strcpy (info->bus_info, "ieee1394");
 }
diff --git a/drivers/ieee1394/ohci1394.c b/drivers/ieee1394/ohci1394.c
index dcb5776..8355068 100644
--- a/drivers/ieee1394/ohci1394.c
+++ b/drivers/ieee1394/ohci1394.c
@@ -161,9 +161,6 @@ printk(level "%s: " fmt "\n" , OHCI1394_DRIVER_NAME , ## args)
 #define PRINT(level, fmt, args...) \
 printk(level "%s: fw-host%d: " fmt "\n" , OHCI1394_DRIVER_NAME, ohci->host->id , ## args)
 
-static char version[] __devinitdata =
-	"$Rev: 1313 $ Ben Collins <bcollins@debian.org>";
-
 /* Module Parameters */
 static int phys_dma = 1;
 module_param(phys_dma, int, 0644);
@@ -3215,15 +3212,10 @@ do {						\
 static int __devinit ohci1394_pci_probe(struct pci_dev *dev,
 					const struct pci_device_id *ent)
 {
-	static int version_printed = 0;
-
 	struct hpsb_host *host;
 	struct ti_ohci *ohci;	/* shortcut to currently handled device */
 	unsigned long ohci_base;
 
-	if (version_printed++ == 0)
-		PRINT_G(KERN_INFO, "%s", version);
-
         if (pci_enable_device(dev))
 		FAIL(-ENXIO, "Failed to enable OHCI hardware");
         pci_set_master(dev);
diff --git a/drivers/ieee1394/sbp2.c b/drivers/ieee1394/sbp2.c
index 073ede9..b871116 100644
--- a/drivers/ieee1394/sbp2.c
+++ b/drivers/ieee1394/sbp2.c
@@ -80,9 +80,6 @@
 #include "ieee1394_transactions.h"
 #include "sbp2.h"
 
-static char version[] __devinitdata =
-	"$Rev: 1306 $ Ben Collins <bcollins@debian.org>";
-
 /*
  * Module load parameter definitions
  */
@@ -2696,8 +2693,6 @@ static int sbp2_module_init(void)
 
 	SBP2_DEBUG("sbp2_module_init");
 
-	printk(KERN_INFO "sbp2: %s\n", version);
-
 	/* Module load debug option to force one command at a time (serializing I/O) */
 	if (serialize_io) {
 		SBP2_INFO("Driver forced to serialize I/O (serialize_io=1)");
-- 
cgit v1.1


From 8551158abc8ef45a7f473a87e69624d05ebfd684 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Mon, 7 Nov 2005 06:31:45 -0500
Subject: kmalloc/kzalloc changes: dv1394, eth1394, ieee1394, ohci1394,
 pcilynx, raw1394, sbp2, video1394:  - use kzalloc  - provide safer size
 arguments to kmalloc and kzalloc  - omit some casts

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/csr1212.c   | 11 +++---
 drivers/ieee1394/csr1212.h   |  2 +-
 drivers/ieee1394/dv1394.c    |  4 +--
 drivers/ieee1394/eth1394.c   | 12 +++----
 drivers/ieee1394/highlevel.c | 18 ++++------
 drivers/ieee1394/hosts.c     |  6 ++--
 drivers/ieee1394/nodemgr.c   | 27 +++++++--------
 drivers/ieee1394/ohci1394.c  | 20 ++++-------
 drivers/ieee1394/pcilynx.c   |  2 +-
 drivers/ieee1394/raw1394.c   | 38 +++++++++------------
 drivers/ieee1394/sbp2.c      |  7 ++--
 drivers/ieee1394/video1394.c | 81 ++++++++++++++------------------------------
 12 files changed, 85 insertions(+), 143 deletions(-)

diff --git a/drivers/ieee1394/csr1212.c b/drivers/ieee1394/csr1212.c
index 61ddd5d..c0f8ed6 100644
--- a/drivers/ieee1394/csr1212.c
+++ b/drivers/ieee1394/csr1212.c
@@ -1261,7 +1261,7 @@ static int csr1212_parse_bus_info_block(struct csr1212_csr *csr)
 		return CSR1212_EINVAL;
 #endif
 
-	cr = CSR1212_MALLOC(sizeof(struct csr1212_cache_region));
+	cr = CSR1212_MALLOC(sizeof(*cr));
 	if (!cr)
 		return CSR1212_ENOMEM;
 
@@ -1393,8 +1393,7 @@ int csr1212_parse_keyval(struct csr1212_keyval *kv,
 	case CSR1212_KV_TYPE_LEAF:
 		if (kv->key.id != CSR1212_KV_ID_EXTENDED_ROM) {
 			kv->value.leaf.data = CSR1212_MALLOC(quads_to_bytes(kvi_len));
-			if (!kv->value.leaf.data)
-			{
+			if (!kv->value.leaf.data) {
 				ret = CSR1212_ENOMEM;
 				goto fail;
 			}
@@ -1462,7 +1461,7 @@ int _csr1212_read_keyval(struct csr1212_csr *csr, struct csr1212_keyval *kv)
 		cache->next = NULL;
 		csr->cache_tail = cache;
 		cache->filled_head =
-			CSR1212_MALLOC(sizeof(struct csr1212_cache_region));
+			CSR1212_MALLOC(sizeof(*cache->filled_head));
 		if (!cache->filled_head) {
 			return CSR1212_ENOMEM;
 		}
@@ -1484,7 +1483,7 @@ int _csr1212_read_keyval(struct csr1212_csr *csr, struct csr1212_keyval *kv)
 	/* Now seach read portions of the cache to see if it is there. */
 	for (cr = cache->filled_head; cr; cr = cr->next) {
 		if (cache_index < cr->offset_start) {
-			newcr = CSR1212_MALLOC(sizeof(struct csr1212_cache_region));
+			newcr = CSR1212_MALLOC(sizeof(*newcr));
 			if (!newcr)
 				return CSR1212_ENOMEM;
 
@@ -1508,7 +1507,7 @@ int _csr1212_read_keyval(struct csr1212_csr *csr, struct csr1212_keyval *kv)
 
 	if (!cr) {
 		cr = cache->filled_tail;
-		newcr = CSR1212_MALLOC(sizeof(struct csr1212_cache_region));
+		newcr = CSR1212_MALLOC(sizeof(*newcr));
 		if (!newcr)
 			return CSR1212_ENOMEM;
 
diff --git a/drivers/ieee1394/csr1212.h b/drivers/ieee1394/csr1212.h
index 28c5f4b..cecd587 100644
--- a/drivers/ieee1394/csr1212.h
+++ b/drivers/ieee1394/csr1212.h
@@ -646,7 +646,7 @@ static inline struct csr1212_csr_rom_cache *csr1212_rom_cache_malloc(u_int32_t o
 {
 	struct csr1212_csr_rom_cache *cache;
 
-	cache = CSR1212_MALLOC(sizeof(struct csr1212_csr_rom_cache) + size);
+	cache = CSR1212_MALLOC(sizeof(*cache) + size);
 	if (!cache)
 		return NULL;
 
diff --git a/drivers/ieee1394/dv1394.c b/drivers/ieee1394/dv1394.c
index cbbbe14..d204ec7 100644
--- a/drivers/ieee1394/dv1394.c
+++ b/drivers/ieee1394/dv1394.c
@@ -2218,14 +2218,12 @@ static int dv1394_init(struct ti_ohci *ohci, enum pal_or_ntsc format, enum modes
 	unsigned long flags;
 	int i;
 
-	video = kmalloc(sizeof(struct video_card), GFP_KERNEL);
+	video = kzalloc(sizeof(*video), GFP_KERNEL);
 	if (!video) {
 		printk(KERN_ERR "dv1394: cannot allocate video_card\n");
 		goto err;
 	}
 
-	memset(video, 0, sizeof(struct video_card));
-
 	video->ohci = ohci;
 	/* lower 2 bits of id indicate which of four "plugs"
 	   per host */
diff --git a/drivers/ieee1394/eth1394.c b/drivers/ieee1394/eth1394.c
index 6984a92..30fa0d4 100644
--- a/drivers/ieee1394/eth1394.c
+++ b/drivers/ieee1394/eth1394.c
@@ -352,12 +352,12 @@ static int eth1394_probe(struct device *dev)
 	if (!hi)
 		return -ENOENT;
 
-	new_node = kmalloc(sizeof(struct eth1394_node_ref),
+	new_node = kmalloc(sizeof(*new_node),
 			   in_interrupt() ? GFP_ATOMIC : GFP_KERNEL);
 	if (!new_node)
 		return -ENOMEM;
 
-	node_info = kmalloc(sizeof(struct eth1394_node_info),
+	node_info = kmalloc(sizeof(*node_info),
 			    in_interrupt() ? GFP_ATOMIC : GFP_KERNEL);
 	if (!node_info) {
 		kfree(new_node);
@@ -433,12 +433,12 @@ static int eth1394_update(struct unit_directory *ud)
 	node = eth1394_find_node(&priv->ip_node_list, ud);
 
 	if (!node) {
-		node = kmalloc(sizeof(struct eth1394_node_ref),
+		node = kmalloc(sizeof(*node),
 			       in_interrupt() ? GFP_ATOMIC : GFP_KERNEL);
 		if (!node)
 			return -ENOMEM;
 
-		node_info = kmalloc(sizeof(struct eth1394_node_info),
+		node_info = kmalloc(sizeof(*node_info),
 				    in_interrupt() ? GFP_ATOMIC : GFP_KERNEL);
 		if (!node_info) {
 			kfree(node);
@@ -1014,7 +1014,7 @@ static inline int new_fragment(struct list_head *frag_info, int offset, int len)
 		}
 	}
 
-	new = kmalloc(sizeof(struct fragment_info), GFP_ATOMIC);
+	new = kmalloc(sizeof(*new), GFP_ATOMIC);
 	if (!new)
 		return -ENOMEM;
 
@@ -1033,7 +1033,7 @@ static inline int new_partial_datagram(struct net_device *dev,
 {
 	struct partial_datagram *new;
 
-	new = kmalloc(sizeof(struct partial_datagram), GFP_ATOMIC);
+	new = kmalloc(sizeof(*new), GFP_ATOMIC);
 	if (!new)
 		return -ENOMEM;
 
diff --git a/drivers/ieee1394/highlevel.c b/drivers/ieee1394/highlevel.c
index 997e1bf..734b121 100644
--- a/drivers/ieee1394/highlevel.c
+++ b/drivers/ieee1394/highlevel.c
@@ -101,12 +101,10 @@ void *hpsb_create_hostinfo(struct hpsb_highlevel *hl, struct hpsb_host *host,
 		return NULL;
 	}
 
-	hi = kmalloc(sizeof(*hi) + data_size, GFP_ATOMIC);
+	hi = kzalloc(sizeof(*hi) + data_size, GFP_ATOMIC);
 	if (!hi)
 		return NULL;
 
-	memset(hi, 0, sizeof(*hi) + data_size);
-
 	if (data_size) {
 		data = hi->data = hi + 1;
 		hi->size = data_size;
@@ -326,11 +324,9 @@ u64 hpsb_allocate_and_register_addrspace(struct hpsb_highlevel *hl,
 		return retval;
 	}
 
-	as = (struct hpsb_address_serve *)
-		kmalloc(sizeof(struct hpsb_address_serve), GFP_KERNEL);
-	if (as == NULL) {
+	as = kmalloc(sizeof(*as), GFP_KERNEL);
+	if (!as)
 		return retval;
-	}
 
 	INIT_LIST_HEAD(&as->host_list);
 	INIT_LIST_HEAD(&as->hl_list);
@@ -383,11 +379,9 @@ int hpsb_register_addrspace(struct hpsb_highlevel *hl, struct hpsb_host *host,
                 return 0;
         }
 
-        as = (struct hpsb_address_serve *)
-                kmalloc(sizeof(struct hpsb_address_serve), GFP_ATOMIC);
-        if (as == NULL) {
-                return 0;
-        }
+	as = kmalloc(sizeof(*as), GFP_ATOMIC);
+	if (!as)
+		return 0;
 
         INIT_LIST_HEAD(&as->host_list);
         INIT_LIST_HEAD(&as->hl_list);
diff --git a/drivers/ieee1394/hosts.c b/drivers/ieee1394/hosts.c
index aeeaeb6..d245abe 100644
--- a/drivers/ieee1394/hosts.c
+++ b/drivers/ieee1394/hosts.c
@@ -114,9 +114,9 @@ struct hpsb_host *hpsb_alloc_host(struct hpsb_host_driver *drv, size_t extra,
 	int i;
 	int hostnum = 0;
 
-        h = kmalloc(sizeof(struct hpsb_host) + extra, SLAB_KERNEL);
-        if (!h) return NULL;
-        memset(h, 0, sizeof(struct hpsb_host) + extra);
+        h = kzalloc(sizeof(*h) + extra, SLAB_KERNEL);
+        if (!h)
+		return NULL;
 
 	h->csr.rom = csr1212_create_csr(&csr_bus_ops, CSR_BUS_INFO_SIZE, h);
 	if (!h->csr.rom) {
diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c
index 7fff5a1..3f0917b 100644
--- a/drivers/ieee1394/nodemgr.c
+++ b/drivers/ieee1394/nodemgr.c
@@ -743,21 +743,20 @@ static struct node_entry *nodemgr_create_node(octlet_t guid, struct csr1212_csr
 					      unsigned int generation)
 {
 	struct hpsb_host *host = hi->host;
-        struct node_entry *ne;
-
-	ne = kmalloc(sizeof(struct node_entry), GFP_KERNEL);
-        if (!ne) return NULL;
+	struct node_entry *ne;
 
-	memset(ne, 0, sizeof(struct node_entry));
+	ne = kzalloc(sizeof(*ne), GFP_KERNEL);
+	if (!ne)
+		return NULL;
 
 	ne->tpool = &host->tpool[nodeid & NODE_MASK];
 
-        ne->host = host;
-        ne->nodeid = nodeid;
+	ne->host = host;
+	ne->nodeid = nodeid;
 	ne->generation = generation;
 	ne->needs_probe = 1;
 
-        ne->guid = guid;
+	ne->guid = guid;
 	ne->guid_vendor_id = (guid >> 40) & 0xffffff;
 	ne->guid_vendor_oui = nodemgr_find_oui_name(ne->guid_vendor_id);
 	ne->csr = csr;
@@ -787,7 +786,7 @@ static struct node_entry *nodemgr_create_node(octlet_t guid, struct csr1212_csr
 		   (host->node_id == nodeid) ? "Host" : "Node",
 		   NODE_BUS_ARGS(host, nodeid), (unsigned long long)guid);
 
-        return ne;
+	return ne;
 }
 
 
@@ -872,12 +871,10 @@ static struct unit_directory *nodemgr_process_unit_directory
 	struct csr1212_keyval *kv;
 	u8 last_key_id = 0;
 
-	ud = kmalloc(sizeof(struct unit_directory), GFP_KERNEL);
+	ud = kzalloc(sizeof(*ud), GFP_KERNEL);
 	if (!ud)
 		goto unit_directory_error;
 
-	memset (ud, 0, sizeof(struct unit_directory));
-
 	ud->ne = ne;
 	ud->ignore_driver = ignore_drivers;
 	ud->address = ud_kv->offset + CSR1212_CONFIG_ROM_SPACE_BASE;
@@ -937,10 +934,10 @@ static struct unit_directory *nodemgr_process_unit_directory
 			/* Logical Unit Number */
 			if (kv->key.type == CSR1212_KV_TYPE_IMMEDIATE) {
 				if (ud->flags & UNIT_DIRECTORY_HAS_LUN) {
-					ud_child = kmalloc(sizeof(struct unit_directory), GFP_KERNEL);
+					ud_child = kmalloc(sizeof(*ud_child), GFP_KERNEL);
 					if (!ud_child)
 						goto unit_directory_error;
-					memcpy(ud_child, ud, sizeof(struct unit_directory));
+					memcpy(ud_child, ud, sizeof(*ud_child));
 					nodemgr_register_device(ne, ud_child, &ne->device);
 					ud_child = NULL;
 					
@@ -1200,7 +1197,7 @@ static void nodemgr_node_scan_one(struct host_info *hi,
 	struct csr1212_csr *csr;
 	struct nodemgr_csr_info *ci;
 
-	ci = kmalloc(sizeof(struct nodemgr_csr_info), GFP_KERNEL);
+	ci = kmalloc(sizeof(*ci), GFP_KERNEL);
 	if (!ci)
 		return;
 
diff --git a/drivers/ieee1394/ohci1394.c b/drivers/ieee1394/ohci1394.c
index 8355068..97b6f48 100644
--- a/drivers/ieee1394/ohci1394.c
+++ b/drivers/ieee1394/ohci1394.c
@@ -2957,28 +2957,23 @@ alloc_dma_rcv_ctx(struct ti_ohci *ohci, struct dma_rcv_ctx *d,
 	d->ctrlClear = 0;
 	d->cmdPtr = 0;
 
-	d->buf_cpu = kmalloc(d->num_desc * sizeof(quadlet_t*), GFP_ATOMIC);
-	d->buf_bus = kmalloc(d->num_desc * sizeof(dma_addr_t), GFP_ATOMIC);
+	d->buf_cpu = kzalloc(d->num_desc * sizeof(*d->buf_cpu), GFP_ATOMIC);
+	d->buf_bus = kzalloc(d->num_desc * sizeof(*d->buf_bus), GFP_ATOMIC);
 
 	if (d->buf_cpu == NULL || d->buf_bus == NULL) {
 		PRINT(KERN_ERR, "Failed to allocate dma buffer");
 		free_dma_rcv_ctx(d);
 		return -ENOMEM;
 	}
-	memset(d->buf_cpu, 0, d->num_desc * sizeof(quadlet_t*));
-	memset(d->buf_bus, 0, d->num_desc * sizeof(dma_addr_t));
 
-	d->prg_cpu = kmalloc(d->num_desc * sizeof(struct dma_cmd*),
-				GFP_ATOMIC);
-	d->prg_bus = kmalloc(d->num_desc * sizeof(dma_addr_t), GFP_ATOMIC);
+	d->prg_cpu = kzalloc(d->num_desc * sizeof(*d->prg_cpu), GFP_ATOMIC);
+	d->prg_bus = kzalloc(d->num_desc * sizeof(*d->prg_bus), GFP_ATOMIC);
 
 	if (d->prg_cpu == NULL || d->prg_bus == NULL) {
 		PRINT(KERN_ERR, "Failed to allocate dma prg");
 		free_dma_rcv_ctx(d);
 		return -ENOMEM;
 	}
-	memset(d->prg_cpu, 0, d->num_desc * sizeof(struct dma_cmd*));
-	memset(d->prg_bus, 0, d->num_desc * sizeof(dma_addr_t));
 
 	d->spb = kmalloc(d->split_buf_size, GFP_ATOMIC);
 
@@ -3090,17 +3085,14 @@ alloc_dma_trm_ctx(struct ti_ohci *ohci, struct dma_trm_ctx *d,
 	d->ctrlClear = 0;
 	d->cmdPtr = 0;
 
-	d->prg_cpu = kmalloc(d->num_desc * sizeof(struct at_dma_prg*),
-			     GFP_KERNEL);
-	d->prg_bus = kmalloc(d->num_desc * sizeof(dma_addr_t), GFP_KERNEL);
+	d->prg_cpu = kzalloc(d->num_desc * sizeof(*d->prg_cpu), GFP_KERNEL);
+	d->prg_bus = kzalloc(d->num_desc * sizeof(*d->prg_bus), GFP_KERNEL);
 
 	if (d->prg_cpu == NULL || d->prg_bus == NULL) {
 		PRINT(KERN_ERR, "Failed to allocate at dma prg");
 		free_dma_trm_ctx(d);
 		return -ENOMEM;
 	}
-	memset(d->prg_cpu, 0, d->num_desc * sizeof(struct at_dma_prg*));
-	memset(d->prg_bus, 0, d->num_desc * sizeof(dma_addr_t));
 
 	len = sprintf(pool_name, "ohci1394_trm_prg");
 	sprintf(pool_name+len, "%d", num_allocs);
diff --git a/drivers/ieee1394/pcilynx.c b/drivers/ieee1394/pcilynx.c
index 6b1ab87..e2edc41 100644
--- a/drivers/ieee1394/pcilynx.c
+++ b/drivers/ieee1394/pcilynx.c
@@ -1435,7 +1435,7 @@ static int __devinit add_card(struct pci_dev *dev,
         	struct i2c_algo_bit_data i2c_adapter_data;
 
         	error = -ENOMEM;
-		i2c_ad = kmalloc(sizeof(struct i2c_adapter), SLAB_KERNEL);
+		i2c_ad = kmalloc(sizeof(*i2c_ad), SLAB_KERNEL);
         	if (!i2c_ad) FAIL("failed to allocate I2C adapter memory");
 
 		memcpy(i2c_ad, &bit_ops, sizeof(struct i2c_adapter));
diff --git a/drivers/ieee1394/raw1394.c b/drivers/ieee1394/raw1394.c
index 24411e6..0278dc5d 100644
--- a/drivers/ieee1394/raw1394.c
+++ b/drivers/ieee1394/raw1394.c
@@ -102,12 +102,9 @@ static struct pending_request *__alloc_pending_request(gfp_t flags)
 {
 	struct pending_request *req;
 
-	req = (struct pending_request *)kmalloc(sizeof(struct pending_request),
-						flags);
-	if (req != NULL) {
-		memset(req, 0, sizeof(struct pending_request));
+	req = kzalloc(sizeof(*req), flags);
+	if (req)
 		INIT_LIST_HEAD(&req->list);
-	}
 
 	return req;
 }
@@ -192,9 +189,9 @@ static void add_host(struct hpsb_host *host)
 	struct host_info *hi;
 	unsigned long flags;
 
-	hi = (struct host_info *)kmalloc(sizeof(struct host_info), GFP_KERNEL);
+	hi = kmalloc(sizeof(*hi), GFP_KERNEL);
 
-	if (hi != NULL) {
+	if (hi) {
 		INIT_LIST_HEAD(&hi->list);
 		hi->host = host;
 		INIT_LIST_HEAD(&hi->file_info_list);
@@ -315,8 +312,8 @@ static void iso_receive(struct hpsb_host *host, int channel, quadlet_t * data,
 				break;
 
 			if (!ibs) {
-				ibs = kmalloc(sizeof(struct iso_block_store)
-					      + length, SLAB_ATOMIC);
+				ibs = kmalloc(sizeof(*ibs) + length,
+					      SLAB_ATOMIC);
 				if (!ibs) {
 					kfree(req);
 					break;
@@ -376,8 +373,8 @@ static void fcp_request(struct hpsb_host *host, int nodeid, int direction,
 				break;
 
 			if (!ibs) {
-				ibs = kmalloc(sizeof(struct iso_block_store)
-					      + length, SLAB_ATOMIC);
+				ibs = kmalloc(sizeof(*ibs) + length,
+					      SLAB_ATOMIC);
 				if (!ibs) {
 					kfree(req);
 					break;
@@ -502,10 +499,9 @@ static int state_initialized(struct file_info *fi, struct pending_request *req)
 	switch (req->req.type) {
 	case RAW1394_REQ_LIST_CARDS:
 		spin_lock_irqsave(&host_info_lock, flags);
-		khl = kmalloc(sizeof(struct raw1394_khost_list) * host_count,
-			      SLAB_ATOMIC);
+		khl = kmalloc(sizeof(*khl) * host_count, SLAB_ATOMIC);
 
-		if (khl != NULL) {
+		if (khl) {
 			req->req.misc = host_count;
 			req->data = (quadlet_t *) khl;
 
@@ -517,7 +513,7 @@ static int state_initialized(struct file_info *fi, struct pending_request *req)
 		}
 		spin_unlock_irqrestore(&host_info_lock, flags);
 
-		if (khl != NULL) {
+		if (khl) {
 			req->req.error = RAW1394_ERROR_NONE;
 			req->req.length = min(req->req.length,
 					      (u32) (sizeof
@@ -1647,13 +1643,13 @@ static int arm_register(struct file_info *fi, struct pending_request *req)
 		return (-EINVAL);
 	}
 	/* addr-list-entry for fileinfo */
-	addr = (struct arm_addr *)kmalloc(sizeof(struct arm_addr), SLAB_KERNEL);
+	addr = kmalloc(sizeof(*addr), SLAB_KERNEL);
 	if (!addr) {
 		req->req.length = 0;
 		return (-ENOMEM);
 	}
 	/* allocation of addr_space_buffer */
-	addr->addr_space_buffer = (u8 *) vmalloc(req->req.length);
+	addr->addr_space_buffer = vmalloc(req->req.length);
 	if (!(addr->addr_space_buffer)) {
 		kfree(addr);
 		req->req.length = 0;
@@ -2122,8 +2118,7 @@ static int modify_config_rom(struct file_info *fi, struct pending_request *req)
 		return -ENOMEM;
 	}
 
-	cache->filled_head =
-	    kmalloc(sizeof(struct csr1212_cache_region), GFP_KERNEL);
+	cache->filled_head = kmalloc(sizeof(*cache->filled_head), GFP_KERNEL);
 	if (!cache->filled_head) {
 		csr1212_release_keyval(fi->csr1212_dirs[dr]);
 		fi->csr1212_dirs[dr] = NULL;
@@ -2684,11 +2679,10 @@ static int raw1394_open(struct inode *inode, struct file *file)
 {
 	struct file_info *fi;
 
-	fi = kmalloc(sizeof(struct file_info), SLAB_KERNEL);
-	if (fi == NULL)
+	fi = kzalloc(sizeof(*fi), SLAB_KERNEL);
+	if (!fi)
 		return -ENOMEM;
 
-	memset(fi, 0, sizeof(struct file_info));
 	fi->notification = (u8) RAW1394_NOTIFY_ON;	/* busreset notification */
 
 	INIT_LIST_HEAD(&fi->list);
diff --git a/drivers/ieee1394/sbp2.c b/drivers/ieee1394/sbp2.c
index b871116..84875cd 100644
--- a/drivers/ieee1394/sbp2.c
+++ b/drivers/ieee1394/sbp2.c
@@ -411,14 +411,12 @@ static int sbp2util_create_command_orb_pool(struct scsi_id_instance_data *scsi_i
 
 	spin_lock_irqsave(&scsi_id->sbp2_command_orb_lock, flags);
 	for (i = 0; i < orbs; i++) {
-		command = (struct sbp2_command_info *)
-		    kmalloc(sizeof(struct sbp2_command_info), GFP_ATOMIC);
+		command = kzalloc(sizeof(*command), GFP_ATOMIC);
 		if (!command) {
 			spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock,
 					       flags);
 			return -ENOMEM;
 		}
-		memset(command, '\0', sizeof(struct sbp2_command_info));
 		command->command_orb_dma =
 		    pci_map_single(hi->host->pdev, &command->command_orb,
 				   sizeof(struct sbp2_command_orb),
@@ -714,12 +712,11 @@ static struct scsi_id_instance_data *sbp2_alloc_device(struct unit_directory *ud
 
 	SBP2_DEBUG("sbp2_alloc_device");
 
-	scsi_id = kmalloc(sizeof(*scsi_id), GFP_KERNEL);
+	scsi_id = kzalloc(sizeof(*scsi_id), GFP_KERNEL);
 	if (!scsi_id) {
 		SBP2_ERR("failed to create scsi_id");
 		goto failed_alloc;
 	}
-	memset(scsi_id, 0, sizeof(*scsi_id));
 
 	scsi_id->ne = ud->ne;
 	scsi_id->ud = ud;
diff --git a/drivers/ieee1394/video1394.c b/drivers/ieee1394/video1394.c
index 23911da..2ad30cd 100644
--- a/drivers/ieee1394/video1394.c
+++ b/drivers/ieee1394/video1394.c
@@ -206,14 +206,12 @@ alloc_dma_iso_ctx(struct ti_ohci *ohci, int type, int num_desc,
 	struct dma_iso_ctx *d;
 	int i;
 
-	d = kmalloc(sizeof(struct dma_iso_ctx), GFP_KERNEL);
-	if (d == NULL) {
+	d = kzalloc(sizeof(*d), GFP_KERNEL);
+	if (!d) {
 		PRINT(KERN_ERR, ohci->host->id, "Failed to allocate dma_iso_ctx");
 		return NULL;
 	}
 
-	memset(d, 0, sizeof *d);
-
 	d->ohci = ohci;
 	d->type = type;
 	d->channel = channel;
@@ -251,9 +249,8 @@ alloc_dma_iso_ctx(struct ti_ohci *ohci, int type, int num_desc,
 	}
 	d->ctx = d->iso_tasklet.context;
 
-	d->prg_reg = kmalloc(d->num_desc * sizeof(struct dma_prog_region),
-			GFP_KERNEL);
-	if (d->prg_reg == NULL) {
+	d->prg_reg = kmalloc(d->num_desc * sizeof(*d->prg_reg), GFP_KERNEL);
+	if (!d->prg_reg) {
 		PRINT(KERN_ERR, ohci->host->id, "Failed to allocate ir prg regs");
 		free_dma_iso_ctx(d);
 		return NULL;
@@ -268,15 +265,14 @@ alloc_dma_iso_ctx(struct ti_ohci *ohci, int type, int num_desc,
 		d->cmdPtr = OHCI1394_IsoRcvCommandPtr+32*d->ctx;
 		d->ctxMatch = OHCI1394_IsoRcvContextMatch+32*d->ctx;
 
-		d->ir_prg = kmalloc(d->num_desc * sizeof(struct dma_cmd *),
+		d->ir_prg = kzalloc(d->num_desc * sizeof(*d->ir_prg),
 				    GFP_KERNEL);
 
-		if (d->ir_prg == NULL) {
+		if (!d->ir_prg) {
 			PRINT(KERN_ERR, ohci->host->id, "Failed to allocate dma ir prg");
 			free_dma_iso_ctx(d);
 			return NULL;
 		}
-		memset(d->ir_prg, 0, d->num_desc * sizeof(struct dma_cmd *));
 
 		d->nb_cmd = d->buf_size / PAGE_SIZE + 1;
 		d->left_size = (d->frame_size % PAGE_SIZE) ?
@@ -297,16 +293,15 @@ alloc_dma_iso_ctx(struct ti_ohci *ohci, int type, int num_desc,
 		d->ctrlClear = OHCI1394_IsoXmitContextControlClear+16*d->ctx;
 		d->cmdPtr = OHCI1394_IsoXmitCommandPtr+16*d->ctx;
 
-		d->it_prg = kmalloc(d->num_desc * sizeof(struct it_dma_prg *),
+		d->it_prg = kzalloc(d->num_desc * sizeof(*d->it_prg),
 				    GFP_KERNEL);
 
-		if (d->it_prg == NULL) {
+		if (!d->it_prg) {
 			PRINT(KERN_ERR, ohci->host->id,
 			      "Failed to allocate dma it prg");
 			free_dma_iso_ctx(d);
 			return NULL;
 		}
-		memset(d->it_prg, 0, d->num_desc*sizeof(struct it_dma_prg *));
 
 		d->packet_size = packet_size;
 
@@ -337,47 +332,24 @@ alloc_dma_iso_ctx(struct ti_ohci *ohci, int type, int num_desc,
 		}
 	}
 
-	d->buffer_status = kmalloc(d->num_desc * sizeof(unsigned int),
-				   GFP_KERNEL);
-	d->buffer_prg_assignment = kmalloc(d->num_desc * sizeof(unsigned int),
-				   GFP_KERNEL);
-	d->buffer_time = kmalloc(d->num_desc * sizeof(struct timeval),
-				   GFP_KERNEL);
-	d->last_used_cmd = kmalloc(d->num_desc * sizeof(unsigned int),
-				   GFP_KERNEL);
-	d->next_buffer = kmalloc(d->num_desc * sizeof(int),
-				 GFP_KERNEL);
-
-	if (d->buffer_status == NULL) {
-		PRINT(KERN_ERR, ohci->host->id, "Failed to allocate buffer_status");
-		free_dma_iso_ctx(d);
-		return NULL;
-	}
-	if (d->buffer_prg_assignment == NULL) {
-		PRINT(KERN_ERR, ohci->host->id, "Failed to allocate buffer_prg_assignment");
-		free_dma_iso_ctx(d);
-		return NULL;
-	}
-	if (d->buffer_time == NULL) {
-		PRINT(KERN_ERR, ohci->host->id, "Failed to allocate buffer_time");
-		free_dma_iso_ctx(d);
-		return NULL;
-	}
-	if (d->last_used_cmd == NULL) {
-		PRINT(KERN_ERR, ohci->host->id, "Failed to allocate last_used_cmd");
-		free_dma_iso_ctx(d);
-		return NULL;
-	}
-	if (d->next_buffer == NULL) {
-		PRINT(KERN_ERR, ohci->host->id, "Failed to allocate next_buffer");
+	d->buffer_status =
+	    kzalloc(d->num_desc * sizeof(*d->buffer_status), GFP_KERNEL);
+	d->buffer_prg_assignment =
+	    kzalloc(d->num_desc * sizeof(*d->buffer_prg_assignment), GFP_KERNEL);
+	d->buffer_time =
+	    kzalloc(d->num_desc * sizeof(*d->buffer_time), GFP_KERNEL);
+	d->last_used_cmd =
+	    kzalloc(d->num_desc * sizeof(*d->last_used_cmd), GFP_KERNEL);
+	d->next_buffer =
+	    kzalloc(d->num_desc * sizeof(*d->next_buffer), GFP_KERNEL);
+
+	if (!d->buffer_status || !d->buffer_prg_assignment || !d->buffer_time ||
+	    !d->last_used_cmd || !d->next_buffer) {
+		PRINT(KERN_ERR, ohci->host->id,
+		      "Failed to allocate dma_iso_ctx member");
 		free_dma_iso_ctx(d);
 		return NULL;
 	}
-	memset(d->buffer_status, 0, d->num_desc * sizeof(unsigned int));
-	memset(d->buffer_prg_assignment, 0, d->num_desc * sizeof(unsigned int));
-	memset(d->buffer_time, 0, d->num_desc * sizeof(struct timeval));
-	memset(d->last_used_cmd, 0, d->num_desc * sizeof(unsigned int));
-	memset(d->next_buffer, -1, d->num_desc * sizeof(int));
 
         spin_lock_init(&d->lock);
 
@@ -1085,7 +1057,7 @@ static int __video1394_ioctl(struct file *file,
 		}
 
 		if (d->flags & VIDEO1394_VARIABLE_PACKET_SIZE) {
-			int buf_size = d->nb_cmd * sizeof(unsigned int);
+			int buf_size = d->nb_cmd * sizeof(*psizes);
 			struct video1394_queue_variable __user *p = argp;
 			unsigned int __user *qv;
 
@@ -1251,13 +1223,12 @@ static int video1394_open(struct inode *inode, struct file *file)
         if (ohci == NULL)
                 return -EIO;
 
-	ctx = kmalloc(sizeof(struct file_ctx), GFP_KERNEL);
-	if (ctx == NULL)  {
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)  {
 		PRINT(KERN_ERR, ohci->host->id, "Cannot malloc file_ctx");
 		return -ENOMEM;
 	}
 
-	memset(ctx, 0, sizeof(struct file_ctx));
 	ctx->ohci = ohci;
 	INIT_LIST_HEAD(&ctx->context_list);
 	ctx->current_ctx = NULL;
-- 
cgit v1.1


From ef797546a93fffa9d8508e7c8539b352b6678568 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Mon, 7 Nov 2005 06:31:50 -0500
Subject: Remove definitions of unreferenced macros virt_to_page and vmalloc_32
 from dv1394 and video1394.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/dv1394.c    | 9 ---------
 drivers/ieee1394/video1394.c | 8 --------
 2 files changed, 17 deletions(-)

diff --git a/drivers/ieee1394/dv1394.c b/drivers/ieee1394/dv1394.c
index d204ec7..196db74 100644
--- a/drivers/ieee1394/dv1394.c
+++ b/drivers/ieee1394/dv1394.c
@@ -123,15 +123,6 @@
 
 #include "ohci1394.h"
 
-#ifndef virt_to_page
-#define virt_to_page(x) MAP_NR(x)
-#endif
-
-#ifndef vmalloc_32
-#define vmalloc_32(x) vmalloc(x)
-#endif
-
-
 /* DEBUG LEVELS:
    0 - no debugging messages
    1 - some debugging messages, but none during DMA frame transmission
diff --git a/drivers/ieee1394/video1394.c b/drivers/ieee1394/video1394.c
index 2ad30cd..07050f0 100644
--- a/drivers/ieee1394/video1394.c
+++ b/drivers/ieee1394/video1394.c
@@ -77,14 +77,6 @@
 
 #define ISO_CHANNELS 64
 
-#ifndef virt_to_page
-#define virt_to_page(x) MAP_NR(x)
-#endif
-
-#ifndef vmalloc_32
-#define vmalloc_32(x) vmalloc(x)
-#endif
-
 struct it_dma_prg {
 	struct dma_cmd begin;
 	quadlet_t data[4];
-- 
cgit v1.1


From 7301c8d3a05dc52d33598364da7c4eb6ab6357eb Mon Sep 17 00:00:00 2001
From: Jody McIntyre <scjody@steamballoon.com>
Date: Fri, 18 Nov 2005 00:16:26 -0500
Subject: Remove amdtp, cmp drivers.

Remove the Audio and Music Data Transmission Protocol driver and the
Connection Management Procedures driver.  These are incomplete, have never
worked, and are better implemented in userland via raw1394 (see
http://freebob.sourceforge.net/ for example.)

Signed-off-by: Jody McIntyre <scjody@steamballoon.com>
Cc: Adrian Bunk <bunk@stusta.de>
---
 Documentation/feature-removal-schedule.txt | 11 -----------
 drivers/ieee1394/Kconfig                   | 23 -----------------------
 drivers/ieee1394/Makefile                  |  2 --
 drivers/ieee1394/ieee1394-ioctl.h          |  8 --------
 drivers/ieee1394/ohci1394.h                |  4 ++--
 5 files changed, 2 insertions(+), 46 deletions(-)

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index b67189a..daaf03e 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -40,17 +40,6 @@ Who:	Paul E. McKenney <paulmck@us.ibm.com>
 
 ---------------------------
 
-What:	IEEE1394 Audio and Music Data Transmission Protocol driver,
-	Connection Management Procedures driver
-When:	November 2005
-Files:	drivers/ieee1394/{amdtp,cmp}*
-Why:	These are incomplete, have never worked, and are better implemented
-	in userland via raw1394 (see http://freebob.sourceforge.net/ for
-	example.)
-Who:	Jody McIntyre <scjody@steamballoon.com>
-
----------------------------
-
 What:	raw1394: requests of type RAW1394_REQ_ISO_SEND, RAW1394_REQ_ISO_LISTEN
 When:	November 2005
 Why:	Deprecated in favour of the new ioctl-based rawiso interface, which is
diff --git a/drivers/ieee1394/Kconfig b/drivers/ieee1394/Kconfig
index 25103a0..39142e2 100644
--- a/drivers/ieee1394/Kconfig
+++ b/drivers/ieee1394/Kconfig
@@ -169,27 +169,4 @@ config IEEE1394_RAWIO
 	  To compile this driver as a module, say M here: the
 	  module will be called raw1394.
 
-config IEEE1394_CMP
-	tristate "IEC61883-1 Plug support"
-	depends on IEEE1394
-	help
-	  This option enables the Connection Management Procedures
-	  (IEC61883-1) driver, which implements input and output plugs.
-
-	  To compile this driver as a module, say M here: the
-	  module will be called cmp.
-
-config IEEE1394_AMDTP
-	tristate "IEC61883-6 (Audio transmission) support"
-	depends on IEEE1394 && IEEE1394_OHCI1394 && IEEE1394_CMP
-	help
-	  This option enables the Audio & Music Data Transmission Protocol
-	  (IEC61883-6) driver, which implements audio transmission over
-	  IEEE1394.
-
-	  The userspace interface is documented in amdtp.h.
-
-	  To compile this driver as a module, say M here: the
-	  module will be called amdtp.
-
 endmenu
diff --git a/drivers/ieee1394/Makefile b/drivers/ieee1394/Makefile
index e8b4d48..6f53611 100644
--- a/drivers/ieee1394/Makefile
+++ b/drivers/ieee1394/Makefile
@@ -14,8 +14,6 @@ obj-$(CONFIG_IEEE1394_RAWIO) += raw1394.o
 obj-$(CONFIG_IEEE1394_SBP2) += sbp2.o
 obj-$(CONFIG_IEEE1394_DV1394) += dv1394.o
 obj-$(CONFIG_IEEE1394_ETH1394) += eth1394.o
-obj-$(CONFIG_IEEE1394_AMDTP) += amdtp.o
-obj-$(CONFIG_IEEE1394_CMP) += cmp.o
 
 quiet_cmd_oui2c = OUI2C   $@
       cmd_oui2c = $(CONFIG_SHELL) $(srctree)/$(src)/oui2c.sh < $< > $@
diff --git a/drivers/ieee1394/ieee1394-ioctl.h b/drivers/ieee1394/ieee1394-ioctl.h
index f92b566..1567039 100644
--- a/drivers/ieee1394/ieee1394-ioctl.h
+++ b/drivers/ieee1394/ieee1394-ioctl.h
@@ -7,14 +7,6 @@
 #include <linux/ioctl.h>
 #include <linux/types.h>
 
-
-/* AMDTP Gets 6 */
-#define AMDTP_IOC_CHANNEL	_IOW('#', 0x00, struct amdtp_ioctl)
-#define AMDTP_IOC_PLUG		_IOW('#', 0x01, struct amdtp_ioctl)
-#define AMDTP_IOC_PING		_IOW('#', 0x02, struct amdtp_ioctl)
-#define AMDTP_IOC_ZAP		_IO ('#', 0x03)
-
-
 /* DV1394 Gets 10 */
 
 /* Get the driver ready to transmit video.  pass a struct dv1394_init* as
diff --git a/drivers/ieee1394/ohci1394.h b/drivers/ieee1394/ohci1394.h
index cc66c1c..7df0962 100644
--- a/drivers/ieee1394/ohci1394.h
+++ b/drivers/ieee1394/ohci1394.h
@@ -219,8 +219,8 @@ struct ti_ohci {
 
 	int self_id_errors;
 
-	/* Tasklets for iso receive and transmit, used by video1394,
-	 * amdtp and dv1394 */
+	/* Tasklets for iso receive and transmit, used by video1394
+	 * and dv1394 */
 
 	struct list_head iso_tasklet_list;
 	spinlock_t iso_tasklet_list_lock;
-- 
cgit v1.1


From e27d3014f301e6aee7b65b62ad1da2940e1fd8de Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sat, 19 Nov 2005 21:23:48 -0500
Subject: Every file should #include the headers containing the prototypes for
 its global functions.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/ieee1394_transactions.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/ieee1394/ieee1394_transactions.c b/drivers/ieee1394/ieee1394_transactions.c
index 0aa8763..81b983c 100644
--- a/drivers/ieee1394/ieee1394_transactions.c
+++ b/drivers/ieee1394/ieee1394_transactions.c
@@ -22,6 +22,7 @@
 #include "ieee1394_core.h"
 #include "highlevel.h"
 #include "nodemgr.h"
+#include "ieee1394_transactions.h"
 
 
 #define PREP_ASYNC_HEAD_ADDRESS(tc) \
-- 
cgit v1.1


From e4cda1654e5c0be4b68e29011e8dc04977286df9 Mon Sep 17 00:00:00 2001
From: Damien Douxchamps <ddouxchamps@users.sf.net>
Date: Sat, 19 Nov 2005 21:32:03 -0500
Subject: Fix incorrect video1394 timestamps.

This patch fixes the incoherent timestamps generated by video1394 since
the single-buffer patch was applied in 2.6.11. Credits have also been
removed from the header and a "//" comment was changed to "/* */".

Signed-off-by: Damien Douxchamps <ddouxchamps@users.sf.net>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/video1394.c | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/drivers/ieee1394/video1394.c b/drivers/ieee1394/video1394.c
index 07050f0..608479b 100644
--- a/drivers/ieee1394/video1394.c
+++ b/drivers/ieee1394/video1394.c
@@ -19,12 +19,6 @@
  *
  * NOTES:
  *
- * jds -- add private data to file to keep track of iso contexts associated
- * with each open -- so release won't kill all iso transfers.
- * 
- * Damien Douxchamps: Fix failure when the number of DMA pages per frame is
- * one.
- * 
  * ioctl return codes:
  * EFAULT is only for invalid address for the argp
  * EINVAL for out of range values
@@ -34,12 +28,6 @@
  * ENOTTY for unsupported ioctl request
  *
  */
-
-/* Markus Tavenrath <speedygoo@speedygoo.de> :
-   - fixed checks for valid buffer-numbers in video1394_icotl
-   - changed the ways the dma prg's are used, now it's possible to use
-     even a single dma buffer
-*/
 #include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
@@ -503,7 +491,7 @@ static void wakeup_dma_ir_ctx(unsigned long l)
 		if (d->ir_prg[i][d->nb_cmd-1].status & cpu_to_le32(0xFFFF0000)) {
 			reset_ir_status(d, i);
 			d->buffer_status[d->buffer_prg_assignment[i]] = VIDEO1394_BUFFER_READY;
-			do_gettimeofday(&d->buffer_time[i]);
+			do_gettimeofday(&d->buffer_time[d->buffer_prg_assignment[i]]);
 		}
 	}
 
@@ -1010,7 +998,6 @@ static int __video1394_ioctl(struct file *file,
 
 		/* set time of buffer */
 		v.filltime = d->buffer_time[v.buffer];
-//		printk("Buffer %d time %d\n", v.buffer, (d->buffer_time[v.buffer]).tv_usec);
 
 		/*
 		 * Look ahead to see how many more buffers have been received
@@ -1068,7 +1055,7 @@ static int __video1394_ioctl(struct file *file,
 
 		spin_lock_irqsave(&d->lock,flags);
 
-		// last_buffer is last_prg
+		/* last_buffer is last_prg */
 		next_prg = (d->last_buffer + 1) % d->num_desc;
 		if (d->buffer_status[v.buffer]!=VIDEO1394_BUFFER_FREE) {
 			PRINT(KERN_ERR, ohci->host->id,
-- 
cgit v1.1


From 977545e35289b13981614a57fd6c9b82d55e3b4a Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Sat, 19 Nov 2005 21:35:22 -0500
Subject: sbp2: slimmer interface to scsi_mod

- sbp2scsi_reset does not need to take host_lock
- sbp2scsi_reset, as our device reset handler, does not need to stand in as
  bus reset or host reset handler
- let scsi_mod use scsi_host_template.name instead of .info
  (sbp2 is not an emulation anyway)

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/sbp2.c | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/drivers/ieee1394/sbp2.c b/drivers/ieee1394/sbp2.c
index 84875cd..f0763b7 100644
--- a/drivers/ieee1394/sbp2.c
+++ b/drivers/ieee1394/sbp2.c
@@ -2609,27 +2609,17 @@ static int sbp2scsi_reset(struct scsi_cmnd *SCpnt)
 {
 	struct scsi_id_instance_data *scsi_id =
 		(struct scsi_id_instance_data *)SCpnt->device->host->hostdata[0];
-	unsigned long flags;
 
 	SBP2_ERR("reset requested");
 
-	spin_lock_irqsave(SCpnt->device->host->host_lock, flags);
-
 	if (sbp2util_node_is_available(scsi_id)) {
 		SBP2_ERR("Generating sbp2 fetch agent reset");
 		sbp2_agent_reset(scsi_id, 0);
 	}
 
-	spin_unlock_irqrestore(SCpnt->device->host->host_lock, flags);
-
 	return SUCCESS;
 }
 
-static const char *sbp2scsi_info(struct Scsi_Host *host)
-{
-	return "SCSI emulation for IEEE-1394 SBP-2 Devices";
-}
-
 static ssize_t sbp2_sysfs_ieee1394_id_show(struct device *dev,
 					   struct device_attribute *attr,
 					   char *buf)
@@ -2666,12 +2656,9 @@ static struct scsi_host_template scsi_driver_template = {
 	.module =			THIS_MODULE,
 	.name =				"SBP-2 IEEE-1394",
 	.proc_name =			SBP2_DEVICE_NAME,
-	.info =				sbp2scsi_info,
 	.queuecommand =			sbp2scsi_queuecommand,
 	.eh_abort_handler =		sbp2scsi_abort,
 	.eh_device_reset_handler =	sbp2scsi_reset,
-	.eh_bus_reset_handler =		sbp2scsi_reset,
-	.eh_host_reset_handler =	sbp2scsi_reset,
 	.slave_alloc =			sbp2scsi_slave_alloc,
 	.slave_configure =		sbp2scsi_slave_configure,
 	.slave_destroy =		sbp2scsi_slave_destroy,
-- 
cgit v1.1


From d734f92b0dc4c04daa2e0106354972cbbc2e0fbe Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 21 Nov 2005 17:32:14 -0500
Subject: drivers/ieee1394/raw1394.c: fix a NULL pointer

The coverity checker spotted that this was a NULL pointer dereference in
the "if (copy_from_user(...))" case since the next step is to
kfree(cache->filled_head).

There's no need to free cache at this point, since it is freed
later.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/raw1394.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/ieee1394/raw1394.c b/drivers/ieee1394/raw1394.c
index 0278dc5d..99b2ce1 100644
--- a/drivers/ieee1394/raw1394.c
+++ b/drivers/ieee1394/raw1394.c
@@ -2131,7 +2131,6 @@ static int modify_config_rom(struct file_info *fi, struct pending_request *req)
 			   req->req.length)) {
 		csr1212_release_keyval(fi->csr1212_dirs[dr]);
 		fi->csr1212_dirs[dr] = NULL;
-		CSR1212_FREE(cache);
 		ret = -EFAULT;
 	} else {
 		cache->len = req->req.length;
-- 
cgit v1.1


From b12479ddce4aed112e0018fdf8bbb7cfb349ebdc Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Mon, 21 Nov 2005 17:32:18 -0500
Subject: raw1394: fix memory deallocation in modify_config_rom

raw1394: use correct deallocation macro for CSR cache

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/raw1394.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/ieee1394/raw1394.c b/drivers/ieee1394/raw1394.c
index 99b2ce1..89cac1f 100644
--- a/drivers/ieee1394/raw1394.c
+++ b/drivers/ieee1394/raw1394.c
@@ -2166,7 +2166,7 @@ static int modify_config_rom(struct file_info *fi, struct pending_request *req)
 		}
 	}
 	kfree(cache->filled_head);
-	kfree(cache);
+	CSR1212_FREE(cache);
 
 	if (ret >= 0) {
 		/* we have to free the request, because we queue no response,
-- 
cgit v1.1


From 5303a986c33ae6c75d5ffb57d06ccf9246a8725a Mon Sep 17 00:00:00 2001
From: Jody McIntyre <scjody@steamballoon.com>
Date: Tue, 22 Nov 2005 12:17:11 -0500
Subject: csr1212: check results of keyval reads

csr1212_parse_csr() did not properly check return values when reading
keyvals.  Fix this by using _csr1212_read_keyval() instead of
csr1212_get_keyval() and checking the return code.

Signed-off-by: Jody McIntyre <scjody@steamballoon.com>
---
 drivers/ieee1394/csr1212.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/ieee1394/csr1212.c b/drivers/ieee1394/csr1212.c
index c0f8ed6..4812d59 100644
--- a/drivers/ieee1394/csr1212.c
+++ b/drivers/ieee1394/csr1212.c
@@ -1610,15 +1610,16 @@ int csr1212_parse_csr(struct csr1212_csr *csr)
 	csr->root_kv->valid = 0;
 	csr->root_kv->next = csr->root_kv;
 	csr->root_kv->prev = csr->root_kv;
-	csr1212_get_keyval(csr, csr->root_kv);
+	ret = _csr1212_read_keyval(csr, csr->root_kv);
+	if (ret != CSR1212_SUCCESS)
+		return ret;
 
 	/* Scan through the Root directory finding all extended ROM regions
 	 * and make cache regions for them */
 	for (dentry = csr->root_kv->value.directory.dentries_head;
 	     dentry; dentry = dentry->next) {
 		if (dentry->kv->key.id == CSR1212_KV_ID_EXTENDED_ROM) {
-			csr1212_get_keyval(csr, dentry->kv);
-
+			ret = _csr1212_read_keyval(csr, dentry->kv);
 			if (ret != CSR1212_SUCCESS)
 				return ret;
 		}
-- 
cgit v1.1


From a96074e76f87a4f658af4ecfd95edc89cfd61fc1 Mon Sep 17 00:00:00 2001
From: Jody McIntyre <scjdy@steamballoon.com>
Date: Tue, 22 Nov 2005 12:17:14 -0500
Subject: csr1212: add check for !valid

Don't read the keyval if there's already a valid one in place.  May not be
necessary but shouldn't hurt.

Signed-off-by: Jody McIntyre <scjdy@steamballoon.com>
---
 drivers/ieee1394/csr1212.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/ieee1394/csr1212.c b/drivers/ieee1394/csr1212.c
index 4812d59..1577354 100644
--- a/drivers/ieee1394/csr1212.c
+++ b/drivers/ieee1394/csr1212.c
@@ -1618,7 +1618,8 @@ int csr1212_parse_csr(struct csr1212_csr *csr)
 	 * and make cache regions for them */
 	for (dentry = csr->root_kv->value.directory.dentries_head;
 	     dentry; dentry = dentry->next) {
-		if (dentry->kv->key.id == CSR1212_KV_ID_EXTENDED_ROM) {
+		if (dentry->kv->key.id == CSR1212_KV_ID_EXTENDED_ROM &&
+			!dentry->kv->valid) {
 			ret = _csr1212_read_keyval(csr, dentry->kv);
 			if (ret != CSR1212_SUCCESS)
 				return ret;
-- 
cgit v1.1


From 6649e92d792efa00a823781bcee2dba7f21199ba Mon Sep 17 00:00:00 2001
From: Jens-Michael Hoffmann <jensmh@gmx.de>
Date: Tue, 22 Nov 2005 12:18:28 -0500
Subject: ieee1394/dma: LIndent fixes

This patch contains fixes by LIndent.

Signed-off-by: Jens-Michael Hoffmann <jensmh@gmx.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/dma.c | 73 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 45 insertions(+), 28 deletions(-)

diff --git a/drivers/ieee1394/dma.c b/drivers/ieee1394/dma.c
index b79ddb4..9fb2769 100644
--- a/drivers/ieee1394/dma.c
+++ b/drivers/ieee1394/dma.c
@@ -23,7 +23,8 @@ void dma_prog_region_init(struct dma_prog_region *prog)
 	prog->bus_addr = 0;
 }
 
-int  dma_prog_region_alloc(struct dma_prog_region *prog, unsigned long n_bytes, struct pci_dev *dev)
+int dma_prog_region_alloc(struct dma_prog_region *prog, unsigned long n_bytes,
+			  struct pci_dev *dev)
 {
 	/* round up to page size */
 	n_bytes = PAGE_ALIGN(n_bytes);
@@ -32,7 +33,8 @@ int  dma_prog_region_alloc(struct dma_prog_region *prog, unsigned long n_bytes,
 
 	prog->kvirt = pci_alloc_consistent(dev, n_bytes, &prog->bus_addr);
 	if (!prog->kvirt) {
-		printk(KERN_ERR "dma_prog_region_alloc: pci_alloc_consistent() failed\n");
+		printk(KERN_ERR
+		       "dma_prog_region_alloc: pci_alloc_consistent() failed\n");
 		dma_prog_region_free(prog);
 		return -ENOMEM;
 	}
@@ -45,7 +47,8 @@ int  dma_prog_region_alloc(struct dma_prog_region *prog, unsigned long n_bytes,
 void dma_prog_region_free(struct dma_prog_region *prog)
 {
 	if (prog->kvirt) {
-		pci_free_consistent(prog->dev, prog->n_pages << PAGE_SHIFT, prog->kvirt, prog->bus_addr);
+		pci_free_consistent(prog->dev, prog->n_pages << PAGE_SHIFT,
+				    prog->kvirt, prog->bus_addr);
 	}
 
 	prog->kvirt = NULL;
@@ -65,7 +68,8 @@ void dma_region_init(struct dma_region *dma)
 	dma->sglist = NULL;
 }
 
-int dma_region_alloc(struct dma_region *dma, unsigned long n_bytes, struct pci_dev *dev, int direction)
+int dma_region_alloc(struct dma_region *dma, unsigned long n_bytes,
+		     struct pci_dev *dev, int direction)
 {
 	unsigned int i;
 
@@ -95,14 +99,16 @@ int dma_region_alloc(struct dma_region *dma, unsigned long n_bytes, struct pci_d
 
 	/* fill scatter/gather list with pages */
 	for (i = 0; i < dma->n_pages; i++) {
-		unsigned long va = (unsigned long) dma->kvirt + (i << PAGE_SHIFT);
+		unsigned long va =
+		    (unsigned long)dma->kvirt + (i << PAGE_SHIFT);
 
 		dma->sglist[i].page = vmalloc_to_page((void *)va);
 		dma->sglist[i].length = PAGE_SIZE;
 	}
 
 	/* map sglist to the IOMMU */
-	dma->n_dma_pages = pci_map_sg(dev, dma->sglist, dma->n_pages, direction);
+	dma->n_dma_pages =
+	    pci_map_sg(dev, dma->sglist, dma->n_pages, direction);
 
 	if (dma->n_dma_pages == 0) {
 		printk(KERN_ERR "dma_region_alloc: pci_map_sg() failed\n");
@@ -114,7 +120,7 @@ int dma_region_alloc(struct dma_region *dma, unsigned long n_bytes, struct pci_d
 
 	return 0;
 
-err:
+      err:
 	dma_region_free(dma);
 	return -ENOMEM;
 }
@@ -122,7 +128,8 @@ err:
 void dma_region_free(struct dma_region *dma)
 {
 	if (dma->n_dma_pages) {
-		pci_unmap_sg(dma->dev, dma->sglist, dma->n_pages, dma->direction);
+		pci_unmap_sg(dma->dev, dma->sglist, dma->n_pages,
+			     dma->direction);
 		dma->n_dma_pages = 0;
 		dma->dev = NULL;
 	}
@@ -137,7 +144,8 @@ void dma_region_free(struct dma_region *dma)
 
 /* find the scatterlist index and remaining offset corresponding to a
    given offset from the beginning of the buffer */
-static inline int dma_region_find(struct dma_region *dma, unsigned long offset, unsigned long *rem)
+static inline int dma_region_find(struct dma_region *dma, unsigned long offset,
+				  unsigned long *rem)
 {
 	int i;
 	unsigned long off = offset;
@@ -156,15 +164,18 @@ static inline int dma_region_find(struct dma_region *dma, unsigned long offset,
 	return i;
 }
 
-dma_addr_t dma_region_offset_to_bus(struct dma_region *dma, unsigned long offset)
+dma_addr_t dma_region_offset_to_bus(struct dma_region * dma,
+				    unsigned long offset)
 {
 	unsigned long rem = 0;
 
-	struct scatterlist *sg = &dma->sglist[dma_region_find(dma, offset, &rem)];
+	struct scatterlist *sg =
+	    &dma->sglist[dma_region_find(dma, offset, &rem)];
 	return sg_dma_address(sg) + rem;
 }
 
-void dma_region_sync_for_cpu(struct dma_region *dma, unsigned long offset, unsigned long len)
+void dma_region_sync_for_cpu(struct dma_region *dma, unsigned long offset,
+			     unsigned long len)
 {
 	int first, last;
 	unsigned long rem;
@@ -175,10 +186,12 @@ void dma_region_sync_for_cpu(struct dma_region *dma, unsigned long offset, unsig
 	first = dma_region_find(dma, offset, &rem);
 	last = dma_region_find(dma, offset + len - 1, &rem);
 
-	pci_dma_sync_sg_for_cpu(dma->dev, &dma->sglist[first], last - first + 1, dma->direction);
+	pci_dma_sync_sg_for_cpu(dma->dev, &dma->sglist[first], last - first + 1,
+				dma->direction);
 }
 
-void dma_region_sync_for_device(struct dma_region *dma, unsigned long offset, unsigned long len)
+void dma_region_sync_for_device(struct dma_region *dma, unsigned long offset,
+				unsigned long len)
 {
 	int first, last;
 	unsigned long rem;
@@ -189,44 +202,47 @@ void dma_region_sync_for_device(struct dma_region *dma, unsigned long offset, un
 	first = dma_region_find(dma, offset, &rem);
 	last = dma_region_find(dma, offset + len - 1, &rem);
 
-	pci_dma_sync_sg_for_device(dma->dev, &dma->sglist[first], last - first + 1, dma->direction);
+	pci_dma_sync_sg_for_device(dma->dev, &dma->sglist[first],
+				   last - first + 1, dma->direction);
 }
 
 #ifdef CONFIG_MMU
 
 /* nopage() handler for mmap access */
 
-static struct page*
-dma_region_pagefault(struct vm_area_struct *area, unsigned long address, int *type)
+static struct page *dma_region_pagefault(struct vm_area_struct *area,
+					 unsigned long address, int *type)
 {
 	unsigned long offset;
 	unsigned long kernel_virt_addr;
 	struct page *ret = NOPAGE_SIGBUS;
 
-	struct dma_region *dma = (struct dma_region*) area->vm_private_data;
+	struct dma_region *dma = (struct dma_region *)area->vm_private_data;
 
 	if (!dma->kvirt)
 		goto out;
 
-	if ( (address < (unsigned long) area->vm_start) ||
-	    (address > (unsigned long) area->vm_start + (dma->n_pages << PAGE_SHIFT)) )
+	if ((address < (unsigned long)area->vm_start) ||
+	    (address >
+	     (unsigned long)area->vm_start + (dma->n_pages << PAGE_SHIFT)))
 		goto out;
 
 	if (type)
 		*type = VM_FAULT_MINOR;
 	offset = address - area->vm_start;
-	kernel_virt_addr = (unsigned long) dma->kvirt + offset;
-	ret = vmalloc_to_page((void*) kernel_virt_addr);
+	kernel_virt_addr = (unsigned long)dma->kvirt + offset;
+	ret = vmalloc_to_page((void *)kernel_virt_addr);
 	get_page(ret);
-out:
+      out:
 	return ret;
 }
 
 static struct vm_operations_struct dma_region_vm_ops = {
-	.nopage	= dma_region_pagefault,
+	.nopage = dma_region_pagefault,
 };
 
-int dma_region_mmap(struct dma_region *dma, struct file *file, struct vm_area_struct *vma)
+int dma_region_mmap(struct dma_region *dma, struct file *file,
+		    struct vm_area_struct *vma)
 {
 	unsigned long size;
 
@@ -250,11 +266,12 @@ int dma_region_mmap(struct dma_region *dma, struct file *file, struct vm_area_st
 	return 0;
 }
 
-#else /* CONFIG_MMU */
+#else				/* CONFIG_MMU */
 
-int dma_region_mmap(struct dma_region *dma, struct file *file, struct vm_area_struct *vma)
+int dma_region_mmap(struct dma_region *dma, struct file *file,
+		    struct vm_area_struct *vma)
 {
 	return -EINVAL;
 }
 
-#endif /* CONFIG_MMU */
+#endif				/* CONFIG_MMU */
-- 
cgit v1.1


From 16c333a34a1a0441c54c4fe5cf6052716f95c2fa Mon Sep 17 00:00:00 2001
From: Jens-Michael Hoffmann <jensmh@gmx.de>
Date: Tue, 22 Nov 2005 12:34:16 -0500
Subject: ieee1394/ieee1394_transactions: LIndent fixes

This patch contains fixes by LIndent.

Signed-off-by: Jens-Michael Hoffmann <jensmh@gmx.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/ieee1394_transactions.c | 388 ++++++++++++++++---------------
 1 file changed, 195 insertions(+), 193 deletions(-)

diff --git a/drivers/ieee1394/ieee1394_transactions.c b/drivers/ieee1394/ieee1394_transactions.c
index 81b983c..3fe2f6c 100644
--- a/drivers/ieee1394/ieee1394_transactions.c
+++ b/drivers/ieee1394/ieee1394_transactions.c
@@ -24,7 +24,6 @@
 #include "nodemgr.h"
 #include "ieee1394_transactions.h"
 
-
 #define PREP_ASYNC_HEAD_ADDRESS(tc) \
         packet->tcode = tc; \
         packet->header[0] = (packet->node_id << 16) | (packet->tlabel << 10) \
@@ -32,80 +31,82 @@
         packet->header[1] = (packet->host->node_id << 16) | (addr >> 32); \
         packet->header[2] = addr & 0xffffffff
 
-
 static void fill_async_readquad(struct hpsb_packet *packet, u64 addr)
 {
-        PREP_ASYNC_HEAD_ADDRESS(TCODE_READQ);
-        packet->header_size = 12;
-        packet->data_size = 0;
-        packet->expect_response = 1;
+	PREP_ASYNC_HEAD_ADDRESS(TCODE_READQ);
+	packet->header_size = 12;
+	packet->data_size = 0;
+	packet->expect_response = 1;
 }
 
-static void fill_async_readblock(struct hpsb_packet *packet, u64 addr, int length)
+static void fill_async_readblock(struct hpsb_packet *packet, u64 addr,
+				 int length)
 {
-        PREP_ASYNC_HEAD_ADDRESS(TCODE_READB);
-        packet->header[3] = length << 16;
-        packet->header_size = 16;
-        packet->data_size = 0;
-        packet->expect_response = 1;
+	PREP_ASYNC_HEAD_ADDRESS(TCODE_READB);
+	packet->header[3] = length << 16;
+	packet->header_size = 16;
+	packet->data_size = 0;
+	packet->expect_response = 1;
 }
 
-static void fill_async_writequad(struct hpsb_packet *packet, u64 addr, quadlet_t data)
+static void fill_async_writequad(struct hpsb_packet *packet, u64 addr,
+				 quadlet_t data)
 {
-        PREP_ASYNC_HEAD_ADDRESS(TCODE_WRITEQ);
-        packet->header[3] = data;
-        packet->header_size = 16;
-        packet->data_size = 0;
-        packet->expect_response = 1;
+	PREP_ASYNC_HEAD_ADDRESS(TCODE_WRITEQ);
+	packet->header[3] = data;
+	packet->header_size = 16;
+	packet->data_size = 0;
+	packet->expect_response = 1;
 }
 
-static void fill_async_writeblock(struct hpsb_packet *packet, u64 addr, int length)
+static void fill_async_writeblock(struct hpsb_packet *packet, u64 addr,
+				  int length)
 {
-        PREP_ASYNC_HEAD_ADDRESS(TCODE_WRITEB);
-        packet->header[3] = length << 16;
-        packet->header_size = 16;
-        packet->expect_response = 1;
-        packet->data_size = length + (length % 4 ? 4 - (length % 4) : 0);
+	PREP_ASYNC_HEAD_ADDRESS(TCODE_WRITEB);
+	packet->header[3] = length << 16;
+	packet->header_size = 16;
+	packet->expect_response = 1;
+	packet->data_size = length + (length % 4 ? 4 - (length % 4) : 0);
 }
 
 static void fill_async_lock(struct hpsb_packet *packet, u64 addr, int extcode,
-                     int length)
+			    int length)
 {
-        PREP_ASYNC_HEAD_ADDRESS(TCODE_LOCK_REQUEST);
-        packet->header[3] = (length << 16) | extcode;
-        packet->header_size = 16;
-        packet->data_size = length;
-        packet->expect_response = 1;
+	PREP_ASYNC_HEAD_ADDRESS(TCODE_LOCK_REQUEST);
+	packet->header[3] = (length << 16) | extcode;
+	packet->header_size = 16;
+	packet->data_size = length;
+	packet->expect_response = 1;
 }
 
 static void fill_iso_packet(struct hpsb_packet *packet, int length, int channel,
-                     int tag, int sync)
+			    int tag, int sync)
 {
-        packet->header[0] = (length << 16) | (tag << 14) | (channel << 8)
-                | (TCODE_ISO_DATA << 4) | sync;
+	packet->header[0] = (length << 16) | (tag << 14) | (channel << 8)
+	    | (TCODE_ISO_DATA << 4) | sync;
 
-        packet->header_size = 4;
-        packet->data_size = length;
-        packet->type = hpsb_iso;
-        packet->tcode = TCODE_ISO_DATA;
+	packet->header_size = 4;
+	packet->data_size = length;
+	packet->type = hpsb_iso;
+	packet->tcode = TCODE_ISO_DATA;
 }
 
 static void fill_phy_packet(struct hpsb_packet *packet, quadlet_t data)
 {
-        packet->header[0] = data;
-        packet->header[1] = ~data;
-        packet->header_size = 8;
-        packet->data_size = 0;
-        packet->expect_response = 0;
-        packet->type = hpsb_raw;             /* No CRC added */
-        packet->speed_code = IEEE1394_SPEED_100; /* Force speed to be 100Mbps */
+	packet->header[0] = data;
+	packet->header[1] = ~data;
+	packet->header_size = 8;
+	packet->data_size = 0;
+	packet->expect_response = 0;
+	packet->type = hpsb_raw;	/* No CRC added */
+	packet->speed_code = IEEE1394_SPEED_100;	/* Force speed to be 100Mbps */
 }
 
 static void fill_async_stream_packet(struct hpsb_packet *packet, int length,
 				     int channel, int tag, int sync)
 {
 	packet->header[0] = (length << 16) | (tag << 14) | (channel << 8)
-	                  | (TCODE_STREAM_DATA << 4) | sync;
+	    | (TCODE_STREAM_DATA << 4) | sync;
 
 	packet->header_size = 4;
 	packet->data_size = length;
@@ -172,99 +173,96 @@ int hpsb_get_tlabel(struct hpsb_packet *packet)
  */
 void hpsb_free_tlabel(struct hpsb_packet *packet)
 {
-        unsigned long flags;
+	unsigned long flags;
 	struct hpsb_tlabel_pool *tp;
 
 	tp = &packet->host->tpool[packet->node_id & NODE_MASK];
 
 	BUG_ON(packet->tlabel > 63 || packet->tlabel < 0);
 
-        spin_lock_irqsave(&tp->lock, flags);
+	spin_lock_irqsave(&tp->lock, flags);
 	BUG_ON(!test_and_clear_bit(packet->tlabel, tp->pool));
-        spin_unlock_irqrestore(&tp->lock, flags);
+	spin_unlock_irqrestore(&tp->lock, flags);
 
 	up(&tp->count);
 }
 
-
-
 int hpsb_packet_success(struct hpsb_packet *packet)
 {
-        switch (packet->ack_code) {
-        case ACK_PENDING:
-                switch ((packet->header[1] >> 12) & 0xf) {
-                case RCODE_COMPLETE:
-                        return 0;
-                case RCODE_CONFLICT_ERROR:
-                        return -EAGAIN;
-                case RCODE_DATA_ERROR:
-                        return -EREMOTEIO;
-                case RCODE_TYPE_ERROR:
-                        return -EACCES;
-                case RCODE_ADDRESS_ERROR:
-                        return -EINVAL;
-                default:
-                        HPSB_ERR("received reserved rcode %d from node %d",
-                                 (packet->header[1] >> 12) & 0xf,
-                                 packet->node_id);
-                        return -EAGAIN;
-                }
-                HPSB_PANIC("reached unreachable code 1 in %s", __FUNCTION__);
-
-        case ACK_BUSY_X:
-        case ACK_BUSY_A:
-        case ACK_BUSY_B:
-                return -EBUSY;
-
-        case ACK_TYPE_ERROR:
-                return -EACCES;
-
-        case ACK_COMPLETE:
-                if (packet->tcode == TCODE_WRITEQ
-                    || packet->tcode == TCODE_WRITEB) {
-                        return 0;
-                } else {
-                        HPSB_ERR("impossible ack_complete from node %d "
-                                 "(tcode %d)", packet->node_id, packet->tcode);
-                        return -EAGAIN;
-                }
-
-
-        case ACK_DATA_ERROR:
-                if (packet->tcode == TCODE_WRITEB
-                    || packet->tcode == TCODE_LOCK_REQUEST) {
-                        return -EAGAIN;
-                } else {
-                        HPSB_ERR("impossible ack_data_error from node %d "
-                                 "(tcode %d)", packet->node_id, packet->tcode);
-                        return -EAGAIN;
-                }
-
-        case ACK_ADDRESS_ERROR:
-                return -EINVAL;
-
-        case ACK_TARDY:
-        case ACK_CONFLICT_ERROR:
-        case ACKX_NONE:
-        case ACKX_SEND_ERROR:
-        case ACKX_ABORTED:
-        case ACKX_TIMEOUT:
-                /* error while sending */
-                return -EAGAIN;
-
-        default:
-                HPSB_ERR("got invalid ack %d from node %d (tcode %d)",
-                         packet->ack_code, packet->node_id, packet->tcode);
-                return -EAGAIN;
-        }
-
-        HPSB_PANIC("reached unreachable code 2 in %s", __FUNCTION__);
+	switch (packet->ack_code) {
+	case ACK_PENDING:
+		switch ((packet->header[1] >> 12) & 0xf) {
+		case RCODE_COMPLETE:
+			return 0;
+		case RCODE_CONFLICT_ERROR:
+			return -EAGAIN;
+		case RCODE_DATA_ERROR:
+			return -EREMOTEIO;
+		case RCODE_TYPE_ERROR:
+			return -EACCES;
+		case RCODE_ADDRESS_ERROR:
+			return -EINVAL;
+		default:
+			HPSB_ERR("received reserved rcode %d from node %d",
+				 (packet->header[1] >> 12) & 0xf,
+				 packet->node_id);
+			return -EAGAIN;
+		}
+		HPSB_PANIC("reached unreachable code 1 in %s", __FUNCTION__);
+
+	case ACK_BUSY_X:
+	case ACK_BUSY_A:
+	case ACK_BUSY_B:
+		return -EBUSY;
+
+	case ACK_TYPE_ERROR:
+		return -EACCES;
+
+	case ACK_COMPLETE:
+		if (packet->tcode == TCODE_WRITEQ
+		    || packet->tcode == TCODE_WRITEB) {
+			return 0;
+		} else {
+			HPSB_ERR("impossible ack_complete from node %d "
+				 "(tcode %d)", packet->node_id, packet->tcode);
+			return -EAGAIN;
+		}
+
+	case ACK_DATA_ERROR:
+		if (packet->tcode == TCODE_WRITEB
+		    || packet->tcode == TCODE_LOCK_REQUEST) {
+			return -EAGAIN;
+		} else {
+			HPSB_ERR("impossible ack_data_error from node %d "
+				 "(tcode %d)", packet->node_id, packet->tcode);
+			return -EAGAIN;
+		}
+
+	case ACK_ADDRESS_ERROR:
+		return -EINVAL;
+
+	case ACK_TARDY:
+	case ACK_CONFLICT_ERROR:
+	case ACKX_NONE:
+	case ACKX_SEND_ERROR:
+	case ACKX_ABORTED:
+	case ACKX_TIMEOUT:
+		/* error while sending */
+		return -EAGAIN;
+
+	default:
+		HPSB_ERR("got invalid ack %d from node %d (tcode %d)",
+			 packet->ack_code, packet->node_id, packet->tcode);
+		return -EAGAIN;
+	}
+
+	HPSB_PANIC("reached unreachable code 2 in %s", __FUNCTION__);
 }
 
 struct hpsb_packet *hpsb_make_readpacket(struct hpsb_host *host, nodeid_t node,
 					 u64 addr, size_t length)
 {
-        struct hpsb_packet *packet;
+	struct hpsb_packet *packet;
 
 	if (length == 0)
 		return NULL;
@@ -289,8 +287,9 @@ struct hpsb_packet *hpsb_make_readpacket(struct hpsb_host *host, nodeid_t node,
 	return packet;
 }
 
-struct hpsb_packet *hpsb_make_writepacket (struct hpsb_host *host, nodeid_t node,
-					   u64 addr, quadlet_t *buffer, size_t length)
+struct hpsb_packet *hpsb_make_writepacket(struct hpsb_host *host, nodeid_t node,
+					  u64 addr, quadlet_t * buffer,
+					  size_t length)
 {
 	struct hpsb_packet *packet;
 
@@ -301,7 +300,7 @@ struct hpsb_packet *hpsb_make_writepacket (struct hpsb_host *host, nodeid_t node
 	if (!packet)
 		return NULL;
 
-	if (length % 4) { /* zero padding bytes */
+	if (length % 4) {	/* zero padding bytes */
 		packet->data[length >> 2] = 0;
 	}
 	packet->host = host;
@@ -323,8 +322,9 @@ struct hpsb_packet *hpsb_make_writepacket (struct hpsb_host *host, nodeid_t node
 	return packet;
 }
 
-struct hpsb_packet *hpsb_make_streampacket(struct hpsb_host *host, u8 *buffer, int length,
-                                           int channel, int tag, int sync)
+struct hpsb_packet *hpsb_make_streampacket(struct hpsb_host *host, u8 * buffer,
+					   int length, int channel, int tag,
+					   int sync)
 {
 	struct hpsb_packet *packet;
 
@@ -335,7 +335,7 @@ struct hpsb_packet *hpsb_make_streampacket(struct hpsb_host *host, u8 *buffer, i
 	if (!packet)
 		return NULL;
 
-	if (length % 4) { /* zero padding bytes */
+	if (length % 4) {	/* zero padding bytes */
 		packet->data[length >> 2] = 0;
 	}
 	packet->host = host;
@@ -353,14 +353,15 @@ struct hpsb_packet *hpsb_make_streampacket(struct hpsb_host *host, u8 *buffer, i
 }
 
 struct hpsb_packet *hpsb_make_lockpacket(struct hpsb_host *host, nodeid_t node,
-                                         u64 addr, int extcode, quadlet_t *data,
-					 quadlet_t arg)
+					 u64 addr, int extcode,
+					 quadlet_t * data, quadlet_t arg)
 {
 	struct hpsb_packet *p;
 	u32 length;
 
 	p = hpsb_alloc_packet(8);
-	if (!p) return NULL;
+	if (!p)
+		return NULL;
 
 	p->host = host;
 	p->node_id = node;
@@ -389,15 +390,16 @@ struct hpsb_packet *hpsb_make_lockpacket(struct hpsb_host *host, nodeid_t node,
 	return p;
 }
 
-struct hpsb_packet *hpsb_make_lock64packet(struct hpsb_host *host, nodeid_t node,
-                                           u64 addr, int extcode, octlet_t *data,
-					   octlet_t arg)
+struct hpsb_packet *hpsb_make_lock64packet(struct hpsb_host *host,
+					   nodeid_t node, u64 addr, int extcode,
+					   octlet_t * data, octlet_t arg)
 {
 	struct hpsb_packet *p;
 	u32 length;
 
 	p = hpsb_alloc_packet(16);
-	if (!p) return NULL;
+	if (!p)
+		return NULL;
 
 	p->host = host;
 	p->node_id = node;
@@ -430,18 +432,18 @@ struct hpsb_packet *hpsb_make_lock64packet(struct hpsb_host *host, nodeid_t node
 	return p;
 }
 
-struct hpsb_packet *hpsb_make_phypacket(struct hpsb_host *host,
-                                        quadlet_t data)
+struct hpsb_packet *hpsb_make_phypacket(struct hpsb_host *host, quadlet_t data)
 {
-        struct hpsb_packet *p;
+	struct hpsb_packet *p;
 
-        p = hpsb_alloc_packet(0);
-        if (!p) return NULL;
+	p = hpsb_alloc_packet(0);
+	if (!p)
+		return NULL;
 
-        p->host = host;
-        fill_phy_packet(p, data);
+	p->host = host;
+	fill_phy_packet(p, data);
 
-        return p;
+	return p;
 }
 
 struct hpsb_packet *hpsb_make_isopacket(struct hpsb_host *host,
@@ -451,7 +453,8 @@ struct hpsb_packet *hpsb_make_isopacket(struct hpsb_host *host,
 	struct hpsb_packet *p;
 
 	p = hpsb_alloc_packet(length);
-	if (!p) return NULL;
+	if (!p)
+		return NULL;
 
 	p->host = host;
 	fill_iso_packet(p, length, channel, tag, sync);
@@ -467,47 +470,46 @@ struct hpsb_packet *hpsb_make_isopacket(struct hpsb_host *host,
  */
 
 int hpsb_read(struct hpsb_host *host, nodeid_t node, unsigned int generation,
-	      u64 addr, quadlet_t *buffer, size_t length)
+	      u64 addr, quadlet_t * buffer, size_t length)
 {
-        struct hpsb_packet *packet;
-        int retval = 0;
+	struct hpsb_packet *packet;
+	int retval = 0;
 
-        if (length == 0)
-                return -EINVAL;
+	if (length == 0)
+		return -EINVAL;
 
-	BUG_ON(in_interrupt()); // We can't be called in an interrupt, yet
+	BUG_ON(in_interrupt());	// We can't be called in an interrupt, yet
 
 	packet = hpsb_make_readpacket(host, node, addr, length);
 
-        if (!packet) {
-                return -ENOMEM;
-        }
+	if (!packet) {
+		return -ENOMEM;
+	}
 
 	packet->generation = generation;
-        retval = hpsb_send_packet_and_wait(packet);
+	retval = hpsb_send_packet_and_wait(packet);
 	if (retval < 0)
 		goto hpsb_read_fail;
 
-        retval = hpsb_packet_success(packet);
+	retval = hpsb_packet_success(packet);
 
-        if (retval == 0) {
-                if (length == 4) {
-                        *buffer = packet->header[3];
-                } else {
-                        memcpy(buffer, packet->data, length);
-                }
-        }
+	if (retval == 0) {
+		if (length == 4) {
+			*buffer = packet->header[3];
+		} else {
+			memcpy(buffer, packet->data, length);
+		}
+	}
 
-hpsb_read_fail:
-        hpsb_free_tlabel(packet);
-        hpsb_free_packet(packet);
+      hpsb_read_fail:
+	hpsb_free_tlabel(packet);
+	hpsb_free_packet(packet);
 
-        return retval;
+	return retval;
 }
 
-
 int hpsb_write(struct hpsb_host *host, nodeid_t node, unsigned int generation,
-	       u64 addr, quadlet_t *buffer, size_t length)
+	       u64 addr, quadlet_t * buffer, size_t length)
 {
 	struct hpsb_packet *packet;
 	int retval;
@@ -515,62 +517,61 @@ int hpsb_write(struct hpsb_host *host, nodeid_t node, unsigned int generation,
 	if (length == 0)
 		return -EINVAL;
 
-	BUG_ON(in_interrupt()); // We can't be called in an interrupt, yet
+	BUG_ON(in_interrupt());	// We can't be called in an interrupt, yet
 
-	packet = hpsb_make_writepacket (host, node, addr, buffer, length);
+	packet = hpsb_make_writepacket(host, node, addr, buffer, length);
 
 	if (!packet)
 		return -ENOMEM;
 
 	packet->generation = generation;
-        retval = hpsb_send_packet_and_wait(packet);
+	retval = hpsb_send_packet_and_wait(packet);
 	if (retval < 0)
 		goto hpsb_write_fail;
 
-        retval = hpsb_packet_success(packet);
+	retval = hpsb_packet_success(packet);
 
-hpsb_write_fail:
-        hpsb_free_tlabel(packet);
-        hpsb_free_packet(packet);
+      hpsb_write_fail:
+	hpsb_free_tlabel(packet);
+	hpsb_free_packet(packet);
 
-        return retval;
+	return retval;
 }
 
 #if 0
 
 int hpsb_lock(struct hpsb_host *host, nodeid_t node, unsigned int generation,
-	      u64 addr, int extcode, quadlet_t *data, quadlet_t arg)
+	      u64 addr, int extcode, quadlet_t * data, quadlet_t arg)
 {
-        struct hpsb_packet *packet;
-        int retval = 0;
+	struct hpsb_packet *packet;
+	int retval = 0;
 
-	BUG_ON(in_interrupt()); // We can't be called in an interrupt, yet
+	BUG_ON(in_interrupt());	// We can't be called in an interrupt, yet
 
 	packet = hpsb_make_lockpacket(host, node, addr, extcode, data, arg);
-        if (!packet)
-                return -ENOMEM;
+	if (!packet)
+		return -ENOMEM;
 
 	packet->generation = generation;
-        retval = hpsb_send_packet_and_wait(packet);
+	retval = hpsb_send_packet_and_wait(packet);
 	if (retval < 0)
 		goto hpsb_lock_fail;
 
-        retval = hpsb_packet_success(packet);
+	retval = hpsb_packet_success(packet);
 
-        if (retval == 0) {
-                *data = packet->data[0];
-        }
+	if (retval == 0) {
+		*data = packet->data[0];
+	}
 
-hpsb_lock_fail:
-        hpsb_free_tlabel(packet);
-        hpsb_free_packet(packet);
+      hpsb_lock_fail:
+	hpsb_free_tlabel(packet);
+	hpsb_free_packet(packet);
 
-        return retval;
+	return retval;
 }
 
-
 int hpsb_send_gasp(struct hpsb_host *host, int channel, unsigned int generation,
-		   quadlet_t *buffer, size_t length, u32 specifier_id,
+		   quadlet_t * buffer, size_t length, u32 specifier_id,
 		   unsigned int version)
 {
 	struct hpsb_packet *packet;
@@ -587,7 +588,8 @@ int hpsb_send_gasp(struct hpsb_host *host, int channel, unsigned int generation,
 		return -ENOMEM;
 
 	packet->data[0] = cpu_to_be32((host->node_id << 16) | specifier_id_hi);
-	packet->data[1] = cpu_to_be32((specifier_id_lo << 24) | (version & 0x00ffffff));
+	packet->data[1] =
+	    cpu_to_be32((specifier_id_lo << 24) | (version & 0x00ffffff));
 
 	memcpy(&(packet->data[2]), buffer, length - 8);
 
@@ -602,4 +604,4 @@ int hpsb_send_gasp(struct hpsb_host *host, int channel, unsigned int generation,
 	return retval;
 }
 
-#endif  /*  0  */
+#endif				/*  0  */
-- 
cgit v1.1


From 066ef9c2fb30a22eca7724326e210f0405c51f29 Mon Sep 17 00:00:00 2001
From: Jens-Michael Hoffmann <jensmh@gmx.de>
Date: Tue, 22 Nov 2005 12:35:23 -0500
Subject: ieee1394/iso: LIndent fixes

This patch contains fixes by LIndent.

Signed-off-by: Jens-Michael Hoffmann <jensmh@gmx.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/iso.c | 102 +++++++++++++++++++++++++++++--------------------
 1 file changed, 60 insertions(+), 42 deletions(-)

diff --git a/drivers/ieee1394/iso.c b/drivers/ieee1394/iso.c
index 615541b..f26680e 100644
--- a/drivers/ieee1394/iso.c
+++ b/drivers/ieee1394/iso.c
@@ -36,20 +36,22 @@ void hpsb_iso_shutdown(struct hpsb_iso *iso)
 	kfree(iso);
 }
 
-static struct hpsb_iso* hpsb_iso_common_init(struct hpsb_host *host, enum hpsb_iso_type type,
+static struct hpsb_iso *hpsb_iso_common_init(struct hpsb_host *host,
+					     enum hpsb_iso_type type,
 					     unsigned int data_buf_size,
 					     unsigned int buf_packets,
-					     int channel,
-					     int dma_mode,
+					     int channel, int dma_mode,
 					     int irq_interval,
-					     void (*callback)(struct hpsb_iso*))
+					     void (*callback) (struct hpsb_iso
+							       *))
 {
 	struct hpsb_iso *iso;
 	int dma_direction;
 
 	/* make sure driver supports the ISO API */
 	if (!host->driver->isoctl) {
-		printk(KERN_INFO "ieee1394: host driver '%s' does not support the rawiso API\n",
+		printk(KERN_INFO
+		       "ieee1394: host driver '%s' does not support the rawiso API\n",
 		       host->driver->name);
 		return NULL;
 	}
@@ -59,12 +61,13 @@ static struct hpsb_iso* hpsb_iso_common_init(struct hpsb_host *host, enum hpsb_i
 	if (buf_packets < 2)
 		buf_packets = 2;
 
-	if ((dma_mode < HPSB_ISO_DMA_DEFAULT) || (dma_mode > HPSB_ISO_DMA_PACKET_PER_BUFFER))
-		dma_mode=HPSB_ISO_DMA_DEFAULT;
+	if ((dma_mode < HPSB_ISO_DMA_DEFAULT)
+	    || (dma_mode > HPSB_ISO_DMA_PACKET_PER_BUFFER))
+		dma_mode = HPSB_ISO_DMA_DEFAULT;
 
 	if ((irq_interval < 0) || (irq_interval > buf_packets / 4))
- 		irq_interval = buf_packets / 4;
-	if (irq_interval == 0)     /* really interrupt for each packet*/
+		irq_interval = buf_packets / 4;
+	if (irq_interval == 0)	/* really interrupt for each packet */
 		irq_interval = 1;
 
 	if (channel < -1 || channel >= 64)
@@ -76,7 +79,10 @@ static struct hpsb_iso* hpsb_iso_common_init(struct hpsb_host *host, enum hpsb_i
 
 	/* allocate and write the struct hpsb_iso */
 
-	iso = kmalloc(sizeof(*iso) + buf_packets * sizeof(struct hpsb_iso_packet_info), GFP_KERNEL);
+	iso =
+	    kmalloc(sizeof(*iso) +
+		    buf_packets * sizeof(struct hpsb_iso_packet_info),
+		    GFP_KERNEL);
 	if (!iso)
 		return NULL;
 
@@ -111,17 +117,18 @@ static struct hpsb_iso* hpsb_iso_common_init(struct hpsb_host *host, enum hpsb_i
 	iso->prebuffer = 0;
 
 	/* allocate the packet buffer */
-	if (dma_region_alloc(&iso->data_buf, iso->buf_size, host->pdev, dma_direction))
+	if (dma_region_alloc
+	    (&iso->data_buf, iso->buf_size, host->pdev, dma_direction))
 		goto err;
 
 	return iso;
 
-err:
+      err:
 	hpsb_iso_shutdown(iso);
 	return NULL;
 }
 
-int hpsb_iso_n_ready(struct hpsb_iso* iso)
+int hpsb_iso_n_ready(struct hpsb_iso *iso)
 {
 	unsigned long flags;
 	int val;
@@ -133,18 +140,19 @@ int hpsb_iso_n_ready(struct hpsb_iso* iso)
 	return val;
 }
 
-
-struct hpsb_iso* hpsb_iso_xmit_init(struct hpsb_host *host,
+struct hpsb_iso *hpsb_iso_xmit_init(struct hpsb_host *host,
 				    unsigned int data_buf_size,
 				    unsigned int buf_packets,
 				    int channel,
 				    int speed,
 				    int irq_interval,
-				    void (*callback)(struct hpsb_iso*))
+				    void (*callback) (struct hpsb_iso *))
 {
 	struct hpsb_iso *iso = hpsb_iso_common_init(host, HPSB_ISO_XMIT,
 						    data_buf_size, buf_packets,
-						    channel, HPSB_ISO_DMA_DEFAULT, irq_interval, callback);
+						    channel,
+						    HPSB_ISO_DMA_DEFAULT,
+						    irq_interval, callback);
 	if (!iso)
 		return NULL;
 
@@ -157,22 +165,23 @@ struct hpsb_iso* hpsb_iso_xmit_init(struct hpsb_host *host,
 	iso->flags |= HPSB_ISO_DRIVER_INIT;
 	return iso;
 
-err:
+      err:
 	hpsb_iso_shutdown(iso);
 	return NULL;
 }
 
-struct hpsb_iso* hpsb_iso_recv_init(struct hpsb_host *host,
+struct hpsb_iso *hpsb_iso_recv_init(struct hpsb_host *host,
 				    unsigned int data_buf_size,
 				    unsigned int buf_packets,
 				    int channel,
 				    int dma_mode,
 				    int irq_interval,
-				    void (*callback)(struct hpsb_iso*))
+				    void (*callback) (struct hpsb_iso *))
 {
 	struct hpsb_iso *iso = hpsb_iso_common_init(host, HPSB_ISO_RECV,
 						    data_buf_size, buf_packets,
-						    channel, dma_mode, irq_interval, callback);
+						    channel, dma_mode,
+						    irq_interval, callback);
 	if (!iso)
 		return NULL;
 
@@ -183,7 +192,7 @@ struct hpsb_iso* hpsb_iso_recv_init(struct hpsb_host *host,
 	iso->flags |= HPSB_ISO_DRIVER_INIT;
 	return iso;
 
-err:
+      err:
 	hpsb_iso_shutdown(iso);
 	return NULL;
 }
@@ -197,16 +206,17 @@ int hpsb_iso_recv_listen_channel(struct hpsb_iso *iso, unsigned char channel)
 
 int hpsb_iso_recv_unlisten_channel(struct hpsb_iso *iso, unsigned char channel)
 {
-       if (iso->type != HPSB_ISO_RECV || iso->channel != -1 || channel >= 64)
-               return -EINVAL;
-       return iso->host->driver->isoctl(iso, RECV_UNLISTEN_CHANNEL, channel);
+	if (iso->type != HPSB_ISO_RECV || iso->channel != -1 || channel >= 64)
+		return -EINVAL;
+	return iso->host->driver->isoctl(iso, RECV_UNLISTEN_CHANNEL, channel);
 }
 
 int hpsb_iso_recv_set_channel_mask(struct hpsb_iso *iso, u64 mask)
 {
 	if (iso->type != HPSB_ISO_RECV || iso->channel != -1)
 		return -EINVAL;
-	return iso->host->driver->isoctl(iso, RECV_SET_CHANNEL_MASK, (unsigned long) &mask);
+	return iso->host->driver->isoctl(iso, RECV_SET_CHANNEL_MASK,
+					 (unsigned long)&mask);
 }
 
 int hpsb_iso_recv_flush(struct hpsb_iso *iso)
@@ -283,7 +293,9 @@ int hpsb_iso_recv_start(struct hpsb_iso *iso, int cycle, int tag_mask, int sync)
 
 	isoctl_args[2] = sync;
 
-	retval = iso->host->driver->isoctl(iso, RECV_START, (unsigned long) &isoctl_args[0]);
+	retval =
+	    iso->host->driver->isoctl(iso, RECV_START,
+				      (unsigned long)&isoctl_args[0]);
 	if (retval)
 		return retval;
 
@@ -296,7 +308,8 @@ int hpsb_iso_recv_start(struct hpsb_iso *iso, int cycle, int tag_mask, int sync)
 
 static int hpsb_iso_check_offset_len(struct hpsb_iso *iso,
 				     unsigned int offset, unsigned short len,
-				     unsigned int *out_offset, unsigned short *out_len)
+				     unsigned int *out_offset,
+				     unsigned short *out_len)
 {
 	if (offset >= iso->buf_size)
 		return -EFAULT;
@@ -316,8 +329,8 @@ static int hpsb_iso_check_offset_len(struct hpsb_iso *iso,
 	return 0;
 }
 
-
-int hpsb_iso_xmit_queue_packet(struct hpsb_iso *iso, u32 offset, u16 len, u8 tag, u8 sy)
+int hpsb_iso_xmit_queue_packet(struct hpsb_iso *iso, u32 offset, u16 len,
+			       u8 tag, u8 sy)
 {
 	struct hpsb_iso_packet_info *info;
 	unsigned long flags;
@@ -334,7 +347,8 @@ int hpsb_iso_xmit_queue_packet(struct hpsb_iso *iso, u32 offset, u16 len, u8 tag
 	info = &iso->infos[iso->first_packet];
 
 	/* check for bogus offset/length */
-	if (hpsb_iso_check_offset_len(iso, offset, len, &info->offset, &info->len))
+	if (hpsb_iso_check_offset_len
+	    (iso, offset, len, &info->offset, &info->len))
 		return -EFAULT;
 
 	info->tag = tag;
@@ -342,13 +356,13 @@ int hpsb_iso_xmit_queue_packet(struct hpsb_iso *iso, u32 offset, u16 len, u8 tag
 
 	spin_lock_irqsave(&iso->lock, flags);
 
-	rv = iso->host->driver->isoctl(iso, XMIT_QUEUE, (unsigned long) info);
+	rv = iso->host->driver->isoctl(iso, XMIT_QUEUE, (unsigned long)info);
 	if (rv)
 		goto out;
 
 	/* increment cursors */
-	iso->first_packet = (iso->first_packet+1) % iso->buf_packets;
-	iso->xmit_cycle = (iso->xmit_cycle+1) % 8000;
+	iso->first_packet = (iso->first_packet + 1) % iso->buf_packets;
+	iso->xmit_cycle = (iso->xmit_cycle + 1) % 8000;
 	iso->n_ready_packets--;
 
 	if (iso->prebuffer != 0) {
@@ -359,7 +373,7 @@ int hpsb_iso_xmit_queue_packet(struct hpsb_iso *iso, u32 offset, u16 len, u8 tag
 		}
 	}
 
-out:
+      out:
 	spin_unlock_irqrestore(&iso->lock, flags);
 	return rv;
 }
@@ -369,7 +383,9 @@ int hpsb_iso_xmit_sync(struct hpsb_iso *iso)
 	if (iso->type != HPSB_ISO_XMIT)
 		return -EINVAL;
 
-	return wait_event_interruptible(iso->waitq, hpsb_iso_n_ready(iso) == iso->buf_packets);
+	return wait_event_interruptible(iso->waitq,
+					hpsb_iso_n_ready(iso) ==
+					iso->buf_packets);
 }
 
 void hpsb_iso_packet_sent(struct hpsb_iso *iso, int cycle, int error)
@@ -396,7 +412,8 @@ void hpsb_iso_packet_sent(struct hpsb_iso *iso, int cycle, int error)
 }
 
 void hpsb_iso_packet_received(struct hpsb_iso *iso, u32 offset, u16 len,
-			      u16 total_len, u16 cycle, u8 channel, u8 tag, u8 sy)
+			      u16 total_len, u16 cycle, u8 channel, u8 tag,
+			      u8 sy)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&iso->lock, flags);
@@ -416,7 +433,7 @@ void hpsb_iso_packet_received(struct hpsb_iso *iso, u32 offset, u16 len,
 		info->tag = tag;
 		info->sy = sy;
 
-		iso->pkt_dma = (iso->pkt_dma+1) % iso->buf_packets;
+		iso->pkt_dma = (iso->pkt_dma + 1) % iso->buf_packets;
 		iso->n_ready_packets++;
 	}
 
@@ -435,20 +452,21 @@ int hpsb_iso_recv_release_packets(struct hpsb_iso *iso, unsigned int n_packets)
 	spin_lock_irqsave(&iso->lock, flags);
 	for (i = 0; i < n_packets; i++) {
 		rv = iso->host->driver->isoctl(iso, RECV_RELEASE,
-					       (unsigned long) &iso->infos[iso->first_packet]);
+					       (unsigned long)&iso->infos[iso->
+									  first_packet]);
 		if (rv)
 			break;
 
-		iso->first_packet = (iso->first_packet+1) % iso->buf_packets;
+		iso->first_packet = (iso->first_packet + 1) % iso->buf_packets;
 		iso->n_ready_packets--;
 
 		/* release memory from packets discarded when queue was full  */
-		if (iso->n_ready_packets == 0) { /* Release only after all prior packets handled */
+		if (iso->n_ready_packets == 0) {	/* Release only after all prior packets handled */
 			if (iso->bytes_discarded != 0) {
 				struct hpsb_iso_packet_info inf;
 				inf.total_len = iso->bytes_discarded;
 				iso->host->driver->isoctl(iso, RECV_RELEASE,
-							(unsigned long) &inf);
+							  (unsigned long)&inf);
 				iso->bytes_discarded = 0;
 			}
 		}
-- 
cgit v1.1


From c64d472abc68dcad4d34f365545058c3f11973d8 Mon Sep 17 00:00:00 2001
From: Jens-Michael Hoffmann <jensmh@gmx.de>
Date: Tue, 22 Nov 2005 12:37:10 -0500
Subject: ieee1394/raw1394: LIndent fixes

This patch contains fixes by LIndent.

Signed-off-by: Jens-Michael Hoffmann <jensmh@gmx.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/raw1394.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/drivers/ieee1394/raw1394.c b/drivers/ieee1394/raw1394.c
index 89cac1f..b052356 100644
--- a/drivers/ieee1394/raw1394.c
+++ b/drivers/ieee1394/raw1394.c
@@ -2482,8 +2482,8 @@ static int raw1394_iso_recv_packets(struct file_info *fi, void __user * uaddr)
 
 	/* ensure user-supplied buffer is accessible and big enough */
 	if (!access_ok(VERIFY_WRITE, upackets.infos,
-			upackets.n_packets *
-			sizeof(struct raw1394_iso_packet_info)))
+		       upackets.n_packets *
+		       sizeof(struct raw1394_iso_packet_info)))
 		return -EFAULT;
 
 	/* copy the packet_infos out */
@@ -2516,8 +2516,8 @@ static int raw1394_iso_send_packets(struct file_info *fi, void __user * uaddr)
 
 	/* ensure user-supplied buffer is accessible and big enough */
 	if (!access_ok(VERIFY_READ, upackets.infos,
-			upackets.n_packets *
-			sizeof(struct raw1394_iso_packet_info)))
+		       upackets.n_packets *
+		       sizeof(struct raw1394_iso_packet_info)))
 		return -EFAULT;
 
 	/* copy the infos structs in and queue the packets */
@@ -2741,8 +2741,7 @@ static int raw1394_release(struct inode *inode, struct file *file)
 						    list) {
 					entry = fi_hlp->addr_list.next;
 					while (entry != &(fi_hlp->addr_list)) {
-						arm_addr = list_entry(entry,
-								      struct
+						arm_addr = list_entry(entry, struct
 								      arm_addr,
 								      addr_list);
 						if (arm_addr->start ==
@@ -2905,16 +2904,17 @@ static int __init init_raw1394(void)
 
 	hpsb_register_highlevel(&raw1394_highlevel);
 
-	if (IS_ERR(class_device_create(hpsb_protocol_class, NULL, MKDEV(
-		IEEE1394_MAJOR,	IEEE1394_MINOR_BLOCK_RAW1394 * 16), 
-		NULL, RAW1394_DEVICE_NAME))) {
+	if (IS_ERR
+	    (class_device_create
+	     (hpsb_protocol_class, NULL,
+	      MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_RAW1394 * 16), NULL,
+	      RAW1394_DEVICE_NAME))) {
 		ret = -EFAULT;
 		goto out_unreg;
 	}
-	
-	devfs_mk_cdev(MKDEV(
-		IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_RAW1394 * 16),
-		S_IFCHR | S_IRUSR | S_IWUSR, RAW1394_DEVICE_NAME);
+
+	devfs_mk_cdev(MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_RAW1394 * 16),
+		      S_IFCHR | S_IRUSR | S_IWUSR, RAW1394_DEVICE_NAME);
 
 	cdev_init(&raw1394_cdev, &raw1394_fops);
 	raw1394_cdev.owner = THIS_MODULE;
@@ -2936,20 +2936,22 @@ static int __init init_raw1394(void)
 
 	goto out;
 
-out_dev:
+      out_dev:
 	devfs_remove(RAW1394_DEVICE_NAME);
 	class_device_destroy(hpsb_protocol_class,
-		MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_RAW1394 * 16));
-out_unreg:
+			     MKDEV(IEEE1394_MAJOR,
+				   IEEE1394_MINOR_BLOCK_RAW1394 * 16));
+      out_unreg:
 	hpsb_unregister_highlevel(&raw1394_highlevel);
-out:
+      out:
 	return ret;
 }
 
 static void __exit cleanup_raw1394(void)
 {
 	class_device_destroy(hpsb_protocol_class,
-		MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_RAW1394 * 16));
+			     MKDEV(IEEE1394_MAJOR,
+				   IEEE1394_MINOR_BLOCK_RAW1394 * 16));
 	cdev_del(&raw1394_cdev);
 	devfs_remove(RAW1394_DEVICE_NAME);
 	hpsb_unregister_highlevel(&raw1394_highlevel);
-- 
cgit v1.1


From 14c0fa243b358c24040ff5f44b60c47aaf6430c3 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Thu, 1 Dec 2005 18:51:52 -0500
Subject: ieee1394: resume remote ports when starting a host (fixes device
 recognition)

After initializing an IEEE 1394 host, broadcast a resume packet.  This makes
remote nodes which suspended their ports while the host was down visible again.
Previously, such nodes had to be unplugged and replugged to be recognized.

Motorola DCT6200 cable receiver was affected, probably other devices too.
http://marc.theaimsgroup.com/?t=113202715800001

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/hosts.h   |  1 +
 drivers/ieee1394/nodemgr.c | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/drivers/ieee1394/hosts.h b/drivers/ieee1394/hosts.h
index 38f4211..ae9b02c 100644
--- a/drivers/ieee1394/hosts.h
+++ b/drivers/ieee1394/hosts.h
@@ -41,6 +41,7 @@ struct hpsb_host {
         /* this nodes state */
         unsigned in_bus_reset:1;
         unsigned is_shutdown:1;
+	unsigned resume_packet_sent:1;
 
         /* this nodes' duties on the bus */
         unsigned is_root:1;
diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c
index 3f0917b..b56934e 100644
--- a/drivers/ieee1394/nodemgr.c
+++ b/drivers/ieee1394/nodemgr.c
@@ -1410,6 +1410,24 @@ static void nodemgr_node_probe(struct host_info *hi, int generation)
 	return;
 }
 
+static int nodemgr_send_resume_packet(struct hpsb_host *host)
+{
+	struct hpsb_packet *packet;
+	int ret = 1;
+
+	packet = hpsb_make_phypacket(host,
+			0x003c0000 | NODEID_TO_NODE(host->node_id) << 24);
+	if (packet) {
+		packet->no_waiter = 1;
+		packet->generation = get_hpsb_generation(host);
+		ret = hpsb_send_packet(packet);
+	}
+	if (ret)
+		HPSB_WARN("fw-host%d: Failed to broadcast resume packet",
+			  host->id);
+	return ret;
+}
+
 /* Because we are a 1394a-2000 compliant IRM, we need to inform all the other
  * nodes of the broadcast channel.  (Really we're only setting the validity
  * bit). Other IRM responsibilities go in here as well. */
@@ -1460,6 +1478,13 @@ static int nodemgr_do_irm_duties(struct hpsb_host *host, int cycles)
 		}
 	}
 
+	/* Some devices suspend their ports while being connected to an inactive
+	 * host adapter, i.e. if connected before the low-level driver is
+	 * loaded.  They become visible either when physically unplugged and
+	 * replugged, or when receiving a resume packet.  Send one once. */
+	if (!host->resume_packet_sent && !nodemgr_send_resume_packet(host))
+		host->resume_packet_sent = 1;
+
 	return 1;
 }
 
-- 
cgit v1.1


From d7758461b9a8253f1c125e5907579e0594d29e3b Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Thu, 1 Dec 2005 18:51:56 -0500
Subject: ieee1394: add definitions for phy packet constants

Introduce new macros related to phy packets and use them in ieee1394_core and
nodemgr.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/ieee1394.h      | 19 ++++++++++++++++++-
 drivers/ieee1394/ieee1394_core.c | 38 ++++++++++++++++++++++----------------
 drivers/ieee1394/nodemgr.c       |  3 ++-
 3 files changed, 42 insertions(+), 18 deletions(-)

diff --git a/drivers/ieee1394/ieee1394.h b/drivers/ieee1394/ieee1394.h
index b634a9b..936d776 100644
--- a/drivers/ieee1394/ieee1394.h
+++ b/drivers/ieee1394/ieee1394.h
@@ -62,6 +62,7 @@
 extern const char *hpsb_speedto_str[];
 
 
+/* 1394a cable PHY packets */
 #define SELFID_PWRCL_NO_POWER    0x0
 #define SELFID_PWRCL_PROVIDE_15W 0x1
 #define SELFID_PWRCL_PROVIDE_30W 0x2
@@ -76,8 +77,24 @@ extern const char *hpsb_speedto_str[];
 #define SELFID_PORT_NCONN        0x1
 #define SELFID_PORT_NONE         0x0
 
+#define PHYPACKET_LINKON			0x40000000
+#define PHYPACKET_PHYCONFIG_R			0x00800000
+#define PHYPACKET_PHYCONFIG_T			0x00400000
+#define EXTPHYPACKET_TYPE_PING			0x00000000
+#define EXTPHYPACKET_TYPE_REMOTEACCESS_BASE	0x00040000
+#define EXTPHYPACKET_TYPE_REMOTEACCESS_PAGED	0x00140000
+#define EXTPHYPACKET_TYPE_REMOTEREPLY_BASE	0x000C0000
+#define EXTPHYPACKET_TYPE_REMOTEREPLY_PAGED	0x001C0000
+#define EXTPHYPACKET_TYPE_REMOTECOMMAND		0x00200000
+#define EXTPHYPACKET_TYPE_REMOTECONFIRMATION	0x00280000
+#define EXTPHYPACKET_TYPE_RESUME		0x003C0000
 
-/* 1394a PHY bitmasks */
+#define EXTPHYPACKET_TYPEMASK			0xC0FC0000
+
+#define PHYPACKET_PORT_SHIFT     24
+#define PHYPACKET_GAPCOUNT_SHIFT 16
+
+/* 1394a PHY register map bitmasks */
 #define PHY_00_PHYSICAL_ID       0xFC
 #define PHY_00_R                 0x02 /* Root */
 #define PHY_00_PS                0x01 /* Power Status*/
diff --git a/drivers/ieee1394/ieee1394_core.c b/drivers/ieee1394/ieee1394_core.c
index 32a1e01..f2f5e48 100644
--- a/drivers/ieee1394/ieee1394_core.c
+++ b/drivers/ieee1394/ieee1394_core.c
@@ -256,10 +256,14 @@ static int check_selfids(struct hpsb_host *host)
 
         esid = (struct ext_selfid *)(sid - 1);
         while (esid->extended) {
-                if ((esid->porta == 0x2) || (esid->portb == 0x2)
-                    || (esid->portc == 0x2) || (esid->portd == 0x2)
-                    || (esid->porte == 0x2) || (esid->portf == 0x2)
-                    || (esid->portg == 0x2) || (esid->porth == 0x2)) {
+                if ((esid->porta == SELFID_PORT_PARENT) ||
+		    (esid->portb == SELFID_PORT_PARENT) ||
+		    (esid->portc == SELFID_PORT_PARENT) ||
+		    (esid->portd == SELFID_PORT_PARENT) ||
+		    (esid->porte == SELFID_PORT_PARENT) ||
+		    (esid->portf == SELFID_PORT_PARENT) ||
+		    (esid->portg == SELFID_PORT_PARENT) ||
+		    (esid->porth == SELFID_PORT_PARENT)) {
 			HPSB_INFO("SelfIDs failed root check on "
 				  "extended SelfID");
 			return 0;
@@ -268,7 +272,9 @@ static int check_selfids(struct hpsb_host *host)
         }
 
         sid = (struct selfid *)esid;
-        if ((sid->port0 == 0x2) || (sid->port1 == 0x2) || (sid->port2 == 0x2)) {
+	if ((sid->port0 == SELFID_PORT_PARENT) ||
+	    (sid->port1 == SELFID_PORT_PARENT) ||
+	    (sid->port2 == SELFID_PORT_PARENT)) {
 		HPSB_INFO("SelfIDs failed root check");
 		return 0;
         }
@@ -303,18 +309,18 @@ static void build_speed_map(struct hpsb_host *host, int nodecount)
                 if (sid->extended) {
                         esid = (struct ext_selfid *)sid;
 
-                        if (esid->porta == 0x3) cldcnt[n]++;
-                        if (esid->portb == 0x3) cldcnt[n]++;
-                        if (esid->portc == 0x3) cldcnt[n]++;
-                        if (esid->portd == 0x3) cldcnt[n]++;
-                        if (esid->porte == 0x3) cldcnt[n]++;
-                        if (esid->portf == 0x3) cldcnt[n]++;
-                        if (esid->portg == 0x3) cldcnt[n]++;
-                        if (esid->porth == 0x3) cldcnt[n]++;
+			if (esid->porta == SELFID_PORT_CHILD) cldcnt[n]++;
+			if (esid->portb == SELFID_PORT_CHILD) cldcnt[n]++;
+			if (esid->portc == SELFID_PORT_CHILD) cldcnt[n]++;
+			if (esid->portd == SELFID_PORT_CHILD) cldcnt[n]++;
+			if (esid->porte == SELFID_PORT_CHILD) cldcnt[n]++;
+			if (esid->portf == SELFID_PORT_CHILD) cldcnt[n]++;
+			if (esid->portg == SELFID_PORT_CHILD) cldcnt[n]++;
+			if (esid->porth == SELFID_PORT_CHILD) cldcnt[n]++;
                 } else {
-                        if (sid->port0 == 0x3) cldcnt[n]++;
-                        if (sid->port1 == 0x3) cldcnt[n]++;
-                        if (sid->port2 == 0x3) cldcnt[n]++;
+			if (sid->port0 == SELFID_PORT_CHILD) cldcnt[n]++;
+			if (sid->port1 == SELFID_PORT_CHILD) cldcnt[n]++;
+			if (sid->port2 == SELFID_PORT_CHILD) cldcnt[n]++;
 
                         speedcap[n] = sid->speed;
                         n--;
diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c
index b56934e..f4b6025 100644
--- a/drivers/ieee1394/nodemgr.c
+++ b/drivers/ieee1394/nodemgr.c
@@ -1416,7 +1416,8 @@ static int nodemgr_send_resume_packet(struct hpsb_host *host)
 	int ret = 1;
 
 	packet = hpsb_make_phypacket(host,
-			0x003c0000 | NODEID_TO_NODE(host->node_id) << 24);
+			EXTPHYPACKET_TYPE_RESUME |
+			NODEID_TO_NODE(host->node_id) << PHYPACKET_PORT_SHIFT);
 	if (packet) {
 		packet->no_waiter = 1;
 		packet->generation = get_hpsb_generation(host);
-- 
cgit v1.1


From 546513f9fd96cba613cc2d025ee03d32d79394b7 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Thu, 1 Dec 2005 18:52:01 -0500
Subject: ieee1394: hpsb_send_phy_config() cleanup

Eliminate some code in hpsb_send_phy_config() which is provided
by hpsb_make_phypacket().

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/ieee1394_core.c | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/drivers/ieee1394/ieee1394_core.c b/drivers/ieee1394/ieee1394_core.c
index f2f5e48..ff8a409 100644
--- a/drivers/ieee1394/ieee1394_core.c
+++ b/drivers/ieee1394/ieee1394_core.c
@@ -463,6 +463,7 @@ void hpsb_packet_sent(struct hpsb_host *host, struct hpsb_packet *packet,
 int hpsb_send_phy_config(struct hpsb_host *host, int rootid, int gapcnt)
 {
 	struct hpsb_packet *packet;
+	quadlet_t d = 0;
 	int retval = 0;
 
 	if (rootid >= ALL_NODES || rootid < -1 || gapcnt > 0x3f || gapcnt < -1 ||
@@ -472,26 +473,16 @@ int hpsb_send_phy_config(struct hpsb_host *host, int rootid, int gapcnt)
 		return -EINVAL;
 	}
 
-	packet = hpsb_alloc_packet(0);
-	if (!packet)
-		return -ENOMEM;
-
-	packet->host = host;
-	packet->header_size = 8;
-	packet->data_size = 0;
-	packet->expect_response = 0;
-	packet->no_waiter = 0;
-	packet->type = hpsb_raw;
-	packet->header[0] = 0;
 	if (rootid != -1)
-		packet->header[0] |= rootid << 24 | 1 << 23;
+		d |= PHYPACKET_PHYCONFIG_R | rootid << PHYPACKET_PORT_SHIFT;
 	if (gapcnt != -1)
-		packet->header[0] |= gapcnt << 16 | 1 << 22;
+		d |= PHYPACKET_PHYCONFIG_T | gapcnt << PHYPACKET_GAPCOUNT_SHIFT;
 
-	packet->header[1] = ~packet->header[0];
+	packet = hpsb_make_phypacket(host, d);
+	if (!packet)
+		return -ENOMEM;
 
 	packet->generation = get_hpsb_generation(host);
-
 	retval = hpsb_send_packet_and_wait(packet);
 	hpsb_free_packet(packet);
 
-- 
cgit v1.1


From 741854e4f9a23421e194df8d846899172ff393d6 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Thu, 1 Dec 2005 18:52:03 -0500
Subject: ieee1394: whitespace cleanup in hosts.[ch], ieee1394_core.[ch]

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/hosts.c         |  28 +-
 drivers/ieee1394/hosts.h         | 162 ++++-----
 drivers/ieee1394/ieee1394_core.c | 762 +++++++++++++++++++--------------------
 drivers/ieee1394/ieee1394_core.h | 100 ++---
 4 files changed, 526 insertions(+), 526 deletions(-)

diff --git a/drivers/ieee1394/hosts.c b/drivers/ieee1394/hosts.c
index d245abe..ba09741 100644
--- a/drivers/ieee1394/hosts.c
+++ b/drivers/ieee1394/hosts.c
@@ -61,12 +61,12 @@ static void delayed_reset_bus(void * __reset_info)
 
 static int dummy_transmit_packet(struct hpsb_host *h, struct hpsb_packet *p)
 {
-        return 0;
+	return 0;
 }
 
 static int dummy_devctl(struct hpsb_host *h, enum devctl_cmd c, int arg)
 {
-        return -1;
+	return -1;
 }
 
 static int dummy_isoctl(struct hpsb_iso *iso, enum isoctl_cmd command, unsigned long arg)
@@ -75,9 +75,9 @@ static int dummy_isoctl(struct hpsb_iso *iso, enum isoctl_cmd command, unsigned
 }
 
 static struct hpsb_host_driver dummy_driver = {
-        .transmit_packet = dummy_transmit_packet,
-        .devctl =          dummy_devctl,
-	.isoctl =          dummy_isoctl
+	.transmit_packet = dummy_transmit_packet,
+	.devctl =	   dummy_devctl,
+	.isoctl =	   dummy_isoctl
 };
 
 static int alloc_hostnum_cb(struct hpsb_host *host, void *__data)
@@ -110,12 +110,12 @@ static DECLARE_MUTEX(host_num_alloc);
 struct hpsb_host *hpsb_alloc_host(struct hpsb_host_driver *drv, size_t extra,
 				  struct device *dev)
 {
-        struct hpsb_host *h;
+	struct hpsb_host *h;
 	int i;
 	int hostnum = 0;
 
-        h = kzalloc(sizeof(*h) + extra, SLAB_KERNEL);
-        if (!h)
+	h = kzalloc(sizeof(*h) + extra, SLAB_KERNEL);
+	if (!h)
 		return NULL;
 
 	h->csr.rom = csr1212_create_csr(&csr_bus_ops, CSR_BUS_INFO_SIZE, h);
@@ -125,7 +125,7 @@ struct hpsb_host *hpsb_alloc_host(struct hpsb_host_driver *drv, size_t extra,
 	}
 
 	h->hostdata = h + 1;
-        h->driver = drv;
+	h->driver = drv;
 
 	skb_queue_head_init(&h->pending_packet_queue);
 	INIT_LIST_HEAD(&h->addr_space);
@@ -145,8 +145,8 @@ struct hpsb_host *hpsb_alloc_host(struct hpsb_host_driver *drv, size_t extra,
 	h->timeout.function = abort_timedouts;
 	h->timeout_interval = HZ / 20; // 50ms by default
 
-        h->topology_map = h->csr.topology_map + 3;
-        h->speed_map = (u8 *)(h->csr.speed_map + 2);
+	h->topology_map = h->csr.topology_map + 3;
+	h->speed_map = (u8 *)(h->csr.speed_map + 2);
 
 	down(&host_num_alloc);
 
@@ -186,14 +186,14 @@ int hpsb_add_host(struct hpsb_host *host)
 
 void hpsb_remove_host(struct hpsb_host *host)
 {
-        host->is_shutdown = 1;
+	host->is_shutdown = 1;
 
 	cancel_delayed_work(&host->delayed_reset);
 	flush_scheduled_work();
 
-        host->driver = &dummy_driver;
+	host->driver = &dummy_driver;
 
-        highlevel_remove_host(host);
+	highlevel_remove_host(host);
 
 	hpsb_remove_extra_config_roms(host);
 
diff --git a/drivers/ieee1394/hosts.h b/drivers/ieee1394/hosts.h
index ae9b02c..07d188c 100644
--- a/drivers/ieee1394/hosts.h
+++ b/drivers/ieee1394/hosts.h
@@ -17,47 +17,47 @@ struct hpsb_packet;
 struct hpsb_iso;
 
 struct hpsb_host {
-        struct list_head host_list;
+	struct list_head host_list;
 
-        void *hostdata;
+	void *hostdata;
 
-        atomic_t generation;
+	atomic_t generation;
 
 	struct sk_buff_head pending_packet_queue;
 
 	struct timer_list timeout;
 	unsigned long timeout_interval;
 
-        unsigned char iso_listen_count[64];
+	unsigned char iso_listen_count[64];
 
-        int node_count; /* number of identified nodes on this bus */
-        int selfid_count; /* total number of SelfIDs received */
+	int node_count; /* number of identified nodes on this bus */
+	int selfid_count; /* total number of SelfIDs received */
 	int nodes_active; /* number of nodes that are actually active */
 
-        nodeid_t node_id; /* node ID of this host */
-        nodeid_t irm_id; /* ID of this bus' isochronous resource manager */
-        nodeid_t busmgr_id; /* ID of this bus' bus manager */
+	nodeid_t node_id; /* node ID of this host */
+	nodeid_t irm_id; /* ID of this bus' isochronous resource manager */
+	nodeid_t busmgr_id; /* ID of this bus' bus manager */
 
-        /* this nodes state */
-        unsigned in_bus_reset:1;
-        unsigned is_shutdown:1;
+	/* this nodes state */
+	unsigned in_bus_reset:1;
+	unsigned is_shutdown:1;
 	unsigned resume_packet_sent:1;
 
-        /* this nodes' duties on the bus */
-        unsigned is_root:1;
-        unsigned is_cycmst:1;
-        unsigned is_irm:1;
-        unsigned is_busmgr:1;
+	/* this nodes' duties on the bus */
+	unsigned is_root:1;
+	unsigned is_cycmst:1;
+	unsigned is_irm:1;
+	unsigned is_busmgr:1;
 
-        int reset_retries;
-        quadlet_t *topology_map;
-        u8 *speed_map;
-        struct csr_control csr;
+	int reset_retries;
+	quadlet_t *topology_map;
+	u8 *speed_map;
+	struct csr_control csr;
 
 	/* Per node tlabel pool allocation */
 	struct hpsb_tlabel_pool tpool[64];
 
-        struct hpsb_host_driver *driver;
+	struct hpsb_host_driver *driver;
 
 	struct pci_dev *pdev;
 
@@ -77,34 +77,34 @@ struct hpsb_host {
 
 
 enum devctl_cmd {
-        /* Host is requested to reset its bus and cancel all outstanding async
-         * requests.  If arg == 1, it shall also attempt to become root on the
-         * bus.  Return void. */
-        RESET_BUS,
-
-        /* Arg is void, return value is the hardware cycle counter value. */
-        GET_CYCLE_COUNTER,
-
-        /* Set the hardware cycle counter to the value in arg, return void.
-         * FIXME - setting is probably not required. */
-        SET_CYCLE_COUNTER,
-
-        /* Configure hardware for new bus ID in arg, return void. */
-        SET_BUS_ID,
-
-        /* If arg true, start sending cycle start packets, stop if arg == 0.
-         * Return void. */
-        ACT_CYCLE_MASTER,
-
-        /* Cancel all outstanding async requests without resetting the bus.
-         * Return void. */
-        CANCEL_REQUESTS,
-
-        /* Start or stop receiving isochronous channel in arg.  Return void.
-         * This acts as an optimization hint, hosts are not required not to
-         * listen on unrequested channels. */
-        ISO_LISTEN_CHANNEL,
-        ISO_UNLISTEN_CHANNEL
+	/* Host is requested to reset its bus and cancel all outstanding async
+	 * requests.  If arg == 1, it shall also attempt to become root on the
+	 * bus.  Return void. */
+	RESET_BUS,
+
+	/* Arg is void, return value is the hardware cycle counter value. */
+	GET_CYCLE_COUNTER,
+
+	/* Set the hardware cycle counter to the value in arg, return void.
+	 * FIXME - setting is probably not required. */
+	SET_CYCLE_COUNTER,
+
+	/* Configure hardware for new bus ID in arg, return void. */
+	SET_BUS_ID,
+
+	/* If arg true, start sending cycle start packets, stop if arg == 0.
+	 * Return void. */
+	ACT_CYCLE_MASTER,
+
+	/* Cancel all outstanding async requests without resetting the bus.
+	 * Return void. */
+	CANCEL_REQUESTS,
+
+	/* Start or stop receiving isochronous channel in arg.  Return void.
+	 * This acts as an optimization hint, hosts are not required not to
+	 * listen on unrequested channels. */
+	ISO_LISTEN_CHANNEL,
+	ISO_UNLISTEN_CHANNEL
 };
 
 enum isoctl_cmd {
@@ -135,13 +135,13 @@ enum isoctl_cmd {
 };
 
 enum reset_types {
-        /* 166 microsecond reset -- only type of reset available on
-           non-1394a capable controllers */
-        LONG_RESET,
+	/* 166 microsecond reset -- only type of reset available on
+	   non-1394a capable controllers */
+	LONG_RESET,
 
-        /* Short (arbitrated) reset -- only available on 1394a capable
-           controllers */
-        SHORT_RESET,
+	/* Short (arbitrated) reset -- only available on 1394a capable
+	   controllers */
+	SHORT_RESET,
 
 	/* Variants that set force_root before issueing the bus reset */
 	LONG_RESET_FORCE_ROOT, SHORT_RESET_FORCE_ROOT,
@@ -159,22 +159,22 @@ struct hpsb_host_driver {
 	 * reads to the ConfigROM on its own. */
 	void (*set_hw_config_rom) (struct hpsb_host *host, quadlet_t *config_rom);
 
-        /* This function shall implement packet transmission based on
-         * packet->type.  It shall CRC both parts of the packet (unless
-         * packet->type == raw) and do byte-swapping as necessary or instruct
-         * the hardware to do so.  It can return immediately after the packet
-         * was queued for sending.  After sending, hpsb_sent_packet() has to be
-         * called.  Return 0 on success, negative errno on failure.
-         * NOTE: The function must be callable in interrupt context.
-         */
-        int (*transmit_packet) (struct hpsb_host *host,
-                                struct hpsb_packet *packet);
-
-        /* This function requests miscellanous services from the driver, see
-         * above for command codes and expected actions.  Return -1 for unknown
-         * command, though that should never happen.
-         */
-        int (*devctl) (struct hpsb_host *host, enum devctl_cmd command, int arg);
+	/* This function shall implement packet transmission based on
+	 * packet->type.  It shall CRC both parts of the packet (unless
+	 * packet->type == raw) and do byte-swapping as necessary or instruct
+	 * the hardware to do so.  It can return immediately after the packet
+	 * was queued for sending.  After sending, hpsb_sent_packet() has to be
+	 * called.  Return 0 on success, negative errno on failure.
+	 * NOTE: The function must be callable in interrupt context.
+	 */
+	int (*transmit_packet) (struct hpsb_host *host,
+				struct hpsb_packet *packet);
+
+	/* This function requests miscellanous services from the driver, see
+	 * above for command codes and expected actions.  Return -1 for unknown
+	 * command, though that should never happen.
+	 */
+	int (*devctl) (struct hpsb_host *host, enum devctl_cmd command, int arg);
 
 	 /* ISO transmission/reception functions. Return 0 on success, -1
 	  * (or -EXXX errno code) on failure. If the low-level driver does not
@@ -182,15 +182,15 @@ struct hpsb_host_driver {
 	  */
 	int (*isoctl) (struct hpsb_iso *iso, enum isoctl_cmd command, unsigned long arg);
 
-        /* This function is mainly to redirect local CSR reads/locks to the iso
-         * management registers (bus manager id, bandwidth available, channels
-         * available) to the hardware registers in OHCI.  reg is 0,1,2,3 for bus
-         * mgr, bwdth avail, ch avail hi, ch avail lo respectively (the same ids
-         * as OHCI uses).  data and compare are the new data and expected data
-         * respectively, return value is the old value.
-         */
-        quadlet_t (*hw_csr_reg) (struct hpsb_host *host, int reg,
-                                 quadlet_t data, quadlet_t compare);
+	/* This function is mainly to redirect local CSR reads/locks to the iso
+	 * management registers (bus manager id, bandwidth available, channels
+	 * available) to the hardware registers in OHCI.  reg is 0,1,2,3 for bus
+	 * mgr, bwdth avail, ch avail hi, ch avail lo respectively (the same ids
+	 * as OHCI uses).  data and compare are the new data and expected data
+	 * respectively, return value is the old value.
+	 */
+	quadlet_t (*hw_csr_reg) (struct hpsb_host *host, int reg,
+				 quadlet_t data, quadlet_t compare);
 };
 
 
diff --git a/drivers/ieee1394/ieee1394_core.c b/drivers/ieee1394/ieee1394_core.c
index ff8a409..64fbbb0 100644
--- a/drivers/ieee1394/ieee1394_core.c
+++ b/drivers/ieee1394/ieee1394_core.c
@@ -179,34 +179,34 @@ void hpsb_free_packet(struct hpsb_packet *packet)
 
 int hpsb_reset_bus(struct hpsb_host *host, int type)
 {
-        if (!host->in_bus_reset) {
-                host->driver->devctl(host, RESET_BUS, type);
-                return 0;
-        } else {
-                return 1;
-        }
+	if (!host->in_bus_reset) {
+		host->driver->devctl(host, RESET_BUS, type);
+		return 0;
+	} else {
+		return 1;
+	}
 }
 
 
 int hpsb_bus_reset(struct hpsb_host *host)
 {
-        if (host->in_bus_reset) {
-                HPSB_NOTICE("%s called while bus reset already in progress",
+	if (host->in_bus_reset) {
+		HPSB_NOTICE("%s called while bus reset already in progress",
 			    __FUNCTION__);
-                return 1;
-        }
+		return 1;
+	}
 
-        abort_requests(host);
-        host->in_bus_reset = 1;
-        host->irm_id = -1;
+	abort_requests(host);
+	host->in_bus_reset = 1;
+	host->irm_id = -1;
 	host->is_irm = 0;
-        host->busmgr_id = -1;
+	host->busmgr_id = -1;
 	host->is_busmgr = 0;
 	host->is_cycmst = 0;
-        host->node_count = 0;
-        host->selfid_count = 0;
+	host->node_count = 0;
+	host->selfid_count = 0;
 
-        return 0;
+	return 0;
 }
 
 
@@ -216,47 +216,47 @@ int hpsb_bus_reset(struct hpsb_host *host)
  */
 static int check_selfids(struct hpsb_host *host)
 {
-        int nodeid = -1;
-        int rest_of_selfids = host->selfid_count;
-        struct selfid *sid = (struct selfid *)host->topology_map;
-        struct ext_selfid *esid;
-        int esid_seq = 23;
+	int nodeid = -1;
+	int rest_of_selfids = host->selfid_count;
+	struct selfid *sid = (struct selfid *)host->topology_map;
+	struct ext_selfid *esid;
+	int esid_seq = 23;
 
 	host->nodes_active = 0;
 
-        while (rest_of_selfids--) {
-                if (!sid->extended) {
-                        nodeid++;
-                        esid_seq = 0;
+	while (rest_of_selfids--) {
+		if (!sid->extended) {
+			nodeid++;
+			esid_seq = 0;
 
-                        if (sid->phy_id != nodeid) {
-                                HPSB_INFO("SelfIDs failed monotony check with "
-                                          "%d", sid->phy_id);
-                                return 0;
-                        }
+			if (sid->phy_id != nodeid) {
+				HPSB_INFO("SelfIDs failed monotony check with "
+					  "%d", sid->phy_id);
+				return 0;
+			}
 
 			if (sid->link_active) {
 				host->nodes_active++;
 				if (sid->contender)
 					host->irm_id = LOCAL_BUS | sid->phy_id;
 			}
-                } else {
-                        esid = (struct ext_selfid *)sid;
-
-                        if ((esid->phy_id != nodeid)
-                            || (esid->seq_nr != esid_seq)) {
-                                HPSB_INFO("SelfIDs failed monotony check with "
-                                          "%d/%d", esid->phy_id, esid->seq_nr);
-                                return 0;
-                        }
-                        esid_seq++;
-                }
-                sid++;
-        }
-
-        esid = (struct ext_selfid *)(sid - 1);
-        while (esid->extended) {
-                if ((esid->porta == SELFID_PORT_PARENT) ||
+		} else {
+			esid = (struct ext_selfid *)sid;
+
+			if ((esid->phy_id != nodeid)
+			    || (esid->seq_nr != esid_seq)) {
+				HPSB_INFO("SelfIDs failed monotony check with "
+					  "%d/%d", esid->phy_id, esid->seq_nr);
+				return 0;
+			}
+			esid_seq++;
+		}
+		sid++;
+	}
+
+	esid = (struct ext_selfid *)(sid - 1);
+	while (esid->extended) {
+		if ((esid->porta == SELFID_PORT_PARENT) ||
 		    (esid->portb == SELFID_PORT_PARENT) ||
 		    (esid->portc == SELFID_PORT_PARENT) ||
 		    (esid->portd == SELFID_PORT_PARENT) ||
@@ -267,47 +267,47 @@ static int check_selfids(struct hpsb_host *host)
 			HPSB_INFO("SelfIDs failed root check on "
 				  "extended SelfID");
 			return 0;
-                }
-                esid--;
-        }
+		}
+		esid--;
+	}
 
-        sid = (struct selfid *)esid;
+	sid = (struct selfid *)esid;
 	if ((sid->port0 == SELFID_PORT_PARENT) ||
 	    (sid->port1 == SELFID_PORT_PARENT) ||
 	    (sid->port2 == SELFID_PORT_PARENT)) {
 		HPSB_INFO("SelfIDs failed root check");
 		return 0;
-        }
+	}
 
 	host->node_count = nodeid + 1;
-        return 1;
+	return 1;
 }
 
 static void build_speed_map(struct hpsb_host *host, int nodecount)
 {
 	u8 speedcap[nodecount];
 	u8 cldcnt[nodecount];
-        u8 *map = host->speed_map;
-        struct selfid *sid;
-        struct ext_selfid *esid;
-        int i, j, n;
-
-        for (i = 0; i < (nodecount * 64); i += 64) {
-                for (j = 0; j < nodecount; j++) {
-                        map[i+j] = IEEE1394_SPEED_MAX;
-                }
-        }
-
-        for (i = 0; i < nodecount; i++) {
-                cldcnt[i] = 0;
-        }
-
-        /* find direct children count and speed */
-        for (sid = (struct selfid *)&host->topology_map[host->selfid_count-1],
-                     n = nodecount - 1;
-             (void *)sid >= (void *)host->topology_map; sid--) {
-                if (sid->extended) {
-                        esid = (struct ext_selfid *)sid;
+	u8 *map = host->speed_map;
+	struct selfid *sid;
+	struct ext_selfid *esid;
+	int i, j, n;
+
+	for (i = 0; i < (nodecount * 64); i += 64) {
+		for (j = 0; j < nodecount; j++) {
+			map[i+j] = IEEE1394_SPEED_MAX;
+		}
+	}
+
+	for (i = 0; i < nodecount; i++) {
+		cldcnt[i] = 0;
+	}
+
+	/* find direct children count and speed */
+	for (sid = (struct selfid *)&host->topology_map[host->selfid_count-1],
+		     n = nodecount - 1;
+	     (void *)sid >= (void *)host->topology_map; sid--) {
+		if (sid->extended) {
+			esid = (struct ext_selfid *)sid;
 
 			if (esid->porta == SELFID_PORT_CHILD) cldcnt[n]++;
 			if (esid->portb == SELFID_PORT_CHILD) cldcnt[n]++;
@@ -322,50 +322,50 @@ static void build_speed_map(struct hpsb_host *host, int nodecount)
 			if (sid->port1 == SELFID_PORT_CHILD) cldcnt[n]++;
 			if (sid->port2 == SELFID_PORT_CHILD) cldcnt[n]++;
 
-                        speedcap[n] = sid->speed;
-                        n--;
-                }
-        }
-
-        /* set self mapping */
-        for (i = 0; i < nodecount; i++) {
-                map[64*i + i] = speedcap[i];
-        }
-
-        /* fix up direct children count to total children count;
-         * also fix up speedcaps for sibling and parent communication */
-        for (i = 1; i < nodecount; i++) {
-                for (j = cldcnt[i], n = i - 1; j > 0; j--) {
-                        cldcnt[i] += cldcnt[n];
-                        speedcap[n] = min(speedcap[n], speedcap[i]);
-                        n -= cldcnt[n] + 1;
-                }
-        }
-
-        for (n = 0; n < nodecount; n++) {
-                for (i = n - cldcnt[n]; i <= n; i++) {
-                        for (j = 0; j < (n - cldcnt[n]); j++) {
-                                map[j*64 + i] = map[i*64 + j] =
-                                        min(map[i*64 + j], speedcap[n]);
-                        }
-                        for (j = n + 1; j < nodecount; j++) {
-                                map[j*64 + i] = map[i*64 + j] =
-                                        min(map[i*64 + j], speedcap[n]);
-                        }
-                }
-        }
+			speedcap[n] = sid->speed;
+			n--;
+		}
+	}
+
+	/* set self mapping */
+	for (i = 0; i < nodecount; i++) {
+		map[64*i + i] = speedcap[i];
+	}
+
+	/* fix up direct children count to total children count;
+	 * also fix up speedcaps for sibling and parent communication */
+	for (i = 1; i < nodecount; i++) {
+		for (j = cldcnt[i], n = i - 1; j > 0; j--) {
+			cldcnt[i] += cldcnt[n];
+			speedcap[n] = min(speedcap[n], speedcap[i]);
+			n -= cldcnt[n] + 1;
+		}
+	}
+
+	for (n = 0; n < nodecount; n++) {
+		for (i = n - cldcnt[n]; i <= n; i++) {
+			for (j = 0; j < (n - cldcnt[n]); j++) {
+				map[j*64 + i] = map[i*64 + j] =
+					min(map[i*64 + j], speedcap[n]);
+			}
+			for (j = n + 1; j < nodecount; j++) {
+				map[j*64 + i] = map[i*64 + j] =
+					min(map[i*64 + j], speedcap[n]);
+			}
+		}
+	}
 }
 
 
 void hpsb_selfid_received(struct hpsb_host *host, quadlet_t sid)
 {
-        if (host->in_bus_reset) {
-                HPSB_VERBOSE("Including SelfID 0x%x", sid);
-                host->topology_map[host->selfid_count++] = sid;
-        } else {
-                HPSB_NOTICE("Spurious SelfID packet (0x%08x) received from bus %d",
+	if (host->in_bus_reset) {
+		HPSB_VERBOSE("Including SelfID 0x%x", sid);
+		host->topology_map[host->selfid_count++] = sid;
+	} else {
+		HPSB_NOTICE("Spurious SelfID packet (0x%08x) received from bus %d",
 			    sid, NODEID_TO_BUS(host->node_id));
-        }
+	}
 }
 
 void hpsb_selfid_complete(struct hpsb_host *host, int phyid, int isroot)
@@ -373,50 +373,50 @@ void hpsb_selfid_complete(struct hpsb_host *host, int phyid, int isroot)
 	if (!host->in_bus_reset)
 		HPSB_NOTICE("SelfID completion called outside of bus reset!");
 
-        host->node_id = LOCAL_BUS | phyid;
-        host->is_root = isroot;
+	host->node_id = LOCAL_BUS | phyid;
+	host->is_root = isroot;
 
-        if (!check_selfids(host)) {
-                if (host->reset_retries++ < 20) {
-                        /* selfid stage did not complete without error */
-                        HPSB_NOTICE("Error in SelfID stage, resetting");
+	if (!check_selfids(host)) {
+		if (host->reset_retries++ < 20) {
+			/* selfid stage did not complete without error */
+			HPSB_NOTICE("Error in SelfID stage, resetting");
 			host->in_bus_reset = 0;
 			/* this should work from ohci1394 now... */
-                        hpsb_reset_bus(host, LONG_RESET);
-                        return;
-                } else {
-                        HPSB_NOTICE("Stopping out-of-control reset loop");
-                        HPSB_NOTICE("Warning - topology map and speed map will not be valid");
+			hpsb_reset_bus(host, LONG_RESET);
+			return;
+		} else {
+			HPSB_NOTICE("Stopping out-of-control reset loop");
+			HPSB_NOTICE("Warning - topology map and speed map will not be valid");
 			host->reset_retries = 0;
-                }
-        } else {
+		}
+	} else {
 		host->reset_retries = 0;
-                build_speed_map(host, host->node_count);
-        }
+		build_speed_map(host, host->node_count);
+	}
 
 	HPSB_VERBOSE("selfid_complete called with successful SelfID stage "
 		     "... irm_id: 0x%X node_id: 0x%X",host->irm_id,host->node_id);
 
-        /* irm_id is kept up to date by check_selfids() */
-        if (host->irm_id == host->node_id) {
-                host->is_irm = 1;
-        } else {
-                host->is_busmgr = 0;
-                host->is_irm = 0;
-        }
+	/* irm_id is kept up to date by check_selfids() */
+	if (host->irm_id == host->node_id) {
+		host->is_irm = 1;
+	} else {
+		host->is_busmgr = 0;
+		host->is_irm = 0;
+	}
 
-        if (isroot) {
+	if (isroot) {
 		host->driver->devctl(host, ACT_CYCLE_MASTER, 1);
 		host->is_cycmst = 1;
 	}
 	atomic_inc(&host->generation);
 	host->in_bus_reset = 0;
-        highlevel_host_reset(host);
+	highlevel_host_reset(host);
 }
 
 
 void hpsb_packet_sent(struct hpsb_host *host, struct hpsb_packet *packet,
-                      int ackcode)
+		      int ackcode)
 {
 	unsigned long flags;
 
@@ -507,13 +507,13 @@ int hpsb_send_packet(struct hpsb_packet *packet)
 {
 	struct hpsb_host *host = packet->host;
 
-        if (host->is_shutdown)
+	if (host->is_shutdown)
 		return -EINVAL;
 	if (host->in_bus_reset ||
 	    (packet->generation != get_hpsb_generation(host)))
-                return -EAGAIN;
+		return -EAGAIN;
 
-        packet->state = hpsb_queued;
+	packet->state = hpsb_queued;
 
 	/* This just seems silly to me */
 	WARN_ON(packet->no_waiter && packet->expect_response);
@@ -527,42 +527,42 @@ int hpsb_send_packet(struct hpsb_packet *packet)
 		skb_queue_tail(&host->pending_packet_queue, packet->skb);
 	}
 
-        if (packet->node_id == host->node_id) {
+	if (packet->node_id == host->node_id) {
 		/* it is a local request, so handle it locally */
 
-                quadlet_t *data;
-                size_t size = packet->data_size + packet->header_size;
+		quadlet_t *data;
+		size_t size = packet->data_size + packet->header_size;
 
-                data = kmalloc(size, GFP_ATOMIC);
-                if (!data) {
-                        HPSB_ERR("unable to allocate memory for concatenating header and data");
-                        return -ENOMEM;
-                }
+		data = kmalloc(size, GFP_ATOMIC);
+		if (!data) {
+			HPSB_ERR("unable to allocate memory for concatenating header and data");
+			return -ENOMEM;
+		}
 
-                memcpy(data, packet->header, packet->header_size);
+		memcpy(data, packet->header, packet->header_size);
 
-                if (packet->data_size)
+		if (packet->data_size)
 			memcpy(((u8*)data) + packet->header_size, packet->data, packet->data_size);
 
-                dump_packet("send packet local", packet->header, packet->header_size, -1);
+		dump_packet("send packet local", packet->header, packet->header_size, -1);
 
-                hpsb_packet_sent(host, packet, packet->expect_response ? ACK_PENDING : ACK_COMPLETE);
-                hpsb_packet_received(host, data, size, 0);
+		hpsb_packet_sent(host, packet, packet->expect_response ? ACK_PENDING : ACK_COMPLETE);
+		hpsb_packet_received(host, data, size, 0);
 
-                kfree(data);
+		kfree(data);
 
-                return 0;
-        }
+		return 0;
+	}
 
-        if (packet->type == hpsb_async && packet->node_id != ALL_NODES) {
-                packet->speed_code =
-                        host->speed_map[NODEID_TO_NODE(host->node_id) * 64
-                                       + NODEID_TO_NODE(packet->node_id)];
-        }
+	if (packet->type == hpsb_async && packet->node_id != ALL_NODES) {
+		packet->speed_code =
+			host->speed_map[NODEID_TO_NODE(host->node_id) * 64
+				       + NODEID_TO_NODE(packet->node_id)];
+	}
 
-        dump_packet("send packet", packet->header, packet->header_size, packet->speed_code);
+	dump_packet("send packet", packet->header, packet->header_size, packet->speed_code);
 
-        return host->driver->transmit_packet(host, packet);
+	return host->driver->transmit_packet(host, packet);
 }
 
 /* We could just use complete() directly as the packet complete
@@ -590,81 +590,81 @@ int hpsb_send_packet_and_wait(struct hpsb_packet *packet)
 
 static void send_packet_nocare(struct hpsb_packet *packet)
 {
-        if (hpsb_send_packet(packet) < 0) {
-                hpsb_free_packet(packet);
-        }
+	if (hpsb_send_packet(packet) < 0) {
+		hpsb_free_packet(packet);
+	}
 }
 
 
 static void handle_packet_response(struct hpsb_host *host, int tcode,
 				   quadlet_t *data, size_t size)
 {
-        struct hpsb_packet *packet = NULL;
+	struct hpsb_packet *packet = NULL;
 	struct sk_buff *skb;
-        int tcode_match = 0;
-        int tlabel;
-        unsigned long flags;
+	int tcode_match = 0;
+	int tlabel;
+	unsigned long flags;
 
-        tlabel = (data[0] >> 10) & 0x3f;
+	tlabel = (data[0] >> 10) & 0x3f;
 
 	spin_lock_irqsave(&host->pending_packet_queue.lock, flags);
 
 	skb_queue_walk(&host->pending_packet_queue, skb) {
 		packet = (struct hpsb_packet *)skb->data;
-                if ((packet->tlabel == tlabel)
-                    && (packet->node_id == (data[1] >> 16))){
-                        break;
-                }
+		if ((packet->tlabel == tlabel)
+		    && (packet->node_id == (data[1] >> 16))){
+			break;
+		}
 
 		packet = NULL;
-        }
+	}
 
 	if (packet == NULL) {
-                HPSB_DEBUG("unsolicited response packet received - no tlabel match");
-                dump_packet("contents", data, 16, -1);
+		HPSB_DEBUG("unsolicited response packet received - no tlabel match");
+		dump_packet("contents", data, 16, -1);
 		spin_unlock_irqrestore(&host->pending_packet_queue.lock, flags);
-                return;
-        }
+		return;
+	}
 
-        switch (packet->tcode) {
-        case TCODE_WRITEQ:
-        case TCODE_WRITEB:
-                if (tcode != TCODE_WRITE_RESPONSE)
+	switch (packet->tcode) {
+	case TCODE_WRITEQ:
+	case TCODE_WRITEB:
+		if (tcode != TCODE_WRITE_RESPONSE)
 			break;
 		tcode_match = 1;
 		memcpy(packet->header, data, 12);
-                break;
-        case TCODE_READQ:
-                if (tcode != TCODE_READQ_RESPONSE)
+		break;
+	case TCODE_READQ:
+		if (tcode != TCODE_READQ_RESPONSE)
 			break;
 		tcode_match = 1;
 		memcpy(packet->header, data, 16);
-                break;
-        case TCODE_READB:
-                if (tcode != TCODE_READB_RESPONSE)
+		break;
+	case TCODE_READB:
+		if (tcode != TCODE_READB_RESPONSE)
 			break;
 		tcode_match = 1;
 		BUG_ON(packet->skb->len - sizeof(*packet) < size - 16);
 		memcpy(packet->header, data, 16);
 		memcpy(packet->data, data + 4, size - 16);
-                break;
-        case TCODE_LOCK_REQUEST:
-                if (tcode != TCODE_LOCK_RESPONSE)
+		break;
+	case TCODE_LOCK_REQUEST:
+		if (tcode != TCODE_LOCK_RESPONSE)
 			break;
 		tcode_match = 1;
 		size = min((size - 16), (size_t)8);
 		BUG_ON(packet->skb->len - sizeof(*packet) < size);
 		memcpy(packet->header, data, 16);
 		memcpy(packet->data, data + 4, size);
-                break;
-        }
+		break;
+	}
 
-        if (!tcode_match) {
+	if (!tcode_match) {
 		spin_unlock_irqrestore(&host->pending_packet_queue.lock, flags);
-                HPSB_INFO("unsolicited response packet received - tcode mismatch");
-                dump_packet("contents", data, 16, -1);
-                return;
-        }
+		HPSB_INFO("unsolicited response packet received - tcode mismatch");
+		dump_packet("contents", data, 16, -1);
+		return;
+	}
 
 	__skb_unlink(skb, &host->pending_packet_queue);
 
@@ -683,27 +683,27 @@ static void handle_packet_response(struct hpsb_host *host, int tcode,
 static struct hpsb_packet *create_reply_packet(struct hpsb_host *host,
 					       quadlet_t *data, size_t dsize)
 {
-        struct hpsb_packet *p;
+	struct hpsb_packet *p;
 
-        p = hpsb_alloc_packet(dsize);
-        if (unlikely(p == NULL)) {
-                /* FIXME - send data_error response */
-                return NULL;
-        }
+	p = hpsb_alloc_packet(dsize);
+	if (unlikely(p == NULL)) {
+		/* FIXME - send data_error response */
+		return NULL;
+	}
 
-        p->type = hpsb_async;
-        p->state = hpsb_unused;
-        p->host = host;
-        p->node_id = data[1] >> 16;
-        p->tlabel = (data[0] >> 10) & 0x3f;
-        p->no_waiter = 1;
+	p->type = hpsb_async;
+	p->state = hpsb_unused;
+	p->host = host;
+	p->node_id = data[1] >> 16;
+	p->tlabel = (data[0] >> 10) & 0x3f;
+	p->no_waiter = 1;
 
 	p->generation = get_hpsb_generation(host);
 
 	if (dsize % 4)
 		p->data[dsize / 4] = 0;
 
-        return p;
+	return p;
 }
 
 #define PREP_ASYNC_HEAD_RCODE(tc) \
@@ -714,7 +714,7 @@ static struct hpsb_packet *create_reply_packet(struct hpsb_host *host,
 	packet->header[2] = 0
 
 static void fill_async_readquad_resp(struct hpsb_packet *packet, int rcode,
-                              quadlet_t data)
+			      quadlet_t data)
 {
 	PREP_ASYNC_HEAD_RCODE(TCODE_READQ_RESPONSE);
 	packet->header[3] = data;
@@ -723,7 +723,7 @@ static void fill_async_readquad_resp(struct hpsb_packet *packet, int rcode,
 }
 
 static void fill_async_readblock_resp(struct hpsb_packet *packet, int rcode,
-                               int length)
+			       int length)
 {
 	if (rcode != RCODE_COMPLETE)
 		length = 0;
@@ -743,7 +743,7 @@ static void fill_async_write_resp(struct hpsb_packet *packet, int rcode)
 }
 
 static void fill_async_lock_resp(struct hpsb_packet *packet, int rcode, int extcode,
-                          int length)
+			  int length)
 {
 	if (rcode != RCODE_COMPLETE)
 		length = 0;
@@ -755,184 +755,184 @@ static void fill_async_lock_resp(struct hpsb_packet *packet, int rcode, int extc
 }
 
 #define PREP_REPLY_PACKET(length) \
-                packet = create_reply_packet(host, data, length); \
-                if (packet == NULL) break
+		packet = create_reply_packet(host, data, length); \
+		if (packet == NULL) break
 
 static void handle_incoming_packet(struct hpsb_host *host, int tcode,
 				   quadlet_t *data, size_t size, int write_acked)
 {
-        struct hpsb_packet *packet;
-        int length, rcode, extcode;
-        quadlet_t buffer;
-        nodeid_t source = data[1] >> 16;
-        nodeid_t dest = data[0] >> 16;
-        u16 flags = (u16) data[0];
-        u64 addr;
-
-        /* big FIXME - no error checking is done for an out of bounds length */
-
-        switch (tcode) {
-        case TCODE_WRITEQ:
-                addr = (((u64)(data[1] & 0xffff)) << 32) | data[2];
-                rcode = highlevel_write(host, source, dest, data+3,
+	struct hpsb_packet *packet;
+	int length, rcode, extcode;
+	quadlet_t buffer;
+	nodeid_t source = data[1] >> 16;
+	nodeid_t dest = data[0] >> 16;
+	u16 flags = (u16) data[0];
+	u64 addr;
+
+	/* big FIXME - no error checking is done for an out of bounds length */
+
+	switch (tcode) {
+	case TCODE_WRITEQ:
+		addr = (((u64)(data[1] & 0xffff)) << 32) | data[2];
+		rcode = highlevel_write(host, source, dest, data+3,
 					addr, 4, flags);
 
-                if (!write_acked
-                    && (NODEID_TO_NODE(data[0] >> 16) != NODE_MASK)
-                    && (rcode >= 0)) {
-                        /* not a broadcast write, reply */
-                        PREP_REPLY_PACKET(0);
-                        fill_async_write_resp(packet, rcode);
-                        send_packet_nocare(packet);
-                }
-                break;
-
-        case TCODE_WRITEB:
-                addr = (((u64)(data[1] & 0xffff)) << 32) | data[2];
-                rcode = highlevel_write(host, source, dest, data+4,
+		if (!write_acked
+		    && (NODEID_TO_NODE(data[0] >> 16) != NODE_MASK)
+		    && (rcode >= 0)) {
+			/* not a broadcast write, reply */
+			PREP_REPLY_PACKET(0);
+			fill_async_write_resp(packet, rcode);
+			send_packet_nocare(packet);
+		}
+		break;
+
+	case TCODE_WRITEB:
+		addr = (((u64)(data[1] & 0xffff)) << 32) | data[2];
+		rcode = highlevel_write(host, source, dest, data+4,
 					addr, data[3]>>16, flags);
 
-                if (!write_acked
-                    && (NODEID_TO_NODE(data[0] >> 16) != NODE_MASK)
-                    && (rcode >= 0)) {
-                        /* not a broadcast write, reply */
-                        PREP_REPLY_PACKET(0);
-                        fill_async_write_resp(packet, rcode);
-                        send_packet_nocare(packet);
-                }
-                break;
-
-        case TCODE_READQ:
-                addr = (((u64)(data[1] & 0xffff)) << 32) | data[2];
-                rcode = highlevel_read(host, source, &buffer, addr, 4, flags);
-
-                if (rcode >= 0) {
-                        PREP_REPLY_PACKET(0);
-                        fill_async_readquad_resp(packet, rcode, buffer);
-                        send_packet_nocare(packet);
-                }
-                break;
-
-        case TCODE_READB:
-                length = data[3] >> 16;
-                PREP_REPLY_PACKET(length);
-
-                addr = (((u64)(data[1] & 0xffff)) << 32) | data[2];
-                rcode = highlevel_read(host, source, packet->data, addr,
-                                       length, flags);
-
-                if (rcode >= 0) {
-                        fill_async_readblock_resp(packet, rcode, length);
-                        send_packet_nocare(packet);
-                } else {
-                        hpsb_free_packet(packet);
-                }
-                break;
-
-        case TCODE_LOCK_REQUEST:
-                length = data[3] >> 16;
-                extcode = data[3] & 0xffff;
-                addr = (((u64)(data[1] & 0xffff)) << 32) | data[2];
-
-                PREP_REPLY_PACKET(8);
-
-                if ((extcode == 0) || (extcode >= 7)) {
-                        /* let switch default handle error */
-                        length = 0;
-                }
-
-                switch (length) {
-                case 4:
-                        rcode = highlevel_lock(host, source, packet->data, addr,
-                                               data[4], 0, extcode,flags);
-                        fill_async_lock_resp(packet, rcode, extcode, 4);
-                        break;
-                case 8:
-                        if ((extcode != EXTCODE_FETCH_ADD)
-                            && (extcode != EXTCODE_LITTLE_ADD)) {
-                                rcode = highlevel_lock(host, source,
-                                                       packet->data, addr,
-                                                       data[5], data[4],
-                                                       extcode, flags);
-                                fill_async_lock_resp(packet, rcode, extcode, 4);
-                        } else {
-                                rcode = highlevel_lock64(host, source,
-                                             (octlet_t *)packet->data, addr,
-                                             *(octlet_t *)(data + 4), 0ULL,
-                                             extcode, flags);
-                                fill_async_lock_resp(packet, rcode, extcode, 8);
-                        }
-                        break;
-                case 16:
-                        rcode = highlevel_lock64(host, source,
-                                                 (octlet_t *)packet->data, addr,
-                                                 *(octlet_t *)(data + 6),
-                                                 *(octlet_t *)(data + 4),
-                                                 extcode, flags);
-                        fill_async_lock_resp(packet, rcode, extcode, 8);
-                        break;
-                default:
-                        rcode = RCODE_TYPE_ERROR;
-                        fill_async_lock_resp(packet, rcode,
-                                             extcode, 0);
-                }
-
-                if (rcode >= 0) {
-                        send_packet_nocare(packet);
-                } else {
-                        hpsb_free_packet(packet);
-                }
-                break;
-        }
+		if (!write_acked
+		    && (NODEID_TO_NODE(data[0] >> 16) != NODE_MASK)
+		    && (rcode >= 0)) {
+			/* not a broadcast write, reply */
+			PREP_REPLY_PACKET(0);
+			fill_async_write_resp(packet, rcode);
+			send_packet_nocare(packet);
+		}
+		break;
+
+	case TCODE_READQ:
+		addr = (((u64)(data[1] & 0xffff)) << 32) | data[2];
+		rcode = highlevel_read(host, source, &buffer, addr, 4, flags);
+
+		if (rcode >= 0) {
+			PREP_REPLY_PACKET(0);
+			fill_async_readquad_resp(packet, rcode, buffer);
+			send_packet_nocare(packet);
+		}
+		break;
+
+	case TCODE_READB:
+		length = data[3] >> 16;
+		PREP_REPLY_PACKET(length);
+
+		addr = (((u64)(data[1] & 0xffff)) << 32) | data[2];
+		rcode = highlevel_read(host, source, packet->data, addr,
+				       length, flags);
+
+		if (rcode >= 0) {
+			fill_async_readblock_resp(packet, rcode, length);
+			send_packet_nocare(packet);
+		} else {
+			hpsb_free_packet(packet);
+		}
+		break;
+
+	case TCODE_LOCK_REQUEST:
+		length = data[3] >> 16;
+		extcode = data[3] & 0xffff;
+		addr = (((u64)(data[1] & 0xffff)) << 32) | data[2];
+
+		PREP_REPLY_PACKET(8);
+
+		if ((extcode == 0) || (extcode >= 7)) {
+			/* let switch default handle error */
+			length = 0;
+		}
+
+		switch (length) {
+		case 4:
+			rcode = highlevel_lock(host, source, packet->data, addr,
+					       data[4], 0, extcode,flags);
+			fill_async_lock_resp(packet, rcode, extcode, 4);
+			break;
+		case 8:
+			if ((extcode != EXTCODE_FETCH_ADD)
+			    && (extcode != EXTCODE_LITTLE_ADD)) {
+				rcode = highlevel_lock(host, source,
+						       packet->data, addr,
+						       data[5], data[4],
+						       extcode, flags);
+				fill_async_lock_resp(packet, rcode, extcode, 4);
+			} else {
+				rcode = highlevel_lock64(host, source,
+					     (octlet_t *)packet->data, addr,
+					     *(octlet_t *)(data + 4), 0ULL,
+					     extcode, flags);
+				fill_async_lock_resp(packet, rcode, extcode, 8);
+			}
+			break;
+		case 16:
+			rcode = highlevel_lock64(host, source,
+						 (octlet_t *)packet->data, addr,
+						 *(octlet_t *)(data + 6),
+						 *(octlet_t *)(data + 4),
+						 extcode, flags);
+			fill_async_lock_resp(packet, rcode, extcode, 8);
+			break;
+		default:
+			rcode = RCODE_TYPE_ERROR;
+			fill_async_lock_resp(packet, rcode,
+					     extcode, 0);
+		}
+
+		if (rcode >= 0) {
+			send_packet_nocare(packet);
+		} else {
+			hpsb_free_packet(packet);
+		}
+		break;
+	}
 
 }
 #undef PREP_REPLY_PACKET
 
 
 void hpsb_packet_received(struct hpsb_host *host, quadlet_t *data, size_t size,
-                          int write_acked)
+			  int write_acked)
 {
-        int tcode;
-
-        if (host->in_bus_reset) {
-                HPSB_INFO("received packet during reset; ignoring");
-                return;
-        }
-
-        dump_packet("received packet", data, size, -1);
-
-        tcode = (data[0] >> 4) & 0xf;
-
-        switch (tcode) {
-        case TCODE_WRITE_RESPONSE:
-        case TCODE_READQ_RESPONSE:
-        case TCODE_READB_RESPONSE:
-        case TCODE_LOCK_RESPONSE:
-                handle_packet_response(host, tcode, data, size);
-                break;
-
-        case TCODE_WRITEQ:
-        case TCODE_WRITEB:
-        case TCODE_READQ:
-        case TCODE_READB:
-        case TCODE_LOCK_REQUEST:
-                handle_incoming_packet(host, tcode, data, size, write_acked);
-                break;
-
-
-        case TCODE_ISO_DATA:
-                highlevel_iso_receive(host, data, size);
-                break;
-
-        case TCODE_CYCLE_START:
-                /* simply ignore this packet if it is passed on */
-                break;
-
-        default:
-                HPSB_NOTICE("received packet with bogus transaction code %d",
-                            tcode);
-                break;
-        }
+	int tcode;
+
+	if (host->in_bus_reset) {
+		HPSB_INFO("received packet during reset; ignoring");
+		return;
+	}
+
+	dump_packet("received packet", data, size, -1);
+
+	tcode = (data[0] >> 4) & 0xf;
+
+	switch (tcode) {
+	case TCODE_WRITE_RESPONSE:
+	case TCODE_READQ_RESPONSE:
+	case TCODE_READB_RESPONSE:
+	case TCODE_LOCK_RESPONSE:
+		handle_packet_response(host, tcode, data, size);
+		break;
+
+	case TCODE_WRITEQ:
+	case TCODE_WRITEB:
+	case TCODE_READQ:
+	case TCODE_READB:
+	case TCODE_LOCK_REQUEST:
+		handle_incoming_packet(host, tcode, data, size, write_acked);
+		break;
+
+
+	case TCODE_ISO_DATA:
+		highlevel_iso_receive(host, data, size);
+		break;
+
+	case TCODE_CYCLE_START:
+		/* simply ignore this packet if it is passed on */
+		break;
+
+	default:
+		HPSB_NOTICE("received packet with bogus transaction code %d",
+			    tcode);
+		break;
+	}
 }
 
 
@@ -1126,7 +1126,7 @@ static int __init ieee1394_init(void)
 		   nodemgr implements functionality required of ieee1394a-2000
 		   IRMs */
 		hpsb_disable_irm = 1;
-                      
+
 		return 0;
 	}
 
diff --git a/drivers/ieee1394/ieee1394_core.h b/drivers/ieee1394/ieee1394_core.h
index 0b31429..b354660 100644
--- a/drivers/ieee1394/ieee1394_core.h
+++ b/drivers/ieee1394/ieee1394_core.h
@@ -10,8 +10,8 @@
 
 
 struct hpsb_packet {
-        /* This struct is basically read-only for hosts with the exception of
-         * the data buffer contents and xnext - see below. */
+	/* This struct is basically read-only for hosts with the exception of
+	 * the data buffer contents and xnext - see below. */
 
 	/* This can be used for host driver internal linking.
 	 *
@@ -21,47 +21,47 @@ struct hpsb_packet {
 	 * driver_list when free'ing it. */
 	struct list_head driver_list;
 
-        nodeid_t node_id;
+	nodeid_t node_id;
 
-        /* Async and Iso types should be clear, raw means send-as-is, do not
-         * CRC!  Byte swapping shall still be done in this case. */
-        enum { hpsb_async, hpsb_iso, hpsb_raw } __attribute__((packed)) type;
+	/* Async and Iso types should be clear, raw means send-as-is, do not
+	 * CRC!  Byte swapping shall still be done in this case. */
+	enum { hpsb_async, hpsb_iso, hpsb_raw } __attribute__((packed)) type;
 
-        /* Okay, this is core internal and a no care for hosts.
-         * queued   = queued for sending
-         * pending  = sent, waiting for response
-         * complete = processing completed, successful or not
-         */
-        enum {
-                hpsb_unused, hpsb_queued, hpsb_pending, hpsb_complete
-        } __attribute__((packed)) state;
+	/* Okay, this is core internal and a no care for hosts.
+	 * queued   = queued for sending
+	 * pending  = sent, waiting for response
+	 * complete = processing completed, successful or not
+	 */
+	enum {
+		hpsb_unused, hpsb_queued, hpsb_pending, hpsb_complete
+	} __attribute__((packed)) state;
 
-        /* These are core internal. */
-        signed char tlabel;
+	/* These are core internal. */
+	signed char tlabel;
 	signed char ack_code;
 	unsigned char tcode;
 
-        unsigned expect_response:1;
-        unsigned no_waiter:1;
+	unsigned expect_response:1;
+	unsigned no_waiter:1;
 
-        /* Speed to transmit with: 0 = 100Mbps, 1 = 200Mbps, 2 = 400Mbps */
-        unsigned speed_code:2;
+	/* Speed to transmit with: 0 = 100Mbps, 1 = 200Mbps, 2 = 400Mbps */
+	unsigned speed_code:2;
 
-        /*
-         * *header and *data are guaranteed to be 32-bit DMAable and may be
-         * overwritten to allow in-place byte swapping.  Neither of these is
-         * CRCed (the sizes also don't include CRC), but contain space for at
-         * least one additional quadlet to allow in-place CRCing.  The memory is
-         * also guaranteed to be DMA mappable.
-         */
-        quadlet_t *header;
-        quadlet_t *data;
-        size_t header_size;
-        size_t data_size;
+	/*
+	 * *header and *data are guaranteed to be 32-bit DMAable and may be
+	 * overwritten to allow in-place byte swapping.  Neither of these is
+	 * CRCed (the sizes also don't include CRC), but contain space for at
+	 * least one additional quadlet to allow in-place CRCing.  The memory is
+	 * also guaranteed to be DMA mappable.
+	 */
+	quadlet_t *header;
+	quadlet_t *data;
+	size_t header_size;
+	size_t data_size;
 
 
-        struct hpsb_host *host;
-        unsigned int generation;
+	struct hpsb_host *host;
+	unsigned int generation;
 
 	atomic_t refcnt;
 
@@ -73,10 +73,10 @@ struct hpsb_packet {
 	/* XXX This is just a hack at the moment */
 	struct sk_buff *skb;
 
-        /* Store jiffies for implementing bus timeouts. */
-        unsigned long sendtime;
+	/* Store jiffies for implementing bus timeouts. */
+	unsigned long sendtime;
 
-        quadlet_t embedded_header[5];
+	quadlet_t embedded_header[5];
 };
 
 /* Set a task for when a packet completes */
@@ -102,7 +102,7 @@ void hpsb_free_packet(struct hpsb_packet *packet);
  */
 static inline unsigned int get_hpsb_generation(struct hpsb_host *host)
 {
-        return atomic_read(&host->generation);
+	return atomic_read(&host->generation);
 }
 
 /*
@@ -157,7 +157,7 @@ void hpsb_selfid_complete(struct hpsb_host *host, int phyid, int isroot);
  * from within a transmit packet routine.
  */
 void hpsb_packet_sent(struct hpsb_host *host, struct hpsb_packet *packet,
-                      int ackcode);
+		      int ackcode);
 
 /*
  * Hand over received packet to the core.  The contents of data are expected to
@@ -171,7 +171,7 @@ void hpsb_packet_sent(struct hpsb_host *host, struct hpsb_packet *packet,
  * packet type.
  */
 void hpsb_packet_received(struct hpsb_host *host, quadlet_t *data, size_t size,
-                          int write_acked);
+			  int write_acked);
 
 
 /*
@@ -197,20 +197,20 @@ void hpsb_packet_received(struct hpsb_host *host, quadlet_t *data, size_t size,
  * Block 15 (240-255)  reserved for drivers under development, etc.
  */
 
-#define IEEE1394_MAJOR               171
+#define IEEE1394_MAJOR			 171
 
-#define IEEE1394_MINOR_BLOCK_RAW1394       0
-#define IEEE1394_MINOR_BLOCK_VIDEO1394     1
-#define IEEE1394_MINOR_BLOCK_DV1394        2
-#define IEEE1394_MINOR_BLOCK_AMDTP         3
+#define IEEE1394_MINOR_BLOCK_RAW1394	   0
+#define IEEE1394_MINOR_BLOCK_VIDEO1394	   1
+#define IEEE1394_MINOR_BLOCK_DV1394	   2
+#define IEEE1394_MINOR_BLOCK_AMDTP	   3
 #define IEEE1394_MINOR_BLOCK_EXPERIMENTAL 15
 
-#define IEEE1394_CORE_DEV		MKDEV(IEEE1394_MAJOR, 0)
-#define IEEE1394_RAW1394_DEV		MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_RAW1394 * 16)
-#define IEEE1394_VIDEO1394_DEV		MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_VIDEO1394 * 16)
-#define IEEE1394_DV1394_DEV		MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_DV1394 * 16)
-#define IEEE1394_AMDTP_DEV		MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_AMDTP * 16)
-#define IEEE1394_EXPERIMENTAL_DEV	MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_EXPERIMENTAL * 16)
+#define IEEE1394_CORE_DEV	  MKDEV(IEEE1394_MAJOR, 0)
+#define IEEE1394_RAW1394_DEV	  MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_RAW1394 * 16)
+#define IEEE1394_VIDEO1394_DEV	  MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_VIDEO1394 * 16)
+#define IEEE1394_DV1394_DEV	  MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_DV1394 * 16)
+#define IEEE1394_AMDTP_DEV	  MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_AMDTP * 16)
+#define IEEE1394_EXPERIMENTAL_DEV MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_EXPERIMENTAL * 16)
 
 /* return the index (within a minor number block) of a file */
 static inline unsigned char ieee1394_file_to_instance(struct file *file)
-- 
cgit v1.1


From 61c7f775ca25ccfc0e51486103a724fb1a3a08f2 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Mon, 5 Dec 2005 16:28:59 -0500
Subject: ieee1394: write broadcast_channel only to select nodes (fixes device
 recognition)

Some old 1394-1995 SBP-2 bridges would hang if they received a broadcast write
request to BROADCAST_CHANNEL before the config ROM was read.  Affected devices
include Datafab MD2-FW2 2.5" HDD and SmartDisk VST FWCDRW-V8 portable CD writer.
The write request is now directed to specific nodes instead of being broadcast
to all nodes at once, and it is only performed if a previous read request at
this register succeeded.

Fixes an old interoperability problem which was perceived as a 2.6.14-specific
regression: http://marc.theaimsgroup.com/?t=113190586800003

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/nodemgr.c | 42 ++++++++++++++++++++++++++++++++----------
 1 file changed, 32 insertions(+), 10 deletions(-)

diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c
index f4b6025..01ab2bf 100644
--- a/drivers/ieee1394/nodemgr.c
+++ b/drivers/ieee1394/nodemgr.c
@@ -1346,6 +1346,33 @@ static void nodemgr_update_pdrv(struct node_entry *ne)
 }
 
 
+/* Write the BROADCAST_CHANNEL as per IEEE1394a 8.3.2.3.11 and 8.4.2.3.  This
+ * seems like an optional service but in the end it is practically mandatory
+ * as a consequence of these clauses.
+ *
+ * Note that we cannot do a broadcast write to all nodes at once because some
+ * pre-1394a devices would hang. */
+static void nodemgr_irm_write_bc(struct node_entry *ne, int generation)
+{
+	const u64 bc_addr = (CSR_REGISTER_BASE | CSR_BROADCAST_CHANNEL);
+	quadlet_t bc_remote, bc_local;
+	int ret;
+
+	if (!ne->host->is_irm || ne->generation != generation ||
+	    ne->nodeid == ne->host->node_id)
+		return;
+
+	bc_local = cpu_to_be32(ne->host->csr.broadcast_channel);
+
+	/* Check if the register is implemented and 1394a compliant. */
+	ret = hpsb_read(ne->host, ne->nodeid, generation, bc_addr, &bc_remote,
+			sizeof(bc_remote));
+	if (!ret && bc_remote & cpu_to_be32(0x80000000) &&
+	    bc_remote != bc_local)
+		hpsb_node_write(ne, bc_addr, &bc_local, sizeof(bc_local));
+}
+
+
 static void nodemgr_probe_ne(struct host_info *hi, struct node_entry *ne, int generation)
 {
 	struct device *dev;
@@ -1357,6 +1384,8 @@ static void nodemgr_probe_ne(struct host_info *hi, struct node_entry *ne, int ge
 	if (!dev)
 		return;
 
+	nodemgr_irm_write_bc(ne, generation);
+
 	/* If "needs_probe", then this is either a new or changed node we
 	 * rescan totally. If the generation matches for an existing node
 	 * (one that existed prior to the bus reset) we send update calls
@@ -1429,9 +1458,7 @@ static int nodemgr_send_resume_packet(struct hpsb_host *host)
 	return ret;
 }
 
-/* Because we are a 1394a-2000 compliant IRM, we need to inform all the other
- * nodes of the broadcast channel.  (Really we're only setting the validity
- * bit). Other IRM responsibilities go in here as well. */
+/* Perform a few high-level IRM responsibilities. */
 static int nodemgr_do_irm_duties(struct hpsb_host *host, int cycles)
 {
 	quadlet_t bc;
@@ -1440,13 +1467,8 @@ static int nodemgr_do_irm_duties(struct hpsb_host *host, int cycles)
 	if (!host->is_irm || host->irm_id == (nodeid_t)-1)
 		return 1;
 
-	host->csr.broadcast_channel |= 0x40000000;  /* set validity bit */
-
-	bc = cpu_to_be32(host->csr.broadcast_channel);
-
-	hpsb_write(host, LOCAL_BUS | ALL_NODES, get_hpsb_generation(host),
-		   (CSR_REGISTER_BASE | CSR_BROADCAST_CHANNEL),
-		   &bc, sizeof(quadlet_t));
+	/* We are a 1394a-2000 compliant IRM. Set the validity bit. */
+	host->csr.broadcast_channel |= 0x40000000;
 
 	/* If there is no bus manager then we should set the root node's
 	 * force_root bit to promote bus stability per the 1394
-- 
cgit v1.1


From e38dc0ae24635a2a8a68d87cd0f4a13e74a52d98 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Mon, 5 Dec 2005 16:29:02 -0500
Subject: ieee1394: remove nonexistent functions from nodemgr.h

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/nodemgr.h | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/drivers/ieee1394/nodemgr.h b/drivers/ieee1394/nodemgr.h
index 3a2f0c0..0b26616 100644
--- a/drivers/ieee1394/nodemgr.h
+++ b/drivers/ieee1394/nodemgr.h
@@ -151,24 +151,6 @@ static inline int hpsb_node_entry_valid(struct node_entry *ne)
 }
 
 /*
- * Returns a node entry (which has its reference count incremented) or NULL if
- * the GUID in question is not known.  Getting a valid entry does not mean that
- * the node with this GUID is currently accessible (might be powered down).
- */
-struct node_entry *hpsb_guid_get_entry(u64 guid);
-
-/* Same as above, but use the nodeid to get an node entry. This is not
- * fool-proof by itself, since the nodeid can change.  */
-struct node_entry *hpsb_nodeid_get_entry(struct hpsb_host *host, nodeid_t nodeid);
-
-/*
- * If the entry refers to a local host, this function will return the pointer
- * to the hpsb_host structure.  It will return NULL otherwise.  Once you have
- * established it is a local host, you can use that knowledge from then on (the
- * GUID won't wander to an external node).  */
-struct hpsb_host *hpsb_get_host_by_ne(struct node_entry *ne);
-
-/*
  * This will fill in the given, pre-initialised hpsb_packet with the current
  * information from the node entry (host, node ID, generation number).  It will
  * return false if the node owning the GUID is not accessible (and not modify the
-- 
cgit v1.1


From 51c1d80e929bace26d2d795bd77fcc14b02ba3bb Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Mon, 12 Dec 2005 23:03:19 -0500
Subject: ieee1394: run high-level updates before high-level probes

After a bus reset, let nodemgr call high-level update hooks first for nodes
which do not need to be probed.  The main benefit is for a bus with more
than one SBP-2 device:  SBP-2 reconnects will be performed before SBP-2
logins and thus have a much higher chance of succeeding, and their SCSI
devices will not be blocked much longer than necessary.  This was
demonstrated for Linux 2.4 by Dave Cinege a while ago.

A better approach would be to perform time-consuming probes in parallel by a
subthread.  I actually plan to implement this for sbp2 but it may take a
while to get that done and tested.  Until then, this tweak is a huge
improvement for users with multiple SBP-2 devices.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/nodemgr.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c
index 01ab2bf..0ec2987 100644
--- a/drivers/ieee1394/nodemgr.c
+++ b/drivers/ieee1394/nodemgr.c
@@ -1407,14 +1407,28 @@ static void nodemgr_node_probe(struct host_info *hi, int generation)
 	struct hpsb_host *host = hi->host;
 	struct class *class = &nodemgr_ne_class;
 	struct class_device *cdev;
+	struct node_entry *ne;
 
 	/* Do some processing of the nodes we've probed. This pulls them
 	 * into the sysfs layer if needed, and can result in processing of
 	 * unit-directories, or just updating the node and it's
-	 * unit-directories. */
+	 * unit-directories.
+	 *
+	 * Run updates before probes. Usually, updates are time-critical
+	 * while probes are time-consuming. (Well, those probes need some
+	 * improvement...) */
+
 	down_read(&class->subsys.rwsem);
-	list_for_each_entry(cdev, &class->children, node)
-		nodemgr_probe_ne(hi, container_of(cdev, struct node_entry, class_dev), generation);
+	list_for_each_entry(cdev, &class->children, node) {
+		ne = container_of(cdev, struct node_entry, class_dev);
+		if (!ne->needs_probe)
+			nodemgr_probe_ne(hi, ne, generation);
+	}
+	list_for_each_entry(cdev, &class->children, node) {
+		ne = container_of(cdev, struct node_entry, class_dev);
+		if (ne->needs_probe)
+			nodemgr_probe_ne(hi, ne, generation);
+	}
         up_read(&class->subsys.rwsem);
 
 
-- 
cgit v1.1


From 43863eba763e0c91e33e342ce5b7650fea594a53 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Mon, 12 Dec 2005 23:03:24 -0500
Subject: sbp2: delete sbp2scsi_direction_table

DMA_BIDIRECTIONAL data direction may be handled properly by Linux in the
future.  For now, reject it instead of converting it to another direction.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/sbp2.c | 45 ++++++++++++++++-----------------------------
 drivers/ieee1394/sbp2.h | 40 +---------------------------------------
 2 files changed, 17 insertions(+), 68 deletions(-)

diff --git a/drivers/ieee1394/sbp2.c b/drivers/ieee1394/sbp2.c
index f0763b7..372a772 100644
--- a/drivers/ieee1394/sbp2.c
+++ b/drivers/ieee1394/sbp2.c
@@ -1740,28 +1740,15 @@ static int sbp2_create_command_orb(struct scsi_id_instance_data *scsi_id,
 	command_orb->misc |= ORB_SET_SPEED(scsi_id->speed_code);
 	command_orb->misc |= ORB_SET_NOTIFY(1);	/* Notify us when complete */
 
-	/*
-	 * Get the direction of the transfer. If the direction is unknown, then use our
-	 * goofy table as a back-up.
-	 */
-	switch (dma_dir) {
-	case DMA_NONE:
+	if (dma_dir == DMA_NONE)
 		orb_direction = ORB_DIRECTION_NO_DATA_TRANSFER;
-		break;
-	case DMA_TO_DEVICE:
+	else if (dma_dir == DMA_TO_DEVICE && scsi_request_bufflen)
 		orb_direction = ORB_DIRECTION_WRITE_TO_MEDIA;
-		break;
-	case DMA_FROM_DEVICE:
+	else if (dma_dir == DMA_FROM_DEVICE && scsi_request_bufflen)
 		orb_direction = ORB_DIRECTION_READ_FROM_MEDIA;
-		break;
-	case DMA_BIDIRECTIONAL:
-	default:
-		SBP2_ERR("SCSI data transfer direction not specified. "
-			 "Update the SBP2 direction table in sbp2.h if "
-			 "necessary for your application");
-		__scsi_print_command(scsi_cmd);
-		orb_direction = sbp2scsi_direction_table[*scsi_cmd];
-		break;
+	else {
+		SBP2_WARN("Falling back to DMA_NONE");
+		orb_direction = ORB_DIRECTION_NO_DATA_TRANSFER;
 	}
 
 	/*
@@ -1880,16 +1867,6 @@ static int sbp2_create_command_orb(struct scsi_id_instance_data *scsi_id,
 			command_orb->misc |= ORB_SET_DATA_SIZE(scsi_request_bufflen);
 			command_orb->misc |= ORB_SET_DIRECTION(orb_direction);
 
-			/*
-			 * Sanity, in case our direction table is not
-			 * up-to-date
-			 */
-			if (!scsi_request_bufflen) {
-				command_orb->data_descriptor_hi = 0x0;
-				command_orb->data_descriptor_lo = 0x0;
-				command_orb->misc |= ORB_SET_DIRECTION(1);
-			}
-
 		} else {
 			/*
 			 * Need to turn this into page tables, since the
@@ -2371,6 +2348,16 @@ static int sbp2scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	}
 
 	/*
+	 * Bidirectional commands are not yet implemented,
+	 * and unknown transfer direction not handled.
+	 */
+	if (SCpnt->sc_data_direction == DMA_BIDIRECTIONAL) {
+		SBP2_ERR("Cannot handle DMA_BIDIRECTIONAL - rejecting command");
+		result = DID_ERROR << 16;
+		goto done;
+	}
+
+	/*
 	 * Try and send our SCSI command
 	 */
 	if (sbp2_send_command(scsi_id, SCpnt, done)) {
diff --git a/drivers/ieee1394/sbp2.h b/drivers/ieee1394/sbp2.h
index abc647b..8e227c5 100644
--- a/drivers/ieee1394/sbp2.h
+++ b/drivers/ieee1394/sbp2.h
@@ -260,45 +260,7 @@ struct sbp2_status_block {
 #define SBP2_MAX_SG_ELEMENT_LENGTH	0xf000
 #define SBP2_MAX_UDS_PER_NODE		16	/* Maximum scsi devices per node */
 #define SBP2_MAX_SECTORS		255	/* Max sectors supported */
-
-/*
- * SCSI direction table...
- * (now used as a back-up in case the direction passed down from above is "unknown")
- *
- * DIN = IN data direction
- * DOU = OUT data direction
- * DNO = No data transfer
- * DUN = Unknown data direction
- *
- * Opcode 0xec (Teac specific "opc execute") possibly should be DNO,
- * but we'll change it when somebody reports a problem with this.
- */
-#define DIN				ORB_DIRECTION_READ_FROM_MEDIA
-#define DOU				ORB_DIRECTION_WRITE_TO_MEDIA
-#define DNO				ORB_DIRECTION_NO_DATA_TRANSFER
-#define DUN				DIN
-
-static unchar sbp2scsi_direction_table[0x100] = {
-	DNO,DNO,DIN,DIN,DOU,DIN,DIN,DOU,DIN,DUN,DOU,DOU,DUN,DUN,DUN,DIN,
-	DNO,DIN,DIN,DOU,DIN,DOU,DNO,DNO,DOU,DNO,DIN,DNO,DIN,DOU,DNO,DUN,
-	DIN,DUN,DIN,DIN,DOU,DIN,DUN,DUN,DIN,DIN,DOU,DNO,DUN,DIN,DOU,DOU,
-	DOU,DOU,DOU,DNO,DIN,DNO,DNO,DIN,DOU,DOU,DOU,DOU,DIN,DOU,DIN,DOU,
-	DOU,DOU,DIN,DIN,DIN,DNO,DIN,DNO,DNO,DNO,DUN,DNO,DOU,DIN,DNO,DUN,
-	DUN,DIN,DIN,DNO,DNO,DOU,DUN,DUN,DNO,DIN,DIN,DNO,DIN,DOU,DUN,DUN,
-	DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,
-	DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,
-	DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,
-	DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,
-	DUN,DNO,DOU,DOU,DIN,DNO,DNO,DNO,DIN,DNO,DOU,DUN,DNO,DIN,DOU,DOU,
-	DOU,DOU,DOU,DNO,DUN,DIN,DOU,DIN,DIN,DIN,DNO,DNO,DNO,DIN,DIN,DUN,
-	DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,
-	DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,
-	DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DOU,DUN,DUN,DUN,DUN,DUN,
-	DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN
-};
-
-/* This should be safe */
-#define SBP2_MAX_CMDS		8
+#define SBP2_MAX_CMDS			8	/* This should be safe */
 
 /* This is the two dma types we use for cmd_dma below */
 enum cmd_dma_types {
-- 
cgit v1.1


From dc3edd5412341b02d84144ddfd5bf6ccaaeeb1ac Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Mon, 12 Dec 2005 23:03:30 -0500
Subject: sbp2: did not clean up after scsi_add_device() failed

If scsi_add_device() at the end of sbp2_start_device() fails, e.g. due to
transport errors during SCSI inquiry, sbp2 needs to log out of the device
and release all associated resources.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/sbp2.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/ieee1394/sbp2.c b/drivers/ieee1394/sbp2.c
index 372a772..5b9d03e 100644
--- a/drivers/ieee1394/sbp2.c
+++ b/drivers/ieee1394/sbp2.c
@@ -960,6 +960,8 @@ alloc_fail:
 	error = scsi_add_device(scsi_id->scsi_host, 0, scsi_id->ud->id, 0);
 	if (error) {
 		SBP2_ERR("scsi_add_device failed");
+		sbp2_logout_device(scsi_id);
+		sbp2_remove_device(scsi_id);
 		return error;
 	}
 
-- 
cgit v1.1


From 209171a17a908605e516d11436371337a5d87f06 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Tue, 13 Dec 2005 11:05:00 -0500
Subject: ohci1394: log number of implemented isochronous contexts

Print the number of IR and IT contexts which the hardware implements
as an informational log message when ohci1394 initializes.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/ohci1394.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/ieee1394/ohci1394.c b/drivers/ieee1394/ohci1394.c
index 97b6f48..b6b96fa 100644
--- a/drivers/ieee1394/ohci1394.c
+++ b/drivers/ieee1394/ohci1394.c
@@ -584,12 +584,13 @@ static void ohci_initialize(struct ti_ohci *ohci)
 	sprintf (irq_buf, "%s", __irq_itoa(ohci->dev->irq));
 #endif
 	PRINT(KERN_INFO, "OHCI-1394 %d.%d (PCI): IRQ=[%s]  "
-	      "MMIO=[%lx-%lx]  Max Packet=[%d]",
+	      "MMIO=[%lx-%lx]  Max Packet=[%d]  IR/IT contexts=[%d/%d]",
 	      ((((buf) >> 16) & 0xf) + (((buf) >> 20) & 0xf) * 10),
 	      ((((buf) >> 4) & 0xf) + ((buf) & 0xf) * 10), irq_buf,
 	      pci_resource_start(ohci->dev, 0),
 	      pci_resource_start(ohci->dev, 0) + OHCI1394_REGISTER_SIZE - 1,
-	      ohci->max_packet_size);
+	      ohci->max_packet_size,
+	      ohci->nb_iso_rcv_ctx, ohci->nb_iso_xmit_ctx);
 
 	/* Check all of our ports to make sure that if anything is
 	 * connected, we enable that port. */
@@ -3351,13 +3352,8 @@ static int __devinit ohci1394_pci_probe(struct pci_dev *dev,
 	/* Determine the number of available IR and IT contexts. */
 	ohci->nb_iso_rcv_ctx =
 		get_nb_iso_ctx(ohci, OHCI1394_IsoRecvIntMaskSet);
-	DBGMSG("%d iso receive contexts available",
-	       ohci->nb_iso_rcv_ctx);
-
 	ohci->nb_iso_xmit_ctx =
 		get_nb_iso_ctx(ohci, OHCI1394_IsoXmitIntMaskSet);
-	DBGMSG("%d iso transmit contexts available",
-	       ohci->nb_iso_xmit_ctx);
 
 	/* Set the usage bits for non-existent contexts so they can't
 	 * be allocated */
-- 
cgit v1.1


From cf8d2c0965b891a5efce8c3a9a07a522e91ddba2 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Tue, 13 Dec 2005 11:05:03 -0500
Subject: sbp2: split sbp2_create_command_orb() for better readability

sbp2_create_command_orb() code cleanup:
 - add two helper functions to reduce nesting depth
 - omit the return value which was always ignored
 - remove unnecessary declaration from sbp2.h

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/sbp2.c | 355 ++++++++++++++++++++++++------------------------
 drivers/ieee1394/sbp2.h |   7 -
 2 files changed, 178 insertions(+), 184 deletions(-)

diff --git a/drivers/ieee1394/sbp2.c b/drivers/ieee1394/sbp2.c
index 5b9d03e..14b0c35 100644
--- a/drivers/ieee1394/sbp2.c
+++ b/drivers/ieee1394/sbp2.c
@@ -1707,26 +1707,184 @@ static int sbp2_agent_reset(struct scsi_id_instance_data *scsi_id, int wait)
 	return 0;
 }
 
+static void sbp2_prep_command_orb_sg(struct sbp2_command_orb *orb,
+				     struct sbp2scsi_host_info *hi,
+				     struct sbp2_command_info *command,
+				     unsigned int scsi_use_sg,
+				     struct scatterlist *sgpnt,
+				     u32 orb_direction,
+				     enum dma_data_direction dma_dir)
+{
+	command->dma_dir = dma_dir;
+	orb->data_descriptor_hi = ORB_SET_NODE_ID(hi->host->node_id);
+	orb->misc |= ORB_SET_DIRECTION(orb_direction);
+
+	/* Special case if only one element (and less than 64KB in size) */
+	if ((scsi_use_sg == 1) &&
+	    (sgpnt[0].length <= SBP2_MAX_SG_ELEMENT_LENGTH)) {
+
+		SBP2_DEBUG("Only one s/g element");
+		command->dma_size = sgpnt[0].length;
+		command->dma_type = CMD_DMA_PAGE;
+		command->cmd_dma = pci_map_page(hi->host->pdev,
+						sgpnt[0].page,
+						sgpnt[0].offset,
+						command->dma_size,
+						command->dma_dir);
+		SBP2_DMA_ALLOC("single page scatter element");
+
+		orb->data_descriptor_lo = command->cmd_dma;
+		orb->misc |= ORB_SET_DATA_SIZE(command->dma_size);
+
+	} else {
+		struct sbp2_unrestricted_page_table *sg_element =
+					&command->scatter_gather_element[0];
+		u32 sg_count, sg_len;
+		dma_addr_t sg_addr;
+		int i, count = pci_map_sg(hi->host->pdev, sgpnt, scsi_use_sg,
+					  dma_dir);
+
+		SBP2_DMA_ALLOC("scatter list");
+
+		command->dma_size = scsi_use_sg;
+		command->sge_buffer = sgpnt;
+
+		/* use page tables (s/g) */
+		orb->misc |= ORB_SET_PAGE_TABLE_PRESENT(0x1);
+		orb->data_descriptor_lo = command->sge_dma;
+
+		/*
+		 * Loop through and fill out our sbp-2 page tables
+		 * (and split up anything too large)
+		 */
+		for (i = 0, sg_count = 0 ; i < count; i++, sgpnt++) {
+			sg_len = sg_dma_len(sgpnt);
+			sg_addr = sg_dma_address(sgpnt);
+			while (sg_len) {
+				sg_element[sg_count].segment_base_lo = sg_addr;
+				if (sg_len > SBP2_MAX_SG_ELEMENT_LENGTH) {
+					sg_element[sg_count].length_segment_base_hi =
+						PAGE_TABLE_SET_SEGMENT_LENGTH(SBP2_MAX_SG_ELEMENT_LENGTH);
+					sg_addr += SBP2_MAX_SG_ELEMENT_LENGTH;
+					sg_len -= SBP2_MAX_SG_ELEMENT_LENGTH;
+				} else {
+					sg_element[sg_count].length_segment_base_hi =
+						PAGE_TABLE_SET_SEGMENT_LENGTH(sg_len);
+					sg_len = 0;
+				}
+				sg_count++;
+			}
+		}
+
+		/* Number of page table (s/g) elements */
+		orb->misc |= ORB_SET_DATA_SIZE(sg_count);
+
+		sbp2util_packet_dump(sg_element,
+				     (sizeof(struct sbp2_unrestricted_page_table)) * sg_count,
+				     "sbp2 s/g list", command->sge_dma);
+
+		/* Byte swap page tables if necessary */
+		sbp2util_cpu_to_be32_buffer(sg_element,
+					    (sizeof(struct sbp2_unrestricted_page_table)) *
+					    sg_count);
+	}
+}
+
+static void sbp2_prep_command_orb_no_sg(struct sbp2_command_orb *orb,
+					struct sbp2scsi_host_info *hi,
+					struct sbp2_command_info *command,
+					struct scatterlist *sgpnt,
+					u32 orb_direction,
+					unsigned int scsi_request_bufflen,
+					void *scsi_request_buffer,
+					enum dma_data_direction dma_dir)
+{
+	command->dma_dir = dma_dir;
+	command->dma_size = scsi_request_bufflen;
+	command->dma_type = CMD_DMA_SINGLE;
+	command->cmd_dma = pci_map_single(hi->host->pdev, scsi_request_buffer,
+					  command->dma_size, command->dma_dir);
+	orb->data_descriptor_hi = ORB_SET_NODE_ID(hi->host->node_id);
+	orb->misc |= ORB_SET_DIRECTION(orb_direction);
+
+	SBP2_DMA_ALLOC("single bulk");
+
+	/*
+	 * Handle case where we get a command w/o s/g enabled (but
+	 * check for transfers larger than 64K)
+	 */
+	if (scsi_request_bufflen <= SBP2_MAX_SG_ELEMENT_LENGTH) {
+
+		orb->data_descriptor_lo = command->cmd_dma;
+		orb->misc |= ORB_SET_DATA_SIZE(scsi_request_bufflen);
+
+	} else {
+		struct sbp2_unrestricted_page_table *sg_element =
+			&command->scatter_gather_element[0];
+		u32 sg_count, sg_len;
+		dma_addr_t sg_addr;
+
+		/*
+		 * Need to turn this into page tables, since the
+		 * buffer is too large.
+		 */
+		orb->data_descriptor_lo = command->sge_dma;
+
+		/* Use page tables (s/g) */
+		orb->misc |= ORB_SET_PAGE_TABLE_PRESENT(0x1);
+
+		/*
+		 * fill out our sbp-2 page tables (and split up
+		 * the large buffer)
+		 */
+		sg_count = 0;
+		sg_len = scsi_request_bufflen;
+		sg_addr = command->cmd_dma;
+		while (sg_len) {
+			sg_element[sg_count].segment_base_lo = sg_addr;
+			if (sg_len > SBP2_MAX_SG_ELEMENT_LENGTH) {
+				sg_element[sg_count].length_segment_base_hi =
+					PAGE_TABLE_SET_SEGMENT_LENGTH(SBP2_MAX_SG_ELEMENT_LENGTH);
+				sg_addr += SBP2_MAX_SG_ELEMENT_LENGTH;
+				sg_len -= SBP2_MAX_SG_ELEMENT_LENGTH;
+			} else {
+				sg_element[sg_count].length_segment_base_hi =
+					PAGE_TABLE_SET_SEGMENT_LENGTH(sg_len);
+				sg_len = 0;
+			}
+			sg_count++;
+		}
+
+		/* Number of page table (s/g) elements */
+		orb->misc |= ORB_SET_DATA_SIZE(sg_count);
+
+		sbp2util_packet_dump(sg_element,
+				     (sizeof(struct sbp2_unrestricted_page_table)) * sg_count,
+				     "sbp2 s/g list", command->sge_dma);
+
+		/* Byte swap page tables if necessary */
+		sbp2util_cpu_to_be32_buffer(sg_element,
+					    (sizeof(struct sbp2_unrestricted_page_table)) *
+					     sg_count);
+	}
+}
+
 /*
  * This function is called to create the actual command orb and s/g list
  * out of the scsi command itself.
  */
-static int sbp2_create_command_orb(struct scsi_id_instance_data *scsi_id,
-				   struct sbp2_command_info *command,
-				   unchar *scsi_cmd,
-				   unsigned int scsi_use_sg,
-				   unsigned int scsi_request_bufflen,
-				   void *scsi_request_buffer,
-				   enum dma_data_direction dma_dir)
+static void sbp2_create_command_orb(struct scsi_id_instance_data *scsi_id,
+				    struct sbp2_command_info *command,
+				    unchar *scsi_cmd,
+				    unsigned int scsi_use_sg,
+				    unsigned int scsi_request_bufflen,
+				    void *scsi_request_buffer,
+				    enum dma_data_direction dma_dir)
 {
 	struct sbp2scsi_host_info *hi = scsi_id->hi;
 	struct scatterlist *sgpnt = (struct scatterlist *)scsi_request_buffer;
 	struct sbp2_command_orb *command_orb = &command->command_orb;
-	struct sbp2_unrestricted_page_table *scatter_gather_element =
-		&command->scatter_gather_element[0];
-	u32 sg_count, sg_len, orb_direction;
-	dma_addr_t sg_addr;
-	int i;
+	u32 orb_direction;
 
 	/*
 	 * Set-up our command ORB..
@@ -1753,186 +1911,29 @@ static int sbp2_create_command_orb(struct scsi_id_instance_data *scsi_id,
 		orb_direction = ORB_DIRECTION_NO_DATA_TRANSFER;
 	}
 
-	/*
-	 * Set-up our pagetable stuff... unfortunately, this has become
-	 * messier than I'd like. Need to clean this up a bit.   ;-)
-	 */
+	/* Set-up our pagetable stuff */
 	if (orb_direction == ORB_DIRECTION_NO_DATA_TRANSFER) {
-
 		SBP2_DEBUG("No data transfer");
-
-		/*
-		 * Handle no data transfer
-		 */
 		command_orb->data_descriptor_hi = 0x0;
 		command_orb->data_descriptor_lo = 0x0;
 		command_orb->misc |= ORB_SET_DIRECTION(1);
-
 	} else if (scsi_use_sg) {
-
 		SBP2_DEBUG("Use scatter/gather");
-
-		/*
-		 * Special case if only one element (and less than 64KB in size)
-		 */
-		if ((scsi_use_sg == 1) && (sgpnt[0].length <= SBP2_MAX_SG_ELEMENT_LENGTH)) {
-
-			SBP2_DEBUG("Only one s/g element");
-			command->dma_dir = dma_dir;
-			command->dma_size = sgpnt[0].length;
-			command->dma_type = CMD_DMA_PAGE;
-			command->cmd_dma = pci_map_page(hi->host->pdev,
-							sgpnt[0].page,
-							sgpnt[0].offset,
-							command->dma_size,
-							command->dma_dir);
-			SBP2_DMA_ALLOC("single page scatter element");
-
-			command_orb->data_descriptor_hi = ORB_SET_NODE_ID(hi->host->node_id);
-			command_orb->data_descriptor_lo = command->cmd_dma;
-			command_orb->misc |= ORB_SET_DATA_SIZE(command->dma_size);
-			command_orb->misc |= ORB_SET_DIRECTION(orb_direction);
-
-		} else {
-			int count = pci_map_sg(hi->host->pdev, sgpnt, scsi_use_sg, dma_dir);
-			SBP2_DMA_ALLOC("scatter list");
-
-			command->dma_size = scsi_use_sg;
-			command->dma_dir = dma_dir;
-			command->sge_buffer = sgpnt;
-
-			/* use page tables (s/g) */
-			command_orb->misc |= ORB_SET_PAGE_TABLE_PRESENT(0x1);
-			command_orb->misc |= ORB_SET_DIRECTION(orb_direction);
-			command_orb->data_descriptor_hi = ORB_SET_NODE_ID(hi->host->node_id);
-			command_orb->data_descriptor_lo = command->sge_dma;
-
-			/*
-			 * Loop through and fill out our sbp-2 page tables
-			 * (and split up anything too large)
-			 */
-			for (i = 0, sg_count = 0 ; i < count; i++, sgpnt++) {
-				sg_len = sg_dma_len(sgpnt);
-				sg_addr = sg_dma_address(sgpnt);
-				while (sg_len) {
-					scatter_gather_element[sg_count].segment_base_lo = sg_addr;
-					if (sg_len > SBP2_MAX_SG_ELEMENT_LENGTH) {
-						scatter_gather_element[sg_count].length_segment_base_hi =
-							PAGE_TABLE_SET_SEGMENT_LENGTH(SBP2_MAX_SG_ELEMENT_LENGTH);
-						sg_addr += SBP2_MAX_SG_ELEMENT_LENGTH;
-						sg_len -= SBP2_MAX_SG_ELEMENT_LENGTH;
-					} else {
-						scatter_gather_element[sg_count].length_segment_base_hi =
-							PAGE_TABLE_SET_SEGMENT_LENGTH(sg_len);
-						sg_len = 0;
-					}
-					sg_count++;
-				}
-			}
-
-			/* Number of page table (s/g) elements */
-			command_orb->misc |= ORB_SET_DATA_SIZE(sg_count);
-
-			sbp2util_packet_dump(scatter_gather_element,
-					     (sizeof(struct sbp2_unrestricted_page_table)) * sg_count,
-					     "sbp2 s/g list", command->sge_dma);
-
-			/*
-			 * Byte swap page tables if necessary
-			 */
-			sbp2util_cpu_to_be32_buffer(scatter_gather_element,
-						    (sizeof(struct sbp2_unrestricted_page_table)) *
-						    sg_count);
-
-		}
-
+		sbp2_prep_command_orb_sg(command_orb, hi, command, scsi_use_sg,
+					 sgpnt, orb_direction, dma_dir);
 	} else {
-
 		SBP2_DEBUG("No scatter/gather");
-
-		command->dma_dir = dma_dir;
-		command->dma_size = scsi_request_bufflen;
-		command->dma_type = CMD_DMA_SINGLE;
-		command->cmd_dma =
-		    pci_map_single(hi->host->pdev, scsi_request_buffer,
-				   command->dma_size, command->dma_dir);
-		SBP2_DMA_ALLOC("single bulk");
-
-		/*
-		 * Handle case where we get a command w/o s/g enabled (but
-		 * check for transfers larger than 64K)
-		 */
-		if (scsi_request_bufflen <= SBP2_MAX_SG_ELEMENT_LENGTH) {
-
-			command_orb->data_descriptor_hi = ORB_SET_NODE_ID(hi->host->node_id);
-			command_orb->data_descriptor_lo = command->cmd_dma;
-			command_orb->misc |= ORB_SET_DATA_SIZE(scsi_request_bufflen);
-			command_orb->misc |= ORB_SET_DIRECTION(orb_direction);
-
-		} else {
-			/*
-			 * Need to turn this into page tables, since the
-			 * buffer is too large.
-			 */
-			command_orb->data_descriptor_hi = ORB_SET_NODE_ID(hi->host->node_id);
-			command_orb->data_descriptor_lo = command->sge_dma;
-
-			/* Use page tables (s/g) */
-			command_orb->misc |= ORB_SET_PAGE_TABLE_PRESENT(0x1);
-			command_orb->misc |= ORB_SET_DIRECTION(orb_direction);
-
-			/*
-			 * fill out our sbp-2 page tables (and split up
-			 * the large buffer)
-			 */
-			sg_count = 0;
-			sg_len = scsi_request_bufflen;
-			sg_addr = command->cmd_dma;
-			while (sg_len) {
-				scatter_gather_element[sg_count].segment_base_lo = sg_addr;
-				if (sg_len > SBP2_MAX_SG_ELEMENT_LENGTH) {
-					scatter_gather_element[sg_count].length_segment_base_hi =
-						PAGE_TABLE_SET_SEGMENT_LENGTH(SBP2_MAX_SG_ELEMENT_LENGTH);
-					sg_addr += SBP2_MAX_SG_ELEMENT_LENGTH;
-					sg_len -= SBP2_MAX_SG_ELEMENT_LENGTH;
-				} else {
-					scatter_gather_element[sg_count].length_segment_base_hi =
-						PAGE_TABLE_SET_SEGMENT_LENGTH(sg_len);
-					sg_len = 0;
-				}
-				sg_count++;
-			}
-
-			/* Number of page table (s/g) elements */
-			command_orb->misc |= ORB_SET_DATA_SIZE(sg_count);
-
-			sbp2util_packet_dump(scatter_gather_element,
-					     (sizeof(struct sbp2_unrestricted_page_table)) * sg_count,
-					     "sbp2 s/g list", command->sge_dma);
-
-			/*
-			 * Byte swap page tables if necessary
-			 */
-			sbp2util_cpu_to_be32_buffer(scatter_gather_element,
-						    (sizeof(struct sbp2_unrestricted_page_table)) *
-						     sg_count);
-
-		}
-
+		sbp2_prep_command_orb_no_sg(command_orb, hi, command, sgpnt,
+					    orb_direction, scsi_request_bufflen,
+					    scsi_request_buffer, dma_dir);
 	}
 
-	/*
-	 * Byte swap command ORB if necessary
-	 */
+	/* Byte swap command ORB if necessary */
 	sbp2util_cpu_to_be32_buffer(command_orb, sizeof(struct sbp2_command_orb));
 
-	/*
-	 * Put our scsi command in the command ORB
-	 */
+	/* Put our scsi command in the command ORB */
 	memset(command_orb->cdb, 0, 12);
 	memcpy(command_orb->cdb, scsi_cmd, COMMAND_SIZE(*scsi_cmd));
-
-	return 0;
 }
 
 /*
diff --git a/drivers/ieee1394/sbp2.h b/drivers/ieee1394/sbp2.h
index 8e227c5..900ea1d 100644
--- a/drivers/ieee1394/sbp2.h
+++ b/drivers/ieee1394/sbp2.h
@@ -410,13 +410,6 @@ static int sbp2_logout_device(struct scsi_id_instance_data *scsi_id);
 static int sbp2_handle_status_write(struct hpsb_host *host, int nodeid, int destid,
 				    quadlet_t *data, u64 addr, size_t length, u16 flags);
 static int sbp2_agent_reset(struct scsi_id_instance_data *scsi_id, int wait);
-static int sbp2_create_command_orb(struct scsi_id_instance_data *scsi_id,
-				   struct sbp2_command_info *command,
-				   unchar *scsi_cmd,
-				   unsigned int scsi_use_sg,
-				   unsigned int scsi_request_bufflen,
-				   void *scsi_request_buffer,
-				   enum dma_data_direction dma_dir);
 static int sbp2_link_orb_command(struct scsi_id_instance_data *scsi_id,
 				 struct sbp2_command_info *command);
 static int sbp2_send_command(struct scsi_id_instance_data *scsi_id,
-- 
cgit v1.1


From eaceec7f6cc5223d0f146086884d67746b8aa81d Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Tue, 13 Dec 2005 11:05:05 -0500
Subject: sbp2: remove duplicate code from sbp2_start_device() Use
 sbp2_remove_device() to free FIFO and ORB DMAs in a failure case.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Jody McIntyre <scjody@modernduck.com>
---
 drivers/ieee1394/sbp2.c | 57 ++++++-------------------------------------------
 1 file changed, 7 insertions(+), 50 deletions(-)

diff --git a/drivers/ieee1394/sbp2.c b/drivers/ieee1394/sbp2.c
index 14b0c35..18d7eda 100644
--- a/drivers/ieee1394/sbp2.c
+++ b/drivers/ieee1394/sbp2.c
@@ -856,56 +856,8 @@ static int sbp2_start_device(struct scsi_id_instance_data *scsi_id)
 		pci_alloc_consistent(hi->host->pdev,
 				     sizeof(struct sbp2_login_orb),
 				     &scsi_id->login_orb_dma);
-	if (!scsi_id->login_orb) {
-alloc_fail:
-		if (scsi_id->query_logins_response) {
-			pci_free_consistent(hi->host->pdev,
-					    sizeof(struct sbp2_query_logins_response),
-					    scsi_id->query_logins_response,
-					    scsi_id->query_logins_response_dma);
-			SBP2_DMA_FREE("query logins response DMA");
-		}
-
-		if (scsi_id->query_logins_orb) {
-			pci_free_consistent(hi->host->pdev,
-					    sizeof(struct sbp2_query_logins_orb),
-					    scsi_id->query_logins_orb,
-					    scsi_id->query_logins_orb_dma);
-			SBP2_DMA_FREE("query logins ORB DMA");
-		}
-
-		if (scsi_id->logout_orb) {
-			pci_free_consistent(hi->host->pdev,
-					    sizeof(struct sbp2_logout_orb),
-					    scsi_id->logout_orb,
-					    scsi_id->logout_orb_dma);
-			SBP2_DMA_FREE("logout ORB DMA");
-		}
-
-		if (scsi_id->reconnect_orb) {
-			pci_free_consistent(hi->host->pdev,
-					    sizeof(struct sbp2_reconnect_orb),
-					    scsi_id->reconnect_orb,
-					    scsi_id->reconnect_orb_dma);
-			SBP2_DMA_FREE("reconnect ORB DMA");
-		}
-
-		if (scsi_id->login_response) {
-			pci_free_consistent(hi->host->pdev,
-					    sizeof(struct sbp2_login_response),
-					    scsi_id->login_response,
-					    scsi_id->login_response_dma);
-			SBP2_DMA_FREE("login FIFO DMA");
-		}
-
-		list_del(&scsi_id->scsi_list);
-
-		kfree(scsi_id);
-
-		SBP2_ERR("Could not allocate memory for scsi_id");
-
-		return -ENOMEM;
-	}
+	if (!scsi_id->login_orb)
+		goto alloc_fail;
 	SBP2_DMA_ALLOC("consistent DMA region for login ORB");
 
 	SBP2_DEBUG("New SBP-2 device inserted, SCSI ID = %x", scsi_id->ud->id);
@@ -966,6 +918,11 @@ alloc_fail:
 	}
 
 	return 0;
+
+alloc_fail:
+	SBP2_ERR("Could not allocate memory for scsi_id");
+	sbp2_remove_device(scsi_id);
+	return -ENOMEM;
 }
 
 /*
-- 
cgit v1.1


From 7063fbf2261194f72ee75afca67b3b38b554b5fa Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 15 Dec 2005 14:29:43 -0800
Subject: [PATCH] configfs: User-driven configuration filesystem

Configfs, a file system for userspace-driven kernel object configuration.
The OCFS2 stack makes extensive use of this for propagation of cluster
configuration information into kernel.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 Documentation/filesystems/00-INDEX                 |    2 +
 Documentation/filesystems/configfs/configfs.txt    |  434 ++++++++
 .../filesystems/configfs/configfs_example.c        |  474 +++++++++
 MAINTAINERS                                        |    5 +
 fs/Kconfig                                         |   14 +
 fs/Makefile                                        |    1 +
 fs/configfs/Makefile                               |    7 +
 fs/configfs/configfs_internal.h                    |  142 +++
 fs/configfs/dir.c                                  | 1102 ++++++++++++++++++++
 fs/configfs/file.c                                 |  360 +++++++
 fs/configfs/inode.c                                |  162 +++
 fs/configfs/item.c                                 |  227 ++++
 fs/configfs/mount.c                                |  159 +++
 fs/configfs/symlink.c                              |  281 +++++
 include/linux/configfs.h                           |  205 ++++
 15 files changed, 3575 insertions(+)
 create mode 100644 Documentation/filesystems/configfs/configfs.txt
 create mode 100644 Documentation/filesystems/configfs/configfs_example.c
 create mode 100644 fs/configfs/Makefile
 create mode 100644 fs/configfs/configfs_internal.h
 create mode 100644 fs/configfs/dir.c
 create mode 100644 fs/configfs/file.c
 create mode 100644 fs/configfs/inode.c
 create mode 100644 fs/configfs/item.c
 create mode 100644 fs/configfs/mount.c
 create mode 100644 fs/configfs/symlink.c
 create mode 100644 include/linux/configfs.h

diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index bcfbab8..628f8a7 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -12,6 +12,8 @@ cifs.txt
 	- description of the CIFS filesystem
 coda.txt
 	- description of the CODA filesystem.
+configfs/
+	- directory containing configfs documentation and example code.
 cramfs.txt
 	- info on the cram filesystem for small storage (ROMs etc)
 devfs/
diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt
new file mode 100644
index 0000000..c4ff96b
--- /dev/null
+++ b/Documentation/filesystems/configfs/configfs.txt
@@ -0,0 +1,434 @@
+
+configfs - Userspace-driven kernel object configuration.
+
+Joel Becker <joel.becker@oracle.com>
+
+Updated: 31 March 2005
+
+Copyright (c) 2005 Oracle Corporation,
+	Joel Becker <joel.becker@oracle.com>
+
+
+[What is configfs?]
+
+configfs is a ram-based filesystem that provides the converse of
+sysfs's functionality.  Where sysfs is a filesystem-based view of
+kernel objects, configfs is a filesystem-based manager of kernel
+objects, or config_items.
+
+With sysfs, an object is created in kernel (for example, when a device
+is discovered) and it is registered with sysfs.  Its attributes then
+appear in sysfs, allowing userspace to read the attributes via
+readdir(3)/read(2).  It may allow some attributes to be modified via
+write(2).  The important point is that the object is created and
+destroyed in kernel, the kernel controls the lifecycle of the sysfs
+representation, and sysfs is merely a window on all this.
+
+A configfs config_item is created via an explicit userspace operation:
+mkdir(2).  It is destroyed via rmdir(2).  The attributes appear at
+mkdir(2) time, and can be read or modified via read(2) and write(2).
+As with sysfs, readdir(3) queries the list of items and/or attributes.
+symlink(2) can be used to group items together.  Unlike sysfs, the
+lifetime of the representation is completely driven by userspace.  The
+kernel modules backing the items must respond to this.
+
+Both sysfs and configfs can and should exist together on the same
+system.  One is not a replacement for the other.
+
+[Using configfs]
+
+configfs can be compiled as a module or into the kernel.  You can access
+it by doing
+
+	mount -t configfs none /config
+
+The configfs tree will be empty unless client modules are also loaded.
+These are modules that register their item types with configfs as
+subsystems.  Once a client subsystem is loaded, it will appear as a
+subdirectory (or more than one) under /config.  Like sysfs, the
+configfs tree is always there, whether mounted on /config or not.
+
+An item is created via mkdir(2).  The item's attributes will also
+appear at this time.  readdir(3) can determine what the attributes are,
+read(2) can query their default values, and write(2) can store new
+values.  Like sysfs, attributes should be ASCII text files, preferably
+with only one value per file.  The same efficiency caveats from sysfs
+apply.  Don't mix more than one attribute in one attribute file.
+
+Like sysfs, configfs expects write(2) to store the entire buffer at
+once.  When writing to configfs attributes, userspace processes should
+first read the entire file, modify the portions they wish to change, and
+then write the entire buffer back.  Attribute files have a maximum size
+of one page (PAGE_SIZE, 4096 on i386).
+
+When an item needs to be destroyed, remove it with rmdir(2).  An
+item cannot be destroyed if any other item has a link to it (via
+symlink(2)).  Links can be removed via unlink(2).
+
+[Configuring FakeNBD: an Example]
+
+Imagine there's a Network Block Device (NBD) driver that allows you to
+access remote block devices.  Call it FakeNBD.  FakeNBD uses configfs
+for its configuration.  Obviously, there will be a nice program that
+sysadmins use to configure FakeNBD, but somehow that program has to tell
+the driver about it.  Here's where configfs comes in.
+
+When the FakeNBD driver is loaded, it registers itself with configfs.
+readdir(3) sees this just fine:
+
+	# ls /config
+	fakenbd
+
+A fakenbd connection can be created with mkdir(2).  The name is
+arbitrary, but likely the tool will make some use of the name.  Perhaps
+it is a uuid or a disk name:
+
+	# mkdir /config/fakenbd/disk1
+	# ls /config/fakenbd/disk1
+	target device rw
+
+The target attribute contains the IP address of the server FakeNBD will
+connect to.  The device attribute is the device on the server.
+Predictably, the rw attribute determines whether the connection is
+read-only or read-write.
+
+	# echo 10.0.0.1 > /config/fakenbd/disk1/target
+	# echo /dev/sda1 > /config/fakenbd/disk1/device
+	# echo 1 > /config/fakenbd/disk1/rw
+
+That's it.  That's all there is.  Now the device is configured, via the
+shell no less.
+
+[Coding With configfs]
+
+Every object in configfs is a config_item.  A config_item reflects an
+object in the subsystem.  It has attributes that match values on that
+object.  configfs handles the filesystem representation of that object
+and its attributes, allowing the subsystem to ignore all but the
+basic show/store interaction.
+
+Items are created and destroyed inside a config_group.  A group is a
+collection of items that share the same attributes and operations.
+Items are created by mkdir(2) and removed by rmdir(2), but configfs
+handles that.  The group has a set of operations to perform these tasks.
+
+A subsystem is the top level of a client module.  During initialization,
+the client module registers the subsystem with configfs, the subsystem
+appears as a directory at the top of the configfs filesystem.  A
+subsystem is also a config_group, and can do everything a config_group
+can.
+
+[struct config_item]
+
+	struct config_item {
+		char                    *ci_name;
+		char                    ci_namebuf[UOBJ_NAME_LEN];
+		struct kref             ci_kref;
+		struct list_head        ci_entry;
+		struct config_item      *ci_parent;
+		struct config_group     *ci_group;
+		struct config_item_type *ci_type;
+		struct dentry           *ci_dentry;
+	};
+
+	void config_item_init(struct config_item *);
+	void config_item_init_type_name(struct config_item *,
+					const char *name,
+					struct config_item_type *type);
+	struct config_item *config_item_get(struct config_item *);
+	void config_item_put(struct config_item *);
+
+Generally, struct config_item is embedded in a container structure, a
+structure that actually represents what the subsystem is doing.  The
+config_item portion of that structure is how the object interacts with
+configfs.
+
+Whether statically defined in a source file or created by a parent
+config_group, a config_item must have one of the _init() functions
+called on it.  This initializes the reference count and sets up the
+appropriate fields.
+
+All users of a config_item should have a reference on it via
+config_item_get(), and drop the reference when they are done via
+config_item_put().
+
+By itself, a config_item cannot do much more than appear in configfs.
+Usually a subsystem wants the item to display and/or store attributes,
+among other things.  For that, it needs a type.
+
+[struct config_item_type]
+
+	struct configfs_item_operations {
+		void (*release)(struct config_item *);
+		ssize_t (*show_attribute)(struct config_item *,
+					  struct configfs_attribute *,
+					  char *);
+		ssize_t (*store_attribute)(struct config_item *,
+					   struct configfs_attribute *,
+					   const char *, size_t);
+		int (*allow_link)(struct config_item *src,
+				  struct config_item *target);
+		int (*drop_link)(struct config_item *src,
+				 struct config_item *target);
+	};
+
+	struct config_item_type {
+		struct module                           *ct_owner;
+		struct configfs_item_operations         *ct_item_ops;
+		struct configfs_group_operations        *ct_group_ops;
+		struct configfs_attribute               **ct_attrs;
+	};
+
+The most basic function of a config_item_type is to define what
+operations can be performed on a config_item.  All items that have been
+allocated dynamically will need to provide the ct_item_ops->release()
+method.  This method is called when the config_item's reference count
+reaches zero.  Items that wish to display an attribute need to provide
+the ct_item_ops->show_attribute() method.  Similarly, storing a new
+attribute value uses the store_attribute() method.
+
+[struct configfs_attribute]
+
+	struct configfs_attribute {
+		char                    *ca_name;
+		struct module           *ca_owner;
+		mode_t                  ca_mode;
+	};
+
+When a config_item wants an attribute to appear as a file in the item's
+configfs directory, it must define a configfs_attribute describing it.
+It then adds the attribute to the NULL-terminated array
+config_item_type->ct_attrs.  When the item appears in configfs, the
+attribute file will appear with the configfs_attribute->ca_name
+filename.  configfs_attribute->ca_mode specifies the file permissions.
+
+If an attribute is readable and the config_item provides a
+ct_item_ops->show_attribute() method, that method will be called
+whenever userspace asks for a read(2) on the attribute.  The converse
+will happen for write(2).
+
+[struct config_group]
+
+A config_item cannot live in a vacuum.  The only way one can be created
+is via mkdir(2) on a config_group.  This will trigger creation of a
+child item.
+
+	struct config_group {
+		struct config_item		cg_item;
+		struct list_head		cg_children;
+		struct configfs_subsystem 	*cg_subsys;
+		struct config_group		**default_groups;
+	};
+
+	void config_group_init(struct config_group *group);
+	void config_group_init_type_name(struct config_group *group,
+					 const char *name,
+					 struct config_item_type *type);
+
+
+The config_group structure contains a config_item.  Properly configuring
+that item means that a group can behave as an item in its own right.
+However, it can do more: it can create child items or groups.  This is
+accomplished via the group operations specified on the group's
+config_item_type.
+
+	struct configfs_group_operations {
+		struct config_item *(*make_item)(struct config_group *group,
+						 const char *name);
+		struct config_group *(*make_group)(struct config_group *group,
+						   const char *name);
+		int (*commit_item)(struct config_item *item);
+		void (*drop_item)(struct config_group *group,
+				  struct config_item *item);
+	};
+
+A group creates child items by providing the
+ct_group_ops->make_item() method.  If provided, this method is called from mkdir(2) in the group's directory.  The subsystem allocates a new
+config_item (or more likely, its container structure), initializes it,
+and returns it to configfs.  Configfs will then populate the filesystem
+tree to reflect the new item.
+
+If the subsystem wants the child to be a group itself, the subsystem
+provides ct_group_ops->make_group().  Everything else behaves the same,
+using the group _init() functions on the group.
+
+Finally, when userspace calls rmdir(2) on the item or group,
+ct_group_ops->drop_item() is called.  As a config_group is also a
+config_item, a separate drop_group() method is not necessary.
+The subsystem must config_item_put() the reference that was initialized
+upon item allocation.  If a subsystem has no work to do, it may omit
+the ct_group_ops->drop_item() method, and configfs will call
+config_item_put() on the item on behalf of the subsystem.
+
+IMPORTANT: drop_item() is void, and as such cannot fail.  When rmdir(2)
+is called, configfs WILL remove the item from the filesystem tree
+(assuming that it has no children to keep it busy).  The subsystem is
+responsible for responding to this.  If the subsystem has references to
+the item in other threads, the memory is safe.  It may take some time
+for the item to actually disappear from the subsystem's usage.  But it
+is gone from configfs.
+
+A config_group cannot be removed while it still has child items.  This
+is implemented in the configfs rmdir(2) code.  ->drop_item() will not be
+called, as the item has not been dropped.  rmdir(2) will fail, as the
+directory is not empty.
+
+[struct configfs_subsystem]
+
+A subsystem must register itself, usually at module_init time.  This
+tells configfs to make the subsystem appear in the file tree.
+
+	struct configfs_subsystem {
+		struct config_group	su_group;
+		struct semaphore	su_sem;
+	};
+
+	int configfs_register_subsystem(struct configfs_subsystem *subsys);
+	void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
+
+	A subsystem consists of a toplevel config_group and a semaphore.
+The group is where child config_items are created.  For a subsystem,
+this group is usually defined statically.  Before calling
+configfs_register_subsystem(), the subsystem must have initialized the
+group via the usual group _init() functions, and it must also have
+initialized the semaphore.
+	When the register call returns, the subsystem is live, and it
+will be visible via configfs.  At that point, mkdir(2) can be called and
+the subsystem must be ready for it.
+
+[An Example]
+
+The best example of these basic concepts is the simple_children
+subsystem/group and the simple_child item in configfs_example.c.  It
+shows a trivial object displaying and storing an attribute, and a simple
+group creating and destroying these children.
+
+[Hierarchy Navigation and the Subsystem Semaphore]
+
+There is an extra bonus that configfs provides.  The config_groups and
+config_items are arranged in a hierarchy due to the fact that they
+appear in a filesystem.  A subsystem is NEVER to touch the filesystem
+parts, but the subsystem might be interested in this hierarchy.  For
+this reason, the hierarchy is mirrored via the config_group->cg_children
+and config_item->ci_parent structure members.
+
+A subsystem can navigate the cg_children list and the ci_parent pointer
+to see the tree created by the subsystem.  This can race with configfs'
+management of the hierarchy, so configfs uses the subsystem semaphore to
+protect modifications.  Whenever a subsystem wants to navigate the
+hierarchy, it must do so under the protection of the subsystem
+semaphore.
+
+A subsystem will be prevented from acquiring the semaphore while a newly
+allocated item has not been linked into this hierarchy.   Similarly, it
+will not be able to acquire the semaphore while a dropping item has not
+yet been unlinked.  This means that an item's ci_parent pointer will
+never be NULL while the item is in configfs, and that an item will only
+be in its parent's cg_children list for the same duration.  This allows
+a subsystem to trust ci_parent and cg_children while they hold the
+semaphore.
+
+[Item Aggregation Via symlink(2)]
+
+configfs provides a simple group via the group->item parent/child
+relationship.  Often, however, a larger environment requires aggregation
+outside of the parent/child connection.  This is implemented via
+symlink(2).
+
+A config_item may provide the ct_item_ops->allow_link() and
+ct_item_ops->drop_link() methods.  If the ->allow_link() method exists,
+symlink(2) may be called with the config_item as the source of the link.
+These links are only allowed between configfs config_items.  Any
+symlink(2) attempt outside the configfs filesystem will be denied.
+
+When symlink(2) is called, the source config_item's ->allow_link()
+method is called with itself and a target item.  If the source item
+allows linking to target item, it returns 0.  A source item may wish to
+reject a link if it only wants links to a certain type of object (say,
+in its own subsystem).
+
+When unlink(2) is called on the symbolic link, the source item is
+notified via the ->drop_link() method.  Like the ->drop_item() method,
+this is a void function and cannot return failure.  The subsystem is
+responsible for responding to the change.
+
+A config_item cannot be removed while it links to any other item, nor
+can it be removed while an item links to it.  Dangling symlinks are not
+allowed in configfs.
+
+[Automatically Created Subgroups]
+
+A new config_group may want to have two types of child config_items.
+While this could be codified by magic names in ->make_item(), it is much
+more explicit to have a method whereby userspace sees this divergence.
+
+Rather than have a group where some items behave differently than
+others, configfs provides a method whereby one or many subgroups are
+automatically created inside the parent at its creation.  Thus,
+mkdir("parent") results in "parent", "parent/subgroup1", up through
+"parent/subgroupN".  Items of type 1 can now be created in
+"parent/subgroup1", and items of type N can be created in
+"parent/subgroupN".
+
+These automatic subgroups, or default groups, do not preclude other
+children of the parent group.  If ct_group_ops->make_group() exists,
+other child groups can be created on the parent group directly.
+
+A configfs subsystem specifies default groups by filling in the
+NULL-terminated array default_groups on the config_group structure.
+Each group in that array is populated in the configfs tree at the same
+time as the parent group.  Similarly, they are removed at the same time
+as the parent.  No extra notification is provided.  When a ->drop_item()
+method call notifies the subsystem the parent group is going away, it
+also means every default group child associated with that parent group.
+
+As a consequence of this, default_groups cannot be removed directly via
+rmdir(2).  They also are not considered when rmdir(2) on the parent
+group is checking for children.
+
+[Committable Items]
+
+NOTE: Committable items are currently unimplemented.
+
+Some config_items cannot have a valid initial state.  That is, no
+default values can be specified for the item's attributes such that the
+item can do its work.  Userspace must configure one or more attributes,
+after which the subsystem can start whatever entity this item
+represents.
+
+Consider the FakeNBD device from above.  Without a target address *and*
+a target device, the subsystem has no idea what block device to import.
+The simple example assumes that the subsystem merely waits until all the
+appropriate attributes are configured, and then connects.  This will,
+indeed, work, but now every attribute store must check if the attributes
+are initialized.  Every attribute store must fire off the connection if
+that condition is met.
+
+Far better would be an explicit action notifying the subsystem that the
+config_item is ready to go.  More importantly, an explicit action allows
+the subsystem to provide feedback as to whether the attributes are
+initialized in a way that makes sense.  configfs provides this as
+committable items.
+
+configfs still uses only normal filesystem operations.  An item is
+committed via rename(2).  The item is moved from a directory where it
+can be modified to a directory where it cannot.
+
+Any group that provides the ct_group_ops->commit_item() method has
+committable items.  When this group appears in configfs, mkdir(2) will
+not work directly in the group.  Instead, the group will have two
+subdirectories: "live" and "pending".  The "live" directory does not
+support mkdir(2) or rmdir(2) either.  It only allows rename(2).  The
+"pending" directory does allow mkdir(2) and rmdir(2).  An item is
+created in the "pending" directory.  Its attributes can be modified at
+will.  Userspace commits the item by renaming it into the "live"
+directory.  At this point, the subsystem receives the ->commit_item()
+callback.  If all required attributes are filled to satisfaction, the
+method returns zero and the item is moved to the "live" directory.
+
+As rmdir(2) does not work in the "live" directory, an item must be
+shut down, or "uncommitted".  Again, this is done via rename(2), this
+time from the "live" directory back to the "pending" one.  The subsystem
+is notified by the ct_group_ops->uncommit_object() method.
+
+
diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c
new file mode 100644
index 0000000..f3c6e49
--- /dev/null
+++ b/Documentation/filesystems/configfs/configfs_example.c
@@ -0,0 +1,474 @@
+/*
+ * vim: noexpandtab ts=8 sts=0 sw=8:
+ *
+ * configfs_example.c - This file is a demonstration module containing
+ *      a number of configfs subsystems.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+
+
+/*
+ * 01-childless
+ *
+ * This first example is a childless subsystem.  It cannot create
+ * any config_items.  It just has attributes.
+ *
+ * Note that we are enclosing the configfs_subsystem inside a container.
+ * This is not necessary if a subsystem has no attributes directly
+ * on the subsystem.  See the next example, 02-simple-children, for
+ * such a subsystem.
+ */
+
+struct childless {
+	struct configfs_subsystem subsys;
+	int showme;
+	int storeme;
+};
+
+struct childless_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct childless *, char *);
+	ssize_t (*store)(struct childless *, const char *, size_t);
+};
+
+static inline struct childless *to_childless(struct config_item *item)
+{
+	return item ? container_of(to_configfs_subsystem(to_config_group(item)), struct childless, subsys) : NULL;
+}
+
+static ssize_t childless_showme_read(struct childless *childless,
+				     char *page)
+{
+	ssize_t pos;
+
+	pos = sprintf(page, "%d\n", childless->showme);
+	childless->showme++;
+
+	return pos;
+}
+
+static ssize_t childless_storeme_read(struct childless *childless,
+				      char *page)
+{
+	return sprintf(page, "%d\n", childless->storeme);
+}
+
+static ssize_t childless_storeme_write(struct childless *childless,
+				       const char *page,
+				       size_t count)
+{
+	unsigned long tmp;
+	char *p = (char *) page;
+
+	tmp = simple_strtoul(p, &p, 10);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp > INT_MAX)
+		return -ERANGE;
+
+	childless->storeme = tmp;
+
+	return count;
+}
+
+static ssize_t childless_description_read(struct childless *childless,
+					  char *page)
+{
+	return sprintf(page,
+"[01-childless]\n"
+"\n"
+"The childless subsystem is the simplest possible subsystem in\n"
+"configfs.  It does not support the creation of child config_items.\n"
+"It only has a few attributes.  In fact, it isn't much different\n"
+"than a directory in /proc.\n");
+}
+
+static struct childless_attribute childless_attr_showme = {
+	.attr	= { .ca_owner = THIS_MODULE, .ca_name = "showme", .ca_mode = S_IRUGO },
+	.show	= childless_showme_read,
+};
+static struct childless_attribute childless_attr_storeme = {
+	.attr	= { .ca_owner = THIS_MODULE, .ca_name = "storeme", .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= childless_storeme_read,
+	.store	= childless_storeme_write,
+};
+static struct childless_attribute childless_attr_description = {
+	.attr = { .ca_owner = THIS_MODULE, .ca_name = "description", .ca_mode = S_IRUGO },
+	.show = childless_description_read,
+};
+
+static struct configfs_attribute *childless_attrs[] = {
+	&childless_attr_showme.attr,
+	&childless_attr_storeme.attr,
+	&childless_attr_description.attr,
+	NULL,
+};
+
+static ssize_t childless_attr_show(struct config_item *item,
+				   struct configfs_attribute *attr,
+				   char *page)
+{
+	struct childless *childless = to_childless(item);
+	struct childless_attribute *childless_attr =
+		container_of(attr, struct childless_attribute, attr);
+	ssize_t ret = 0;
+
+	if (childless_attr->show)
+		ret = childless_attr->show(childless, page);
+	return ret;
+}
+
+static ssize_t childless_attr_store(struct config_item *item,
+				    struct configfs_attribute *attr,
+				    const char *page, size_t count)
+{
+	struct childless *childless = to_childless(item);
+	struct childless_attribute *childless_attr =
+		container_of(attr, struct childless_attribute, attr);
+	ssize_t ret = -EINVAL;
+
+	if (childless_attr->store)
+		ret = childless_attr->store(childless, page, count);
+	return ret;
+}
+
+static struct configfs_item_operations childless_item_ops = {
+	.show_attribute		= childless_attr_show,
+	.store_attribute	= childless_attr_store,
+};
+
+static struct config_item_type childless_type = {
+	.ct_item_ops	= &childless_item_ops,
+	.ct_attrs	= childless_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+static struct childless childless_subsys = {
+	.subsys = {
+		.su_group = {
+			.cg_item = {
+				.ci_namebuf = "01-childless",
+				.ci_type = &childless_type,
+			},
+		},
+	},
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 02-simple-children
+ *
+ * This example merely has a simple one-attribute child.  Note that
+ * there is no extra attribute structure, as the child's attribute is
+ * known from the get-go.  Also, there is no container for the
+ * subsystem, as it has no attributes of its own.
+ */
+
+struct simple_child {
+	struct config_item item;
+	int storeme;
+};
+
+static inline struct simple_child *to_simple_child(struct config_item *item)
+{
+	return item ? container_of(item, struct simple_child, item) : NULL;
+}
+
+static struct configfs_attribute simple_child_attr_storeme = {
+	.ca_owner = THIS_MODULE,
+	.ca_name = "storeme",
+	.ca_mode = S_IRUGO | S_IWUSR,
+};
+
+static struct configfs_attribute *simple_child_attrs[] = {
+	&simple_child_attr_storeme,
+	NULL,
+};
+
+static ssize_t simple_child_attr_show(struct config_item *item,
+				      struct configfs_attribute *attr,
+				      char *page)
+{
+	ssize_t count;
+	struct simple_child *simple_child = to_simple_child(item);
+
+	count = sprintf(page, "%d\n", simple_child->storeme);
+
+	return count;
+}
+
+static ssize_t simple_child_attr_store(struct config_item *item,
+				       struct configfs_attribute *attr,
+				       const char *page, size_t count)
+{
+	struct simple_child *simple_child = to_simple_child(item);
+	unsigned long tmp;
+	char *p = (char *) page;
+
+	tmp = simple_strtoul(p, &p, 10);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp > INT_MAX)
+		return -ERANGE;
+
+	simple_child->storeme = tmp;
+
+	return count;
+}
+
+static void simple_child_release(struct config_item *item)
+{
+	kfree(to_simple_child(item));
+}
+
+static struct configfs_item_operations simple_child_item_ops = {
+	.release		= simple_child_release,
+	.show_attribute		= simple_child_attr_show,
+	.store_attribute	= simple_child_attr_store,
+};
+
+static struct config_item_type simple_child_type = {
+	.ct_item_ops	= &simple_child_item_ops,
+	.ct_attrs	= simple_child_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+
+static struct config_item *simple_children_make_item(struct config_group *group, const char *name)
+{
+	struct simple_child *simple_child;
+
+	simple_child = kmalloc(sizeof(struct simple_child), GFP_KERNEL);
+	if (!simple_child)
+		return NULL;
+
+	memset(simple_child, 0, sizeof(struct simple_child));
+
+	config_item_init_type_name(&simple_child->item, name,
+				   &simple_child_type);
+
+	simple_child->storeme = 0;
+
+	return &simple_child->item;
+}
+
+static struct configfs_attribute simple_children_attr_description = {
+	.ca_owner = THIS_MODULE,
+	.ca_name = "description",
+	.ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *simple_children_attrs[] = {
+	&simple_children_attr_description,
+	NULL,
+};
+
+static ssize_t simple_children_attr_show(struct config_item *item,
+			   		 struct configfs_attribute *attr,
+			   		 char *page)
+{
+	return sprintf(page,
+"[02-simple-children]\n"
+"\n"
+"This subsystem allows the creation of child config_items.  These\n"
+"items have only one attribute that is readable and writeable.\n");
+}
+
+static struct configfs_item_operations simple_children_item_ops = {
+	.show_attribute	= simple_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations simple_children_group_ops = {
+	.make_item	= simple_children_make_item,
+};
+
+static struct config_item_type simple_children_type = {
+	.ct_item_ops	= &simple_children_item_ops,
+	.ct_group_ops	= &simple_children_group_ops,
+	.ct_attrs	= simple_children_attrs,
+};
+
+static struct configfs_subsystem simple_children_subsys = {
+	.su_group = {
+		.cg_item = {
+			.ci_namebuf = "02-simple-children",
+			.ci_type = &simple_children_type,
+		},
+	},
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 03-group-children
+ *
+ * This example reuses the simple_children group from above.  However,
+ * the simple_children group is not the subsystem itself, it is a
+ * child of the subsystem.  Creation of a group in the subsystem creates
+ * a new simple_children group.  That group can then have simple_child
+ * children of its own.
+ */
+
+struct simple_children {
+	struct config_group group;
+};
+
+static struct config_group *group_children_make_group(struct config_group *group, const char *name)
+{
+	struct simple_children *simple_children;
+
+	simple_children = kmalloc(sizeof(struct simple_children),
+				  GFP_KERNEL);
+	if (!simple_children)
+		return NULL;
+
+	memset(simple_children, 0, sizeof(struct simple_children));
+
+	config_group_init_type_name(&simple_children->group, name,
+				    &simple_children_type);
+
+	return &simple_children->group;
+}
+
+static struct configfs_attribute group_children_attr_description = {
+	.ca_owner = THIS_MODULE,
+	.ca_name = "description",
+	.ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *group_children_attrs[] = {
+	&group_children_attr_description,
+	NULL,
+};
+
+static ssize_t group_children_attr_show(struct config_item *item,
+			   		struct configfs_attribute *attr,
+			   		char *page)
+{
+	return sprintf(page,
+"[03-group-children]\n"
+"\n"
+"This subsystem allows the creation of child config_groups.  These\n"
+"groups are like the subsystem simple-children.\n");
+}
+
+static struct configfs_item_operations group_children_item_ops = {
+	.show_attribute	= group_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations group_children_group_ops = {
+	.make_group	= group_children_make_group,
+};
+
+static struct config_item_type group_children_type = {
+	.ct_item_ops	= &group_children_item_ops,
+	.ct_group_ops	= &group_children_group_ops,
+	.ct_attrs	= group_children_attrs,
+};
+
+static struct configfs_subsystem group_children_subsys = {
+	.su_group = {
+		.cg_item = {
+			.ci_namebuf = "03-group-children",
+			.ci_type = &group_children_type,
+		},
+	},
+};
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * We're now done with our subsystem definitions.
+ * For convenience in this module, here's a list of them all.  It
+ * allows the init function to easily register them.  Most modules
+ * will only have one subsystem, and will only call register_subsystem
+ * on it directly.
+ */
+static struct configfs_subsystem *example_subsys[] = {
+	&childless_subsys.subsys,
+	&simple_children_subsys,
+	&group_children_subsys,
+	NULL,
+};
+
+/* Register all example subsystems; on failure, undo the earlier ones. */
+static int __init configfs_example_init(void)
+{
+	int ret;
+	int i;
+	struct configfs_subsystem *subsys;
+
+	for (i = 0; example_subsys[i]; i++) {
+		subsys = example_subsys[i];
+
+		config_group_init(&subsys->su_group);
+		init_MUTEX(&subsys->su_sem);
+		ret = configfs_register_subsystem(subsys);
+		if (ret) {
+			printk(KERN_ERR "Error %d while registering subsystem %s\n",
+			       ret, subsys->su_group.cg_item.ci_namebuf);
+			goto out_unregister;
+		}
+	}
+
+	return 0;
+
+out_unregister:
+	/* example_subsys[i] failed to register; unwind only 0..i-1. */
+	for (i--; i >= 0; i--)
+		configfs_unregister_subsystem(example_subsys[i]);
+
+	return ret;
+}
+
+static void __exit configfs_example_exit(void)
+{
+	int i;
+
+	for (i = 0; example_subsys[i]; i++) {
+		configfs_unregister_subsystem(example_subsys[i]);
+	}
+}
+
+module_init(configfs_example_init);
+module_exit(configfs_example_exit);
+MODULE_LICENSE("GPL");
diff --git a/MAINTAINERS b/MAINTAINERS
index 6af6830..86ee06f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -554,6 +554,11 @@ W:	http://us1.samba.org/samba/Linux_CIFS_client.html
 T:	git kernel.org:/pub/scm/linux/kernel/git/sfrench/cifs-2.6.git
 S:	Supported	
 
+CONFIGFS
+P:	Joel Becker
+M:	Joel Becker <joel.becker@oracle.com>
+S:	Supported
+
 CIRRUS LOGIC GENERIC FBDEV DRIVER
 P:	Jeff Garzik
 M:	jgarzik@pobox.com
diff --git a/fs/Kconfig b/fs/Kconfig
index d5255e6..ba1dbe2 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -841,6 +841,20 @@ config RELAYFS_FS
 
 	  If unsure, say N.
 
+config CONFIGFS_FS
+	tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  configfs is a ram-based filesystem that provides the converse
+	  of sysfs's functionality. Where sysfs is a filesystem-based
+	  view of kernel objects, configfs is a filesystem-based manager
+	  of kernel objects, or config_items.
+
+	  Both sysfs and configfs can and should exist together on the
+	  same system. One is not a replacement for the other.
+
+	  If unsure, say N.
+
 endmenu
 
 menu "Miscellaneous filesystems"
diff --git a/fs/Makefile b/fs/Makefile
index 4c26557..ff3d48a 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -101,3 +101,4 @@ obj-$(CONFIG_BEFS_FS)		+= befs/
 obj-$(CONFIG_HOSTFS)		+= hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
+obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
diff --git a/fs/configfs/Makefile b/fs/configfs/Makefile
new file mode 100644
index 0000000..00ffb27
--- /dev/null
+++ b/fs/configfs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the configfs virtual filesystem
+#
+
+obj-$(CONFIG_CONFIGFS_FS)	+= configfs.o
+
+configfs-objs	:= inode.o file.o dir.o symlink.o mount.o item.o
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
new file mode 100644
index 0000000..8899d9c
--- /dev/null
+++ b/fs/configfs/configfs_internal.h
@@ -0,0 +1,142 @@
+/* -*- mode: c; c-basic-offset:8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * configfs_internal.h - Internal stuff for configfs
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/slab.h>
+#include <linux/list.h>
+
+struct configfs_dirent {
+	atomic_t		s_count;
+	struct list_head	s_sibling;
+	struct list_head	s_children;
+	struct list_head	s_links;
+	void 			* s_element;
+	int			s_type;
+	umode_t			s_mode;
+	struct dentry		* s_dentry;
+};
+
+#define CONFIGFS_ROOT		0x0001
+#define CONFIGFS_DIR		0x0002
+#define CONFIGFS_ITEM_ATTR 	0x0004
+#define CONFIGFS_ITEM_LINK 	0x0020
+#define CONFIGFS_USET_DIR	0x0040
+#define CONFIGFS_USET_DEFAULT	0x0080
+#define CONFIGFS_USET_DROPPING	0x0100
+#define CONFIGFS_NOT_PINNED	(CONFIGFS_ITEM_ATTR)
+
+extern struct vfsmount * configfs_mount;
+
+extern int configfs_is_root(struct config_item *item);
+
+extern struct inode * configfs_new_inode(mode_t mode);
+extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *));
+
+extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
+extern int configfs_make_dirent(struct configfs_dirent *,
+				struct dentry *, void *, umode_t, int);
+
+extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int);
+extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
+
+extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
+extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent);
+
+extern int configfs_pin_fs(void);
+extern void configfs_release_fs(void);
+
+extern struct rw_semaphore configfs_rename_sem;
+extern struct super_block * configfs_sb;
+extern struct file_operations configfs_dir_operations;
+extern struct file_operations configfs_file_operations;
+extern struct file_operations bin_fops;
+extern struct inode_operations configfs_dir_inode_operations;
+extern struct inode_operations configfs_symlink_inode_operations;
+
+extern int configfs_symlink(struct inode *dir, struct dentry *dentry,
+			    const char *symname);
+extern int configfs_unlink(struct inode *dir, struct dentry *dentry);
+
+struct configfs_symlink {
+	struct list_head sl_list;
+	struct config_item *sl_target;
+};
+
+extern int configfs_create_link(struct configfs_symlink *sl,
+				struct dentry *parent,
+				struct dentry *dentry);
+
+static inline struct config_item * to_item(struct dentry * dentry)
+{
+	struct configfs_dirent * sd = dentry->d_fsdata;
+	return ((struct config_item *) sd->s_element);
+}
+
+static inline struct configfs_attribute * to_attr(struct dentry * dentry)
+{
+	struct configfs_dirent * sd = dentry->d_fsdata;
+	return ((struct configfs_attribute *) sd->s_element);
+}
+
+static inline struct config_item *configfs_get_config_item(struct dentry *dentry)
+{
+	struct config_item * item = NULL;
+
+	spin_lock(&dcache_lock);
+	if (!d_unhashed(dentry)) {
+		struct configfs_dirent * sd = dentry->d_fsdata;
+		if (sd->s_type & CONFIGFS_ITEM_LINK) {
+			struct configfs_symlink * sl = sd->s_element;
+			item = config_item_get(sl->sl_target);
+		} else
+			item = config_item_get(sd->s_element);
+	}
+	spin_unlock(&dcache_lock);
+
+	return item;
+}
+
+static inline void release_configfs_dirent(struct configfs_dirent * sd)
+{
+	if (!(sd->s_type & CONFIGFS_ROOT))
+		kfree(sd);
+}
+
+static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd)
+{
+	if (sd) {
+		WARN_ON(!atomic_read(&sd->s_count));
+		atomic_inc(&sd->s_count);
+	}
+	return sd;
+}
+
+static inline void configfs_put(struct configfs_dirent * sd)
+{
+	WARN_ON(!atomic_read(&sd->s_count));
+	if (atomic_dec_and_test(&sd->s_count))
+		release_configfs_dirent(sd);
+}
+
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
new file mode 100644
index 0000000..e48b539
--- /dev/null
+++ b/fs/configfs/dir.c
@@ -0,0 +1,1102 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dir.c - Operations for configfs directories.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#undef DEBUG
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+DECLARE_RWSEM(configfs_rename_sem);
+
+static void configfs_d_iput(struct dentry * dentry,
+			    struct inode * inode)
+{
+	struct configfs_dirent * sd = dentry->d_fsdata;
+
+	if (sd) {
+		BUG_ON(sd->s_dentry != dentry);
+		sd->s_dentry = NULL;
+		configfs_put(sd);
+	}
+	iput(inode);
+}
+
+/*
+ * We _must_ delete our dentries on last dput, as the chain-to-parent
+ * behavior is required to clear the parents of default_groups.
+ */
+static int configfs_d_delete(struct dentry *dentry)
+{
+	return 1;
+}
+
+static struct dentry_operations configfs_dentry_ops = {
+	.d_iput		= configfs_d_iput,
+	/* simple_delete_dentry() isn't exported */
+	.d_delete	= configfs_d_delete,
+};
+
+/*
+ * Allocates a new configfs_dirent and links it to the parent configfs_dirent
+ */
+static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * parent_sd,
+						void * element)
+{
+	struct configfs_dirent * sd;
+
+	sd = kmalloc(sizeof(*sd), GFP_KERNEL);
+	if (!sd)
+		return NULL;
+
+	memset(sd, 0, sizeof(*sd));
+	atomic_set(&sd->s_count, 1);
+	INIT_LIST_HEAD(&sd->s_links);
+	INIT_LIST_HEAD(&sd->s_children);
+	list_add(&sd->s_sibling, &parent_sd->s_children);
+	sd->s_element = element;
+
+	return sd;
+}
+
+int configfs_make_dirent(struct configfs_dirent * parent_sd,
+			 struct dentry * dentry, void * element,
+			 umode_t mode, int type)
+{
+	struct configfs_dirent * sd;
+
+	sd = configfs_new_dirent(parent_sd, element);
+	if (!sd)
+		return -ENOMEM;
+
+	sd->s_mode = mode;
+	sd->s_type = type;
+	sd->s_dentry = dentry;
+	if (dentry) {
+		dentry->d_fsdata = configfs_get(sd);
+		dentry->d_op = &configfs_dentry_ops;
+	}
+
+	return 0;
+}
+
+static int init_dir(struct inode * inode)
+{
+	inode->i_op = &configfs_dir_inode_operations;
+	inode->i_fop = &configfs_dir_operations;
+
+	/* directory inodes start off with i_nlink == 2 (for "." entry) */
+	inode->i_nlink++;
+	return 0;
+}
+
+static int init_file(struct inode * inode)
+{
+	inode->i_size = PAGE_SIZE;
+	inode->i_fop = &configfs_file_operations;
+	return 0;
+}
+
+static int init_symlink(struct inode * inode)
+{
+	inode->i_op = &configfs_symlink_inode_operations;
+	return 0;
+}
+
+static int create_dir(struct config_item * k, struct dentry * p,
+		      struct dentry * d)
+{
+	int error;
+	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
+
+	error = configfs_create(d, mode, init_dir);
+	if (!error) {
+		error = configfs_make_dirent(p->d_fsdata, d, k, mode,
+					   CONFIGFS_DIR);
+		if (!error) {
+			p->d_inode->i_nlink++;
+			(d)->d_op = &configfs_dentry_ops;
+		}
+	}
+	return error;
+}
+
+
+/**
+ *	configfs_create_dir - create a directory for a config_item.
+ *	@item:		config_item we're creating a directory for.
+ *	@dentry:	config_item's dentry.
+ */
+
+static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
+{
+	struct dentry * parent;
+	int error = 0;
+
+	BUG_ON(!item);
+
+	if (item->ci_parent)
+		parent = item->ci_parent->ci_dentry;
+	else if (configfs_mount && configfs_mount->mnt_sb)
+		parent = configfs_mount->mnt_sb->s_root;
+	else
+		return -EFAULT;
+
+	error = create_dir(item,parent,dentry);
+	if (!error)
+		item->ci_dentry = dentry;
+	return error;
+}
+
+int configfs_create_link(struct configfs_symlink *sl,
+			 struct dentry *parent,
+			 struct dentry *dentry)
+{
+	int err = 0;
+	umode_t mode = S_IFLNK | S_IRWXUGO;
+
+	err = configfs_create(dentry, mode, init_symlink);
+	if (!err) {
+		err = configfs_make_dirent(parent->d_fsdata, dentry, sl,
+					 mode, CONFIGFS_ITEM_LINK);
+		if (!err)
+			dentry->d_op = &configfs_dentry_ops;
+	}
+	return err;
+}
+
+static void remove_dir(struct dentry * d)
+{
+	struct dentry * parent = dget(d->d_parent);
+	struct configfs_dirent * sd;
+
+	sd = d->d_fsdata;
+ 	list_del_init(&sd->s_sibling);
+	configfs_put(sd);
+	if (d->d_inode)
+		simple_rmdir(parent->d_inode,d);
+
+	pr_debug(" o %s removing done (%d)\n",d->d_name.name,
+		 atomic_read(&d->d_count));
+
+	dput(parent);
+}
+
+/**
+ * configfs_remove_dir - remove a config_item's directory.
+ * @item:	config_item we're removing.
+ *
+ * The only thing special about this is that we remove any files in
+ * the directory before we remove the directory, and we've inlined
+ * what used to be configfs_rmdir() below, instead of calling separately.
+ */
+
+static void configfs_remove_dir(struct config_item * item)
+{
+	struct dentry * dentry = dget(item->ci_dentry);
+
+	if (!dentry)
+		return;
+
+	remove_dir(dentry);
+	/**
+	 * Drop reference from dget() on entrance.
+	 */
+	dput(dentry);
+}
+
+
+/* attaches attribute's configfs_dirent to the dentry corresponding to the
+ * attribute file
+ */
+static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry)
+{
+	struct configfs_attribute * attr = sd->s_element;
+	int error;
+
+	error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file);
+	if (error)
+		return error;
+
+	dentry->d_op = &configfs_dentry_ops;
+	dentry->d_fsdata = configfs_get(sd);
+	sd->s_dentry = dentry;
+	d_rehash(dentry);
+
+	return 0;
+}
+
+static struct dentry * configfs_lookup(struct inode *dir,
+				       struct dentry *dentry,
+				       struct nameidata *nd)
+{
+	struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
+	struct configfs_dirent * sd;
+	int found = 0;
+	int err = 0;
+
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (sd->s_type & CONFIGFS_NOT_PINNED) {
+			const unsigned char * name = configfs_get_name(sd);
+
+			if (strcmp(name, dentry->d_name.name))
+				continue;
+
+			found = 1;
+			err = configfs_attach_attr(sd, dentry);
+			break;
+		}
+	}
+
+	if (!found) {
+		/*
+		 * If it doesn't exist and it isn't a NOT_PINNED item,
+		 * it must be negative.
+		 */
+		return simple_lookup(dir, dentry, nd);
+	}
+
+	return ERR_PTR(err);
+}
+
+/*
+ * Only subdirectories count here.  Files (CONFIGFS_NOT_PINNED) are
+ * attributes and are removed by rmdir().  We recurse, taking i_sem
+ * on all children that are candidates for default detach.  If the
+ * result is clean, then configfs_detach_group() will handle dropping
+ * i_sem.  If there is an error, the caller will clean up the i_sem
+ * holders via configfs_detach_rollback().
+ */
+static int configfs_detach_prep(struct dentry *dentry)
+{
+	struct configfs_dirent *parent_sd = dentry->d_fsdata;
+	struct configfs_dirent *sd;
+	int ret;
+
+	ret = -EBUSY;
+	if (!list_empty(&parent_sd->s_links))
+		goto out;
+
+	ret = 0;
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (sd->s_type & CONFIGFS_NOT_PINNED)
+			continue;
+		if (sd->s_type & CONFIGFS_USET_DEFAULT) {
+			down(&sd->s_dentry->d_inode->i_sem);
+			/* Mark that we've taken i_sem */
+			sd->s_type |= CONFIGFS_USET_DROPPING;
+
+			ret = configfs_detach_prep(sd->s_dentry);
+			if (!ret)
+			       	continue;
+		} else
+			ret = -ENOTEMPTY;
+
+		break;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Walk the tree, dropping i_sem wherever CONFIGFS_USET_DROPPING is
+ * set.
+ */
+static void configfs_detach_rollback(struct dentry *dentry)
+{
+	struct configfs_dirent *parent_sd = dentry->d_fsdata;
+	struct configfs_dirent *sd;
+
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (sd->s_type & CONFIGFS_USET_DEFAULT) {
+			configfs_detach_rollback(sd->s_dentry);
+
+			if (sd->s_type & CONFIGFS_USET_DROPPING) {
+				sd->s_type &= ~CONFIGFS_USET_DROPPING;
+				up(&sd->s_dentry->d_inode->i_sem);
+			}
+		}
+	}
+}
+
+static void detach_attrs(struct config_item * item)
+{
+	struct dentry * dentry = dget(item->ci_dentry);
+	struct configfs_dirent * parent_sd;
+	struct configfs_dirent * sd, * tmp;
+
+	if (!dentry)
+		return;
+
+	pr_debug("configfs %s: dropping attrs for  dir\n",
+		 dentry->d_name.name);
+
+	parent_sd = dentry->d_fsdata;
+	list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
+		if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED))
+			continue;
+		list_del_init(&sd->s_sibling);
+		configfs_drop_dentry(sd, dentry);
+		configfs_put(sd);
+	}
+
+	/**
+	 * Drop reference from dget() on entrance.
+	 */
+	dput(dentry);
+}
+
+static int populate_attrs(struct config_item *item)
+{
+	struct config_item_type *t = item->ci_type;
+	struct configfs_attribute *attr;
+	int error = 0;
+	int i;
+
+	if (!t)
+		return -EINVAL;
+	if (t->ct_attrs) {
+		for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) {
+			if ((error = configfs_create_file(item, attr)))
+				break;
+		}
+	}
+
+	if (error)
+		detach_attrs(item);
+
+	return error;
+}
+
+static int configfs_attach_group(struct config_item *parent_item,
+				 struct config_item *item,
+				 struct dentry *dentry);
+static void configfs_detach_group(struct config_item *item);
+
+static void detach_groups(struct config_group *group)
+{
+	struct dentry * dentry = dget(group->cg_item.ci_dentry);
+	struct dentry *child;
+	struct configfs_dirent *parent_sd;
+	struct configfs_dirent *sd, *tmp;
+
+	if (!dentry)
+		return;
+
+	parent_sd = dentry->d_fsdata;
+	list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
+		if (!sd->s_element ||
+		    !(sd->s_type & CONFIGFS_USET_DEFAULT))
+			continue;
+
+		child = sd->s_dentry;
+
+		configfs_detach_group(sd->s_element);
+		child->d_inode->i_flags |= S_DEAD;
+
+		/*
+		 * From rmdir/unregister, a configfs_detach_prep() pass
+		 * has taken our i_sem for us.  Drop it.
+		 * From mkdir/register cleanup, there is no sem held.
+		 */
+		if (sd->s_type & CONFIGFS_USET_DROPPING)
+			up(&child->d_inode->i_sem);
+
+		d_delete(child);
+		dput(child);
+	}
+
+	/**
+	 * Drop reference from dget() on entrance.
+	 */
+	dput(dentry);
+}
+
+/*
+ * This fakes mkdir(2) on a default_groups[] entry.  It
+ * creates a dentry, attaches it, and then does fixup
+ * on the sd->s_type.
+ *
+ * We could, perhaps, tweak our parent's ->mkdir for a minute and
+ * try using vfs_mkdir.  Just a thought.
+ */
+static int create_default_group(struct config_group *parent_group,
+				struct config_group *group)
+{
+	int ret;
+	struct qstr name;
+	struct configfs_dirent *sd;
+	/* We trust the caller holds a reference to parent */
+	struct dentry *child, *parent = parent_group->cg_item.ci_dentry;
+
+	if (!group->cg_item.ci_name)
+		group->cg_item.ci_name = group->cg_item.ci_namebuf;
+	name.name = group->cg_item.ci_name;
+	name.len = strlen(name.name);
+	name.hash = full_name_hash(name.name, name.len);
+
+	ret = -ENOMEM;
+	child = d_alloc(parent, &name);
+	if (child) {
+		d_add(child, NULL);
+
+		ret = configfs_attach_group(&parent_group->cg_item,
+					    &group->cg_item, child);
+		if (!ret) {
+			sd = child->d_fsdata;
+			sd->s_type |= CONFIGFS_USET_DEFAULT;
+		} else {
+			d_delete(child);
+			dput(child);
+		}
+	}
+
+	return ret;
+}
+
+static int populate_groups(struct config_group *group)
+{
+	struct config_group *new_group;
+	struct dentry *dentry = group->cg_item.ci_dentry;
+	int ret = 0;
+	int i;
+	/* NOTE(review): group is dereferenced above, before the NULL test below. */
+	if (group && group->default_groups) {
+		/* FYI, we're faking mkdir here
+		 * I'm not sure we need this semaphore, as we're called
+		 * from our parent's mkdir.  That holds our parent's
+		 * i_sem, so afaik lookup cannot continue through our
+		 * parent to find us, let alone mess with our tree.
+		 * That said, taking our i_sem is closer to mkdir
+		 * emulation, and shouldn't hurt. */
+		down(&dentry->d_inode->i_sem);
+
+		for (i = 0; group->default_groups[i]; i++) {	/* NULL-terminated array */
+			new_group = group->default_groups[i];
+
+			ret = create_default_group(group, new_group);
+			if (ret)
+				break;	/* stop at the first failure */
+		}
+
+		up(&dentry->d_inode->i_sem);
+	}
+
+	if (ret)
+		detach_groups(group);	/* on failure, hand the group to detach_groups() */
+
+	return ret;
+}
+
+/*
+ * All of link_obj/unlink_obj/link_group/unlink_group require that
+ * subsys->su_sem is held.
+ */
+
+static void unlink_obj(struct config_item *item)
+{
+	struct config_group *owner = item->ci_group;
+
+	/* Not linked into any group: nothing to undo. */
+	if (!owner)
+		return;
+
+	/* Leave the group's child list and forget both back-pointers. */
+	list_del_init(&item->ci_entry);
+	item->ci_parent = NULL;
+	item->ci_group = NULL;
+	config_item_put(item);		/* pairs with config_item_get() in link_obj() */
+	config_group_put(owner);	/* pairs with config_group_get() in link_obj() */
+}
+
+static void link_obj(struct config_item *parent_item, struct config_item *item)
+{
+	struct config_group *owner = to_config_group(parent_item);
+
+	/* ci_parent duplicates ci_group, but keeps upward walks simple. */
+	item->ci_group = config_group_get(owner);	/* reference on the parent group */
+	item->ci_parent = parent_item;
+	config_item_get(item);		/* reference held for as long as we are linked */
+	list_add_tail(&item->ci_entry, &item->ci_group->cg_children);
+}
+
+static void unlink_group(struct config_group *group)
+{
+	struct config_group **child = group->default_groups;
+
+	/*
+	 * Recurse depth-first: every default group below us is unlinked
+	 * before this group itself lets go of its parent.  The
+	 * default_groups[] array, when present, is NULL-terminated.
+	 */
+	for (; child && *child; child++)
+		unlink_group(*child);
+
+	group->cg_subsys = NULL;
+	unlink_obj(&group->cg_item);
+}
+
+static void link_group(struct config_group *parent_group, struct config_group *group)
+{
+	int i;
+	struct config_group *new_group;
+	struct configfs_subsystem *subsys = NULL; /* gcc is a turd */
+	/* Link this group under its parent, then recurse into its default groups. */
+	link_obj(&parent_group->cg_item, &group->cg_item);
+
+	if (parent_group->cg_subsys)
+		subsys = parent_group->cg_subsys;	/* inherit the parent's subsystem */
+	else if (configfs_is_root(&parent_group->cg_item))
+		subsys = to_configfs_subsystem(group);	/* parent is the root: group is the subsystem */
+	else
+		BUG();	/* a non-root parent without a subsystem is a bug */
+	group->cg_subsys = subsys;
+
+	if (group->default_groups) {
+		for (i = 0; group->default_groups[i]; i++) {	/* NULL-terminated array */
+			new_group = group->default_groups[i];
+			link_group(group, new_group);
+		}
+	}
+}
+
+/*
+ * The goal is that configfs_attach_item() (and
+ * configfs_attach_group()) can be called from either the VFS or this
+ * module.  That is, they assume that the items have been created,
+ * the dentry allocated, and the dcache is all ready to go.
+ *
+ * If they fail, they must clean up after themselves as if they
+ * had never been called.  The caller (VFS or local function) will
+ * handle cleaning up the dcache bits.
+ *
+ * configfs_detach_group() and configfs_detach_item() behave similarly on
+ * the way out.  They assume that the proper semaphores are held, they
+ * clean up the configfs items, and they expect their callers will
+ * handle the dcache bits.
+ */
+static int configfs_attach_item(struct config_item *parent_item,
+				struct config_item *item,
+				struct dentry *dentry)
+{
+	int err = configfs_create_dir(item, dentry);
+
+	if (err)
+		return err;	/* directory creation failed */
+	err = populate_attrs(item);
+	if (!err)
+		return 0;
+
+	/* Attribute setup failed: take the new directory down again. */
+	configfs_remove_dir(item);
+	d_delete(dentry);
+	return err;
+}
+
+static void configfs_detach_item(struct config_item *item)
+{
+	detach_attrs(item);	/* teardown runs in the opposite order of configfs_attach_item() */
+	configfs_remove_dir(item);
+}
+
+static int configfs_attach_group(struct config_item *parent_item,
+				 struct config_item *item,
+				 struct dentry *dentry)
+{
+	int ret;
+	struct configfs_dirent *sd;
+	/* A group is attached as an item first, then gets its default groups. */
+	ret = configfs_attach_item(parent_item, item, dentry);
+	if (!ret) {
+		sd = dentry->d_fsdata;
+		sd->s_type |= CONFIGFS_USET_DIR;	/* configfs_mkdir() only accepts parents with this flag */
+
+		ret = populate_groups(to_config_group(item));
+		if (ret) {
+			configfs_detach_item(item);	/* undo the item attach from above */
+			d_delete(dentry);
+		}
+	}
+
+	return ret;
+}
+
+static void configfs_detach_group(struct config_item *item)
+{
+	detach_groups(to_config_group(item));	/* groups below us first ... */
+	configfs_detach_item(item);		/* ... then the group's own item */
+}
+
+/*
+ * Drop the initial reference from make_item()/make_group()
+ * This function assumes that reference is held on item
+ * and that item holds a valid reference to the parent.  Also, it
+ * assumes the caller has validated ci_type.
+ */
+static void client_drop_item(struct config_item *parent_item,
+			     struct config_item *item)
+{
+	struct config_item_type *ptype = parent_item->ci_type;
+
+	BUG_ON(!ptype);
+
+	/* No ->drop_item() hook supplied: release the reference ourselves. */
+	if (!ptype->ct_group_ops || !ptype->ct_group_ops->drop_item) {
+		config_item_put(item);
+		return;
+	}
+	ptype->ct_group_ops->drop_item(to_config_group(parent_item), item);
+}
+
+
+static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	int ret;
+	struct config_group *group;
+	struct config_item *item;
+	struct config_item *parent_item;
+	struct configfs_subsystem *subsys;
+	struct configfs_dirent *sd;
+	struct config_item_type *type;
+	struct module *owner = NULL;	/* non-NULL only while we hold its reference */
+	char *name;
+
+	if (dentry->d_parent == configfs_sb->s_root)
+		return -EPERM;
+
+	sd = dentry->d_parent->d_fsdata;
+	if (!(sd->s_type & CONFIGFS_USET_DIR))
+		return -EPERM;
+
+	/* Working reference on the parent; every later exit drops it at out_put. */
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	type = parent_item->ci_type;
+	subsys = to_config_group(parent_item)->cg_subsys;
+	BUG_ON(!subsys);
+
+	if (!type || !type->ct_group_ops ||
+	    (!type->ct_group_ops->make_group &&
+	     !type->ct_group_ops->make_item)) {
+		ret = -EPERM;  /* What lack-of-mkdir returns */
+		goto out_put;
+	}
+
+	name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL);
+	if (!name) {
+		ret = -ENOMEM;
+		goto out_put;
+	}
+	snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
+
+	down(&subsys->su_sem);
+	group = NULL;
+	item = NULL;
+	if (type->ct_group_ops->make_group) {
+		group = type->ct_group_ops->make_group(to_config_group(parent_item), name);
+		if (group) {
+			link_group(to_config_group(parent_item), group);
+			item = &group->cg_item;
+		}
+	} else {
+		item = type->ct_group_ops->make_item(to_config_group(parent_item), name);
+		if (item)
+			link_obj(parent_item, item);
+	}
+	up(&subsys->su_sem);
+
+	kfree(name);
+	if (!item) {
+		ret = -ENOMEM;
+		goto out_put;
+	}
+
+	ret = -EINVAL;
+	type = item->ci_type;
+	if (type && try_module_get(type->ct_owner)) {
+		owner = type->ct_owner;
+		if (group)
+			ret = configfs_attach_group(parent_item, item, dentry);
+		else
+			ret = configfs_attach_item(parent_item, item, dentry);
+	}
+
+	if (ret) {
+		/*
+		 * Missing type, unavailable module or failed attach: undo
+		 * the linkage built above and hand the item back.
+		 */
+		down(&subsys->su_sem);
+		if (group)
+			unlink_group(group);
+		else
+			unlink_obj(item);
+		client_drop_item(parent_item, item);
+		up(&subsys->su_sem);
+		module_put(owner);
+	}
+
+out_put:
+	/* link_obj()/link_group() took their own reference on the parent,
+	 * so the working reference is dropped on success as well. */
+	config_item_put(parent_item);
+	return ret;
+}
+
+static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct config_item *parent_item;
+	struct config_item *item;
+	struct configfs_subsystem *subsys;
+	struct configfs_dirent *sd;
+	struct module *owner = NULL;	/* stays NULL when the item has no ci_type */
+	int ret;
+	/* Entries directly under the configfs root cannot be removed with rmdir(2). */
+	if (dentry->d_parent == configfs_sb->s_root)
+		return -EPERM;
+	/* Default groups (flagged in create_default_group()) are refused as well. */
+	sd = dentry->d_fsdata;
+	if (sd->s_type & CONFIGFS_USET_DEFAULT)
+		return -EPERM;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	subsys = to_config_group(parent_item)->cg_subsys;
+	BUG_ON(!subsys);
+
+	if (!parent_item->ci_type) {
+		config_item_put(parent_item);
+		return -EINVAL;
+	}
+
+	ret = configfs_detach_prep(dentry);
+	if (ret) {
+		configfs_detach_rollback(dentry);	/* prep failed: roll it back and bail out */
+		config_item_put(parent_item);
+		return ret;
+	}
+
+	item = configfs_get_config_item(dentry);
+
+	/* Drop reference from above, item already holds one. */
+	config_item_put(parent_item);
+
+	if (item->ci_type)
+		owner = item->ci_type->ct_owner;
+	/* Both branches detach first and leave su_sem held for the unlink. */
+	if (sd->s_type & CONFIGFS_USET_DIR) {
+		configfs_detach_group(item);
+
+		down(&subsys->su_sem);
+		unlink_group(to_config_group(item));
+	} else {
+		configfs_detach_item(item);
+
+		down(&subsys->su_sem);
+		unlink_obj(item);
+	}
+
+	client_drop_item(parent_item, item);
+	up(&subsys->su_sem);	/* releases the down() taken in either branch above */
+
+	/* Drop our reference from above */
+	config_item_put(item);
+	/* NOTE(review): presumably pairs with try_module_get() in configfs_mkdir(). */
+	module_put(owner);
+
+	return 0;
+}
+
+struct inode_operations configfs_dir_inode_operations = {	/* no .rename entry is provided */
+	.mkdir		= configfs_mkdir,	/* creates an item or group via the parent's ct_group_ops */
+	.rmdir		= configfs_rmdir,	/* refuses default groups and entries under the root */
+	.symlink	= configfs_symlink,
+	.unlink		= configfs_unlink,
+	.lookup		= configfs_lookup,
+};
+
+#if 0	/* compiled out; configfs_dir_inode_operations has no .rename entry */
+int configfs_rename_dir(struct config_item * item, const char *new_name)
+{
+	int error = 0;
+	struct dentry * new_dentry, * parent;
+
+	if (!strcmp(config_item_name(item), new_name))
+		return -EINVAL;
+
+	if (!item->parent)
+		return -EINVAL;
+
+	down_write(&configfs_rename_sem);
+	parent = item->parent->dentry;
+
+	down(&parent->d_inode->i_sem);
+
+	new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
+	if (!IS_ERR(new_dentry)) {
+  		if (!new_dentry->d_inode) {
+			error = config_item_set_name(item, "%s", new_name);
+			if (!error) {
+				d_add(new_dentry, NULL);
+				d_move(item->dentry, new_dentry);
+			}
+			else
+				d_delete(new_dentry);
+		} else
+			error = -EEXIST;
+		dput(new_dentry);
+	}
+	up(&parent->d_inode->i_sem);
+	up_write(&configfs_rename_sem);
+
+	return error;
+}
+#endif	/* 0 */
+
+static int configfs_dir_open(struct inode *inode, struct file *file)
+{
+	struct dentry *dir = file->f_dentry;
+	struct configfs_dirent *dir_sd = dir->d_fsdata;
+	struct configfs_dirent *cursor;
+
+	/* One cursor dirent per open file; configfs_readdir() walks with it. */
+	down(&dir->d_inode->i_sem);
+	file->private_data = cursor = configfs_new_dirent(dir_sd, NULL);
+	up(&dir->d_inode->i_sem);
+	return cursor ? 0 : -ENOMEM;
+}
+
+static int configfs_dir_close(struct inode *inode, struct file *file)
+{
+	struct dentry * dentry = file->f_dentry;
+	struct configfs_dirent * cursor = file->private_data;	/* set by configfs_dir_open() */
+	/* Take the cursor off its sibling list under i_sem, then release it. */
+	down(&dentry->d_inode->i_sem);
+	list_del_init(&cursor->s_sibling);
+	up(&dentry->d_inode->i_sem);
+
+	release_configfs_dirent(cursor);
+
+	return 0;
+}
+
+/* Relationship between s_mode and the DT_xxx types */
+static inline unsigned char dt_type(struct configfs_dirent *sd)
+{
+	return (sd->s_mode >> 12) & 15;	/* bits 12-15 of s_mode, passed to filldir as the entry type */
+}
+
+static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_dentry;
+	struct configfs_dirent * parent_sd = dentry->d_fsdata;
+	struct configfs_dirent *cursor = filp->private_data;	/* per-open cursor from configfs_dir_open() */
+	struct list_head *p, *q = &cursor->s_sibling;
+	ino_t ino;
+	int i = filp->f_pos;
+	/* f_pos 0 and 1 are "." and ".."; entries from s_children start at 2. */
+	switch (i) {
+		case 0:
+			ino = dentry->d_inode->i_ino;
+			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+				break;
+			filp->f_pos++;
+			i++;
+			/* fallthrough */
+		case 1:
+			ino = parent_ino(dentry);
+			if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
+				break;
+			filp->f_pos++;
+			i++;
+			/* fallthrough */
+		default:
+			if (filp->f_pos == 2) {	/* start of the list: park the cursor at its head */
+				list_del(q);
+				list_add(q, &parent_sd->s_children);
+			}
+			for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
+				struct configfs_dirent *next;
+				const char * name;
+				int len;
+
+				next = list_entry(p, struct configfs_dirent,
+						   s_sibling);
+				if (!next->s_element)	/* no element, e.g. a cursor: skip it */
+					continue;
+
+				name = configfs_get_name(next);
+				len = strlen(name);
+				if (next->s_dentry)
+					ino = next->s_dentry->d_inode->i_ino;
+				else
+					ino = iunique(configfs_sb, 2);	/* no dentry to take an inode number from */
+
+				if (filldir(dirent, name, len, filp->f_pos, ino,
+						 dt_type(next)) < 0)
+					return 0;	/* filldir refused the entry; cursor stays put */
+				/* Move the cursor to just after the entry that was emitted. */
+				list_del(q);
+				list_add(q, p);
+				p = q;
+				filp->f_pos++;
+			}
+	}
+	return 0;
+}
+
+static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
+{
+	struct dentry * dentry = file->f_dentry;
+
+	down(&dentry->d_inode->i_sem);
+	switch (origin) {
+		case 1:	/* offset is relative to the current position */
+			offset += file->f_pos;	/* fallthrough */
+		case 0:	/* offset is absolute */
+			if (offset >= 0)
+				break;	/* otherwise fall through and reject */
+		default:	/* negative result or unsupported origin */
+			up(&file->f_dentry->d_inode->i_sem);
+			return -EINVAL;
+	}
+	if (offset != file->f_pos) {
+		file->f_pos = offset;
+		if (file->f_pos >= 2) {	/* positions 0 and 1 are "." and ".." in configfs_readdir() */
+			struct configfs_dirent *sd = dentry->d_fsdata;
+			struct configfs_dirent *cursor = file->private_data;
+			struct list_head *p;
+			loff_t n = file->f_pos - 2;	/* number of real entries to skip */
+
+			list_del(&cursor->s_sibling);
+			p = sd->s_children.next;
+			while (n && p != &sd->s_children) {
+				struct configfs_dirent *next;
+				next = list_entry(p, struct configfs_dirent,
+						   s_sibling);
+				if (next->s_element)	/* only entries with an element count */
+					n--;
+				p = p->next;
+			}
+			list_add_tail(&cursor->s_sibling, p);	/* re-insert the cursor just before p */
+		}
+	}
+	up(&dentry->d_inode->i_sem);
+	return offset;
+}
+
+struct file_operations configfs_dir_operations = {
+	.open		= configfs_dir_open,	/* allocates the per-open cursor */
+	.release	= configfs_dir_close,	/* unlinks and releases the cursor */
+	.llseek		= configfs_dir_lseek,	/* repositions the cursor */
+	.read		= generic_read_dir,
+	.readdir	= configfs_readdir,
+};
+
+int configfs_register_subsystem(struct configfs_subsystem *subsys)
+{
+	int err;
+	struct config_group *group = &subsys->su_group;
+	struct qstr name;
+	struct dentry *dentry;
+	struct configfs_dirent *sd;
+
+	err = configfs_pin_fs();
+	if (err)
+		return err;
+
+	if (!group->cg_item.ci_name)
+		group->cg_item.ci_name = group->cg_item.ci_namebuf;
+
+	sd = configfs_sb->s_root->d_fsdata;
+	link_group(to_config_group(sd->s_element), group);
+
+	down(&configfs_sb->s_root->d_inode->i_sem);
+
+	name.name = group->cg_item.ci_name;
+	name.len = strlen(name.name);
+	name.hash = full_name_hash(name.name, name.len);
+
+	err = -ENOMEM;
+	dentry = d_alloc(configfs_sb->s_root, &name);
+	if (dentry) {
+		d_add(dentry, NULL);
+
+		err = configfs_attach_group(sd->s_element, &group->cg_item,
+					    dentry);
+		if (err)
+			d_delete(dentry);
+	}
+
+	/* Single unlock point: the d_alloc() failure path must drop
+	 * the root i_sem as well before the shared error cleanup. */
+	up(&configfs_sb->s_root->d_inode->i_sem);
+
+	/* Failure: drop the dentry, then undo link_group() and the fs pin. */
+	if (err) {
+		if (dentry)
+			dput(dentry);
+		unlink_group(group);
+		configfs_release_fs();
+	}
+
+	return err;
+}
+
+void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
+{
+	struct config_group *group = &subsys->su_group;
+	struct dentry *dentry = group->cg_item.ci_dentry;
+	/* Only a group sitting directly under the configfs root is accepted. */
+	if (dentry->d_parent != configfs_sb->s_root) {
+		printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n");
+		return;
+	}
+	/* Lock order: the root directory first, then the subsystem directory. */
+	down(&configfs_sb->s_root->d_inode->i_sem);
+	down(&dentry->d_inode->i_sem);
+	if (configfs_detach_prep(dentry)) {
+		printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
+	}
+	configfs_detach_group(&group->cg_item);	/* runs even if the prep check complained */
+	dentry->d_inode->i_flags |= S_DEAD;
+	up(&dentry->d_inode->i_sem);
+
+	d_delete(dentry);
+
+	up(&configfs_sb->s_root->d_inode->i_sem);
+
+	dput(dentry);
+	/* Mirror of registration: undo link_group() and the configfs_pin_fs() pin. */
+	unlink_group(group);
+	configfs_release_fs();
+}
+
+EXPORT_SYMBOL(configfs_register_subsystem);
+EXPORT_SYMBOL(configfs_unregister_subsystem);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
new file mode 100644
index 0000000..af1ffc9
--- /dev/null
+++ b/fs/configfs/file.c
@@ -0,0 +1,360 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * file.c - operations for regular (text) files.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/dnotify.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+
+struct configfs_buffer {
+	size_t			count;	/* valid bytes in page, set by fill_read_buffer() */
+	loff_t			pos;	/* NOTE(review): not referenced by the code visible here */
+	char			* page;	/* one zeroed page, allocated on first read or write */
+	struct configfs_item_operations	* ops;	/* show/store methods, set in check_perm() */
+	struct semaphore	sem;	/* held across each read and each write */
+	int			needs_read_fill;	/* nonzero: refill page before the next read */
+};
+
+
+/**
+ *	fill_read_buffer - allocate and fill buffer from item.
+ *	@dentry:	dentry pointer.
+ *	@buffer:	data buffer for file.
+ *
+ *	Allocate @buffer->page, if it hasn't been already, then call the
+ *	config_item's show() method to fill the buffer with this attribute's
+ *	data.
+ *	This runs whenever needs_read_fill is set: on the first read, or the first read after a write.
+ */
+static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer)
+{
+	struct configfs_attribute *attr = to_attr(dentry);
+	struct config_item *item = to_item(dentry->d_parent);
+	ssize_t len;
+
+	/* The page is allocated lazily and then reused for later fills. */
+	if (!buffer->page) {
+		buffer->page = (char *) get_zeroed_page(GFP_KERNEL);
+		if (!buffer->page)
+			return -ENOMEM;
+	}
+
+	len = buffer->ops->show_attribute(item, attr, buffer->page);
+	buffer->needs_read_fill = 0;
+	/* A show method must never report more than one page of data. */
+	BUG_ON(len > (ssize_t)PAGE_SIZE);
+	if (len < 0)
+		return len;	/* hand the show method's error to the caller */
+	buffer->count = len;
+	return 0;
+}
+
+
+/**
+ *	flush_read_buffer - push buffer to userspace.
+ *	@buffer:	data buffer for file.
+ *	@buf:		user-passed buffer.
+ *	@count:		number of bytes requested.
+ *	@ppos:		file position.
+ *
+ *	Copy the buffer we filled in fill_read_buffer() to userspace.
+ *	This is done at the reader's leisure, copying and advancing
+ *	the amount they specify each time.
+ *	This may be called continuously until the buffer is empty.
+ */
+static int flush_read_buffer(struct configfs_buffer * buffer, char __user * buf,
+			     size_t count, loff_t * ppos)
+{
+	int error;
+	/* Reading past the buffered data yields 0 bytes (end of file). */
+	if (*ppos > buffer->count)
+		return 0;
+	/* Clamp the request to what is left in the buffer after *ppos. */
+	if (count > (buffer->count - *ppos))
+		count = buffer->count - *ppos;
+
+	error = copy_to_user(buf,buffer->page + *ppos,count);
+	if (!error)
+		*ppos += count;	/* advance only when the whole copy succeeded */
+	return error ? -EFAULT : count;
+}
+
+/**
+ *	configfs_read_file - read an attribute.
+ *	@file:	file pointer.
+ *	@buf:	buffer to fill.
+ *	@count:	number of bytes to read.
+ *	@ppos:	starting offset in file.
+ *
+ *	Userspace wants to read an attribute file. The attribute descriptor
+ *	is in the file's ->d_fsdata. The target item is in the directory's
+ *	->d_fsdata.
+ *
+ *	We call fill_read_buffer() to allocate and fill the buffer from the
+ *	item's show() method exactly once (if the read is happening from
+ *	the beginning of the file). That should fill the entire buffer with
+ *	all the data the item has to offer for that attribute.
+ *	We then call flush_read_buffer() to copy the buffer to userspace
+ *	in the increments specified.
+ */
+
+static ssize_t
+configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	struct configfs_buffer * buffer = file->private_data;
+	ssize_t retval = 0;
+
+	down(&buffer->sem);	/* one reader or writer per open file at a time */
+	if (buffer->needs_read_fill) {
+		if ((retval = fill_read_buffer(file->f_dentry,buffer)))
+			goto out;	/* show method or page allocation failed */
+	}
+	pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n",
+		 __FUNCTION__,count,*ppos,buffer->page);	/* count is a size_t: %zd, not %d */
+	retval = flush_read_buffer(buffer,buf,count,ppos);
+out:
+	up(&buffer->sem);
+	return retval;
+}
+
+
+/**
+ *	fill_write_buffer - copy buffer from userspace.
+ *	@buffer:	data buffer for file.
+ *	@buf:		data from user.
+ *	@count:		number of bytes in @buf.
+ *
+ *	Allocate @buffer->page if it hasn't been already, then
+ *	copy the user-supplied buffer into it.
+ */
+
+static int
+fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size_t count)
+{
+	int error;
+
+	if (!buffer->page)
+		buffer->page = (char *)get_zeroed_page(GFP_KERNEL);
+	if (!buffer->page)
+		return -ENOMEM;
+	if (count >= PAGE_SIZE)
+		count = PAGE_SIZE - 1;	/* keep the last byte for the terminator */
+	error = copy_from_user(buffer->page,buf,count);
+	buffer->needs_read_fill = 1;	/* the page no longer holds show() output */
+	buffer->page[count] = 0;	/* store methods get a NUL-terminated string */
+	return error ? -EFAULT : count;
+}
+
+
+/**
+ *	flush_write_buffer - push buffer to config_item.
+ *	@dentry:	dentry of the attribute file.
+ *	@buffer:	data buffer for file.
+ *
+ *	Get the correct pointers for the config_item and the attribute we're
+ *	dealing with, then call the store() method for the attribute,
+ *	passing the buffer that we acquired in fill_write_buffer().
+ */
+
+static int
+flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count)
+{
+	struct configfs_attribute * attr = to_attr(dentry);
+	struct config_item * item = to_item(dentry->d_parent);	/* the item is looked up from the parent directory */
+	struct configfs_item_operations * ops = buffer->ops;
+	/* Hand the buffered page and byte count to the item's store method. */
+	return ops->store_attribute(item,attr,buffer->page,count);
+}
+
+
+/**
+ *	configfs_write_file - write an attribute.
+ *	@file:	file pointer
+ *	@buf:	data to write
+ *	@count:	number of bytes
+ *	@ppos:	starting offset
+ *
+ *	Similar to configfs_read_file(), though working in the opposite direction.
+ *	We allocate and fill the data from the user in fill_write_buffer(),
+ *	then push it to the config_item in flush_write_buffer().
+ *	There is no easy way for us to know if userspace is only doing a partial
+ *	write, so we don't support them. We expect the entire buffer to come
+ *	on the first write.
+ *	Hint: if you're writing a value, first read the file, modify only
+ *	the value you're changing, then write the entire buffer back.
+ */
+static ssize_t
+configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+{
+	struct configfs_buffer * buffer = file->private_data;
+	ssize_t len;	/* signed, so negative error codes from the helpers survive */
+
+	down(&buffer->sem);
+	len = fill_write_buffer(buffer,buf,count);
+	if (len > 0)	/* only push data that was actually copied in */
+		len = flush_write_buffer(file->f_dentry,buffer,len);
+	if (len > 0)
+		*ppos += len;
+	up(&buffer->sem);
+	return len;
+}
+
+static int check_perm(struct inode * inode, struct file * file)
+{
+	struct config_item *item = configfs_get_config_item(file->f_dentry->d_parent);
+	struct configfs_attribute * attr = to_attr(file->f_dentry);
+	struct configfs_buffer * buffer;
+	struct configfs_item_operations * ops = NULL;
+	int error = 0;
+
+	if (!item || !attr)
+		goto Einval;
+
+	/* Grab the module reference for this attribute if we have one */
+	if (!try_module_get(attr->ca_owner)) {
+		error = -ENODEV;
+		goto Done;
+	}
+
+	if (item->ci_type)
+		ops = item->ci_type->ct_item_ops;
+	if (!ops)	/* no type or no ops table: nothing to show or store */
+		goto Eaccess;
+
+	/* File needs write support.
+	 * The inode's perms must say it's ok,
+	 * and we must have a store method.
+	 */
+	if (file->f_mode & FMODE_WRITE) {
+		if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute)
+			goto Eaccess;
+	}
+
+	/* File needs read support.
+	 * The inode's perms must say it's ok, and there
+	 * must be a show method for it.
+	 */
+	if (file->f_mode & FMODE_READ) {
+		if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute)
+			goto Eaccess;
+	}
+
+	/* No error? Great, allocate a buffer for the file, and store
+	 * it in file->private_data for easy access.
+	 */
+	buffer = kmalloc(sizeof(struct configfs_buffer),GFP_KERNEL);
+	if (!buffer) {
+		error = -ENOMEM;
+		goto Enomem;
+	}
+	memset(buffer,0,sizeof(struct configfs_buffer));
+	init_MUTEX(&buffer->sem);
+	buffer->needs_read_fill = 1;
+	buffer->ops = ops;
+	file->private_data = buffer;
+	goto Done;
+
+ Einval:
+	error = -EINVAL;
+	goto Done;
+ Eaccess:
+	error = -EACCES;
+ Enomem:
+	module_put(attr->ca_owner);	/* both paths hold the reference taken above */
+ Done:
+	if (error && item)
+		config_item_put(item);
+	return error;
+}
+
+static int configfs_open_file(struct inode * inode, struct file * filp)
+{
+	return check_perm(inode,filp);	/* also allocates the per-open buffer on success */
+}
+
+static int configfs_release(struct inode * inode, struct file * filp)
+{
+	struct config_item * item = to_item(filp->f_dentry->d_parent);
+	struct configfs_attribute * attr = to_attr(filp->f_dentry);
+	struct module * owner = attr->ca_owner;	/* saved now; attr is off limits later */
+	struct configfs_buffer * buffer = filp->private_data;
+	/* Drop the item and module references that check_perm() took at open. */
+	if (item)
+		config_item_put(item);
+	/* After this point, attr should not be accessed. */
+	module_put(owner);
+
+	if (buffer) {
+		if (buffer->page)	/* the page is only allocated on first read or write */
+			free_page((unsigned long)buffer->page);
+		kfree(buffer);
+	}
+	return 0;
+}
+
+struct file_operations configfs_file_operations = {
+	.read		= configfs_read_file,	/* fills from show_attribute, then copies out */
+	.write		= configfs_write_file,	/* copies in, then calls store_attribute */
+	.llseek		= generic_file_llseek,
+	.open		= configfs_open_file,	/* permission check plus buffer allocation */
+	.release	= configfs_release,	/* drops references and frees the buffer */
+};
+
+
+int configfs_add_file(struct dentry * dir, const struct configfs_attribute * attr, int type)
+{
+	struct configfs_dirent *dir_sd = dir->d_fsdata;
+	/* Mask ca_mode down to S_IALLUGO and mark the entry a regular file. */
+	const umode_t file_mode = S_IFREG | (attr->ca_mode & S_IALLUGO);
+	int rc;
+
+	down(&dir->d_inode->i_sem);	/* the dirent is created under the directory's i_sem */
+	rc = configfs_make_dirent(dir_sd, NULL, (void *) attr, file_mode, type);
+	up(&dir->d_inode->i_sem);
+	return rc;
+}
+
+
+/**
+ *	configfs_create_file - create an attribute file for an item.
+ *	@item:	item we're creating for.
+ *	@attr:	attribute descriptor.
+ */
+
+int configfs_create_file(struct config_item * item, const struct configfs_attribute * attr)
+{
+	BUG_ON(!item || !item->ci_dentry || !attr);
+	/* The item's directory dentry is the parent of the new attribute entry. */
+	return configfs_add_file(item->ci_dentry, attr,
+				 CONFIGFS_ITEM_ATTR);
+}
+
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
new file mode 100644
index 0000000..6b274c6
--- /dev/null
+++ b/fs/configfs/inode.c
@@ -0,0 +1,162 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * inode.c - basic inode and dentry operations.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * Please see Documentation/filesystems/configfs.txt for more information.
+ */
+
+#undef DEBUG
+
+#include <linux/pagemap.h>
+#include <linux/namei.h>
+#include <linux/backing-dev.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+extern struct super_block * configfs_sb;
+
+static struct address_space_operations configfs_aops = {	/* installed by configfs_new_inode() */
+	.readpage	= simple_readpage,
+	.prepare_write	= simple_prepare_write,
+	.commit_write	= simple_commit_write
+};
+
+static struct backing_dev_info configfs_backing_dev_info = {	/* installed by configfs_new_inode() */
+	.ra_pages	= 0,	/* No readahead */
+	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+
+struct inode * configfs_new_inode(mode_t mode)
+{
+	struct inode *fresh = new_inode(configfs_sb);
+	if (!fresh)	/* allocation failed: the caller sees NULL */
+		return NULL;
+	fresh->i_mapping->backing_dev_info = &configfs_backing_dev_info;
+	fresh->i_mapping->a_ops = &configfs_aops;
+	fresh->i_atime = fresh->i_mtime = fresh->i_ctime = CURRENT_TIME;
+	fresh->i_blksize = PAGE_CACHE_SIZE;
+	fresh->i_blocks = 0;
+	fresh->i_mode = mode;
+	fresh->i_uid = 0;	/* uid and gid both start out as 0 */
+	fresh->i_gid = 0;
+	return fresh;
+}
+
+int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *))
+{
+	struct inode *new;
+	struct dentry *up_dir;
+	int rc = 0;
+
+	if (!dentry)
+		return -ENOENT;
+	if (dentry->d_inode)
+		return -EEXIST;
+
+	new = configfs_new_inode(mode);
+	if (!new)
+		return -ENOMEM;
+
+	/* A new entry changes the parent directory: bump its timestamps. */
+	up_dir = dentry->d_parent;
+	if (up_dir && up_dir->d_inode)
+		up_dir->d_inode->i_mtime = up_dir->d_inode->i_ctime = CURRENT_TIME;
+
+	/* Let the caller's hook finish the inode before it is instantiated. */
+	if (init)
+		rc = init(new);
+	if (rc) {
+		iput(new);
+		return rc;
+	}
+
+	d_instantiate(dentry, new);
+	if (S_ISDIR(mode) || S_ISLNK(mode))
+		dget(dentry);  /* pin link and directory dentries in core */
+	return 0;
+}
+
+/*
+ * Get the name for corresponding element represented by the given configfs_dirent
+ */
+const unsigned char * configfs_get_name(struct configfs_dirent *sd)
+{
+	struct configfs_attribute * attr;	/* configfs's own type, not sysfs's struct attribute */
+
+	if (!sd || !sd->s_element)
+		BUG();
+
+	/* These always have a dentry, so use that */
+	if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK))
+		return sd->s_dentry->d_name.name;
+
+	if (sd->s_type & CONFIGFS_ITEM_ATTR) {
+		attr = sd->s_element;	/* configfs_add_file() stores a configfs_attribute here */
+		return attr->ca_name;
+	}
+	return NULL;	/* no other dirent type carries a name */
+}
+
+
+/*
+ * Unhashes the dentry corresponding to given configfs_dirent
+ * Called with parent inode's i_sem held.
+ */
+void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
+{
+	struct dentry * dentry = sd->s_dentry;
+	/* A dirent with no dentry attached needs no work here. */
+	if (dentry) {
+		spin_lock(&dcache_lock);
+		if (!(d_unhashed(dentry) && dentry->d_inode)) {	/* skip only if already unhashed with an inode */
+			dget_locked(dentry);
+			__d_drop(dentry);
+			spin_unlock(&dcache_lock);	/* dropped before calling simple_unlink() */
+			simple_unlink(parent->d_inode, dentry);
+		} else
+			spin_unlock(&dcache_lock);
+	}
+}
+
+void configfs_hash_and_remove(struct dentry * dir, const char * name)
+{
+	struct configfs_dirent * sd;
+	struct configfs_dirent * parent_sd = dir->d_fsdata;
+	/* Find the child called @name under @dir and remove it; at most one match. */
+	down(&dir->d_inode->i_sem);
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+		if (!sd->s_element)	/* configfs_get_name() would BUG() on these */
+			continue;
+		if (!strcmp(configfs_get_name(sd), name)) {
+			list_del_init(&sd->s_sibling);
+			configfs_drop_dentry(sd, dir);
+			configfs_put(sd);
+			break;	/* the list was modified: stop iterating */
+		}
+	}
+	up(&dir->d_inode->i_sem);
+}
+
+
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
new file mode 100644
index 0000000..e07485a
--- /dev/null
+++ b/fs/configfs/item.c
@@ -0,0 +1,227 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * item.c - library routines for handling generic config items
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on kobject:
+ * 	kobject is Copyright (c) 2002-2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * Please see the file Documentation/filesystems/configfs.txt for
+ * critical information about using the config_item interface.
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+
+static inline struct config_item * to_item(struct list_head * entry)
+{
+	return container_of(entry,struct config_item,ci_entry);
+}
+
+/* Evil kernel */
+static void config_item_release(struct kref *kref);
+
+/**
+ *	config_item_init - initialize item.
+ *	@item:	item in question.
+ */
+void config_item_init(struct config_item * item)
+{
+	kref_init(&item->ci_kref);
+	INIT_LIST_HEAD(&item->ci_entry);
+}
+
+/**
+ *	config_item_set_name - Set the name of an item
+ *	@item:	item.
+ *	@fmt:	printf-style format for the name, followed by its arguments.
+ *
+ *	If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a
+ *	dynamically allocated string that @item->ci_name points to.
+ *	Otherwise, use the static @item->ci_namebuf array.
+ */
+
+int config_item_set_name(struct config_item * item, const char * fmt, ...)
+{
+	int error = 0;
+	int limit = CONFIGFS_ITEM_NAME_LEN;
+	int need;
+	va_list args;
+	char * name;
+
+	/*
+	 * First, try the static array
+	 */
+	va_start(args,fmt);
+	need = vsnprintf(item->ci_namebuf,limit,fmt,args);
+	va_end(args);
+	if (need < limit)
+		name = item->ci_namebuf;
+	else {
+		/*
+		 * Need more space? Allocate it and try again
+		 */
+		limit = need + 1;
+		name = kmalloc(limit,GFP_KERNEL);
+		if (!name) {
+			error = -ENOMEM;
+			goto Done;
+		}
+		va_start(args,fmt);
+		need = vsnprintf(name,limit,fmt,args);
+		va_end(args);
+
+		/* Still? Give up. */
+		if (need >= limit) {
+			kfree(name);
+			error = -EFAULT;
+			goto Done;
+		}
+	}
+
+	/* Free the old name, if necessary. */
+	if (item->ci_name && item->ci_name != item->ci_namebuf)
+		kfree(item->ci_name);
+
+	/* Now, set the new name */
+	item->ci_name = name;
+ Done:
+	return error;
+}
+
+EXPORT_SYMBOL(config_item_set_name);
+
+/* Name @item with the literal string @name, then set its type and init it. */
+void config_item_init_type_name(struct config_item *item, const char *name,
+				struct config_item_type *type)
+{
+	config_item_set_name(item, "%s", name);	/* @name is data, not fmt */
+	item->ci_type = type;
+	config_item_init(item);
+}
+EXPORT_SYMBOL(config_item_init_type_name);
+
+void config_group_init_type_name(struct config_group *group, const char *name,
+			 struct config_item_type *type)
+{
+	config_item_set_name(&group->cg_item, "%s", name);	/* not a format */
+	group->cg_item.ci_type = type;
+	config_group_init(group);
+}
+EXPORT_SYMBOL(config_group_init_type_name);
+
+struct config_item * config_item_get(struct config_item * item)
+{
+	if (item)
+		kref_get(&item->ci_kref);
+	return item;
+}
+
+/**
+ *	config_item_cleanup - free config_item resources.
+ *	@item:	item.
+ */
+
+void config_item_cleanup(struct config_item * item)
+{
+	struct config_item_type * t = item->ci_type;
+	struct config_group * s = item->ci_group;
+	struct config_item * parent = item->ci_parent;
+
+	pr_debug("config_item %s: cleaning up\n",config_item_name(item));
+	if (item->ci_name != item->ci_namebuf)
+		kfree(item->ci_name);
+	item->ci_name = NULL;
+	if (t && t->ct_item_ops && t->ct_item_ops->release)
+		t->ct_item_ops->release(item);
+	if (s)
+		config_group_put(s);
+	if (parent)
+		config_item_put(parent);
+}
+
+static void config_item_release(struct kref *kref)
+{
+	config_item_cleanup(container_of(kref, struct config_item, ci_kref));
+}
+
+/**
+ *	config_item_put - decrement refcount for item.
+ *	@item:	item.
+ *
+ *	Decrement the refcount, and if 0, call config_item_cleanup().
+ */
+void config_item_put(struct config_item * item)
+{
+	if (item)
+		kref_put(&item->ci_kref, config_item_release);
+}
+
+
+/**
+ *	config_group_init - initialize a group for use
+ *	@group:	group
+ */
+
+void config_group_init(struct config_group *group)
+{
+	config_item_init(&group->cg_item);
+	INIT_LIST_HEAD(&group->cg_children);
+}
+
+
+/**
+ *	config_group_find_obj - search for item in group.
+ *	@group:	group we're looking in.
+ *	@name:	item's name.
+ *
+ *	Iterate over @group->cg_children, looking for a matching config_item.
+ *	If a matching item is found, take a reference and return the item.
+ *	No locking is done here (see XXX below); callers must serialize access.
+ */
+
+struct config_item * config_group_find_obj(struct config_group * group, const char * name)
+{
+	struct list_head * entry;
+	struct config_item * ret = NULL;
+
+        /* XXX LOCKING! */
+	list_for_each(entry,&group->cg_children) {
+		struct config_item * item = to_item(entry);
+		if (config_item_name(item) &&
+                    !strcmp(config_item_name(item), name)) {
+			ret = config_item_get(item);
+			break;
+		}
+	}
+	return ret;
+}
+
+
+EXPORT_SYMBOL(config_item_init);
+EXPORT_SYMBOL(config_group_init);
+EXPORT_SYMBOL(config_item_get);
+EXPORT_SYMBOL(config_item_put);
+
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
new file mode 100644
index 0000000..1a2f6f6
--- /dev/null
+++ b/fs/configfs/mount.c
@@ -0,0 +1,159 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * mount.c - operations for initializing and mounting configfs.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+/* Random magic number */
+#define CONFIGFS_MAGIC 0x62656570
+
+struct vfsmount * configfs_mount = NULL;
+struct super_block * configfs_sb = NULL;
+static int configfs_mnt_count = 0;
+
+static struct super_operations configfs_ops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+};
+
+static struct config_group configfs_root_group = {
+	.cg_item = {
+		.ci_namebuf	= "root",
+		.ci_name	= configfs_root_group.cg_item.ci_namebuf,
+	},
+};
+
+int configfs_is_root(struct config_item *item)
+{
+	return item == &configfs_root_group.cg_item;
+}
+
+static struct configfs_dirent configfs_root = {
+	.s_sibling	= LIST_HEAD_INIT(configfs_root.s_sibling),
+	.s_children	= LIST_HEAD_INIT(configfs_root.s_children),
+	.s_element	= &configfs_root_group.cg_item,
+	.s_type		= CONFIGFS_ROOT,
+};
+
+static int configfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *inode;
+	struct dentry *root;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = CONFIGFS_MAGIC;
+	sb->s_op = &configfs_ops;
+	configfs_sb = sb;
+
+	inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO);
+	if (inode) {
+		inode->i_op = &configfs_dir_inode_operations;
+		inode->i_fop = &configfs_dir_operations;
+		/* directory inodes start off with i_nlink == 2 (for "." entry) */
+		inode->i_nlink++;
+	} else {
+		pr_debug("configfs: could not get root inode\n");
+		return -ENOMEM;
+	}
+
+	root = d_alloc_root(inode);
+	if (!root) {
+		pr_debug("%s: could not get root dentry!\n",__FUNCTION__);
+		iput(inode);
+		return -ENOMEM;
+	}
+	config_group_init(&configfs_root_group);
+	configfs_root_group.cg_item.ci_dentry = root;
+	root->d_fsdata = &configfs_root;
+	sb->s_root = root;
+	return 0;
+}
+
+static struct super_block *configfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return get_sb_single(fs_type, flags, data, configfs_fill_super);
+}
+
+static struct file_system_type configfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "configfs",
+	.get_sb		= configfs_get_sb,
+	.kill_sb	= kill_litter_super,
+};
+
+int configfs_pin_fs(void)
+{
+	return simple_pin_fs("configfs", &configfs_mount,
+			     &configfs_mnt_count);
+}
+
+void configfs_release_fs(void)
+{
+	simple_release_fs(&configfs_mount, &configfs_mnt_count);
+}
+
+
+static decl_subsys(config, NULL, NULL);
+
+static int __init configfs_init(void)
+{
+	int err;
+
+	kset_set_kset_s(&config_subsys, kernel_subsys);
+	err = subsystem_register(&config_subsys);
+	if (err)
+		return err;
+
+	err = register_filesystem(&configfs_fs_type);
+	if (err) {
+		printk(KERN_ERR "configfs: Unable to register filesystem!\n");
+		subsystem_unregister(&config_subsys);
+	}
+
+	return err;
+}
+
+static void __exit configfs_exit(void)
+{
+	unregister_filesystem(&configfs_fs_type);
+	subsystem_unregister(&config_subsys);
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("0.0.1");
+MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration.");
+
+module_init(configfs_init);
+module_exit(configfs_exit);
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
new file mode 100644
index 0000000..50f5840
--- /dev/null
+++ b/fs/configfs/symlink.c
@@ -0,0 +1,281 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * symlink.c - operations for configfs symlinks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/namei.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+static int item_depth(struct config_item * item)
+{
+	struct config_item * p = item;
+	int depth = 0;
+	do { depth++; } while ((p = p->ci_parent) && !configfs_is_root(p));
+	return depth;
+}
+
+static int item_path_length(struct config_item * item)
+{
+	struct config_item * p = item;
+	int length = 1;
+	do {
+		length += strlen(config_item_name(p)) + 1;
+		p = p->ci_parent;
+	} while (p && !configfs_is_root(p));
+	return length;
+}
+
+static void fill_item_path(struct config_item * item, char * buffer, int length)
+{
+	struct config_item * p;
+
+	--length;
+	for (p = item; p && !configfs_is_root(p); p = p->ci_parent) {
+		int cur = strlen(config_item_name(p));
+
+		/* back up enough to print this item's name with a leading '/' */
+		length -= cur;
+		strncpy(buffer + length,config_item_name(p),cur);
+		*(buffer + --length) = '/';
+	}
+}
+
+static int create_link(struct config_item *parent_item,
+ 		       struct config_item *item,
+		       struct dentry *dentry)
+{
+	struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata;
+	struct configfs_symlink *sl;
+	int ret;
+
+	ret = -ENOMEM;
+	sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
+	if (sl) {
+		sl->sl_target = config_item_get(item);
+		/* FIXME: needs a lock, I'd bet */
+		list_add(&sl->sl_list, &target_sd->s_links);
+		ret = configfs_create_link(sl, parent_item->ci_dentry,
+					   dentry);
+		if (ret) {
+			list_del_init(&sl->sl_list);
+			config_item_put(item);
+			kfree(sl);
+		}
+	}
+
+	return ret;
+}
+
+
+/* Resolve @symname; on success the caller owns refs on @nd and *@target. */
+static int get_target(const char *symname, struct nameidata *nd,
+		      struct config_item **target)
+{
+	int ret = path_lookup(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, nd);
+
+	if (ret)
+		return ret;
+
+	if (nd->dentry->d_sb == configfs_sb) {
+		*target = configfs_get_config_item(nd->dentry);
+		ret = *target ? 0 : -ENOENT;
+	} else
+		ret = -EPERM;	/* target lives outside configfs */
+
+	if (ret)	/* the caller only releases @nd on success */
+		path_release(nd);
+	return ret;
+}
+
+
+int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+	int ret;
+	struct nameidata nd;
+	struct config_item *parent_item;
+	struct config_item *target_item;
+	struct config_item_type *type;
+
+	ret = -EPERM;  /* What lack-of-symlink returns */
+	if (dentry->d_parent == configfs_sb->s_root)
+		goto out;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	type = parent_item->ci_type;
+
+	if (!type || !type->ct_item_ops ||
+	    !type->ct_item_ops->allow_link)
+		goto out_put;
+
+	ret = get_target(symname, &nd, &target_item);
+	if (ret)
+		goto out_put;
+
+	ret = type->ct_item_ops->allow_link(parent_item, target_item);
+	if (!ret)
+		ret = create_link(parent_item, target_item, dentry);
+
+	config_item_put(target_item);
+	path_release(&nd);
+
+out_put:
+	config_item_put(parent_item);
+
+out:
+	return ret;
+}
+
+int configfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct configfs_dirent *sd = dentry->d_fsdata;
+	struct configfs_symlink *sl;
+	struct config_item *parent_item;
+	struct config_item_type *type;
+	int ret;
+
+	ret = -EPERM;  /* What lack-of-symlink returns */
+	if (!(sd->s_type & CONFIGFS_ITEM_LINK))
+		goto out;
+
+	if (dentry->d_parent == configfs_sb->s_root)
+		BUG();
+
+	sl = sd->s_element;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	type = parent_item->ci_type;
+
+	list_del_init(&sd->s_sibling);
+	configfs_drop_dentry(sd, dentry->d_parent);
+	dput(dentry);
+	configfs_put(sd);
+
+	/*
+	 * drop_link() must be called before
+	 * list_del_init(&sl->sl_list), so that the order of
+	 * drop_link(this, target) and drop_item(target) is preserved.
+	 */
+	if (type && type->ct_item_ops &&
+	    type->ct_item_ops->drop_link)
+		type->ct_item_ops->drop_link(parent_item,
+					       sl->sl_target);
+
+	/* FIXME: Needs lock */
+	list_del_init(&sl->sl_list);
+
+	/* Put reference from create_link() */
+	config_item_put(sl->sl_target);
+	kfree(sl);
+
+	config_item_put(parent_item);
+
+	ret = 0;
+
+out:
+	return ret;
+}
+
+static int configfs_get_target_path(struct config_item * item, struct config_item * target,
+				   char *path)
+{
+	char * s;
+	int depth, size;
+
+	depth = item_depth(item);
+	size = item_path_length(target) + depth * 3 - 1;
+	if (size > PATH_MAX)
+		return -ENAMETOOLONG;
+
+	pr_debug("%s: depth = %d, size = %d\n", __FUNCTION__, depth, size);
+
+	for (s = path; depth--; s += 3)
+		strcpy(s,"../");
+
+	fill_item_path(target, path, size);
+	pr_debug("%s: path = '%s'\n", __FUNCTION__, path);
+
+	return 0;
+}
+
+static int configfs_getlink(struct dentry *dentry, char * path)
+{
+	struct config_item *item, *target_item;
+	int error = 0;
+
+	item = configfs_get_config_item(dentry->d_parent);
+	if (!item)
+		return -EINVAL;
+
+	target_item = configfs_get_config_item(dentry);
+	if (!target_item) {
+		config_item_put(item);
+		return -EINVAL;
+	}
+
+	down_read(&configfs_rename_sem);
+	error = configfs_get_target_path(item, target_item, path);
+	up_read(&configfs_rename_sem);
+
+	config_item_put(item);
+	config_item_put(target_item);
+	return error;
+
+}
+
+/* Build the target path in a fresh page; the page is the put_link cookie. */
+static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long page = get_zeroed_page(GFP_KERNEL);
+	int error = page ? configfs_getlink(dentry, (char *)page) : -ENOMEM;
+
+	if (!error) {
+		nd_set_link(nd, (char *)page);
+		return (void *)page;	/* freed later by configfs_put_link() */
+	}
+
+	/* Nothing else frees the page on failure; free_page(0) is a no-op. */
+	free_page(page);
+	nd_set_link(nd, ERR_PTR(error));
+	return NULL;
+}
+
+static void configfs_put_link(struct dentry *dentry, struct nameidata *nd,
+			      void *cookie)
+{
+	if (cookie) {
+		unsigned long page = (unsigned long)cookie;
+		free_page(page);
+	}
+}
+
+struct inode_operations configfs_symlink_inode_operations = {
+	.follow_link = configfs_follow_link,
+	.readlink = generic_readlink,
+	.put_link = configfs_put_link,
+};
+
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
new file mode 100644
index 0000000..acffb8c
--- /dev/null
+++ b/include/linux/configfs.h
@@ -0,0 +1,205 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * configfs.h - definitions for the configfs filesystem
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * Based on kobject.h:
+ *      Copyright (c) 2002-2003	Patrick Mochel
+ *      Copyright (c) 2002-2003	Open Source Development Labs
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * Please read Documentation/filesystems/configfs.txt before using the
+ * configfs interface, ESPECIALLY the parts about reference counts and
+ * item destructors.
+ */
+
+#ifndef _CONFIGFS_H_
+#define _CONFIGFS_H_
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/kref.h>
+
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+
+#define CONFIGFS_ITEM_NAME_LEN	20
+
+struct module;
+
+struct configfs_item_operations;
+struct configfs_group_operations;
+struct configfs_attribute;
+struct configfs_subsystem;
+
+struct config_item {
+	char			*ci_name;
+	char			ci_namebuf[CONFIGFS_ITEM_NAME_LEN];
+	struct kref		ci_kref;
+	struct list_head	ci_entry;
+	struct config_item	*ci_parent;
+	struct config_group	*ci_group;
+	struct config_item_type	*ci_type;
+	struct dentry		*ci_dentry;
+};
+
+extern int config_item_set_name(struct config_item *, const char *, ...);
+
+static inline char *config_item_name(struct config_item * item)
+{
+	return item->ci_name;
+}
+
+extern void config_item_init(struct config_item *);
+extern void config_item_init_type_name(struct config_item *item,
+				       const char *name,
+				       struct config_item_type *type);
+extern void config_item_cleanup(struct config_item *);
+
+extern struct config_item * config_item_get(struct config_item *);
+extern void config_item_put(struct config_item *);
+
+struct config_item_type {
+	struct module				*ct_owner;
+	struct configfs_item_operations		*ct_item_ops;
+	struct configfs_group_operations	*ct_group_ops;
+	struct configfs_attribute		**ct_attrs;
+};
+
+
+/**
+ *	group - a group of config_items of a specific type, belonging
+ *	to a specific subsystem.
+ */
+
+struct config_group {
+	struct config_item		cg_item;
+	struct list_head		cg_children;
+	struct configfs_subsystem 	*cg_subsys;
+	struct config_group		**default_groups;
+};
+
+
+extern void config_group_init(struct config_group *group);
+extern void config_group_init_type_name(struct config_group *group,
+					const char *name,
+					struct config_item_type *type);
+
+
+static inline struct config_group *to_config_group(struct config_item *item)
+{
+	return item ? container_of(item,struct config_group,cg_item) : NULL;
+}
+
+static inline struct config_group *config_group_get(struct config_group *group)
+{
+	return group ? to_config_group(config_item_get(&group->cg_item)) : NULL;
+}
+
+static inline void config_group_put(struct config_group *group)
+{
+	config_item_put(&group->cg_item);
+}
+
+extern struct config_item *config_group_find_obj(struct config_group *, const char *);
+
+
+struct configfs_attribute {
+	char			*ca_name;
+	struct module 		*ca_owner;
+	mode_t			ca_mode;
+};
+
+
+/*
+ * If allow_link() exists, the item can symlink(2) out to other
+ * items.  If the item is a group, it may support mkdir(2).
+ * Groups supply one of make_group() and make_item().  If the
+ * group supports make_group(), one can create group children.  If it
+ * supports make_item(), one can create config_item children.  If it has
+ * default_groups on group->default_groups, it has automatically created
+ * group children.  default_groups may coexist alongside make_group() or
+ * make_item(), but if the group wishes to have only default_groups
+ * children (disallowing mkdir(2)), it need not provide either function.
+ * If the group has commit_item(), it supports pending and committed (active)
+ * items.
+ */
+struct configfs_item_operations {
+	void (*release)(struct config_item *);
+	ssize_t	(*show_attribute)(struct config_item *, struct configfs_attribute *,char *);
+	ssize_t	(*store_attribute)(struct config_item *,struct configfs_attribute *,const char *, size_t);
+	int (*allow_link)(struct config_item *src, struct config_item *target);
+	int (*drop_link)(struct config_item *src, struct config_item *target);
+};
+
+struct configfs_group_operations {
+	struct config_item *(*make_item)(struct config_group *group, const char *name);
+	struct config_group *(*make_group)(struct config_group *group, const char *name);
+	int (*commit_item)(struct config_item *item);
+	void (*drop_item)(struct config_group *group, struct config_item *item);
+};
+
+
+
+/**
+ * Use these macros to make defining attributes easier. See include/linux/device.h
+ * for examples.
+ */
+
+#if 0
+#define __ATTR(_name,_mode,_show,_store) { \
+	.attr = {.ca_name = __stringify(_name), .ca_mode = _mode, .ca_owner = THIS_MODULE },	\
+	.show	= _show,					\
+	.store	= _store,					\
+}
+
+#define __ATTR_RO(_name) { \
+	.attr	= { .ca_name = __stringify(_name), .ca_mode = 0444, .ca_owner = THIS_MODULE },	\
+	.show	= _name##_show,	\
+}
+
+#define __ATTR_NULL { .attr = { .name = NULL } }
+
+#define attr_name(_attr) (_attr).attr.name
+#endif
+
+
+struct configfs_subsystem {
+	struct config_group	su_group;
+	struct semaphore	su_sem;
+};
+
+static inline struct configfs_subsystem *to_configfs_subsystem(struct config_group *group)
+{
+	return group ?
+		container_of(group, struct configfs_subsystem, su_group) :
+		NULL;
+}
+
+int configfs_register_subsystem(struct configfs_subsystem *subsys);
+void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
+
+#endif  /* __KERNEL__ */
+
+#endif /* _CONFIGFS_H_ */
-- 
cgit v1.1


From 994fc28c7b1e697ac56befe4aecabf23f0689f46 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Thu, 15 Dec 2005 14:28:17 -0800
Subject: [PATCH] add AOP_TRUNCATED_PAGE, prepend AOP_ to WRITEPAGE_ACTIVATE

readpage(), prepare_write(), and commit_write() callers are updated to
understand the special return code AOP_TRUNCATED_PAGE in the style of
writepage() and WRITEPAGE_ACTIVATE.  AOP_TRUNCATED_PAGE tells the caller that
the callee has unlocked the page and that the operation should be tried again
with a new page.  OCFS2 uses this to detect and work around a lock inversion in
its aop methods.  There should be no change in behaviour for methods that don't
return AOP_TRUNCATED_PAGE.

WRITEPAGE_ACTIVATE is also prepended with AOP_ for consistency and they are
made enums so that kerneldoc can be used to document their semantics.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---
 drivers/block/loop.c      | 23 +++++++++++----
 drivers/block/rd.c        |  4 +--
 fs/mpage.c                |  2 +-
 include/linux/fs.h        | 31 ++++++++++++++++++++
 include/linux/writeback.h |  6 ----
 mm/filemap.c              | 73 ++++++++++++++++++++++++++++++++---------------
 mm/readahead.c            | 15 ++++++----
 mm/shmem.c                |  2 +-
 mm/vmscan.c               |  2 +-
 9 files changed, 113 insertions(+), 45 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 96c664a..a452b13 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -213,7 +213,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 	struct address_space_operations *aops = mapping->a_ops;
 	pgoff_t index;
 	unsigned offset, bv_offs;
-	int len, ret = 0;
+	int len, ret;
 
 	down(&mapping->host->i_sem);
 	index = pos >> PAGE_CACHE_SHIFT;
@@ -232,9 +232,15 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 		page = grab_cache_page(mapping, index);
 		if (unlikely(!page))
 			goto fail;
-		if (unlikely(aops->prepare_write(file, page, offset,
-				offset + size)))
+		ret = aops->prepare_write(file, page, offset,
+					  offset + size);
+		if (unlikely(ret)) {
+			if (ret == AOP_TRUNCATED_PAGE) {
+				page_cache_release(page);
+				continue;
+			}
 			goto unlock;
+		}
 		transfer_result = lo_do_transfer(lo, WRITE, page, offset,
 				bvec->bv_page, bv_offs, size, IV);
 		if (unlikely(transfer_result)) {
@@ -251,9 +257,15 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 			kunmap_atomic(kaddr, KM_USER0);
 		}
 		flush_dcache_page(page);
-		if (unlikely(aops->commit_write(file, page, offset,
-				offset + size)))
+		ret = aops->commit_write(file, page, offset,
+					 offset + size);
+		if (unlikely(ret)) {
+			if (ret == AOP_TRUNCATED_PAGE) {
+				page_cache_release(page);
+				continue;
+			}
 			goto unlock;
+		}
 		if (unlikely(transfer_result))
 			goto unlock;
 		bv_offs += size;
@@ -264,6 +276,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 		unlock_page(page);
 		page_cache_release(page);
 	}
+	ret = 0;
 out:
 	up(&mapping->host->i_sem);
 	return ret;
diff --git a/drivers/block/rd.c b/drivers/block/rd.c
index 68c60a5..ffd6abd 100644
--- a/drivers/block/rd.c
+++ b/drivers/block/rd.c
@@ -154,7 +154,7 @@ static int ramdisk_commit_write(struct file *file, struct page *page,
 
 /*
  * ->writepage to the the blockdev's mapping has to redirty the page so that the
- * VM doesn't go and steal it.  We return WRITEPAGE_ACTIVATE so that the VM
+ * VM doesn't go and steal it.  We return AOP_WRITEPAGE_ACTIVATE so that the VM
  * won't try to (pointlessly) write the page again for a while.
  *
  * Really, these pages should not be on the LRU at all.
@@ -165,7 +165,7 @@ static int ramdisk_writepage(struct page *page, struct writeback_control *wbc)
 		make_page_uptodate(page);
 	SetPageDirty(page);
 	if (wbc->for_reclaim)
-		return WRITEPAGE_ACTIVATE;
+		return AOP_WRITEPAGE_ACTIVATE;
 	unlock_page(page);
 	return 0;
 }
diff --git a/fs/mpage.c b/fs/mpage.c
index c5adcdd..f1d2d02 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -721,7 +721,7 @@ retry:
 						&last_block_in_bio, &ret, wbc,
 						page->mapping->a_ops->writepage);
 			}
-			if (unlikely(ret == WRITEPAGE_ACTIVATE))
+			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
 				unlock_page(page);
 			if (ret || (--(wbc->nr_to_write) <= 0))
 				done = 1;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cc35b6a..ed9a41a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -302,6 +302,37 @@ struct iattr {
  */
 #include <linux/quota.h>
 
+/** 
+ * enum positive_aop_returns - aop return codes with specific semantics
+ *
+ * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
+ * 			    completed, that the page is still locked, and
+ * 			    should be considered active.  The VM uses this hint
+ * 			    to return the page to the active list -- it won't
+ * 			    be a candidate for writeback again in the near
+ * 			    future.  Other callers must be careful to unlock
+ * 			    the page if they get this return.  Returned by
+ * 			    writepage(); 
+ *
+ * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
+ *  			unlocked it and the page might have been truncated.
+ *  			The caller should back up to acquiring a new page and
+ *  			trying again.  The aop will be taking reasonable
+ *  			precautions not to livelock.  If the caller held a page
+ *  			reference, it should drop it before retrying.  Returned
+ *  			by readpage(), prepare_write(), and commit_write().
+ *
+ * address_space_operation functions return these large constants to indicate
+ * special semantics to the caller.  These are much larger than the bytes in a
+ * page to allow for functions that return the number of bytes operated on in a
+ * given page.
+ */
+
+enum positive_aop_returns {
+	AOP_WRITEPAGE_ACTIVATE	= 0x80000,
+	AOP_TRUNCATED_PAGE	= 0x80001,
+};
+
 /*
  * oh the beauties of C type declarations.
  */
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 343d883..64a36ba 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -60,12 +60,6 @@ struct writeback_control {
 };
 
 /*
- * ->writepage() return values (make these much larger than a pagesize, in
- * case some fs is returning number-of-bytes-written from writepage)
- */
-#define WRITEPAGE_ACTIVATE	0x80000	/* IO was not started: activate page */
-
-/*
  * fs/fs-writeback.c
  */	
 void writeback_inodes(struct writeback_control *wbc);
diff --git a/mm/filemap.c b/mm/filemap.c
index 33a28bf..6e1d08a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -831,8 +831,13 @@ readpage:
 		/* Start the actual read. The read will unlock the page. */
 		error = mapping->a_ops->readpage(filp, page);
 
-		if (unlikely(error))
+		if (unlikely(error)) {
+			if (error == AOP_TRUNCATED_PAGE) {
+				page_cache_release(page);
+				goto find_page;
+			}
 			goto readpage_error;
+		}
 
 		if (!PageUptodate(page)) {
 			lock_page(page);
@@ -1152,26 +1157,24 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
 {
 	struct address_space *mapping = file->f_mapping;
 	struct page *page; 
-	int error;
+	int ret;
 
-	page = page_cache_alloc_cold(mapping);
-	if (!page)
-		return -ENOMEM;
+	do {
+		page = page_cache_alloc_cold(mapping);
+		if (!page)
+			return -ENOMEM;
+
+		ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
+		if (ret == 0)
+			ret = mapping->a_ops->readpage(file, page);
+		else if (ret == -EEXIST)
+			ret = 0; /* losing race to add is OK */
 
-	error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
-	if (!error) {
-		error = mapping->a_ops->readpage(file, page);
 		page_cache_release(page);
-		return error;
-	}
 
-	/*
-	 * We arrive here in the unlikely event that someone 
-	 * raced with us and added our page to the cache first
-	 * or we are out of memory for radix-tree nodes.
-	 */
-	page_cache_release(page);
-	return error == -EEXIST ? 0 : error;
+	} while (ret == AOP_TRUNCATED_PAGE);
+		
+	return ret;
 }
 
 #define MMAP_LOTSAMISS  (100)
@@ -1331,10 +1334,14 @@ page_not_uptodate:
 		goto success;
 	}
 
-	if (!mapping->a_ops->readpage(file, page)) {
+	error = mapping->a_ops->readpage(file, page);
+	if (!error) {
 		wait_on_page_locked(page);
 		if (PageUptodate(page))
 			goto success;
+	} else if (error == AOP_TRUNCATED_PAGE) {
+		page_cache_release(page);
+		goto retry_find;
 	}
 
 	/*
@@ -1358,10 +1365,14 @@ page_not_uptodate:
 		goto success;
 	}
 	ClearPageError(page);
-	if (!mapping->a_ops->readpage(file, page)) {
+	error = mapping->a_ops->readpage(file, page);
+	if (!error) {
 		wait_on_page_locked(page);
 		if (PageUptodate(page))
 			goto success;
+	} else if (error == AOP_TRUNCATED_PAGE) {
+		page_cache_release(page);
+		goto retry_find;
 	}
 
 	/*
@@ -1444,10 +1455,14 @@ page_not_uptodate:
 		goto success;
 	}
 
-	if (!mapping->a_ops->readpage(file, page)) {
+	error = mapping->a_ops->readpage(file, page);
+	if (!error) {
 		wait_on_page_locked(page);
 		if (PageUptodate(page))
 			goto success;
+	} else if (error == AOP_TRUNCATED_PAGE) {
+		page_cache_release(page);
+		goto retry_find;
 	}
 
 	/*
@@ -1470,10 +1485,14 @@ page_not_uptodate:
 	}
 
 	ClearPageError(page);
-	if (!mapping->a_ops->readpage(file, page)) {
+	error = mapping->a_ops->readpage(file, page);
+	if (!error) {
 		wait_on_page_locked(page);
 		if (PageUptodate(page))
 			goto success;
+	} else if (error == AOP_TRUNCATED_PAGE) {
+		page_cache_release(page);
+		goto retry_find;
 	}
 
 	/*
@@ -1934,12 +1953,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		status = a_ops->prepare_write(file, page, offset, offset+bytes);
 		if (unlikely(status)) {
 			loff_t isize = i_size_read(inode);
+
+			if (status != AOP_TRUNCATED_PAGE)
+				unlock_page(page);
+			page_cache_release(page);
+			if (status == AOP_TRUNCATED_PAGE)
+				continue;
 			/*
 			 * prepare_write() may have instantiated a few blocks
 			 * outside i_size.  Trim these off again.
 			 */
-			unlock_page(page);
-			page_cache_release(page);
 			if (pos + bytes > isize)
 				vmtruncate(inode, isize);
 			break;
@@ -1952,6 +1975,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 						cur_iov, iov_base, bytes);
 		flush_dcache_page(page);
 		status = a_ops->commit_write(file, page, offset, offset+bytes);
+		if (status == AOP_TRUNCATED_PAGE) {
+			page_cache_release(page);
+			continue;
+		}
 		if (likely(copied > 0)) {
 			if (!status)
 				status = copied;
diff --git a/mm/readahead.c b/mm/readahead.c
index 72e7adb..8d6eeaa 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -158,7 +158,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 {
 	unsigned page_idx;
 	struct pagevec lru_pvec;
-	int ret = 0;
+	int ret;
 
 	if (mapping->a_ops->readpages) {
 		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
@@ -171,14 +171,17 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 		list_del(&page->lru);
 		if (!add_to_page_cache(page, mapping,
 					page->index, GFP_KERNEL)) {
-			mapping->a_ops->readpage(filp, page);
-			if (!pagevec_add(&lru_pvec, page))
-				__pagevec_lru_add(&lru_pvec);
-		} else {
-			page_cache_release(page);
+			ret = mapping->a_ops->readpage(filp, page);
+			if (ret != AOP_TRUNCATED_PAGE) {
+				if (!pagevec_add(&lru_pvec, page))
+					__pagevec_lru_add(&lru_pvec);
+				continue;
+			} /* else fall through to release */
 		}
+		page_cache_release(page);
 	}
 	pagevec_lru_add(&lru_pvec);
+	ret = 0;
 out:
 	return ret;
 }
diff --git a/mm/shmem.c b/mm/shmem.c
index dc25565..d9fc277 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -855,7 +855,7 @@ unlock:
 	swap_free(swap);
 redirty:
 	set_page_dirty(page);
-	return WRITEPAGE_ACTIVATE;	/* Return with the page locked */
+	return AOP_WRITEPAGE_ACTIVATE;	/* Return with the page locked */
 }
 
 #ifdef CONFIG_NUMA
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b0cd81c..795a050 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -367,7 +367,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 		res = mapping->a_ops->writepage(page, &wbc);
 		if (res < 0)
 			handle_write_error(mapping, page, res);
-		if (res == WRITEPAGE_ACTIVATE) {
+		if (res == AOP_WRITEPAGE_ACTIVATE) {
 			ClearPageReclaim(page);
 			return PAGE_ACTIVATE;
 		}
-- 
cgit v1.1


From 52fd3d6fea441835fe3a35b7280e5e128bdeca9b Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Thu, 15 Dec 2005 14:31:23 -0800
Subject: [PATCH] OCFS2: The Second Oracle Cluster Filesystem

Very simple printk wrapper which adds the ability to enable various
sets of debug messages at run-time.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
---
 fs/ocfs2/cluster/masklog.c | 166 +++++++++++++++++++++++++++
 fs/ocfs2/cluster/masklog.h | 275 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 441 insertions(+)
 create mode 100644 fs/ocfs2/cluster/masklog.c
 create mode 100644 fs/ocfs2/cluster/masklog.h

diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
new file mode 100644
index 0000000..fd741ce
--- /dev/null
+++ b/fs/ocfs2/cluster/masklog.c
@@ -0,0 +1,166 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <asm/uaccess.h>
+
+#include "masklog.h"
+
+struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK);
+EXPORT_SYMBOL_GPL(mlog_and_bits);
+struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(MLOG_INITIAL_NOT_MASK);
+EXPORT_SYMBOL_GPL(mlog_not_bits);
+
+static ssize_t mlog_mask_show(u64 mask, char *buf)
+{
+	char *state;
+
+	if (__mlog_test_u64(mask, mlog_and_bits))
+		state = "allow";
+	else if (__mlog_test_u64(mask, mlog_not_bits))
+		state = "deny";
+	else
+		state = "off";
+
+	return snprintf(buf, PAGE_SIZE, "%s\n", state);
+}
+
+static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
+{
+	if (!strnicmp(buf, "allow", 5)) {
+		__mlog_set_u64(mask, mlog_and_bits);
+		__mlog_clear_u64(mask, mlog_not_bits);
+	} else if (!strnicmp(buf, "deny", 4)) {
+		__mlog_set_u64(mask, mlog_not_bits);
+		__mlog_clear_u64(mask, mlog_and_bits);
+	} else if (!strnicmp(buf, "off", 3)) {
+		__mlog_clear_u64(mask, mlog_not_bits);
+		__mlog_clear_u64(mask, mlog_and_bits);
+	} else
+		return -EINVAL;
+
+	return count;
+}
+
+struct mlog_attribute {
+	struct attribute attr;
+	u64 mask;
+};
+
+#define to_mlog_attr(_attr) container_of(_attr, struct mlog_attribute, attr)
+
+#define define_mask(_name) {			\
+	.attr = {				\
+		.name = #_name,			\
+		.mode = S_IRUGO | S_IWUSR,	\
+	},					\
+	.mask = ML_##_name,			\
+}
+
+static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
+	define_mask(ENTRY),
+	define_mask(EXIT),
+	define_mask(TCP),
+	define_mask(MSG),
+	define_mask(SOCKET),
+	define_mask(HEARTBEAT),
+	define_mask(HB_BIO),
+	define_mask(DLMFS),
+	define_mask(DLM),
+	define_mask(DLM_DOMAIN),
+	define_mask(DLM_THREAD),
+	define_mask(DLM_MASTER),
+	define_mask(DLM_RECOVERY),
+	define_mask(AIO),
+	define_mask(JOURNAL),
+	define_mask(DISK_ALLOC),
+	define_mask(SUPER),
+	define_mask(FILE_IO),
+	define_mask(EXTENT_MAP),
+	define_mask(DLM_GLUE),
+	define_mask(BH_IO),
+	define_mask(UPTODATE),
+	define_mask(NAMEI),
+	define_mask(INODE),
+	define_mask(VOTE),
+	define_mask(DCACHE),
+	define_mask(CONN),
+	define_mask(QUORUM),
+	define_mask(EXPORT),
+	define_mask(ERROR),
+	define_mask(NOTICE),
+	define_mask(KTHREAD),
+};
+
+static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
+
+static ssize_t mlog_show(struct kobject *obj, struct attribute *attr,
+			 char *buf)
+{
+	struct mlog_attribute *mlog_attr = to_mlog_attr(attr);
+
+	return mlog_mask_show(mlog_attr->mask, buf);
+}
+
+static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
+			  const char *buf, size_t count)
+{
+	struct mlog_attribute *mlog_attr = to_mlog_attr(attr);
+
+	return mlog_mask_store(mlog_attr->mask, buf, count);
+}
+
+static struct sysfs_ops mlog_attr_ops = {
+	.show  = mlog_show,
+	.store = mlog_store,
+};
+
+static struct kobj_type mlog_ktype = {
+	.default_attrs = mlog_attr_ptrs,
+	.sysfs_ops     = &mlog_attr_ops,
+};
+
+static struct kset mlog_kset = {
+	.kobj   = {.name = "logmask", .ktype = &mlog_ktype},
+};
+
+int mlog_sys_init(struct subsystem *o2cb_subsys)
+{
+	int i = 0;
+
+	while (mlog_attrs[i].attr.mode) {
+		mlog_attr_ptrs[i] = &mlog_attrs[i].attr;
+		i++;
+	}
+	mlog_attr_ptrs[i] = NULL;
+
+	mlog_kset.subsys = o2cb_subsys;
+	return kset_register(&mlog_kset);
+}
+
+void mlog_sys_shutdown(void)
+{
+	kset_unregister(&mlog_kset);
+}
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
new file mode 100644
index 0000000..f5ef5ea
--- /dev/null
+++ b/fs/ocfs2/cluster/masklog.h
@@ -0,0 +1,275 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef O2CLUSTER_MASKLOG_H
+#define O2CLUSTER_MASKLOG_H
+
+/*
+ * For now this is a trivial wrapper around printk() that gives the critical
+ * ability to enable sets of debugging output at run-time.  In the future this
+ * will almost certainly be redirected to relayfs so that it can pay a
+ * substantially lower heisenberg tax.
+ *
+ * Callers associate the message with a bitmask and a global bitmask is
+ * maintained with help from sysfs.  If any of the bits match the message is
+ * output.
+ *
+ * We must have efficient bit tests on i386 and it seems gcc still emits crazy
+ * code for the 64bit compare.  It emits very good code for the dual unsigned
+ * long tests, though, completely avoiding tests that can never pass if the
+ * caller gives a constant bitmask that fills one of the longs with all 0s.  So
+ * the desire is to have almost all of the calls decided on by comparing just
+ * one of the longs.  This leads to having infrequently given bits that are
+ * frequently matched in the high bits.
+ *
+ * _ERROR and _NOTICE are used for messages that always go to the console and
+ * have appropriate KERN_ prefixes.  We wrap these in our function instead of
+ * just calling printk() so that this can eventually make its way through
+ * relayfs along with the debugging messages.  Everything else gets KERN_INFO.
+ * The inline tests and macro dance give GCC the opportunity to quite cleverly
+ * only emit the appropriate printk() when the caller passes in a constant
+ * mask, as is almost always the case.
+ *
+ * All this bitmask nonsense is hidden from the sysfs interface so that Joel
+ * doesn't have an aneurism.  masklog.c registers a "logmask" kset with one
+ * attribute file per bit, named after the mask without its ML_ prefix:
+ * 	ENTRY
+ * 	EXIT
+ * 	TCP
+ * 	MSG
+ * 	SOCKET
+ * 	ERROR
+ * 	NOTICE
+ *
+ * Reading a file gives a straightforward indication of the state of its bit:
+ * "allow", "deny" or "off".  Writing one of those strings changes the state:
+ *
+ * 	write(fd, "allow", 5);
+ *
+ * on the ENTRY file would turn the entry bit on.  The match is a
+ * case-insensitive prefix compare; anything else fails with -EINVAL.
+ *
+ * Some trivial shell can flip all the bits at once:
+ *
+ * log_mask=...	# the logmask kset's sysfs directory
+ * ls $log_mask | (
+ * 	while read bit; do
+ * 		# $1 is "allow", "deny" or "off", say
+ * 		echo "$1" > $log_mask/$bit
+ * 	done
+ * )
+ */
+
+/* for task_struct */
+#include <linux/sched.h>
+
+/* bits that are frequently given and infrequently matched in the low word */
+/* NOTE: If you add a flag, also add it to mlog_attrs[] in masklog.c! */
+#define ML_ENTRY	0x0000000000000001ULL /* func call entry */
+#define ML_EXIT		0x0000000000000002ULL /* func call exit */
+#define ML_TCP		0x0000000000000004ULL /* net cluster/tcp.c */
+#define ML_MSG		0x0000000000000008ULL /* net network messages */
+#define ML_SOCKET	0x0000000000000010ULL /* net socket lifetime */
+#define ML_HEARTBEAT	0x0000000000000020ULL /* hb all heartbeat tracking */
+#define ML_HB_BIO	0x0000000000000040ULL /* hb io tracing */
+#define ML_DLMFS	0x0000000000000080ULL /* dlm user dlmfs */
+#define ML_DLM		0x0000000000000100ULL /* dlm general debugging */
+#define ML_DLM_DOMAIN	0x0000000000000200ULL /* dlm domain debugging */
+#define ML_DLM_THREAD	0x0000000000000400ULL /* dlm domain thread */
+#define ML_DLM_MASTER	0x0000000000000800ULL /* dlm master functions */
+#define ML_DLM_RECOVERY	0x0000000000001000ULL /* dlm recovery functions */
+#define ML_AIO		0x0000000000002000ULL /* ocfs2 aio read and write */
+#define ML_JOURNAL	0x0000000000004000ULL /* ocfs2 journalling functions */
+#define ML_DISK_ALLOC	0x0000000000008000ULL /* ocfs2 disk allocation */
+#define ML_SUPER	0x0000000000010000ULL /* ocfs2 mount / umount */
+#define ML_FILE_IO	0x0000000000020000ULL /* ocfs2 file I/O */
+#define ML_EXTENT_MAP	0x0000000000040000ULL /* ocfs2 extent map caching */
+#define ML_DLM_GLUE	0x0000000000080000ULL /* ocfs2 dlm glue layer */
+#define ML_BH_IO	0x0000000000100000ULL /* ocfs2 buffer I/O */
+#define ML_UPTODATE	0x0000000000200000ULL /* ocfs2 caching sequence #'s */
+#define ML_NAMEI	0x0000000000400000ULL /* ocfs2 directory / namespace */
+#define ML_INODE	0x0000000000800000ULL /* ocfs2 inode manipulation */
+#define ML_VOTE		0x0000000001000000ULL /* ocfs2 node messaging  */
+#define ML_DCACHE	0x0000000002000000ULL /* ocfs2 dcache operations */
+#define ML_CONN		0x0000000004000000ULL /* net connection management */
+#define ML_QUORUM	0x0000000008000000ULL /* net connection quorum */
+#define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
+/* bits that are infrequently given and frequently matched in the high word */
+#define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
+#define ML_NOTICE	0x0000000200000000ULL /* sent to KERN_NOTICE */
+#define ML_KTHREAD	0x0000000400000000ULL /* kernel thread activity */
+
+#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
+#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
+#ifndef MLOG_MASK_PREFIX
+#define MLOG_MASK_PREFIX 0
+#endif
+
+#define MLOG_MAX_BITS 64
+
+struct mlog_bits {
+	unsigned long words[MLOG_MAX_BITS / BITS_PER_LONG];
+};
+
+extern struct mlog_bits mlog_and_bits, mlog_not_bits;
+
+#if BITS_PER_LONG == 32
+
+#define __mlog_test_u64(mask, bits)			\
+	( (u32)(mask & 0xffffffff) & bits.words[0] || 	\
+	  ((u64)(mask) >> 32) & bits.words[1] )
+#define __mlog_set_u64(mask, bits) do {			\
+	bits.words[0] |= (u32)(mask & 0xffffffff);	\
+       	bits.words[1] |= (u64)(mask) >> 32;		\
+} while (0)
+#define __mlog_clear_u64(mask, bits) do {		\
+	bits.words[0] &= ~((u32)(mask & 0xffffffff));	\
+       	bits.words[1] &= ~((u64)(mask) >> 32);		\
+} while (0)
+#define MLOG_BITS_RHS(mask) {				\
+	{						\
+		[0] = (u32)(mask & 0xffffffff),		\
+		[1] = (u64)(mask) >> 32,		\
+	}						\
+}
+
+#else /* 32bit long above, 64bit long below */
+
+#define __mlog_test_u64(mask, bits)	((mask) & bits.words[0])
+#define __mlog_set_u64(mask, bits) do {		\
+	bits.words[0] |= (mask);		\
+} while (0)
+#define __mlog_clear_u64(mask, bits) do {	\
+	bits.words[0] &= ~(mask);		\
+} while (0)
+#define MLOG_BITS_RHS(mask) { { (mask) } }
+
+#endif
+
+/*
+ * smp_processor_id() "helpfully" screams when called inside preemptible
+ * regions in current kernels.  sles doesn't have the variants that don't
+ * scream.  just do this instead of trying to guess which we're building
+ * against.. *sigh*.
+ */
+#define __mlog_cpu_guess ({		\
+	unsigned long _cpu = get_cpu();	\
+	put_cpu();			\
+	_cpu;				\
+})
+
+/* In the following two macros, the whitespace after the ',' just
+ * before ##args is intentional. Otherwise, gcc 2.95 will eat the
+ * previous token if args expands to nothing.
+ */
+#define __mlog_printk(level, fmt, args...)				\
+	printk(level "(%u,%lu):%s:%d " fmt, current->pid,		\
+	       __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ ,	\
+	       ##args)
+
+#define mlog(mask, fmt, args...) do {					\
+	u64 __m = MLOG_MASK_PREFIX | (mask);				\
+	if (__mlog_test_u64(__m, mlog_and_bits) &&			\
+	    !__mlog_test_u64(__m, mlog_not_bits)) {			\
+		if (__m & ML_ERROR)					\
+			__mlog_printk(KERN_ERR, "ERROR: "fmt , ##args);	\
+		else if (__m & ML_NOTICE)				\
+			__mlog_printk(KERN_NOTICE, fmt , ##args);	\
+		else __mlog_printk(KERN_INFO, fmt , ##args);		\
+	}								\
+} while (0)
+
+#define mlog_errno(st) do {						\
+	int _st = (st);							\
+	if (_st != -ERESTARTSYS && _st != -EINTR &&			\
+	    _st != AOP_TRUNCATED_PAGE)					\
+		mlog(ML_ERROR, "status = %lld\n", (long long)_st);	\
+} while (0)
+
+#define mlog_entry(fmt, args...) do {					\
+	mlog(ML_ENTRY, "ENTRY:" fmt , ##args);				\
+} while (0)
+
+#define mlog_entry_void() do {						\
+	mlog(ML_ENTRY, "ENTRY:\n");					\
+} while (0)
+
+/* We disable this for old compilers since they don't have support for
+ * __builtin_types_compatible_p.
+ */
+#if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) && \
+    !defined(__CHECKER__)
+#define mlog_exit(st) do {						     \
+	if (__builtin_types_compatible_p(typeof(st), unsigned long))	     \
+		mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st));	     \
+	else if (__builtin_types_compatible_p(typeof(st), signed long))      \
+		mlog(ML_EXIT, "EXIT: %ld\n", (signed long) (st));	     \
+	else if (__builtin_types_compatible_p(typeof(st), unsigned int)	     \
+		 || __builtin_types_compatible_p(typeof(st), unsigned short) \
+		 || __builtin_types_compatible_p(typeof(st), unsigned char)) \
+		mlog(ML_EXIT, "EXIT: %u\n", (unsigned int) (st));	     \
+	else if (__builtin_types_compatible_p(typeof(st), signed int)	     \
+		 || __builtin_types_compatible_p(typeof(st), signed short)   \
+		 || __builtin_types_compatible_p(typeof(st), signed char))   \
+		mlog(ML_EXIT, "EXIT: %d\n", (signed int) (st));		     \
+	else if (__builtin_types_compatible_p(typeof(st), long long))	     \
+		mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st));	     \
+	else								     \
+		mlog(ML_EXIT, "EXIT: %llu\n", (unsigned long long) (st));    \
+} while (0)
+#else
+#define mlog_exit(st) do {						     \
+	mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st));		     \
+} while (0)
+#endif
+
+#define mlog_exit_ptr(ptr) do {						\
+	mlog(ML_EXIT, "EXIT: %p\n", ptr);				\
+} while (0)
+
+#define mlog_exit_void() do {						\
+	mlog(ML_EXIT, "EXIT\n");					\
+} while (0)
+
+#define mlog_bug_on_msg(cond, fmt, args...) do {			\
+	if (cond) {							\
+		mlog(ML_ERROR, "bug expression: " #cond "\n");		\
+		mlog(ML_ERROR, fmt, ##args);				\
+		BUG();							\
+	}								\
+} while (0)
+
+#if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64)
+#define MLFi64 "lld"
+#define MLFu64 "llu"
+#define MLFx64 "llx"
+#else
+#define MLFi64 "ld"
+#define MLFu64 "lu"
+#define MLFx64 "lx"
+#endif
+
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+int mlog_sys_init(struct subsystem *o2cb_subsys);
+void mlog_sys_shutdown(void);
+
+#endif /* O2CLUSTER_MASKLOG_H */
-- 
cgit v1.1


From 0c83ed8eeb28a045cdbd0b216679938aa9e665fe Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Thu, 15 Dec 2005 14:31:23 -0800
Subject: [PATCH] OCFS2: The Second Oracle Cluster Filesystem

A simple node information service, filled and updated from
userspace. The rest of the stack queries this service for simple node
information.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
---
 fs/ocfs2/cluster/Makefile            |   4 +
 fs/ocfs2/cluster/endian.h            |  30 ++
 fs/ocfs2/cluster/nodemanager.c       | 791 +++++++++++++++++++++++++++++++++++
 fs/ocfs2/cluster/nodemanager.h       |  64 +++
 fs/ocfs2/cluster/ocfs2_nodemanager.h |  39 ++
 fs/ocfs2/cluster/ver.c               |  42 ++
 fs/ocfs2/cluster/ver.h               |  31 ++
 7 files changed, 1001 insertions(+)
 create mode 100644 fs/ocfs2/cluster/Makefile
 create mode 100644 fs/ocfs2/cluster/endian.h
 create mode 100644 fs/ocfs2/cluster/nodemanager.c
 create mode 100644 fs/ocfs2/cluster/nodemanager.h
 create mode 100644 fs/ocfs2/cluster/ocfs2_nodemanager.h
 create mode 100644 fs/ocfs2/cluster/ver.c
 create mode 100644 fs/ocfs2/cluster/ver.h

diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
new file mode 100644
index 0000000..cdd162f
--- /dev/null
+++ b/fs/ocfs2/cluster/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
+
+ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
+	quorum.o tcp.o ver.o
diff --git a/fs/ocfs2/cluster/endian.h b/fs/ocfs2/cluster/endian.h
new file mode 100644
index 0000000..2df9082
--- /dev/null
+++ b/fs/ocfs2/cluster/endian.h
@@ -0,0 +1,30 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_CLUSTER_ENDIAN_H
+#define OCFS2_CLUSTER_ENDIAN_H
+
+static inline void be32_add_cpu(__be32 *var, u32 val)
+{
+	*var = cpu_to_be32(be32_to_cpu(*var) + val);
+}
+
+#endif /* OCFS2_CLUSTER_ENDIAN_H */
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
new file mode 100644
index 0000000..5fd60c1
--- /dev/null
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -0,0 +1,791 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/configfs.h>
+
+#include "endian.h"
+#include "tcp.h"
+#include "nodemanager.h"
+#include "heartbeat.h"
+#include "masklog.h"
+#include "sys.h"
+#include "ver.h"
+
+/* for now we operate under the assertion that there can be only one
+ * cluster active at a time.  Changing this will require trickling
+ * cluster references throughout where nodes are looked up */
+static struct o2nm_cluster *o2nm_single_cluster = NULL;
+
+#define OCFS2_MAX_HB_CTL_PATH 256
+static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
+
+static ctl_table ocfs2_nm_table[] = {
+	{
+		.ctl_name	= 1,
+		.procname	= "hb_ctl_path",
+		.data		= ocfs2_hb_ctl_path,
+		.maxlen		= OCFS2_MAX_HB_CTL_PATH,
+		.mode		= 0644,
+		.proc_handler	= &proc_dostring,
+		.strategy	= &sysctl_string,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table ocfs2_mod_table[] = {
+	{
+		.ctl_name	= KERN_OCFS2_NM,
+		.procname	= "nm",
+		.data		= NULL,
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= ocfs2_nm_table
+	},
+	{ .ctl_name = 0}
+};
+
+static ctl_table ocfs2_kern_table[] = {
+	{
+		.ctl_name	= KERN_OCFS2,
+		.procname	= "ocfs2",
+		.data		= NULL,
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= ocfs2_mod_table
+	},
+	{ .ctl_name = 0}
+};
+
+static ctl_table ocfs2_root_table[] = {
+	{
+		.ctl_name	= CTL_FS,
+		.procname	= "fs",
+		.data		= NULL,
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= ocfs2_kern_table
+	},
+	{ .ctl_name = 0 }
+};
+
+static struct ctl_table_header *ocfs2_table_header = NULL;
+
+const char *o2nm_get_hb_ctl_path(void)
+{
+	return ocfs2_hb_ctl_path;
+}
+EXPORT_SYMBOL_GPL(o2nm_get_hb_ctl_path);
+
+struct o2nm_cluster {
+	struct config_group	cl_group;
+	unsigned		cl_has_local:1;
+	u8			cl_local_node;
+	rwlock_t		cl_nodes_lock;
+	struct o2nm_node  	*cl_nodes[O2NM_MAX_NODES];
+	struct rb_root		cl_node_ip_tree;
+	/* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */
+	unsigned long	cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+};
+
+struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
+{
+	struct o2nm_node *node = NULL;
+
+	if (node_num >= O2NM_MAX_NODES || o2nm_single_cluster == NULL)
+		goto out;
+
+	read_lock(&o2nm_single_cluster->cl_nodes_lock);
+	node = o2nm_single_cluster->cl_nodes[node_num];
+	if (node)
+		config_item_get(&node->nd_item);
+	read_unlock(&o2nm_single_cluster->cl_nodes_lock);
+out:
+	return node;
+}
+EXPORT_SYMBOL_GPL(o2nm_get_node_by_num);
+
+int o2nm_configured_node_map(unsigned long *map, unsigned bytes)
+{
+	struct o2nm_cluster *cluster = o2nm_single_cluster;
+
+	BUG_ON(bytes < (sizeof(cluster->cl_nodes_bitmap)));
+
+	if (cluster == NULL)
+		return -EINVAL;
+
+	read_lock(&cluster->cl_nodes_lock);
+	memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap));
+	read_unlock(&cluster->cl_nodes_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(o2nm_configured_node_map);
+
+static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster,
+						  __be32 ip_needle,
+						  struct rb_node ***ret_p,
+						  struct rb_node **ret_parent)
+{
+	struct rb_node **p = &cluster->cl_node_ip_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct o2nm_node *node, *ret = NULL;
+
+	while (*p) {
+		parent = *p;
+		node = rb_entry(parent, struct o2nm_node, nd_ip_node);
+
+		if (memcmp(&ip_needle, &node->nd_ipv4_address,
+		           sizeof(ip_needle)) < 0)
+			p = &(*p)->rb_left;
+		else if (memcmp(&ip_needle, &node->nd_ipv4_address,
+			        sizeof(ip_needle)) > 0)
+			p = &(*p)->rb_right;
+		else {
+			ret = node;
+			break;
+		}
+	}
+
+	if (ret_p != NULL)
+		*ret_p = p;
+	if (ret_parent != NULL)
+		*ret_parent = parent;
+
+	return ret;
+}
+
+struct o2nm_node *o2nm_get_node_by_ip(__be32 addr)
+{
+	struct o2nm_node *node = NULL;
+	struct o2nm_cluster *cluster = o2nm_single_cluster;
+
+	if (cluster == NULL)
+		goto out;
+
+	read_lock(&cluster->cl_nodes_lock);
+	node = o2nm_node_ip_tree_lookup(cluster, addr, NULL, NULL);
+	if (node)
+		config_item_get(&node->nd_item);
+	read_unlock(&cluster->cl_nodes_lock);
+
+out:
+	return node;
+}
+EXPORT_SYMBOL_GPL(o2nm_get_node_by_ip);
+
+void o2nm_node_put(struct o2nm_node *node)
+{
+	config_item_put(&node->nd_item);
+}
+EXPORT_SYMBOL_GPL(o2nm_node_put);
+
+void o2nm_node_get(struct o2nm_node *node)
+{
+	config_item_get(&node->nd_item);
+}
+EXPORT_SYMBOL_GPL(o2nm_node_get);
+
+u8 o2nm_this_node(void)
+{
+	u8 node_num = O2NM_MAX_NODES;
+
+	if (o2nm_single_cluster && o2nm_single_cluster->cl_has_local)
+		node_num = o2nm_single_cluster->cl_local_node;
+
+	return node_num;
+}
+EXPORT_SYMBOL_GPL(o2nm_this_node);
+
+/* node configfs bits */
+
+static struct o2nm_cluster *to_o2nm_cluster(struct config_item *item)
+{
+	return item ?
+		container_of(to_config_group(item), struct o2nm_cluster,
+			     cl_group)
+		: NULL;
+}
+
+static struct o2nm_node *to_o2nm_node(struct config_item *item)
+{
+	return item ? container_of(item, struct o2nm_node, nd_item) : NULL;
+}
+
+static void o2nm_node_release(struct config_item *item)
+{
+	struct o2nm_node *node = to_o2nm_node(item);
+	kfree(node);
+}
+
+static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page)
+{
+	return sprintf(page, "%d\n", node->nd_num);
+}
+
+static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node)
+{
+	/* through the first node_set .parent
+	 * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */
+	return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent);
+}
+
+enum {
+	O2NM_NODE_ATTR_NUM = 0,
+	O2NM_NODE_ATTR_PORT,
+	O2NM_NODE_ATTR_ADDRESS,
+	O2NM_NODE_ATTR_LOCAL,
+};
+
+static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
+				   size_t count)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
+	unsigned long tmp;
+	char *p = (char *)page;
+
+	tmp = simple_strtoul(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp >= O2NM_MAX_NODES)
+		return -ERANGE;
+
+	/* once we're in the cl_nodes tree networking can look us up by
+	 * node number and try to use our address and port attributes
+	 * to connect to this node.. make sure that they've been set
+	 * before writing the node attribute? */
+	if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
+	    !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
+		return -EINVAL; /* XXX */
+
+	write_lock(&cluster->cl_nodes_lock);
+	if (cluster->cl_nodes[tmp])
+		p = NULL;
+	else  {
+		cluster->cl_nodes[tmp] = node;
+		node->nd_num = tmp;
+		set_bit(tmp, cluster->cl_nodes_bitmap);
+	}
+	write_unlock(&cluster->cl_nodes_lock);
+	if (p == NULL)
+		return -EEXIST;
+
+	return count;
+}
+static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page)
+{
+	return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port));
+}
+
+static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
+					 const char *page, size_t count)
+{
+	unsigned long tmp;
+	char *p = (char *)page;
+
+	tmp = simple_strtoul(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp == 0)
+		return -EINVAL;
+	if (tmp >= (u16)-1)
+		return -ERANGE;
+
+	node->nd_ipv4_port = htons(tmp);
+
+	return count;
+}
+
+static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page)
+{
+	return sprintf(page, "%u.%u.%u.%u\n", NIPQUAD(node->nd_ipv4_address));
+}
+
+static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
+					    const char *page, size_t count)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
+	int ret, i;
+	struct rb_node **p, *parent;
+	unsigned int octets[4];
+	__be32 ipv4_addr = 0;
+
+	ret = sscanf(page, "%3u.%3u.%3u.%3u", &octets[3], &octets[2],
+		     &octets[1], &octets[0]);
+	if (ret != 4)
+		return -EINVAL;
+
+	for (i = 0; i < ARRAY_SIZE(octets); i++) {
+		if (octets[i] > 255)
+			return -ERANGE;
+		be32_add_cpu(&ipv4_addr, octets[i] << (i * 8));
+	}
+	/* drop_item reads a zero address as "not in the ip tree" */
+	if (!ipv4_addr)
+		return -EINVAL;
+
+	ret = 0;
+	write_lock(&cluster->cl_nodes_lock);
+	if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent))
+		ret = -EEXIST;
+	else {
+		/* store the address before the node shows up in the tree */
+		node->nd_ipv4_address = ipv4_addr;
+		rb_link_node(&node->nd_ip_node, parent, p);
+		rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree);
+	}
+	write_unlock(&cluster->cl_nodes_lock);
+
+	return ret ? ret : (ssize_t)count;
+}
+
+static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page)
+{	/* nd_local is only ever stored as 0 or 1 (see the local write path) */
+	return sprintf(page, "%d\n", node->nd_local);
+}
+
+static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
+				     size_t count)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
+	unsigned long want_local;
+	char *end;
+	ssize_t err;
+
+	want_local = simple_strtoul(page, &end, 0);
+	if (!end || (*end != '\0' && *end != '\n'))
+		return -EINVAL;
+
+	/* collapse any non-zero input to 1: the node wants to be local */
+	want_local = !!want_local;
+
+	/* marking a node local starts networking rx, so address, number
+	 * and port all have to be configured beforehand */
+	if (!(test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) &&
+	      test_bit(O2NM_NODE_ATTR_NUM, &node->nd_set_attributes) &&
+	      test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)))
+		return -EINVAL; /* XXX */
+
+	if (want_local) {
+		/* refuse a second local node while another one is set */
+		if (want_local == cluster->cl_has_local &&
+		    cluster->cl_local_node != node->nd_num)
+			return -EBUSY;
+
+		/* first local node: bring up the rx thread */
+		if (!cluster->cl_has_local) {
+			err = o2net_start_listening(node);
+			if (err)
+				return err;
+		}
+	} else if (cluster->cl_has_local &&
+		   cluster->cl_local_node == node->nd_num) {
+		o2net_stop_listening(node);
+		cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
+	}
+
+	node->nd_local = want_local;
+	if (node->nd_local) {
+		cluster->cl_has_local = want_local;
+		cluster->cl_local_node = node->nd_num;
+	}
+
+	return count;
+}
+
+struct o2nm_node_attribute {
+	struct configfs_attribute attr;	/* embedded; container_of() anchor */
+	ssize_t (*show)(struct o2nm_node *, char *);	/* read handler */
+	ssize_t (*store)(struct o2nm_node *, const char *, size_t); /* write */
+};
+
+static struct o2nm_node_attribute o2nm_node_attr_num = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "num",	/* slot in cl_nodes[] */
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2nm_node_num_read,
+	.store	= o2nm_node_num_write,
+};
+
+static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "ipv4_port",	/* non-zero port */
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2nm_node_ipv4_port_read,
+	.store	= o2nm_node_ipv4_port_write,
+};
+
+static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "ipv4_address",	/* unique per cluster */
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2nm_node_ipv4_address_read,
+	.store	= o2nm_node_ipv4_address_write,
+};
+
+static struct o2nm_node_attribute o2nm_node_attr_local = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "local",	/* non-zero marks the local node */
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= o2nm_node_local_read,
+	.store	= o2nm_node_local_write,
+};
+
+static struct configfs_attribute *o2nm_node_attrs[] = {
+	[O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr,
+	[O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr,
+	[O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr,
+	[O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr,
+	NULL,	/* terminator; slot index == bit in nd_set_attributes */
+};
+
+static int o2nm_attr_index(struct configfs_attribute *attr)
+{
+	struct configfs_attribute **slot = o2nm_node_attrs;
+	for (; slot < o2nm_node_attrs + ARRAY_SIZE(o2nm_node_attrs); slot++) {
+		if (*slot == attr)
+			return slot - o2nm_node_attrs;
+	}
+	BUG();	/* callers only pass attributes that live in the table */
+	return 0;
+}
+
+static ssize_t o2nm_node_show(struct config_item *item,
+			      struct configfs_attribute *attr,
+			      char *page)
+{
+	struct o2nm_node_attribute *node_attr;
+
+	/* attr is embedded in an o2nm_node_attribute; recover the wrapper */
+	node_attr = container_of(attr, struct o2nm_node_attribute, attr);
+	if (!node_attr->show)
+		return 0;
+
+	return node_attr->show(to_o2nm_node(item), page);
+}
+
+/* configfs store hook: run the attribute's handler, then mark it as set. */
+static ssize_t o2nm_node_store(struct config_item *item,
+			       struct configfs_attribute *attr,
+			       const char *page, size_t count)
+{
+	struct o2nm_node *node = to_o2nm_node(item);
+	struct o2nm_node_attribute *o2nm_node_attr =
+		container_of(attr, struct o2nm_node_attribute, attr);
+	int attr_index = o2nm_attr_index(attr);
+	ssize_t ret;
+
+	if (o2nm_node_attr->store == NULL)
+		return -EINVAL;
+
+	/* each attribute may only be written once */
+	if (test_bit(attr_index, &node->nd_set_attributes))
+		return -EBUSY;
+
+	/* ret is signed, count unsigned: check for -errno before comparing */
+	ret = o2nm_node_attr->store(node, page, count);
+	if (ret < 0 || (size_t)ret < count)
+		return ret;
+
+	set_bit(attr_index, &node->nd_set_attributes);
+	return ret;
+}
+
+static struct configfs_item_operations o2nm_node_item_ops = {
+	.release		= o2nm_node_release,
+	.show_attribute		= o2nm_node_show,	/* calls ->show */
+	.store_attribute	= o2nm_node_store,	/* calls ->store */
+};
+
+static struct config_item_type o2nm_node_type = {	/* used by make_item */
+	.ct_item_ops	= &o2nm_node_item_ops,
+	.ct_attrs	= o2nm_node_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+/* node set */
+
+struct o2nm_node_group {
+	struct config_group ns_group;	/* a cluster's "node" default group */
+	/* some stuff? */
+};
+
+#if 0	/* compiled out; nothing in the visible code calls it */
+static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group)
+{
+	return group ?
+		container_of(group, struct o2nm_node_group, ns_group)
+		: NULL;
+}
+#endif
+
+/*
+ * make_item hook of the "node" group: allocate an o2nm_node named after
+ * the new item.  Returns the embedded config_item, or NULL on failure.
+ */
+static struct config_item *o2nm_node_group_make_item(struct config_group *group,
+						     const char *name)
+{
+	struct o2nm_node *new_node;
+
+	/* nd_name holds O2NM_MAX_NAME_LEN characters plus the terminator */
+	if (strlen(name) > O2NM_MAX_NAME_LEN)
+		return NULL; /* ENAMETOOLONG */
+
+	/* zeroed allocation: nd_set_attributes starts with no bit set */
+	new_node = kcalloc(1, sizeof(struct o2nm_node), GFP_KERNEL);
+	if (new_node == NULL)
+		return NULL; /* ENOMEM */
+
+	strcpy(new_node->nd_name, name); /* use item.ci_namebuf instead? */
+	config_item_init_type_name(&new_node->nd_item, name,
+				   &o2nm_node_type);
+	spin_lock_init(&new_node->nd_lock);
+
+	return &new_node->nd_item;
+}
+
+static void o2nm_node_group_drop_item(struct config_group *group,
+				      struct config_item *item)
+{
+	struct o2nm_node *node = to_o2nm_node(item);
+	struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent);
+
+	o2net_disconnect_node(node);
+
+	/* nd_num is 0 until set; only a node marked local may stop rx */
+	if (node->nd_local && cluster->cl_has_local &&
+	    (cluster->cl_local_node == node->nd_num)) {
+		cluster->cl_has_local = 0;
+		cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
+		o2net_stop_listening(node);
+	}
+
+	/* XXX call into net to stop this node from trading messages */
+
+	write_lock(&cluster->cl_nodes_lock);
+	/* XXX sloppy: a zero address is taken to mean "not in the tree" */
+	if (node->nd_ipv4_address)
+		rb_erase(&node->nd_ip_node, &cluster->cl_node_ip_tree);
+
+	/* nd_num might be 0 if the node number hasn't been set.. */
+	if (cluster->cl_nodes[node->nd_num] == node) {
+		cluster->cl_nodes[node->nd_num] = NULL;
+		clear_bit(node->nd_num, cluster->cl_nodes_bitmap);
+	}
+	write_unlock(&cluster->cl_nodes_lock);
+
+	config_item_put(item);
+}
+
+static struct configfs_group_operations o2nm_node_group_group_ops = {
+	.make_item	= o2nm_node_group_make_item,
+	.drop_item	= o2nm_node_group_drop_item,
+};
+
+static struct config_item_type o2nm_node_group_type = { /* "node" group */
+	.ct_group_ops	= &o2nm_node_group_group_ops,
+	.ct_owner	= THIS_MODULE,
+};
+
+/* cluster */
+
+static void o2nm_cluster_release(struct config_item *item)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster(item);
+
+	kfree(cluster->cl_group.default_groups); /* array from make_group */
+	kfree(cluster);
+}
+
+static struct configfs_item_operations o2nm_cluster_item_ops = {
+	.release	= o2nm_cluster_release,
+};
+
+static struct config_item_type o2nm_cluster_type = { /* set in make_group */
+	.ct_item_ops	= &o2nm_cluster_item_ops,
+	.ct_owner	= THIS_MODULE,
+};
+
+/* cluster set */
+
+struct o2nm_cluster_group {
+	struct configfs_subsystem cs_subsys;	/* registered in init_o2nm() */
+	/* some stuff? */
+};
+
+#if 0	/* compiled out; nothing in the visible code calls it */
+static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *group)
+{
+	return group ?
+		container_of(to_configfs_subsystem(group), struct o2nm_cluster_group, cs_subsys)
+	       : NULL;
+}
+#endif
+
+static struct config_group *o2nm_cluster_group_make_group(struct config_group *group,
+							  const char *name)
+{
+	struct o2nm_cluster *cluster = NULL;
+	struct o2nm_node_group *ns = NULL;
+	struct config_group *o2hb_group = NULL, *ret = NULL;
+	void *defs = NULL;
+
+	/* this runs under the parent dir's i_sem; there can be only
+	 * one caller in here at a time */
+	if (o2nm_single_cluster)
+		goto out; /* ENOSPC */
+	/* allocate everything first, then check all four results at once */
+	cluster = kcalloc(1, sizeof(struct o2nm_cluster), GFP_KERNEL);
+	ns = kcalloc(1, sizeof(struct o2nm_node_group), GFP_KERNEL);
+	defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
+	o2hb_group = o2hb_alloc_hb_set();
+	if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL)
+		goto out;
+
+	config_group_init_type_name(&cluster->cl_group, name,
+				    &o2nm_cluster_type);
+	config_group_init_type_name(&ns->ns_group, "node",
+				    &o2nm_node_group_type);
+	/* NULL-terminated default group list: node group, then hb set */
+	cluster->cl_group.default_groups = defs;
+	cluster->cl_group.default_groups[0] = &ns->ns_group;
+	cluster->cl_group.default_groups[1] = o2hb_group;
+	cluster->cl_group.default_groups[2] = NULL;
+	rwlock_init(&cluster->cl_nodes_lock);
+	cluster->cl_node_ip_tree = RB_ROOT;
+	/* publish: the check at the top refuses any further cluster */
+	ret = &cluster->cl_group;
+	o2nm_single_cluster = cluster;
+
+out:
+	if (ret == NULL) {	/* nothing was published; undo allocations */
+		kfree(cluster);
+		kfree(ns);
+		o2hb_free_hb_set(o2hb_group);
+		kfree(defs);
+	}
+
+	return ret;
+}
+
+static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item)
+{
+	struct o2nm_cluster *cluster = to_o2nm_cluster(item);
+	struct config_group **slot = cluster->cl_group.default_groups;
+	struct config_item *child;
+
+	BUG_ON(o2nm_single_cluster != cluster);
+	o2nm_single_cluster = NULL;
+
+	/* clear each default group slot before dropping its reference */
+	while (*slot) {
+		child = &(*slot)->cg_item;
+		*slot++ = NULL;
+		config_item_put(child);
+	}
+	config_item_put(item);
+}
+
+static struct configfs_group_operations o2nm_cluster_group_group_ops = {
+	.make_group	= o2nm_cluster_group_make_group,
+	.drop_item	= o2nm_cluster_group_drop_item,
+};
+
+static struct config_item_type o2nm_cluster_group_type = {
+	.ct_group_ops	= &o2nm_cluster_group_group_ops,
+	.ct_owner	= THIS_MODULE,
+};
+
+static struct o2nm_cluster_group o2nm_cluster_group = {
+	.cs_subsys = {
+		.su_group = {
+			.cg_item = {
+				.ci_namebuf = "cluster", /* subsystem name */
+				.ci_type = &o2nm_cluster_group_type,
+			},
+		},
+	},
+};
+
+static void __exit exit_o2nm(void)
+{
+	if (ocfs2_table_header)
+		unregister_sysctl_table(ocfs2_table_header);
+
+	/* XXX sync with hb callbacks and shut down hb? */
+	o2net_unregister_hb_callbacks();
+	configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
+	o2cb_sys_shutdown();
+	/* o2net goes down last, after its hb callbacks were unregistered */
+	o2net_exit();
+}
+
+static int __init init_o2nm(void)
+{
+	int ret = -1;
+
+	cluster_print_version();
+	o2hb_init();
+	o2net_init();
+
+	ocfs2_table_header = register_sysctl_table(ocfs2_root_table, 0);
+	if (!ocfs2_table_header) {
+		printk(KERN_ERR "nodemanager: unable to register sysctl\n");
+		ret = -ENOMEM; /* or something. */
+		goto out_o2net;
+	}
+
+	ret = o2net_register_hb_callbacks();
+	if (ret)
+		goto out_sysctl;
+
+	config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
+	init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem);
+	ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys);
+	if (ret) {
+		printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
+		goto out_callbacks;
+	}
+
+	ret = o2cb_sys_init();
+	if (!ret)
+		goto out;
+	configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
+out_callbacks:
+	o2net_unregister_hb_callbacks();
+out_sysctl:
+	unregister_sysctl_table(ocfs2_table_header);
+out_o2net:	/* every failure path undoes o2net_init(), like exit_o2nm() */
+	o2net_exit();
+out:
+	return ret;
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+
+module_init(init_o2nm)
+module_exit(exit_o2nm)
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
new file mode 100644
index 0000000..fce8033
--- /dev/null
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -0,0 +1,64 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * nodemanager.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_NODEMANAGER_H
+#define O2CLUSTER_NODEMANAGER_H
+
+#include "ocfs2_nodemanager.h"
+
+/* This totally doesn't belong here. */
+#include <linux/configfs.h>
+#include <linux/rbtree.h>
+
+#define KERN_OCFS2		988
+#define KERN_OCFS2_NM		1
+
+const char *o2nm_get_hb_ctl_path(void);
+
+struct o2nm_node {
+	spinlock_t		nd_lock;
+	struct config_item	nd_item;	/* configfs item of this node */
+	char			nd_name[O2NM_MAX_NAME_LEN+1]; /* replace? */
+	__u8			nd_num;		/* index into cl_nodes[] */
+	/* only one address per node, as attributes, for now. */
+	__be32			nd_ipv4_address;
+	__be16			nd_ipv4_port;
+	struct rb_node		nd_ip_node;	/* link in cl_node_ip_tree */
+	/* there can be only one local node for now */
+	int			nd_local;
+	/* one bit per O2NM_NODE_ATTR_*, set by the configfs store hook */
+	unsigned long		nd_set_attributes;
+};
+
+u8 o2nm_this_node(void);
+
+int o2nm_configured_node_map(unsigned long *map, unsigned bytes);
+struct o2nm_node *o2nm_get_node_by_num(u8 node_num);
+struct o2nm_node *o2nm_get_node_by_ip(__be32 addr);
+void o2nm_node_get(struct o2nm_node *node);
+void o2nm_node_put(struct o2nm_node *node);
+
+#endif /* O2CLUSTER_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
new file mode 100644
index 0000000..5b9854b
--- /dev/null
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_nodemanager.h
+ *
+ * Header describing the interface between userspace and the kernel
+ * for the ocfs2_nodemanager module.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifndef _OCFS2_NODEMANAGER_H
+#define _OCFS2_NODEMANAGER_H
+
+#define O2NM_API_VERSION	5
+
+#define O2NM_MAX_NODES		255	/* node numbers must be below this */
+#define O2NM_INVALID_NODE_NUM	255	/* marker for "no node" */
+
+/* host name, group name, cluster name all 64 bytes */
+#define O2NM_MAX_NAME_LEN        64    /* __NEW_UTS_LEN */
+
+#endif /* _OCFS2_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
new file mode 100644
index 0000000..7286c48
--- /dev/null
+++ b/fs/ocfs2/cluster/ver.c
@@ -0,0 +1,42 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ver.c
+ *
+ * version string
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include "ver.h"
+
+#define CLUSTER_BUILD_VERSION "1.3.3"
+
+#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
+
+void cluster_print_version(void)
+{	/* emits VERSION_STR at KERN_INFO level */
+	printk(KERN_INFO "%s\n", VERSION_STR);
+}
+
+MODULE_DESCRIPTION(VERSION_STR);
+
+MODULE_VERSION(CLUSTER_BUILD_VERSION);
diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h
new file mode 100644
index 0000000..32554c3
--- /dev/null
+++ b/fs/ocfs2/cluster/ver.h
@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ver.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef O2CLUSTER_VER_H
+#define O2CLUSTER_VER_H
+
+void cluster_print_version(void);
+
+#endif /* O2CLUSTER_VER_H */
-- 
cgit v1.1


From a7f6a5fb4bde142b622706e2006ba33f793e13ed Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 15 Dec 2005 14:31:23 -0800
Subject: [PATCH] OCFS2: The Second Oracle Cluster Filesystem

Disk based heartbeat. Configured and started from userspace, the
kernel component handles I/O submission and event generation via
callback mechanism.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.c       | 1797 ++++++++++++++++++++++++++++++++++++
 fs/ocfs2/cluster/heartbeat.h       |   82 ++
 fs/ocfs2/cluster/ocfs2_heartbeat.h |   37 +
 3 files changed, 1916 insertions(+)
 create mode 100644 fs/ocfs2/cluster/heartbeat.c
 create mode 100644 fs/ocfs2/cluster/heartbeat.h
 create mode 100644 fs/ocfs2/cluster/ocfs2_heartbeat.h

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
new file mode 100644
index 0000000..7307ba5
--- /dev/null
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -0,0 +1,1797 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <linux/kthread.h>
+#include <linux/configfs.h>
+#include <linux/random.h>
+#include <linux/crc32.h>
+#include <linux/time.h>
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+#include "quorum.h"
+
+#include "masklog.h"
+
+
+/*
+ * The first heartbeat pass had one global thread that would serialize all hb
+ * callback calls.  This global serializing sem should only be removed once
+ * we've made sure that all callees can deal with being called concurrently
+ * from multiple hb region threads.
+ */
+static DECLARE_RWSEM(o2hb_callback_sem);
+
+/*
+ * multiple hb threads are watching multiple regions.  A node is live
+ * whenever any of the threads sees activity from the node in its region.
+ */
+static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED;
+static struct list_head o2hb_live_slots[O2NM_MAX_NODES];
+static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+static LIST_HEAD(o2hb_node_events);
+static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
+/* o2hb_dead_threshold_set() inspects this list under o2hb_live_lock */
+static LIST_HEAD(o2hb_all_regions);
+
+static struct o2hb_callback {
+	struct list_head list;
+} o2hb_callbacks[O2HB_NUM_CB];
+
+static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
+
+#define O2HB_DEFAULT_BLOCK_BITS       9
+
+unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
+
+/* Change the dead threshold, but only while no region is registered.
+ *
+ * Values that are not above O2HB_MIN_DEAD_THRESHOLD are ignored.  Readers
+ * of o2hb_dead_threshold need no locking: it cannot change once regions
+ * are active and nobody is interested in it before that. */
+static void o2hb_dead_threshold_set(unsigned int threshold)
+{
+	if (threshold <= O2HB_MIN_DEAD_THRESHOLD)
+		return;
+	spin_lock(&o2hb_live_lock);
+	if (list_empty(&o2hb_all_regions))
+		o2hb_dead_threshold = threshold;
+	spin_unlock(&o2hb_live_lock);
+}
+
+struct o2hb_node_event {
+	struct list_head        hn_item; /* presumably on o2hb_node_events - TODO confirm */
+	enum o2hb_callback_type hn_event_type;
+	struct o2nm_node        *hn_node;
+	int                     hn_node_num;
+};
+
+struct o2hb_disk_slot {
+	struct o2hb_disk_heartbeat_block *ds_raw_block; /* slot's block */
+	u8			ds_node_num;
+	u64			ds_last_time;	/* compared with hb_seq; 0 = none */
+	u64			ds_last_generation;
+	u16			ds_equal_samples;
+	u16			ds_changed_samples;
+	struct list_head	ds_live_item;
+};
+
+/* each thread owns a region.. when we're asked to tear down the region
+ * we ask the thread to stop, who cleans up the region */
+struct o2hb_region {
+	struct config_item	hr_item;
+
+	struct list_head	hr_all_item;
+	unsigned		hr_unclean_stop:1;
+	/* protected by the hr_callback_sem -- NOTE(review): only
+	 * o2hb_callback_sem is visible in this file; confirm the name */
+	struct task_struct 	*hr_task;
+
+	unsigned int		hr_blocks;
+	unsigned long long	hr_start_block;	/* block of slot 0 on hr_bdev */
+
+	unsigned int		hr_block_bits;	/* shift: slots <-> bytes */
+	unsigned int		hr_block_bytes;	/* bytes checksummed per block */
+
+	unsigned int		hr_slots_per_page;
+	unsigned int		hr_num_pages;
+
+	struct page             **hr_slot_data;	/* pages used for slot I/O */
+	struct block_device	*hr_bdev;
+	struct o2hb_disk_slot	*hr_slots;	/* indexed by node number */
+
+	/* let the person setting up hb wait for it to return until it
+	 * has reached a 'steady' state.  This will be fixed when we have
+	 * a more complete api that doesn't lead to this sort of fragility. */
+	atomic_t		hr_steady_iterations;
+
+	char			hr_dev_name[BDEVNAME_SIZE];
+
+	unsigned int		hr_timeout_ms;
+
+	/* randomized as the region goes up and down so that a node
+	 * recognizes a node going up and down in one iteration */
+	u64			hr_generation;
+
+	struct work_struct	hr_write_timeout_work;
+	unsigned long		hr_last_timeout_start;	/* jiffies when armed */
+
+	/* Used during o2hb_check_slot to hold a copy of the block
+	 * being checked because we temporarily have to zero out the
+	 * crc field. */
+	struct o2hb_disk_heartbeat_block *hr_tmp_block;
+};
+
+struct o2hb_bio_wait_ctxt {
+	atomic_t          wc_num_reqs;	/* requests still outstanding */
+	struct completion wc_io_complete;	/* fired when that hits zero */
+};
+
+static void o2hb_write_timeout(void *arg)
+{
+	struct o2hb_region *reg = arg;
+	/* report how long ago the timeout was armed, then notify quorum */
+	mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
+	     "milliseconds\n", reg->hr_dev_name,
+	     jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
+	o2quo_disk_timeout();
+}
+
+static void o2hb_arm_write_timeout(struct o2hb_region *reg)
+{
+	mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS);
+	/* re-arm: drop a still-pending timeout, then queue a fresh one */
+	cancel_delayed_work(&reg->hr_write_timeout_work);
+	reg->hr_last_timeout_start = jiffies;
+	schedule_delayed_work(&reg->hr_write_timeout_work,
+			      msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS));
+}
+
+static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
+{
+	cancel_delayed_work(&reg->hr_write_timeout_work);
+	flush_scheduled_work();	/* let an already queued handler finish */
+}
+
+static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc,
+				      unsigned int num_ios)
+{
+	atomic_set(&wc->wc_num_reqs, num_ios);	/* one count per request */
+	init_completion(&wc->wc_io_complete);
+}
+
+/* Used in error paths too */
+static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
+				     unsigned int num)
+{
+	/* drop the counts one by one; atomic_sub_and_test() is not
+	 * available everywhere and the fast path passes num == 1 anyway */
+	for (; num; num--) {
+		if (!atomic_dec_and_test(&wc->wc_num_reqs))
+			continue;
+		BUG_ON(num > 1);	/* zero must be hit on the last drop */
+		complete(&wc->wc_io_complete);
+	}
+}
+
+static void o2hb_wait_on_io(struct o2hb_region *reg,
+			    struct o2hb_bio_wait_ctxt *wc)
+{
+	/* run the queue behind the heartbeat device's mapping first */
+	blk_run_address_space(reg->hr_bdev->bd_inode->i_mapping);
+
+	/* completed by o2hb_bio_wait_dec() when the count reaches zero */
+	wait_for_completion(&wc->wc_io_complete);
+}
+
+static int o2hb_bio_end_io(struct bio *bio,
+			   unsigned int bytes_done,
+			   int error)
+{
+	if (error)
+		mlog(ML_ERROR, "IO Error %d\n", error);
+
+	/* bi_size still non-zero: this bio is not finished yet */
+	if (bio->bi_size != 0)
+		return 1;
+
+	/* done; bi_private carries the wait context set up with the bio */
+	o2hb_bio_wait_dec(bio->bi_private, 1);
+	return 0;
+}
+
+/* Set up a bio covering num_slots slots starting at start_slot.
+ * Returns the bio, or an ERR_PTR() value on failure. */
+static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
+				      struct o2hb_bio_wait_ctxt *wc,
+				      unsigned int start_slot,
+				      unsigned int num_slots)
+{
+	int i, nr_vecs, len, first_page, last_page;
+	unsigned int vec_len, vec_start;
+	unsigned int bits = reg->hr_block_bits;
+	unsigned int spp = reg->hr_slots_per_page;
+	struct bio *bio;
+	struct page *page;
+
+	nr_vecs = (num_slots + spp - 1) / spp;
+	/* NOTE(review): assumes the run is page aligned or fits one page */
+	/* Testing has shown this allocation to take long enough under
+	 * GFP_KERNEL that the local node can get fenced. It would be
+	 * nicest if we could pre-allocate these bios and avoid this
+	 * all together. */
+	bio = bio_alloc(GFP_ATOMIC, nr_vecs);
+	if (!bio) {
+		mlog(ML_ERROR, "Could not alloc slots BIO!\n");
+		bio = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	/* Must put everything in 512 byte sectors for the bio... */
+	bio->bi_sector = (reg->hr_start_block + start_slot) << (bits - 9);
+	bio->bi_bdev = reg->hr_bdev;
+	bio->bi_private = wc;
+	bio->bi_end_io = o2hb_bio_end_io;
+
+	first_page = start_slot / spp;
+	last_page = first_page + nr_vecs;
+	vec_start = (start_slot << bits) % PAGE_CACHE_SIZE;
+	for(i = first_page; i < last_page; i++) {
+		page = reg->hr_slot_data[i];
+
+		vec_len = PAGE_CACHE_SIZE;
+		/* last page might be short */
+		if (((i + 1) * spp) > (start_slot + num_slots))
+			vec_len = ((num_slots + start_slot) % spp) << bits;
+		vec_len -=  vec_start;
+
+		mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
+		     i, vec_len, vec_start);
+
+		len = bio_add_page(bio, page, vec_len, vec_start);
+		if (len != vec_len) {
+			bio_put(bio);
+			bio = ERR_PTR(-EIO);
+
+			mlog(ML_ERROR, "Error adding page to bio i = %d, "
+			     "vec_len = %u, len = %d, start = %u\n",
+			     i, vec_len, len, vec_start);
+			goto bail;
+		}
+		/* only the first page starts at a non-zero offset */
+		vec_start = 0;
+	}
+
+bail:
+	return bio;
+}
+
+/*
+ * Compute the maximum number of sectors the bdev can handle in one bio,
+ * as a power of two.
+ *
+ * Stolen from oracleasm, thanks Joel!
+ */
+static int compute_max_sectors(struct block_device *bdev)
+{
+	int max_pages, max_sectors, pow_two_sectors;
+
+	struct request_queue *q;
+
+	q = bdev_get_queue(bdev);
+	max_pages = q->max_sectors >> (PAGE_SHIFT - 9);	/* sectors -> pages */
+	if (max_pages > BIO_MAX_PAGES)
+		max_pages = BIO_MAX_PAGES;
+	if (max_pages > q->max_phys_segments)
+		max_pages = q->max_phys_segments;
+	if (max_pages > q->max_hw_segments)
+		max_pages = q->max_hw_segments;
+	max_pages--; /* Handle I/Os that straddle a page */
+
+	max_sectors = max_pages << (PAGE_SHIFT - 9);
+	/* NOTE(review): max_sectors == 0 would make the shift negative */
+	/* Why is fls() 1-based???? */
+	pow_two_sectors = 1 << (fls(max_sectors) - 1);
+
+	return pow_two_sectors;
+}
+
+static inline void o2hb_compute_request_limits(struct o2hb_region *reg,
+					       unsigned int num_slots,
+					       unsigned int *num_bios,
+					       unsigned int *slots_per_bio)
+{
+	unsigned int max_sectors, io_sectors;
+
+	max_sectors = compute_max_sectors(reg->hr_bdev);
+	/* size of the whole request in 512 byte sectors */
+	io_sectors = num_slots << (reg->hr_block_bits - 9);
+	/* bios needed (rounded up) and the slots that fit into one bio */
+	*num_bios = (io_sectors + max_sectors - 1) / max_sectors;
+	*slots_per_bio = max_sectors >> (reg->hr_block_bits - 9);
+
+	mlog(ML_HB_BIO, "My io size is %u sectors for %u slots. This "
+	     "device can handle %u sectors of I/O\n", io_sectors, num_slots,
+	     max_sectors);
+	mlog(ML_HB_BIO, "Will need %u bios holding %u slots each\n",
+	     *num_bios, *slots_per_bio);
+}
+
+static int o2hb_read_slots(struct o2hb_region *reg,
+			   unsigned int max_slots)
+{
+	unsigned int num_bios, slots_per_bio, first, span, idx;
+	struct o2hb_bio_wait_ctxt wc;
+	struct bio **bios;
+	int status = 0;
+
+	o2hb_compute_request_limits(reg, max_slots, &num_bios, &slots_per_bio);
+
+	/* zeroed, so the cleanup loop can tell which bios were set up */
+	bios = kcalloc(num_bios, sizeof(struct bio *), GFP_KERNEL);
+	if (!bios) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		return status;
+	}
+
+	/* one completion count per bio we intend to submit */
+	o2hb_bio_wait_init(&wc, num_bios);
+
+	span = slots_per_bio;
+	for (idx = 0; idx < num_bios; idx++) {
+		first = idx * slots_per_bio;
+
+		/* the final bio only covers what is left of max_slots */
+		if (first + span > max_slots)
+			span = max_slots - first;
+
+		bios[idx] = o2hb_setup_one_bio(reg, &wc, first, span);
+		if (IS_ERR(bios[idx])) {
+			/* give back the counts of every bio that will
+			 * never be submitted so the wait below can end */
+			o2hb_bio_wait_dec(&wc, num_bios - idx);
+
+			status = PTR_ERR(bios[idx]);
+			bios[idx] = NULL;
+			mlog_errno(status);
+			break;
+		}
+
+		submit_bio(READ, bios[idx]);
+	}
+
+	/* always wait, also after an error: bios submitted earlier are
+	 * still referencing wc */
+	o2hb_wait_on_io(reg, &wc);
+
+	for (idx = 0; idx < num_bios; idx++)
+		if (bios[idx])
+			bio_put(bios[idx]);
+	kfree(bios);
+
+	return status;
+}
+
+static int o2hb_issue_node_write(struct o2hb_region *reg,
+				 struct bio **write_bio,
+				 struct o2hb_bio_wait_ctxt *write_wc)
+{
+	struct bio *bio;
+	int status;
+
+	/* a single bio, so a single completion count */
+	o2hb_bio_wait_init(write_wc, 1);
+
+	/* the slot written is the one numbered like the local node */
+	bio = o2hb_setup_one_bio(reg, write_wc, o2nm_this_node(), 1);
+	if (IS_ERR(bio)) {
+		status = PTR_ERR(bio);
+		mlog_errno(status);
+		return status;
+	}
+
+	submit_bio(WRITE, bio);
+
+	/* hand the submitted bio back through write_bio; it is only
+	 * stored when the submission happened */
+	*write_bio = bio;
+
+	return 0;
+}
+
+static u32 o2hb_compute_block_crc_le(struct o2hb_region *reg,
+				     struct o2hb_disk_heartbeat_block *hb_block)
+{
+	const __le32 saved = hb_block->hb_cksum;
+	u32 crc;
+
+	/* The checksum covers hr_block_bytes of the block with the
+	 * hb_cksum field reading as zero, so blank the field for the
+	 * duration of the computation. */
+	hb_block->hb_cksum = 0;
+
+	crc = crc32_le(0, (unsigned char *) hb_block, reg->hr_block_bytes);
+
+	/* put the caller's value back */
+	hb_block->hb_cksum = saved;
+
+	return crc;
+}
+
+static void o2hb_dump_slot(struct o2hb_disk_heartbeat_block *hb_block)
+{	/* seq, cksum and generation are converted from little endian */
+	mlog(ML_ERROR, "Dump slot information: seq = 0x%"MLFx64", node = %u, "
+	     "cksum = 0x%x, generation 0x%"MLFx64"\n",
+	     le64_to_cpu(hb_block->hb_seq), hb_block->hb_node,
+	     le32_to_cpu(hb_block->hb_cksum),
+	     le64_to_cpu(hb_block->hb_generation));
+}
+
+/* Returns nonzero when the stored hb_cksum matches a freshly computed CRC. */
+static int o2hb_verify_crc(struct o2hb_region *reg,
+			   struct o2hb_disk_heartbeat_block *hb_block)
+{
+	/* decode the stored value first, then recompute over the same block */
+	const u32 stored = le32_to_cpu(hb_block->hb_cksum);
+	const u32 expected = o2hb_compute_block_crc_le(reg, hb_block);
+
+	return stored == expected;
+}
+
+/*
+ * We want to make sure that nobody is heartbeating on top of us --
+ * this will help detect an invalid configuration.
+ *
+ * Returns 1 when the sequence in our raw on-disk block still equals the
+ * last time recorded for our slot (or nothing has been recorded yet), and
+ * 0 when the two differ.
+ */
+static int o2hb_check_last_timestamp(struct o2hb_region *reg)
+{
+	struct o2hb_disk_slot *our_slot;
+	struct o2hb_disk_heartbeat_block *on_disk;
+
+	our_slot = &reg->hr_slots[o2nm_this_node()];
+	on_disk = our_slot->ds_raw_block;
+
+	/* Don't check on our 1st timestamp */
+	if (!our_slot->ds_last_time)
+		return 1;
+
+	return le64_to_cpu(on_disk->hb_seq) == our_slot->ds_last_time;
+}
+
+/*
+ * Rebuild this node's heartbeat block in its raw slot buffer: zero it, then
+ * store the current second as the sequence (forced nonzero), our node number
+ * and the given generation, and finally the CRC over the finished block.
+ */
+static inline void o2hb_prepare_block(struct o2hb_region *reg,
+				      u64 generation)
+{
+	int node_num = o2nm_this_node();
+	struct o2hb_disk_slot *slot = &reg->hr_slots[node_num];
+	struct o2hb_disk_heartbeat_block *hb_block = slot->ds_raw_block;
+	u64 cputime;
+
+	memset(hb_block, 0, reg->hr_block_bytes);
+	/* TODO: time stuff */
+	cputime = CURRENT_TIME.tv_sec;
+	if (!cputime)
+		cputime = 1;
+
+	hb_block->hb_seq = cpu_to_le64(cputime);
+	hb_block->hb_node = node_num;
+	hb_block->hb_generation = cpu_to_le64(generation);
+	/* This step must always happen last! */
+	hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg,
+								   hb_block));
+	/* log the generation in CPU order; the LE value is only for disk */
+	mlog(ML_HB_BIO, "our node generation = 0x%"MLFx64", cksum = 0x%x\n",
+	     generation, le32_to_cpu(hb_block->hb_cksum));
+}
+
+/* Call each function registered on hbcall, in list order, with the node,
+ * its index and that function's own hc_data. */
+static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
+				struct o2nm_node *node,
+				int idx)
+{
+	struct o2hb_callback_func *cb;
+
+	list_for_each_entry(cb, &hbcall->list, hc_item) {
+		mlog(ML_HEARTBEAT, "calling funcs %p\n", cb);
+		(cb->hc_func)(node, idx, cb->hc_data);
+	}
+}
+
+/* Runs the global event list in order until queued_event has been fired */
+static void o2hb_run_event_list(struct o2hb_node_event *queued_event)
+{
+	int empty;
+	struct o2hb_callback *hbcall;
+	struct o2hb_node_event *event;
+
+	spin_lock(&o2hb_live_lock);
+	empty = list_empty(&queued_event->hn_item);
+	spin_unlock(&o2hb_live_lock);
+	if (empty)	/* not on the event list (any more): nothing to run */
+		return;
+
+	/* Holding callback sem assures we don't alter the callback
+	 * lists when doing this, and serializes ourselves with other
+	 * processes wanting callbacks. */
+	down_write(&o2hb_callback_sem);
+
+	spin_lock(&o2hb_live_lock);
+	while (!list_empty(&o2hb_node_events)
+	       && !list_empty(&queued_event->hn_item)) {
+		event = list_entry(o2hb_node_events.next,
+				   struct o2hb_node_event,
+				   hn_item);
+		list_del_init(&event->hn_item);
+		spin_unlock(&o2hb_live_lock);	/* callbacks run without the spinlock */
+
+		mlog(ML_HEARTBEAT, "Node %s event for %d\n",
+		     event->hn_event_type == O2HB_NODE_UP_CB ? "UP" : "DOWN",
+		     event->hn_node_num);
+
+		hbcall = hbcall_from_type(event->hn_event_type);
+
+		/* We should *never* have gotten on to the list with a
+		 * bad type... This isn't something that we should try
+		 * to recover from. */
+		BUG_ON(IS_ERR(hbcall));
+
+		o2hb_fire_callbacks(hbcall, event->hn_node, event->hn_node_num);
+
+		spin_lock(&o2hb_live_lock);	/* retaken to re-test the loop condition */
+	}
+	spin_unlock(&o2hb_live_lock);
+
+	up_write(&o2hb_callback_sem);
+}
+
+/* Fill in a caller-owned event and append it to o2hb_node_events.  The
+ * caller must already hold o2hb_live_lock. */
+static void o2hb_queue_node_event(struct o2hb_node_event *event,
+				  enum o2hb_callback_type type,
+				  struct o2nm_node *node,
+				  int node_num)
+{
+	const char *dir = (type == O2HB_NODE_UP_CB) ? "UP" : "DOWN";
+
+	assert_spin_locked(&o2hb_live_lock);
+	event->hn_node_num = node_num;
+	event->hn_node = node;
+	event->hn_event_type = type;
+	mlog(ML_HEARTBEAT, "Queue node %s event for node %d\n", dir, node_num);
+	list_add_tail(&event->hn_item, &o2hb_node_events);
+}
+
+/* Drop a slot from the live list.  If its node then has no live slot left,
+ * clear the node's live bit and queue and run a node-down event.  Nothing
+ * is done when no node is configured for the slot's node number. */
+static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
+{
+	struct o2nm_node *node;
+	struct o2hb_node_event event =
+		{ .hn_item = LIST_HEAD_INIT(event.hn_item), };
+
+	node = o2nm_get_node_by_num(slot->ds_node_num);
+	if (node == NULL)
+		return;
+
+	spin_lock(&o2hb_live_lock);
+	if (list_empty(&slot->ds_live_item))
+		goto unlock;
+	mlog(ML_HEARTBEAT, "Shutdown, node %d leaves region\n",
+	     slot->ds_node_num);
+	list_del_init(&slot->ds_live_item);
+	if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+		clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+		o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
+				      slot->ds_node_num);
+	}
+unlock:
+	spin_unlock(&o2hb_live_lock);
+
+	o2hb_run_event_list(&event);
+	o2nm_node_put(node);
+}
+
+static int o2hb_check_slot(struct o2hb_region *reg,
+			   struct o2hb_disk_slot *slot)
+{	/* returns 1 if a node up/down event was queued for this slot, else 0 */
+	int changed = 0, gen_changed = 0;
+	struct o2hb_node_event event =
+		{ .hn_item = LIST_HEAD_INIT(event.hn_item), };
+	struct o2nm_node *node;
+	struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block;
+	u64 cputime;
+
+	memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
+
+	/* Is this correct? Do we assume that the node doesn't exist
+	 * if we're not configured for him? */
+	node = o2nm_get_node_by_num(slot->ds_node_num);
+	if (!node)
+		return 0;
+
+	if (!o2hb_verify_crc(reg, hb_block)) {
+		/* all paths from here will drop o2hb_live_lock for
+		 * us. */
+		spin_lock(&o2hb_live_lock);
+
+		/* Don't print an error on the console in this case -
+		 * a freshly formatted heartbeat area will not have a
+		 * crc set on it. */
+		if (list_empty(&slot->ds_live_item))
+			goto out;
+
+		/* The node is live but pushed out a bad crc. We
+		 * consider it a transient miss but don't populate any
+		 * other values as they may be junk. */
+		mlog(ML_ERROR, "Node %d has written a bad crc to %s\n",
+		     slot->ds_node_num, reg->hr_dev_name);
+		o2hb_dump_slot(hb_block);
+
+		slot->ds_equal_samples++;
+		goto fire_callbacks;
+	}
+
+	/* we don't care if these wrap.. the state transitions below
+	 * clear at the right places */
+	cputime = le64_to_cpu(hb_block->hb_seq);
+	if (slot->ds_last_time != cputime)
+		slot->ds_changed_samples++;
+	else
+		slot->ds_equal_samples++;
+	slot->ds_last_time = cputime;
+
+	/* The node changed heartbeat generations. We assume this to
+	 * mean it dropped off but came back before we timed out. We
+	 * want to consider it down for the time being but don't want
+	 * to lose any changed_samples state we might build up to
+	 * considering it live again. */
+	if (slot->ds_last_generation != le64_to_cpu(hb_block->hb_generation)) {
+		gen_changed = 1;
+		slot->ds_equal_samples = 0;
+		mlog(ML_HEARTBEAT, "Node %d changed generation (0x%"MLFx64" "
+		     "to 0x%"MLFx64")\n", slot->ds_node_num,
+		     slot->ds_last_generation,
+		     le64_to_cpu(hb_block->hb_generation));
+	}
+
+	slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation);
+
+	mlog(ML_HEARTBEAT, "Slot %d gen 0x%"MLFx64" cksum 0x%x "
+	     "seq %"MLFu64" last %"MLFu64" changed %u equal %u\n",
+	     slot->ds_node_num, slot->ds_last_generation,
+	     le32_to_cpu(hb_block->hb_cksum), le64_to_cpu(hb_block->hb_seq), 
+	     slot->ds_last_time, slot->ds_changed_samples,
+	     slot->ds_equal_samples);
+
+	spin_lock(&o2hb_live_lock);
+
+fire_callbacks:
+	/* dead nodes only come to life after some number of
+	 * changes at any time during their dead time */
+	if (list_empty(&slot->ds_live_item) &&
+	    slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD) {
+		mlog(ML_HEARTBEAT, "Node %d (id 0x%"MLFx64") joined my "
+		     "region\n", slot->ds_node_num, slot->ds_last_generation);
+
+		/* first on the list generates a callback */
+		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+			set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+
+			o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
+					      slot->ds_node_num);
+
+			changed = 1;
+		}
+
+		list_add_tail(&slot->ds_live_item,
+			      &o2hb_live_slots[slot->ds_node_num]);
+
+		slot->ds_equal_samples = 0;
+		goto out;
+	}
+
+	/* if the slot is still not on a live list, we're done.. */
+	if (list_empty(&slot->ds_live_item))
+		goto out;
+
+	/* live nodes only go dead after enough consecutive missed
+	 * samples..  reset the missed counter whenever we see
+	 * activity */
+	if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) {
+		mlog(ML_HEARTBEAT, "Node %d left my region\n",
+		     slot->ds_node_num);
+
+		/* last off the live_slot generates a callback */
+		list_del_init(&slot->ds_live_item);
+		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+			clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+
+			o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
+					      slot->ds_node_num);
+
+			changed = 1;
+		}
+
+		/* We don't clear this because the node is still
+		 * actually writing new blocks. */
+		if (!gen_changed)
+			slot->ds_changed_samples = 0;
+		goto out;
+	}
+	if (slot->ds_changed_samples) {
+		slot->ds_changed_samples = 0;
+		slot->ds_equal_samples = 0;
+	}
+out:
+	spin_unlock(&o2hb_live_lock);
+
+	o2hb_run_event_list(&event);
+
+	o2nm_node_put(node);
+	return changed;
+}
+
+/*
+ * Return the index of the highest set bit among the first numbits bits of
+ * nodes, or numbits itself when no bit is set.
+ * This could be faster if we just implemented a find_last_bit, but I
+ * don't think the circumstances warrant it.
+ */
+static int o2hb_highest_node(unsigned long *nodes,
+			     int numbits)
+{
+	int node, highest = numbits;
+
+	/* find_next_bit() signals "no more bits" by returning >= numbits */
+	for (node = find_next_bit(nodes, numbits, 0); node < numbits;
+	     node = find_next_bit(nodes, numbits, node + 1))
+		highest = node;
+
+	return highest;
+}
+
+static void o2hb_do_disk_heartbeat(struct o2hb_region *reg)
+{	/* one heartbeat pass: read the slots, write ours, check the others */
+	int i, ret, highest_node, change = 0;
+	unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	struct bio *write_bio;
+	struct o2hb_bio_wait_ctxt write_wc;
+
+	if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes)))
+		return;
+
+	highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
+	if (highest_node >= O2NM_MAX_NODES) {
+		mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
+		return;
+	}
+
+	/* No sense in reading the slots of nodes that don't exist
+	 * yet. Of course, if the node definitions have holes in them
+	 * then we're reading an empty slot anyway... Consider this
+	 * best-effort. */
+	ret = o2hb_read_slots(reg, highest_node + 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return;
+	}
+
+	/* With an up to date view of the slots, we can check that no
+	 * other node has been improperly configured to heartbeat in
+	 * our slot. */
+	if (!o2hb_check_last_timestamp(reg))
+		mlog(ML_ERROR, "Device \"%s\": another node is heartbeating "
+		     "in our slot!\n", reg->hr_dev_name);
+
+	/* fill in the proper info for our next heartbeat */
+	o2hb_prepare_block(reg, reg->hr_generation);
+
+	/* And fire off the write. Note that we don't wait on this I/O
+	 * until later. */
+	ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return;
+	}
+
+	i = -1;	/* so that the first search below starts at bit 0 */
+	while((i = find_next_bit(configured_nodes, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+
+		change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
+	}
+
+	/*
+	 * We have to be sure we've advertised ourselves on disk
+	 * before we can go to steady state.  This ensures that
+	 * people we find in our steady state have seen us.
+	 */
+	o2hb_wait_on_io(reg, &write_wc);
+	bio_put(write_bio);
+	o2hb_arm_write_timeout(reg);
+
+	/* let the person who launched us know when things are steady */
+	if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) {
+		if (atomic_dec_and_test(&reg->hr_steady_iterations))
+			wake_up(&o2hb_steady_queue);
+	}
+}
+
+/* Subtract b from a, storing the result in a.  a is expected to be the
+ * later time; if it is in fact earlier than b, a is clamped to zero. */
+static void o2hb_tv_subtract(struct timeval *a,
+			     struct timeval *b)
+{
+	int a_is_earlier = a->tv_sec < b->tv_sec ||
+		(a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec);
+
+	if (a_is_earlier) {
+		a->tv_sec = 0;
+		a->tv_usec = 0;
+		return;
+	}
+
+	a->tv_sec -= b->tv_sec;
+	a->tv_usec -= b->tv_usec;
+	/* borrow from the seconds while the microseconds are negative */
+	for (; a->tv_usec < 0; a->tv_usec += 1000000)
+		a->tv_sec--;
+}
+
+/* Whole milliseconds from start to end, via o2hb_tv_subtract(end - start). */
+static unsigned int o2hb_elapsed_msecs(struct timeval *start,
+				       struct timeval *end)
+{
+	struct timeval delta = *end;
+
+	o2hb_tv_subtract(&delta, start);
+	return delta.tv_sec * 1000 + delta.tv_usec / 1000;
+}
+
+/*
+ * we ride the region ref that the region dir holds.  before the region
+ * dir is removed and drops its ref it will wait to tear down this
+ * thread.  data is the struct o2hb_region to heartbeat on.
+ */
+static int o2hb_thread(void *data)
+{
+	int i, ret;
+	struct o2hb_region *reg = data;
+	struct bio *write_bio;
+	struct o2hb_bio_wait_ctxt write_wc;
+	struct timeval before_hb, after_hb;
+	unsigned int elapsed_msec;
+
+	mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
+
+	set_user_nice(current, -20);
+
+	while (!kthread_should_stop() && !reg->hr_unclean_stop) {
+		/* We track the time spent inside
+		 * o2hb_do_disk_heartbeat so that we avoid more than
+		 * hr_timeout_ms between disk writes. On busy systems
+		 * this should result in a heartbeat which is less
+		 * likely to time itself out. */
+		do_gettimeofday(&before_hb);
+
+		o2hb_do_disk_heartbeat(reg);
+
+		do_gettimeofday(&after_hb);
+		elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
+
+		mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
+		     before_hb.tv_sec, before_hb.tv_usec,
+		     after_hb.tv_sec, after_hb.tv_usec, elapsed_msec);
+
+		if (elapsed_msec < reg->hr_timeout_ms) {
+			/* the kthread api has blocked signals for us so no
+			 * need to record the return value. */
+			msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
+		}
+	}
+
+	o2hb_disarm_write_timeout(reg);
+
+	/* unclean stop is only used in very bad situation */
+	for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++)
+		o2hb_shutdown_slot(&reg->hr_slots[i]);
+
+	/* Explicit down notification - avoid forcing the other nodes
+	 * to timeout on this region when we could just as easily
+	 * write a clear generation - thus indicating to them that
+	 * this node has left this region.
+	 *
+	 * XXX: Should we skip this on unclean_stop? */
+	o2hb_prepare_block(reg, 0);
+	ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
+	if (ret == 0) {
+		o2hb_wait_on_io(reg, &write_wc);
+		bio_put(write_bio);
+	} else {
+		mlog_errno(ret);
+	}
+
+	mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
+
+	return 0;
+}
+
+/* Reset the heartbeat globals: the live node bitmap is zeroed and the event
+ * list, every live-slot list and every callback list start out empty. */
+void o2hb_init(void)
+{
+	int i;
+
+	memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+	INIT_LIST_HEAD(&o2hb_node_events);
+
+	for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++)
+		INIT_LIST_HEAD(&o2hb_live_slots[i]);
+	for (i = 0; i < ARRAY_SIZE(o2hb_callbacks); i++)
+		INIT_LIST_HEAD(&o2hb_callbacks[i].list);
+}
+
+/* if we're already in a callback then we're already serialized by the sem.
+ * Copies bytes of the live node bitmap; bytes must cover O2NM_MAX_NODES bits */
+static void o2hb_fill_node_map_from_callback(unsigned long *map,
+					     unsigned bytes)
+{
+	BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
+	memcpy(map, &o2hb_live_node_bitmap, bytes);
+}
+
+/*
+ * copy the bitmap of nodes heartbeating in any region into map (bytes long)
+ */
+void o2hb_fill_node_map(unsigned long *map, unsigned bytes)
+{
+	/* callers want to serialize this map and callbacks so that they
+	 * can trust that they don't miss nodes coming to the party */
+	down_read(&o2hb_callback_sem);
+	spin_lock(&o2hb_live_lock);
+	o2hb_fill_node_map_from_callback(map, bytes);
+	spin_unlock(&o2hb_live_lock);
+	up_read(&o2hb_callback_sem);
+}
+EXPORT_SYMBOL_GPL(o2hb_fill_node_map);
+
+/*
+ * heartbeat configfs bits.  The heartbeat set is a default set under
+ * the cluster set in nodemanager.c.
+ */
+
+static struct o2hb_region *to_o2hb_region(struct config_item *item)
+{	/* map an embedded hr_item back to its region; NULL stays NULL */
+	return item ? container_of(item, struct o2hb_region, hr_item) : NULL;
+}
+
+/* drop_item only drops its ref after killing the thread, nothing should
+ * be using the region anymore.  this has to clean up any state that
+ * attributes might have built up. */
+static void o2hb_region_release(struct config_item *item)
+{
+	int i;
+	struct page *page;
+	struct o2hb_region *reg = to_o2hb_region(item);
+
+	/* kfree() accepts NULL, so never-allocated buffers need no check */
+	kfree(reg->hr_tmp_block);
+	if (reg->hr_slot_data) {
+		/* entries may be NULL if page allocation stopped part way */
+		for (i = 0; i < reg->hr_num_pages; i++) {
+			page = reg->hr_slot_data[i];
+			if (page)
+				__free_page(page);
+		}
+		kfree(reg->hr_slot_data);
+	}
+
+	if (reg->hr_bdev)
+		blkdev_put(reg->hr_bdev);
+
+	kfree(reg->hr_slots);
+
+	/* unlink from the list of all regions before the final free */
+	spin_lock(&o2hb_live_lock);
+	list_del(&reg->hr_all_item);
+	spin_unlock(&o2hb_live_lock);
+
+	kfree(reg);
+}
+
+/* Parse a block size from attribute input: a power of two in 512..4096,
+ * optionally followed by a newline.  Stores the size and its log2 through
+ * whichever of ret_bytes / ret_bits is non-NULL.  Returns 0, -EINVAL for
+ * unparseable or non-power-of-two input, or -ERANGE outside 512..4096. */
+static int o2hb_read_block_input(struct o2hb_region *reg,
+				 const char *page,
+				 size_t count,
+				 unsigned long *ret_bytes,
+				 unsigned int *ret_bits)
+{
+	char *end;
+	unsigned long val = simple_strtoul(page, &end, 0);
+
+	if (!end || (*end != '\0' && *end != '\n'))
+		return -EINVAL;
+	/* Heartbeat and fs min / max block sizes are the same. */
+	if (val < 512 || val > 4096)
+		return -ERANGE;
+	if (hweight16(val) != 1)
+		return -EINVAL;
+	if (ret_bytes)
+		*ret_bytes = val;
+	if (ret_bits)
+		*ret_bits = ffs(val) - 1;
+	return 0;
+}
+
+/* configfs show: print hr_block_bytes as decimal plus newline into page */
+static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg, char *page)
+{
+	return sprintf(page, "%u\n", reg->hr_block_bytes);
+}
+
+/* configfs store for "block_bytes".  Refused with -EINVAL once a device is
+ * attached; otherwise validates the input and records both the block size
+ * and its log2.  Returns count on success or the parse error. */
+static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg,
+					     const char *page,
+					     size_t count)
+{
+	unsigned long bytes;
+	unsigned int bits;
+	int err;
+
+	if (reg->hr_bdev)
+		return -EINVAL;
+
+	err = o2hb_read_block_input(reg, page, count, &bytes, &bits);
+	if (err)
+		return err;
+	reg->hr_block_bits = bits;
+	reg->hr_block_bytes = (unsigned int)bytes;
+	return count;
+}
+
+/* configfs show: print hr_start_block as decimal plus newline into page */
+static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg, char *page)
+{
+	return sprintf(page, "%llu\n", reg->hr_start_block);
+}
+
+/* configfs store for "start_block".  -EINVAL once a device is attached or if
+ * the input is not a number optionally followed by a newline; else count. */
+static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
+					     const char *page,
+					     size_t count)
+{
+	unsigned long long start;
+	char *end;
+
+	if (reg->hr_bdev)
+		return -EINVAL;
+	start = simple_strtoull(page, &end, 0);
+	if (!end || (*end != '\0' && *end != '\n'))
+		return -EINVAL;
+
+	reg->hr_start_block = start;
+	return count;
+}
+
+/* configfs show: print hr_blocks (stored and logged as unsigned) into page */
+static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg, char *page)
+{
+	return sprintf(page, "%u\n", reg->hr_blocks);
+}
+
+/* configfs store for "blocks" (1..O2NM_MAX_NODES).  -EINVAL once a device is
+ * attached or on malformed input, -ERANGE if out of range, else count. */
+static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
+					const char *page,
+					size_t count)
+{
+	unsigned long nblocks;
+	char *end;
+
+	if (reg->hr_bdev)
+		return -EINVAL;
+
+	nblocks = simple_strtoul(page, &end, 0);
+	if (!end || (*end != '\0' && *end != '\n'))
+		return -EINVAL;
+	if (nblocks == 0 || nblocks > O2NM_MAX_NODES)
+		return -ERANGE;
+
+	reg->hr_blocks = (unsigned int)nblocks;
+	return count;
+}
+
+/* configfs show for "dev": the device name plus newline once a device is
+ * attached, otherwise nothing (returns 0). */
+static ssize_t o2hb_region_dev_read(struct o2hb_region *reg,
+				    char *page)
+{
+	if (!reg->hr_bdev)
+		return 0;
+
+	return sprintf(page, "%s\n", reg->hr_dev_name);
+}
+
+/* Set slots-per-page and the timeout for the region, then log its parameters */
+static void o2hb_init_region_params(struct o2hb_region *reg)
+{
+	reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS;
+	reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits;
+	mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n",
+	     reg->hr_start_block, reg->hr_blocks);
+	mlog(ML_HEARTBEAT, "hr_block_bytes = %u, hr_block_bits = %u\n",
+	     reg->hr_block_bytes, reg->hr_block_bits);
+	mlog(ML_HEARTBEAT, "hr_timeout_ms = %u\n", reg->hr_timeout_ms);
+	mlog(ML_HEARTBEAT, "dead threshold = %u\n", o2hb_dead_threshold);
+}
+
+/* Allocate the scratch block, the slot tracking array and the pages that
+ * hold the raw heartbeat blocks, then point every slot's ds_raw_block at
+ * its hr_block_bytes-sized piece of those pages.  Returns 0 or -ENOMEM; on
+ * failure whatever this call allocated is freed and the pointers cleared,
+ * so that a retried write to "dev" does not overwrite and leak it. */
+static int o2hb_map_slot_data(struct o2hb_region *reg)
+{
+	int i, j;
+	unsigned int first_slot;
+	unsigned int spp = reg->hr_slots_per_page;
+	struct page *page;
+	char *raw;
+
+	reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL);
+	if (reg->hr_tmp_block == NULL)
+		goto enomem;
+
+	reg->hr_slots = kcalloc(reg->hr_blocks,
+				sizeof(struct o2hb_disk_slot), GFP_KERNEL);
+	if (reg->hr_slots == NULL)
+		goto enomem;
+
+	for (i = 0; i < reg->hr_blocks; i++) {
+		reg->hr_slots[i].ds_node_num = i;
+		INIT_LIST_HEAD(&reg->hr_slots[i].ds_live_item);
+		reg->hr_slots[i].ds_raw_block = NULL;
+	}
+
+	reg->hr_num_pages = (reg->hr_blocks + spp - 1) / spp;
+	mlog(ML_HEARTBEAT, "Going to require %u pages to cover %u blocks "
+			   "at %u blocks per page\n",
+	     reg->hr_num_pages, reg->hr_blocks, spp);
+
+	reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *),
+				    GFP_KERNEL);
+	if (!reg->hr_slot_data)
+		goto enomem;
+
+	for (i = 0; i < reg->hr_num_pages; i++) {
+		page = alloc_page(GFP_KERNEL);
+		if (!page)
+			goto enomem;
+		reg->hr_slot_data[i] = page;
+
+		first_slot = i * spp;
+		raw = page_address(page);
+		for (j = 0; j < spp && (j + first_slot) < reg->hr_blocks; j++) {
+			reg->hr_slots[j + first_slot].ds_raw_block =
+				(struct o2hb_disk_heartbeat_block *) raw;
+			raw += reg->hr_block_bytes;
+		}
+	}
+	return 0;
+
+enomem:
+	mlog_errno(-ENOMEM);
+	for (i = 0; reg->hr_slot_data && i < reg->hr_num_pages; i++)
+		if (reg->hr_slot_data[i])
+			__free_page(reg->hr_slot_data[i]);
+	kfree(reg->hr_slot_data);
+	kfree(reg->hr_slots);
+	kfree(reg->hr_tmp_block);
+	reg->hr_slot_data = NULL;
+	reg->hr_slots = NULL;
+	reg->hr_tmp_block = NULL;
+	return -ENOMEM;
+}
+
+/*
+ * Read in all the slots available and populate the tracking
+ * structures so that we can start with a baseline idea of what's
+ * there.  Returns 0, or the error from reading the slots.
+ */
+static int o2hb_populate_slot_data(struct o2hb_region *reg)
+{
+	int i, status;
+	struct o2hb_disk_heartbeat_block *raw;
+
+	mlog_entry_void();
+
+	status = o2hb_read_slots(reg, reg->hr_blocks);
+	if (status) {
+		mlog_errno(status);
+		mlog_exit(status);
+		return status;
+	}
+
+	/* We only want to get an idea of the values initially in each
+	 * slot, so we do no verification - o2hb_check_slot will
+	 * actually determine if each configured slot is valid and
+	 * whether any values have changed.  Only the values it uses
+	 * to determine changing slots are filled in here. */
+	for (i = 0; i < reg->hr_blocks; i++) {
+		raw = reg->hr_slots[i].ds_raw_block;
+
+		reg->hr_slots[i].ds_last_time = le64_to_cpu(raw->hb_seq);
+		reg->hr_slots[i].ds_last_generation =
+			le64_to_cpu(raw->hb_generation);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing.
+ * The input is an open file descriptor for the block device.  Returns count
+ * once the heartbeat thread reports steady state, else a negative errno. */
+static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
+				     const char *page,
+				     size_t count)
+{
+	long fd;
+	int sectsize;
+	char *p = (char *)page;
+	struct file *filp = NULL;
+	struct inode *inode = NULL;
+	ssize_t ret = -EINVAL;
+
+	if (reg->hr_bdev)
+		goto out;
+
+	/* We can't heartbeat without having had our node number
+	 * configured yet. */
+	if (o2nm_this_node() == O2NM_MAX_NODES)
+		goto out;
+
+	fd = simple_strtol(p, &p, 0);
+	if (!p || (*p && (*p != '\n')))
+		goto out;
+
+	if (fd < 0 || fd >= INT_MAX)
+		goto out;
+
+	filp = fget(fd);
+	if (filp == NULL)
+		goto out;
+
+	if (reg->hr_blocks == 0 || reg->hr_start_block == 0 ||
+	    reg->hr_block_bytes == 0)
+		goto out;
+
+	inode = igrab(filp->f_mapping->host);
+	if (inode == NULL)
+		goto out;
+
+	if (!S_ISBLK(inode->i_mode))
+		goto out;
+
+	reg->hr_bdev = I_BDEV(filp->f_mapping->host);
+	ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, 0);
+	if (ret) {
+		reg->hr_bdev = NULL;
+		goto out;
+	}
+	inode = NULL;	/* NOTE(review): ref presumably now held via hr_bdev - confirm */
+
+	bdevname(reg->hr_bdev, reg->hr_dev_name);
+
+	sectsize = bdev_hardsect_size(reg->hr_bdev);
+	if (sectsize != reg->hr_block_bytes) {
+		mlog(ML_ERROR,
+		     "blocksize %u incorrect for device, expected %d\n",
+		     reg->hr_block_bytes, sectsize);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	o2hb_init_region_params(reg);
+
+	/* Generation of zero is invalid */
+	do {
+		get_random_bytes(&reg->hr_generation,
+				 sizeof(reg->hr_generation));
+	} while (reg->hr_generation == 0);
+
+	ret = o2hb_map_slot_data(reg);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = o2hb_populate_slot_data(reg);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	INIT_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout, reg);
+
+	/*
+	 * A node is considered live after it has beat LIVE_THRESHOLD
+	 * times.  We're not steady until we've given them a chance
+	 * _after_ our first read.
+	 */
+	atomic_set(&reg->hr_steady_iterations, O2HB_LIVE_THRESHOLD + 1);
+
+	reg->hr_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
+				   reg->hr_item.ci_name);
+	if (IS_ERR(reg->hr_task)) {
+		ret = PTR_ERR(reg->hr_task);
+		mlog_errno(ret);
+		reg->hr_task = NULL;
+		goto out;
+	}
+
+	ret = wait_event_interruptible(o2hb_steady_queue,
+				atomic_read(&reg->hr_steady_iterations) == 0);
+	if (ret) {
+		kthread_stop(reg->hr_task);
+		reg->hr_task = NULL;
+		goto out;
+	}
+
+	ret = count;
+out:
+	if (filp)
+		fput(filp);
+	if (inode)
+		iput(inode);
+	if (ret < 0 && reg->hr_bdev) {
+		blkdev_put(reg->hr_bdev);
+		reg->hr_bdev = NULL;
+	}
+	return ret;
+}
+
+struct o2hb_region_attribute {	/* show/store hooks may each be NULL */
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct o2hb_region *, char *);
+	ssize_t (*store)(struct o2hb_region *, const char *, size_t);
+};
+
+static struct o2hb_region_attribute o2hb_region_attr_block_bytes = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "block_bytes",
+		    .ca_mode = S_IRUGO | S_IWUSR },	/* all read, owner writes */
+	.show	= o2hb_region_block_bytes_read,
+	.store	= o2hb_region_block_bytes_write,
+};
+
+static struct o2hb_region_attribute o2hb_region_attr_start_block = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "start_block",
+		    .ca_mode = S_IRUGO | S_IWUSR },	/* all read, owner writes */
+	.show	= o2hb_region_start_block_read,
+	.store	= o2hb_region_start_block_write,
+};
+
+static struct o2hb_region_attribute o2hb_region_attr_blocks = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "blocks",
+		    .ca_mode = S_IRUGO | S_IWUSR },	/* all read, owner writes */
+	.show	= o2hb_region_blocks_read,
+	.store	= o2hb_region_blocks_write,
+};
+
+static struct o2hb_region_attribute o2hb_region_attr_dev = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "dev",
+		    .ca_mode = S_IRUGO | S_IWUSR },	/* all read, owner writes */
+	.show	= o2hb_region_dev_read,
+	.store	= o2hb_region_dev_write,	/* acts as commit for the region */
+};
+
+static struct configfs_attribute *o2hb_region_attrs[] = {
+	&o2hb_region_attr_block_bytes.attr,
+	&o2hb_region_attr_start_block.attr,
+	&o2hb_region_attr_blocks.attr,
+	&o2hb_region_attr_dev.attr,
+	NULL,	/* end of list */
+};
+
+/* configfs show hook: forward to the attribute's ->show, or 0 if it has none */
+static ssize_t o2hb_region_show(struct config_item *item,
+				struct configfs_attribute *attr,
+				char *page)
+{
+	struct o2hb_region_attribute *region_attr =
+		container_of(attr, struct o2hb_region_attribute, attr);
+
+	if (!region_attr->show)
+		return 0;
+
+	return region_attr->show(to_o2hb_region(item), page);
+}
+
+/* configfs store hook: forward to ->store, or -EINVAL if there is none */
+static ssize_t o2hb_region_store(struct config_item *item,
+				 struct configfs_attribute *attr,
+				 const char *page, size_t count)
+{
+	struct o2hb_region_attribute *region_attr =
+		container_of(attr, struct o2hb_region_attribute, attr);
+
+	if (!region_attr->store)
+		return -EINVAL;
+
+	return region_attr->store(to_o2hb_region(item), page, count);
+}
+
+static struct configfs_item_operations o2hb_region_item_ops = {
+	.release		= o2hb_region_release,	/* frees the region */
+	.show_attribute		= o2hb_region_show,	/* dispatches to ->show */
+	.store_attribute	= o2hb_region_store,	/* dispatches to ->store */
+};
+
+static struct config_item_type o2hb_region_type = {
+	.ct_item_ops	= &o2hb_region_item_ops,
+	.ct_attrs	= o2hb_region_attrs,	/* block_bytes, start_block, blocks, dev */
+	.ct_owner	= THIS_MODULE,
+};
+
+/* heartbeat set */
+
+struct o2hb_heartbeat_group {
+	struct config_group hs_group;	/* embedded group; see to_o2hb_heartbeat_group() */
+	/* some stuff? */
+};
+
+/* Map an embedded hs_group back to its heartbeat group; NULL stays NULL. */
+static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group *group)
+{
+	return !group ? NULL :
+		container_of(group, struct o2hb_heartbeat_group, hs_group);
+}
+
+/*
+ * configfs make_item: allocate a zeroed region for the new directory "name",
+ * initialise its config item and add it to the list of all regions.
+ * Returns the embedded item, or NULL if the allocation fails.
+ */
+static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
+							  const char *name)
+{
+	struct o2hb_region *reg;
+
+	reg = kcalloc(1, sizeof(struct o2hb_region), GFP_KERNEL);
+	if (reg == NULL)
+		return NULL; /* ENOMEM */
+
+	config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+
+	/* the list of all regions is modified under o2hb_live_lock */
+	spin_lock(&o2hb_live_lock);
+	list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
+	spin_unlock(&o2hb_live_lock);
+
+	return &reg->hr_item;
+}
+
+/* configfs drop_item: stop any heartbeat thread, then drop the item's ref */
+static void o2hb_heartbeat_group_drop_item(struct config_group *group,
+					   struct config_item *item)
+{
+	struct o2hb_region *reg = to_o2hb_region(item);
+
+	/* stop the thread when the user removes the region dir */
+	if (reg->hr_task != NULL) {
+		kthread_stop(reg->hr_task);
+		reg->hr_task = NULL;
+	}
+	config_item_put(item);
+}
+
+struct o2hb_heartbeat_group_attribute {	/* show/store hooks may each be NULL */
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct o2hb_heartbeat_group *, char *);
+	ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t);
+};
+
+/* configfs show hook: forward to the attribute's ->show, or 0 if it has none */
+static ssize_t o2hb_heartbeat_group_show(struct config_item *item,
+					 struct configfs_attribute *attr,
+					 char *page)
+{
+	struct o2hb_heartbeat_group_attribute *group_attr =
+		container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
+
+	if (!group_attr->show)
+		return 0;
+	return group_attr->show(to_o2hb_heartbeat_group(to_config_group(item)),
+				page);
+}
+
+/* configfs store hook: forward to ->store, or -EINVAL if there is none */
+static ssize_t o2hb_heartbeat_group_store(struct config_item *item,
+					  struct configfs_attribute *attr,
+					  const char *page, size_t count)
+{
+	struct o2hb_heartbeat_group_attribute *group_attr =
+		container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
+
+	if (!group_attr->store)
+		return -EINVAL;
+	return group_attr->store(to_o2hb_heartbeat_group(to_config_group(item)),
+				 page, count);
+}
+
+static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group,
+						     char *page)
+{	/* prints the global o2hb_dead_threshold; group itself is not consulted */
+	return sprintf(page, "%u\n", o2hb_dead_threshold);
+}
+
+/* "dead_threshold" store: -EINVAL if malformed, -ERANGE if over UINT_MAX */
+static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group,
+						    const char *page,
+						    size_t count)
+{
+	char *p;
+	unsigned long tmp = simple_strtoul(page, &p, 10);
+
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+	if (tmp > UINT_MAX)	/* the cast below must not truncate */
+		return -ERANGE;
+	/* this will validate ranges for us. */
+	o2hb_dead_threshold_set((unsigned int) tmp);
+	return count;
+}
+
+static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
+	.attr	= { .ca_owner = THIS_MODULE,
+		    .ca_name = "dead_threshold",
+		    .ca_mode = S_IRUGO | S_IWUSR },	/* all read, owner writes */
+	.show	= o2hb_heartbeat_group_threshold_show,
+	.store	= o2hb_heartbeat_group_threshold_store,
+};
+
+static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
+	&o2hb_heartbeat_group_attr_threshold.attr,
+	NULL,	/* end of list */
+};
+
+static struct configfs_item_operations o2hb_hearbeat_group_item_ops = {
+	.show_attribute		= o2hb_heartbeat_group_show,	/* dispatches to ->show */
+	.store_attribute	= o2hb_heartbeat_group_store,	/* dispatches to ->store */
+};
+
+static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
+	.make_item	= o2hb_heartbeat_group_make_item,	/* allocates a region */
+	.drop_item	= o2hb_heartbeat_group_drop_item,	/* stops its thread */
+};
+
+static struct config_item_type o2hb_heartbeat_group_type = {
+	.ct_group_ops	= &o2hb_heartbeat_group_group_ops,
+	.ct_item_ops	= &o2hb_hearbeat_group_item_ops,
+	.ct_attrs	= o2hb_heartbeat_group_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+/*
+ * this is just here to avoid touching group in heartbeat.h which the
+ * entire damn world #includes
+ *
+ * Allocates the "heartbeat" config group; returns NULL if out of memory.
+ * The result is meant to be freed with o2hb_free_hb_set().
+ */
+struct config_group *o2hb_alloc_hb_set(void)
+{
+	struct o2hb_heartbeat_group *hs;
+
+	hs = kcalloc(1, sizeof(struct o2hb_heartbeat_group), GFP_KERNEL);
+	if (hs == NULL)
+		return NULL;
+
+	config_group_init_type_name(&hs->hs_group, "heartbeat",
+				    &o2hb_heartbeat_group_type);
+
+	return &hs->hs_group;
+}
+
+void o2hb_free_hb_set(struct config_group *group)
+{
+	struct o2hb_heartbeat_group *hs = to_o2hb_heartbeat_group(group);
+	kfree(hs);
+}
+
+/* hb callback registration and issuing */
+
+static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type)
+{
+	if (type == O2HB_NUM_CB)
+		return ERR_PTR(-EINVAL);
+
+	return &o2hb_callbacks[type];
+}
+
+void o2hb_setup_callback(struct o2hb_callback_func *hc,
+			 enum o2hb_callback_type type,
+			 o2hb_cb_func *func,
+			 void *data,
+			 int priority)
+{
+	INIT_LIST_HEAD(&hc->hc_item);
+	hc->hc_func = func;
+	hc->hc_data = data;
+	hc->hc_priority = priority;
+	hc->hc_type = type;
+	hc->hc_magic = O2HB_CB_MAGIC;
+}
+EXPORT_SYMBOL_GPL(o2hb_setup_callback);
+
+int o2hb_register_callback(struct o2hb_callback_func *hc)
+{
+	struct o2hb_callback_func *tmp;
+	struct list_head *iter;
+	struct o2hb_callback *hbcall;
+	int ret;
+
+	BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
+	BUG_ON(!list_empty(&hc->hc_item));
+
+	hbcall = hbcall_from_type(hc->hc_type);
+	if (IS_ERR(hbcall)) {
+		ret = PTR_ERR(hbcall);
+		goto out;
+	}
+
+	down_write(&o2hb_callback_sem);
+
+	list_for_each(iter, &hbcall->list) {
+		tmp = list_entry(iter, struct o2hb_callback_func, hc_item);
+		if (hc->hc_priority < tmp->hc_priority) {
+			list_add_tail(&hc->hc_item, iter);
+			break;
+		}
+	}
+	if (list_empty(&hc->hc_item))
+		list_add_tail(&hc->hc_item, &hbcall->list);
+
+	up_write(&o2hb_callback_sem);
+	ret = 0;
+out:
+	mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n",
+	     ret, __builtin_return_address(0), hc);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(o2hb_register_callback);
+
+int o2hb_unregister_callback(struct o2hb_callback_func *hc)
+{
+	BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
+
+	mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
+	     __builtin_return_address(0), hc);
+
+	if (list_empty(&hc->hc_item))
+		return 0;
+
+	down_write(&o2hb_callback_sem);
+
+	list_del_init(&hc->hc_item);
+
+	up_write(&o2hb_callback_sem);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(o2hb_unregister_callback);
+
+int o2hb_check_node_heartbeating(u8 node_num)
+{
+	unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
+	o2hb_fill_node_map(testing_map, sizeof(testing_map));
+	if (!test_bit(node_num, testing_map)) {
+		mlog(ML_HEARTBEAT,
+		     "node (%u) does not have heartbeating enabled.\n",
+		     node_num);
+		return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
+
+int o2hb_check_node_heartbeating_from_callback(u8 node_num)
+{
+	unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
+	o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
+	if (!test_bit(node_num, testing_map)) {
+		mlog(ML_HEARTBEAT,
+		     "node (%u) does not have heartbeating enabled.\n",
+		     node_num);
+		return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback);
+
+/* Makes sure our local node is configured with a node number, and is
+ * heartbeating. */
+int o2hb_check_local_node_heartbeating(void)
+{
+	u8 node_num;
+
+	/* if this node was set then we have networking */
+	node_num = o2nm_this_node();
+	if (node_num == O2NM_MAX_NODES) {
+		mlog(ML_HEARTBEAT, "this node has not been configured.\n");
+		return 0;
+	}
+
+	return o2hb_check_node_heartbeating(node_num);
+}
+EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating);
+
+/*
+ * this is just a hack until we get the plumbing which flips file systems
+ * read only and drops the hb ref instead of killing the node dead.
+ */
+void o2hb_stop_all_regions(void)
+{
+	struct o2hb_region *reg;
+
+	mlog(ML_ERROR, "stopping heartbeat on all active regions.\n");
+
+	spin_lock(&o2hb_live_lock);
+
+	list_for_each_entry(reg, &o2hb_all_regions, hr_all_item)
+		reg->hr_unclean_stop = 1;
+
+	spin_unlock(&o2hb_live_lock);
+}
+EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
new file mode 100644
index 0000000..cac6223
--- /dev/null
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_HEARTBEAT_H
+#define O2CLUSTER_HEARTBEAT_H
+
+#include "ocfs2_heartbeat.h"
+
+#define O2HB_REGION_TIMEOUT_MS		2000
+
+/* number of changes to be seen as live */
+#define O2HB_LIVE_THRESHOLD	   2
+/* number of equal samples to be seen as dead */
+extern unsigned int o2hb_dead_threshold;
+#define O2HB_DEFAULT_DEAD_THRESHOLD	   7
+/* Otherwise MAX_WRITE_TIMEOUT will be zero... */
+#define O2HB_MIN_DEAD_THRESHOLD	  2
+#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))
+
+#define O2HB_CB_MAGIC		0x51d1e4ec
+
+/* callback stuff */
+enum o2hb_callback_type {
+	O2HB_NODE_DOWN_CB = 0,
+	O2HB_NODE_UP_CB,
+	O2HB_NUM_CB
+};
+
+struct o2nm_node;
+typedef void (o2hb_cb_func)(struct o2nm_node *, int, void *);
+
+struct o2hb_callback_func {
+	u32			hc_magic;
+	struct list_head	hc_item;
+	o2hb_cb_func		*hc_func;
+	void			*hc_data;
+	int			hc_priority;
+	enum o2hb_callback_type hc_type;
+};
+
+struct config_group *o2hb_alloc_hb_set(void);
+void o2hb_free_hb_set(struct config_group *group);
+
+void o2hb_setup_callback(struct o2hb_callback_func *hc,
+			 enum o2hb_callback_type type,
+			 o2hb_cb_func *func,
+			 void *data,
+			 int priority);
+int o2hb_register_callback(struct o2hb_callback_func *hc);
+int o2hb_unregister_callback(struct o2hb_callback_func *hc);
+void o2hb_fill_node_map(unsigned long *map,
+			unsigned bytes);
+void o2hb_init(void);
+int o2hb_check_node_heartbeating(u8 node_num);
+int o2hb_check_node_heartbeating_from_callback(u8 node_num);
+int o2hb_check_local_node_heartbeating(void);
+void o2hb_stop_all_regions(void);
+
+#endif /* O2CLUSTER_HEARTBEAT_H */
diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h
new file mode 100644
index 0000000..9409606
--- /dev/null
+++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h
@@ -0,0 +1,37 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_heartbeat.h
+ *
+ * On-disk structures for ocfs2_heartbeat
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef _OCFS2_HEARTBEAT_H
+#define _OCFS2_HEARTBEAT_H
+
+struct o2hb_disk_heartbeat_block {
+	__le64 hb_seq;
+	__u8  hb_node;
+	__u8  hb_pad1[3];
+	__le32 hb_cksum;
+	__le64 hb_generation;
+};
+
+#endif /* _OCFS2_HEARTBEAT_H */
-- 
cgit v1.1


From 98211489d4147e41b11703e4245846d60b3acce4 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Thu, 15 Dec 2005 14:31:23 -0800
Subject: [PATCH] OCFS2: The Second Oracle Cluster Filesystem

Node messaging via tcp. Used by the dlm and the file system for point
to point communication between nodes.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
---
 fs/ocfs2/cluster/quorum.c       |  315 +++++++
 fs/ocfs2/cluster/quorum.h       |   36 +
 fs/ocfs2/cluster/sys.c          |  124 +++
 fs/ocfs2/cluster/sys.h          |   33 +
 fs/ocfs2/cluster/tcp.c          | 1829 +++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/cluster/tcp.h          |  113 +++
 fs/ocfs2/cluster/tcp_internal.h |  174 ++++
 7 files changed, 2624 insertions(+)
 create mode 100644 fs/ocfs2/cluster/quorum.c
 create mode 100644 fs/ocfs2/cluster/quorum.h
 create mode 100644 fs/ocfs2/cluster/sys.c
 create mode 100644 fs/ocfs2/cluster/sys.h
 create mode 100644 fs/ocfs2/cluster/tcp.c
 create mode 100644 fs/ocfs2/cluster/tcp.h
 create mode 100644 fs/ocfs2/cluster/tcp_internal.h

diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
new file mode 100644
index 0000000..7bba98f
--- /dev/null
+++ b/fs/ocfs2/cluster/quorum.c
@@ -0,0 +1,315 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ *
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/* This quorum hack is only here until we transition to some more rational
+ * approach that is driven from userspace.  Honest.  No foolin'.
+ *
+ * Imagine two nodes lose network connectivity to each other but they're still
+ * up and operating in every other way.  Presumably a network timeout indicates
+ * that a node is broken and should be recovered.  They can't both recover each
+ * other and both carry on without serialising their access to the file system.
+ * They need to decide who is authoritative.  Now extend that problem to
+ * arbitrary groups of nodes losing connectivity between each other.
+ *
+ * So we declare that a node which has given up on connecting to a majority
+ * of nodes who are still heartbeating will fence itself.
+ *
+ * There are huge opportunities for races here.  After we give up on a node's
+ * connection we need to wait long enough to give heartbeat an opportunity
+ * to declare the node as truly dead.  We also need to be careful with the
+ * race between when we see a node start heartbeating and when we connect
+ * to it.
+ *
+ * So nodes that are in this transition put a hold on the quorum decision
+ * with a counter.  As they fall out of this transition they drop the count
+ * and if they're the last, they fire off the decision.
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include "heartbeat.h"
+#include "nodemanager.h"
+#define MLOG_MASK_PREFIX ML_QUORUM
+#include "masklog.h"
+#include "quorum.h"
+
+static struct o2quo_state {
+	spinlock_t		qs_lock;
+	struct work_struct	qs_work;
+	int			qs_pending;
+	int			qs_heartbeating;
+	unsigned long		qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int			qs_connected;
+	unsigned long		qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int			qs_holds;
+	unsigned long		qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+} o2quo_state;
+
+/* this is horribly heavy-handed.  It should instead flip the file
+ * system RO and call some userspace script. */
+static void o2quo_fence_self(void)
+{
+	/* panic spins with interrupts enabled.  with preempt
+	 * threads can still schedule, etc, etc */
+	o2hb_stop_all_regions();
+	panic("ocfs2 is very sorry to be fencing this system by panicing\n");
+}
+
+/* Indicate that a timeout occurred on a heartbeat region write. The
+ * other nodes in the cluster may consider us dead at that time so we
+ * want to "fence" ourselves so that we don't scribble on the disk
+ * after they think they've recovered us. This can't solve all
+ * problems related to writeout after recovery but this hack can at
+ * least close some of those gaps. When we have real fencing, this can
+ * go away as our node would be fenced externally before other nodes
+ * begin recovery. */
+void o2quo_disk_timeout(void)
+{
+	o2quo_fence_self();
+}
+
+static void o2quo_make_decision(void *arg)
+{
+	int quorum;
+	int lowest_hb, lowest_reachable = 0, fence = 0;
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
+	if (lowest_hb != O2NM_MAX_NODES)
+		lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm);
+
+	mlog(0, "heartbeating: %d, connected: %d, "
+	     "lowest: %d (%sreachable)\n", qs->qs_heartbeating,
+	     qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un");
+
+	if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) ||
+	    qs->qs_heartbeating == 1)
+		goto out;
+
+	if (qs->qs_heartbeating & 1) {
+		/* the odd numbered cluster case is straightforward --
+		 * if we can't talk to the majority we're hosed */
+		quorum = (qs->qs_heartbeating + 1)/2;
+		if (qs->qs_connected < quorum) {
+			mlog(ML_ERROR, "fencing this node because it is "
+			     "only connected to %u nodes and %u is needed "
+			     "to make a quorum out of %u heartbeating nodes\n",
+			     qs->qs_connected, quorum,
+			     qs->qs_heartbeating);
+			fence = 1;
+		}
+	} else {
+		/* the even numbered cluster adds the possibility of each half
+		 * of the cluster being able to talk amongst themselves.. in
+		 * that case we're hosed if we can't talk to the group that has
+		 * the lowest numbered node */
+		quorum = qs->qs_heartbeating / 2;
+		if (qs->qs_connected < quorum) {
+			mlog(ML_ERROR, "fencing this node because it is "
+			     "only connected to %u nodes and %u is needed "
+			     "to make a quorum out of %u heartbeating nodes\n",
+			     qs->qs_connected, quorum,
+			     qs->qs_heartbeating);
+			fence = 1;
+		}
+		else if ((qs->qs_connected == quorum) &&
+			 !lowest_reachable) {
+			mlog(ML_ERROR, "fencing this node because it is "
+			     "connected to a half-quorum of %u out of %u "
+			     "nodes which doesn't include the lowest active "
+			     "node %u\n", quorum, qs->qs_heartbeating,
+			     lowest_hb);
+			fence = 1;
+		}
+	}
+
+out:
+	spin_unlock(&qs->qs_lock);
+	if (fence)
+		o2quo_fence_self();
+}
+
+static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
+{
+	assert_spin_locked(&qs->qs_lock);
+
+	if (!test_and_set_bit(node, qs->qs_hold_bm)) {
+		qs->qs_holds++;
+		mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES,
+			        "node %u\n", node);
+		mlog(0, "node %u, %d total\n", node, qs->qs_holds);
+	}
+}
+
+static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
+{
+	assert_spin_locked(&qs->qs_lock);
+
+	if (test_and_clear_bit(node, qs->qs_hold_bm)) {
+		mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1);
+		if (--qs->qs_holds == 0) {
+			if (qs->qs_pending) {
+				qs->qs_pending = 0;
+				schedule_work(&qs->qs_work);
+			}
+		}
+		mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n",
+				node, qs->qs_holds);
+	}
+}
+
+/* as a node comes up we delay the quorum decision until we know the fate of
+ * the connection.  the hold will be dropped in conn_up or hb_down.  it might be
+ * perpetuated by conn_err until hb_down.  if we already have a conn, we might
+ * be dropping a hold that conn_up got. */
+void o2quo_hb_up(u8 node)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	qs->qs_heartbeating++;
+	mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES,
+		        "node %u\n", node);
+	mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node);
+	set_bit(node, qs->qs_hb_bm);
+
+	mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
+
+	if (!test_bit(node, qs->qs_conn_bm))
+		o2quo_set_hold(qs, node);
+	else
+		o2quo_clear_hold(qs, node);
+
+	spin_unlock(&qs->qs_lock);
+}
+
+/* hb going down releases any holds we might have had due to this node from
+ * conn_up, conn_err, or hb_up */
+void o2quo_hb_down(u8 node)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	qs->qs_heartbeating--;
+	mlog_bug_on_msg(qs->qs_heartbeating < 0,
+			"node %u, %d heartbeating\n",
+			node, qs->qs_heartbeating);
+	mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node);
+	clear_bit(node, qs->qs_hb_bm);
+
+	mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
+
+	o2quo_clear_hold(qs, node);
+
+	spin_unlock(&qs->qs_lock);
+}
+
+/* this tells us that we've decided that the node is still heartbeating
+ * even though we've lost its conn.  it must only be called after conn_err
+ * and indicates that we must now make a quorum decision in the future,
+ * though we might be doing so after waiting for holds to drain.  Here
+ * we'll be dropping the hold from conn_err. */
+void o2quo_hb_still_up(u8 node)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	mlog(0, "node %u\n", node);
+
+	qs->qs_pending = 1;
+	o2quo_clear_hold(qs, node);
+
+	spin_unlock(&qs->qs_lock);
+}
+
+/* This is analogous to hb_up.  as a node's connection comes up we delay the
+ * quorum decision until we see it heartbeating.  the hold will be dropped in
+ * hb_up or hb_down.  it might be perpetuated by conn_err until hb_down.  if
+ * it's already heartbeating we might be dropping a hold that hb_up got.
+ * */
+void o2quo_conn_up(u8 node)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	qs->qs_connected++;
+	mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES,
+		        "node %u\n", node);
+	mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node);
+	set_bit(node, qs->qs_conn_bm);
+
+	mlog(0, "node %u, %d total\n", node, qs->qs_connected);
+
+	if (!test_bit(node, qs->qs_hb_bm))
+		o2quo_set_hold(qs, node);
+	else
+		o2quo_clear_hold(qs, node);
+
+	spin_unlock(&qs->qs_lock);
+}
+
+/* we've decided that we won't ever be connecting to the node again.  if it's
+ * still heartbeating we grab a hold that will delay decisions until either the
+ * node stops heartbeating from hb_down or the caller decides that the node is
+ * still up and calls still_up */
+void o2quo_conn_err(u8 node)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock(&qs->qs_lock);
+
+	if (test_bit(node, qs->qs_conn_bm)) {
+		qs->qs_connected--;
+		mlog_bug_on_msg(qs->qs_connected < 0,
+				"node %u, connected %d\n",
+				node, qs->qs_connected);
+
+		clear_bit(node, qs->qs_conn_bm);
+	}
+
+	mlog(0, "node %u, %d total\n", node, qs->qs_connected);
+
+	if (test_bit(node, qs->qs_hb_bm))
+		o2quo_set_hold(qs, node);
+
+	spin_unlock(&qs->qs_lock);
+}
+
+void o2quo_init(void)
+{
+	struct o2quo_state *qs = &o2quo_state;
+
+	spin_lock_init(&qs->qs_lock);
+	INIT_WORK(&qs->qs_work, o2quo_make_decision, NULL);
+}
+
+void o2quo_exit(void)
+{
+	flush_scheduled_work();
+}
diff --git a/fs/ocfs2/cluster/quorum.h b/fs/ocfs2/cluster/quorum.h
new file mode 100644
index 0000000..6649cc6
--- /dev/null
+++ b/fs/ocfs2/cluster/quorum.h
@@ -0,0 +1,36 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_QUORUM_H
+#define O2CLUSTER_QUORUM_H
+
+void o2quo_init(void);
+void o2quo_exit(void);
+
+void o2quo_hb_up(u8 node);
+void o2quo_hb_down(u8 node);
+void o2quo_hb_still_up(u8 node);
+void o2quo_conn_up(u8 node);
+void o2quo_conn_err(u8 node);
+void o2quo_disk_timeout(void);
+
+#endif /* O2CLUSTER_QUORUM_H */
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
new file mode 100644
index 0000000..f1e9946
--- /dev/null
+++ b/fs/ocfs2/cluster/sys.c
@@ -0,0 +1,124 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * sys.c
+ *
+ * OCFS2 cluster sysfs interface
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation,
+ * version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+
+#include "ocfs2_nodemanager.h"
+#include "masklog.h"
+#include "sys.h"
+
+struct o2cb_attribute {
+	struct attribute	attr;
+	ssize_t (*show)(char *buf);
+	ssize_t (*store)(const char *buf, size_t count);
+};
+
+#define O2CB_ATTR(_name, _mode, _show, _store)	\
+struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store)
+
+#define to_o2cb_subsys(k) container_of(to_kset(k), struct subsystem, kset)
+#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr)
+
+static ssize_t o2cb_interface_revision_show(char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
+}
+
+O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL);
+
+static struct attribute *o2cb_attrs[] = {
+	&o2cb_attr_interface_revision.attr,
+	NULL,
+};
+
+static ssize_t
+o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer);
+static ssize_t
+o2cb_store(struct kobject * kobj, struct attribute * attr,
+	   const char * buffer, size_t count);
+static struct sysfs_ops o2cb_sysfs_ops = {
+	.show	= o2cb_show,
+	.store	= o2cb_store,
+};
+
+static struct kobj_type o2cb_subsys_type = {
+	.default_attrs	= o2cb_attrs,
+	.sysfs_ops	= &o2cb_sysfs_ops,
+};
+
+/* gives us o2cb_subsys */
+decl_subsys(o2cb, NULL, NULL);
+
+static ssize_t
+o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
+{
+	struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
+	struct subsystem *sbs = to_o2cb_subsys(kobj);
+
+	BUG_ON(sbs != &o2cb_subsys);
+
+	if (o2cb_attr->show)
+		return o2cb_attr->show(buffer);
+	return -EIO;
+}
+
+static ssize_t
+o2cb_store(struct kobject * kobj, struct attribute * attr,
+	     const char * buffer, size_t count)
+{
+	struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
+	struct subsystem *sbs = to_o2cb_subsys(kobj);
+
+	BUG_ON(sbs != &o2cb_subsys);
+
+	if (o2cb_attr->store)
+		return o2cb_attr->store(buffer, count);
+	return -EIO;
+}
+
+void o2cb_sys_shutdown(void)
+{
+	mlog_sys_shutdown();
+	subsystem_unregister(&o2cb_subsys);
+}
+
+int o2cb_sys_init(void)
+{
+	int ret;
+
+	o2cb_subsys.kset.kobj.ktype = &o2cb_subsys_type;
+	ret = subsystem_register(&o2cb_subsys);
+	if (ret)
+		return ret;
+
+	ret = mlog_sys_init(&o2cb_subsys);
+	if (ret)
+		subsystem_unregister(&o2cb_subsys);
+	return ret;
+}
diff --git a/fs/ocfs2/cluster/sys.h b/fs/ocfs2/cluster/sys.h
new file mode 100644
index 0000000..d66b8ab
--- /dev/null
+++ b/fs/ocfs2/cluster/sys.h
@@ -0,0 +1,33 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * sys.h
+ *
+ * Function prototypes for o2cb sysfs interface
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation,
+ * version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_SYS_H
+#define O2CLUSTER_SYS_H
+
+void o2cb_sys_shutdown(void);
+int o2cb_sys_init(void);
+
+#endif /* O2CLUSTER_SYS_H */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
new file mode 100644
index 0000000..35d92c0
--- /dev/null
+++ b/fs/ocfs2/cluster/tcp.c
@@ -0,0 +1,1829 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ *
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * ----
+ *
+ * Callers for this were originally written against a very simple synchronous
+ * API.  This implementation reflects those simple callers.  Some day I'm sure
+ * we'll need to move to a more robust posting/callback mechanism.
+ *
+ * Transmit calls pass in kernel virtual addresses and block copying this into
+ * the socket's tx buffers via a usual blocking sendmsg.  They'll block waiting
+ * for a failed socket to timeout.  TX callers can also pass in a pointer to an
+ * 'int' which gets filled with an errno off the wire in response to the
+ * message they send.
+ *
+ * Handlers for unsolicited messages are registered.  Each socket has a page
+ * that incoming data is copied into.  First the header, then the data.
+ * Handlers are called from only one thread with a reference to this per-socket
+ * page.  This page is destroyed after the handler call, so it can't be
+ * referenced beyond the call.  Handlers may block but are discouraged from
+ * doing so.
+ *
+ * Any framing errors (bad magic, large payload lengths) close a connection.
+ *
+ * Our sock_container holds the state we associate with a socket.  Its current
+ * framing state is held there as well as the refcounting we do around when it
+ * is safe to tear down the socket.  The socket is only finally torn down from
+ * the container when the container loses all of its references -- so as long
+ * as you hold a ref on the container you can trust that the socket is valid
+ * for use with kernel socket APIs.
+ *
+ * Connections are initiated between a pair of nodes when the node with the
+ * higher node number gets a heartbeat callback which indicates that the lower
+ * numbered node has started heartbeating.  The lower numbered node is passive
+ * and only accepts the connection if the higher numbered node is heartbeating.
+ */
+
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/kref.h>
+#include <net/tcp.h>
+
+#include <asm/uaccess.h>
+
+#include "heartbeat.h"
+#include "tcp.h"
+#include "nodemanager.h"
+#define MLOG_MASK_PREFIX ML_TCP
+#include "masklog.h"
+#include "quorum.h"
+
+#include "tcp_internal.h"
+
+/* 
+ * The linux network stack isn't sparse endian clean.. It has macros like
+ * ntohs() which perform the endian checks and structs like sockaddr_in
+ * which aren't annotated.  So __force is found here to get the build
+ * clean.  When they emerge from the dark ages and annotate the code
+ * we can remove these.
+ */
+/* printf format and matching args for an sc's node: name, num, ipv4:port */
+#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u"
+#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num,	\
+			  NIPQUAD(sc->sc_node->nd_ipv4_address),	\
+			  ntohs(sc->sc_node->nd_ipv4_port)
+
+/*
+ * msglog()/sclog() prefix a log line with header fields or sc state.  The
+ * whitespace after the ',' just before ##args is intentional. Otherwise,
+ * gcc 2.95 will eat the previous token if args expands to nothing.
+ */
+#define msglog(hdr, fmt, args...) do {					\
+	typeof(hdr) __hdr = (hdr);					\
+	mlog(ML_MSG, "[mag %u len %u typ %u stat %d sys_stat %d "	\
+	     "key %08x num %u] " fmt,					\
+	     be16_to_cpu(__hdr->magic), be16_to_cpu(__hdr->data_len), 	\
+	     be16_to_cpu(__hdr->msg_type), be32_to_cpu(__hdr->status),	\
+	     be32_to_cpu(__hdr->sys_status), be32_to_cpu(__hdr->key),	\
+	     be32_to_cpu(__hdr->msg_num) ,  ##args);			\
+} while (0)
+
+#define sclog(sc, fmt, args...) do {					\
+	typeof(sc) __sc = (sc);						\
+	mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p "	\
+	     "pg_off %zu] " fmt, __sc,					\
+	     atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock,	\
+	    __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off ,	\
+	    ##args);							\
+} while (0)
+/* o2net_handler_lock guards o2net_handler_tree and the unregister lists */
+static rwlock_t o2net_handler_lock = RW_LOCK_UNLOCKED;
+static struct rb_root o2net_handler_tree = RB_ROOT;
+/* one slot per possible node number; see o2net_nn_from_num() */
+static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
+
+/* XXX someday we'll need better accounting */
+static struct socket *o2net_listen_sock = NULL;
+
+/*
+ * listen work is only queued by the listening socket callbacks on the
+ * o2net_wq.  teardown detaches the callbacks before destroying the workqueue.
+ * quorum work is queued as sock containers are shutdown.. stop_listening
+ * tears down all the node's sock containers, preventing future shutdowns
+ * and queued quorum work, before canceling delayed quorum work and
+ * destroying the work queue.
+ */
+static struct workqueue_struct *o2net_wq;
+static struct work_struct o2net_listen_work;
+
+static struct o2hb_callback_func o2net_hb_up, o2net_hb_down;
+#define O2NET_HB_PRI 0x1
+
+static struct o2net_handshake *o2net_hand;
+static struct o2net_msg *o2net_keep_req, *o2net_keep_resp;
+/* indexed by enum o2net_system_error; see o2net_sys_err_to_errno() */
+static int o2net_sys_err_translations[O2NET_ERR_MAX] =
+		{[O2NET_ERR_NONE]	= 0,
+		 [O2NET_ERR_NO_HNDLR]	= -ENOPROTOOPT,
+		 [O2NET_ERR_OVERFLOW]	= -EOVERFLOW,
+		 [O2NET_ERR_DIED]	= -EHOSTDOWN,};
+
+/* can't quite avoid *all* internal declarations :/ */
+static void o2net_sc_connect_completed(void *arg);
+static void o2net_rx_until_empty(void *arg);
+static void o2net_shutdown_sc(void *arg);
+static void o2net_listen_data_ready(struct sock *sk, int bytes);
+static void o2net_sc_send_keep_req(void *arg);
+static void o2net_idle_timer(unsigned long data);
+static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
+
+static inline int o2net_sys_err_to_errno(enum o2net_system_error err)
+{
+	int trans;
+	BUG_ON(err >= O2NET_ERR_MAX);
+	trans = o2net_sys_err_translations[err];
+
+	/* Just in case we mess up the translation table above */
+	BUG_ON(err != O2NET_ERR_NONE && trans == 0);
+	return trans;
+}
+/* node number -> its slot in o2net_nodes[]; BUGs on an out of range number */
+static struct o2net_node * o2net_nn_from_num(u8 node_num)
+{
+	BUG_ON(node_num >= ARRAY_SIZE(o2net_nodes));
+	return &o2net_nodes[node_num];
+}
+/* inverse of o2net_nn_from_num(): nn's index within o2net_nodes[] */
+static u8 o2net_num_from_nn(struct o2net_node *nn)
+{
+	BUG_ON(nn == NULL);
+	return nn - o2net_nodes;
+}
+
+/* ------------------------------------------------------------ */
+/* give nsw an id in nn's idr and queue it on nn_status_list; 0 on success */
+static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw)
+{
+	int ret = 0;
+
+	/* init before the nsw is visible on the list: a racing completion
+	 * must find a usable waitqueue and must not have its status reset */
+	init_waitqueue_head(&nsw->ns_wq);
+	nsw->ns_sys_status = O2NET_ERR_NONE;
+	nsw->ns_status = 0;
+
+	do {
+		if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) {
+			ret = -EAGAIN;
+			break;
+		}
+		spin_lock(&nn->nn_lock);
+		ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id);
+		if (ret == 0)
+			list_add_tail(&nsw->ns_node_item,
+				      &nn->nn_status_list);
+		spin_unlock(&nn->nn_lock);
+	} while (ret == -EAGAIN);
+
+	return ret;
+}
+/* complete nsw at most once: unlink, store both statuses, free id, wake */
+static void o2net_complete_nsw_locked(struct o2net_node *nn,
+				      struct o2net_status_wait *nsw,
+				      enum o2net_system_error sys_status,
+				      s32 status)
+{
+	assert_spin_locked(&nn->nn_lock);
+	/* an empty ns_node_item means it was never queued or already done */
+	if (!list_empty(&nsw->ns_node_item)) {
+		list_del_init(&nsw->ns_node_item);
+		nsw->ns_sys_status = sys_status;
+		nsw->ns_status = status;
+		idr_remove(&nn->nn_status_idr, nsw->ns_id);
+		wake_up(&nsw->ns_wq);
+	}
+}
+/* complete nsw; when nsw is NULL, find it by id in nn's idr (if present) */
+static void o2net_complete_nsw(struct o2net_node *nn,
+			       struct o2net_status_wait *nsw,
+			       u64 id, enum o2net_system_error sys_status,
+			       s32 status)
+{
+	spin_lock(&nn->nn_lock);
+	if (nsw == NULL) {
+		if (id > INT_MAX)
+			goto out;
+		/* an id that is no longer in the idr is silently ignored */
+		nsw = idr_find(&nn->nn_status_idr, id);
+		if (nsw == NULL)
+			goto out;
+	}
+
+	o2net_complete_nsw_locked(nn, nsw, sys_status, status);
+
+out:
+	spin_unlock(&nn->nn_lock);
+	return;
+}
+/* fail every status wait still queued on nn with O2NET_ERR_DIED */
+static void o2net_complete_nodes_nsw(struct o2net_node *nn)
+{
+	struct list_head *iter, *tmp;
+	unsigned int num_kills = 0;
+	struct o2net_status_wait *nsw;
+
+	assert_spin_locked(&nn->nn_lock);
+	/* _safe iteration: completing an nsw unlinks it from this list */
+	list_for_each_safe(iter, tmp, &nn->nn_status_list) {
+		nsw = list_entry(iter, struct o2net_status_wait, ns_node_item);
+		o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0);
+		num_kills++;
+	}
+
+	mlog(0, "completed %d messages for node %u\n", num_kills,
+	     o2net_num_from_nn(nn));
+}
+/* wait_event() condition: true once nsw has been taken off nn_status_list */
+static int o2net_nsw_completed(struct o2net_node *nn,
+			       struct o2net_status_wait *nsw)
+{
+	int done;
+	spin_lock(&nn->nn_lock);
+	done = list_empty(&nsw->ns_node_item);
+	spin_unlock(&nn->nn_lock);
+	return done;
+}
+
+/* ------------------------------------------------------------ */
+/* last ref dropped: release the socket, node ref, rx page and the sc itself */
+static void sc_kref_release(struct kref *kref)
+{
+	struct o2net_sock_container *sc = container_of(kref,
+					struct o2net_sock_container, sc_kref);
+	sclog(sc, "releasing\n");
+
+	if (sc->sc_sock) {
+		sock_release(sc->sc_sock);
+		sc->sc_sock = NULL;
+	}
+	o2nm_node_put(sc->sc_node);
+	sc->sc_node = NULL;
+	/* the rx page from sc_alloc() belongs to the sc; don't leak it */
+	if (sc->sc_page)
+		__free_page(sc->sc_page);
+	kfree(sc);
+}
+/* sc_put()/sc_get() log the sc, then drop/take one reference on sc_kref */
+static void sc_put(struct o2net_sock_container *sc)
+{
+	sclog(sc, "put\n");
+	kref_put(&sc->sc_kref, sc_kref_release);
+}
+static void sc_get(struct o2net_sock_container *sc)
+{
+	sclog(sc, "get\n");
+	kref_get(&sc->sc_kref);
+}
+static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
+{
+	struct o2net_sock_container *sc, *ret = NULL;
+	struct page *page = NULL;
+	/* returns a new sc with its kref and a node ref, or NULL on failure */
+	page = alloc_page(GFP_NOFS);
+	sc = kcalloc(1, sizeof(*sc), GFP_NOFS);
+	if (sc == NULL || page == NULL)
+		goto out;
+
+	kref_init(&sc->sc_kref);
+	o2nm_node_get(node);
+	sc->sc_node = node;
+
+	INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed, sc);
+	INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty, sc);
+	INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc, sc);
+	INIT_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req, sc);
+
+	init_timer(&sc->sc_idle_timeout);
+	sc->sc_idle_timeout.function = o2net_idle_timer;
+	sc->sc_idle_timeout.data = (unsigned long)sc;
+
+	sclog(sc, "alloced\n");
+	/* hand both allocations to ret so the cleanup below frees nothing */
+	ret = sc;
+	sc->sc_page = page;
+	sc = NULL;
+	page = NULL;
+
+out:
+	if (page)
+		__free_page(page);
+	kfree(sc);
+
+	return ret;
+}
+
+/* ------------------------------------------------------------ */
+/* work queued for an sc holds an sc ref; these helpers keep that balanced */
+static void o2net_sc_queue_work(struct o2net_sock_container *sc,
+				struct work_struct *work)
+{
+	sc_get(sc);
+	if (!queue_work(o2net_wq, work))
+		sc_put(sc);
+}
+static void o2net_sc_queue_delayed_work(struct o2net_sock_container *sc,
+					struct work_struct *work,
+					int delay)
+{
+	sc_get(sc);
+	if (!queue_delayed_work(o2net_wq, work, delay))
+		sc_put(sc);
+}
+static void o2net_sc_cancel_delayed_work(struct o2net_sock_container *sc,
+					 struct work_struct *work)
+{
+	if (cancel_delayed_work(work))
+		sc_put(sc);
+}
+/* move nn to a new sc/valid/err state and run the resulting side effects */
+static void o2net_set_nn_state(struct o2net_node *nn,
+			       struct o2net_sock_container *sc,
+			       unsigned valid, int err)
+{
+	int was_valid = nn->nn_sc_valid;
+	int was_err = nn->nn_persistent_error;
+	struct o2net_sock_container *old_sc = nn->nn_sc;
+
+	assert_spin_locked(&nn->nn_lock);
+
+	/* the node num comparison and single connect/accept path should stop
+	 * a non-null sc from being overwritten with another */
+	BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc);
+	mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
+	mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
+
+	/* we won't reconnect after our valid conn goes away for
+	 * this hb iteration.. here so it shows up in the logs */
+	if (was_valid && !valid && err == 0)
+		err = -ENOTCONN;
+
+	mlog(ML_CONN, "node %u sc: %p -> %p, valid %u -> %u, err %d -> %d\n",
+	     o2net_num_from_nn(nn), nn->nn_sc, sc, nn->nn_sc_valid, valid,
+	     nn->nn_persistent_error, err);
+
+	nn->nn_sc = sc;
+	nn->nn_sc_valid = valid ? 1 : 0;
+	nn->nn_persistent_error = err;
+
+	/* mirrors o2net_tx_can_proceed() */
+	if (nn->nn_persistent_error || nn->nn_sc_valid)
+		wake_up(&nn->nn_sc_wq);
+
+	if (!was_err && nn->nn_persistent_error) {
+		o2quo_conn_err(o2net_num_from_nn(nn));
+		queue_delayed_work(o2net_wq, &nn->nn_still_up,
+				   msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
+	}
+	/* a lost valid connection fails every message still awaiting status */
+	if (was_valid && !valid) {
+		mlog(ML_NOTICE, "no longer connected to " SC_NODEF_FMT "\n",
+		     SC_NODEF_ARGS(old_sc));
+		o2net_complete_nodes_nsw(nn);
+	}
+
+	if (!was_valid && valid) {
+		o2quo_conn_up(o2net_num_from_nn(nn));
+		/* this is a bit of a hack.  we only try reconnecting
+		 * when heartbeating starts until we get a connection.
+		 * if that connection then dies we don't try reconnecting.
+		 * the only way to start connecting again is to down
+		 * heartbeat and bring it back up. */
+		cancel_delayed_work(&nn->nn_connect_expired);
+		mlog(ML_NOTICE, "%s " SC_NODEF_FMT "\n", 
+		     o2nm_this_node() > sc->sc_node->nd_num ?
+		     	"connected to" : "accepted connection from",
+		     SC_NODEF_ARGS(sc));
+	}
+
+	/* trigger the connecting worker func as long as we're not valid,
+	 * it will back off if it shouldn't connect.  This can be called
+	 * from node config teardown and so needs to be careful about
+	 * the work queue actually being up. */
+	if (!valid && o2net_wq) {
+		unsigned long delay;
+		/* delay if we're within a RECONNECT_DELAY of the
+		 * last attempt */
+		delay = (nn->nn_last_connect_attempt +
+			 msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS))
+			- jiffies;
+		if (delay > msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS))
+			delay = 0;
+		mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
+		queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
+	}
+
+	/* keep track of the nn's sc ref for the caller */
+	if ((old_sc == NULL) && sc)
+		sc_get(sc);
+	if (old_sc && (old_sc != sc)) {
+		o2net_sc_queue_work(old_sc, &old_sc->sc_shutdown_work);
+		sc_put(old_sc);
+	}
+}
+/* sk_data_ready hook installed by o2net_register_callbacks(): queue rx work
+ * for the sc, then chain to the data_ready callback that was saved there */
+static void o2net_data_ready(struct sock *sk, int bytes)
+{
+	void (*ready)(struct sock *sk, int bytes);
+
+	read_lock(&sk->sk_callback_lock);
+	if (sk->sk_user_data) {
+		struct o2net_sock_container *sc = sk->sk_user_data;
+		sclog(sc, "data_ready hit\n");
+		do_gettimeofday(&sc->sc_tv_data_ready);
+		o2net_sc_queue_work(sc, &sc->sc_rx_work);
+		ready = sc->sc_data_ready;
+	} else {
+		ready = sk->sk_data_ready;
+	}
+	read_unlock(&sk->sk_callback_lock);
+	/* the chained callback runs after sk_callback_lock is dropped */
+	ready(sk, bytes);
+}
+/* sk_state_change hook installed by o2net_register_callbacks(): queue connect
+ * or shutdown work for the sc's new TCP state, then chain to the saved hook */
+static void o2net_state_change(struct sock *sk)
+{
+	void (*state_change)(struct sock *sk);
+	struct o2net_sock_container *sc;
+
+	read_lock(&sk->sk_callback_lock);
+	sc = sk->sk_user_data;
+	if (sc == NULL) {
+		state_change = sk->sk_state_change;
+		goto out;
+	}
+
+	sclog(sc, "state_change to %d\n", sk->sk_state);
+
+	state_change = sc->sc_state_change;
+
+	switch(sk->sk_state) {
+		/* ignore connecting sockets as they make progress */
+		case TCP_SYN_SENT:
+		case TCP_SYN_RECV:
+			break;
+		case TCP_ESTABLISHED:
+			o2net_sc_queue_work(sc, &sc->sc_connect_work);
+			break;
+		default:
+			o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
+			break;
+	}
+out:
+	read_unlock(&sk->sk_callback_lock);
+	state_change(sk);
+}
+
+/*
+ * we register callbacks so we can queue work on events before calling
+ * the original callbacks.  our callbacks are careful to test user_data
+ * to discover when they've raced with o2net_unregister_callbacks().
+ */
+static void o2net_register_callbacks(struct sock *sk,
+				     struct o2net_sock_container *sc)
+{
+	write_lock_bh(&sk->sk_callback_lock);
+
+	/* accepted sockets inherit the old listen socket data ready */
+	if (sk->sk_data_ready == o2net_listen_data_ready) {
+		sk->sk_data_ready = sk->sk_user_data;
+		sk->sk_user_data = NULL;
+	}
+	/* sk_user_data holds an sc ref until o2net_shutdown_sc() drops it */
+	BUG_ON(sk->sk_user_data != NULL);
+	sk->sk_user_data = sc;
+	sc_get(sc);
+
+	sc->sc_data_ready = sk->sk_data_ready;
+	sc->sc_state_change = sk->sk_state_change;
+	sk->sk_data_ready = o2net_data_ready;
+	sk->sk_state_change = o2net_state_change;
+
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+/* restore sk's saved callbacks if sc still owns them; returns 1 if it did */
+static int o2net_unregister_callbacks(struct sock *sk,
+			           struct o2net_sock_container *sc)
+{
+	int ret = 0;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	if (sk->sk_user_data == sc) {
+		ret = 1;
+		sk->sk_user_data = NULL;
+		sk->sk_data_ready = sc->sc_data_ready;
+		sk->sk_state_change = sc->sc_state_change;
+	}
+	write_unlock_bh(&sk->sk_callback_lock);
+
+	return ret;
+}
+
+/*
+ * this is a little helper that is called by callers who have seen a problem
+ * with an sc and want to detach it from the nn if someone hasn't already beat
+ * them to it.  a nonzero err is stored as the nn's persistent error by
+ * o2net_set_nn_state(), which makes waiting transmits fail with it.
+ */
+static void o2net_ensure_shutdown(struct o2net_node *nn,
+			           struct o2net_sock_container *sc,
+				   int err)
+{
+	spin_lock(&nn->nn_lock);
+	if (nn->nn_sc == sc)
+		o2net_set_nn_state(nn, NULL, 0, err);
+	spin_unlock(&nn->nn_lock);
+}
+
+/*
+ * This work queue function performs the blocking parts of socket shutdown.  A
+ * few paths lead here.  set_nn_state will trigger this callback if it sees an
+ * sc detached from the nn.  state_change will also trigger this callback
+ * directly when it sees errors.  In that case we need to call set_nn_state
+ * ourselves as state_change couldn't get the nn_lock and call set_nn_state
+ * itself.
+ */
+static void o2net_shutdown_sc(void *arg)
+{
+	struct o2net_sock_container *sc = arg;
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+
+	sclog(sc, "shutting down\n");
+
+	/* drop the callbacks ref and call shutdown only once */
+	if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) {
+		/* we shouldn't flush as we're in the thread, the
+		 * races with pending sc work structs are harmless */
+		del_timer_sync(&sc->sc_idle_timeout);
+		o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
+		sc_put(sc);
+		sc->sc_sock->ops->shutdown(sc->sc_sock,
+					   RCV_SHUTDOWN|SEND_SHUTDOWN);
+	}
+	/* this put pairs with the sc_get() in o2net_sc_queue_work() */
+	/* not fatal so failed connects before the other guy has our
+	 * heartbeat can be retried */
+	o2net_ensure_shutdown(nn, sc, 0);
+	sc_put(sc);
+}
+
+/* ------------------------------------------------------------ */
+/* order handlers by key, then by message type (raw memcmp of the u32s) */
+static int o2net_handler_cmp(struct o2net_msg_handler *nmh, u32 msg_type,
+			     u32 key)
+{
+	int cmp = memcmp(&nmh->nh_key, &key, sizeof(key));
+
+	if (cmp != 0)
+		return cmp;
+
+	return memcmp(&nmh->nh_msg_type, &msg_type, sizeof(msg_type));
+}
+/* rb lookup by (msg_type, key); optionally reports the link slot and parent */
+static struct o2net_msg_handler *
+o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p,
+			  struct rb_node **ret_parent)
+{
+        struct rb_node **p = &o2net_handler_tree.rb_node;
+        struct rb_node *parent = NULL;
+	struct o2net_msg_handler *nmh, *ret = NULL;
+	int cmp;
+
+        while (*p) {
+                parent = *p;
+                nmh = rb_entry(parent, struct o2net_msg_handler, nh_node);
+		cmp = o2net_handler_cmp(nmh, msg_type, key);
+
+                if (cmp < 0)
+                        p = &(*p)->rb_left;
+                else if (cmp > 0)
+                        p = &(*p)->rb_right;
+                else {
+			ret = nmh;
+                        break;
+		}
+        }
+	/* on a miss these are where rb_link_node() would insert a new node */
+        if (ret_p != NULL)
+                *ret_p = p;
+        if (ret_parent != NULL)
+                *ret_parent = parent;
+
+        return ret;
+}
+/* kref release callback: frees the handler struct */
+static void o2net_handler_kref_release(struct kref *kref)
+{
+	struct o2net_msg_handler *nmh;
+	nmh = container_of(kref, struct o2net_msg_handler, nh_kref);
+
+	kfree(nmh);
+}
+/* drop one handler ref, e.g. the one taken by o2net_handler_get() */
+static void o2net_handler_put(struct o2net_msg_handler *nmh)
+{
+	kref_put(&nmh->nh_kref, o2net_handler_kref_release);
+}
+
+/* register func as the handler for incoming (msg_type, key) messages and
+ * link it on unreg_list for o2net_unregister_handler_list().  returns 0,
+ * -EINVAL for a bad argument, -ENOMEM, or -EEXIST if the pair is taken.
+ * max_len is protection for the handler func.  incoming messages won't
+ * be given to the handler if their payload is longer than the max. */
+int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
+			   o2net_msg_handler_func *func, void *data,
+			   struct list_head *unreg_list)
+{
+	struct o2net_msg_handler *nmh = NULL;
+	struct rb_node **p, *parent;
+	int ret = 0;
+
+	if (max_len > O2NET_MAX_PAYLOAD_BYTES) {
+		mlog(0, "max_len for message handler out of range: %u\n",
+			max_len);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!msg_type) {
+		mlog(0, "no message type provided: %u, %p\n", msg_type, func);
+		ret = -EINVAL;
+		goto out;
+	}
+	if (!func) {
+		mlog(0, "no message handler provided: %u, %p\n",
+		       msg_type, func);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	nmh = kcalloc(1, sizeof(struct o2net_msg_handler), GFP_NOFS);
+	if (nmh == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	nmh->nh_func = func;
+	nmh->nh_func_data = data;
+	nmh->nh_msg_type = msg_type;
+	nmh->nh_max_len = max_len;
+	nmh->nh_key = key;
+	/* the tree and list get this ref.. they're both removed in
+	 * unregister when this ref is dropped */
+	kref_init(&nmh->nh_kref);
+	INIT_LIST_HEAD(&nmh->nh_unregister_item);
+
+	write_lock(&o2net_handler_lock);
+	if (o2net_handler_tree_lookup(msg_type, key, &p, &parent))
+		ret = -EEXIST;
+	else {
+		rb_link_node(&nmh->nh_node, parent, p);
+		rb_insert_color(&nmh->nh_node, &o2net_handler_tree);
+		list_add_tail(&nmh->nh_unregister_item, unreg_list);
+
+		mlog(ML_TCP, "registered handler func %p type %u key %08x\n",
+		     func, msg_type, key);
+		/* we've had some trouble with handlers seemingly vanishing. */
+		mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p,
+							  &parent) == NULL,
+			        "couldn't find handler we *just* registerd "
+				"for type %u key %08x\n", msg_type, key);
+	}
+	write_unlock(&o2net_handler_lock);
+
+out:
+	if (ret)
+		kfree(nmh);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(o2net_register_handler);
+/* unhook every handler on list from the tree and drop its registration ref */
+void o2net_unregister_handler_list(struct list_head *list)
+{
+	struct list_head *pos, *n;
+	struct o2net_msg_handler *nmh;
+
+	write_lock(&o2net_handler_lock);
+	list_for_each_safe(pos, n, list) {
+		nmh = list_entry(pos, struct o2net_msg_handler,
+				 nh_unregister_item);
+		mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n",
+		     nmh->nh_func, nmh->nh_msg_type, nmh->nh_key);
+		rb_erase(&nmh->nh_node, &o2net_handler_tree);
+		list_del_init(&nmh->nh_unregister_item);
+		kref_put(&nmh->nh_kref, o2net_handler_kref_release);
+	}
+	write_unlock(&o2net_handler_lock);
+}
+EXPORT_SYMBOL_GPL(o2net_unregister_handler_list);
+/* look up a handler and return it with a ref held, or NULL if none */
+static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key)
+{
+	struct o2net_msg_handler *nmh;
+
+	read_lock(&o2net_handler_lock);
+	nmh = o2net_handler_tree_lookup(msg_type, key, NULL, NULL);
+	if (nmh)
+		kref_get(&nmh->nh_kref);
+	read_unlock(&o2net_handler_lock);
+
+	return nmh;
+}
+
+/* ------------------------------------------------------------ */
+/* MSG_DONTWAIT recv of up to len bytes into data; returns sock_recvmsg()'s */
+static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
+{
+	int ret;
+	mm_segment_t oldfs;
+	struct kvec vec = {
+		.iov_len = len,
+		.iov_base = data,
+	};
+	struct msghdr msg = {
+		.msg_iovlen = 1,
+		.msg_iov = (struct iovec *)&vec,
+       		.msg_flags = MSG_DONTWAIT,
+	};
+	/* NOTE(review): set_fs() presumably because vec is a kernel buffer */
+	oldfs = get_fs();
+	set_fs(get_ds());
+	ret = sock_recvmsg(sock, &msg, len, msg.msg_flags);
+	set_fs(oldfs);
+
+	return ret;
+}
+/* sendmsg the kvec array; 0 only if all 'total' bytes went out, else -errno */
+static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
+			      size_t veclen, size_t total)
+{
+	int ret;
+	mm_segment_t oldfs;
+	struct msghdr msg = {
+		.msg_iov = (struct iovec *)vec,
+		.msg_iovlen = veclen,
+	};
+
+	if (sock == NULL) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	oldfs = get_fs();
+	set_fs(get_ds());
+	ret = sock_sendmsg(sock, &msg, total);
+	set_fs(oldfs);
+	if (ret != total) {
+		mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret,
+		     total);
+		if (ret >= 0)
+			ret = -EPIPE; /* should be smarter, I bet */
+		goto out;
+	}
+	/* a complete send is reported as 0, not as the byte count */
+	ret = 0;
+out:
+	if (ret < 0)
+		mlog(0, "returning error: %d\n", ret);
+	return ret;
+}
+/* MSG_DONTWAIT sendpage of a buffer; detaches the sc if not fully sent */
+static void o2net_sendpage(struct o2net_sock_container *sc,
+			   void *kmalloced_virt,
+			   size_t size)
+{
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+	ssize_t ret;
+
+	/* page and in-page offset are derived from the virtual address */
+	ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
+					 virt_to_page(kmalloced_virt),
+					 (long)kmalloced_virt & ~PAGE_MASK,
+					 size, MSG_DONTWAIT);
+	if (ret != size) {
+		mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT 
+		     " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret);
+		o2net_ensure_shutdown(nn, sc, 0);
+	}
+}
+/* zero msg, then fill a big-endian request header; msg_num is left at 0 */
+static void o2net_init_msg(struct o2net_msg *msg, u16 data_len, u16 msg_type, u32 key)
+{
+	memset(msg, 0, sizeof(struct o2net_msg));
+	msg->magic = cpu_to_be16(O2NET_MSG_MAGIC);
+	msg->data_len = cpu_to_be16(data_len);
+	msg->msg_type = cpu_to_be16(msg_type);
+	msg->sys_status = cpu_to_be32(O2NET_ERR_NONE);
+	msg->status = 0;
+	msg->key = cpu_to_be32(key);
+}
+/* wait condition for senders: true once nn has a valid sc or a hard error */
+static int o2net_tx_can_proceed(struct o2net_node *nn,
+			        struct o2net_sock_container **sc_ret,
+				int *error)
+{
+	int ret = 0;
+
+	spin_lock(&nn->nn_lock);
+	if (nn->nn_persistent_error) {
+		ret = 1;
+		*sc_ret = NULL;
+		*error = nn->nn_persistent_error;
+	} else if (nn->nn_sc_valid) {
+		kref_get(&nn->nn_sc->sc_kref);
+		/* the sc is returned with a ref the caller must put */
+		ret = 1;
+		*sc_ret = nn->nn_sc;
+		*error = 0;
+	}
+	spin_unlock(&nn->nn_lock);
+
+	return ret;
+}
+/* send a message to target_node and sleep until its status reply arrives */
+int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
+			   size_t caller_veclen, u8 target_node, int *status)
+{
+	int ret, error = 0;
+	struct o2net_msg *msg = NULL;
+	size_t veclen, caller_bytes = 0;
+	struct kvec *vec = NULL;
+	struct o2net_sock_container *sc = NULL;
+	struct o2net_node *nn = o2net_nn_from_num(target_node);
+	struct o2net_status_wait nsw = {
+		.ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item),
+	};
+	/* returns 0 or -errno; the remote handler's status goes in *status */
+	if (o2net_wq == NULL) {
+		mlog(0, "attempt to tx without o2netd running\n");
+		ret = -ESRCH;
+		goto out;
+	}
+
+	if (caller_veclen == 0) {
+		mlog(0, "bad kvec array length\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	caller_bytes = iov_length((struct iovec *)caller_vec, caller_veclen);
+	if (caller_bytes > O2NET_MAX_PAYLOAD_BYTES) {
+		mlog(0, "total payload len %zu too large\n", caller_bytes);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (target_node == o2nm_this_node()) {
+		ret = -ELOOP;
+		goto out;
+	}
+	/* sleep until the node has a valid sc (ref returned in sc) or an error */
+	ret = wait_event_interruptible(nn->nn_sc_wq,
+				       o2net_tx_can_proceed(nn, &sc, &error));
+	if (!ret && error)
+		ret = error;
+	if (ret)
+		goto out;
+
+	veclen = caller_veclen + 1;
+	vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC);
+	if (vec == NULL) {
+		mlog(0, "failed to %zu element kvec!\n", veclen);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	msg = kmalloc(sizeof(struct o2net_msg), GFP_ATOMIC);
+	if (!msg) {
+		mlog(0, "failed to allocate a o2net_msg!\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	o2net_init_msg(msg, caller_bytes, msg_type, key);
+	/* vec[0] carries our header, the caller's kvecs follow it */
+	vec[0].iov_len = sizeof(struct o2net_msg);
+	vec[0].iov_base = msg;
+	memcpy(&vec[1], caller_vec, caller_veclen * sizeof(struct kvec));
+
+	ret = o2net_prep_nsw(nn, &nsw);
+	if (ret)
+		goto out;
+
+	msg->msg_num = cpu_to_be32(nsw.ns_id);
+
+	/* the header is already in network byte-order (o2net_init_msg() and
+	 * msg_num above), so just send */
+	ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen,
+				 sizeof(struct o2net_msg) + caller_bytes);
+	msglog(msg, "sending returned %d\n", ret);
+	if (ret < 0) {
+		mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret);
+		goto out;
+	}
+
+	/* wait on other node's handler */
+	wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
+
+	/* Note that we avoid overwriting the callers status return
+	 * variable if a system error was reported on the other
+	 * side. Callers beware. */
+	ret = o2net_sys_err_to_errno(nsw.ns_sys_status);
+	if (status && !ret)
+		*status = nsw.ns_status;
+
+	mlog(0, "woken, returning system status %d, user status %d\n",
+	     ret, nsw.ns_status);
+out:
+	if (sc)
+		sc_put(sc);
+	if (vec)
+		kfree(vec);
+	if (msg)
+		kfree(msg);
+	o2net_complete_nsw(nn, &nsw, 0, 0, 0);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(o2net_send_message_vec);
+/* single-buffer convenience wrapper around o2net_send_message_vec() */
+int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
+		       u8 target_node, int *status)
+{
+	struct kvec vec = {
+		.iov_base = data,
+		.iov_len = len,
+	};
+	return o2net_send_message_vec(msg_type, key, &vec, 1,
+				      target_node, status);
+}
+EXPORT_SYMBOL_GPL(o2net_send_message);
+/* turn the received header into a status reply and send it back on sock */
+static int o2net_send_status_magic(struct socket *sock, struct o2net_msg *hdr,
+				   enum o2net_system_error syserr, int err)
+{
+	struct kvec vec = {
+		.iov_base = hdr,
+		.iov_len = sizeof(struct o2net_msg),
+	};
+
+	BUG_ON(syserr >= O2NET_ERR_MAX);
+
+	/* leave other fields intact from the incoming message, msg_num
+	 * in particular */
+	hdr->sys_status = cpu_to_be32(syserr);
+	hdr->status = cpu_to_be32(err);
+	hdr->magic = cpu_to_be16(O2NET_MSG_STATUS_MAGIC);  // twiddle the magic
+	hdr->data_len = 0;
+
+	msglog(hdr, "about to send status magic %d\n", err);
+	/* hdr never left network byte order, so it can go out as is */
+	return o2net_send_tcp_msg(sock, &vec, 1, sizeof(struct o2net_msg));
+}
+
+/* this returns -errno if the header was unknown or too large, etc.
+ * after this is called the buffer is reused for the next message */
+static int o2net_process_message(struct o2net_sock_container *sc,
+				 struct o2net_msg *hdr)
+{
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+	int ret = 0, handler_status;
+	enum  o2net_system_error syserr;
+	struct o2net_msg_handler *nmh = NULL;
+
+	msglog(hdr, "processing message\n");
+
+	o2net_sc_postpone_idle(sc);
+
+	switch(be16_to_cpu(hdr->magic)) {
+		case O2NET_MSG_STATUS_MAGIC:
+			/* special type for returning message status */
+			o2net_complete_nsw(nn, NULL,
+					   be32_to_cpu(hdr->msg_num),
+					   be32_to_cpu(hdr->sys_status),
+					   be32_to_cpu(hdr->status));
+			goto out;
+		case O2NET_MSG_KEEP_REQ_MAGIC:
+			o2net_sendpage(sc, o2net_keep_resp,
+				       sizeof(*o2net_keep_resp));
+			goto out;
+		case O2NET_MSG_KEEP_RESP_MAGIC:
+			goto out;
+		case O2NET_MSG_MAGIC:
+			break;
+		default:
+			msglog(hdr, "bad magic\n");
+			ret = -EINVAL;
+			goto out;
+			break;
+	}
+
+	/* find a handler for it */
+	handler_status = 0;
+	nmh = o2net_handler_get(be16_to_cpu(hdr->msg_type),
+				be32_to_cpu(hdr->key));
+	if (!nmh) {
+		mlog(ML_TCP, "couldn't find handler for type %u key %08x\n",
+		     be16_to_cpu(hdr->msg_type), be32_to_cpu(hdr->key));
+		syserr = O2NET_ERR_NO_HNDLR;
+		goto out_respond;
+	}
+
+	syserr = O2NET_ERR_NONE;
+	/* enforce the max_len the handler was registered with */
+	if (be16_to_cpu(hdr->data_len) > nmh->nh_max_len)
+		syserr = O2NET_ERR_OVERFLOW;
+
+	if (syserr != O2NET_ERR_NONE)
+		goto out_respond;
+	/* the sc records the call's timestamps and the message key/type */
+	do_gettimeofday(&sc->sc_tv_func_start);
+	sc->sc_msg_key = be32_to_cpu(hdr->key);
+	sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
+	handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
+					     be16_to_cpu(hdr->data_len),
+					nmh->nh_func_data);
+	do_gettimeofday(&sc->sc_tv_func_stop);
+
+out_respond:
+	/* this destroys the hdr, so don't use it after this */
+	ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr,
+				      handler_status);
+	hdr = NULL;
+	mlog(0, "sending handler status %d, syserr %d returned %d\n",
+	     handler_status, syserr, ret);
+
+out:
+	if (nmh)
+		o2net_handler_put(nmh);
+	return ret;
+}
+/* validate the handshake at the start of sc_page; 0 if ok, -1 on mismatch */
+static int o2net_check_handshake(struct o2net_sock_container *sc)
+{
+	struct o2net_handshake *hand = page_address(sc->sc_page);
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+
+	if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) {
+		mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol "
+		     "version %llu but %llu is required, disconnecting\n",
+		     SC_NODEF_ARGS(sc),
+		     (unsigned long long)be64_to_cpu(hand->protocol_version),
+		     O2NET_PROTOCOL_VERSION);
+
+		/* don't bother reconnecting if its the wrong version. */
+		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+		return -1;
+	}
+	/* o2net_advance_rx() stops checking once this flag is set */
+	sc->sc_handshake_ok = 1;
+
+	spin_lock(&nn->nn_lock);
+	/* set valid and queue the idle timers only if it hasn't been
+	 * shut down already */
+	if (nn->nn_sc == sc) {
+		o2net_sc_postpone_idle(sc);
+		o2net_set_nn_state(nn, sc, 1, 0);
+	}
+	spin_unlock(&nn->nn_lock);
+
+	/* shift everything up as though it wasn't there */
+	sc->sc_page_off -= sizeof(struct o2net_handshake);
+	if (sc->sc_page_off)
+		memmove(hand, hand + 1, sc->sc_page_off);
+
+	return 0;
+}
+
+/* this demuxes the queued rx bytes into header or payload bits and calls
+ * handlers as each full message is read off the socket.  it returns -error,
+ * == 0 eof, or > 0 for progress made. */
+static int o2net_advance_rx(struct o2net_sock_container *sc)
+{
+	struct o2net_msg *hdr;
+	int ret = 0;
+	void *data;
+	size_t datalen;
+
+	sclog(sc, "receiving\n");
+	do_gettimeofday(&sc->sc_tv_advance_start);
+
+	/* do we need more header? */
+	if (sc->sc_page_off < sizeof(struct o2net_msg)) {
+		data = page_address(sc->sc_page) + sc->sc_page_off;
+		datalen = sizeof(struct o2net_msg) - sc->sc_page_off;
+		ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
+		if (ret > 0) {
+			sc->sc_page_off += ret;
+
+			/* this only works because the handshake is
+			 * smaller than the normal message header */
+			if (sc->sc_page_off >= sizeof(struct o2net_handshake)&&
+			    !sc->sc_handshake_ok && o2net_check_handshake(sc)) {
+				ret = -EPROTO;
+				goto out;
+			}
+
+			/* sanity check the advertised payload length
+			 * once, the moment the header is complete.
+			 * hdr fields stay big-endian in the page */
+			if (sc->sc_page_off == sizeof(struct o2net_msg)) {
+				hdr = page_address(sc->sc_page);
+				if (be16_to_cpu(hdr->data_len) >
+				    O2NET_MAX_PAYLOAD_BYTES)
+					ret = -EOVERFLOW;
+			}
+		}
+		if (ret <= 0)
+			goto out;
+	}
+
+	if (sc->sc_page_off < sizeof(struct o2net_msg)) {
+		/* oof, still don't have a header */
+		goto out;
+	}
+
+	/* hdr is still in wire (big-endian) order, convert fields on use */
+	hdr = page_address(sc->sc_page);
+
+	msglog(hdr, "at page_off %zu\n", sc->sc_page_off);
+
+	/* do we need more payload? */
+	if (sc->sc_page_off - sizeof(struct o2net_msg) < be16_to_cpu(hdr->data_len)) {
+		/* need more payload */
+		data = page_address(sc->sc_page) + sc->sc_page_off;
+		datalen = (sizeof(struct o2net_msg) + be16_to_cpu(hdr->data_len)) -
+			  sc->sc_page_off;
+		ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
+		if (ret > 0)
+			sc->sc_page_off += ret;
+		if (ret <= 0)
+			goto out;
+	}
+
+	if (sc->sc_page_off - sizeof(struct o2net_msg) == be16_to_cpu(hdr->data_len)) {
+		/* we can only get here once, the first time we read
+		 * the payload.. so set ret to progress if the handler
+		 * works out. after calling this the message is toast */
+		ret = o2net_process_message(sc, hdr);
+		if (ret == 0)
+			ret = 1;
+		sc->sc_page_off = 0;
+	}
+
+out:
+	sclog(sc, "ret = %d\n", ret);
+	do_gettimeofday(&sc->sc_tv_advance_stop);
+	return ret;
+}
+
+/* this work func is triggered by data ready.  it reads until it can read no
+ * more.  it interprets 0, eof, as fatal.  if data_ready hits while we're doing
+ * our work the work struct will be marked and we'll be called again. */
+static void o2net_rx_until_empty(void *arg)
+{
+	struct o2net_sock_container *sc = arg;
+	int ret;
+
+	do {
+		ret = o2net_advance_rx(sc);
+	} while (ret > 0);
+
+	if (ret <= 0 && ret != -EAGAIN) {
+		struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+		sclog(sc, "saw error %d, closing\n", ret);
+		/* not permanent so read failed handshake can retry */
+		o2net_ensure_shutdown(nn, sc, 0);
+	}
+
+	sc_put(sc);
+}
+
+static int o2net_set_nodelay(struct socket *sock)
+{
+	int ret, val = 1;
+	mm_segment_t oldfs;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+
+	/*
+	 * Dear unsuspecting programmer,
+	 *
+	 * Don't use sock_setsockopt() for SOL_TCP.  It doesn't check its level
+	 * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will
+	 * silently turn into SO_DEBUG.
+	 *
+	 * Yours,
+	 * Keeper of hilariously fragile interfaces.
+	 */
+	ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY,
+				    (char __user *)&val, sizeof(val));
+
+	set_fs(oldfs);
+	return ret;
+}
+
+/* ------------------------------------------------------------ */
+
+/* called when a connect completes and after a sock is accepted.  the
+ * rx path will see the response and mark the sc valid */
+static void o2net_sc_connect_completed(void *arg)
+{
+	struct o2net_sock_container *sc = arg;
+
+	mlog(ML_MSG, "sc sending handshake with ver %llu id %llx\n",
+              (unsigned long long)O2NET_PROTOCOL_VERSION,
+	      (unsigned long long)be64_to_cpu(o2net_hand->connector_id));
+
+	o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
+	sc_put(sc);
+}
+
+/* this is called as a work_struct func. */
+static void o2net_sc_send_keep_req(void *arg)
+{
+	struct o2net_sock_container *sc = arg;
+
+	o2net_sendpage(sc, o2net_keep_req, sizeof(*o2net_keep_req));
+	sc_put(sc);
+}
+
+/* idle timeout handler: logs the sc's recent timestamps and queues its
+ * shutdown work.  socket shutdown does a del_timer_sync against this, so
+ * the timer can't be started until shutdown is involved in sc buildup */
+static void o2net_idle_timer(unsigned long data)
+{
+	struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
+	struct timeval now;
+
+	do_gettimeofday(&now);
+	mlog(ML_NOTICE, "connection to " SC_NODEF_FMT " has been idle for %d "
+	     "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
+	     O2NET_IDLE_TIMEOUT_SECS);
+	mlog(ML_NOTICE, "here are some times that might help debug the "
+	     "situation: (tmr %ld.%06ld now %ld.%06ld dr %ld.%06ld adv "
+	     "%ld.%06ld:%ld.%06ld func (%08x:%u) %ld.%06ld:%ld.%06ld)\n",
+	     sc->sc_tv_timer.tv_sec, sc->sc_tv_timer.tv_usec,
+	     now.tv_sec, now.tv_usec,
+	     sc->sc_tv_data_ready.tv_sec, sc->sc_tv_data_ready.tv_usec,
+	     sc->sc_tv_advance_start.tv_sec, sc->sc_tv_advance_start.tv_usec,
+	     sc->sc_tv_advance_stop.tv_sec, sc->sc_tv_advance_stop.tv_usec,
+	     sc->sc_msg_key, sc->sc_msg_type,
+	     sc->sc_tv_func_start.tv_sec, sc->sc_tv_func_start.tv_usec,
+	     sc->sc_tv_func_stop.tv_sec, sc->sc_tv_func_stop.tv_usec);
+
+	o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
+}
+
+static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
+{
+	o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
+	o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
+				    O2NET_KEEPALIVE_DELAY_SECS * HZ);
+	do_gettimeofday(&sc->sc_tv_timer);
+	mod_timer(&sc->sc_idle_timeout,
+		  jiffies + (O2NET_IDLE_TIMEOUT_SECS * HZ));
+}
+
+/* this work func is kicked whenever a path sets the nn state which doesn't
+ * have valid set.  This includes seeing hb come up, losing a connection,
+ * having a connect attempt fail, etc. This centralizes the logic which decides
+ * if a connect attempt should be made or if we should give up and all future
+ * transmit attempts should fail */
+static void o2net_start_connect(void *arg)
+{
+	struct o2net_node *nn = arg;
+	struct o2net_sock_container *sc = NULL;
+	struct o2nm_node *node = NULL;
+	struct socket *sock = NULL;
+	struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
+	int ret = 0;
+
+	/* if we're greater we initiate tx, otherwise we accept */
+	if (o2nm_this_node() <= o2net_num_from_nn(nn))
+		goto out;
+
+	/* watch for racing with tearing a node down */
+	node = o2nm_get_node_by_num(o2net_num_from_nn(nn));
+	if (node == NULL) {
+		ret = 0;
+		goto out;
+	}
+
+	spin_lock(&nn->nn_lock);
+	/* see if we already have one pending or have given up */
+	if (nn->nn_sc || nn->nn_persistent_error)
+		arg = NULL;
+	spin_unlock(&nn->nn_lock);
+	if (arg == NULL) /* *shrug*, needed some indicator */
+		goto out;
+
+	nn->nn_last_connect_attempt = jiffies;
+
+	sc = sc_alloc(node);
+	if (sc == NULL) {
+		mlog(0, "couldn't allocate sc\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (ret < 0) {
+		mlog(0, "can't create socket: %d\n", ret);
+		goto out;
+	}
+	sc->sc_sock = sock; /* freed by sc_kref_release */
+
+	sock->sk->sk_allocation = GFP_ATOMIC;
+
+	myaddr.sin_family = AF_INET;
+	myaddr.sin_port = (__force u16)htons(0); /* any port */
+
+	ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
+			      sizeof(myaddr));
+	if (ret) {
+		mlog(0, "bind failed: %d\n", ret);
+		goto out;
+	}
+
+	ret = o2net_set_nodelay(sc->sc_sock);
+	if (ret) {
+		mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
+		goto out;
+	}
+
+	o2net_register_callbacks(sc->sc_sock->sk, sc);
+
+	spin_lock(&nn->nn_lock);
+	/* handshake completion will set nn->nn_sc_valid */
+	o2net_set_nn_state(nn, sc, 0, 0);
+	spin_unlock(&nn->nn_lock);
+
+	remoteaddr.sin_family = AF_INET;
+	remoteaddr.sin_addr.s_addr = (__force u32)node->nd_ipv4_address;
+	remoteaddr.sin_port = (__force u16)node->nd_ipv4_port;
+
+	ret = sc->sc_sock->ops->connect(sc->sc_sock,
+					(struct sockaddr *)&remoteaddr,
+					sizeof(remoteaddr),
+					O_NONBLOCK);
+	if (ret == -EINPROGRESS)
+		ret = 0;
+
+out:
+	/* sc is NULL if sc_alloc() failed, keep it out of SC_NODEF_ARGS() */
+	if (ret && sc) {
+		mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed "
+		     "with errno %d\n", SC_NODEF_ARGS(sc), ret);
+		/* 0 err so that another will be queued and attempted
+		 * from set_nn_state */
+		o2net_ensure_shutdown(nn, sc, 0);
+	}
+	if (sc)
+		sc_put(sc);
+	if (node)
+		o2nm_node_put(node);
+
+	return;
+}
+
+static void o2net_connect_expired(void *arg)
+{
+	struct o2net_node *nn = arg;
+
+	spin_lock(&nn->nn_lock);
+	if (!nn->nn_sc_valid) {
+		mlog(ML_ERROR, "no connection established with node %u after "
+		     "%u seconds, giving up and returning errors.\n",
+		     o2net_num_from_nn(nn), O2NET_IDLE_TIMEOUT_SECS);
+
+		o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
+	}
+	spin_unlock(&nn->nn_lock);
+}
+
+static void o2net_still_up(void *arg)
+{
+	struct o2net_node *nn = arg;
+
+	o2quo_hb_still_up(o2net_num_from_nn(nn));
+}
+
+/* ------------------------------------------------------------ */
+
+void o2net_disconnect_node(struct o2nm_node *node)
+{
+	struct o2net_node *nn = o2net_nn_from_num(node->nd_num);
+
+	/* don't reconnect until it's heartbeating again */
+	spin_lock(&nn->nn_lock);
+	o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
+	spin_unlock(&nn->nn_lock);
+
+	if (o2net_wq) {
+		cancel_delayed_work(&nn->nn_connect_expired);
+		cancel_delayed_work(&nn->nn_connect_work);
+		cancel_delayed_work(&nn->nn_still_up);
+		flush_workqueue(o2net_wq);
+	}
+}
+
+static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
+				  void *data)
+{
+	o2quo_hb_down(node_num);
+
+	if (node_num != o2nm_this_node())
+		o2net_disconnect_node(node);
+}
+
+static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
+				void *data)
+{
+	struct o2net_node *nn = o2net_nn_from_num(node_num);
+
+	o2quo_hb_up(node_num);
+
+	/* ensure an immediate connect attempt */
+	nn->nn_last_connect_attempt = jiffies -
+		(msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS) + 1);
+
+	if (node_num != o2nm_this_node()) {
+		/* heartbeat doesn't work unless a local node number is
+		 * configured and doing so brings up the o2net_wq, so we can
+		 * use it.. */
+		queue_delayed_work(o2net_wq, &nn->nn_connect_expired,
+				   O2NET_IDLE_TIMEOUT_SECS * HZ);
+
+		/* believe it or not, accept and node heartbeating testing
+		 * can succeed for this node before we got here.. so
+		 * only use set_nn_state to clear the persistent error
+		 * if that hasn't already happened */
+		spin_lock(&nn->nn_lock);
+		if (nn->nn_persistent_error)
+			o2net_set_nn_state(nn, NULL, 0, 0);
+		spin_unlock(&nn->nn_lock);
+	}
+}
+
+void o2net_unregister_hb_callbacks(void)
+{
+	int ret;
+
+	ret = o2hb_unregister_callback(&o2net_hb_up);
+	if (ret < 0)
+		mlog(ML_ERROR, "Status return %d unregistering heartbeat up "
+		     "callback!\n", ret);
+
+	ret = o2hb_unregister_callback(&o2net_hb_down);
+	if (ret < 0)
+		mlog(ML_ERROR, "Status return %d unregistering heartbeat down "
+		     "callback!\n", ret);
+}
+
+int o2net_register_hb_callbacks(void)
+{
+	int ret;
+
+	o2hb_setup_callback(&o2net_hb_down, O2HB_NODE_DOWN_CB,
+			    o2net_hb_node_down_cb, NULL, O2NET_HB_PRI);
+	o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB,
+			    o2net_hb_node_up_cb, NULL, O2NET_HB_PRI);
+
+	ret = o2hb_register_callback(&o2net_hb_up);
+	if (ret == 0)
+		ret = o2hb_register_callback(&o2net_hb_down);
+
+	if (ret)
+		o2net_unregister_hb_callbacks();
+
+	return ret;
+}
+
+/* ------------------------------------------------------------ */
+
+static int o2net_accept_one(struct socket *sock)
+{
+	int ret, slen;
+	struct sockaddr_in sin;
+	struct socket *new_sock = NULL;
+	struct o2nm_node *node = NULL;
+	struct o2net_sock_container *sc = NULL;
+	struct o2net_node *nn;
+
+	BUG_ON(sock == NULL);
+	ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
+			       sock->sk->sk_protocol, &new_sock);
+	if (ret)
+		goto out;
+
+	new_sock->type = sock->type;
+	new_sock->ops = sock->ops;
+	ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
+	if (ret < 0)
+		goto out;
+
+	new_sock->sk->sk_allocation = GFP_ATOMIC;
+
+	ret = o2net_set_nodelay(new_sock);
+	if (ret) {
+		mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
+		goto out;
+	}
+
+	slen = sizeof(sin);
+	ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin,
+				       &slen, 1);
+	if (ret < 0)
+		goto out;
+
+	node = o2nm_get_node_by_ip((__force __be32)sin.sin_addr.s_addr);
+	if (node == NULL) {
+		mlog(ML_NOTICE, "attempt to connect from unknown node at "
+		     "%u.%u.%u.%u:%d\n", NIPQUAD(sin.sin_addr.s_addr),
+		     ntohs((__force __be16)sin.sin_port));
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (o2nm_this_node() > node->nd_num) {
+		mlog(ML_NOTICE, "unexpected connect attempted from a lower "
+		     "numbered node '%s' at " "%u.%u.%u.%u:%d with num %u\n",
+		     node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
+		     ntohs((__force __be16)sin.sin_port), node->nd_num);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* this happens all the time when the other node sees our heartbeat
+	 * and tries to connect before we see their heartbeat */
+	if (!o2hb_check_node_heartbeating_from_callback(node->nd_num)) {
+		mlog(ML_CONN, "attempt to connect from node '%s' at "
+		     "%u.%u.%u.%u:%d but it isn't heartbeating\n",
+		     node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
+		     ntohs((__force __be16)sin.sin_port));
+		ret = -EINVAL;
+		goto out;
+	}
+
+	nn = o2net_nn_from_num(node->nd_num);
+
+	spin_lock(&nn->nn_lock);
+	if (nn->nn_sc)
+		ret = -EBUSY;
+	else
+		ret = 0;
+	spin_unlock(&nn->nn_lock);
+	if (ret) {
+		mlog(ML_NOTICE, "attempt to connect from node '%s' at "
+		     "%u.%u.%u.%u:%d but it already has an open connection\n",
+		     node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
+		     ntohs((__force __be16)sin.sin_port));
+		goto out;
+	}
+
+	sc = sc_alloc(node);
+	if (sc == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	sc->sc_sock = new_sock;
+	new_sock = NULL;
+
+	spin_lock(&nn->nn_lock);
+	o2net_set_nn_state(nn, sc, 0, 0);
+	spin_unlock(&nn->nn_lock);
+
+	o2net_register_callbacks(sc->sc_sock->sk, sc);
+	o2net_sc_queue_work(sc, &sc->sc_rx_work);
+
+	o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
+
+out:
+	if (new_sock)
+		sock_release(new_sock);
+	if (node)
+		o2nm_node_put(node);
+	if (sc)
+		sc_put(sc);
+	return ret;
+}
+
+static void o2net_accept_many(void *arg)
+{
+	struct socket *sock = arg;
+	while (o2net_accept_one(sock) == 0)
+		cond_resched();
+}
+
+static void o2net_listen_data_ready(struct sock *sk, int bytes)
+{
+	void (*ready)(struct sock *sk, int bytes);
+
+	read_lock(&sk->sk_callback_lock);
+	ready = sk->sk_user_data;
+	if (ready == NULL) { /* check for teardown race */
+		ready = sk->sk_data_ready;
+		goto out;
+	}
+
+	/* ->sk_data_ready is also called for a newly established child socket
+	 * before it has been accepted and the acceptor has set up their
+	 * data_ready.. we only want to queue listen work for our listening
+	 * socket */
+	if (sk->sk_state == TCP_LISTEN) {
+		mlog(ML_TCP, "bytes: %d\n", bytes);
+		queue_work(o2net_wq, &o2net_listen_work);
+	}
+
+out:
+	read_unlock(&sk->sk_callback_lock);
+	ready(sk, bytes);
+}
+
+static int o2net_open_listening_sock(__be16 port)
+{
+	struct socket *sock = NULL;
+	int ret;
+	struct sockaddr_in sin = {
+		.sin_family = PF_INET,
+		.sin_addr = { .s_addr = (__force u32)htonl(INADDR_ANY) },
+		.sin_port = (__force u16)port,
+	};
+
+	ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (ret < 0) {
+		mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret);
+		goto out;
+	}
+
+	sock->sk->sk_allocation = GFP_ATOMIC;
+
+	write_lock_bh(&sock->sk->sk_callback_lock);
+	sock->sk->sk_user_data = sock->sk->sk_data_ready;
+	sock->sk->sk_data_ready = o2net_listen_data_ready;
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+
+	o2net_listen_sock = sock;
+	INIT_WORK(&o2net_listen_work, o2net_accept_many, sock);
+
+	sock->sk->sk_reuse = 1;
+	ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
+	if (ret < 0) {
+		mlog(ML_ERROR, "unable to bind socket to port %d, ret=%d\n",
+		     ntohs(port), ret);
+		goto out;
+	}
+
+	ret = sock->ops->listen(sock, 64);
+	if (ret < 0) {
+		mlog(ML_ERROR, "unable to listen on port %d, ret=%d\n",
+		     ntohs(port), ret);
+	}
+
+out:
+	if (ret) {
+		o2net_listen_sock = NULL;
+		if (sock)
+			sock_release(sock);
+	}
+	return ret;
+}
+
+/*
+ * called from node manager when we should bring up our network listening
+ * socket.  node manager handles all the serialization to only call this
+ * once and to match it with o2net_stop_listening().  note,
+ * o2nm_this_node() doesn't work yet as we're being called while it
+ * is being set up.
+ */
+int o2net_start_listening(struct o2nm_node *node)
+{
+	int ret = 0;
+
+	BUG_ON(o2net_wq != NULL);
+	BUG_ON(o2net_listen_sock != NULL);
+
+	mlog(ML_KTHREAD, "starting o2net thread...\n");
+	o2net_wq = create_singlethread_workqueue("o2net");
+	if (o2net_wq == NULL) {
+		mlog(ML_ERROR, "unable to launch o2net thread\n");
+		return -ENOMEM; /* ? */
+	}
+
+	ret = o2net_open_listening_sock(node->nd_ipv4_port);
+	if (ret) {
+		destroy_workqueue(o2net_wq);
+		o2net_wq = NULL;
+	} else
+		o2quo_conn_up(node->nd_num);
+
+	return ret;
+}
+
+/* counterpart of o2net_start_listening().  again, o2nm_this_node() doesn't
+ * work here as we're involved in tearing it down */
+void o2net_stop_listening(struct o2nm_node *node)
+{
+	struct socket *sock = o2net_listen_sock;
+	size_t i;
+
+	BUG_ON(o2net_wq == NULL);
+	BUG_ON(o2net_listen_sock == NULL);
+
+	/* stop the listening socket from generating work */
+	write_lock_bh(&sock->sk->sk_callback_lock);
+	sock->sk->sk_data_ready = sock->sk->sk_user_data;
+	sock->sk->sk_user_data = NULL;
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+
+	for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
+		struct o2nm_node *peer = o2nm_get_node_by_num(i);
+		if (peer) {
+			o2net_disconnect_node(peer);
+			o2nm_node_put(peer);
+		}
+	}
+
+	/* finish all work and tear down the work queue */
+	mlog(ML_KTHREAD, "waiting for o2net thread to exit....\n");
+	destroy_workqueue(o2net_wq);
+	o2net_wq = NULL;
+
+	sock_release(o2net_listen_sock);
+	o2net_listen_sock = NULL;
+
+	o2quo_conn_err(node->nd_num);
+}
+
+/* ------------------------------------------------------------ */
+
+int o2net_init(void)
+{
+	unsigned long i;
+
+	o2quo_init();
+
+	o2net_hand = kcalloc(1, sizeof(struct o2net_handshake), GFP_KERNEL);
+	o2net_keep_req = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
+	o2net_keep_resp = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
+	if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) {
+		kfree(o2net_hand);
+		kfree(o2net_keep_req);
+		kfree(o2net_keep_resp);
+		return -ENOMEM;
+	}
+
+	o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION);
+	o2net_hand->connector_id = cpu_to_be64(1);
+
+	o2net_keep_req->magic = cpu_to_be16(O2NET_MSG_KEEP_REQ_MAGIC);
+	o2net_keep_resp->magic = cpu_to_be16(O2NET_MSG_KEEP_RESP_MAGIC);
+
+	for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
+		struct o2net_node *nn = o2net_nn_from_num(i);
+
+		spin_lock_init(&nn->nn_lock);
+		INIT_WORK(&nn->nn_connect_work, o2net_start_connect, nn);
+		INIT_WORK(&nn->nn_connect_expired, o2net_connect_expired, nn);
+		INIT_WORK(&nn->nn_still_up, o2net_still_up, nn);
+		/* until we see hb from a node we'll return -ENOTCONN */
+		nn->nn_persistent_error = -ENOTCONN;
+		init_waitqueue_head(&nn->nn_sc_wq);
+		idr_init(&nn->nn_status_idr);
+		INIT_LIST_HEAD(&nn->nn_status_list);
+	}
+
+	return 0;
+}
+
+void o2net_exit(void)
+{
+	o2quo_exit();
+	kfree(o2net_hand);
+	kfree(o2net_keep_req);
+	kfree(o2net_keep_resp);
+}
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
new file mode 100644
index 0000000..a6f4585
--- /dev/null
+++ b/fs/ocfs2/cluster/tcp.h
@@ -0,0 +1,113 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * tcp.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef O2CLUSTER_TCP_H
+#define O2CLUSTER_TCP_H
+
+#include <linux/socket.h>
+#ifdef __KERNEL__
+#include <net/sock.h>
+#include <linux/tcp.h>
+#else
+#include <sys/socket.h>
+#endif
+#include <linux/inet.h>
+#include <linux/in.h>
+
+struct o2net_msg
+{
+	__be16 magic;
+	__be16 data_len;
+	__be16 msg_type;
+	__be16 pad1;
+	__be32 sys_status;
+	__be32 status;
+	__be32 key;
+	__be32 msg_num;
+	__u8  buf[0];
+};
+
+typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data);
+
+#define O2NET_MAX_PAYLOAD_BYTES  (4096 - sizeof(struct o2net_msg))
+
+/* TODO: figure this out.... */
+static inline int o2net_link_down(int err, struct socket *sock)
+{
+	if (sock) {
+		if (sock->sk->sk_state != TCP_ESTABLISHED &&
+	    	    sock->sk->sk_state != TCP_CLOSE_WAIT)
+			return 1;
+	}
+
+	if (err >= 0)
+		return 0;
+	switch (err) {
+		/* ????????????????????????? */
+		case -ERESTARTSYS:
+		case -EBADF:
+		/* When the server has died, an ICMP port unreachable
+		 * message prompts ECONNREFUSED. */
+		case -ECONNREFUSED:
+		case -ENOTCONN:
+		case -ECONNRESET:
+		case -EPIPE:
+			return 1;
+	}
+	return 0;
+}
+
+enum {
+	O2NET_DRIVER_UNINITED,
+	O2NET_DRIVER_READY,
+};
+
+int o2net_init_tcp_sock(struct inode *inode);
+int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
+		       u8 target_node, int *status);
+int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec,
+			   size_t veclen, u8 target_node, int *status);
+int o2net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len,
+			    struct inode *group);
+
+int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
+			   o2net_msg_handler_func *func, void *data,
+			   struct list_head *unreg_list);
+void o2net_unregister_handler_list(struct list_head *list);
+
+struct o2nm_node;
+int o2net_register_hb_callbacks(void);
+void o2net_unregister_hb_callbacks(void);
+int o2net_start_listening(struct o2nm_node *node);
+void o2net_stop_listening(struct o2nm_node *node);
+void o2net_disconnect_node(struct o2nm_node *node);
+
+int o2net_init(void);
+void o2net_exit(void);
+int o2net_proc_init(struct proc_dir_entry *parent);
+void o2net_proc_exit(struct proc_dir_entry *parent);
+
+#endif /* O2CLUSTER_TCP_H */
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
new file mode 100644
index 0000000..ff9e2e2
--- /dev/null
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -0,0 +1,174 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef O2CLUSTER_TCP_INTERNAL_H
+#define O2CLUSTER_TCP_INTERNAL_H
+
+#define O2NET_MSG_MAGIC           ((u16)0xfa55)
+#define O2NET_MSG_STATUS_MAGIC    ((u16)0xfa56)
+#define O2NET_MSG_KEEP_REQ_MAGIC  ((u16)0xfa57)
+#define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58)
+
+/* same as hb delay, we're waiting for another node to recognize our hb */
+#define O2NET_RECONNECT_DELAY_MS	O2HB_REGION_TIMEOUT_MS
+
+/* we're delaying our quorum decision so that heartbeat will have timed
+ * out truly dead nodes by the time we come around to making decisions
+ * on their number */
+#define O2NET_QUORUM_DELAY_MS	((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
+
+#define O2NET_KEEPALIVE_DELAY_SECS	5
+#define O2NET_IDLE_TIMEOUT_SECS		10
+
+/* 
+ * This version number represents quite a lot, unfortunately.  It not
+ * only represents the raw network message protocol on the wire but also
+ * locking semantics of the file system using the protocol.  It should 
+ * be somewhere else, I'm sure, but right now it isn't.
+ *
+ * New in version 2:
+ * 	- full 64 bit i_size in the metadata lock lvbs
+ * 	- introduction of "rw" lock and pushing meta/data locking down
+ */
+#define O2NET_PROTOCOL_VERSION 2ULL
+struct o2net_handshake {
+	__be64	protocol_version;
+	__be64	connector_id;
+};
+
+struct o2net_node {
+	/* this is never taken from int/bh */
+	spinlock_t			nn_lock;
+
+	/* set the moment an sc is allocated and a connect is started */
+	struct o2net_sock_container	*nn_sc;
+	/* _valid is only set after the handshake passes and tx can happen */
+	unsigned			nn_sc_valid:1;
+	/* if this is set tx just returns it */
+	int				nn_persistent_error;
+
+	/* threads waiting for an sc to arrive wait on the wq for generation
+	 * to increase.  it is increased when a connecting socket succeeds
+	 * or fails or when an accepted socket is attached. */
+	wait_queue_head_t		nn_sc_wq;
+
+	struct idr			nn_status_idr;
+	struct list_head		nn_status_list;
+
+	/* connects are attempted from when heartbeat comes up until either hb
+	 * goes down, the node is unconfigured, no connect attempts succeed
+	 * within O2NET_IDLE_TIMEOUT_SECS, or a connect succeeds.  connect_work
+	 * is queued from set_nn_state both from hb up and from itself if a
+	 * connect attempt fails and so can be self-arming.  shutdown is
+	 * careful to first mark the nn such that no connects will be attempted
+	 * before canceling delayed connect work and flushing the queue. */
+	struct work_struct		nn_connect_work;
+	unsigned long			nn_last_connect_attempt;
+
+	/* this is queued as nodes come up and is canceled when a connection is
+	 * established.  this expiring gives up on the node and errors out
+	 * transmits */
+	struct work_struct		nn_connect_expired;
+
+	/* after we give up on a socket we wait a while before deciding
+	 * that it is still heartbeating and that we should do some
+	 * quorum work */
+	struct work_struct		nn_still_up;
+};
+
+struct o2net_sock_container {
+	struct kref		sc_kref;
+	/* the next two are valid for the life time of the sc */
+	struct socket		*sc_sock;
+	struct o2nm_node	*sc_node;
+
+	/* all of these sc work structs hold refs on the sc while they are
+	 * queued.  they should not be able to ref a freed sc.  the teardown
+	 * race is with o2net_wq destruction in o2net_stop_listening() */
+
+	/* rx and connect work are generated from socket callbacks.  sc
+	 * shutdown removes the callbacks and then flushes the work queue */
+	struct work_struct	sc_rx_work;
+	struct work_struct	sc_connect_work;
+	/* shutdown work is triggered in two ways.  the simple way is
+	 * for a code path to call ensure_shutdown which gets a lock, removes
+	 * the sc from the nn, and queues the work.  in this case the
+	 * work is single-shot.  the work is also queued from a sock
+	 * callback, though, and in this case the work will find the sc
+	 * still on the nn and will call ensure_shutdown itself.. this
+	 * ends up triggering the shutdown work again, though nothing
+	 * will be done in that second iteration.  so work queue teardown
+	 * has to be careful to remove the sc from the nn before waiting
+	 * on the work queue so that the shutdown work doesn't remove the
+	 * sc and rearm itself.
+	 */
+	struct work_struct	sc_shutdown_work;
+
+	struct timer_list	sc_idle_timeout;
+	struct work_struct	sc_keepalive_work;
+
+	unsigned		sc_handshake_ok:1;
+
+	struct page 		*sc_page;
+	size_t			sc_page_off;
+
+	/* original handlers for the sockets */
+	void			(*sc_state_change)(struct sock *sk);
+	void			(*sc_data_ready)(struct sock *sk, int bytes);
+
+	struct timeval 		sc_tv_timer;
+	struct timeval 		sc_tv_data_ready;
+	struct timeval 		sc_tv_advance_start;
+	struct timeval 		sc_tv_advance_stop;
+	struct timeval 		sc_tv_func_start;
+	struct timeval 		sc_tv_func_stop;
+	u32			sc_msg_key;
+	u16			sc_msg_type;
+};
+
+struct o2net_msg_handler {
+	struct rb_node		nh_node;	/* rb-tree linkage */
+	u32			nh_max_len;	/* largest data_len accepted */
+	u32			nh_msg_type;
+	u32			nh_key;
+	o2net_msg_handler_func	*nh_func;	/* called per message */
+	void			*nh_func_data;	/* passed to nh_func as data */
+	struct kref		nh_kref;
+	struct list_head	nh_unregister_item;
+};
+
+enum o2net_system_error {
+	O2NET_ERR_NONE = 0,
+	O2NET_ERR_NO_HNDLR,
+	O2NET_ERR_OVERFLOW,
+	O2NET_ERR_DIED,
+	O2NET_ERR_MAX
+};
+
+struct o2net_status_wait {
+	enum o2net_system_error	ns_sys_status;
+	s32			ns_status;
+	int			ns_id;
+	wait_queue_head_t	ns_wq;
+	struct list_head	ns_node_item;
+};
+
+#endif /* O2CLUSTER_TCP_INTERNAL_H */
-- 
cgit v1.1


From 6714d8e86bf443f6f7af50f9d432025649f091f5 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Thu, 15 Dec 2005 14:31:23 -0800
Subject: [PATCH] OCFS2: The Second Oracle Cluster Filesystem

A distributed lock manager built with the cluster file system use case
in mind. The OCFS2 dlm exposes a VMS style API, though things have
been simplified internally. The only lock levels implemented currently
are NLMODE, PRMODE and EXMODE.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
---
 fs/ocfs2/dlm/Makefile      |    6 +
 fs/ocfs2/dlm/dlmapi.h      |  214 ++++
 fs/ocfs2/dlm/dlmast.c      |  466 ++++++++
 fs/ocfs2/dlm/dlmcommon.h   |  884 +++++++++++++++
 fs/ocfs2/dlm/dlmconvert.c  |  530 +++++++++
 fs/ocfs2/dlm/dlmconvert.h  |   35 +
 fs/ocfs2/dlm/dlmdebug.c    |  246 ++++
 fs/ocfs2/dlm/dlmdebug.h    |   30 +
 fs/ocfs2/dlm/dlmdomain.c   | 1469 ++++++++++++++++++++++++
 fs/ocfs2/dlm/dlmdomain.h   |   36 +
 fs/ocfs2/dlm/dlmlock.c     |  676 +++++++++++
 fs/ocfs2/dlm/dlmmaster.c   | 2666 ++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/dlm/dlmrecovery.c | 2132 +++++++++++++++++++++++++++++++++++
 fs/ocfs2/dlm/dlmthread.c   |  695 ++++++++++++
 fs/ocfs2/dlm/dlmunlock.c   |  672 +++++++++++
 fs/ocfs2/dlm/dlmver.c      |   42 +
 fs/ocfs2/dlm/dlmver.h      |   31 +
 17 files changed, 10830 insertions(+)
 create mode 100644 fs/ocfs2/dlm/Makefile
 create mode 100644 fs/ocfs2/dlm/dlmapi.h
 create mode 100644 fs/ocfs2/dlm/dlmast.c
 create mode 100644 fs/ocfs2/dlm/dlmcommon.h
 create mode 100644 fs/ocfs2/dlm/dlmconvert.c
 create mode 100644 fs/ocfs2/dlm/dlmconvert.h
 create mode 100644 fs/ocfs2/dlm/dlmdebug.c
 create mode 100644 fs/ocfs2/dlm/dlmdebug.h
 create mode 100644 fs/ocfs2/dlm/dlmdomain.c
 create mode 100644 fs/ocfs2/dlm/dlmdomain.h
 create mode 100644 fs/ocfs2/dlm/dlmlock.c
 create mode 100644 fs/ocfs2/dlm/dlmmaster.c
 create mode 100644 fs/ocfs2/dlm/dlmrecovery.c
 create mode 100644 fs/ocfs2/dlm/dlmthread.c
 create mode 100644 fs/ocfs2/dlm/dlmunlock.c
 create mode 100644 fs/ocfs2/dlm/dlmver.c
 create mode 100644 fs/ocfs2/dlm/dlmver.h

diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
new file mode 100644
index 0000000..2a5274b
--- /dev/null
+++ b/fs/ocfs2/dlm/Makefile
@@ -0,0 +1,6 @@
+EXTRA_CFLAGS += -Ifs/ocfs2
+
+obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o
+
+ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
+	dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h
new file mode 100644
index 0000000..53652f5
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmapi.h
@@ -0,0 +1,214 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmapi.h
+ *
+ * externally exported dlm interfaces
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifndef DLMAPI_H
+#define DLMAPI_H
+
+struct dlm_lock;
+struct dlm_ctxt;
+
+/* NOTE: changes made to this enum should be reflected in dlmdebug.c */
+enum dlm_status {
+	DLM_NORMAL = 0,           /*  0: request in progress */
+	DLM_GRANTED,              /*  1: request granted */
+	DLM_DENIED,               /*  2: request denied */
+	DLM_DENIED_NOLOCKS,       /*  3: request denied, out of system resources */
+	DLM_WORKING,              /*  4: async request in progress */
+	DLM_BLOCKED,              /*  5: lock request blocked */
+	DLM_BLOCKED_ORPHAN,       /*  6: lock request blocked by an orphan lock */
+	DLM_DENIED_GRACE_PERIOD,  /*  7: topological change in progress */
+	DLM_SYSERR,               /*  8: system error */
+	DLM_NOSUPPORT,            /*  9: unsupported */
+	DLM_CANCELGRANT,          /* 10: can't cancel convert: already granted */
+	DLM_IVLOCKID,             /* 11: bad lockid */
+	DLM_SYNC,                 /* 12: synchronous request granted */
+	DLM_BADTYPE,              /* 13: bad resource type */
+	DLM_BADRESOURCE,          /* 14: bad resource handle */
+	DLM_MAXHANDLES,           /* 15: no more resource handles */
+	DLM_NOCLINFO,             /* 16: can't contact cluster manager */
+	DLM_NOLOCKMGR,            /* 17: can't contact lock manager */
+	DLM_NOPURGED,             /* 18: can't contact purge daemon */
+	DLM_BADARGS,              /* 19: bad api args */
+	DLM_VOID,                 /* 20: no status */
+	DLM_NOTQUEUED,            /* 21: NOQUEUE was specified and request failed */
+	DLM_IVBUFLEN,             /* 22: invalid resource name length */
+	DLM_CVTUNGRANT,           /* 23: attempted to convert ungranted lock */
+	DLM_BADPARAM,             /* 24: invalid lock mode specified */
+	DLM_VALNOTVALID,          /* 25: value block has been invalidated */
+	DLM_REJECTED,             /* 26: request rejected, unrecognized client */
+	DLM_ABORT,                /* 27: blocked lock request cancelled */
+	DLM_CANCEL,               /* 28: conversion request cancelled */
+	DLM_IVRESHANDLE,          /* 29: invalid resource handle */
+	DLM_DEADLOCK,             /* 30: deadlock recovery refused this request */
+	DLM_DENIED_NOASTS,        /* 31: failed to allocate AST */
+	DLM_FORWARD,              /* 32: request must wait for primary's response */
+	DLM_TIMEOUT,              /* 33: timeout value for lock has expired */
+	DLM_IVGROUPID,            /* 34: invalid group specification */
+	DLM_VERS_CONFLICT,        /* 35: version conflicts prevent request handling */
+	DLM_BAD_DEVICE_PATH,      /* 36: Locks device does not exist or path wrong */
+	DLM_NO_DEVICE_PERMISSION, /* 37: Client has insufficient perms for device */
+	DLM_NO_CONTROL_DEVICE,    /* 38: Cannot set options on opened device */
+
+	DLM_RECOVERING,           /* 39: extension, allows caller to fail a lock
+				     request if it is being recovered */
+	DLM_MIGRATING,            /* 40: extension, allows caller to fail a lock
+				     request if it is being migrated */
+	DLM_MAXSTATS,             /* 41: upper limit for return code validation */
+};
+
+/* for pretty-printing dlm_status error messages */
+const char *dlm_errmsg(enum dlm_status err);
+/* for pretty-printing dlm_status error names */
+const char *dlm_errname(enum dlm_status err);
+
+/* Eventually the DLM will use standard errno values, but in the
+ * meantime this lets us track dlm errors as they bubble up. When we
+ * bring its error reporting into line with the rest of the stack,
+ * these can just be replaced with calls to mlog_errno. */
+#define dlm_error(st) do {						\
+	enum dlm_status _dlm_st = (st);	/* evaluate (st) only once */	\
+	if (_dlm_st != DLM_RECOVERING && _dlm_st != DLM_MIGRATING &&	\
+	    _dlm_st != DLM_FORWARD)					\
+		mlog(ML_ERROR, "dlm status = %s\n", dlm_errname(_dlm_st)); \
+} while (0)
+
+#define DLM_LKSB_UNUSED1           0x01
+#define DLM_LKSB_PUT_LVB           0x02  /* copy lksb->lvb into the lockres */
+#define DLM_LKSB_GET_LVB           0x04  /* copy the lockres lvb into lksb->lvb */
+#define DLM_LKSB_UNUSED2           0x08
+#define DLM_LKSB_UNUSED3           0x10
+#define DLM_LKSB_UNUSED4           0x20
+#define DLM_LKSB_UNUSED5           0x40
+#define DLM_LKSB_UNUSED6           0x80
+
+#define DLM_LVB_LEN  64
+
+/* Lock status block passed to dlmlock() and dlmunlock().  Callers are
+ * only allowed access to the lvb and status members of this struct. */
+struct dlm_lockstatus {
+	enum dlm_status status;
+	u32 flags;			/* DLM_LKSB_* bits */
+	struct dlm_lock *lockid;
+	char lvb[DLM_LVB_LEN];		/* lock value block */
+};
+
+/* Valid lock modes. */
+#define LKM_IVMODE      (-1)            /* invalid mode */
+#define LKM_NLMODE      0               /* null lock */
+#define LKM_CRMODE      1               /* concurrent read    unsupported */
+#define LKM_CWMODE      2               /* concurrent write   unsupported */
+#define LKM_PRMODE      3               /* protected read */
+#define LKM_PWMODE      4               /* protected write    unsupported */
+#define LKM_EXMODE      5               /* exclusive */
+#define LKM_MAXMODE     5
+#define LKM_MODEMASK    0xff
+
+/* Flags passed to dlmlock and dlmunlock:
+ * reserved: flags used by the "real" dlm
+ * only a few are supported by this dlm
+ * (U) = unsupported by ocfs2 dlm */
+#define LKM_ORPHAN       0x00000010  /* this lock is orphanable (U) */
+#define LKM_PARENTABLE   0x00000020  /* this lock was orphaned (U) */
+#define LKM_BLOCK        0x00000040  /* blocking lock request (U) */
+#define LKM_LOCAL        0x00000080  /* local lock request */
+#define LKM_VALBLK       0x00000100  /* lock value block request */
+#define LKM_NOQUEUE      0x00000200  /* non blocking request */
+#define LKM_CONVERT      0x00000400  /* conversion request */
+#define LKM_NODLCKWT     0x00000800  /* this lock won't deadlock (U) */
+#define LKM_UNLOCK       0x00001000  /* deallocate this lock */
+#define LKM_CANCEL       0x00002000  /* cancel conversion request */
+#define LKM_DEQALL       0x00004000  /* remove all locks held by proc (U) */
+#define LKM_INVVALBLK    0x00008000  /* invalidate lock value block */
+#define LKM_SYNCSTS      0x00010000  /* return synchronous status if poss (U) */
+#define LKM_TIMEOUT      0x00020000  /* lock request contains timeout (U) */
+#define LKM_SNGLDLCK     0x00040000  /* request can self-deadlock (U) */
+#define LKM_FINDLOCAL    0x00080000  /* find local lock request (U) */
+#define LKM_PROC_OWNED   0x00100000  /* owned by process, not group (U) */
+#define LKM_XID          0x00200000  /* use transaction id for deadlock (U) */
+#define LKM_XID_CONFLICT 0x00400000  /* do not allow lock inheritance (U) */
+#define LKM_FORCE        0x00800000  /* force unlock flag */
+#define LKM_REVVALBLK    0x01000000  /* temporary solution: re-validate
+					lock value block (U) */
+/* unused */
+#define LKM_UNUSED1      0x00000001  /* unused */
+#define LKM_UNUSED2      0x00000002  /* unused */
+#define LKM_UNUSED3      0x00000004  /* unused */
+#define LKM_UNUSED4      0x00000008  /* unused */
+#define LKM_UNUSED5      0x02000000  /* unused */
+#define LKM_UNUSED6      0x04000000  /* unused */
+#define LKM_UNUSED7      0x08000000  /* unused */
+
+/* ocfs2 extensions: internal only
+ * should never be used by caller */
+#define LKM_MIGRATION    0x10000000  /* extension: lockres is to be migrated
+					to another node */
+#define LKM_PUT_LVB      0x20000000  /* extension: lvb is being passed
+					should be applied to lockres */
+#define LKM_GET_LVB      0x40000000  /* extension: lvb should be copied
+					from lockres when lock is granted */
+#define LKM_RECOVERY     0x80000000  /* extension: flag for recovery lock
+					used to avoid recovery rwsem */
+
+
+typedef void (dlm_astlockfunc_t)(void *);
+typedef void (dlm_bastlockfunc_t)(void *, int);
+typedef void (dlm_astunlockfunc_t)(void *, enum dlm_status);
+
+enum dlm_status dlmlock(struct dlm_ctxt *dlm,
+			int mode,
+			struct dlm_lockstatus *lksb,
+			int flags,
+			const char *name,
+			dlm_astlockfunc_t *ast,
+			void *data,
+			dlm_bastlockfunc_t *bast);
+
+enum dlm_status dlmunlock(struct dlm_ctxt *dlm,
+			  struct dlm_lockstatus *lksb,
+			  int flags,
+			  dlm_astunlockfunc_t *unlockast,
+			  void *data);
+
+struct dlm_ctxt * dlm_register_domain(const char *domain, u32 key);
+
+void dlm_unregister_domain(struct dlm_ctxt *dlm);
+
+void dlm_print_one_lock(struct dlm_lock *lockid);
+
+typedef void (dlm_eviction_func)(int, void *);
+struct dlm_eviction_cb {
+	struct list_head        ec_item;
+	dlm_eviction_func       *ec_func;
+	void                    *ec_data;
+};
+void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb,
+			   dlm_eviction_func *f,
+			   void *data);
+void dlm_register_eviction_cb(struct dlm_ctxt *dlm,
+			      struct dlm_eviction_cb *cb);
+void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb);
+
+#endif /* DLMAPI_H */
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
new file mode 100644
index 0000000..8d17d28
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -0,0 +1,466 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmast.c
+ *
+ * AST and BAST functionality for local and remote nodes
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+#include "cluster/endian.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+
+#define MLOG_MASK_PREFIX ML_DLM
+#include "cluster/masklog.h"
+
+static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			   struct dlm_lock *lock);
+static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+
+/* Should be called as an ast gets queued to see if the new
+ * lock level will obsolete a pending bast.
+ * For example, if dlm_thread queued a bast for an EX lock that
+ * was blocking another EX, but before sending the bast the
+ * lock owner downconverted to NL, the bast is now obsolete.
+ * Only the ast should be sent.
+ * This is needed because the lock and convert paths can queue
+ * asts out-of-band (not waiting for dlm_thread) in order to
+ * allow for LKM_NOQUEUE to get immediate responses. */
+static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	assert_spin_locked(&dlm->ast_lock);
+	assert_spin_locked(&lock->spinlock);
+
+	/* nothing is recorded as blocked, so there is no bast to cancel */
+	if (lock->ml.highest_blocked == LKM_IVMODE)
+		return 0;
+	BUG_ON(lock->ml.highest_blocked == LKM_NLMODE);
+
+	/* old bast already sent, ok */
+	if (lock->bast_pending && list_empty(&lock->bast_list))
+		return 0;
+
+	switch (lock->ml.type) {
+	case LKM_EXMODE:
+		/* EX blocks anything left, any bast still valid */
+		return 0;
+	case LKM_NLMODE:
+		/* NL blocks nothing, no reason to send any bast, cancel it */
+		return 1;
+	default:
+		/* PR only blocks EX */
+		return lock->ml.highest_blocked != LKM_EXMODE;
+	}
+}
+
+/* Queue an ast for @lock on dlm->pending_asts; dlm->ast_lock must be held. */
+static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	mlog_entry_void();
+
+	BUG_ON(!dlm);
+	BUG_ON(!lock);
+
+	assert_spin_locked(&dlm->ast_lock);
+	if (!list_empty(&lock->ast_list)) {
+		mlog(ML_ERROR, "ast list not empty!!  pending=%d, newlevel=%d\n",
+		     lock->ast_pending, lock->ml.type);
+		BUG();
+	}
+	if (lock->ast_pending)
+		mlog(0, "lock has an ast getting flushed right now\n");
+
+	/* putting lock on list, add a ref */
+	dlm_lock_get(lock);
+	spin_lock(&lock->spinlock);
+
+	/* check to see if this ast obsoletes the bast */
+	if (dlm_should_cancel_bast(dlm, lock)) {
+		struct dlm_lock_resource *res = lock->lockres;
+		mlog(0, "%s: cancelling bast for %.*s\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
+		lock->bast_pending = 0;
+		list_del_init(&lock->bast_list);
+		lock->ml.highest_blocked = LKM_IVMODE;
+		/* removing lock from list, remove a ref.  guaranteed
+		 * this won't be the last ref because of the get above,
+		 * so res->spinlock will not be taken here */
+		dlm_lock_put(lock);
+		/* free up the reserved bast that we are cancelling.
+		 * guaranteed that this will not be the last reserved
+		 * ast because *both* an ast and a bast were reserved
+		 * to get to this point.  the res->spinlock will not be
+		 * taken here */
+		dlm_lockres_release_ast(dlm, res);
+	}
+	list_add_tail(&lock->ast_list, &dlm->pending_asts);
+	lock->ast_pending = 1;
+	spin_unlock(&lock->spinlock);
+}
+
+void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	mlog_entry_void();
+
+	BUG_ON(!dlm);
+	BUG_ON(!lock);
+	/* take ast_lock around the helper, which asserts that it is held */
+	spin_lock(&dlm->ast_lock);
+	__dlm_queue_ast(dlm, lock);
+	spin_unlock(&dlm->ast_lock);
+}
+
+/* Queue a bast for @lock on dlm->pending_basts; dlm->ast_lock must be held. */
+static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	mlog_entry_void();
+
+	BUG_ON(!dlm);
+	BUG_ON(!lock);
+	assert_spin_locked(&dlm->ast_lock);
+
+	BUG_ON(!list_empty(&lock->bast_list));
+	if (lock->bast_pending)
+		mlog(0, "lock has a bast getting flushed right now\n");
+
+	/* putting lock on list, add a ref */
+	dlm_lock_get(lock);
+	spin_lock(&lock->spinlock);
+	list_add_tail(&lock->bast_list, &dlm->pending_basts);
+	lock->bast_pending = 1;
+	spin_unlock(&lock->spinlock);
+}
+
+void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	mlog_entry_void();
+
+	BUG_ON(!dlm);
+	BUG_ON(!lock);
+	/* take ast_lock around the helper, which asserts that it is held */
+	spin_lock(&dlm->ast_lock);
+	__dlm_queue_bast(dlm, lock);
+	spin_unlock(&dlm->ast_lock);
+}
+
+/* Copy the lvb between lock->lksb and res, as directed by lksb->flags. */
+static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			   struct dlm_lock *lock)
+{
+	struct dlm_lockstatus *lksb = lock->lksb;
+	BUG_ON(!lksb);
+
+	/* only updates if this node masters the lockres */
+	if (res->owner == dlm->node_num) {
+		spin_lock(&res->spinlock);
+		/* check the lksb flags for the direction; GET wins over PUT */
+		if (lksb->flags & DLM_LKSB_GET_LVB) {
+			mlog(0, "getting lvb from lockres for %s node\n",
+				  lock->ml.node == dlm->node_num ? "master" :
+				  "remote");
+			memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN);
+		} else if (lksb->flags & DLM_LKSB_PUT_LVB) {
+			mlog(0, "setting lockres lvb from lksb for %s node\n",
+				  lock->ml.node == dlm->node_num ? "master" :
+				  "remote");
+			memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
+		}
+		spin_unlock(&res->spinlock);
+	}
+
+	/* reset any lvb flags on the lksb */
+	lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
+}
+
+/* Run the ast callback of a lock that belongs to this node. */
+void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+		      struct dlm_lock *lock)
+{
+	dlm_astlockfunc_t *fn;
+
+	mlog_entry_void();
+
+	fn = lock->ast;
+	BUG_ON(lock->ml.node != dlm->node_num);
+
+	/* the lvb is updated first, then the registered ast callback runs */
+	dlm_update_lvb(dlm, res, lock);
+	(*fn)(lock->astdata);
+}
+
+
+/* Run the ast for a lock that belongs to another node: update the lvb
+ * here, then hand the ast to dlm_send_proxy_ast() for that node. */
+int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+		      struct dlm_lock *lock)
+{
+	int saved_flags;
+
+	mlog_entry_void();
+
+	BUG_ON(lock->ml.node == dlm->node_num);
+
+	/* dlm_update_lvb() clears the lvb bits in lksb->flags, so the
+	 * proxy ast is given the value sampled beforehand */
+	saved_flags = lock->lksb->flags;
+	dlm_update_lvb(dlm, res, lock);
+
+	/* lock request came from another node
+	 * go do the ast over there */
+	return dlm_send_proxy_ast(dlm, res, lock, saved_flags);
+}
+
+/* Run the bast callback of a lock that belongs to this node. */
+void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+		       struct dlm_lock *lock, int blocked_type)
+{
+	mlog_entry_void();
+	BUG_ON(lock->ml.node != dlm->node_num);
+
+	/* pass the blocked level to the callback stored on the lock */
+	lock->bast(lock->astdata, blocked_type);
+}
+
+
+/* Handle a proxy ast/bast: look the lock up by cookie, run its callback. */
+int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	int ret;
+	unsigned int locklen;
+	struct dlm_ctxt *dlm = data;
+	struct dlm_lock_resource *res = NULL;
+	struct dlm_lock *lock = NULL;
+	struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
+	char *name;
+	struct list_head *iter, *head=NULL;
+	u64 cookie;
+	u32 flags;
+
+	if (!dlm_grab(dlm)) {
+		dlm_error(DLM_REJECTED);
+		return DLM_REJECTED;
+	}
+
+	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
+			"Domain %s not fully joined!\n", dlm->name);
+
+	name = past->name;
+	locklen = past->namelen;
+	cookie = be64_to_cpu(past->cookie);
+	flags = be32_to_cpu(past->flags);
+
+	if (locklen > DLM_LOCKID_NAME_MAX) {
+		ret = DLM_IVBUFLEN;
+		mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n");
+		goto leave;
+	}
+
+	if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
+	     (LKM_PUT_LVB|LKM_GET_LVB)) {
+		mlog(ML_ERROR, "both PUT and GET lvb specified\n");
+		ret = DLM_BADARGS;
+		goto leave;
+	}
+
+	mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
+		  (flags & LKM_GET_LVB ? "get lvb" : "none"));
+
+	mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type);
+
+	if (past->type != DLM_AST &&
+	    past->type != DLM_BAST) {
+		mlog(ML_ERROR, "Unknown ast type! %d, cookie=%"MLFu64", "
+		     "name=%.*s\n", past->type, cookie, locklen, name);
+		ret = DLM_IVLOCKID;
+		goto leave;
+	}
+
+	res = dlm_lookup_lockres(dlm, name, locklen);
+	if (!res) {
+		mlog(ML_ERROR, "got %sast for unknown lockres! "
+			       "cookie=%"MLFu64", name=%.*s, namelen=%u\n",
+		     past->type == DLM_AST ? "" : "b",
+		     cookie, locklen, name, locklen);
+		ret = DLM_IVLOCKID;
+		goto leave;
+	}
+
+	/* cannot get a proxy ast message if this node owns it */
+	BUG_ON(res->owner == dlm->node_num);
+
+	mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name);
+
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		mlog(0, "responding with DLM_RECOVERING!\n");
+		ret = DLM_RECOVERING;
+		goto unlock_out;
+	}
+	if (res->state & DLM_LOCK_RES_MIGRATING) {
+		mlog(0, "responding with DLM_MIGRATING!\n");
+		ret = DLM_MIGRATING;
+		goto unlock_out;
+	}
+	/* try convert queue for both ast/bast */
+	head = &res->converting;
+	lock = NULL;
+	list_for_each(iter, head) {
+		lock = list_entry (iter, struct dlm_lock, list);
+		if (be64_to_cpu(lock->ml.cookie) == cookie)
+			goto do_ast;
+	}
+
+	/* if not on convert, try blocked for ast, granted for bast */
+	if (past->type == DLM_AST)
+		head = &res->blocked;
+	else
+		head = &res->granted;
+
+	list_for_each(iter, head) {
+		lock = list_entry (iter, struct dlm_lock, list);
+		if (be64_to_cpu(lock->ml.cookie) == cookie)
+			goto do_ast;
+	}
+
+	mlog(ML_ERROR, "got %sast for unknown lock!  cookie=%"MLFu64", "
+		       "name=%.*s, namelen=%u\n",
+             past->type == DLM_AST ? "" : "b", cookie, locklen, name, locklen);
+	/* an unknown lock is logged, but the sender still gets DLM_NORMAL */
+	ret = DLM_NORMAL;
+unlock_out:
+	spin_unlock(&res->spinlock);
+	goto leave;
+
+do_ast:
+	ret = DLM_NORMAL;
+	if (past->type == DLM_AST) {
+		/* do not alter lock refcount.  switching lists. */
+		list_del_init(&lock->list);
+		list_add_tail(&lock->list, &res->granted);
+		mlog(0, "ast: adding to granted list... type=%d, "
+			  "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
+		if (lock->ml.convert_type != LKM_IVMODE) {
+			lock->ml.type = lock->ml.convert_type;
+			lock->ml.convert_type = LKM_IVMODE;
+		} else {
+			/* no convert pending; ml.type should already be right */
+		}
+
+		lock->lksb->status = DLM_NORMAL;
+
+		/* if we requested the lvb, fetch it into our lksb now */
+		if (flags & LKM_GET_LVB) {
+			BUG_ON(!(lock->lksb->flags & DLM_LKSB_GET_LVB));
+			memcpy(lock->lksb->lvb, past->lvb, DLM_LVB_LEN);
+		}
+	}
+	spin_unlock(&res->spinlock);
+	/* the callback is run only after res->spinlock has been dropped */
+	if (past->type == DLM_AST)
+		dlm_do_local_ast(dlm, res, lock);
+	else
+		dlm_do_local_bast(dlm, res, lock, past->blocked_type);
+
+leave:
+	/* res is still NULL if we bailed out before or during the lookup */
+	if (res)
+		dlm_lockres_put(res);
+
+	dlm_put(dlm);
+	return ret;
+}
+
+
+
+/* Build a dlm_proxy_ast for @lock and send it to node lock->ml.node. */
+int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			   struct dlm_lock *lock, int msg_type,
+			   int blocked_type, int flags)
+{
+	struct dlm_proxy_ast past;
+	struct kvec vec[2];
+	size_t veclen = 1;
+	int status;
+	int err;
+
+	mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n",
+		   res->lockname.len, res->lockname.name, lock->ml.node,
+		   msg_type, blocked_type);
+
+	memset(&past, 0, sizeof(past));
+	past.node_idx = dlm->node_num;
+	past.type = msg_type;
+	past.blocked_type = blocked_type;
+	past.namelen = res->lockname.len;
+	memcpy(past.name, res->lockname.name, past.namelen);
+	past.cookie = lock->ml.cookie;
+
+	vec[0].iov_base = &past;
+	vec[0].iov_len = sizeof(past);
+	if (flags & DLM_LKSB_GET_LVB) {
+		mlog(0, "returning requested LVB data\n");
+		be32_add_cpu(&past.flags, LKM_GET_LVB);
+		vec[1].iov_base = lock->lksb->lvb;
+		vec[1].iov_len = DLM_LVB_LEN;
+		veclen++;
+	}
+
+	err = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
+				     lock->ml.node, &status);
+	if (err < 0) {
+		mlog_errno(err);
+		return err;
+	}
+	if (status == DLM_RECOVERING) {
+		mlog(ML_ERROR, "sent AST to node %u, it thinks this "
+		     "node is dead!\n", lock->ml.node);
+		BUG();
+	} else if (status == DLM_MIGRATING) {
+		mlog(ML_ERROR, "sent AST to node %u, it returned "
+		     "DLM_MIGRATING!\n", lock->ml.node);
+		BUG();
+	} else if (status != DLM_NORMAL) {
+		mlog(ML_ERROR, "AST to node %u returned %d!\n",
+		     lock->ml.node, status);
+		/* ignore it */
+	}
+	return 0;
+}
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
new file mode 100644
index 0000000..3fecba0
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -0,0 +1,884 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmcommon.h
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifndef DLMCOMMON_H
+#define DLMCOMMON_H
+
+#include <linux/kref.h>
+
+#define DLM_HB_NODE_DOWN_PRI     (0xf000000)
+#define DLM_HB_NODE_UP_PRI       (0x8000000)
+
+#define DLM_LOCKID_NAME_MAX    32
+
+#define DLM_DOMAIN_NAME_MAX_LEN    255
+#define DLM_LOCK_RES_OWNER_UNKNOWN     O2NM_MAX_NODES
+#define DLM_THREAD_SHUFFLE_INTERVAL    5     // flush everything every 5 passes
+#define DLM_THREAD_MS                  200   // flush at least every 200 ms
+
+#define DLM_HASH_BITS     7
+#define DLM_HASH_SIZE     (1 << DLM_HASH_BITS)
+#define DLM_HASH_MASK     (DLM_HASH_SIZE - 1)
+
+enum dlm_ast_type {
+	DLM_AST = 0,
+	DLM_BAST,
+	DLM_ASTUNLOCK
+};
+
+
+#define LKM_VALID_FLAGS (LKM_VALBLK | LKM_CONVERT | LKM_UNLOCK | \
+			 LKM_CANCEL | LKM_INVVALBLK | LKM_FORCE | \
+			 LKM_RECOVERY | LKM_LOCAL | LKM_NOQUEUE)
+
+#define DLM_RECOVERY_LOCK_NAME       "$RECOVERY"
+#define DLM_RECOVERY_LOCK_NAME_LEN   9
+
+static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)
+{
+	if (name_len != DLM_RECOVERY_LOCK_NAME_LEN)
+		return 0;
+	/* same length, so the bytes decide whether this is the reserved name */
+	return memcmp(lock_name, DLM_RECOVERY_LOCK_NAME, name_len) == 0;
+}
+
+#define DLM_RECO_STATE_ACTIVE  0x0001
+
+struct dlm_recovery_ctxt
+{
+	struct list_head resources;	/* via dlm_lock_resource.recovering */
+	struct list_head received;
+	struct list_head node_data;	/* presumably dlm_reco_node_data.list */
+	u8  new_master;
+	u8  dead_node;
+	u16 state;			/* presumably DLM_RECO_STATE_* bits */
+	unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	wait_queue_head_t event;
+};
+
+enum dlm_ctxt_state {
+	DLM_CTXT_NEW = 0,
+	DLM_CTXT_JOINED,
+	DLM_CTXT_IN_SHUTDOWN,
+	DLM_CTXT_LEAVING,
+};
+
+struct dlm_ctxt
+{
+	struct list_head list;
+	struct list_head *resources;
+	struct list_head dirty_list;
+	struct list_head purge_list;
+	struct list_head pending_asts;	/* dlm_lock.ast_list entries */
+	struct list_head pending_basts;	/* dlm_lock.bast_list entries */
+	unsigned int purge_count;
+	spinlock_t spinlock;
+	spinlock_t ast_lock;		/* held while queueing asts and basts */
+	char *name;
+	u8 node_num;
+	u32 key;
+	u8  joining_node;		/* set by __dlm_set_joining_node() */
+	wait_queue_head_t dlm_join_events; /* woken when joining_node is set */
+	unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	struct dlm_recovery_ctxt reco;
+	spinlock_t master_lock;
+	struct list_head master_list;
+	struct list_head mle_hb_events;
+
+	/* these give a really vague idea of the system load */
+	atomic_t local_resources;
+	atomic_t remote_resources;
+	atomic_t unknown_resources;
+
+	/* NOTE: Next three are protected by dlm_domain_lock */
+	struct kref dlm_refs;
+	enum dlm_ctxt_state dlm_state;
+	unsigned int num_joins;
+
+	struct o2hb_callback_func dlm_hb_up;
+	struct o2hb_callback_func dlm_hb_down;
+	struct task_struct *dlm_thread_task;
+	struct task_struct *dlm_reco_thread_task;
+	wait_queue_head_t dlm_thread_wq;
+	wait_queue_head_t dlm_reco_thread_wq;
+	wait_queue_head_t ast_wq;
+	wait_queue_head_t migration_wq;
+
+	struct work_struct dispatched_work;
+	struct list_head work_list;
+	spinlock_t work_lock;
+	struct list_head dlm_domain_handlers;
+	struct list_head	dlm_eviction_callbacks;
+};
+
+/* these keventd work queue items are for less-frequently
+ * called functions that cannot be directly called from the
+ * net message handlers for some reason, usually because
+ * they need to send net messages of their own. */
+void dlm_dispatch_work(void *data);
+
+struct dlm_lock_resource;
+struct dlm_work_item;
+
+typedef void (dlm_workfunc_t)(struct dlm_work_item *, void *);
+
+struct dlm_request_all_locks_priv
+{
+	u8 reco_master;
+	u8 dead_node;
+};
+
+struct dlm_mig_lockres_priv
+{
+	struct dlm_lock_resource *lockres;
+	u8 real_master;
+};
+
+struct dlm_assert_master_priv
+{
+	struct dlm_lock_resource *lockres;
+	u8 request_from;
+	u32 flags;
+	unsigned ignore_higher:1;
+};
+
+
+struct dlm_work_item
+{
+	struct list_head list;
+	dlm_workfunc_t *func;
+	struct dlm_ctxt *dlm;
+	void *data;
+	union {
+		struct dlm_request_all_locks_priv ral;
+		struct dlm_mig_lockres_priv ml;
+		struct dlm_assert_master_priv am;
+	} u;
+};
+
+static inline void dlm_init_work_item(struct dlm_ctxt *dlm,
+				      struct dlm_work_item *i,
+				      dlm_workfunc_t *f, void *data)
+{
+	memset(i, 0, sizeof(struct dlm_work_item));
+	INIT_LIST_HEAD(&i->list);
+	i->dlm = dlm;  /* the caller must already hold a dlm_grab on this! */
+	i->func = f;
+	i->data = data;
+}
+
+
+/* Record @node as the joining node and wake dlm_join_events waiters. */
+static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
+					  u8 node)
+{
+	assert_spin_locked(&dlm->spinlock);
+
+	dlm->joining_node = node;
+	wake_up(&dlm->dlm_join_events);
+}
+
+#define DLM_LOCK_RES_UNINITED             0x00000001
+#define DLM_LOCK_RES_RECOVERING           0x00000002
+#define DLM_LOCK_RES_READY                0x00000004
+#define DLM_LOCK_RES_DIRTY                0x00000008
+#define DLM_LOCK_RES_IN_PROGRESS          0x00000010
+#define DLM_LOCK_RES_MIGRATING            0x00000020
+
+#define DLM_PURGE_INTERVAL_MS   (8 * 1000)
+
+struct dlm_lock_resource
+{
+	/* WARNING: Please see the comment in dlm_init_lockres before
+	 * adding fields here. */
+	struct list_head list;
+	struct kref      refs;
+
+	/* queues of dlm_lock, linked through dlm_lock.list.  please keep
+	 * these 3 in this order, some funcs want to iterate over all lists */
+	struct list_head granted;
+	struct list_head converting;
+	struct list_head blocked;
+
+	struct list_head dirty;
+	struct list_head recovering; /* dlm_recovery_ctxt.resources list */
+
+	/* unused lock resources have their last_used stamped and are
+	 * put on a list for the dlm thread to run. */
+	struct list_head purge;
+	unsigned long    last_used;
+
+	unsigned migration_pending:1;
+	atomic_t asts_reserved;
+	spinlock_t spinlock;
+	wait_queue_head_t wq;
+	u8  owner;              /* node which owns the lock resource, or unknown */
+	u16 state;              /* DLM_LOCK_RES_* bits */
+	struct qstr lockname;
+	char lvb[DLM_LVB_LEN];  /* copied under ->spinlock by dlm_update_lvb() */
+};
+
+struct dlm_migratable_lock
+{
+	__be64 cookie;
+
+	/* these 3 are just padding for the in-memory structure, but
+	 * list and flags are actually used when sent over the wire */
+	__be16 pad1;
+	u8 list;  /* 0=granted, 1=converting, 2=blocked */
+	u8 flags;
+
+	s8 type;		/* current mode, an LKM_*MODE value */
+	s8 convert_type;	/* requested mode, LKM_IVMODE if no convert */
+	s8 highest_blocked;	/* LKM_IVMODE: nothing recorded as blocked */
+	u8 node;		/* node number the lock belongs to */
+};  /* 16 bytes */
+
+struct dlm_lock
+{
+	struct dlm_migratable_lock ml;
+
+	struct list_head list;		/* entry on one of the lockres queues */
+	struct list_head ast_list;	/* entry on dlm_ctxt.pending_asts */
+	struct list_head bast_list;	/* entry on dlm_ctxt.pending_basts */
+	struct dlm_lock_resource *lockres;
+	spinlock_t spinlock;
+	struct kref lock_refs;
+
+	/* ast and bast must be callable while holding a spinlock! */
+	dlm_astlockfunc_t *ast;
+	dlm_bastlockfunc_t *bast;
+	void *astdata;			/* first argument of ast and bast */
+	struct dlm_lockstatus *lksb;
+	unsigned ast_pending:1,
+		 bast_pending:1,
+		 convert_pending:1,
+		 lock_pending:1,
+		 cancel_pending:1,
+		 unlock_pending:1,
+		 lksb_kernel_allocated:1;
+};
+
+/* NOTE: the same DLM_LKSB_* values are also defined in dlmapi.h */
+#define DLM_LKSB_UNUSED1           0x01
+#define DLM_LKSB_PUT_LVB           0x02
+#define DLM_LKSB_GET_LVB           0x04
+#define DLM_LKSB_UNUSED2           0x08
+#define DLM_LKSB_UNUSED3           0x10
+#define DLM_LKSB_UNUSED4           0x20
+#define DLM_LKSB_UNUSED5           0x40
+#define DLM_LKSB_UNUSED6           0x80
+
+
+enum dlm_lockres_list {
+	DLM_GRANTED_LIST = 0,
+	DLM_CONVERTING_LIST,
+	DLM_BLOCKED_LIST
+};
+
+static inline struct list_head *
+dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
+{
+	switch (idx) {
+	case DLM_GRANTED_LIST:
+		return &res->granted;
+	case DLM_CONVERTING_LIST:
+		return &res->converting;
+	case DLM_BLOCKED_LIST:
+		return &res->blocked;
+	}
+	BUG();	/* any other index is a caller error */
+	return NULL;
+}
+
+
+
+
+struct dlm_node_iter
+{
+	unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int curnode;
+};
+
+
+enum {
+	DLM_MASTER_REQUEST_MSG    = 500,
+	DLM_UNUSED_MSG1,         /* 501 */
+	DLM_ASSERT_MASTER_MSG,	 /* 502 */
+	DLM_CREATE_LOCK_MSG,	 /* 503 */
+	DLM_CONVERT_LOCK_MSG,	 /* 504 */
+	DLM_PROXY_AST_MSG,	 /* 505 */
+	DLM_UNLOCK_LOCK_MSG,	 /* 506 */
+	DLM_UNUSED_MSG2,	 /* 507 */
+	DLM_MIGRATE_REQUEST_MSG, /* 508 */
+	DLM_MIG_LOCKRES_MSG, 	 /* 509 */
+	DLM_QUERY_JOIN_MSG,	 /* 510 */
+	DLM_ASSERT_JOINED_MSG,	 /* 511 */
+	DLM_CANCEL_JOIN_MSG,	 /* 512 */
+	DLM_EXIT_DOMAIN_MSG,	 /* 513 */
+	DLM_MASTER_REQUERY_MSG,	 /* 514 */
+	DLM_LOCK_REQUEST_MSG,	 /* 515 */
+	DLM_RECO_DATA_DONE_MSG,	 /* 516 */
+	DLM_BEGIN_RECO_MSG,	 /* 517 */
+	DLM_FINALIZE_RECO_MSG	 /* 518 */
+};
+
+struct dlm_reco_node_data
+{
+	int state;
+	u8 node_num;
+	struct list_head list;
+};
+
+enum {
+	DLM_RECO_NODE_DATA_DEAD = -1,
+	DLM_RECO_NODE_DATA_INIT = 0,
+	DLM_RECO_NODE_DATA_REQUESTING,
+	DLM_RECO_NODE_DATA_REQUESTED,
+	DLM_RECO_NODE_DATA_RECEIVING,
+	DLM_RECO_NODE_DATA_DONE,
+	DLM_RECO_NODE_DATA_FINALIZE_SENT,
+};
+
+
+enum {
+	DLM_MASTER_RESP_NO = 0,
+	DLM_MASTER_RESP_YES,
+	DLM_MASTER_RESP_MAYBE,
+	DLM_MASTER_RESP_ERROR
+};
+
+
+struct dlm_master_request
+{
+	u8 node_idx;
+	u8 namelen;
+	__be16 pad1;
+	__be32 flags;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+#define DLM_ASSERT_MASTER_MLE_CLEANUP      0x00000001
+#define DLM_ASSERT_MASTER_REQUERY          0x00000002
+#define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004
+struct dlm_assert_master
+{
+	u8 node_idx;
+	u8 namelen;
+	__be16 pad1;
+	__be32 flags;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_migrate_request
+{
+	u8 master;
+	u8 new_master;
+	u8 namelen;
+	u8 pad1;
+	__be32 pad2;
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_master_requery
+{
+	u8 pad1;
+	u8 pad2;
+	u8 node_idx;
+	u8 namelen;
+	__be32 pad3;
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+#define DLM_MRES_RECOVERY   0x01
+#define DLM_MRES_MIGRATION  0x02
+#define DLM_MRES_ALL_DONE   0x04
+
+/*
+ * We would like to get one whole lockres into a single network
+ * message whenever possible.  Generally speaking, there will be
+ * at most one dlm_lock on a lockres for each node in the cluster,
+ * plus (infrequently) any additional locks coming in from userdlm.
+ *
+ * struct _dlm_lockres_page
+ * {
+ * 	dlm_migratable_lockres mres;
+ * 	dlm_migratable_lock ml[DLM_MAX_MIGRATABLE_LOCKS];
+ * 	u8 pad[DLM_MIG_LOCKRES_RESERVED];
+ * };
+ *
+ * from ../cluster/tcp.h
+ *    NET_MAX_PAYLOAD_BYTES  (4096 - sizeof(net_msg))
+ *    (roughly 4080 bytes)
+ * and sizeof(dlm_migratable_lockres) = 112 bytes
+ * and sizeof(dlm_migratable_lock) = 16 bytes
+ *
+ * Choosing DLM_MAX_MIGRATABLE_LOCKS=240 and
+ * DLM_MIG_LOCKRES_RESERVED=128 means we have this:
+ *
+ *  (DLM_MAX_MIGRATABLE_LOCKS * sizeof(dlm_migratable_lock)) +
+ *     sizeof(dlm_migratable_lockres) + DLM_MIG_LOCKRES_RESERVED =
+ *        NET_MAX_PAYLOAD_BYTES
+ *  (240 * 16) + 112 + 128 = 4080
+ *
+ * So a lockres would need more than 240 locks before it would
+ * use more than one network packet to recover.  Not too bad.
+ */
+#define DLM_MAX_MIGRATABLE_LOCKS   240
+
+struct dlm_migratable_lockres
+{
+	u8 master;
+	u8 lockname_len;
+	u8 num_locks;    // locks sent in this structure
+	u8 flags;
+	__be32 total_locks; // locks to be sent for this migration cookie
+	__be64 mig_cookie;  // cookie for this lockres migration
+			 // or zero if not needed
+	// 16 bytes
+	u8 lockname[DLM_LOCKID_NAME_MAX];
+	// 48 bytes
+	u8 lvb[DLM_LVB_LEN];
+	// 112 bytes
+	struct dlm_migratable_lock ml[0];  // 16 bytes each, begins at byte 112
+};
+#define DLM_MIG_LOCKRES_MAX_LEN  \
+	(sizeof(struct dlm_migratable_lockres) + \
+	 (sizeof(struct dlm_migratable_lock) * \
+	  DLM_MAX_MIGRATABLE_LOCKS) )
+
+/* from above, 128 bytes
+ * for some undetermined future use */
+#define DLM_MIG_LOCKRES_RESERVED   (NET_MAX_PAYLOAD_BYTES - \
+				    DLM_MIG_LOCKRES_MAX_LEN)
+
+struct dlm_create_lock
+{
+	__be64 cookie;
+
+	__be32 flags;
+	u8 pad1;
+	u8 node_idx;
+	s8 requested_type;
+	u8 namelen;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_convert_lock
+{
+	__be64 cookie;
+
+	__be32 flags;
+	u8 pad1;
+	u8 node_idx;
+	s8 requested_type;
+	u8 namelen;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+
+	s8 lvb[0];
+};
+#define DLM_CONVERT_LOCK_MAX_LEN  (sizeof(struct dlm_convert_lock)+DLM_LVB_LEN)
+
+struct dlm_unlock_lock
+{
+	__be64 cookie;
+
+	__be32 flags;
+	__be16 pad1;
+	u8 node_idx;
+	u8 namelen;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+
+	s8 lvb[0];
+};
+#define DLM_UNLOCK_LOCK_MAX_LEN  (sizeof(struct dlm_unlock_lock)+DLM_LVB_LEN)
+
+struct dlm_proxy_ast
+{
+	__be64 cookie;
+
+	__be32 flags;
+	u8 node_idx;
+	u8 type;
+	u8 blocked_type;
+	u8 namelen;
+
+	u8 name[O2NM_MAX_NAME_LEN];
+
+	s8 lvb[0];
+};
+#define DLM_PROXY_AST_MAX_LEN  (sizeof(struct dlm_proxy_ast)+DLM_LVB_LEN)
+
+#define DLM_MOD_KEY (0x666c6172)
+enum dlm_query_join_response {
+	JOIN_DISALLOW = 0,
+	JOIN_OK,
+	JOIN_OK_NO_MAP,
+};
+
+struct dlm_lock_request
+{
+	u8 node_idx;
+	u8 dead_node;
+	__be16 pad1;
+	__be32 pad2;
+};
+
+struct dlm_reco_data_done
+{
+	u8 node_idx;
+	u8 dead_node;
+	__be16 pad1;
+	__be32 pad2;
+
+	/* unused for now */
+	/* eventually we can use this to attempt
+	 * lvb recovery based on each node's info */
+	u8 reco_lvb[DLM_LVB_LEN];
+};
+
+struct dlm_begin_reco
+{
+	u8 node_idx;
+	u8 dead_node;
+	__be16 pad1;
+	__be32 pad2;
+};
+
+
+struct dlm_query_join_request
+{
+	u8 node_idx;
+	u8 pad1[2];
+	u8 name_len;
+	u8 domain[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_assert_joined
+{
+	u8 node_idx;
+	u8 pad1[2];
+	u8 name_len;
+	u8 domain[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_cancel_join
+{
+	u8 node_idx;
+	u8 pad1[2];
+	u8 name_len;
+	u8 domain[O2NM_MAX_NAME_LEN];
+};
+
+struct dlm_exit_domain
+{
+	u8 node_idx;
+	u8 pad1[3];
+};
+
+struct dlm_finalize_reco
+{
+	u8 node_idx;
+	u8 dead_node;
+	__be16 pad1;
+	__be32 pad2;
+};
+
+static inline enum dlm_status
+__dlm_lockres_state_to_status(struct dlm_lock_resource *res)
+{
+	/* caller must hold res->spinlock */
+	assert_spin_locked(&res->spinlock);
+
+	/* the first state bit found, in this fixed order, decides */
+	if (res->state & DLM_LOCK_RES_RECOVERING)
+		return DLM_RECOVERING;
+	if (res->state & DLM_LOCK_RES_MIGRATING)
+		return DLM_MIGRATING;
+	if (res->state & DLM_LOCK_RES_IN_PROGRESS)
+		return DLM_FORWARD;
+
+	return DLM_NORMAL;
+}
+
+struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
+			       struct dlm_lockstatus *lksb);
+void dlm_lock_get(struct dlm_lock *lock);
+void dlm_lock_put(struct dlm_lock *lock);
+
+void dlm_lock_attach_lockres(struct dlm_lock *lock,
+			     struct dlm_lock_resource *res);
+
+int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data);
+
+void dlm_revert_pending_convert(struct dlm_lock_resource *res,
+				struct dlm_lock *lock);
+void dlm_revert_pending_lock(struct dlm_lock_resource *res,
+			     struct dlm_lock *lock);
+
+int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data);
+void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
+			       struct dlm_lock *lock);
+void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
+			       struct dlm_lock *lock);
+
+int dlm_launch_thread(struct dlm_ctxt *dlm);
+void dlm_complete_thread(struct dlm_ctxt *dlm);
+int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
+void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
+void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
+
+void dlm_put(struct dlm_ctxt *dlm);
+struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
+int dlm_domain_fully_joined(struct dlm_ctxt *dlm);
+
+void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
+			      struct dlm_lock_resource *res);
+void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
+			    struct dlm_lock_resource *res);
+void dlm_purge_lockres(struct dlm_ctxt *dlm,
+		       struct dlm_lock_resource *lockres);
+void dlm_lockres_get(struct dlm_lock_resource *res);
+void dlm_lockres_put(struct dlm_lock_resource *res);
+void __dlm_unhash_lockres(struct dlm_lock_resource *res);
+void __dlm_insert_lockres(struct dlm_ctxt *dlm,
+			  struct dlm_lock_resource *res);
+struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
+						const char *name,
+						unsigned int len);
+struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
+					      const char *name,
+					      unsigned int len);
+
+int dlm_is_host_down(int errno);
+void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
+			      struct dlm_lock_resource *res,
+			      u8 owner);
+struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
+						 const char *lockid,
+						 int flags);
+struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
+					  const char *name,
+					  unsigned int namelen);
+
+void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void dlm_do_local_ast(struct dlm_ctxt *dlm,
+		      struct dlm_lock_resource *res,
+		      struct dlm_lock *lock);
+int dlm_do_remote_ast(struct dlm_ctxt *dlm,
+		      struct dlm_lock_resource *res,
+		      struct dlm_lock *lock);
+void dlm_do_local_bast(struct dlm_ctxt *dlm,
+		       struct dlm_lock_resource *res,
+		       struct dlm_lock *lock,
+		       int blocked_type);
+int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm,
+			   struct dlm_lock_resource *res,
+			   struct dlm_lock *lock,
+			   int msg_type,
+			   int blocked_type, int flags);
+static inline int dlm_send_proxy_bast(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      struct dlm_lock *lock,
+				      int blocked_type)
+{
+	/* a bast is a proxy ast message of type DLM_BAST with flags 0 */
+	return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_BAST, blocked_type, 0);
+}
+
+static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_lock *lock,
+				     int flags)
+{
+	/* message type DLM_AST; blocked_type is passed as 0 */
+	return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_AST, 0, flags);
+}
+
+void dlm_print_one_lock_resource(struct dlm_lock_resource *res);
+void __dlm_print_one_lock_resource(struct dlm_lock_resource *res);
+
+u8 dlm_nm_this_node(struct dlm_ctxt *dlm);
+void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+
+
+int dlm_nm_init(struct dlm_ctxt *dlm);
+int dlm_heartbeat_init(struct dlm_ctxt *dlm);
+void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data);
+void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data);
+
+int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+int dlm_migrate_lockres(struct dlm_ctxt *dlm,
+			struct dlm_lock_resource *res,
+			u8 target);
+int dlm_finish_migration(struct dlm_ctxt *dlm,
+			 struct dlm_lock_resource *res,
+			 u8 old_master);
+void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
+			     struct dlm_lock_resource *res);
+void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res);
+
+int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data);
+
+int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
+			       struct dlm_lock_resource *res,
+			       int ignore_higher,
+			       u8 request_from,
+			       u32 flags);
+
+
+int dlm_send_one_lockres(struct dlm_ctxt *dlm,
+			 struct dlm_lock_resource *res,
+			 struct dlm_migratable_lockres *mres,
+			 u8 send_to,
+			 u8 flags);
+void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
+				       struct dlm_lock_resource *res);
+
+/* will exit holding res->spinlock, but may drop in function */
+void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags);
+void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags);
+
+/* will exit holding res->spinlock, but may drop in function */
+static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
+{
+	int busy = DLM_LOCK_RES_IN_PROGRESS | DLM_LOCK_RES_RECOVERING |
+		   DLM_LOCK_RES_MIGRATING;
+	__dlm_wait_on_lockres_flags(res, busy);
+}
+
+
+int dlm_init_mle_cache(void);
+void dlm_destroy_mle_cache(void);
+void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up);
+void dlm_clean_master_list(struct dlm_ctxt *dlm,
+			   u8 dead_node);
+int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+
+
+static inline const char * dlm_lock_mode_name(int mode)
+{
+	/* short printable name of a lock mode, as used in log messages */
+	if (mode == LKM_EXMODE)
+		return "EX";
+	if (mode == LKM_PRMODE)
+		return "PR";
+	if (mode == LKM_NLMODE)
+		return "NL";
+	/* any other value */
+	return "UNKNOWN";
+}
+
+
+static inline int dlm_lock_compatible(int existing, int request)
+{
+	int shared_ok;
+
+	/* NO_LOCK on either side is compatible with everything */
+	if (existing == LKM_NLMODE || request == LKM_NLMODE)
+		return 1;
+
+	/*
+	 * Neither side is NO_LOCK from here on.  A request for EX is
+	 * never compatible; any other request (PR in practice) is
+	 * compatible only with an existing PR.
+	 */
+	shared_ok = (existing == LKM_PRMODE);
+
+	return request != LKM_EXMODE && shared_ok;
+}
+
+static inline int dlm_lock_on_list(struct list_head *head,
+				   struct dlm_lock *lock)
+{
+	struct dlm_lock *cur;
+
+	/* linear scan of head, comparing entries by pointer identity */
+	list_for_each_entry(cur, head, list) {
+		if (cur == lock)
+			return 1;
+	}
+
+	return 0;
+}
+
+
+static inline enum dlm_status dlm_err_to_dlm_status(int err)
+{
+	/* translate a negative errno into a dlm_status; order matters */
+	if (err == -ENOMEM)
+		return DLM_SYSERR;
+	/* a timeout, or any error o2net_link_down() recognises */
+	if (err == -ETIMEDOUT || o2net_link_down(err, NULL))
+		return DLM_NOLOCKMGR;
+	if (err == -EINVAL)
+		return DLM_BADPARAM;
+	if (err == -ENAMETOOLONG)
+		return DLM_IVBUFLEN;
+	/* nothing more specific applies */
+	return DLM_BADARGS;
+}
+
+
+static inline void dlm_node_iter_init(unsigned long *map,
+				      struct dlm_node_iter *iter)
+{
+	iter->curnode = -1;	/* first dlm_node_iter_next() starts at bit 0 */
+	memcpy(iter->node_map, map, sizeof(iter->node_map));
+}
+
+static inline int dlm_node_iter_next(struct dlm_node_iter *iter)
+{
+	int next = find_next_bit(iter->node_map, O2NM_MAX_NODES,
+				 iter->curnode + 1);
+	if (next < O2NM_MAX_NODES) {
+		iter->curnode = next;
+		return next;
+	}
+	iter->curnode = O2NM_MAX_NODES;	/* exhausted: park at the end */
+	return -ENOENT;
+}
+
+
+
+#endif /* DLMCOMMON_H */
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
new file mode 100644
index 0000000..6001b22
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -0,0 +1,530 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmconvert.c
+ *
+ * underlying calls for lock conversion
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+
+#include "dlmconvert.h"
+
+#define MLOG_MASK_PREFIX ML_DLM
+#include "cluster/masklog.h"
+
+/* NOTE: __dlmconvert_master is the only function in here that
+ * needs a spinlock held on entry (res->spinlock) and it is the
+ * only one that holds a lock on exit (res->spinlock).
+ * All other functions in here need no locks and drop all of
+ * the locks that they acquire. */
+static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
+					   struct dlm_lock_resource *res,
+					   struct dlm_lock *lock, int flags,
+					   int type, int *call_ast,
+					   int *kick_thread);
+static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
+					   struct dlm_lock_resource *res,
+					   struct dlm_lock *lock, int flags, int type);
+
+/*
+ * converts a lock on a lockres owned by the local node; this is only
+ * called directly by dlmlock().  an ast is reserved before converting.
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: see __dlmconvert_master
+ */
+enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  struct dlm_lock *lock, int flags, int type)
+{
+	int call_ast = 0, kick_thread = 0;
+	enum dlm_status status;
+
+	spin_lock(&res->spinlock);
+	/* may wait here: we are not in a network handler, this is fine */
+	__dlm_wait_on_lockres(res);
+	__dlm_lockres_reserve_ast(res);
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+
+	status = __dlmconvert_master(dlm, res, lock, flags, type,
+				     &call_ast, &kick_thread);
+
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+	if (status != DLM_NORMAL && status != DLM_NOTQUEUED)
+		dlm_error(status);
+
+	/* the reservation above is either used by the ast or given back */
+	if (call_ast)
+		dlm_queue_ast(dlm, lock);
+	else
+		dlm_lockres_release_ast(dlm, res);
+
+	if (kick_thread)
+		dlm_kick_thread(dlm, res);
+
+	return status;
+}
+
+/* performs lock conversion at the lockres master site: grants the new
+ * mode at once when it can, otherwise moves the lock to res->converting
+ * locking:
+ *   caller needs:  res->spinlock
+ *   taken:         takes and drops lock->spinlock
+ *   held on exit:  res->spinlock
+ * returns: DLM_NORMAL (granted or queued), DLM_NOTQUEUED, DLM_DENIED
+ *   call_ast: set when granted; kick_thread: set whenever DLM_NORMAL
+ */
+static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
+					   struct dlm_lock_resource *res,
+					   struct dlm_lock *lock, int flags,
+					   int type, int *call_ast,
+					   int *kick_thread)
+{
+	enum dlm_status status = DLM_NORMAL;
+	struct list_head *iter;
+	struct dlm_lock *tmplock=NULL;
+
+	assert_spin_locked(&res->spinlock);
+
+	mlog_entry("type=%d, convert_type=%d, new convert_type=%d\n",
+		   lock->ml.type, lock->ml.convert_type, type);
+
+	spin_lock(&lock->spinlock);
+
+	/* already converting? */
+	if (lock->ml.convert_type != LKM_IVMODE) {
+		mlog(ML_ERROR, "attempted to convert a lock with a lock "
+		     "conversion pending\n");
+		status = DLM_DENIED;
+		goto unlock_exit;
+	}
+
+	/* must be on grant queue to convert */
+	if (!dlm_lock_on_list(&res->granted, lock)) {
+		mlog(ML_ERROR, "attempted to convert a lock not on grant "
+		     "queue\n");
+		status = DLM_DENIED;
+		goto unlock_exit;
+	}
+
+	if (flags & LKM_VALBLK) {
+		switch (lock->ml.type) {
+			case LKM_EXMODE:
+				/* EX + LKM_VALBLK + convert == set lvb */
+				mlog(0, "will set lvb: converting %s->%s\n",
+				     dlm_lock_mode_name(lock->ml.type),
+				     dlm_lock_mode_name(type));
+				lock->lksb->flags |= DLM_LKSB_PUT_LVB;
+				break;
+			case LKM_PRMODE:
+			case LKM_NLMODE:
+				/* refetch if new level is not NL */
+				if (type > LKM_NLMODE) {
+					mlog(0, "will fetch new value into "
+					     "lvb: converting %s->%s\n",
+					     dlm_lock_mode_name(lock->ml.type),
+					     dlm_lock_mode_name(type));
+					lock->lksb->flags |= DLM_LKSB_GET_LVB;
+				} else {
+					mlog(0, "will NOT fetch new value "
+					     "into lvb: converting %s->%s\n",
+					     dlm_lock_mode_name(lock->ml.type),
+					     dlm_lock_mode_name(type));
+					flags &= ~(LKM_VALBLK);
+				}
+				break;
+		}
+	}
+
+
+	/* in-place downconvert?  same or lower mode is granted unchecked */
+	if (type <= lock->ml.type)
+		goto grant;
+
+	/* upconvert from here on: must not conflict with any other lock */
+	status = DLM_NORMAL;
+	list_for_each(iter, &res->granted) {
+		tmplock = list_entry(iter, struct dlm_lock, list);
+		if (tmplock == lock)
+			continue;
+		if (!dlm_lock_compatible(tmplock->ml.type, type))
+			goto switch_queues;
+	}
+
+	list_for_each(iter, &res->converting) {
+		tmplock = list_entry(iter, struct dlm_lock, list);
+		if (!dlm_lock_compatible(tmplock->ml.type, type))
+			goto switch_queues;
+		/* existing conversion requests take precedence */
+		if (!dlm_lock_compatible(tmplock->ml.convert_type, type))
+			goto switch_queues;
+	}
+
+	/* fall thru to grant */
+
+grant:
+	mlog(0, "res %.*s, granting %s lock\n", res->lockname.len,
+	     res->lockname.name, dlm_lock_mode_name(type));
+	/* grant now.  NOTE(review): log below says nonlocal but tests local */
+	lock->lksb->status = DLM_NORMAL;
+	if (lock->ml.node == dlm->node_num)
+		mlog(0, "doing in-place convert for nonlocal lock\n");
+	lock->ml.type = type;
+	status = DLM_NORMAL;
+	*call_ast = 1;
+	goto unlock_exit;
+
+switch_queues:
+	if (flags & LKM_NOQUEUE) {
+		mlog(0, "failed to convert NOQUEUE lock %.*s from "
+		     "%d to %d...\n", res->lockname.len, res->lockname.name,
+		     lock->ml.type, type);
+		status = DLM_NOTQUEUED;
+		goto unlock_exit;
+	}
+	mlog(0, "res %.*s, queueing...\n", res->lockname.len,
+	     res->lockname.name);
+
+	lock->ml.convert_type = type;
+	/* do not alter lock refcount.  switching lists. */
+	list_del_init(&lock->list);
+	list_add_tail(&lock->list, &res->converting);
+
+unlock_exit:
+	spin_unlock(&lock->spinlock);
+	if (status == DLM_DENIED) {
+		__dlm_print_one_lock_resource(res);
+	}
+	if (status == DLM_NORMAL)
+		*kick_thread = 1;
+	return status;
+}
+
+void dlm_revert_pending_convert(struct dlm_lock_resource *res,
+				struct dlm_lock *lock)
+{
+	/* undo the convert and requeue on granted; refcount is unchanged */
+	lock->ml.convert_type = LKM_IVMODE;
+	lock->lksb->flags &= ~(DLM_LKSB_PUT_LVB | DLM_LKSB_GET_LVB);
+	list_del_init(&lock->list);
+	list_add_tail(&lock->list, &res->granted);
+}
+
+/* messages the master site to do lock conversion; the lock waits on the
+ * local converting queue and is put back on granted if that fails
+ * locking:
+ *   caller needs / held on exit:  none
+ *   taken:  takes and drops res->spinlock, uses DLM_LOCK_RES_IN_PROGRESS
+ * returns: DLM_NORMAL, DLM_RECOVERING, DLM_DENIED, status from remote node
+ */
+enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  struct dlm_lock *lock, int flags, int type)
+{
+	enum dlm_status status;
+
+	mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
+	     lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
+
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		mlog(0, "bailing out early since res is RECOVERING "
+		     "on secondary queue\n");
+		/* __dlm_print_one_lock_resource(res); */
+		status = DLM_RECOVERING;
+		goto bail;
+	}
+	/* will exit this call with spinlock held */
+	__dlm_wait_on_lockres(res);
+
+	if (lock->ml.convert_type != LKM_IVMODE) {
+		__dlm_print_one_lock_resource(res);
+		mlog(ML_ERROR, "converting a remote lock that is already "
+		     "converting! (cookie=%"MLFu64", conv=%d)\n",
+		     lock->ml.cookie, lock->ml.convert_type);
+		status = DLM_DENIED;
+		goto bail;
+	}
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+	/* move lock to local convert queue */
+	/* do not alter lock refcount.  switching lists. */
+	list_del_init(&lock->list);
+	list_add_tail(&lock->list, &res->converting);
+	lock->convert_pending = 1;
+	lock->ml.convert_type = type;
+
+	if (flags & LKM_VALBLK) {
+		if (lock->ml.type == LKM_EXMODE) {
+			flags |= LKM_PUT_LVB;
+			lock->lksb->flags |= DLM_LKSB_PUT_LVB;
+		} else {
+			if (lock->ml.convert_type == LKM_NLMODE)
+				flags &= ~LKM_VALBLK;
+			else {
+				flags |= LKM_GET_LVB;
+				lock->lksb->flags |= DLM_LKSB_GET_LVB;
+			}
+		}
+	}
+	spin_unlock(&res->spinlock);
+
+	/* no locks held here.
+	 * need to wait for a reply as to whether it got queued or not. */
+	status = dlm_send_remote_convert_request(dlm, res, lock, flags, type);
+
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	lock->convert_pending = 0;
+	/* if it failed, move it back to granted queue */
+	if (status != DLM_NORMAL) {
+		if (status != DLM_NOTQUEUED)
+			dlm_error(status);
+		dlm_revert_pending_convert(res, lock);
+	}
+bail:
+	spin_unlock(&res->spinlock);
+
+	/* TODO: should this be a wake_one? */
+	/* wake up any IN_PROGRESS waiters */
+	wake_up(&res->wq);
+
+	return status;
+}
+
+/* sends DLM_CONVERT_LOCK_MSG (plus the lvb if LKM_PUT_LVB) to res->owner
+ * locking:
+ *   caller needs:  none
+ *   taken:         none
+ *   held on exit:  none
+ * returns: remote node's status; DLM_RECOVERING or a mapped send error
+ */
+static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
+					   struct dlm_lock_resource *res,
+					   struct dlm_lock *lock, int flags, int type)
+{
+	struct dlm_convert_lock convert;
+	struct kvec vec[2];
+	size_t veclen = 1;
+	enum dlm_status ret;
+	int status = 0;
+	int tmpret;
+
+	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+
+	/* fixed part of the message; the cookie is copied as stored */
+	memset(&convert, 0, sizeof(struct dlm_convert_lock));
+	convert.cookie = lock->ml.cookie;
+	convert.flags = cpu_to_be32(flags);
+	convert.node_idx = dlm->node_num;
+	convert.requested_type = type;
+	convert.namelen = res->lockname.len;
+	memcpy(convert.name, res->lockname.name, convert.namelen);
+
+	vec[0].iov_base = &convert;
+	vec[0].iov_len = sizeof(struct dlm_convert_lock);
+
+	if (flags & LKM_PUT_LVB) {
+		/* extra data to send if we are updating lvb */
+		vec[1].iov_base = lock->lksb->lvb;
+		vec[1].iov_len = DLM_LVB_LEN;
+		veclen++;
+	}
+
+	tmpret = o2net_send_message_vec(DLM_CONVERT_LOCK_MSG, dlm->key,
+					vec, veclen, res->owner, &status);
+	if (tmpret < 0) {
+		/* the send itself failed; translate the error */
+		mlog_errno(tmpret);
+		if (!dlm_is_host_down(tmpret))
+			return dlm_err_to_dlm_status(tmpret);
+		mlog(0, "node %u died so returning DLM_RECOVERING "
+		     "from convert message!\n", res->owner);
+		return DLM_RECOVERING;
+	}
+
+	/* successfully sent and received; status is already a dlm_status */
+	ret = status;
+	if (ret == DLM_RECOVERING)
+		mlog(0, "node %u returned DLM_RECOVERING from convert "
+		     "message!\n", res->owner);
+	else if (ret == DLM_MIGRATING)
+		mlog(0, "node %u returned DLM_MIGRATING from convert "
+		     "message!\n", res->owner);
+	else if (ret == DLM_FORWARD)
+		mlog(0, "node %u returned DLM_FORWARD from convert "
+		     "message!\n", res->owner);
+	else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
+		dlm_error(ret);
+
+	return ret;
+}
+
+/* handler for DLM_CONVERT_LOCK_MSG on master site
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: DLM_REJECTED, DLM_IVBUFLEN, DLM_BADARGS, DLM_IVLOCKID, a busy
+ *          lockres status, or the status from __dlmconvert_master
+ */
+int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	struct list_head *iter;
+	struct dlm_lock *lock = NULL;
+	struct dlm_lockstatus *lksb;
+	enum dlm_status status = DLM_NORMAL;
+	u32 flags;
+	int call_ast = 0, kick_thread = 0, ast_reserved = 0;
+
+	if (!dlm_grab(dlm)) {
+		dlm_error(DLM_REJECTED);
+		return DLM_REJECTED;
+	}
+
+	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
+			"Domain %s not fully joined!\n", dlm->name);
+
+	if (cnv->namelen > DLM_LOCKID_NAME_MAX) {
+		status = DLM_IVBUFLEN;
+		dlm_error(status);
+		goto leave;
+	}
+
+	flags = be32_to_cpu(cnv->flags);
+
+	if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
+	     (LKM_PUT_LVB|LKM_GET_LVB)) {
+		mlog(ML_ERROR, "both PUT and GET lvb specified\n");
+		status = DLM_BADARGS;
+		goto leave;
+	}
+
+	mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
+	     (flags & LKM_GET_LVB ? "get lvb" : "none"));
+
+	status = DLM_IVLOCKID;
+	res = dlm_lookup_lockres(dlm, cnv->name, cnv->namelen);
+	if (!res) {
+		dlm_error(status);
+		goto leave;
+	}
+
+	spin_lock(&res->spinlock);
+	list_for_each(iter, &res->granted) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock->ml.cookie == cnv->cookie &&
+		    lock->ml.node == cnv->node_idx) {
+			dlm_lock_get(lock);
+			break;
+		}
+		lock = NULL;	/* no match; stays NULL if the list ends */
+	}
+	spin_unlock(&res->spinlock);
+	if (!lock) {
+		status = DLM_IVLOCKID;
+		dlm_error(status);
+		goto leave;
+	}
+
+	/* found the lock */
+	lksb = lock->lksb;
+
+	/* see if caller needed to get/put lvb */
+	if (flags & LKM_PUT_LVB) {
+		BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
+		lksb->flags |= DLM_LKSB_PUT_LVB;
+		memcpy(&lksb->lvb[0], &cnv->lvb[0], DLM_LVB_LEN);
+	} else if (flags & LKM_GET_LVB) {
+		BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
+		lksb->flags |= DLM_LKSB_GET_LVB;
+	}
+
+	spin_lock(&res->spinlock);
+	status = __dlm_lockres_state_to_status(res);
+	if (status == DLM_NORMAL) {
+		__dlm_lockres_reserve_ast(res);
+		ast_reserved = 1;
+		res->state |= DLM_LOCK_RES_IN_PROGRESS;
+		status = __dlmconvert_master(dlm, res, lock, flags,
+					     cnv->requested_type,
+					     &call_ast, &kick_thread);
+		res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	}
+	spin_unlock(&res->spinlock);
+
+	if (status != DLM_NORMAL) {
+		if (status != DLM_NOTQUEUED)
+			dlm_error(status);
+		lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
+	}
+
+leave:
+	if (!lock)
+		mlog(ML_ERROR, "did not find lock to convert on grant queue! "
+		     "cookie=%"MLFu64"\n", cnv->cookie);
+	else
+		dlm_lock_put(lock);
+	/* queue the ast, or drop the reservation -- but only if one was
+	 * taken: early exits get here with res == NULL or nothing reserved */
+	if (call_ast)
+		dlm_queue_ast(dlm, lock);
+	else if (ast_reserved)
+		dlm_lockres_release_ast(dlm, res);
+
+	if (kick_thread)
+		dlm_kick_thread(dlm, res);
+
+	if (res)
+		dlm_lockres_put(res);
+
+	dlm_put(dlm);
+
+	return status;
+}
diff --git a/fs/ocfs2/dlm/dlmconvert.h b/fs/ocfs2/dlm/dlmconvert.h
new file mode 100644
index 0000000..b2e3677
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmconvert.h
@@ -0,0 +1,35 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmconvert.h
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifndef DLMCONVERT_H
+#define DLMCONVERT_H
+
+enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  struct dlm_lock *lock, int flags, int type);
+enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  struct dlm_lock *lock, int flags, int type);
+
+#endif
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
new file mode 100644
index 0000000..f339fe2
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -0,0 +1,246 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdebug.c
+ *
+ * debug functionality for the dlm
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/sysctl.h>
+#include <linux/spinlock.h>
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+#include "dlmdebug.h"
+
+#include "dlmdomain.h"
+#include "dlmdebug.h"
+
+#define MLOG_MASK_PREFIX ML_DLM
+#include "cluster/masklog.h"
+
+void dlm_print_one_lock_resource(struct dlm_lock_resource *res)
+{
+	mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
+	       res->lockname.len, res->lockname.name,
+	       res->owner, res->state);
+	spin_lock(&res->spinlock);
+	__dlm_print_one_lock_resource(res);
+	spin_unlock(&res->spinlock);
+}
+
+void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
+{
+	struct list_head *iter2;
+	struct dlm_lock *lock;
+
+	assert_spin_locked(&res->spinlock);
+
+	mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
+	       res->lockname.len, res->lockname.name,
+	       res->owner, res->state);
+	mlog(ML_NOTICE, "  last used: %lu, on purge list: %s\n",
+	     res->last_used, list_empty(&res->purge) ? "no" : "yes");
+	mlog(ML_NOTICE, "  granted queue: \n");
+	list_for_each(iter2, &res->granted) {
+		lock = list_entry(iter2, struct dlm_lock, list);
+		spin_lock(&lock->spinlock);
+		mlog(ML_NOTICE, "    type=%d, conv=%d, node=%u, "
+		       "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
+		       lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie, 
+		       list_empty(&lock->ast_list) ? 'y' : 'n',
+		       lock->ast_pending ? 'y' : 'n',
+		       list_empty(&lock->bast_list) ? 'y' : 'n',
+		       lock->bast_pending ? 'y' : 'n');
+		spin_unlock(&lock->spinlock);
+	}
+	mlog(ML_NOTICE, "  converting queue: \n");
+	list_for_each(iter2, &res->converting) {
+		lock = list_entry(iter2, struct dlm_lock, list);
+		spin_lock(&lock->spinlock);
+		mlog(ML_NOTICE, "    type=%d, conv=%d, node=%u, "
+		       "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
+		       lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie, 
+		       list_empty(&lock->ast_list) ? 'y' : 'n',
+		       lock->ast_pending ? 'y' : 'n',
+		       list_empty(&lock->bast_list) ? 'y' : 'n',
+		       lock->bast_pending ? 'y' : 'n');
+		spin_unlock(&lock->spinlock);
+	}
+	mlog(ML_NOTICE, "  blocked queue: \n");
+	list_for_each(iter2, &res->blocked) {
+		lock = list_entry(iter2, struct dlm_lock, list);
+		spin_lock(&lock->spinlock);
+		mlog(ML_NOTICE, "    type=%d, conv=%d, node=%u, "
+		       "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
+		       lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie, 
+		       list_empty(&lock->ast_list) ? 'y' : 'n',
+		       lock->ast_pending ? 'y' : 'n',
+		       list_empty(&lock->bast_list) ? 'y' : 'n',
+		       lock->bast_pending ? 'y' : 'n');
+		spin_unlock(&lock->spinlock);
+	}
+}
+
+void dlm_print_one_lock(struct dlm_lock *lockid)
+{
+	dlm_print_one_lock_resource(lockid->lockres);
+}
+EXPORT_SYMBOL_GPL(dlm_print_one_lock);
+
+void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
+{
+	struct dlm_lock_resource *res;
+	struct list_head *iter, *bucket;
+	int i;
+
+	/* check dlm before the banner below dereferences it */
+	if (!dlm || !dlm->name) {
+		mlog(ML_ERROR, "dlm=%p\n", dlm);
+		return;
+	}
+	mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
+		  dlm->name, dlm->node_num, dlm->key);
+
+	spin_lock(&dlm->spinlock);
+	for (i=0; i<DLM_HASH_SIZE; i++) {
+		bucket = &(dlm->resources[i]);
+		list_for_each(iter, bucket) {
+			res = list_entry(iter, struct dlm_lock_resource, list);
+			dlm_print_one_lock_resource(res);
+		}
+	}
+	spin_unlock(&dlm->spinlock);
+}
+
+static const char *dlm_errnames[] = {	/* indexed by enum dlm_status */
+	[DLM_NORMAL] =			"DLM_NORMAL",
+	[DLM_GRANTED] =			"DLM_GRANTED",
+	[DLM_DENIED] =			"DLM_DENIED",
+	[DLM_DENIED_NOLOCKS] =		"DLM_DENIED_NOLOCKS",
+	[DLM_WORKING] =			"DLM_WORKING",
+	[DLM_BLOCKED] =			"DLM_BLOCKED",
+	[DLM_BLOCKED_ORPHAN] =		"DLM_BLOCKED_ORPHAN",
+	[DLM_DENIED_GRACE_PERIOD] =	"DLM_DENIED_GRACE_PERIOD",
+	[DLM_SYSERR] =			"DLM_SYSERR",
+	[DLM_NOSUPPORT] =		"DLM_NOSUPPORT",
+	[DLM_CANCELGRANT] =		"DLM_CANCELGRANT",
+	[DLM_IVLOCKID] =		"DLM_IVLOCKID",
+	[DLM_SYNC] =			"DLM_SYNC",
+	[DLM_BADTYPE] =			"DLM_BADTYPE",
+	[DLM_BADRESOURCE] =		"DLM_BADRESOURCE",
+	[DLM_MAXHANDLES] =		"DLM_MAXHANDLES",
+	[DLM_NOCLINFO] =		"DLM_NOCLINFO",
+	[DLM_NOLOCKMGR] =		"DLM_NOLOCKMGR",
+	[DLM_NOPURGED] =		"DLM_NOPURGED",
+	[DLM_BADARGS] =			"DLM_BADARGS",
+	[DLM_VOID] =			"DLM_VOID",
+	[DLM_NOTQUEUED] =		"DLM_NOTQUEUED",
+	[DLM_IVBUFLEN] =		"DLM_IVBUFLEN",
+	[DLM_CVTUNGRANT] =		"DLM_CVTUNGRANT",
+	[DLM_BADPARAM] =		"DLM_BADPARAM",
+	[DLM_VALNOTVALID] =		"DLM_VALNOTVALID",
+	[DLM_REJECTED] =		"DLM_REJECTED",
+	[DLM_ABORT] =			"DLM_ABORT",
+	[DLM_CANCEL] =			"DLM_CANCEL",
+	[DLM_IVRESHANDLE] =		"DLM_IVRESHANDLE",
+	[DLM_DEADLOCK] =		"DLM_DEADLOCK",
+	[DLM_DENIED_NOASTS] =		"DLM_DENIED_NOASTS",
+	[DLM_FORWARD] =			"DLM_FORWARD",
+	[DLM_TIMEOUT] =			"DLM_TIMEOUT",
+	[DLM_IVGROUPID] =		"DLM_IVGROUPID",
+	[DLM_VERS_CONFLICT] =		"DLM_VERS_CONFLICT",
+	[DLM_BAD_DEVICE_PATH] =		"DLM_BAD_DEVICE_PATH",
+	[DLM_NO_DEVICE_PERMISSION] =	"DLM_NO_DEVICE_PERMISSION",
+	[DLM_NO_CONTROL_DEVICE] =	"DLM_NO_CONTROL_DEVICE",
+	[DLM_RECOVERING] =		"DLM_RECOVERING",
+	[DLM_MIGRATING] =		"DLM_MIGRATING",
+	[DLM_MAXSTATS] =		"DLM_MAXSTATS",
+};
+
+static const char *dlm_errmsgs[] = {
+	[DLM_NORMAL] = 			"request in progress",
+	[DLM_GRANTED] = 		"request granted",
+	[DLM_DENIED] = 			"request denied",
+	[DLM_DENIED_NOLOCKS] = 		"request denied, out of system resources",
+	[DLM_WORKING] = 		"async request in progress",
+	[DLM_BLOCKED] = 		"lock request blocked",
+	[DLM_BLOCKED_ORPHAN] = 		"lock request blocked by a orphan lock",
+	[DLM_DENIED_GRACE_PERIOD] = 	"topological change in progress",
+	[DLM_SYSERR] = 			"system error",
+	[DLM_NOSUPPORT] = 		"unsupported",
+	[DLM_CANCELGRANT] = 		"can't cancel convert: already granted",
+	[DLM_IVLOCKID] = 		"bad lockid",
+	[DLM_SYNC] = 			"synchronous request granted",
+	[DLM_BADTYPE] = 		"bad resource type",
+	[DLM_BADRESOURCE] = 		"bad resource handle",
+	[DLM_MAXHANDLES] = 		"no more resource handles",
+	[DLM_NOCLINFO] = 		"can't contact cluster manager",
+	[DLM_NOLOCKMGR] = 		"can't contact lock manager",
+	[DLM_NOPURGED] = 		"can't contact purge daemon",
+	[DLM_BADARGS] = 		"bad api args",
+	[DLM_VOID] = 			"no status",
+	[DLM_NOTQUEUED] = 		"NOQUEUE was specified and request failed",
+	[DLM_IVBUFLEN] = 		"invalid resource name length",
+	[DLM_CVTUNGRANT] = 		"attempted to convert ungranted lock",
+	[DLM_BADPARAM] = 		"invalid lock mode specified",
+	[DLM_VALNOTVALID] = 		"value block has been invalidated",
+	[DLM_REJECTED] = 		"request rejected, unrecognized client",
+	[DLM_ABORT] = 			"blocked lock request cancelled",
+	[DLM_CANCEL] = 			"conversion request cancelled",
+	[DLM_IVRESHANDLE] = 		"invalid resource handle",
+	[DLM_DEADLOCK] = 		"deadlock recovery refused this request",
+	[DLM_DENIED_NOASTS] = 		"failed to allocate AST",
+	[DLM_FORWARD] = 		"request must wait for primary's response",
+	[DLM_TIMEOUT] = 		"timeout value for lock has expired",
+	[DLM_IVGROUPID] = 		"invalid group specification",
+	[DLM_VERS_CONFLICT] = 		"version conflicts prevent request handling",
+	[DLM_BAD_DEVICE_PATH] = 	"Locks device does not exist or path wrong",
+	[DLM_NO_DEVICE_PERMISSION] = 	"Client has insufficient perms for device",
+	[DLM_NO_CONTROL_DEVICE] = 	"Cannot set options on opened device ",
+	[DLM_RECOVERING] = 		"lock resource being recovered",
+	[DLM_MIGRATING] = 		"lock resource being migrated",
+	[DLM_MAXSTATS] = 		"invalid error number",
+};
+
+const char *dlm_errmsg(enum dlm_status err)
+{
+	if (err >= DLM_MAXSTATS || err < 0)
+		return dlm_errmsgs[DLM_MAXSTATS];
+	return dlm_errmsgs[err];
+}
+EXPORT_SYMBOL_GPL(dlm_errmsg);
+
+const char *dlm_errname(enum dlm_status err)
+{
+	if (err >= DLM_MAXSTATS || err < 0)
+		return dlm_errnames[DLM_MAXSTATS];
+	return dlm_errnames[err];
+}
+EXPORT_SYMBOL_GPL(dlm_errname);
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
new file mode 100644
index 0000000..6858510
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -0,0 +1,30 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdebug.h
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef DLMDEBUG_H
+#define DLMDEBUG_H
+
+void dlm_dump_lock_resources(struct dlm_ctxt *dlm);
+
+#endif
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
new file mode 100644
index 0000000..da3c220
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -0,0 +1,1469 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdomain.c
+ *
+ * defines domain join / leave apis
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+
+#include "dlmdebug.h"
+#include "dlmdomain.h"
+
+#include "dlmver.h"
+
+#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
+#include "cluster/masklog.h"
+
+/*
+ *
+ * spinlock lock ordering: if multiple locks are needed, obey this ordering:
+ *    dlm_domain_lock
+ *    struct dlm_ctxt->spinlock
+ *    struct dlm_lock_resource->spinlock
+ *    struct dlm_ctxt->master_lock
+ *    struct dlm_ctxt->ast_lock
+ *    dlm_master_list_entry->spinlock
+ *    dlm_lock->spinlock
+ *
+ */
+
+spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED;
+LIST_HEAD(dlm_domains);
+static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
+
+#define DLM_DOMAIN_BACKOFF_MS 200
+
+static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data);
+static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data);
+static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data);
+static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data);
+
+static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
+
+void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
+{
+	list_del_init(&lockres->list);
+	dlm_lockres_put(lockres);
+}
+
+void __dlm_insert_lockres(struct dlm_ctxt *dlm,
+		       struct dlm_lock_resource *res)
+{
+	struct list_head *bucket;
+	struct qstr *q;
+
+	assert_spin_locked(&dlm->spinlock);
+
+	q = &res->lockname;
+	q->hash = full_name_hash(q->name, q->len);
+	bucket = &(dlm->resources[q->hash & DLM_HASH_MASK]);
+
+	/* get a reference for our hashtable */
+	dlm_lockres_get(res);
+
+	list_add_tail(&res->list, bucket);
+}
+
+struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
+					 const char *name,
+					 unsigned int len)
+{
+	unsigned int hash;
+	struct list_head *iter;
+	struct dlm_lock_resource *tmpres=NULL;
+	struct list_head *bucket;
+
+	mlog_entry("%.*s\n", len, name);
+
+	assert_spin_locked(&dlm->spinlock);
+
+	hash = full_name_hash(name, len);
+
+	bucket = &(dlm->resources[hash & DLM_HASH_MASK]);
+
+	/* check for pre-existing lock */
+	list_for_each(iter, bucket) {
+		tmpres = list_entry(iter, struct dlm_lock_resource, list);
+		if (tmpres->lockname.len == len &&
+		    memcmp(tmpres->lockname.name, name, len) == 0) {
+			dlm_lockres_get(tmpres);
+			break;
+		}
+
+		tmpres = NULL;
+	}
+	return tmpres;
+}
+
+struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
+				    const char *name,
+				    unsigned int len)
+{
+	struct dlm_lock_resource *res;
+
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres(dlm, name, len);
+	spin_unlock(&dlm->spinlock);
+	return res;
+}
+
+static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len)
+{
+	struct list_head *pos;
+	struct dlm_ctxt *ctxt;
+
+	assert_spin_locked(&dlm_domain_lock);
+
+	list_for_each(pos, &dlm_domains) {
+		ctxt = list_entry(pos, struct dlm_ctxt, list);
+		/* ctxt->name is NUL terminated but domain may not be,
+		 * so match on length first and then on the bytes */
+		if (strlen(ctxt->name) != len)
+			continue;
+		if (memcmp(ctxt->name, domain, len) == 0)
+			return ctxt;
+	}
+
+	return NULL;
+}
+
+/* For null terminated domain strings ONLY */
+static struct dlm_ctxt * __dlm_lookup_domain(const char *domain)
+{
+	assert_spin_locked(&dlm_domain_lock);
+
+	return __dlm_lookup_domain_full(domain, strlen(domain));
+}
+
+
+/* returns true on one of two conditions:
+ * 1) the domain does not exist
+ * 2) the domain exists and its state is "joined" */
+static int dlm_wait_on_domain_helper(const char *domain)
+{
+	struct dlm_ctxt *ctxt;
+	int done;
+
+	/* the lookup and the dlm_state read both happen under the lock */
+	spin_lock(&dlm_domain_lock);
+
+	ctxt = __dlm_lookup_domain(domain);
+	/* done when no such domain is listed or it is already joined */
+	done = (ctxt == NULL || ctxt->dlm_state == DLM_CTXT_JOINED) ? 1 : 0;
+
+	spin_unlock(&dlm_domain_lock);
+
+	return done;
+}
+
+static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
+{
+	if (dlm->resources)
+		free_page((unsigned long) dlm->resources);
+
+	/* kfree() ignores NULL, so dlm->name needs no guard */
+	kfree(dlm->name);
+
+	kfree(dlm);
+}
+
+/* A little strange - this function will be called while holding
+ * dlm_domain_lock and is expected to be holding it on the way out. We
+ * will however drop and reacquire it multiple times */
+static void dlm_ctxt_release(struct kref *kref)
+{
+	struct dlm_ctxt *dlm;
+
+	dlm = container_of(kref, struct dlm_ctxt, dlm_refs);
+
+	BUG_ON(dlm->num_joins);
+	BUG_ON(dlm->dlm_state == DLM_CTXT_JOINED);
+
+	/* we may still be in the list if we hit an error during join. */
+	list_del_init(&dlm->list);
+
+	spin_unlock(&dlm_domain_lock);
+
+	mlog(0, "freeing memory from domain %s\n", dlm->name);
+
+	wake_up(&dlm_domain_events);
+
+	dlm_free_ctxt_mem(dlm);
+
+	spin_lock(&dlm_domain_lock);
+}
+
+void dlm_put(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm_domain_lock);
+	kref_put(&dlm->dlm_refs, dlm_ctxt_release);
+	spin_unlock(&dlm_domain_lock);
+}
+
+static void __dlm_get(struct dlm_ctxt *dlm)
+{
+	kref_get(&dlm->dlm_refs);
+}
+
+/* given a questionable reference to a dlm object, gets a reference if
+ * it can find it in the list, otherwise returns NULL in which case
+ * you shouldn't trust your pointer. */
+struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm)
+{
+	struct dlm_ctxt *found = NULL;
+	struct list_head *pos;
+
+	spin_lock(&dlm_domain_lock);
+
+	/* match by pointer identity against every listed context */
+	list_for_each(pos, &dlm_domains) {
+		if (list_entry(pos, struct dlm_ctxt, list) != dlm)
+			continue;
+
+		/* still on dlm_domains: take a ref before unlocking */
+		found = dlm;
+		__dlm_get(found);
+		break;
+	}
+
+	spin_unlock(&dlm_domain_lock);
+
+	return found;
+}
+
+int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
+{
+	int ret;
+
+	spin_lock(&dlm_domain_lock);
+	ret = (dlm->dlm_state == DLM_CTXT_JOINED) ||
+		(dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN);
+	spin_unlock(&dlm_domain_lock);
+
+	return ret;
+}
+
+static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
+{
+	dlm_unregister_domain_handlers(dlm);
+	dlm_complete_thread(dlm);
+	dlm_complete_recovery_thread(dlm);
+
+	/* We've left the domain. Now we can take ourselves out of the
+	 * list and allow the kref stuff to help us free the
+	 * memory. */
+	spin_lock(&dlm_domain_lock);
+	list_del_init(&dlm->list);
+	spin_unlock(&dlm_domain_lock);
+
+	/* Wake up anyone waiting for us to remove this domain */
+	wake_up(&dlm_domain_events);
+}
+
+static void dlm_migrate_all_locks(struct dlm_ctxt *dlm)
+{
+	int i;
+	struct dlm_lock_resource *res;
+
+	mlog(0, "Migrating locks from domain %s\n", dlm->name);
+restart:
+	spin_lock(&dlm->spinlock);
+	for (i=0; i<DLM_HASH_SIZE; i++) {
+		while (!list_empty(&dlm->resources[i])) {
+			res = list_entry(dlm->resources[i].next,
+				     struct dlm_lock_resource, list);
+			/* need reference when manually grabbing lockres */
+			dlm_lockres_get(res);
+			/* this should unhash the lockres
+			 * and exit with dlm->spinlock */
+			mlog(0, "purging res=%p\n", res);
+			if (dlm_lockres_is_dirty(dlm, res)) {
+				/* HACK!  this should absolutely go.
+				 * need to figure out why some empty
+				 * lockreses are still marked dirty */
+				mlog(ML_ERROR, "lockres %.*s dirty!\n",
+				     res->lockname.len, res->lockname.name);
+
+				spin_unlock(&dlm->spinlock);
+				dlm_kick_thread(dlm, res);
+				wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
+				dlm_lockres_put(res);
+				goto restart;
+			}
+			dlm_purge_lockres(dlm, res);
+			dlm_lockres_put(res);
+		}
+	}
+	spin_unlock(&dlm->spinlock);
+
+	mlog(0, "DONE Migrating locks from domain %s\n", dlm->name);
+}
+
+static int dlm_no_joining_node(struct dlm_ctxt *dlm)
+{
+	int ret;
+
+	spin_lock(&dlm->spinlock);
+	ret = dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN;
+	spin_unlock(&dlm->spinlock);
+
+	return ret;
+}
+
+static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
+{
+	/* Yikes, a double spinlock! I need domain_lock for the dlm
+	 * state and the dlm spinlock for join state... Sorry! */
+again:
+	spin_lock(&dlm_domain_lock);
+	spin_lock(&dlm->spinlock);
+
+	if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
+		mlog(0, "Node %d is joining, we wait on it.\n",
+			  dlm->joining_node);
+		spin_unlock(&dlm->spinlock);
+		spin_unlock(&dlm_domain_lock);
+
+		wait_event(dlm->dlm_join_events, dlm_no_joining_node(dlm));
+		goto again;
+	}
+
+	dlm->dlm_state = DLM_CTXT_LEAVING;
+	spin_unlock(&dlm->spinlock);
+	spin_unlock(&dlm_domain_lock);
+}
+
+static void __dlm_print_nodes(struct dlm_ctxt *dlm)
+{
+	int node = -1;
+
+	assert_spin_locked(&dlm->spinlock);
+
+	mlog(ML_NOTICE, "Nodes in my domain (\"%s\"):\n", dlm->name);
+
+	while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
+				     node + 1)) < O2NM_MAX_NODES) {
+		mlog(ML_NOTICE, " node %d\n", node);
+	}
+}
+
+static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	unsigned int node;
+	struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
+
+	mlog_entry("%p %u %p", msg, len, data);
+
+	if (!dlm_grab(dlm))
+		return 0;
+
+	node = exit_msg->node_idx;
+
+	mlog(0, "Node %u leaves domain %s\n", node, dlm->name);
+
+	spin_lock(&dlm->spinlock);
+	clear_bit(node, dlm->domain_map);
+	__dlm_print_nodes(dlm);
+
+	/* notify anything attached to the heartbeat events */
+	dlm_hb_event_notify_attached(dlm, node, 0);
+
+	spin_unlock(&dlm->spinlock);
+
+	dlm_put(dlm);
+
+	return 0;
+}
+
+static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
+				    unsigned int node)
+{
+	int status;
+	struct dlm_exit_domain leave_msg;
+
+	mlog(0, "Asking node %u if we can leave the domain %s me = %u\n",
+		  node, dlm->name, dlm->node_num);
+
+	memset(&leave_msg, 0, sizeof(leave_msg));
+	leave_msg.node_idx = dlm->node_num;
+
+	status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
+				    &leave_msg, sizeof(leave_msg), node,
+				    NULL);
+
+	mlog(0, "status return %d from o2net_send_message\n", status);
+
+	return status;
+}
+
+
+static void dlm_leave_domain(struct dlm_ctxt *dlm)
+{
+	int node, clear_node, status;
+
+	/* At this point we've migrated away all our locks and won't
+	 * accept mastership of new ones. The dlm is responsible for
+	 * almost nothing now. We make sure not to confuse any joining
+	 * nodes and then commence shutdown procedure. */
+
+	spin_lock(&dlm->spinlock);
+	/* Clear ourselves from the domain map */
+	clear_bit(dlm->node_num, dlm->domain_map);
+	while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
+				     0)) < O2NM_MAX_NODES) {
+		/* Drop the dlm spinlock. This is safe wrt the domain_map.
+		 * -nodes cannot be added now as the
+		 *   query_join_handlers knows to respond with OK_NO_MAP
+		 * -we catch the right network errors if a node is
+		 *   removed from the map while we're sending him the
+		 *   exit message. */
+		spin_unlock(&dlm->spinlock);
+
+		clear_node = 1;
+
+		status = dlm_send_one_domain_exit(dlm, node);
+		if (status < 0 &&
+		    status != -ENOPROTOOPT &&
+		    status != -ENOTCONN) {
+			mlog(ML_NOTICE, "Error %d sending domain exit message "
+			     "to node %d\n", status, node);
+
+			/* Not sure what to do here but lets sleep for
+			 * a bit in case this was a transient
+			 * error... */
+			msleep(DLM_DOMAIN_BACKOFF_MS);
+			clear_node = 0;
+		}
+
+		spin_lock(&dlm->spinlock);
+		/* If we're not clearing the node bit then we intend
+		 * to loop back around to try again. */
+		if (clear_node)
+			clear_bit(node, dlm->domain_map);
+	}
+	spin_unlock(&dlm->spinlock);
+}
+
+/* returns 1 if the domain's dlm_state is DLM_CTXT_JOINED,
+ * 0 otherwise.  The state is sampled while holding
+ * dlm_domain_lock. */
+int dlm_joined(struct dlm_ctxt *dlm)
+{
+	int joined;
+
+	spin_lock(&dlm_domain_lock);
+	joined = (dlm->dlm_state == DLM_CTXT_JOINED);
+	spin_unlock(&dlm_domain_lock);
+
+	return joined;
+}
+
+/* returns 1 if the domain's dlm_state is DLM_CTXT_IN_SHUTDOWN,
+ * 0 otherwise.  The state is sampled while holding
+ * dlm_domain_lock. */
+int dlm_shutting_down(struct dlm_ctxt *dlm)
+{
+	int in_shutdown;
+
+	spin_lock(&dlm_domain_lock);
+	in_shutdown = (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN);
+	spin_unlock(&dlm_domain_lock);
+
+	return in_shutdown;
+}
+
+void dlm_unregister_domain(struct dlm_ctxt *dlm)
+{
+	int leave = 0;
+
+	spin_lock(&dlm_domain_lock);
+	BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED);
+	BUG_ON(!dlm->num_joins);
+
+	dlm->num_joins--;
+	if (!dlm->num_joins) {
+		/* We mark it "in shutdown" now so new register
+		 * requests wait until we've completely left the
+		 * domain. Don't use DLM_CTXT_LEAVING yet as we still
+		 * want new domain joins to communicate with us at
+		 * least until we've completed migration of our
+		 * resources. */
+		dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN;
+		leave = 1;
+	}
+	spin_unlock(&dlm_domain_lock);
+
+	if (leave) {
+		mlog(0, "shutting down domain %s\n", dlm->name);
+
+		/* We changed dlm state, notify the thread */
+		dlm_kick_thread(dlm, NULL);
+
+		dlm_migrate_all_locks(dlm);
+		dlm_mark_domain_leaving(dlm);
+		dlm_leave_domain(dlm);
+		dlm_complete_dlm_shutdown(dlm);
+	}
+	dlm_put(dlm);
+}
+EXPORT_SYMBOL_GPL(dlm_unregister_domain);
+
+static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_query_join_request *query;
+	enum dlm_query_join_response response;
+	struct dlm_ctxt *dlm = NULL;
+
+	query = (struct dlm_query_join_request *) msg->buf;
+
+	mlog(0, "node %u wants to join domain %s\n", query->node_idx,
+		  query->domain);
+
+	/*
+	 * If heartbeat doesn't consider the node live, tell it
+	 * to back off and try again.  This gives heartbeat a chance
+	 * to catch up.
+	 */
+	if (!o2hb_check_node_heartbeating(query->node_idx)) {
+		mlog(0, "node %u is not in our live map yet\n",
+		     query->node_idx);
+
+		response = JOIN_DISALLOW;
+		goto respond;
+	}
+
+	response = JOIN_OK_NO_MAP;
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(query->domain, query->name_len);
+	/* Once the dlm ctxt is marked as leaving then we don't want
+	 * to be put in someone's domain map. */
+	if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) {
+		spin_lock(&dlm->spinlock);
+
+		if (dlm->dlm_state == DLM_CTXT_NEW &&
+		    dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN) {
+			/*If this is a brand new context and we
+			 * haven't started our join process yet, then
+			 * the other node won the race. */
+			response = JOIN_OK_NO_MAP;
+		} else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			/* Disallow parallel joins. */
+			response = JOIN_DISALLOW;
+		} else {
+			/* Alright we're fully a part of this domain
+			 * so we keep some state as to who's joining
+			 * and indicate to him that needs to be fixed
+			 * up. */
+			response = JOIN_OK;
+			__dlm_set_joining_node(dlm, query->node_idx);
+		}
+
+		spin_unlock(&dlm->spinlock);
+	}
+	spin_unlock(&dlm_domain_lock);
+
+respond:
+	mlog(0, "We respond with %u\n", response);
+
+	return response;
+}
+
+static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_assert_joined *assert;
+	struct dlm_ctxt *dlm = NULL;
+
+	assert = (struct dlm_assert_joined *) msg->buf;
+
+	mlog(0, "node %u asserts join on domain %s\n", assert->node_idx,
+		  assert->domain);
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(assert->domain, assert->name_len);
+	/* XXX should we consider no dlm ctxt an error? */
+	if (dlm) {
+		spin_lock(&dlm->spinlock);
+
+		/* Alright, this node has officially joined our
+		 * domain. Set him in the map and clean up our
+		 * leftover join state. */
+		BUG_ON(dlm->joining_node != assert->node_idx);
+		set_bit(assert->node_idx, dlm->domain_map);
+		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+
+		__dlm_print_nodes(dlm);
+
+		/* notify anything attached to the heartbeat events */
+		dlm_hb_event_notify_attached(dlm, assert->node_idx, 1);
+
+		spin_unlock(&dlm->spinlock);
+	}
+	spin_unlock(&dlm_domain_lock);
+
+	return 0;
+}
+
+static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_cancel_join *cancel;
+	struct dlm_ctxt *dlm = NULL;
+
+	cancel = (struct dlm_cancel_join *) msg->buf;
+
+	mlog(0, "node %u cancels join on domain %s\n", cancel->node_idx,
+		  cancel->domain);
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(cancel->domain, cancel->name_len);
+
+	if (dlm) {
+		spin_lock(&dlm->spinlock);
+
+		/* Yikes, this guy wants to cancel his join. No
+		 * problem, we simply cleanup our join state. */
+		BUG_ON(dlm->joining_node != cancel->node_idx);
+		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+
+		spin_unlock(&dlm->spinlock);
+	}
+	spin_unlock(&dlm_domain_lock);
+
+	return 0;
+}
+
+/* Send a DLM_CANCEL_JOIN_MSG carrying our node number and domain
+ * name to 'node'.  Returns the o2net_send_message() status, which
+ * is also logged via mlog_errno() when negative. */
+static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
+				    unsigned int node)
+{
+	struct dlm_cancel_join cancel_msg;
+	int ret;
+
+	/* zero first so bytes past the copied name are defined */
+	memset(&cancel_msg, 0, sizeof(cancel_msg));
+	cancel_msg.node_idx = dlm->node_num;
+	cancel_msg.name_len = strlen(dlm->name);
+	memcpy(cancel_msg.domain, dlm->name, cancel_msg.name_len);
+
+	ret = o2net_send_message(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
+				 &cancel_msg, sizeof(cancel_msg), node, NULL);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	return ret;
+}
+
+/* map_size should be in bytes. */
+static int dlm_send_join_cancels(struct dlm_ctxt *dlm,
+				 unsigned long *node_map,
+				 unsigned int map_size)
+{
+	int status, tmpstat;
+	unsigned int node;
+
+	if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) *
+			 sizeof(unsigned long))) {
+		mlog(ML_ERROR,
+		     "map_size %u != BITS_TO_LONGS(O2NM_MAX_NODES) %u\n",
+		     map_size, BITS_TO_LONGS(O2NM_MAX_NODES));
+		return -EINVAL;
+	}
+
+	status = 0;
+	node = -1;
+	while ((node = find_next_bit(node_map, O2NM_MAX_NODES,
+				     node + 1)) < O2NM_MAX_NODES) {
+		if (node == dlm->node_num)
+			continue;
+
+		tmpstat = dlm_send_one_join_cancel(dlm, node);
+		if (tmpstat) {
+			mlog(ML_ERROR, "Error return %d cancelling join on "
+			     "node %d\n", tmpstat, node);
+			if (!status)
+				status = tmpstat;
+		}
+	}
+
+	if (status)
+		mlog_errno(status);
+	return status;
+}
+
+static int dlm_request_join(struct dlm_ctxt *dlm,
+			    int node,
+			    enum dlm_query_join_response *response)
+{
+	int status, retval;
+	struct dlm_query_join_request join_msg;
+
+	mlog(0, "querying node %d\n", node);
+
+	memset(&join_msg, 0, sizeof(join_msg));
+	join_msg.node_idx = dlm->node_num;
+	join_msg.name_len = strlen(dlm->name);
+	memcpy(join_msg.domain, dlm->name, join_msg.name_len);
+
+	status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
+				    sizeof(join_msg), node, &retval);
+	if (status < 0 && status != -ENOPROTOOPT) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* -ENOPROTOOPT from the net code means the other side isn't
+	    listening for our message type -- that's fine, it means
+	    his dlm isn't up, so we can consider him a 'yes' but not
+	    joined into the domain.  */
+	if (status == -ENOPROTOOPT) {
+		status = 0;
+		*response = JOIN_OK_NO_MAP;
+	} else if (retval == JOIN_DISALLOW ||
+		   retval == JOIN_OK ||
+		   retval == JOIN_OK_NO_MAP) {
+		*response = retval;
+	} else {
+		status = -EINVAL;
+		mlog(ML_ERROR, "invalid response %d from node %u\n", retval,
+		     node);
+	}
+
+	mlog(0, "status %d, node %d response is %d\n", status, node,
+		  *response);
+
+bail:
+	return status;
+}
+
+static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
+				    unsigned int node)
+{
+	int status;
+	struct dlm_assert_joined assert_msg;
+
+	mlog(0, "Sending join assert to node %u\n", node);
+
+	memset(&assert_msg, 0, sizeof(assert_msg));
+	assert_msg.node_idx = dlm->node_num;
+	assert_msg.name_len = strlen(dlm->name);
+	memcpy(assert_msg.domain, dlm->name, assert_msg.name_len);
+
+	status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
+				    &assert_msg, sizeof(assert_msg), node,
+				    NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+static void dlm_send_join_asserts(struct dlm_ctxt *dlm,
+				  unsigned long *node_map)
+{
+	int status, node, live;
+
+	status = 0;
+	node = -1;
+	while ((node = find_next_bit(node_map, O2NM_MAX_NODES,
+				     node + 1)) < O2NM_MAX_NODES) {
+		if (node == dlm->node_num)
+			continue;
+
+		do {
+			/* It is very important that this message be
+			 * received so we spin until either the node
+			 * has died or it gets the message. */
+			status = dlm_send_one_join_assert(dlm, node);
+
+			spin_lock(&dlm->spinlock);
+			live = test_bit(node, dlm->live_nodes_map);
+			spin_unlock(&dlm->spinlock);
+
+			if (status) {
+				mlog(ML_ERROR, "Error return %d asserting "
+				     "join on node %d\n", status, node);
+
+				/* give us some time between errors... */
+				if (live)
+					msleep(DLM_DOMAIN_BACKOFF_MS);
+			}
+		} while (status && live);
+	}
+}
+
+struct domain_join_ctxt {
+	unsigned long live_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long yes_resp_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+};
+
+static int dlm_should_restart_join(struct dlm_ctxt *dlm,
+				   struct domain_join_ctxt *ctxt,
+				   enum dlm_query_join_response response)
+{
+	int ret;
+
+	if (response == JOIN_DISALLOW) {
+		mlog(0, "Latest response of disallow -- should restart\n");
+		return 1;
+	}
+
+	spin_lock(&dlm->spinlock);
+	/* For now, we restart the process if the node maps have
+	 * changed at all */
+	ret = memcmp(ctxt->live_map, dlm->live_nodes_map,
+		     sizeof(dlm->live_nodes_map));
+	spin_unlock(&dlm->spinlock);
+
+	if (ret)
+		mlog(0, "Node maps changed -- should restart\n");
+
+	return ret;
+}
+
+static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
+{
+	int status = 0, tmpstat, node;
+	struct domain_join_ctxt *ctxt;
+	enum dlm_query_join_response response;
+
+	mlog_entry("%p", dlm);
+
+	ctxt = kcalloc(1, sizeof(*ctxt), GFP_KERNEL);
+	if (!ctxt) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* group sem locking should work for us here -- we're already
+	 * registered for heartbeat events so filling this should be
+	 * atomic wrt getting those handlers called. */
+	o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map));
+
+	spin_lock(&dlm->spinlock);
+	memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map));
+
+	__dlm_set_joining_node(dlm, dlm->node_num);
+
+	spin_unlock(&dlm->spinlock);
+
+	node = -1;
+	while ((node = find_next_bit(ctxt->live_map, O2NM_MAX_NODES,
+				     node + 1)) < O2NM_MAX_NODES) {
+		if (node == dlm->node_num)
+			continue;
+
+		status = dlm_request_join(dlm, node, &response);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		/* Ok, either we got a response or the node doesn't have a
+		 * dlm up. */
+		if (response == JOIN_OK)
+			set_bit(node, ctxt->yes_resp_map);
+
+		if (dlm_should_restart_join(dlm, ctxt, response)) {
+			status = -EAGAIN;
+			goto bail;
+		}
+	}
+
+	mlog(0, "Yay, done querying nodes!\n");
+
+	/* Yay, everyone agrees we can join the domain. My domain is
+	 * comprised of all nodes who were put in the
+	 * yes_resp_map. Copy that into our domain map and send a join
+	 * assert message to clean up everyone else's state. */
+	spin_lock(&dlm->spinlock);
+	memcpy(dlm->domain_map, ctxt->yes_resp_map,
+	       sizeof(ctxt->yes_resp_map));
+	set_bit(dlm->node_num, dlm->domain_map);
+	spin_unlock(&dlm->spinlock);
+
+	dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
+
+	/* Joined state *must* be set before the joining node
+	 * information, otherwise the query_join handler may read no
+	 * current joiner but a state of NEW and tell joining nodes
+	 * we're not in the domain. */
+	spin_lock(&dlm_domain_lock);
+	dlm->dlm_state = DLM_CTXT_JOINED;
+	dlm->num_joins++;
+	spin_unlock(&dlm_domain_lock);
+
+bail:
+	spin_lock(&dlm->spinlock);
+	__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+	if (!status)
+		__dlm_print_nodes(dlm);
+	spin_unlock(&dlm->spinlock);
+
+	if (ctxt) {
+		/* Do we need to send a cancel message to any nodes? */
+		if (status < 0) {
+			tmpstat = dlm_send_join_cancels(dlm,
+							ctxt->yes_resp_map,
+							sizeof(ctxt->yes_resp_map));
+			if (tmpstat < 0)
+				mlog_errno(tmpstat);
+		}
+		kfree(ctxt);
+	}
+
+	mlog(0, "returning %d\n", status);
+	return status;
+}
+
+static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
+{
+	o2hb_unregister_callback(&dlm->dlm_hb_up);
+	o2hb_unregister_callback(&dlm->dlm_hb_down);
+	o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
+}
+
+/* Register this domain's heartbeat callbacks and o2net message handlers. */
+static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
+{
+	int status;
+
+	mlog(0, "registering handlers.\n");
+
+	/* set up both hb callbacks first: the bail path unregisters both */
+	o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
+			    dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
+	o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
+			    dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
+	status = o2hb_register_callback(&dlm->dlm_hb_down);
+	if (status)
+		goto bail;
+	status = o2hb_register_callback(&dlm->dlm_hb_up);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key,
+					sizeof(struct dlm_master_request),
+					dlm_master_request_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key,
+					sizeof(struct dlm_assert_master),
+					dlm_assert_master_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key,
+					sizeof(struct dlm_create_lock),
+					dlm_create_lock_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key,
+					DLM_CONVERT_LOCK_MAX_LEN,
+					dlm_convert_lock_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key,
+					DLM_UNLOCK_LOCK_MAX_LEN,
+					dlm_unlock_lock_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key,
+					DLM_PROXY_AST_MAX_LEN,
+					dlm_proxy_ast_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key,
+					sizeof(struct dlm_exit_domain),
+					dlm_exit_domain_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key,
+					sizeof(struct dlm_migrate_request),
+					dlm_migrate_request_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key,
+					DLM_MIG_LOCKRES_MAX_LEN,
+					dlm_mig_lockres_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key,
+					sizeof(struct dlm_master_requery),
+					dlm_master_requery_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key,
+					sizeof(struct dlm_lock_request),
+					dlm_request_all_locks_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key,
+					sizeof(struct dlm_reco_data_done),
+					dlm_reco_data_done_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key,
+					sizeof(struct dlm_begin_reco),
+					dlm_begin_reco_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key,
+					sizeof(struct dlm_finalize_reco),
+					dlm_finalize_reco_handler,
+					dlm, &dlm->dlm_domain_handlers);
+	if (status)
+		goto bail;
+
+bail:
+	if (status)
+		dlm_unregister_domain_handlers(dlm);
+	return status;
+}
+
+static int dlm_join_domain(struct dlm_ctxt *dlm)
+{
+	int status;
+
+	BUG_ON(!dlm);
+
+	mlog(0, "Join domain %s\n", dlm->name);
+
+	status = dlm_register_domain_handlers(dlm);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = dlm_launch_thread(dlm);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = dlm_launch_recovery_thread(dlm);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	do {
+		unsigned int backoff;
+		status = dlm_try_to_join_domain(dlm);
+
+		/* If we're racing another node to the join, then we
+		 * need to back off temporarily and let them
+		 * complete. */
+		if (status == -EAGAIN) {
+			if (signal_pending(current)) {
+				status = -ERESTARTSYS;
+				goto bail;
+			}
+
+			/*
+			 * <chip> After you!
+			 * <dale> No, after you!
+			 * <chip> I insist!
+			 * <dale> But you first!
+			 * ...
+			 */
+			backoff = (unsigned int)(jiffies & 0x3);
+			backoff *= DLM_DOMAIN_BACKOFF_MS;
+			mlog(0, "backoff %d\n", backoff);
+			msleep(backoff);
+		}
+	} while (status == -EAGAIN);
+
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	wake_up(&dlm_domain_events);
+
+	if (status) {
+		dlm_unregister_domain_handlers(dlm);
+		dlm_complete_thread(dlm);
+		dlm_complete_recovery_thread(dlm);
+	}
+
+	return status;
+}
+
+static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
+				u32 key)
+{
+	int i;
+	struct dlm_ctxt *dlm = NULL;
+
+	dlm = kcalloc(1, sizeof(*dlm), GFP_KERNEL);
+	if (!dlm) {
+		mlog_errno(-ENOMEM);
+		goto leave;
+	}
+
+	dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL);
+	if (dlm->name == NULL) {
+		mlog_errno(-ENOMEM);
+		kfree(dlm);
+		dlm = NULL;
+		goto leave;
+	}
+
+	dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL);
+	if (!dlm->resources) {
+		mlog_errno(-ENOMEM);
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+		goto leave;
+	}
+	memset(dlm->resources, 0, PAGE_SIZE);
+
+	for (i=0; i<DLM_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&dlm->resources[i]);
+
+	strcpy(dlm->name, domain);
+	dlm->key = key;
+	dlm->node_num = o2nm_this_node();
+
+	spin_lock_init(&dlm->spinlock);
+	spin_lock_init(&dlm->master_lock);
+	spin_lock_init(&dlm->ast_lock);
+	INIT_LIST_HEAD(&dlm->list);
+	INIT_LIST_HEAD(&dlm->dirty_list);
+	INIT_LIST_HEAD(&dlm->reco.resources);
+	INIT_LIST_HEAD(&dlm->reco.received);
+	INIT_LIST_HEAD(&dlm->reco.node_data);
+	INIT_LIST_HEAD(&dlm->purge_list);
+	INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
+	dlm->reco.state = 0;
+
+	INIT_LIST_HEAD(&dlm->pending_asts);
+	INIT_LIST_HEAD(&dlm->pending_basts);
+
+	mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n",
+		  dlm->recovery_map, &(dlm->recovery_map[0]));
+
+	memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map));
+	memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map));
+	memset(dlm->domain_map, 0, sizeof(dlm->domain_map));
+
+	dlm->dlm_thread_task = NULL;
+	dlm->dlm_reco_thread_task = NULL;
+	init_waitqueue_head(&dlm->dlm_thread_wq);
+	init_waitqueue_head(&dlm->dlm_reco_thread_wq);
+	init_waitqueue_head(&dlm->reco.event);
+	init_waitqueue_head(&dlm->ast_wq);
+	init_waitqueue_head(&dlm->migration_wq);
+	INIT_LIST_HEAD(&dlm->master_list);
+	INIT_LIST_HEAD(&dlm->mle_hb_events);
+
+	dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
+	init_waitqueue_head(&dlm->dlm_join_events);
+
+	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
+	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+	atomic_set(&dlm->local_resources, 0);
+	atomic_set(&dlm->remote_resources, 0);
+	atomic_set(&dlm->unknown_resources, 0);
+
+	spin_lock_init(&dlm->work_lock);
+	INIT_LIST_HEAD(&dlm->work_list);
+	INIT_WORK(&dlm->dispatched_work, dlm_dispatch_work, dlm);
+
+	kref_init(&dlm->dlm_refs);
+	dlm->dlm_state = DLM_CTXT_NEW;
+
+	INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks);
+
+	mlog(0, "context init: refcount %u\n",
+		  atomic_read(&dlm->dlm_refs.refcount));
+
+leave:
+	return dlm;
+}
+
+/*
+ * dlm_register_domain: one-time setup per "domain"
+ */
+struct dlm_ctxt * dlm_register_domain(const char *domain,
+			       u32 key)
+{
+	int ret;
+	struct dlm_ctxt *dlm = NULL;
+	struct dlm_ctxt *new_ctxt = NULL;
+
+	if (strlen(domain) > O2NM_MAX_NAME_LEN) {
+		ret = -ENAMETOOLONG;
+		mlog(ML_ERROR, "domain name length too long\n");
+		goto leave;
+	}
+
+	if (!o2hb_check_local_node_heartbeating()) {
+		mlog(ML_ERROR, "the local node has not been configured, or is "
+		     "not heartbeating\n");
+		ret = -EPROTO;
+		goto leave;
+	}
+
+	mlog(0, "register called for domain \"%s\"\n", domain);
+
+retry:
+	dlm = NULL;
+	if (signal_pending(current)) {
+		ret = -ERESTARTSYS;
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	spin_lock(&dlm_domain_lock);
+
+	dlm = __dlm_lookup_domain(domain);
+	if (dlm) {
+		if (dlm->dlm_state != DLM_CTXT_JOINED) {
+			spin_unlock(&dlm_domain_lock);
+
+			mlog(0, "This ctxt is not joined yet!\n");
+			wait_event_interruptible(dlm_domain_events,
+						 dlm_wait_on_domain_helper(
+							 domain));
+			goto retry;
+		}
+
+		__dlm_get(dlm);
+		dlm->num_joins++;
+
+		spin_unlock(&dlm_domain_lock);
+
+		ret = 0;
+		goto leave;
+	}
+
+	/* doesn't exist */
+	if (!new_ctxt) {
+		spin_unlock(&dlm_domain_lock);
+
+		new_ctxt = dlm_alloc_ctxt(domain, key);
+		if (new_ctxt)
+			goto retry;
+
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	/* a little variable switch-a-roo here... */
+	dlm = new_ctxt;
+	new_ctxt = NULL;
+
+	/* add the new domain */
+	list_add_tail(&dlm->list, &dlm_domains);
+	spin_unlock(&dlm_domain_lock);
+
+	ret = dlm_join_domain(dlm);
+	if (ret) {
+		mlog_errno(ret);
+		dlm_put(dlm);
+		goto leave;
+	}
+
+	ret = 0;
+leave:
+	if (new_ctxt)
+		dlm_free_ctxt_mem(new_ctxt);
+
+	if (ret < 0)
+		dlm = ERR_PTR(ret);
+
+	return dlm;
+}
+EXPORT_SYMBOL_GPL(dlm_register_domain);
+
+static LIST_HEAD(dlm_join_handlers);
+
+static void dlm_unregister_net_handlers(void)
+{
+	o2net_unregister_handler_list(&dlm_join_handlers);
+}
+
+static int dlm_register_net_handlers(void)
+{
+	int status = 0;
+
+	status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
+					sizeof(struct dlm_query_join_request),
+					dlm_query_join_handler,
+					NULL, &dlm_join_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
+					sizeof(struct dlm_assert_joined),
+					dlm_assert_joined_handler,
+					NULL, &dlm_join_handlers);
+	if (status)
+		goto bail;
+
+	status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
+					sizeof(struct dlm_cancel_join),
+					dlm_cancel_join_handler,
+					NULL, &dlm_join_handlers);
+
+bail:
+	if (status < 0)
+		dlm_unregister_net_handlers();
+
+	return status;
+}
+
+/* Domain eviction callback handling.
+ *
+ * The file system requires notification of node death *before* the
+ * dlm completes its recovery work, otherwise it may be able to
+ * acquire locks on resources requiring recovery. Since the dlm can
+ * evict a node from its domain *before* heartbeat fires, a similar
+ * mechanism is required. */
+
+/* Eviction is not expected to happen often, so a per-domain lock is
+ * not necessary. Eviction callbacks are allowed to sleep for short
+ * periods of time. */
+static DECLARE_RWSEM(dlm_callback_sem);
+
+void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
+					int node_num)
+{
+	struct list_head *iter;
+	struct dlm_eviction_cb *cb;
+
+	down_read(&dlm_callback_sem);
+	list_for_each(iter, &dlm->dlm_eviction_callbacks) {
+		cb = list_entry(iter, struct dlm_eviction_cb, ec_item);
+
+		cb->ec_func(node_num, cb->ec_data);
+	}
+	up_read(&dlm_callback_sem);
+}
+
+void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb,
+			   dlm_eviction_func *f,
+			   void *data)
+{
+	INIT_LIST_HEAD(&cb->ec_item);
+	cb->ec_func = f;
+	cb->ec_data = data;
+}
+EXPORT_SYMBOL_GPL(dlm_setup_eviction_cb);
+
+void dlm_register_eviction_cb(struct dlm_ctxt *dlm,
+			      struct dlm_eviction_cb *cb)
+{
+	down_write(&dlm_callback_sem);
+	list_add_tail(&cb->ec_item, &dlm->dlm_eviction_callbacks);
+	up_write(&dlm_callback_sem);
+}
+EXPORT_SYMBOL_GPL(dlm_register_eviction_cb);
+
+void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb)
+{
+	down_write(&dlm_callback_sem);
+	list_del_init(&cb->ec_item);
+	up_write(&dlm_callback_sem);
+}
+EXPORT_SYMBOL_GPL(dlm_unregister_eviction_cb);
+
+/* Module init.  Failures propagate the helper's status (assumed -errno). */
+static int __init dlm_init(void)
+{
+	int status;
+
+	dlm_print_version();
+	status = dlm_init_mle_cache();
+	if (status)
+		return status;
+
+	status = dlm_register_net_handlers();
+	if (status) {
+		dlm_destroy_mle_cache();
+		return status;
+	}
+
+	return 0;
+}
+
+static void __exit dlm_exit (void)
+{
+	dlm_unregister_net_handlers();
+	dlm_destroy_mle_cache();
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+
+module_init(dlm_init);
+module_exit(dlm_exit);
diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h
new file mode 100644
index 0000000..2f7f60b
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdomain.h
@@ -0,0 +1,36 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdomain.h
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifndef DLMDOMAIN_H
+#define DLMDOMAIN_H
+
+extern spinlock_t dlm_domain_lock;
+extern struct list_head dlm_domains;
+
+int dlm_joined(struct dlm_ctxt *dlm);
+int dlm_shutting_down(struct dlm_ctxt *dlm);
+void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
+					int node_num);
+
+#endif
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
new file mode 100644
index 0000000..d1a0038
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -0,0 +1,676 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmlock.c
+ *
+ * underlying calls for lock creation
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+
+#include "dlmconvert.h"
+
+#define MLOG_MASK_PREFIX ML_DLM
+#include "cluster/masklog.h"
+
+static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED;
+static u64 dlm_next_cookie = 1;
+
+static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
+					       struct dlm_lock_resource *res,
+					       struct dlm_lock *lock, int flags);
+static void dlm_init_lock(struct dlm_lock *newlock, int type,
+			  u8 node, u64 cookie);
+static void dlm_lock_release(struct kref *kref);
+static void dlm_lock_detach_lockres(struct dlm_lock *lock);
+
+/* Tell us whether we can grant a new lock request.
+ * locking:
+ *   caller needs:  res->spinlock
+ *   taken:         none
+ *   held on exit:  none
+ * returns: 1 if the lock can be granted, 0 otherwise.
+ */
+static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
+				  struct dlm_lock *lock)
+{
+	struct list_head *iter;
+	struct dlm_lock *tmplock;
+
+	list_for_each(iter, &res->granted) {
+		tmplock = list_entry(iter, struct dlm_lock, list);
+
+		if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
+			return 0;
+	}
+
+	list_for_each(iter, &res->converting) {
+		tmplock = list_entry(iter, struct dlm_lock, list);
+
+		if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
+			return 0;
+	}
+
+	return 1;
+}
+
+/* performs lock creation at the lockres master site
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_NOTQUEUED
+ */
+static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      struct dlm_lock *lock, int flags)
+{
+	int call_ast = 0, kick_thread = 0;
+	enum dlm_status status = DLM_NORMAL;
+
+	mlog_entry("type=%d\n", lock->ml.type);
+
+	spin_lock(&res->spinlock);
+	/* if called from dlm_create_lock_handler, need to
+	 * ensure it will not sleep in dlm_wait_on_lockres */
+	status = __dlm_lockres_state_to_status(res);
+	if (status != DLM_NORMAL &&
+	    lock->ml.node != dlm->node_num) {
+		/* erf.  state changed after lock was dropped. */
+		spin_unlock(&res->spinlock);
+		dlm_error(status);
+		return status;
+	}
+	__dlm_wait_on_lockres(res);
+	__dlm_lockres_reserve_ast(res);
+	/* a local caller can arrive with a non-NORMAL state status; clear it
+	 * so a lock queued on ->blocked is not retried and queued twice */
+	status = DLM_NORMAL;
+
+	if (dlm_can_grant_new_lock(res, lock)) {
+		mlog(0, "I can grant this lock right away\n");
+		lock->lksb->status = DLM_NORMAL;
+		dlm_lock_get(lock);
+		list_add_tail(&lock->list, &res->granted);
+
+		/* for the recovery lock, we can't allow the ast
+		 * to be queued since the dlmthread is already
+		 * frozen.  but the recovery lock is always locked
+		 * with LKM_NOQUEUE so we do not need the ast in
+		 * this special case */
+		if (!dlm_is_recovery_lock(res->lockname.name,
+					  res->lockname.len)) {
+			kick_thread = 1;
+			call_ast = 1;
+		}
+	} else {
+		/* NOQUEUE requests are never queued: grant now or fail */
+		if (flags & LKM_NOQUEUE)
+			status = DLM_NOTQUEUED;
+		else {
+			dlm_lock_get(lock);
+			list_add_tail(&lock->list, &res->blocked);
+			kick_thread = 1;
+		}
+	}
+
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+
+	/* either queue the ast or release it */
+	if (call_ast)
+		dlm_queue_ast(dlm, lock);
+	else
+		dlm_lockres_release_ast(dlm, res);
+
+	dlm_lockres_calc_usage(dlm, res);
+	if (kick_thread)
+		dlm_kick_thread(dlm, res);
+
+	return status;
+}
+
+void dlm_revert_pending_lock(struct dlm_lock_resource *res,
+			     struct dlm_lock *lock)
+{
+	/* remove from local queue if it failed */
+	list_del_init(&lock->list);
+	lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
+}
+
+
+/*
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: DLM_DENIED, DLM_RECOVERING, or net status
+ */
+static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      struct dlm_lock *lock, int flags)
+{
+	enum dlm_status status = DLM_DENIED;
+
+	mlog_entry("type=%d\n", lock->ml.type);
+	mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len,
+	     res->lockname.name, flags);
+
+	spin_lock(&res->spinlock);
+
+	/* will exit this call with spinlock held */
+	__dlm_wait_on_lockres(res);
+	res->state |= DLM_LOCK_RES_IN_PROGRESS;
+
+	/* add lock to local (secondary) queue */
+	dlm_lock_get(lock);
+	list_add_tail(&lock->list, &res->blocked);
+	lock->lock_pending = 1;
+	spin_unlock(&res->spinlock);
+
+	/* spec seems to say that you will get DLM_NORMAL when the lock
+	 * has been queued, meaning we need to wait for a reply here. */
+	status = dlm_send_remote_lock_request(dlm, res, lock, flags);
+
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	lock->lock_pending = 0;
+	if (status != DLM_NORMAL) {
+		if (status != DLM_NOTQUEUED)
+			dlm_error(status);
+		dlm_revert_pending_lock(res, lock);
+		dlm_lock_put(lock);
+	}
+	spin_unlock(&res->spinlock);
+
+	dlm_lockres_calc_usage(dlm, res);
+
+	wake_up(&res->wq);
+	return status;
+}
+
+
+/* for remote lock creation.
+ * locking:
+ *   caller needs:  none, but need res->state & DLM_LOCK_RES_IN_PROGRESS
+ *   taken:         none
+ *   held on exit:  none
+ * returns: DLM_NOLOCKMGR, or net status
+ */
+static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
+					       struct dlm_lock_resource *res,
+					       struct dlm_lock *lock, int flags)
+{
+	struct dlm_create_lock create;
+	int tmpret, status = 0;
+	enum dlm_status ret;
+
+	mlog_entry_void();
+
+	memset(&create, 0, sizeof(create));
+	create.node_idx = dlm->node_num;
+	create.requested_type = lock->ml.type;
+	create.cookie = lock->ml.cookie;
+	create.namelen = res->lockname.len;
+	create.flags = cpu_to_be32(flags);
+	memcpy(create.name, res->lockname.name, create.namelen);
+
+	tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
+				    sizeof(create), res->owner, &status);
+	if (tmpret >= 0) {
+		// successfully sent and received
+		ret = status;  // this is already a dlm_status
+	} else {
+		mlog_errno(tmpret);
+		if (dlm_is_host_down(tmpret)) {
+			ret = DLM_RECOVERING;
+			mlog(0, "node %u died so returning DLM_RECOVERING "
+			     "from lock message!\n", res->owner);
+		} else {
+			ret = dlm_err_to_dlm_status(tmpret);
+		}
+	}
+
+	return ret;
+}
+
+void dlm_lock_get(struct dlm_lock *lock)
+{
+	kref_get(&lock->lock_refs);
+}
+
+void dlm_lock_put(struct dlm_lock *lock)
+{
+	kref_put(&lock->lock_refs, dlm_lock_release);
+}
+
+static void dlm_lock_release(struct kref *kref)
+{
+	struct dlm_lock *lock;
+
+	lock = container_of(kref, struct dlm_lock, lock_refs);
+
+	BUG_ON(!list_empty(&lock->list));
+	BUG_ON(!list_empty(&lock->ast_list));
+	BUG_ON(!list_empty(&lock->bast_list));
+	BUG_ON(lock->ast_pending);
+	BUG_ON(lock->bast_pending);
+
+	dlm_lock_detach_lockres(lock);
+
+	if (lock->lksb_kernel_allocated) {
+		mlog(0, "freeing kernel-allocated lksb\n");
+		kfree(lock->lksb);
+	}
+	kfree(lock);
+}
+
+/* associate a lock with its lockres, getting a ref on the lockres */
+void dlm_lock_attach_lockres(struct dlm_lock *lock,
+			     struct dlm_lock_resource *res)
+{
+	dlm_lockres_get(res);
+	lock->lockres = res;
+}
+
+/* drop ref on lockres, if there is still one associated with lock */
+static void dlm_lock_detach_lockres(struct dlm_lock *lock)
+{
+	struct dlm_lock_resource *res;
+
+	res = lock->lockres;
+	if (res) {
+		lock->lockres = NULL;
+		mlog(0, "removing lock's lockres reference\n");
+		dlm_lockres_put(res);
+	}
+}
+
+static void dlm_init_lock(struct dlm_lock *newlock, int type,
+			  u8 node, u64 cookie)
+{
+	INIT_LIST_HEAD(&newlock->list);
+	INIT_LIST_HEAD(&newlock->ast_list);
+	INIT_LIST_HEAD(&newlock->bast_list);
+	spin_lock_init(&newlock->spinlock);
+	newlock->ml.type = type;
+	newlock->ml.convert_type = LKM_IVMODE;
+	newlock->ml.highest_blocked = LKM_IVMODE;
+	newlock->ml.node = node;
+	newlock->ml.pad1 = 0;
+	newlock->ml.list = 0;
+	newlock->ml.flags = 0;
+	newlock->ast = NULL;
+	newlock->bast = NULL;
+	newlock->astdata = NULL;
+	newlock->ml.cookie = cpu_to_be64(cookie);
+	newlock->ast_pending = 0;
+	newlock->bast_pending = 0;
+	newlock->convert_pending = 0;
+	newlock->lock_pending = 0;
+	newlock->unlock_pending = 0;
+	newlock->cancel_pending = 0;
+	newlock->lksb_kernel_allocated = 0;
+
+	kref_init(&newlock->lock_refs);
+}
+
+struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
+			       struct dlm_lockstatus *lksb)
+{
+	struct dlm_lock *lock;
+	int kernel_allocated = 0;
+
+	lock = kcalloc(1, sizeof(*lock), GFP_KERNEL);
+	if (!lock)
+		return NULL;
+
+	if (!lksb) {
+		/* zero memory only if kernel-allocated */
+		lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL);
+		if (!lksb) {
+			kfree(lock);
+			return NULL;
+		}
+		kernel_allocated = 1;
+	}
+
+	dlm_init_lock(lock, type, node, cookie);
+	if (kernel_allocated)
+		lock->lksb_kernel_allocated = 1;
+	lock->lksb = lksb;
+	lksb->lockid = lock;
+	return lock;
+}
+
+/* handler for lock creation net message
+ * locking:
+ *   caller needs:  none
+ *   taken:         takes and drops res->spinlock
+ *   held on exit:  none
+ * returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED
+ */
+int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	struct dlm_lock *newlock = NULL;
+	struct dlm_lockstatus *lksb = NULL;
+	enum dlm_status status = DLM_NORMAL;
+	char *name;
+	unsigned int namelen;
+
+	BUG_ON(!dlm);
+
+	mlog_entry_void();
+
+	if (!dlm_grab(dlm))
+		return DLM_REJECTED;
+
+	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
+			"Domain %s not fully joined!\n", dlm->name);
+
+	name = create->name;
+	namelen = create->namelen;
+
+	status = DLM_IVBUFLEN;
+	if (namelen > DLM_LOCKID_NAME_MAX) {
+		dlm_error(status);
+		goto leave;
+	}
+
+	status = DLM_SYSERR;
+	newlock = dlm_new_lock(create->requested_type,
+			       create->node_idx,
+			       be64_to_cpu(create->cookie), NULL);
+	if (!newlock) {
+		dlm_error(status);
+		goto leave;
+	}
+
+	lksb = newlock->lksb;
+
+	if (be32_to_cpu(create->flags) & LKM_GET_LVB) {
+		lksb->flags |= DLM_LKSB_GET_LVB;
+		mlog(0, "set DLM_LKSB_GET_LVB flag\n");
+	}
+
+	status = DLM_IVLOCKID;
+	res = dlm_lookup_lockres(dlm, name, namelen);
+	if (!res) {
+		dlm_error(status);
+		goto leave;
+	}
+
+	spin_lock(&res->spinlock);
+	status = __dlm_lockres_state_to_status(res);
+	spin_unlock(&res->spinlock);
+
+	if (status != DLM_NORMAL) {
+		mlog(0, "lockres recovering/migrating/in-progress\n");
+		goto leave;
+	}
+
+	dlm_lock_attach_lockres(newlock, res);
+
+	status = dlmlock_master(dlm, res, newlock, be32_to_cpu(create->flags));
+leave:
+	if (status != DLM_NORMAL)
+		if (newlock)
+			dlm_lock_put(newlock);
+
+	if (res)
+		dlm_lockres_put(res);
+
+	dlm_put(dlm);
+
+	return status;
+}
+
+
+/* fetch next node-local (u8 nodenum + u56 cookie) into u64 */
+static inline void dlm_get_next_cookie(u8 node_num, u64 *cookie)
+{
+	u64 tmpnode = node_num;
+
+	/* shift single byte of node num into top 8 bits */
+	tmpnode <<= 56;
+
+	spin_lock(&dlm_cookie_lock);
+	*cookie = (dlm_next_cookie | tmpnode);
+	if (++dlm_next_cookie & 0xff00000000000000ull) {
+		mlog(0, "This node's cookie will now wrap!\n");
+		dlm_next_cookie = 1;
+	}
+	spin_unlock(&dlm_cookie_lock);
+}
+
+/* Create a new lock on resource "name" or, with LKM_CONVERT, convert the
+ * lock in lksb->lockid to "mode".  A status other than DLM_NORMAL is also
+ * stored in lksb->status; a NULL lksb is rejected with DLM_BADARGS. */
+enum dlm_status dlmlock(struct dlm_ctxt *dlm, int mode,
+			struct dlm_lockstatus *lksb, int flags,
+			const char *name, dlm_astlockfunc_t *ast, void *data,
+			dlm_bastlockfunc_t *bast)
+{
+	enum dlm_status status;
+	struct dlm_lock_resource *res = NULL;
+	struct dlm_lock *lock = NULL;
+	int convert = 0, recovery = 0;
+
+	/* yes this function is a mess.  TODO: clean this up.  lots of common
+	 * code in the lock and convert paths, especially in the retry blocks */
+	if (!lksb) {
+		dlm_error(DLM_BADARGS);
+		return DLM_BADARGS;
+	}
+
+	status = DLM_BADPARAM;
+	if (mode != LKM_EXMODE && mode != LKM_PRMODE && mode != LKM_NLMODE) {
+		dlm_error(status);
+		goto error;
+	}
+
+	if (flags & ~LKM_VALID_FLAGS) {
+		dlm_error(status);
+		goto error;
+	}
+
+	convert = (flags & LKM_CONVERT);
+	recovery = (flags & LKM_RECOVERY);
+
+	/* name is only NULL-checked further down, so test it before strlen */
+	if (recovery &&
+	    (!name || !dlm_is_recovery_lock(name, strlen(name)) || convert)) {
+		dlm_error(status);
+		goto error;
+	}
+	if (convert && (flags & LKM_LOCAL)) {
+		mlog(ML_ERROR, "strange LOCAL convert request!\n");
+		goto error;
+	}
+
+	if (convert) {
+		/* CONVERT request */
+
+		/* if converting, must pass in a valid dlm_lock */
+		lock = lksb->lockid;
+		if (!lock) {
+			mlog(ML_ERROR, "NULL lock pointer in convert "
+			     "request\n");
+			goto error;
+		}
+
+		res = lock->lockres;
+		if (!res) {
+			mlog(ML_ERROR, "NULL lockres pointer in convert "
+			     "request\n");
+			goto error;
+		}
+		dlm_lockres_get(res);
+
+		/* XXX: for ocfs2 purposes, the ast/bast/astdata/lksb are
+		 * static after the original lock call.  convert requests
+		 * must pass the same values or get DLM_BADARGS, which means
+		 * that DLM_DENIED_NOASTS will never be returned. */
+		if (lock->lksb != lksb || lock->ast != ast ||
+		    lock->bast != bast || lock->astdata != data) {
+			status = DLM_BADARGS;
+			mlog(ML_ERROR, "new args:  lksb=%p, ast=%p, bast=%p, "
+			     "astdata=%p\n", lksb, ast, bast, data);
+			mlog(ML_ERROR, "orig args: lksb=%p, ast=%p, bast=%p, "
+			     "astdata=%p\n", lock->lksb, lock->ast,
+			     lock->bast, lock->astdata);
+			goto error;
+		}
+retry_convert:
+		dlm_wait_for_recovery(dlm);
+
+		if (res->owner == dlm->node_num)
+			status = dlmconvert_master(dlm, res, lock, flags, mode);
+		else
+			status = dlmconvert_remote(dlm, res, lock, flags, mode);
+		if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
+		    status == DLM_FORWARD) {
+			/* recovery or migration is expected to finish
+			 * quickly, so back off for 100ms and then retry
+			 * the convert */
+			mlog(0, "retrying convert with migration/recovery/"
+			     "in-progress\n");
+			msleep(100);
+			goto retry_convert;
+		}
+	} else {
+		u64 tmpcookie;
+
+		/* LOCK request */
+		status = DLM_BADARGS;
+		if (!name) {
+			dlm_error(status);
+			goto error;
+		}
+
+		status = DLM_IVBUFLEN;
+		if (strlen(name) > DLM_LOCKID_NAME_MAX || strlen(name) < 1) {
+			dlm_error(status);
+			goto error;
+		}
+
+		dlm_get_next_cookie(dlm->node_num, &tmpcookie);
+		lock = dlm_new_lock(mode, dlm->node_num, tmpcookie, lksb);
+		if (!lock) {
+			status = DLM_SYSERR;	/* alloc failed; name was fine */
+			dlm_error(status);
+			goto error;
+		}
+
+		if (!recovery)
+			dlm_wait_for_recovery(dlm);
+
+		/* find or create the lock resource */
+		res = dlm_get_lock_resource(dlm, name, flags);
+		if (!res) {
+			status = DLM_IVLOCKID;
+			dlm_error(status);
+			goto error;
+		}
+
+		mlog(0, "type=%d, flags = 0x%x\n", mode, flags);
+		mlog(0, "creating lock: lock=%p res=%p\n", lock, res);
+
+		dlm_lock_attach_lockres(lock, res);
+		lock->ast = ast;
+		lock->bast = bast;
+		lock->astdata = data;
+
+retry_lock:
+		if (flags & LKM_VALBLK) {
+			mlog(0, "LKM_VALBLK passed by caller\n");
+
+			/* LVB requests for modes below PR are ignored. */
+			if (mode < LKM_PRMODE)
+				flags &= ~LKM_VALBLK;
+			else {
+				flags |= LKM_GET_LVB;
+				lock->lksb->flags |= DLM_LKSB_GET_LVB;
+			}
+		}
+
+		if (res->owner == dlm->node_num)
+			status = dlmlock_master(dlm, res, lock, flags);
+		else
+			status = dlmlock_remote(dlm, res, lock, flags);
+
+		if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
+		    status == DLM_FORWARD) {
+			mlog(0, "retrying lock with migration/"
+			     "recovery/in progress\n");
+			msleep(100);
+			dlm_wait_for_recovery(dlm);
+			goto retry_lock;
+		}
+
+		if (status != DLM_NORMAL) {
+			lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
+			if (status != DLM_NOTQUEUED)
+				dlm_error(status);
+			goto error;
+		}
+	}
+
+error:
+	if (status != DLM_NORMAL) {
+		if (lock && !convert)
+			dlm_lock_put(lock);
+		// this is kind of unnecessary
+		lksb->status = status;
+	}
+
+	/* drop the lockres ref taken on the convert or lock path */
+	if (res)
+		dlm_lockres_put(res);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(dlmlock);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
new file mode 100644
index 0000000..0472795
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -0,0 +1,2666 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmmaster.c
+ *
+ * standalone DLM module
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+#include "dlmdebug.h"
+
+#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
+#include "cluster/masklog.h"
+
+enum dlm_mle_type {
+	DLM_MLE_BLOCK,		/* another node has begun mastering the lock */
+	DLM_MLE_MASTER,		/* this node is trying to master the lock */
+	DLM_MLE_MIGRATION	/* the lock is being migrated */
+};
+
+struct dlm_lock_name
+{
+	u8 len;				/* number of bytes used in name[] */
+	u8 name[DLM_LOCKID_NAME_MAX];	/* lock name; length is kept in len */
+};
+/* tracks one lock name whose master is being determined or changed */
+struct dlm_master_list_entry
+{
+	struct list_head list;		/* link on dlm->master_list */
+	struct list_head hb_events;	/* link on dlm->mle_hb_events */
+	struct dlm_ctxt *dlm;		/* domain this entry belongs to */
+	spinlock_t spinlock;		/* taken around map and master updates */
+	wait_queue_head_t wq;		/* mastery waiters sleep here ... */
+	atomic_t woken;			/* ... until this reads 1 (or timeout) */
+	struct kref mle_refs;		/* released by dlm_mle_release() */
+	unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];	/* nodes that answered MAYBE */
+	unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];	/* nodes to send requests to */
+	unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];	/* nodes that answered */
+	unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];	/* tracks node up/down events */
+	u8 master;			/* O2NM_MAX_NODES until a master is known */
+	u8 new_master;			/* initialised to O2NM_MAX_NODES */
+	enum dlm_mle_type type;
+	struct o2hb_callback_func mle_hb_up;
+	struct o2hb_callback_func mle_hb_down;
+	union {
+		struct dlm_lock_resource *res;	/* used by DLM_MLE_MASTER */
+		struct dlm_lock_name name;	/* used by BLOCK and MIGRATION */
+	} u;
+};
+
+static void dlm_mle_node_down(struct dlm_ctxt *dlm,
+			      struct dlm_master_list_entry *mle,
+			      struct o2nm_node *node,
+			      int idx);
+static void dlm_mle_node_up(struct dlm_ctxt *dlm,
+			    struct dlm_master_list_entry *mle,
+			    struct o2nm_node *node,
+			    int idx);
+
+static void dlm_assert_master_worker(struct dlm_work_item *item, void *data);
+static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
+				unsigned int namelen, void *nodemap,
+				u32 flags);
+
+static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
+				struct dlm_master_list_entry *mle,
+				const char *name,
+				unsigned int namelen)
+{
+	const void *mle_name;
+	unsigned int mle_namelen;
+
+	/* returns 1 if mle belongs to dlm and is for this lock name, else 0 */
+	if (mle->dlm != dlm)
+		return 0;
+
+	/* a MASTER entry borrows the name of its lock resource; BLOCK and
+	 * MIGRATION entries carry their own copy */
+	if (mle->type != DLM_MLE_BLOCK && mle->type != DLM_MLE_MIGRATION) {
+		mle_namelen = mle->u.res->lockname.len;
+		mle_name = mle->u.res->lockname.name;
+	} else {
+		mle_namelen = mle->u.name.len;
+		mle_name = mle->u.name.name;
+	}
+	return namelen == mle_namelen && memcmp(mle_name, name, namelen) == 0;
+}
+
+#if 0
+/* Debug-only helpers that dump master list entries; compiled out. */
+
+void dlm_print_one_mle(struct dlm_master_list_entry *mle)
+{
+	int i = 0, refs;	/* i is never changed, so "#" always prints 0 */
+	char *type;
+	char attached;
+	u8 master;
+	unsigned int namelen;
+	const char *name;
+	struct kref *k;
+	/* gather type tag, refcount, master and name, then log one line */
+	k = &mle->mle_refs;
+	if (mle->type == DLM_MLE_BLOCK)
+		type = "BLK";
+	else if (mle->type == DLM_MLE_MASTER)
+		type = "MAS";
+	else
+		type = "MIG";
+	refs = atomic_read(&k->refcount);
+	master = mle->master;
+	attached = (list_empty(&mle->hb_events) ? 'N' : 'Y');
+
+	if (mle->type != DLM_MLE_MASTER) {
+		namelen = mle->u.name.len;
+		name = mle->u.name.name;
+	} else {
+		namelen = mle->u.res->lockname.len;
+		name = mle->u.res->lockname.name;
+	}
+
+	mlog(ML_NOTICE, "  #%3d: %3s  %3d  %3u   %3u %c    (%d)%.*s\n",
+		  i, type, refs, master, mle->new_master, attached,
+		  namelen, namelen, name);
+}
+
+static void dlm_dump_mles(struct dlm_ctxt *dlm)
+{
+	struct dlm_master_list_entry *mle;
+	struct list_head *iter;
+	/* print every entry on dlm->master_list while holding master_lock */
+	mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
+	mlog(ML_NOTICE, "  ####: type refs owner new events? lockname nodemap votemap respmap maybemap\n");
+	spin_lock(&dlm->master_lock);
+	list_for_each(iter, &dlm->master_list) {
+		mle = list_entry(iter, struct dlm_master_list_entry, list);
+		dlm_print_one_mle(mle);
+	}
+	spin_unlock(&dlm->master_lock);
+}
+
+extern spinlock_t dlm_domain_lock;
+extern struct list_head dlm_domains;
+
+int dlm_dump_all_mles(const char __user *data, unsigned int len)
+{
+	struct list_head *iter;
+	struct dlm_ctxt *dlm;
+	/* data is not read; len is handed back unchanged as the return value */
+	spin_lock(&dlm_domain_lock);
+	list_for_each(iter, &dlm_domains) {
+		dlm = list_entry (iter, struct dlm_ctxt, list);
+		mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
+		dlm_dump_mles(dlm);
+	}
+	spin_unlock(&dlm_domain_lock);
+	return len;
+}
+EXPORT_SYMBOL_GPL(dlm_dump_all_mles);
+
+#endif  /*  0  */
+
+
+static kmem_cache_t *dlm_mle_cache = NULL;
+
+
+static void dlm_mle_release(struct kref *kref);
+static void dlm_init_mle(struct dlm_master_list_entry *mle,
+			enum dlm_mle_type type,
+			struct dlm_ctxt *dlm,
+			struct dlm_lock_resource *res,
+			const char *name,
+			unsigned int namelen);
+static void dlm_put_mle(struct dlm_master_list_entry *mle);
+static void __dlm_put_mle(struct dlm_master_list_entry *mle);
+static int dlm_find_mle(struct dlm_ctxt *dlm,
+			struct dlm_master_list_entry **mle,
+			char *name, unsigned int namelen);
+
+static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to);
+
+
+static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_master_list_entry *mle,
+				     int *blocked);
+static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
+				    struct dlm_lock_resource *res,
+				    struct dlm_master_list_entry *mle,
+				    int blocked);
+static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
+				 struct dlm_lock_resource *res,
+				 struct dlm_master_list_entry *mle,
+				 struct dlm_master_list_entry **oldmle,
+				 const char *name, unsigned int namelen,
+				 u8 new_master, u8 master);
+
+static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
+				    struct dlm_lock_resource *res);
+static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res);
+static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
+				       struct dlm_lock_resource *res,
+				       u8 target);
+
+/* returns 1 if the negative errno err means the peer is unreachable, else 0 */
+int dlm_is_host_down(int err)
+{
+	switch (err) {
+		case -EBADF:
+		case -ECONNREFUSED:
+		case -ENOTCONN:
+		case -ECONNRESET:
+		case -EPIPE:
+		case -EHOSTDOWN:
+		case -EHOSTUNREACH:
+		case -ETIMEDOUT:
+		case -ECONNABORTED:
+		case -ENETDOWN:
+		case -ENETUNREACH:
+		case -ENETRESET:
+		case -ESHUTDOWN:
+		case -ENOPROTOOPT:
+		case -EINVAL:	/* from our tcp code: there is no socket */
+			return 1;
+		default:
+			return 0;
+	}
+}
+
+
+/*
+ * MASTER LIST FUNCTIONS
+ */
+
+
+/*
+ * regarding master list entries and heartbeat callbacks:
+ *
+ * in order to avoid sleeping and allocation that occurs in
+ * heartbeat, master list entries are simply attached to the
+ * dlm's established heartbeat callbacks.  the mle is attached
+ * when it is created, and since the dlm->spinlock is held at
+ * that time, any heartbeat event will be properly discovered
+ * by the mle.  the mle needs to be detached from the
+ * dlm->mle_hb_events list as soon as heartbeat events are no
+ * longer useful to the mle, and before the mle is freed.
+ *
+ * as a general rule, heartbeat events are no longer needed by
+ * the mle once an "answer" regarding the lock master has been
+ * received.
+ */
+static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm,
+					      struct dlm_master_list_entry *mle)
+{
+	assert_spin_locked(&dlm->spinlock);
+	/* from now on dlm_hb_event_notify_attached() will visit this mle */
+	list_add_tail(&mle->hb_events, &dlm->mle_hb_events);
+}
+
+/* unlink mle from the heartbeat event list; a no-op if it is not linked */
+static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
+					      struct dlm_master_list_entry *mle)
+{
+	if (!list_empty(&mle->hb_events))
+		list_del_init(&mle->hb_events);
+}
+
+/* takes dlm->spinlock around __dlm_mle_detach_hb_events() */
+static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
+					    struct dlm_master_list_entry *mle)
+{
+	spin_lock(&dlm->spinlock);
+	__dlm_mle_detach_hb_events(dlm, mle);
+	spin_unlock(&dlm->spinlock);
+}
+
+/* drop one reference on mle; the final put unlinks and frees it through
+ * dlm_mle_release().  caller holds dlm->spinlock and dlm->master_lock. */
+static void __dlm_put_mle(struct dlm_master_list_entry *mle)
+{
+	struct dlm_ctxt *dlm = mle->dlm;
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+	/* a put without a matching reference is a refcounting bug */
+	BUG_ON(atomic_read(&mle->mle_refs.refcount) == 0);
+	kref_put(&mle->mle_refs, dlm_mle_release);
+}
+
+
+/* unlocked variant of __dlm_put_mle(): takes dlm->spinlock and then
+ * dlm->master_lock itself, so the caller must hold neither of them */
+static void dlm_put_mle(struct dlm_master_list_entry *mle)
+{
+	struct dlm_ctxt *dlm = mle->dlm;
+
+	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->master_lock);
+	__dlm_put_mle(mle);
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+}
+
+static inline void dlm_get_mle(struct dlm_master_list_entry *mle)
+{
+	kref_get(&mle->mle_refs);	/* undone by dlm_put_mle()/__dlm_put_mle() */
+}
+
+static void dlm_init_mle(struct dlm_master_list_entry *mle,
+			enum dlm_mle_type type,
+			struct dlm_ctxt *dlm,
+			struct dlm_lock_resource *res,
+			const char *name,
+			unsigned int namelen)
+{
+	/* set up a freshly allocated mle and hook it into the domain's
+	 * heartbeat events.  a MASTER entry points at res; the other types
+	 * keep a private copy of the lock name.  dlm->spinlock is held. */
+	assert_spin_locked(&dlm->spinlock);
+
+	mle->dlm = dlm;
+	mle->type = type;
+	mle->master = O2NM_MAX_NODES;
+	mle->new_master = O2NM_MAX_NODES;
+	kref_init(&mle->mle_refs);
+	atomic_set(&mle->woken, 0);
+	spin_lock_init(&mle->spinlock);
+	init_waitqueue_head(&mle->wq);
+	INIT_LIST_HEAD(&mle->list);
+	INIT_LIST_HEAD(&mle->hb_events);
+
+	if (type == DLM_MLE_MASTER) {
+		BUG_ON(!res);
+		mle->u.res = res;
+	} else {	/* DLM_MLE_BLOCK or DLM_MLE_MIGRATION */
+		BUG_ON(!name);
+		memcpy(mle->u.name.name, name, namelen);
+		mle->u.name.len = namelen;
+	}
+
+	/* nobody has answered yet; vote_map and node_map both start out as
+	 * a copy of the domain map with this node removed */
+	memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
+	memset(mle->response_map, 0, sizeof(mle->response_map));
+	memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
+	memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
+	clear_bit(dlm->node_num, mle->node_map);
+	clear_bit(dlm->node_num, mle->vote_map);
+
+	/* attach the mle to the domain node up/down events */
+	__dlm_mle_attach_hb_events(dlm, mle);
+}
+
+
+/* walk dlm->master_list (master_lock held).  on a match take a reference,
+ * store the entry in *mle and return 1; return 0 when nothing matches */
+static int dlm_find_mle(struct dlm_ctxt *dlm,
+			struct dlm_master_list_entry **mle,
+			char *name, unsigned int namelen)
+{
+	struct dlm_master_list_entry *cur;
+	struct list_head *pos;
+
+	assert_spin_locked(&dlm->master_lock);
+	list_for_each(pos, &dlm->master_list) {
+		cur = list_entry(pos, struct dlm_master_list_entry, list);
+		if (dlm_mle_equal(dlm, cur, name, namelen)) {
+			dlm_get_mle(cur);
+			*mle = cur;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
+{
+	struct list_head *pos;
+
+	/* pass a node up/down event for node idx on to every attached mle */
+	assert_spin_locked(&dlm->spinlock);
+	list_for_each(pos, &dlm->mle_hb_events) {
+		struct dlm_master_list_entry *mle;
+
+		mle = list_entry(pos, struct dlm_master_list_entry, hb_events);
+		if (!node_up)
+			dlm_mle_node_down(dlm, mle, NULL, idx);
+		else
+			dlm_mle_node_up(dlm, mle, NULL, idx);
+	}
+}
+
+static void dlm_mle_node_down(struct dlm_ctxt *dlm,
+			      struct dlm_master_list_entry *mle,
+			      struct o2nm_node *node, int idx)
+{
+	/* node idx went down: drop it from this mle's node_map */
+	spin_lock(&mle->spinlock);
+	if (test_bit(idx, mle->node_map))
+		clear_bit(idx, mle->node_map);
+	else
+		mlog(0, "node %u already removed from nodemap!\n", idx);
+
+	spin_unlock(&mle->spinlock);
+}
+
+static void dlm_mle_node_up(struct dlm_ctxt *dlm,
+			    struct dlm_master_list_entry *mle,
+			    struct o2nm_node *node, int idx)
+{
+	/* node idx came up: add it to this mle's node_map */
+	spin_lock(&mle->spinlock);
+	if (!test_bit(idx, mle->node_map))
+		set_bit(idx, mle->node_map);
+	else
+		mlog(0, "node %u already in node map!\n", idx);
+
+	spin_unlock(&mle->spinlock);
+}
+
+/* create the slab cache that mles come from; returns 0 or -ENOMEM */
+int dlm_init_mle_cache(void)
+{
+	dlm_mle_cache = kmem_cache_create("dlm_mle_cache",
+					  sizeof(struct dlm_master_list_entry),
+					  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!dlm_mle_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void dlm_destroy_mle_cache(void)
+{
+	if (dlm_mle_cache)	/* still NULL if the cache was never created */
+		kmem_cache_destroy(dlm_mle_cache);
+}
+
+static void dlm_mle_release(struct kref *kref)
+{
+	struct dlm_master_list_entry *mle;
+	struct dlm_ctxt *dlm;
+
+	mlog_entry_void();
+
+	/* kref release callback: the last reference on the mle is gone */
+	mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
+	dlm = mle->dlm;
+
+	if (mle->type == DLM_MLE_MASTER) {
+		mlog(0, "calling mle_release for %.*s, type %d\n",
+		     mle->u.res->lockname.len,
+		     mle->u.res->lockname.name, mle->type);
+	} else {
+		mlog(0, "calling mle_release for %.*s, type %d\n",
+		     mle->u.name.len, mle->u.name.name, mle->type);
+	}
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+
+	/* take the mle off the heartbeat event list and, if it is still
+	 * linked, off the master list */
+	__dlm_mle_detach_hb_events(dlm, mle);
+	if (!list_empty(&mle->list))
+		list_del_init(&mle->list);
+
+	/* NOTE: this frees while both spinlocks are held.
+	 * if that turns out to be bad, defer it to a freelist. */
+	kmem_cache_free(dlm_mle_cache, mle);
+}
+
+
+/*
+ * LOCK RESOURCE FUNCTIONS
+ */
+
+static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  u8 owner)
+{
+	atomic_t *counter = &dlm->remote_resources;
+
+	assert_spin_locked(&res->spinlock);
+	mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner);
+
+	/* bump the counter for the new owner's class, then record the owner */
+	if (owner == dlm->node_num)
+		counter = &dlm->local_resources;
+	else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
+		counter = &dlm->unknown_resources;
+	atomic_inc(counter);
+	res->owner = owner;
+}
+
+void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
+			      struct dlm_lock_resource *res, u8 owner)
+{
+	atomic_t *old_counter = &dlm->remote_resources;
+
+	assert_spin_locked(&res->spinlock);
+	if (res->owner == owner)
+		return;
+
+	/* uncount res from its previous owner class before re-counting it */
+	if (res->owner == dlm->node_num)
+		old_counter = &dlm->local_resources;
+	else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
+		old_counter = &dlm->unknown_resources;
+	atomic_dec(old_counter);
+	dlm_set_lockres_owner(dlm, res, owner);
+}
+
+
+static void dlm_lockres_release(struct kref *kref)
+{
+	struct dlm_lock_resource *res =
+		container_of(kref, struct dlm_lock_resource, refs);
+
+	/* kref release callback for a lockres.  every lockres is given a
+	 * name buffer when it is created, so a missing one means the
+	 * structure is corrupt. */
+	BUG_ON(!res->lockname.name);
+
+	mlog(0, "destroying lockres %.*s\n", res->lockname.len,
+	     res->lockname.name);
+
+	/* whoever dropped the last reference must already have taken
+	 * the lockres off every list it can be on. */
+	BUG_ON(!list_empty(&res->list));
+	BUG_ON(!list_empty(&res->granted));
+	BUG_ON(!list_empty(&res->converting));
+	BUG_ON(!list_empty(&res->blocked));
+	BUG_ON(!list_empty(&res->dirty));
+	BUG_ON(!list_empty(&res->recovering));
+	BUG_ON(!list_empty(&res->purge));
+
+	/* the name lives in its own allocation; free it before res */
+	kfree(res->lockname.name);
+	kfree(res);
+}
+
+void dlm_lockres_get(struct dlm_lock_resource *res)
+{
+	kref_get(&res->refs);	/* take a reference; drop with dlm_lockres_put() */
+}
+
+void dlm_lockres_put(struct dlm_lock_resource *res)
+{
+	kref_put(&res->refs, dlm_lockres_release);	/* last put frees res */
+}
+
+static void dlm_init_lockres(struct dlm_ctxt *dlm,
+			     struct dlm_lock_resource *res,
+			     const char *name, unsigned int namelen)
+{
+	/* do not memset res here: that would wipe the kmalloc'd
+	 * res->lockname.name pointer.  every field has to be set
+	 * explicitly instead. */
+
+	/* res->lockname.name must already point at a buffer that can
+	 * hold namelen bytes */
+	memcpy((char *) res->lockname.name, name, namelen);
+	res->lockname.len = namelen;
+	res->lockname.hash = full_name_hash(name, namelen);
+
+	spin_lock_init(&res->spinlock);
+	init_waitqueue_head(&res->wq);
+	kref_init(&res->refs);
+	atomic_set(&res->asts_reserved, 0);
+
+	/* a new lockres is not on any list yet */
+	INIT_LIST_HEAD(&res->list);
+	INIT_LIST_HEAD(&res->granted);
+	INIT_LIST_HEAD(&res->converting);
+	INIT_LIST_HEAD(&res->blocked);
+	INIT_LIST_HEAD(&res->dirty);
+	INIT_LIST_HEAD(&res->recovering);
+	INIT_LIST_HEAD(&res->purge);
+
+	/* the lockres starts out IN_PROGRESS; dlm_get_lock_resource()
+	 * clears the flag before it wakes the waiters */
+	res->state = DLM_LOCK_RES_IN_PROGRESS;
+	res->migration_pending = 0;
+	res->last_used = 0;
+	memset(res->lvb, 0, DLM_LVB_LEN);
+
+	/* the owner starts out unknown.  the lock is taken only because
+	 * dlm_set_lockres_owner() asserts that it is held. */
+	spin_lock(&res->spinlock);
+	dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
+	spin_unlock(&res->spinlock);
+}
+
+struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
+				   const char *name,
+				   unsigned int namelen)
+{
+	struct dlm_lock_resource *res;
+
+	/* the lockres and its name buffer are two separate GFP_KERNEL
+	 * allocations; returns NULL if either of them fails */
+	res = kmalloc(sizeof(*res), GFP_KERNEL);
+	if (res)
+		res->lockname.name = kmalloc(namelen, GFP_KERNEL);
+	if (!res || !res->lockname.name) {
+		kfree(res);
+		return NULL;
+	}
+
+	dlm_init_lockres(dlm, res, name, namelen);
+	return res;
+}
+
+/*
+ * dlm_get_lock_resource - find or create the lock resource named lockid.
+ * lockid is NUL-terminated.  flags: with LKM_LOCAL set, a newly created
+ * lockres is simply mastered on this node.
+ *
+ * if the name is already hashed, that lockres is returned.  otherwise a
+ * lockres and a master list entry are allocated and the master is
+ * resolved: we follow an existing MIGRATION entry to its master, wait on
+ * an existing BLOCK entry (another node is mastering the lock), or add a
+ * MASTER entry and send master requests to the nodes in its vote_map.
+ *
+ * returns the lockres with a reference the caller must drop with
+ * dlm_lockres_put(), or NULL if an allocation failed.  may sleep.
+ *
+ * takes dlm->spinlock, dlm->master_lock and res->spinlock internally.
+ */
+struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
+					  const char *lockid,
+					  int flags)
+{
+	struct dlm_lock_resource *tmpres=NULL, *res=NULL;
+	struct dlm_master_list_entry *mle = NULL;
+	struct dlm_master_list_entry *alloc_mle = NULL;
+	int blocked = 0;
+	int ret, nodenum;
+	struct dlm_node_iter iter;
+	unsigned int namelen;
+	int tries = 0;
+
+	BUG_ON(!lockid);
+
+	namelen = strlen(lockid);
+
+	mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
+
+lookup:
+	spin_lock(&dlm->spinlock);
+	tmpres = __dlm_lookup_lockres(dlm, lockid, namelen);
+	if (tmpres) {
+		spin_unlock(&dlm->spinlock);
+		mlog(0, "found in hash!\n");
+		/* someone else hashed it first: drop our unused copy */
+		if (res)
+			dlm_lockres_put(res);
+		res = tmpres;
+		goto leave;
+	}
+
+	if (!res) {
+		spin_unlock(&dlm->spinlock);
+		mlog(0, "allocating a new resource\n");
+		/* nothing found and we need to allocate one. */
+		alloc_mle = (struct dlm_master_list_entry *)
+			kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
+		if (!alloc_mle)
+			goto leave;
+		res = dlm_new_lockres(dlm, lockid, namelen);
+		if (!res)
+			goto leave;
+		/* the lock was dropped to allocate, so look again */
+		goto lookup;
+	}
+
+	mlog(0, "no lockres found, allocated our own: %p\n", res);
+
+	if (flags & LKM_LOCAL) {
+		/* caller knows it's safe to assume it's not mastered elsewhere
+		 * DONE!  return right away */
+		spin_lock(&res->spinlock);
+		dlm_change_lockres_owner(dlm, res, dlm->node_num);
+		__dlm_insert_lockres(dlm, res);
+		spin_unlock(&res->spinlock);
+		spin_unlock(&dlm->spinlock);
+		/* lockres still marked IN_PROGRESS */
+		goto wake_waiters;
+	}
+
+	/* check master list to see if another node has started mastering it */
+	spin_lock(&dlm->master_lock);
+
+	/* if we found a block, wait for lock to be mastered by another node */
+	blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
+	if (blocked) {
+		if (mle->type == DLM_MLE_MASTER) {
+			mlog(ML_ERROR, "master entry for nonexistent lock!\n");
+			BUG();
+		} else if (mle->type == DLM_MLE_MIGRATION) {
+			/* migration is in progress! */
+			/* the good news is that we now know the
+			 * "current" master (mle->master). */
+
+			spin_unlock(&dlm->master_lock);
+			assert_spin_locked(&dlm->spinlock);
+			/* res is still counted as owner-unknown, so move it
+			 * between the counters, then hash it */
+			spin_lock(&res->spinlock);
+			dlm_change_lockres_owner(dlm, res, mle->master);
+			__dlm_insert_lockres(dlm, res);
+			spin_unlock(&res->spinlock);
+			spin_unlock(&dlm->spinlock);
+
+			/* master is known, detach */
+			dlm_mle_detach_hb_events(dlm, mle);
+			dlm_put_mle(mle);
+			mle = NULL;
+			goto wake_waiters;
+		}
+	} else {
+		/* go ahead and try to master lock on this node */
+		mle = alloc_mle;
+		/* make sure this does not get freed below */
+		alloc_mle = NULL;
+		dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
+		set_bit(dlm->node_num, mle->maybe_map);
+		list_add(&mle->list, &dlm->master_list);
+	}
+
+	/* at this point there is either a DLM_MLE_BLOCK or a
+	 * DLM_MLE_MASTER on the master list, so it's safe to add the
+	 * lockres to the hashtable.  anyone who finds the lock will
+	 * still have to wait on the IN_PROGRESS. */
+
+	/* finally add the lockres to its hash bucket */
+	__dlm_insert_lockres(dlm, res);
+	/* get an extra ref on the mle in case this is a BLOCK
+	 * if so, the creator of the BLOCK may try to put the last
+	 * ref at this time in the assert master handler, so we
+	 * need an extra one to keep from a bad ptr deref. */
+	dlm_get_mle(mle);
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+	/* must wait for lock to be mastered elsewhere */
+	if (blocked)
+		goto wait;
+
+redo_request:
+	ret = -EINVAL;
+	dlm_node_iter_init(mle->vote_map, &iter);
+	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+		ret = dlm_do_master_request(mle, nodenum);
+		if (ret < 0)
+			mlog_errno(ret);
+		if (mle->master != O2NM_MAX_NODES) {
+			/* found a master ! */
+			break;
+		}
+	}
+
+wait:
+	/* keep going until the response map includes all nodes */
+	ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
+	if (ret < 0) {
+		mlog(0, "%s:%.*s: node map changed, redo the "
+		     "master request now, blocked=%d\n",
+		     dlm->name, res->lockname.len,
+		     res->lockname.name, blocked);
+		if (++tries > 20) {
+			mlog(ML_ERROR, "%s:%.*s: spinning on "
+			     "dlm_wait_for_lock_mastery, blocked=%d\n",
+			     dlm->name, res->lockname.len,
+			     res->lockname.name, blocked);
+			dlm_print_one_lock_resource(res);
+			/* dlm_print_one_mle(mle); */
+			tries = 0;
+		}
+		goto redo_request;
+	}
+
+	mlog(0, "lockres mastered by %u\n", res->owner);
+	/* make sure we never continue without this */
+	BUG_ON(res->owner == O2NM_MAX_NODES);
+
+	/* master is known, detach if not already detached */
+	dlm_mle_detach_hb_events(dlm, mle);
+	dlm_put_mle(mle);
+	/* put the extra ref */
+	dlm_put_mle(mle);
+
+wake_waiters:
+	spin_lock(&res->spinlock);
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+
+leave:
+	/* need to free the unused mle */
+	if (alloc_mle)
+		kmem_cache_free(dlm_mle_cache, alloc_mle);
+
+	return res;
+}
+
+
+#define DLM_MASTERY_TIMEOUT_MS   5000
+
+static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_master_list_entry *mle,
+				     int *blocked)
+{
+	u8 m;
+	int ret, bit;
+	int map_changed, voting_done;
+	int assert, sleep;
+	/* returns 0 once res has an owner, <0 if the caller must redo requests */
+recheck:
+	ret = 0;
+	assert = 0;
+
+	/* check if another node has already become the owner */
+	spin_lock(&res->spinlock);
+	if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+		spin_unlock(&res->spinlock);
+		goto leave;
+	}
+	spin_unlock(&res->spinlock);
+	/* look at the mle's master and maps under its own lock */
+	spin_lock(&mle->spinlock);
+	m = mle->master;
+	map_changed = (memcmp(mle->vote_map, mle->node_map,
+			      sizeof(mle->vote_map)) != 0);
+	voting_done = (memcmp(mle->vote_map, mle->response_map,
+			     sizeof(mle->vote_map)) == 0);
+
+	/* vote_map no longer matches node_map: fold the change in first */
+	if (map_changed) {
+		int b;
+		mlog(0, "%s: %.*s: node map changed, restarting\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
+		ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
+		b = (mle->type == DLM_MLE_BLOCK);
+		if ((*blocked && !b) || (!*blocked && b)) {
+			mlog(0, "%s:%.*s: status change: old=%d new=%d\n", 
+			     dlm->name, res->lockname.len, res->lockname.name,
+			     *blocked, b);
+			*blocked = b;
+		}
+		spin_unlock(&mle->spinlock);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto leave;
+		}
+		mlog(0, "%s:%.*s: restart lock mastery succeeded, "
+		     "rechecking now\n", dlm->name, res->lockname.len,
+		     res->lockname.name);
+		goto recheck;
+	}
+
+	if (m != O2NM_MAX_NODES) {
+		/* another node has done an assert!
+		 * all done! */
+		sleep = 0;
+	} else {
+		sleep = 1;
+		/* have all nodes responded? */
+		if (voting_done && !*blocked) {
+			bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+			if (dlm->node_num <= bit) {
+				/* my node number is lowest.
+				 * now tell other nodes that I am
+				 * mastering this. */
+				mle->master = dlm->node_num;
+				assert = 1;
+				sleep = 0;
+			}
+			/* if voting is done, but we have not received
+			 * an assert master yet, we must sleep */
+		}
+	}
+
+	spin_unlock(&mle->spinlock);
+
+	/* sleep if we haven't finished voting yet */
+	if (sleep) {
+		unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
+
+		/*
+		if (atomic_read(&mle->mle_refs.refcount) < 2)
+			mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
+			atomic_read(&mle->mle_refs.refcount),
+			res->lockname.len, res->lockname.name);
+		*/
+		atomic_set(&mle->woken, 0);
+		(void)wait_event_timeout(mle->wq,
+					 (atomic_read(&mle->woken) == 1),
+					 timeo);
+		if (res->owner == O2NM_MAX_NODES) {
+			mlog(0, "waiting again\n");
+			goto recheck;
+		}
+		mlog(0, "done waiting, master is %u\n", res->owner);
+		ret = 0;
+		goto leave;
+	}
+	/* not sleeping: m is the master read from the mle, or this node */
+	ret = 0;   /* done */
+	if (assert) {
+		m = dlm->node_num;
+		mlog(0, "about to master %.*s here, this=%u\n",
+		     res->lockname.len, res->lockname.name, m);
+		ret = dlm_do_assert_master(dlm, res->lockname.name,
+					   res->lockname.len, mle->vote_map, 0);
+		if (ret) {
+			/* This is a failure in the network path,
+			 * not in the response to the assert_master
+			 * (any nonzero response is a BUG on this node).
+			 * Most likely a socket just got disconnected
+			 * due to node death. */
+			mlog_errno(ret);
+		}
+		/* no longer need to restart lock mastery.
+		 * all living nodes have been contacted. */
+		ret = 0;
+	}
+
+	/* set the lockres owner */
+	spin_lock(&res->spinlock);
+	dlm_change_lockres_owner(dlm, res, m);
+	spin_unlock(&res->spinlock);
+
+leave:
+	return ret;
+}
+
+struct dlm_bitmap_diff_iter
+{
+	int curnode;			/* last bit returned; -1 before the first */
+	unsigned long *orig_bm;		/* the original bitmap */
+	unsigned long *cur_bm;		/* the bitmap it is compared against */
+	unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];	/* bits that differ */
+};
+
+enum dlm_node_state_change
+{
+	NODE_DOWN = -1,		/* changed bit was set in the original map */
+	NODE_NO_CHANGE = 0,	/* never set by dlm_bitmap_diff_iter_next() */
+	NODE_UP			/* changed bit was clear in the original map */
+};
+
+static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter,
+				      unsigned long *orig_bm,
+				      unsigned long *cur_bm)
+{
+	int word;
+
+	/* remember both maps; curnode == -1 makes the first call to
+	 * dlm_bitmap_diff_iter_next() start searching at bit 0 */
+	iter->orig_bm = orig_bm;
+	iter->cur_bm = cur_bm;
+	iter->curnode = -1;
+
+	/* an exclusive-or leaves exactly those bits set that differ
+	 * between the two maps */
+	for (word = 0; word < BITS_TO_LONGS(O2NM_MAX_NODES); word++)
+		iter->diff_bm[word] = orig_bm[word] ^ cur_bm[word];
+}
+
+static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter,
+				     enum dlm_node_state_change *state)
+{
+	int node;
+
+	/* returns the next node whose bit differs and reports through *state
+	 * whether it went down or came up; -ENOENT once the diff is exhausted
+	 * (and on every later call) */
+	if (iter->curnode >= O2NM_MAX_NODES)
+		return -ENOENT;
+
+	node = find_next_bit(iter->diff_bm, O2NM_MAX_NODES,
+			     iter->curnode + 1);
+	if (node >= O2NM_MAX_NODES) {
+		iter->curnode = O2NM_MAX_NODES;
+		return -ENOENT;
+	}
+
+	/* a changed bit that was set in the original map is a dead node */
+	*state = test_bit(node, iter->orig_bm) ? NODE_DOWN : NODE_UP;
+
+	iter->curnode = node;
+	return node;
+}
+
+/* called with mle->spinlock held when vote_map and node_map differ: fold
+ * every node up/down event into the mle's maps.  returns -EAGAIN if the
+ * caller has to send master requests again, 0 if it can keep waiting. */
+static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
+				    struct dlm_lock_resource *res,
+				    struct dlm_master_list_entry *mle,
+				    int blocked)
+{
+	struct dlm_bitmap_diff_iter bdi;
+	enum dlm_node_state_change sc;
+	int node, ret = 0;
+
+	mlog(0, "something happened such that the "
+	     "master process may need to be restarted!\n");
+
+	assert_spin_locked(&mle->spinlock);
+
+	dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map);
+	node = dlm_bitmap_diff_iter_next(&bdi, &sc);
+	while (node >= 0) {
+		if (sc == NODE_UP) {
+			/* a node came up.  easy.  might not even need
+			 * to talk to it if its node number is higher
+			 * or if we are already blocked. */
+			mlog(0, "node up! %d\n", node);
+			if (blocked)
+				goto next;
+
+			if (node > dlm->node_num) {
+				mlog(0, "node > this node. skipping.\n");
+				goto next;
+			}
+
+			/* redo the master request, but only for the new node */
+			mlog(0, "sending request to new node\n");
+			clear_bit(node, mle->response_map);
+			set_bit(node, mle->vote_map);
+		} else {
+			mlog(ML_ERROR, "node down! %d\n", node);
+
+			/* if the node wasn't involved in mastery skip it,
+			 * but clear it out from the maps so that it will
+			 * not affect mastery of this lockres */
+			clear_bit(node, mle->response_map);
+			clear_bit(node, mle->vote_map);
+			if (!test_bit(node, mle->maybe_map))
+				goto next;
+
+			/* if we're already blocked on lock mastery, and the
+			 * dead node wasn't the expected master, or there is
+			 * another node in the maybe_map, keep waiting */
+			if (blocked) {
+				int lowest = find_next_bit(mle->maybe_map,
+						       O2NM_MAX_NODES, 0);
+
+				/* act like it was never there */
+				clear_bit(node, mle->maybe_map);
+
+				if (node != lowest)
+					goto next;
+
+				mlog(ML_ERROR, "expected master %u died while "
+				     "this node was blocked waiting on it!\n",
+				     node);
+				lowest = find_next_bit(mle->maybe_map,
+						       O2NM_MAX_NODES,
+						       lowest+1);
+				if (lowest < O2NM_MAX_NODES) {
+					mlog(0, "still blocked. waiting "
+					     "on %u now\n", lowest);
+					goto next;
+				}
+
+				/* mle is an MLE_BLOCK, but there is now
+				 * nothing left to block on.  we need to return
+				 * all the way back out and try again with
+				 * an MLE_MASTER. dlm_do_local_recovery_cleanup
+				 * has already run, so the mle refcount is ok */
+				mlog(0, "no longer blocking. we can "
+				     "try to master this here\n");
+				mle->type = DLM_MLE_MASTER;
+				memset(mle->maybe_map, 0,
+				       sizeof(mle->maybe_map));
+				memset(mle->response_map, 0,
+				       sizeof(mle->response_map));
+				memcpy(mle->vote_map, mle->node_map,
+				       sizeof(mle->node_map));
+				mle->u.res = res;
+				set_bit(dlm->node_num, mle->maybe_map);
+				ret = -EAGAIN;
+				goto next;
+			}
+
+			clear_bit(node, mle->maybe_map);
+			if (node > dlm->node_num)
+				goto next;
+
+			mlog(0, "dead node in map!\n");
+			/* yuck. go back and re-contact all nodes
+			 * in the vote_map, removing this node. */
+			memset(mle->response_map, 0,
+			       sizeof(mle->response_map));
+		}
+		ret = -EAGAIN;
+next:
+		node = dlm_bitmap_diff_iter_next(&bdi, &sc);
+	}
+	return ret;
+}
+
+
+/*
+ * DLM_MASTER_REQUEST_MSG
+ *
+ * asks node "to" whether it masters the lock named by mle and records the
+ * answer in the mle: YES sets mle->master, MAYBE sets the node's bit in
+ * maybe_map; YES, NO and MAYBE all set its bit in response_map.
+ * returns: 0 on success, -errno on a network error, in which case the
+ * caller should assume the target node is "dead".  may sleep (msleep).
+ */
+
+static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to)
+{
+	struct dlm_ctxt *dlm = mle->dlm;
+	struct dlm_master_request request;
+	int ret, response=0, resend;
+
+	memset(&request, 0, sizeof(request));
+	request.node_idx = dlm->node_num;
+
+	BUG_ON(mle->type == DLM_MLE_MIGRATION);
+
+	if (mle->type != DLM_MLE_MASTER) {
+		request.namelen = mle->u.name.len;
+		memcpy(request.name, mle->u.name.name, request.namelen);
+	} else {
+		request.namelen = mle->u.res->lockname.len;
+		memcpy(request.name, mle->u.res->lockname.name,
+			request.namelen);
+	}
+
+again:
+	ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
+				 sizeof(request), to, &response);
+	if (ret < 0)  {
+		if (ret == -ESRCH) {
+			/* should never happen */
+			mlog(ML_ERROR, "TCP stack not ready!\n");
+			BUG();
+		} else if (ret == -EINVAL) {
+			mlog(ML_ERROR, "bad args passed to o2net!\n");
+			BUG();
+		} else if (ret == -ENOMEM) {
+			mlog(ML_ERROR, "out of memory while trying to send "
+			     "network message!  retrying\n");
+			/* this is totally crude */
+			msleep(50);
+			goto again;
+		} else if (!dlm_is_host_down(ret)) {
+			/* not a network error. bad. */
+			mlog_errno(ret);
+			mlog(ML_ERROR, "unhandled error!\n");
+			BUG();
+		}
+		/* all other errors should be network errors,
+		 * and likely indicate node death */
+		mlog(ML_ERROR, "link to %d went down!\n", to);
+		goto out;
+	}
+
+	ret = 0;
+	resend = 0;
+	spin_lock(&mle->spinlock);
+	switch (response) {
+		case DLM_MASTER_RESP_YES:
+			set_bit(to, mle->response_map);
+			mlog(0, "node %u is the master, response=YES\n", to);
+			mle->master = to;
+			break;
+		case DLM_MASTER_RESP_NO:
+			mlog(0, "node %u not master, response=NO\n", to);
+			set_bit(to, mle->response_map);
+			break;
+		case DLM_MASTER_RESP_MAYBE:
+			mlog(0, "node %u not master, response=MAYBE\n", to);
+			set_bit(to, mle->response_map);
+			set_bit(to, mle->maybe_map);
+			break;
+		case DLM_MASTER_RESP_ERROR:
+			mlog(0, "node %u hit an error, resending\n", to);
+			resend = 1;
+			response = 0;
+			break;
+		default:
+			mlog(ML_ERROR, "bad response! %u\n", response);
+			BUG();
+	}
+	spin_unlock(&mle->spinlock);
+	if (resend) {
+		/* this is also totally crude */
+		msleep(50);
+		goto again;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * handler for DLM_MASTER_REQUEST_MSG: tell the requesting node whether
+ * this node masters the named lock.  returns a DLM_MASTER_RESP_* value
+ * (or DLM_IVBUFLEN for an oversized name).
+ * the lockres ref taken by the lookup is dropped on every path except
+ * a successful assert_master dispatch, where the worker drops it.
+ * locks that can be taken here: dlm->spinlock, res->spinlock,
+ * mle->spinlock, dlm->master_lock.  if possible, TRIM THIS DOWN!!!
+ */
+int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	u8 response = DLM_MASTER_RESP_MAYBE;
+	struct dlm_ctxt *dlm = data;
+	struct dlm_lock_resource *res;
+	struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
+	struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
+	char *name;
+	unsigned int namelen;
+	int found, ret;
+	int set_maybe;
+
+	if (!dlm_grab(dlm))
+		return DLM_MASTER_RESP_NO;
+
+	if (!dlm_domain_fully_joined(dlm)) {
+		response = DLM_MASTER_RESP_NO;
+		goto send_response;
+	}
+
+	name = request->name;
+	namelen = request->namelen;
+
+	if (namelen > DLM_LOCKID_NAME_MAX) {
+		response = DLM_IVBUFLEN;
+		goto send_response;
+	}
+
+way_up_top:
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres(dlm, name, namelen);
+	if (res) {
+		spin_unlock(&dlm->spinlock);
+
+		/* take care of the easy cases up front */
+		spin_lock(&res->spinlock);
+		if (res->state & DLM_LOCK_RES_RECOVERING) {
+			spin_unlock(&res->spinlock);
+			mlog(0, "returning DLM_MASTER_RESP_ERROR since res is "
+			     "being recovered\n");
+			response = DLM_MASTER_RESP_ERROR;
+			if (mle)
+				kmem_cache_free(dlm_mle_cache, mle);
+			dlm_lockres_put(res);
+			goto send_response;
+		}
+
+		if (res->owner == dlm->node_num) {
+			u32 flags = DLM_ASSERT_MASTER_MLE_CLEANUP;
+			spin_unlock(&res->spinlock);
+			response = DLM_MASTER_RESP_YES;
+			if (mle)
+				kmem_cache_free(dlm_mle_cache, mle);
+
+			/* this node is the owner.
+			 * there is some extra work that needs to
+			 * happen now.  the requesting node has
+			 * caused all nodes up to this one to
+			 * create mles.  this node now needs to
+			 * go back and clean those up. */
+			mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
+			     dlm->node_num, res->lockname.len, res->lockname.name);
+			ret = dlm_dispatch_assert_master(dlm, res, 1,
+							 request->node_idx,
+							 flags);
+			if (ret < 0) {
+				mlog(ML_ERROR, "failed to dispatch assert "
+				     "master work\n");
+				response = DLM_MASTER_RESP_ERROR;
+				/* no worker will run, so drop the ref here */
+				dlm_lockres_put(res);
+			}
+			goto send_response;
+		} else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			spin_unlock(&res->spinlock);
+			dlm_lockres_put(res);
+			response = DLM_MASTER_RESP_NO;
+			if (mle)
+				kmem_cache_free(dlm_mle_cache, mle);
+			goto send_response;
+		}
+
+		/* ok, there is no owner.  either this node is
+		 * being blocked, or it is actively trying to
+		 * master this lock. */
+		if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
+			mlog(ML_ERROR, "lock with no owner should be "
+			     "in-progress!\n");
+			BUG();
+		}
+
+		spin_lock(&dlm->master_lock);
+		found = dlm_find_mle(dlm, &tmpmle, name, namelen);
+		if (!found) {
+			mlog(ML_ERROR, "no mle found for this lock!\n");
+			BUG();
+		}
+		set_maybe = 1;
+		spin_lock(&tmpmle->spinlock);
+		if (tmpmle->type == DLM_MLE_BLOCK) {
+			/* this node is waiting for lockres to be mastered */
+			response = DLM_MASTER_RESP_NO;
+		} else if (tmpmle->type == DLM_MLE_MIGRATION) {
+			mlog(0, "node %u is master, but trying to migrate to "
+			     "node %u.\n", tmpmle->master, tmpmle->new_master);
+			if (tmpmle->master == dlm->node_num) {
+				response = DLM_MASTER_RESP_YES;
+				mlog(ML_ERROR, "no owner on lockres, but this "
+				     "node is trying to migrate it to %u?!\n",
+				     tmpmle->new_master);
+				BUG();
+			} else {
+				/* the real master can respond on its own */
+				response = DLM_MASTER_RESP_NO;
+			}
+		} else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			set_maybe = 0;
+			if (tmpmle->master == dlm->node_num)
+				response = DLM_MASTER_RESP_YES;
+			else
+				response = DLM_MASTER_RESP_NO;
+		} else {
+			/* this node is attempting to master lockres */
+			response = DLM_MASTER_RESP_MAYBE;
+		}
+		if (set_maybe)
+			set_bit(request->node_idx, tmpmle->maybe_map);
+		spin_unlock(&tmpmle->spinlock);
+
+		spin_unlock(&dlm->master_lock);
+		spin_unlock(&res->spinlock);
+		dlm_lockres_put(res);
+
+		/* keep the mle attached to heartbeat events */
+		dlm_put_mle(tmpmle);
+		if (mle)
+			kmem_cache_free(dlm_mle_cache, mle);
+		goto send_response;
+	}
+
+	/*
+	 * lockres doesn't exist on this node
+	 * if there is an MLE_BLOCK, return NO
+	 * if there is an MLE_MASTER, return MAYBE
+	 * otherwise, add an MLE_BLOCK, return NO
+	 */
+	spin_lock(&dlm->master_lock);
+	found = dlm_find_mle(dlm, &tmpmle, name, namelen);
+	if (!found) {
+		/* this lockid has never been seen on this node yet */
+		// mlog(0, "no mle found\n");
+		if (!mle) {
+			spin_unlock(&dlm->master_lock);
+			spin_unlock(&dlm->spinlock);
+
+			mle = (struct dlm_master_list_entry *)
+				kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
+			if (!mle) {
+				// bad bad bad... this sucks.
+				response = DLM_MASTER_RESP_ERROR;
+				goto send_response;
+			}
+			spin_lock(&dlm->spinlock);
+			dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL,
+					 name, namelen);
+			spin_unlock(&dlm->spinlock);
+			goto way_up_top;
+		}
+
+		// mlog(0, "this is second time thru, already allocated, "
+		// "add the block.\n");
+		set_bit(request->node_idx, mle->maybe_map);
+		list_add(&mle->list, &dlm->master_list);
+		response = DLM_MASTER_RESP_NO;
+	} else {
+		// mlog(0, "mle was found\n");
+		set_maybe = 1;
+		spin_lock(&tmpmle->spinlock);
+		if (tmpmle->type == DLM_MLE_BLOCK)
+			response = DLM_MASTER_RESP_NO;
+		else if (tmpmle->type == DLM_MLE_MIGRATION) {
+			mlog(0, "migration mle was found (%u->%u)\n",
+			     tmpmle->master, tmpmle->new_master);
+			if (tmpmle->master == dlm->node_num) {
+				mlog(ML_ERROR, "no lockres, but migration mle "
+				     "says that this node is master!\n");
+				BUG();
+			}
+			/* real master can respond on its own */
+			response = DLM_MASTER_RESP_NO;
+		} else {
+			if (tmpmle->master == dlm->node_num) {
+				response = DLM_MASTER_RESP_YES;
+				set_maybe = 0;
+			} else
+				response = DLM_MASTER_RESP_MAYBE;
+		}
+		if (set_maybe)
+			set_bit(request->node_idx, tmpmle->maybe_map);
+		spin_unlock(&tmpmle->spinlock);
+	}
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+	if (found) {
+		/* keep the mle attached to heartbeat events */
+		dlm_put_mle(tmpmle);
+	}
+send_response:
+	dlm_put(dlm);
+	return response;
+}
+
+/*
+ * DLM_ASSERT_MASTER_MSG
+ */
+
+/*
+ * send an assert_master for lockname to each node in nodemap.  a dead
+ * node is logged and the rest are still tried; returns 0 or the last
+ * such send error.  NOTE: this can be used for debugging: periodically
+ * run all locks owned by this node and re-assert across the cluster...
+ */
+static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
+				unsigned int namelen, void *nodemap,
+				u32 flags)
+{
+	struct dlm_assert_master assert;
+	int to, tmpret;
+	struct dlm_node_iter iter;
+	int ret = 0;
+
+	BUG_ON(namelen > O2NM_MAX_NAME_LEN);
+
+	/* note that if this nodemap is empty, it returns 0 */
+	dlm_node_iter_init(nodemap, &iter);
+	while ((to = dlm_node_iter_next(&iter)) >= 0) {
+		int r = 0;
+		mlog(0, "sending assert master to %d (%.*s)\n", to,
+		     namelen, lockname);
+		memset(&assert, 0, sizeof(assert));
+		assert.node_idx = dlm->node_num;
+		assert.namelen = namelen;
+		memcpy(assert.name, lockname, namelen);
+		assert.flags = cpu_to_be32(flags);
+
+		tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
+					    &assert, sizeof(assert), to, &r);
+		if (tmpret < 0) {
+			mlog(ML_ERROR, "assert_master returned %d!\n", tmpret);
+			if (!dlm_is_host_down(tmpret)) {
+				mlog(ML_ERROR, "unhandled error!\n");
+				BUG();
+			}
+			/* a node died.  finish out the rest of the nodes. */
+			mlog(ML_ERROR, "link to %d went down!\n", to);
+			/* any nonzero status return will do */
+			ret = tmpret;
+		} else if (r < 0) {
+			/* ok, something horribly messed.  kill thyself. */
+			mlog(ML_ERROR,"during assert master of %.*s to %u, "
+			     "got %d.\n", namelen, lockname, to, r);
+			dlm_dump_lock_resources(dlm);
+			BUG();
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * handler for DLM_ASSERT_MASTER_MSG: record the asserting node as master
+ * in any mle found for the name and wake its waiters.  returns 0, or
+ * -EINVAL when the assert contradicts the state of an existing lockres.
+ * the mle ref taken by dlm_find_mle is dropped on every path, including
+ * the kill path.
+ * locks that can be taken here: dlm->spinlock, res->spinlock,
+ * mle->spinlock, dlm->master_lock.  if possible, TRIM THIS DOWN!!!
+ */
+int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_master_list_entry *mle = NULL;
+	struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	char *name;
+	unsigned int namelen;
+	u32 flags;
+
+	if (!dlm_grab(dlm))
+		return 0;
+
+	name = assert->name;
+	namelen = assert->namelen;
+	flags = be32_to_cpu(assert->flags);
+
+	if (namelen > DLM_LOCKID_NAME_MAX) {
+		mlog(ML_ERROR, "Invalid name length!\n");
+		goto done;
+	}
+
+	spin_lock(&dlm->spinlock);
+
+	if (flags)
+		mlog(0, "assert_master with flags: %u\n", flags);
+
+	/* find the MLE */
+	spin_lock(&dlm->master_lock);
+	if (!dlm_find_mle(dlm, &mle, name, namelen)) {
+		/* not an error, could be master just re-asserting */
+		mlog(0, "just got an assert_master from %u, but no "
+		     "MLE for it! (%.*s)\n", assert->node_idx,
+		     namelen, name);
+	} else {
+		int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0);
+		if (bit >= O2NM_MAX_NODES) {
+			/* not necessarily an error, though less likely.
+			 * could be master just re-asserting. */
+			mlog(ML_ERROR, "no bits set in the maybe_map, but %u "
+			     "is asserting! (%.*s)\n", assert->node_idx,
+			     namelen, name);
+		} else if (bit != assert->node_idx) {
+			if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
+				mlog(0, "master %u was found, %u should "
+				     "back off\n", assert->node_idx, bit);
+			} else {
+				/* with the fix for bug 569, a higher node
+				 * number winning the mastery will respond
+				 * YES to mastery requests, but this node
+				 * had no way of knowing.  let it pass. */
+				mlog(ML_ERROR, "%u is the lowest node, "
+				     "%u is asserting. (%.*s)  %u must "
+				     "have begun after %u won.\n", bit,
+				     assert->node_idx, namelen, name, bit,
+				     assert->node_idx);
+			}
+		}
+	}
+	spin_unlock(&dlm->master_lock);
+
+	/* ok everything checks out with the MLE
+	 * now check to see if there is a lockres */
+	res = __dlm_lookup_lockres(dlm, name, namelen);
+	if (res) {
+		spin_lock(&res->spinlock);
+		if (res->state & DLM_LOCK_RES_RECOVERING)  {
+			mlog(ML_ERROR, "%u asserting but %.*s is "
+			     "RECOVERING!\n", assert->node_idx, namelen, name);
+			goto kill;
+		}
+		if (!mle) {
+			if (res->owner != assert->node_idx) {
+				mlog(ML_ERROR, "assert_master from "
+					  "%u, but current owner is "
+					  "%u! (%.*s)\n",
+				       assert->node_idx, res->owner,
+				       namelen, name);
+				goto kill;
+			}
+		} else if (mle->type != DLM_MLE_MIGRATION) {
+			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+				/* owner is just re-asserting */
+				if (res->owner == assert->node_idx) {
+					mlog(0, "owner %u re-asserting on "
+					     "lock %.*s\n", assert->node_idx,
+					     namelen, name);
+					goto ok;
+				}
+				mlog(ML_ERROR, "got assert_master from "
+				     "node %u, but %u is the owner! "
+				     "(%.*s)\n", assert->node_idx,
+				     res->owner, namelen, name);
+				goto kill;
+			}
+			if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
+				mlog(ML_ERROR, "got assert from %u, but lock "
+				     "with no owner should be "
+				     "in-progress! (%.*s)\n",
+				     assert->node_idx,
+				     namelen, name);
+				goto kill;
+			}
+		} else /* mle->type == DLM_MLE_MIGRATION */ {
+			/* should only be getting an assert from new master */
+			if (assert->node_idx != mle->new_master) {
+				mlog(ML_ERROR, "got assert from %u, but "
+				     "new master is %u, and old master "
+				     "was %u (%.*s)\n",
+				     assert->node_idx, mle->new_master,
+				     mle->master, namelen, name);
+				goto kill;
+			}
+
+		}
+ok:
+		spin_unlock(&res->spinlock);
+	}
+	spin_unlock(&dlm->spinlock);
+
+	if (mle) {
+		int extra_ref;
+
+		spin_lock(&mle->spinlock);
+		extra_ref = !!(mle->type == DLM_MLE_BLOCK
+			       || mle->type == DLM_MLE_MIGRATION);
+		mle->master = assert->node_idx;
+		atomic_set(&mle->woken, 1);
+		wake_up(&mle->wq);
+		spin_unlock(&mle->spinlock);
+
+		if (mle->type == DLM_MLE_MIGRATION && res) {
+			mlog(0, "finishing off migration of lockres %.*s, "
+			     "from %u to %u\n",
+			       res->lockname.len, res->lockname.name,
+			       dlm->node_num, mle->new_master);
+			spin_lock(&res->spinlock);
+			res->state &= ~DLM_LOCK_RES_MIGRATING;
+			dlm_change_lockres_owner(dlm, res, mle->new_master);
+			BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
+			spin_unlock(&res->spinlock);
+		}
+		/* master is known, detach if not already detached */
+		dlm_mle_detach_hb_events(dlm, mle);
+		dlm_put_mle(mle);
+
+		if (extra_ref) {
+			/* the assert master message now balances the extra
+			 * ref given by the master / migration request message.
+			 * if this is the last put, it will be removed
+			 * from the list. */
+			dlm_put_mle(mle);
+		}
+	}
+
+done:
+	if (res)
+		dlm_lockres_put(res);
+	dlm_put(dlm);
+	return 0;
+
+kill:
+	/* kill the caller! */
+	spin_unlock(&res->spinlock);
+	spin_unlock(&dlm->spinlock);
+	if (mle)
+		dlm_put_mle(mle);	/* drop the ref from dlm_find_mle */
+	dlm_lockres_put(res);
+	mlog(ML_ERROR, "Bad message received from another node.  Dumping state "
+	     "and killing the other node now!  This node is OK and can continue.\n");
+	dlm_dump_lock_resources(dlm);
+	dlm_put(dlm);
+	return -EINVAL;
+}
+
+/* queue res for dlm_assert_master_worker.  the work item gets an extra
+ * dlm ref and the caller's lockres ref.  returns 0 or -ENOMEM. */
+int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
+			       struct dlm_lock_resource *res,
+			       int ignore_higher, u8 request_from, u32 flags)
+{
+	struct dlm_work_item *work = kcalloc(1, sizeof(*work), GFP_KERNEL);
+
+	if (!work)
+		return -ENOMEM;
+
+	dlm_grab(dlm);  /* get an extra ref for the work item */
+	dlm_init_work_item(dlm, work, dlm_assert_master_worker, NULL);
+	/* can optionally ignore node numbers higher than this node */
+	work->u.am.ignore_higher = ignore_higher;
+	work->u.am.request_from = request_from;
+	work->u.am.flags = flags;
+	work->u.am.lockres = res; /* already have a ref */
+
+	spin_lock(&dlm->work_lock);
+	list_add_tail(&work->list, &dlm->work_list);
+	spin_unlock(&dlm->work_lock);
+
+	schedule_work(&dlm->dispatched_work);
+	return 0;
+}
+
+static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	int ret = 0;
+	struct dlm_lock_resource *res;
+	unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int ignore_higher;
+	int bit;
+	u8 request_from;
+	u32 flags;
+	/* the dlm taken from item replaces the one passed in data */
+	dlm = item->dlm;
+	res = item->u.am.lockres;
+	ignore_higher = item->u.am.ignore_higher;
+	request_from = item->u.am.request_from;
+	flags = item->u.am.flags;
+	/* start from a snapshot of the current domain map */
+	spin_lock(&dlm->spinlock);
+	memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
+	spin_unlock(&dlm->spinlock);
+
+	clear_bit(dlm->node_num, nodemap);
+	if (ignore_higher) {
+		/* if this is just to clear up mles for nodes below
+		 * this node, do not send the message to the original
+		 * caller or any node number higher than this */
+		clear_bit(request_from, nodemap);
+		bit = dlm->node_num;
+		while (1) {
+			bit = find_next_bit(nodemap, O2NM_MAX_NODES,
+					    bit+1);
+		       	if (bit >= O2NM_MAX_NODES)
+				break;
+			clear_bit(bit, nodemap);
+		}
+	}
+
+	/* this call now finishes out the nodemap
+	 * even if one or more nodes die */
+	mlog(0, "worker about to master %.*s here, this=%u\n",
+		     res->lockname.len, res->lockname.name, dlm->node_num);
+	ret = dlm_do_assert_master(dlm, res->lockname.name,
+				   res->lockname.len,
+				   nodemap, flags);
+	if (ret < 0) {
+		/* no need to restart, we are done */
+		mlog_errno(ret);
+	}
+	/* drop the lockres ref that came with the work item */
+	dlm_lockres_put(res);
+
+	mlog(0, "finished with dlm_assert_master_worker\n");
+}
+
+
+/*
+ * DLM_MIGRATE_LOCKRES
+ * hand mastery of res to target, or to a node picked here when target
+ * is not in the domain.  returns 0 on success (also when res holds no
+ * locks) or -errno; -ENOTEMPTY if this node still has a lock on res. */
+int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			u8 target)
+{
+	struct dlm_master_list_entry *mle = NULL;
+	struct dlm_master_list_entry *oldmle = NULL;
+	struct dlm_migratable_lockres *mres = NULL;
+	int ret = -EINVAL;
+	const char *name;
+	unsigned int namelen;
+	int mle_added = 0;
+	struct list_head *queue, *iter;
+	int i;
+	struct dlm_lock *lock;
+	int empty = 1;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	name = res->lockname.name;
+	namelen = res->lockname.len;
+
+	mlog(0, "migrating %.*s to %u\n", namelen, name, target);
+
+	/*
+	 * ensure this lockres is a proper candidate for migration
+	 */
+	spin_lock(&res->spinlock);
+	if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+		mlog(0, "cannot migrate lockres with unknown owner!\n");
+		spin_unlock(&res->spinlock);
+		goto leave;
+	}
+	if (res->owner != dlm->node_num) {
+		mlog(0, "cannot migrate lockres this node doesn't own!\n");
+		spin_unlock(&res->spinlock);
+		goto leave;
+	}
+	mlog(0, "checking queues...\n");
+	queue = &res->granted;
+	for (i=0; i<3; i++) {
+		list_for_each(iter, queue) {
+			lock = list_entry (iter, struct dlm_lock, list);
+			empty = 0;
+			if (lock->ml.node == dlm->node_num) {
+				mlog(0, "found a lock owned by this node "
+				     "still on the %s queue!  will not "
+				     "migrate this lockres\n",
+				     i==0 ? "granted" :
+				     (i==1 ? "converting" : "blocked"));
+				spin_unlock(&res->spinlock);
+				ret = -ENOTEMPTY;
+				goto leave;
+			}
+		}
+		queue++;
+	}
+	mlog(0, "all locks on this lockres are nonlocal.  continuing\n");
+	spin_unlock(&res->spinlock);
+
+	/* no work to do */
+	if (empty) {
+		mlog(0, "no locks were found on this lockres! done!\n");
+		ret = 0;
+		goto leave;
+	}
+
+	/*
+	 * preallocate up front
+	 * if this fails, abort
+	 */
+
+	ret = -ENOMEM;
+	mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL);
+	if (!mres) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
+								GFP_KERNEL);
+	if (!mle) {
+		mlog_errno(ret);
+		goto leave;
+	}
+	ret = 0;
+
+	/*
+	 * find a node to migrate the lockres to
+	 */
+
+	mlog(0, "picking a migration node\n");
+	spin_lock(&dlm->spinlock);
+	/* pick a new node; range-check target before indexing the map */
+	if (target >= O2NM_MAX_NODES ||
+	    !test_bit(target, dlm->domain_map)) {
+		target = dlm_pick_migration_target(dlm, res);
+	}
+	mlog(0, "node %u chosen for migration\n", target);
+
+	if (target >= O2NM_MAX_NODES ||
+	    !test_bit(target, dlm->domain_map)) {
+		/* target chosen is not alive */
+		ret = -EINVAL;
+	}
+
+	if (ret) {
+		spin_unlock(&dlm->spinlock);
+		goto fail;
+	}
+
+	mlog(0, "continuing with target = %u\n", target);
+
+	/*
+	 * clear any existing master requests and
+	 * add the migration mle to the list
+	 */
+	spin_lock(&dlm->master_lock);
+	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
+				    namelen, target, dlm->node_num);
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+	if (ret == -EEXIST) {
+		mlog(0, "another process is already migrating it\n");
+		goto fail;
+	}
+	mle_added = 1;
+
+	/*
+	 * set the MIGRATING flag and flush asts
+	 * if we fail after this we need to re-dirty the lockres
+	 */
+	if (dlm_mark_lockres_migrating(dlm, res, target) < 0) {
+		mlog(ML_ERROR, "tried to migrate %.*s to %u, but "
+		     "the target went down.\n", res->lockname.len,
+		     res->lockname.name, target);
+		spin_lock(&res->spinlock);
+		res->state &= ~DLM_LOCK_RES_MIGRATING;
+		spin_unlock(&res->spinlock);
+		ret = -EINVAL;
+	}
+
+fail:
+	if (oldmle) {
+		/* master is known, detach if not already detached */
+		dlm_mle_detach_hb_events(dlm, oldmle);
+		dlm_put_mle(oldmle);
+	}
+
+	if (ret < 0) {
+		if (mle_added) {
+			dlm_mle_detach_hb_events(dlm, mle);
+			dlm_put_mle(mle);
+		} else if (mle) {
+			kmem_cache_free(dlm_mle_cache, mle);
+		}
+		goto leave;
+	}
+
+	/*
+	 * at this point, we have a migration target, an mle
+	 * in the master list, and the MIGRATING flag set on
+	 * the lockres
+	 */
+
+
+	/* get an extra reference on the mle.
+	 * otherwise the assert_master from the new
+	 * master will destroy this.
+	 * also, make sure that all callers of dlm_get_mle
+	 * take both dlm->spinlock and dlm->master_lock */
+	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->master_lock);
+	dlm_get_mle(mle);
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+	/* notify new node and send all lock state */
+	/* call send_one_lockres with migration flag.
+	 * this serves as notice to the target node that a
+	 * migration is starting. */
+	ret = dlm_send_one_lockres(dlm, res, mres, target,
+				   DLM_MRES_MIGRATION);
+
+	if (ret < 0) {
+		mlog(0, "migration to node %u failed with %d\n",
+		     target, ret);
+		/* migration failed, detach and clean up mle */
+		dlm_mle_detach_hb_events(dlm, mle);
+		dlm_put_mle(mle);
+		dlm_put_mle(mle);
+		goto leave;
+	}
+
+	/* at this point, the target sends a message to all nodes,
+	 * (using dlm_do_migrate_request).  this node is skipped since
+	 * we had to put an mle in the list to begin the process.  this
+	 * node now waits for target to do an assert master.  this node
+	 * will be the last one notified, ensuring that the migration
+	 * is complete everywhere.  if the target dies while this is
+	 * going on, some nodes could potentially see the target as the
+	 * master, so it is important that my recovery finds the migration
+	 * mle and sets the master to UNKNOWN. */
+
+
+	/* wait for new node to assert master */
+	while (1) {
+		ret = wait_event_interruptible_timeout(mle->wq,
+					(atomic_read(&mle->woken) == 1),
+					msecs_to_jiffies(5000));
+
+		if (ret >= 0) {
+			if (atomic_read(&mle->woken) == 1 ||
+			    res->owner == target)
+				break;
+
+			mlog(0, "timed out during migration\n");
+		}
+		if (ret == -ERESTARTSYS) {
+			/* migration failed, detach and clean up mle */
+			dlm_mle_detach_hb_events(dlm, mle);
+			dlm_put_mle(mle);
+			dlm_put_mle(mle);
+			goto leave;
+		}
+		/* TODO: if node died: stop, clean up, return error */
+	}
+
+	/* all done, set the owner, clear the flag */
+	spin_lock(&res->spinlock);
+	dlm_set_lockres_owner(dlm, res, target);
+	res->state &= ~DLM_LOCK_RES_MIGRATING;
+	dlm_remove_nonlocal_locks(dlm, res);
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+
+	/* master is known, detach if not already detached */
+	dlm_mle_detach_hb_events(dlm, mle);
+	dlm_put_mle(mle);
+	ret = 0;
+
+	dlm_lockres_calc_usage(dlm, res);
+
+leave:
+	/* re-dirty the lockres if we failed */
+	if (ret < 0)
+		dlm_kick_thread(dlm, res);
+
+	/* TODO: cleanup */
+	if (mres)
+		free_page((unsigned long)mres);
+
+	dlm_put(dlm);
+
+	mlog(0, "returning %d\n", ret);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dlm_migrate_lockres);
+
+int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+{
+	int flushed;	/* 1 when no bast is queued or pending on lock */
+	spin_lock(&dlm->ast_lock);
+	spin_lock(&lock->spinlock);
+	flushed = (!lock->bast_pending && list_empty(&lock->bast_list));
+	spin_unlock(&lock->spinlock);
+	spin_unlock(&dlm->ast_lock);
+	return flushed;
+}
+
+static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     u8 mig_target)
+{
+	int proceed;
+	spin_lock(&res->spinlock);
+	proceed = (res->state & DLM_LOCK_RES_MIGRATING) ? 1 : 0;
+	spin_unlock(&res->spinlock);
+
+	/* a target missing from the domain_map also releases the waiter
+	 * from its wait_event, but caller must recheck the domain_map */
+	spin_lock(&dlm->spinlock);
+	if (!test_bit(mig_target, dlm->domain_map))
+		proceed = 1;
+	spin_unlock(&dlm->spinlock);
+	return proceed;
+}
+
+int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+{
+	int dirty;	/* DIRTY bit of res->state, read under res->spinlock */
+	spin_lock(&res->spinlock);
+	dirty = (res->state & DLM_LOCK_RES_DIRTY) ? 1 : 0;
+	spin_unlock(&res->spinlock);
+	return dirty;
+}
+
+/* wait for res to become MIGRATING; returns -EHOSTDOWN if target has left */
+static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
+				       struct dlm_lock_resource *res,
+				       u8 target)
+{
+	int ret = 0;
+
+	mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n",
+	       res->lockname.len, res->lockname.name, dlm->node_num,
+	       target);
+	/* need to set MIGRATING flag on lockres.  this is done by
+	 * ensuring that all asts have been flushed for this lockres. */
+	spin_lock(&res->spinlock);
+	BUG_ON(res->migration_pending);
+	res->migration_pending = 1;
+	/* strategy is to reserve an extra ast then release
+	 * it below, letting the release do all of the work */
+	__dlm_lockres_reserve_ast(res);
+	spin_unlock(&res->spinlock);
+
+	/* now flush all the pending asts.. hang out for a bit */
+	dlm_kick_thread(dlm, res);
+	wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
+	dlm_lockres_release_ast(dlm, res);
+
+	mlog(0, "about to wait on migration_wq, dirty=%s\n",
+	       res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
+	/* if the extra ref we just put was the final one, this
+	 * will pass thru immediately.  otherwise, we need to wait
+	 * for the last ast to finish. */
+again:
+	ret = wait_event_interruptible_timeout(dlm->migration_wq,
+		   dlm_migration_can_proceed(dlm, res, target),
+		   msecs_to_jiffies(1000));
+	if (ret < 0) {
+		mlog(0, "woken again: migrating? %s, dead? %s\n",
+		       res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
+		       test_bit(target, dlm->domain_map) ? "no":"yes");
+	} else {
+		mlog(0, "all is well: migrating? %s, dead? %s\n",
+		       res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
+		       test_bit(target, dlm->domain_map) ? "no":"yes");
+	}
+	if (!dlm_migration_can_proceed(dlm, res, target)) {
+		mlog(0, "trying again...\n");
+		goto again;
+	}
+
+	/* did the target go down or die? */
+	spin_lock(&dlm->spinlock);
+	if (!test_bit(target, dlm->domain_map)) {
+		mlog(ML_ERROR, "aha. migration target %u just went down\n",
+		     target);
+		ret = -EHOSTDOWN;
+	}
+	spin_unlock(&dlm->spinlock);
+
+	/*
+	 * at this point:
+	 *   o the DLM_LOCK_RES_MIGRATING flag is set
+	 *   o there are no pending asts on this lockres
+	 *   o all processes trying to reserve an ast on this
+	 *     lockres must wait for the MIGRATING flag to clear
+	 * NOTE(review): ret may still hold the wait's return value here
+	 */
+	return ret;
+}
+
+/* final migration step on the old master: drop every dlm_lock on the
+ * three lock queues starting at res->granted that belongs to another
+ * node.  caller holds res->spinlock; res is not owned by this node. */
+static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res)
+{
+	struct list_head *pos, *next;
+	struct list_head *queue = &res->granted;
+	struct dlm_lock *lock;
+	int i;
+
+	assert_spin_locked(&res->spinlock);
+
+	BUG_ON(res->owner == dlm->node_num);
+
+	for (i = 0; i < 3; i++, queue++) {
+		list_for_each_safe(pos, next, queue) {
+			lock = list_entry(pos, struct dlm_lock, list);
+			if (lock->ml.node == dlm->node_num)
+				continue;
+
+			mlog(0, "putting lock for node %u\n",
+			     lock->ml.node);
+			/* be extra careful */
+			BUG_ON(!list_empty(&lock->ast_list));
+			BUG_ON(!list_empty(&lock->bast_list));
+			BUG_ON(lock->ast_pending);
+			BUG_ON(lock->bast_pending);
+			list_del_init(&lock->list);
+			dlm_lock_put(lock);
+		}
+	}
+}
+
+/* for now this is not too intelligent.  we will need stats to make this
+ * do the right thing.  this just uses the node of the first nonlocal
+ * lock on one of the queues, else the first other node in domain_map,
+ * else DLM_LOCK_RES_OWNER_UNKNOWN.  caller holds dlm->spinlock. */
+static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
+				    struct dlm_lock_resource *res)
+{
+	int i;
+	struct list_head *queue = &res->granted;
+	struct list_head *iter;
+	struct dlm_lock *lock;
+	int nodenum;
+
+	assert_spin_locked(&dlm->spinlock);
+
+	spin_lock(&res->spinlock);
+	for (i=0; i<3; i++) {
+		list_for_each(iter, queue) {
+			/* up to the caller to make sure this node is alive */
+			lock = list_entry (iter, struct dlm_lock, list);
+			nodenum = lock->ml.node; /* read under res->spinlock */
+			if (nodenum != dlm->node_num) {
+				spin_unlock(&res->spinlock);
+				return nodenum;
+			}
+		}
+		queue++;
+	}
+	spin_unlock(&res->spinlock);
+	mlog(0, "have not found a suitable target yet! checking domain map\n");
+
+	/* ok now we're getting desperate.  pick anyone alive. */
+	nodenum = -1;
+	while (1) {
+		nodenum = find_next_bit(dlm->domain_map,
+					O2NM_MAX_NODES, nodenum+1);
+		mlog(0, "found %d in domain map\n", nodenum);
+		if (nodenum >= O2NM_MAX_NODES)
+			break;
+		if (nodenum != dlm->node_num) {
+			mlog(0, "picking %d\n", nodenum);
+			return nodenum;
+		}
+	}
+
+	mlog(0, "giving up.  no master to migrate to\n");
+	return DLM_LOCK_RES_OWNER_UNKNOWN;
+}
+
+
+
+/* called by the new master once all lockres data has been received.
+ * tells every node from iter except master and new_master. */
+static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *res,
+				  u8 master, u8 new_master,
+				  struct dlm_node_iter *iter)
+{
+	struct dlm_migrate_request migrate;
+	int ret, status = 0;
+	int nodenum;
+
+	memset(&migrate, 0, sizeof(migrate));
+	migrate.namelen = res->lockname.len;
+	memcpy(migrate.name, res->lockname.name, migrate.namelen);
+	migrate.new_master = new_master;
+	migrate.master = master;
+	/* ret ends up holding the outcome for the last node contacted */
+	ret = 0;
+
+	/* send message to all nodes, except the master and myself */
+	while ((nodenum = dlm_node_iter_next(iter)) >= 0) {
+		if (nodenum == master ||
+		    nodenum == new_master)
+			continue;
+
+		ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
+					 &migrate, sizeof(migrate), nodenum,
+					 &status);
+		if (ret < 0)
+			mlog_errno(ret);
+		else if (status < 0) {
+			mlog(0, "migrate request (node %u) returned %d!\n",
+			     nodenum, status);
+			ret = status;
+		}
+	}
+
+	if (ret < 0)
+		mlog_errno(ret);
+
+	mlog(0, "returning ret=%d\n", ret);
+	return ret;
+}
+
+
+/* if there is an existing mle for this lockres, we now know who the master is.
+ * (the one who sent us *this* message) we can clear it up right away.
+ * since the process that put the mle on the list still has a reference to it,
+ * we can unhash it now, set the master and wake the process.  as a result,
+ * we will have no mle in the list to start with.  now we can add an mle for
+ * the migration and this should be the only one found for those scanning the
+ * list.  */
+int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_lock_resource *res = NULL;
+	struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
+	struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
+	const char *name;
+	unsigned int namelen;
+	int ret = 0;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	name = migrate->name;
+	namelen = migrate->namelen;
+
+	/* preallocate.. if this fails, abort */
+	mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
+							 GFP_KERNEL);
+
+	if (!mle) {
+		ret = -ENOMEM;
+		goto leave;
+	}
+
+	/* check for pre-existing lock */
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres(dlm, name, namelen);
+	spin_lock(&dlm->master_lock);
+
+	if (res) {
+		spin_lock(&res->spinlock);
+		if (res->state & DLM_LOCK_RES_RECOVERING) {
+			/* if all is working ok, this can only mean that we got
+			 * a migrate request from a node that we now see as
+			 * dead.  what can we do here?  drop it to the floor? */
+			spin_unlock(&res->spinlock);
+			mlog(ML_ERROR, "Got a migrate request, but the "
+			     "lockres is marked as recovering!\n");
+			kmem_cache_free(dlm_mle_cache, mle);
+			ret = -EINVAL; /* need a better solution */
+			goto unlock;
+		}
+		res->state |= DLM_LOCK_RES_MIGRATING;
+		spin_unlock(&res->spinlock);
+	}
+
+	/* ignore status.  only nonzero status would BUG. */
+	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
+				    name, namelen,
+				    migrate->new_master,
+				    migrate->master);
+
+unlock:
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+	if (oldmle) {
+		/* master is known, detach if not already detached */
+		dlm_mle_detach_hb_events(dlm, oldmle);
+		dlm_put_mle(oldmle);
+	}
+
+	if (res)
+		dlm_lockres_put(res);
+leave:
+	dlm_put(dlm);
+	return ret;
+}
+
+/* must be holding dlm->spinlock and dlm->master_lock
+ * when adding a migration mle, we can clear any other mles
+ * in the master list because we know with certainty that
+ * the master is "master".  so we remove any old mle from
+ * the list after setting its master field, and then add
+ * the new migration mle.  this way we can hold with the rule
+ * of having only one mle for a given lock name at all times.
+ * returns 0 once mle is initialized and linked, or -EEXIST
+ * (mle untouched, still owned by the caller) if this node is
+ * "master" and a migration mle for the name already exists. */
+static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
+				 struct dlm_lock_resource *res,
+				 struct dlm_master_list_entry *mle,
+				 struct dlm_master_list_entry **oldmle,
+				 const char *name, unsigned int namelen,
+				 u8 new_master, u8 master)
+{
+	*oldmle = NULL;
+	mlog_entry_void();
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+
+	/* caller is responsible for any ref taken here on oldmle */
+	if (dlm_find_mle(dlm, oldmle, (char *)name, namelen)) {
+		struct dlm_master_list_entry *tmp = *oldmle;
+		spin_lock(&tmp->spinlock);
+		if (tmp->type == DLM_MLE_MIGRATION) {
+			if (master == dlm->node_num) {
+				/* ah another process raced me to it.  bail
+				 * before a second mle is linked for it */
+				mlog(0, "tried to migrate %.*s, but some "
+				     "process beat me to it\n",
+				     namelen, name);
+				spin_unlock(&tmp->spinlock);
+				return -EEXIST;
+			} else {
+				/* bad.  2 NODES are trying to migrate! */
+				mlog(ML_ERROR, "migration error  mle: "
+				     "master=%u new_master=%u // request: "
+				     "master=%u new_master=%u // "
+				     "lockres=%.*s\n",
+				     tmp->master, tmp->new_master,
+				     master, new_master,
+				     namelen, name);
+				BUG();
+			}
+		} else {
+			/* this is essentially what assert_master does */
+			tmp->master = master;
+			atomic_set(&tmp->woken, 1);
+			wake_up(&tmp->wq);
+			/* remove it from the list so that only one
+			 * mle will be found */
+			list_del_init(&tmp->list);
+		}
+		spin_unlock(&tmp->spinlock);
+	}
+
+	/* now link the migration mle in at the head of the master list */
+	dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
+	mle->new_master = new_master;
+	mle->master = master;
+	/* do this for consistency with other mle types */
+	set_bit(new_master, mle->maybe_map);
+	list_add(&mle->list, &dlm->master_list);
+
+	return 0;
+}
+
+/* drop or wake mles that depended on dead_node; caller holds dlm->spinlock */
+void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct list_head *iter, *iter2;
+	struct dlm_master_list_entry *mle;
+	struct dlm_lock_resource *res;
+
+	mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
+top:
+	assert_spin_locked(&dlm->spinlock);
+
+	/* clean the master list */
+	spin_lock(&dlm->master_lock);
+	list_for_each_safe(iter, iter2, &dlm->master_list) {
+		mle = list_entry(iter, struct dlm_master_list_entry, list);
+
+		BUG_ON(mle->type != DLM_MLE_BLOCK &&
+		       mle->type != DLM_MLE_MASTER &&
+		       mle->type != DLM_MLE_MIGRATION);
+
+		/* MASTER mles are initiated locally.  the waiting
+		 * process will notice the node map change
+		 * shortly.  let that happen as normal. */
+		if (mle->type == DLM_MLE_MASTER)
+			continue;
+
+
+		/* BLOCK mles are initiated by other nodes.  need to clean
+		 * up if the dead node would have been the master, i.e.
+		 * it is the lowest node number set in maybe_map. */
+		if (mle->type == DLM_MLE_BLOCK) {
+			int bit;
+
+			spin_lock(&mle->spinlock);
+			bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+			if (bit != dead_node) {
+				mlog(0, "mle found, but dead node %u would "
+				     "not have been master\n", dead_node);
+				spin_unlock(&mle->spinlock);
+			} else {
+				/* must drop the refcount by one since the
+				 * assert_master will never arrive.  this
+				 * may result in the mle being unlinked and
+				 * freed, but there may still be a process
+				 * waiting in the dlmlock path which is fine. */
+				mlog(ML_ERROR, "node %u was expected master\n",
+				     dead_node);
+				atomic_set(&mle->woken, 1);
+				spin_unlock(&mle->spinlock);
+				wake_up(&mle->wq);
+				/* final put will take care of list removal */
+				__dlm_put_mle(mle);
+			}
+			continue;
+		}
+
+		/* everything else is a MIGRATION mle */
+
+		/* the rule for MIGRATION mles is that the master
+		 * becomes UNKNOWN if *either* the original or
+		 * the new master dies.  all UNKNOWN lockreses
+		 * are sent to whichever node becomes the recovery
+		 * master.  the new master is responsible for
+		 * determining if there is still a master for
+		 * this lockres, or if he needs to take over
+		 * mastery.  either way, this node should expect
+		 * another message to resolve this. */
+		if (mle->master != dead_node &&
+		    mle->new_master != dead_node)
+			continue;
+
+		/* if we have reached this point, this mle needs to
+		 * be removed from the list and freed. */
+
+		/* remove from the list early.  NOTE: unlinking
+		 * list_head while in list_for_each_safe */
+		spin_lock(&mle->spinlock);
+		list_del_init(&mle->list);
+		atomic_set(&mle->woken, 1);
+		spin_unlock(&mle->spinlock);
+		wake_up(&mle->wq);
+
+		mlog(0, "node %u died during migration from "
+		     "%u to %u!\n", dead_node,
+		     mle->master, mle->new_master);
+		/* if there is a lockres associated with this
+		 * mle, find it and set its owner to UNKNOWN */
+		res = __dlm_lookup_lockres(dlm, mle->u.name.name,
+					mle->u.name.len);
+		if (res) {
+			/* unfortunately if we hit this rare case, our
+			 * lock ordering is messed.  we need to drop
+			 * the master lock so that we can take the
+			 * lockres lock, meaning that we will have to
+			 * restart from the head of list. */
+			spin_unlock(&dlm->master_lock);
+
+			/* move lockres onto recovery list */
+			spin_lock(&res->spinlock);
+			dlm_set_lockres_owner(dlm, res,
+				      	DLM_LOCK_RES_OWNER_UNKNOWN);
+			dlm_move_lockres_to_recovery_list(dlm, res);
+			spin_unlock(&res->spinlock);
+			dlm_lockres_put(res);
+
+			/* dump the mle */
+			spin_lock(&dlm->master_lock);
+			__dlm_put_mle(mle);
+			spin_unlock(&dlm->master_lock);
+
+			/* restart: the list may have changed while unlocked */
+			goto top;
+		}
+
+		/* this may be the last reference */
+		__dlm_put_mle(mle);
+	}
+	spin_unlock(&dlm->master_lock);
+}
+
+
+/* complete a migration that makes this node the owner of res: notify the
+ * other nodes, assert mastery (failures there are only logged), then
+ * set the owner locally.  only a failed migrate request returns < 0. */
+int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			 u8 old_master)
+{
+	struct dlm_node_iter nodes;
+	int err;
+
+	spin_lock(&dlm->spinlock);
+	dlm_node_iter_init(dlm->domain_map, &nodes);
+	clear_bit(dlm->node_num, nodes.node_map);
+	clear_bit(old_master, nodes.node_map);
+	spin_unlock(&dlm->spinlock);
+
+	mlog(0, "now time to do a migrate request to other nodes\n");
+	err = dlm_do_migrate_request(dlm, res, old_master,
+				     dlm->node_num, &nodes);
+	if (err < 0) {
+		mlog_errno(err);
+		return err;
+	}
+
+	mlog(0, "doing assert master of %.*s to all except the original node\n",
+	     res->lockname.len, res->lockname.name);
+	/* this call now finishes out the nodemap
+	 * even if one or more nodes die */
+	err = dlm_do_assert_master(dlm, res->lockname.name,
+				   res->lockname.len, nodes.node_map,
+				   DLM_ASSERT_MASTER_FINISH_MIGRATION);
+	/* no longer need to retry.  all living nodes contacted. */
+	if (err < 0)
+		mlog_errno(err);
+
+	memset(nodes.node_map, 0, sizeof(nodes.node_map));
+	set_bit(old_master, nodes.node_map);
+	mlog(0, "doing assert master of %.*s back to %u\n",
+	     res->lockname.len, res->lockname.name, old_master);
+	err = dlm_do_assert_master(dlm, res->lockname.name,
+				   res->lockname.len, nodes.node_map,
+				   DLM_ASSERT_MASTER_FINISH_MIGRATION);
+	if (err < 0) {
+		mlog(0, "assert master to original master failed "
+		     "with %d.\n", err);
+		/* the only nonzero status here would be because of
+		 * a dead original node.  we're done. */
+		err = 0;
+	}
+
+	/* all done, set the owner, clear the flag */
+	spin_lock(&res->spinlock);
+	dlm_set_lockres_owner(dlm, res, dlm->node_num);
+	res->state &= ~DLM_LOCK_RES_MIGRATING;
+	spin_unlock(&res->spinlock);
+	/* re-dirty it on the new master */
+	dlm_kick_thread(dlm, res);
+	wake_up(&res->wq);
+	return err;
+}
+
+/*
+ * LOCKRES AST REFCOUNT
+ * this is integral to migration
+ */
+
+/* take a reservation for an ast that will be called later.
+ * only to be used after waiting on the lockres with
+ * dlm_wait_on_lockres, with res->spinlock still held since that
+ * call.  a lockres that is already MIGRATING must never get here:
+ * it is dumped for debugging and the kernel BUGs. */
+void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res)
+{
+	assert_spin_locked(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_MIGRATING) {
+		__dlm_print_one_lock_resource(res);
+		BUG();
+	}
+	atomic_inc(&res->asts_reserved);
+}
+
+/*
+ * give back a reserved ast, whether it was delivered or went unused.
+ *
+ * when the last reservation goes away and a migration is pending on
+ * this lockres, the MIGRATING flag is set under the same spinlock
+ * hold, so migration starts with no asts in flight, and waiters on
+ * res->wq and dlm->migration_wq are woken.  grants or basts still
+ * implied by the queues are left for the new master to shuffle.
+ */
+void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
+			     struct dlm_lock_resource *res)
+{
+	int start_migration;
+
+	/* res->spinlock is only taken when the count drops to zero */
+	if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock))
+		return;
+
+	start_migration = !!res->migration_pending;
+	if (start_migration) {
+		BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
+		res->migration_pending = 0;
+		res->state |= DLM_LOCK_RES_MIGRATING;
+	}
+	spin_unlock(&res->spinlock);
+
+	if (start_migration) {
+		wake_up(&res->wq);
+		wake_up(&dlm->migration_wq);
+	}
+}
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
new file mode 100644
index 0000000..0c8eb10
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -0,0 +1,2132 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmrecovery.c
+ *
+ * recovery stuff
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/timer.h>
+#include <linux/kthread.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+#include "dlmdomain.h"
+
+#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY)
+#include "cluster/masklog.h"
+
+static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
+
+static int dlm_recovery_thread(void *data);
+void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
+int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
+static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
+static int dlm_do_recovery(struct dlm_ctxt *dlm);
+
+static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
+static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node);
+static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
+static int dlm_request_all_locks(struct dlm_ctxt *dlm,
+				 u8 request_from, u8 dead_node);
+static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
+
+static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res);
+static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
+					const char *lockname, int namelen,
+					int total_locks, u64 cookie,
+					u8 flags, u8 master);
+static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
+				    struct dlm_migratable_lockres *mres,
+				    u8 send_to,
+				    struct dlm_lock_resource *res,
+				    int total_locks);
+static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      u8 *real_master);
+static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_migratable_lockres *mres);
+static int dlm_do_master_requery(struct dlm_ctxt *dlm,
+				 struct dlm_lock_resource *res,
+				 u8 nodenum, u8 *real_master);
+static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm);
+static int dlm_send_all_done_msg(struct dlm_ctxt *dlm,
+				 u8 dead_node, u8 send_to);
+static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node);
+static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
+					struct list_head *list, u8 dead_node);
+static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
+					      u8 dead_node, u8 new_master);
+static void dlm_reco_ast(void *astdata);
+static void dlm_reco_bast(void *astdata, int blocked_type);
+static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st);
+static void dlm_request_all_locks_worker(struct dlm_work_item *item,
+					 void *data);
+static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
+
+static u64 dlm_get_next_mig_cookie(void);
+
+static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED; /* for reco.node_data */
+static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED; /* for dlm_mig_cookie */
+static u64 dlm_mig_cookie = 1;	/* next cookie to hand out; 0 is never used */
+
+/* hand out the current migration cookie and advance the counter,
+ * wrapping from ~0ULL back to 1 so that 0 is never produced */
+static u64 dlm_get_next_mig_cookie(void)
+{
+	u64 cookie;
+
+	spin_lock(&dlm_mig_cookie_lock);
+	cookie = dlm_mig_cookie;
+	dlm_mig_cookie = (cookie == ~0ULL) ? 1 : cookie + 1;
+	spin_unlock(&dlm_mig_cookie_lock);
+	return cookie;
+}
+
+static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm->spinlock);
+	clear_bit(dlm->reco.dead_node, dlm->recovery_map); /* recovered */
+	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;	/* no target... */
+	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;	/* ...no reco master */
+	spin_unlock(&dlm->spinlock);
+}
+
+/* Worker function used during recovery: drains dlm->work_list and runs
+ * each queued item's function.  every item carries a dlm reference,
+ * which is dropped here together with the item itself. */
+void dlm_dispatch_work(void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_work_item *item;
+	LIST_HEAD(pending);
+
+	/* steal the whole queue so the lock is not held while working */
+	spin_lock(&dlm->work_lock);
+	list_splice_init(&dlm->work_list, &pending);
+	spin_unlock(&dlm->work_lock);
+
+	while (!list_empty(&pending)) {
+		item = list_entry(pending.next, struct dlm_work_item, list);
+		list_del_init(&item->list);
+
+		/* the item's dlm ref keeps the domain from going
+		 * away under us.  just double-check. */
+		BUG_ON(item->dlm != dlm);
+
+		/* the work function may sleep and
+		 * may call network stuff */
+		item->func(item, item->data);
+
+		dlm_put(dlm);
+		kfree(item);
+	}
+}
+
+/*
+ * RECOVERY THREAD
+ */
+
+static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm)
+{
+	/* wake the recovery thread.  it can be in one of three places:
+	 * 1) sleeping with no recovery happening
+	 * 2) sleeping with recovery mastered elsewhere
+	 * 3) recovery mastered here, waiting on reco data
+	 * NOTE(review): the sleeps visible here on this waitqueue use
+	 * kthread_should_stop() as their wake condition -- confirm */
+	wake_up(&dlm->dlm_reco_thread_wq);
+}
+
+/* Launch the recovery thread; 0 on success, -EINVAL if it can't start */
+int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
+{
+	struct task_struct *task;
+
+	mlog(0, "starting dlm recovery thread...\n");
+	task = kthread_run(dlm_recovery_thread, dlm, "dlm_reco_thread");
+	if (IS_ERR(task)) {
+		mlog_errno(PTR_ERR(task));
+		dlm->dlm_reco_thread_task = NULL;
+		return -EINVAL;
+	}
+	dlm->dlm_reco_thread_task = task;
+	return 0;
+}
+
+void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
+{
+	if (!dlm->dlm_reco_thread_task)
+		return;	/* never launched, or already stopped */
+	mlog(0, "waiting for dlm recovery thread to exit\n");
+	kthread_stop(dlm->dlm_reco_thread_task);
+	dlm->dlm_reco_thread_task = NULL;
+}
+
+
+
+/*
+ * this is lame, but here's how recovery works...
+ * 1) all recovery threads cluster wide will work on recovering
+ *    ONE node at a time
+ * 2) negotiate who will take over all the locks for the dead node.
+ *    that's right... ALL the locks.
+ * 3) once a new master is chosen, everyone scans all locks
+ *    and moves aside those mastered by the dead guy
+ * 4) each of these locks should be locked until recovery is done
+ * 5) the new master collects up all of secondary lock queue info
+ *    one lock at a time, forcing each node to communicate back
+ *    before continuing
+ * 6) each secondary lock queue responds with the full known lock info
+ * 7) once the new master has run all its locks, it sends an ALLDONE!
+ *    message to everyone
+ * 8) upon receiving this message, the secondary queue node unlocks
+ *    and responds to the ALLDONE
+ * 9) once the new master gets responses from everyone, he unlocks
+ *    everything and recovery for this dead node is done
+ *10) go back to 2) while there are still dead nodes
+ *
+ */
+
+
+/* sleep interval used by the recovery thread's timed waits */
+#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)
+
+/* kthread body: while the domain is joined, run dlm_do_recovery() over
+ * and over, napping on dlm_reco_thread_wq between rounds */
+static int dlm_recovery_thread(void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	unsigned long nap = msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS);
+	int rc;
+
+	mlog(0, "dlm thread running for %s...\n", dlm->name);
+
+	while (!kthread_should_stop()) {
+		rc = dlm_joined(dlm) ? dlm_do_recovery(dlm) : 0;
+
+		/* -EAGAIN: do not sleep, recheck immediately. */
+		if (rc == -EAGAIN)
+			continue;
+		if (rc < 0)
+			mlog_errno(rc);
+
+		wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
+						 kthread_should_stop(), nap);
+	}
+
+	mlog(0, "quitting DLM recovery thread\n");
+	return 0;
+}
+
+/* returns 1 while DLM_RECO_STATE_ACTIVE is set, 0 otherwise.  the
+ * recovery thread sets the flag when it starts on a dead node and
+ * clears it, waking dlm->reco.event, once the affected lockreses are
+ * marked; dlmlock/dlmunlock callers should block on that event. */
+static int dlm_in_recovery(struct dlm_ctxt *dlm)
+{
+	int active = 0;
+
+	spin_lock(&dlm->spinlock);
+	if (dlm->reco.state & DLM_RECO_STATE_ACTIVE)
+		active = 1;
+	spin_unlock(&dlm->spinlock);
+	return active;
+}
+
+/* sleep until dlm_in_recovery() is false; woken through dlm->reco.event */
+void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
+{
+	wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
+}
+
+static void dlm_begin_recovery(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm->spinlock);
+	BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); /* never nests */
+	dlm->reco.state |= DLM_RECO_STATE_ACTIVE; /* now "in recovery" */
+	spin_unlock(&dlm->spinlock);
+}
+
+static void dlm_end_recovery(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm->spinlock);
+	BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE)); /* must be active */
+	dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
+	spin_unlock(&dlm->spinlock);
+	wake_up(&dlm->reco.event);	/* release dlm_wait_for_recovery() */
+}
+
+static int dlm_do_recovery(struct dlm_ctxt *dlm)
+{
+	int status = 0;
+	/* one recovery pass.  returns 0 to sleep, -EAGAIN to rescan at once */
+	spin_lock(&dlm->spinlock);
+
+	/* check to see if the new master has died */
+	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
+	    test_bit(dlm->reco.new_master, dlm->recovery_map)) {
+		mlog(0, "new master %u died while recovering %u!\n",
+		     dlm->reco.new_master, dlm->reco.dead_node);
+		/* unset the new_master, leave dead_node */
+		dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
+	}
+
+	/* select a target to recover */
+	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
+		int bit;
+
+		bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
+		if (bit >= O2NM_MAX_NODES || bit < 0)
+			dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+		else
+			dlm->reco.dead_node = bit;
+	} else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
+		/* BUG? */
+		mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n",
+		     dlm->reco.dead_node);
+		dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+	}
+
+	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
+		/* nothing to recover: no usable node in the recovery map */
+		spin_unlock(&dlm->spinlock);
+		/* return to main thread loop and sleep. */
+		return 0;
+	}
+	mlog(0, "recovery thread found node %u in the recovery map!\n",
+	     dlm->reco.dead_node);
+	spin_unlock(&dlm->spinlock);
+
+	/* take write barrier */
+	/* (stops the list reshuffling thread, proxy ast handling) */
+	dlm_begin_recovery(dlm);
+
+	if (dlm->reco.new_master == dlm->node_num)
+		goto master_here;
+
+	if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
+		/* choose a new master */
+		if (!dlm_pick_recovery_master(dlm)) {
+			/* already notified everyone.  go. */
+			dlm->reco.new_master = dlm->node_num;
+			goto master_here;
+		}
+		mlog(0, "another node will master this recovery session.\n");
+	}
+	mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n",
+	     dlm->name, dlm->reco.new_master,
+	     dlm->node_num, dlm->reco.dead_node);
+
+	/* it is safe to start everything back up here
+	 * because all of the dead node's lock resources
+	 * have been marked as in-recovery */
+	dlm_end_recovery(dlm);
+
+	/* sleep out in main dlm_recovery_thread loop. */
+	return 0;
+
+master_here:
+	mlog(0, "mastering recovery of %s:%u here(this=%u)!\n",
+	     dlm->name, dlm->reco.dead_node, dlm->node_num);
+
+	status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
+	if (status < 0) {
+		mlog(ML_ERROR, "error %d remastering locks for node %u, "
+		     "retrying.\n", status, dlm->reco.dead_node);
+	} else {
+		/* success!  see if any other nodes need recovery */
+		dlm_reset_recovery(dlm);
+	}
+	dlm_end_recovery(dlm);
+
+	/* rescan at once: retry this node or look for another dead one */
+	return -EAGAIN;
+}
+
+static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	int status = 0;
+	struct dlm_reco_node_data *ndata;
+	struct list_head *iter;
+	int all_nodes_done;
+	int destroy = 0;
+	int pass = 0;
+	/* recovery master: pull lock state from each node, then finalize */
+	status = dlm_init_recovery_area(dlm, dead_node);
+	if (status < 0)
+		goto leave;
+
+	/* safe to access the node data list without a lock, since this
+	 * process is the only one to change the list */
+	list_for_each(iter, &dlm->reco.node_data) {
+		ndata = list_entry (iter, struct dlm_reco_node_data, list);
+		BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
+		ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
+
+		mlog(0, "requesting lock info from node %u\n",
+		     ndata->node_num);
+
+		if (ndata->node_num == dlm->node_num) {
+			ndata->state = DLM_RECO_NODE_DATA_DONE;
+			continue;
+		}
+
+		status = dlm_request_all_locks(dlm, ndata->node_num, dead_node);
+		if (status < 0) {
+			mlog_errno(status);
+			if (dlm_is_host_down(status))
+				ndata->state = DLM_RECO_NODE_DATA_DEAD;
+			else {
+				destroy = 1;
+				goto leave;
+			}
+		}
+
+		switch (ndata->state) {
+			case DLM_RECO_NODE_DATA_INIT:
+			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+			case DLM_RECO_NODE_DATA_REQUESTED:
+				BUG();
+				break;
+			case DLM_RECO_NODE_DATA_DEAD:
+				mlog(0, "node %u died after requesting "
+				     "recovery info for node %u\n",
+				     ndata->node_num, dead_node);
+				// start all over
+				destroy = 1;
+				status = -EAGAIN;
+				goto leave;
+			case DLM_RECO_NODE_DATA_REQUESTING:
+				ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
+				mlog(0, "now receiving recovery data from "
+				     "node %u for dead node %u\n",
+				     ndata->node_num, dead_node);
+				break;
+			case DLM_RECO_NODE_DATA_RECEIVING:
+				mlog(0, "already receiving recovery data from "
+				     "node %u for dead node %u\n",
+				     ndata->node_num, dead_node);
+				break;
+			case DLM_RECO_NODE_DATA_DONE:
+				mlog(0, "already DONE receiving recovery data "
+				     "from node %u for dead node %u\n",
+				     ndata->node_num, dead_node);
+				break;
+		}
+	}
+
+	mlog(0, "done requesting all lock info\n");
+
+	/* nodes should be sending reco data now
+	 * just need to wait */
+
+	while (1) {
+		/* check all the nodes now to see if we are
+		 * done, or if anyone died */
+		all_nodes_done = 1;
+		spin_lock(&dlm_reco_state_lock);
+		list_for_each(iter, &dlm->reco.node_data) {
+			ndata = list_entry (iter, struct dlm_reco_node_data, list);
+
+			mlog(0, "checking recovery state of node %u\n",
+			     ndata->node_num);
+			switch (ndata->state) {
+				case DLM_RECO_NODE_DATA_INIT:
+				case DLM_RECO_NODE_DATA_REQUESTING:
+					mlog(ML_ERROR, "bad ndata state for "
+					     "node %u: state=%d\n",
+					     ndata->node_num, ndata->state);
+					BUG();
+					break;
+				case DLM_RECO_NODE_DATA_DEAD:
+					mlog(0, "node %u died after "
+					     "requesting recovery info for "
+					     "node %u\n", ndata->node_num,
+					     dead_node);
+					spin_unlock(&dlm_reco_state_lock);
+					// start all over
+					destroy = 1;
+					status = -EAGAIN;
+					goto leave;
+				case DLM_RECO_NODE_DATA_RECEIVING:
+				case DLM_RECO_NODE_DATA_REQUESTED:
+					all_nodes_done = 0;
+					break;
+				case DLM_RECO_NODE_DATA_DONE:
+					break;
+				case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+					break;
+			}
+		}
+		spin_unlock(&dlm_reco_state_lock);
+
+		mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass,
+		     all_nodes_done?"yes":"no");
+		if (all_nodes_done) {
+			int ret;
+
+			/* no node is still REQUESTED or RECEIVING, so
+			 * just send a finalize message to everyone and
+			 * clean up */
+			mlog(0, "all nodes are done! send finalize\n");
+			ret = dlm_send_finalize_reco_message(dlm);
+			if (ret < 0)
+				mlog_errno(ret);
+
+			spin_lock(&dlm->spinlock);
+			dlm_finish_local_lockres_recovery(dlm, dead_node,
+							  dlm->node_num);
+			spin_unlock(&dlm->spinlock);
+			mlog(0, "should be done with recovery!\n");
+
+			mlog(0, "finishing recovery of %s at %lu, "
+			     "dead=%u, this=%u, new=%u\n", dlm->name,
+			     jiffies, dlm->reco.dead_node,
+			     dlm->node_num, dlm->reco.new_master);
+			destroy = 1;
+			status = ret;
+			/* rescan everything marked dirty along the way */
+			dlm_kick_thread(dlm, NULL);
+			break;
+		}
+		/* wait to be signalled, with periodic timeout
+		 * to check for node death */
+		wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
+					 kthread_should_stop(),
+					 msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS));
+		/* NOTE(review): kthread_should_stop() does not end the loop */
+	}
+
+leave:
+	if (destroy)
+		dlm_destroy_recovery_area(dlm, dead_node);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* build dlm->reco.node_data with one INIT entry per node in a snapshot of
+ * the domain map.  returns 0, or -ENOMEM after tearing the list down. */
+static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct dlm_reco_node_data *entry;
+	int node;
+
+	/* nodes can only leave the domain (by dying) once this lock is
+	 * dropped, and such deaths are trapped later, so a copy will do */
+	spin_lock(&dlm->spinlock);
+	memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map));
+	spin_unlock(&dlm->spinlock);
+
+	for (node = find_next_bit(dlm->reco.node_map, O2NM_MAX_NODES, 0);
+	     node < O2NM_MAX_NODES;
+	     node = find_next_bit(dlm->reco.node_map, O2NM_MAX_NODES,
+				  node + 1)) {
+		BUG_ON(node == dead_node);
+
+		entry = kcalloc(1, sizeof(*entry), GFP_KERNEL);
+		if (!entry) {
+			dlm_destroy_recovery_area(dlm, dead_node);
+			return -ENOMEM;
+		}
+		entry->state = DLM_RECO_NODE_DATA_INIT;
+		entry->node_num = node;
+		spin_lock(&dlm_reco_state_lock);
+		list_add_tail(&entry->list, &dlm->reco.node_data);
+		spin_unlock(&dlm_reco_state_lock);
+	}
+
+	return 0;
+}
+
+/* detach and free every entry on dlm->reco.node_data (dead_node unused) */
+static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct dlm_reco_node_data *nd;
+	LIST_HEAD(doomed);
+
+	spin_lock(&dlm_reco_state_lock);
+	list_splice_init(&dlm->reco.node_data, &doomed);
+	spin_unlock(&dlm_reco_state_lock);
+
+	while (!list_empty(&doomed)) {
+		nd = list_entry(doomed.next, struct dlm_reco_node_data, list);
+		list_del_init(&nd->list);
+		kfree(nd);
+	}
+}
+
+/* ask node request_from for all it knows about locks mastered by dead_node.
+ * returns what o2net_send_message() returned (negative on failure). */
+static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
+				 u8 dead_node)
+{
+	struct dlm_lock_request lr;
+	/* must be a signed int: an enum dlm_status may be given an unsigned
+	 * type by the compiler, turning the ret < 0 test into dead code */
+	int ret;
+
+	mlog(0, "\n");
+
+	mlog(0, "dlm_request_all_locks: dead node is %u, sending request "
+		  "to %u\n", dead_node, request_from);
+
+	memset(&lr, 0, sizeof(lr));
+	lr.node_idx = dlm->node_num;
+	lr.dead_node = dead_node;
+
+	/* send message; no status word is requested (NULL) */
+	ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
+				 &lr, sizeof(lr), request_from, NULL);
+
+	/* negative status is handled by caller */
+	if (ret < 0)
+		mlog_errno(ret);
+
+	/* the caller then sleeps until all data is received or an error */
+	return ret;
+}
+
+/* handler for a lock request from the recovery master: queue a work item
+ * (scratch page plus its own dlm ref) for dlm_request_all_locks_worker.
+ * returns 0 when queued, -EINVAL if dlm_grab() fails, else -ENOMEM. */
+int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_lock_request *req = (struct dlm_lock_request *)msg->buf;
+	struct dlm_work_item *work;
+	struct dlm_ctxt *dlm = data;
+	char *page;
+	int ret = -ENOMEM;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+	BUG_ON(req->dead_node != dlm->reco.dead_node);
+
+	work = kcalloc(1, sizeof(*work), GFP_KERNEL);
+	if (!work)
+		goto out;
+	/* this will get freed by dlm_request_all_locks_worker */
+	page = (char *) __get_free_page(GFP_KERNEL);
+	if (!page) {
+		kfree(work);
+		goto out;
+	}
+
+	/* the work item needs a dlm ref of its own; ours is dropped below */
+	dlm_grab(dlm);
+	dlm_init_work_item(dlm, work, dlm_request_all_locks_worker, page);
+	work->u.ral.dead_node = req->dead_node;
+	work->u.ral.reco_master = req->node_idx;
+	spin_lock(&dlm->work_lock);
+	list_add_tail(&work->list, &dlm->work_list);
+	spin_unlock(&dlm->work_lock);
+	schedule_work(&dlm->dispatched_work);
+	ret = 0;
+out:
+	dlm_put(dlm);
+	return ret;
+}
+
+/* deferred half of the lock request handler.  sends the recovery master
+ * every lockres collected for the dead node, followed by an all-done
+ * message.  data is the page allocated by the handler; it is passed
+ * along as the mres buffer and freed here. */
+static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
+{
+	struct dlm_ctxt *dlm = item->dlm;
+	struct dlm_migratable_lockres *mres = data;
+	u8 reco_master = item->u.ral.reco_master;
+	u8 dead_node = item->u.ral.dead_node;
+	struct dlm_lock_resource *res;
+	struct list_head *pos;
+	LIST_HEAD(to_send);
+	int err;
+
+	BUG_ON(dead_node != dlm->reco.dead_node);
+	BUG_ON(reco_master != dlm->reco.new_master);
+
+	/* lock resources should already sit on the dlm->reco.resources
+	 * list.  pull those whose dead owner matches onto a private
+	 * list.  the whole cluster recovers only one node at a time,
+	 * so UNKNOWN lock resources can safely be moved for each
+	 * recovery session. */
+	dlm_move_reco_locks_to_list(dlm, &to_send, dead_node);
+
+	/* now we can begin blasting lockreses without the dlm lock */
+	list_for_each(pos, &to_send) {
+		res = list_entry(pos, struct dlm_lock_resource, recovering);
+		err = dlm_send_one_lockres(dlm, res, mres, reco_master,
+					   DLM_MRES_RECOVERY);
+		if (err < 0)
+			mlog_errno(err);
+	}
+
+	/* hand the lockreses back to the shared recovery list */
+	spin_lock(&dlm->spinlock);
+	list_splice_init(&to_send, &dlm->reco.resources);
+	spin_unlock(&dlm->spinlock);
+
+	/* tell the recovery master that nothing more is coming */
+	err = dlm_send_all_done_msg(dlm, dead_node, reco_master);
+	if (err < 0)
+		mlog_errno(err);
+
+	free_page((unsigned long)data);
+}
+
+
+/*
+ * Send a DATA DONE message for @dead_node to node @send_to.
+ *
+ * Returns the o2net send error if the send itself failed, otherwise
+ * the status filled in by the send call.
+ */
+static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
+{
+	struct dlm_reco_data_done done_msg;
+	int remote_status;
+	int err;
+
+	memset(&done_msg, 0, sizeof(done_msg));
+	done_msg.node_idx = dlm->node_num;
+	done_msg.dead_node = dead_node;
+	mlog(0, "sending DATA DONE message to %u, "
+	     "my node=%u, dead node=%u\n", send_to, done_msg.node_idx,
+	     done_msg.dead_node);
+
+	err = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
+				 sizeof(done_msg), send_to, &remote_status);
+	/* negative status is ignored by the caller */
+	if (err < 0)
+		return err;
+	return remote_status;
+}
+
+
+/*
+ * Network handler for a DATA DONE message.
+ *
+ * Looks up the sender (done->node_idx) in dlm->reco.node_data, marks its
+ * entry DLM_RECO_NODE_DATA_DONE and kicks the recovery thread.
+ *
+ * Returns 0 if a matching entry was updated, -EINVAL if the domain could
+ * not be grabbed or no entry matched the sender.  BUGs if the message is
+ * for a different dead node than the one recorded in dlm->reco, or if the
+ * sender's entry is in a state from which DONE is not expected.
+ */
+int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
+	struct list_head *iter;
+	struct dlm_reco_node_data *ndata = NULL;
+	int ret = -EINVAL;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, "
+	     "node_idx=%u, this node=%u\n", done->dead_node,
+	     dlm->reco.dead_node, done->node_idx, dlm->node_num);
+	BUG_ON(done->dead_node != dlm->reco.dead_node);
+
+	/* the per-node recovery state list is walked under the global
+	 * dlm_reco_state_lock */
+	spin_lock(&dlm_reco_state_lock);
+	list_for_each(iter, &dlm->reco.node_data) {
+		ndata = list_entry (iter, struct dlm_reco_node_data, list);
+		if (ndata->node_num != done->node_idx)
+			continue;
+
+		/* note: the 'break's below only leave the switch; the walk
+		 * continues over the remaining entries */
+		switch (ndata->state) {
+			case DLM_RECO_NODE_DATA_INIT:
+			case DLM_RECO_NODE_DATA_DEAD:
+			case DLM_RECO_NODE_DATA_DONE:
+			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+				mlog(ML_ERROR, "bad ndata state for node %u:"
+				     " state=%d\n", ndata->node_num,
+				     ndata->state);
+				BUG();
+				break;
+			case DLM_RECO_NODE_DATA_RECEIVING:
+			case DLM_RECO_NODE_DATA_REQUESTED:
+			case DLM_RECO_NODE_DATA_REQUESTING:
+				mlog(0, "node %u is DONE sending "
+					  "recovery data!\n",
+					  ndata->node_num);
+
+				ndata->state = DLM_RECO_NODE_DATA_DONE;
+				ret = 0;
+				break;
+		}
+	}
+	spin_unlock(&dlm_reco_state_lock);
+
+	/* wake the recovery thread, some node is done */
+	if (!ret)
+		dlm_kick_recovery_thread(dlm);
+
+	/* ret is still -EINVAL here only if no entry matched the sender */
+	if (ret < 0)
+		mlog(ML_ERROR, "failed to find recovery node data for node "
+		     "%u\n", done->node_idx);
+	dlm_put(dlm);
+
+	mlog(0, "leaving reco data done handler, ret=%d\n", ret);
+	return ret;
+}
+
+/*
+ * Under dlm->spinlock, walk dlm->reco.resources and move onto @list
+ * every lock resource (other than a recovery lock) whose owner is
+ * either @dead_node or DLM_LOCK_RES_OWNER_UNKNOWN.
+ */
+static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
+					struct list_head *list,
+					u8 dead_node)
+{
+	struct dlm_lock_resource *res;
+	struct list_head *pos, *next;
+
+	spin_lock(&dlm->spinlock);
+	list_for_each_safe(pos, next, &dlm->reco.resources) {
+		res = list_entry(pos, struct dlm_lock_resource, recovering);
+		if (dlm_is_recovery_lock(res->lockname.name,
+					 res->lockname.len))
+			continue;
+
+		if (res->owner == dead_node) {
+			mlog(0, "found lockres owned by dead node while "
+				  "doing recovery for node %u. sending it.\n",
+				  dead_node);
+		} else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+			mlog(0, "found UNKNOWN owner while doing recovery "
+				  "for node %u. sending it.\n", dead_node);
+		} else {
+			/* any other owner: leave it on the reco list */
+			continue;
+		}
+
+		list_del_init(&res->recovering);
+		list_add_tail(&res->recovering, list);
+	}
+	spin_unlock(&dlm->spinlock);
+}
+
+/*
+ * Count the locks on the granted, converting and blocked queues of @res.
+ *
+ * The queues are looked up through dlm_list_idx_to_ptr(), the same way
+ * dlm_send_one_lockres() walks them, rather than by stepping a list_head
+ * pointer from res->granted across neighbouring struct members.  That
+ * older form depended on the three heads being laid out back to back and
+ * indexed past the end of a single object.
+ */
+static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res)
+{
+	int total_locks = 0;
+	struct list_head *iter, *queue;
+	int i;
+
+	for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) {
+		queue = dlm_list_idx_to_ptr(res, i);
+		list_for_each(iter, queue)
+			total_locks++;
+	}
+	return total_locks;
+}
+
+
+/*
+ * Send the locks currently packed into @mres to node @send_to, then
+ * zero and reinitialise @mres for the same lockres so the caller can
+ * keep filling it.
+ *
+ * @total_locks is the number of locks added so far for this lockres;
+ * when it equals the total recorded in @mres the DLM_MRES_ALL_DONE flag
+ * is set on the outgoing message.  Nothing is sent (and 0 is returned)
+ * when @mres holds no locks.
+ *
+ * Returns the send error, or the status reported for the message.
+ * BUGs if that status is -EFAULT.
+ */
+static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
+				      struct dlm_migratable_lockres *mres,
+				      u8 send_to,
+				      struct dlm_lock_resource *res,
+				      int total_locks)
+{
+	/* remember what is needed to rebuild the header afterwards */
+	u64 mig_cookie = be64_to_cpu(mres->mig_cookie);
+	int mres_total_locks = be32_to_cpu(mres->total_locks);
+	u8 orig_flags = mres->flags;
+	u8 orig_master = mres->master;
+	int sz, ret, status = 0;
+
+	BUG_ON(mres->num_locks > DLM_MAX_MIGRATABLE_LOCKS);
+	if (mres->num_locks == 0)
+		return 0;
+
+	sz = sizeof(struct dlm_migratable_lockres) +
+		(mres->num_locks * sizeof(struct dlm_migratable_lock));
+
+	/* add an all-done flag if we reached the last lock */
+	BUG_ON(total_locks > mres_total_locks);
+	if (total_locks == mres_total_locks)
+		mres->flags |= DLM_MRES_ALL_DONE;
+
+	/* send it */
+	ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres,
+				 sz, send_to, &status);
+	if (ret >= 0) {
+		/* might get an -ENOMEM back here */
+		ret = status;
+		if (ret < 0) {
+			mlog_errno(ret);
+
+			if (ret == -EFAULT) {
+				mlog(ML_ERROR, "node %u told me to kill "
+				     "myself!\n", send_to);
+				BUG();
+			}
+		}
+	} else {
+		/* XXX: negative status is not handled.
+		 * this will end up killing this node. */
+		mlog_errno(ret);
+	}
+
+	/* zero and reinit the message buffer */
+	dlm_init_migratable_lockres(mres, res->lockname.name,
+				    res->lockname.len, mres_total_locks,
+				    mig_cookie, orig_flags, orig_master);
+	return ret;
+}
+
+/*
+ * Reset @mres to an empty message for the lockres named @lockname.
+ * The whole page is cleared first; @total_locks and @cookie are stored
+ * big-endian, @flags and @master are stored as given.
+ */
+static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
+					const char *lockname, int namelen,
+					int total_locks, u64 cookie,
+					u8 flags, u8 master)
+{
+	/* mres here is one full page */
+	memset(mres, 0, PAGE_SIZE);
+
+	/* header fields */
+	mres->flags = flags;
+	mres->master = master;
+	mres->num_locks = 0;
+	mres->total_locks = cpu_to_be32(total_locks);
+	mres->mig_cookie = cpu_to_be64(cookie);
+
+	/* lock name */
+	mres->lockname_len = namelen;
+	memcpy(mres->lockname, lockname, namelen);
+}
+
+
+/*
+ * Append @lock to the next free slot of @mres->ml[] and record which
+ * queue (@queue) it was found on.
+ *
+ * If the lock has an lksb, its flags are copied, and for EX or PR locks
+ * the lksb's lvb is copied into @mres->lvb.  BUGs if an lvb is already
+ * present in @mres and this lock is EX or its lvb differs.
+ *
+ * returns 1 if this lock fills the network structure,
+ * 0 otherwise
+ */
+static int dlm_add_lock_to_array(struct dlm_lock *lock,
+				 struct dlm_migratable_lockres *mres, int queue)
+{
+	struct dlm_migratable_lock *ml;
+	int lock_num = mres->num_locks;
+
+	/* fields are copied as stored in lock->ml, with no conversion */
+	ml = &(mres->ml[lock_num]);
+	ml->cookie = lock->ml.cookie;
+	ml->type = lock->ml.type;
+	ml->convert_type = lock->ml.convert_type;
+	ml->highest_blocked = lock->ml.highest_blocked;
+	ml->list = queue;
+	if (lock->lksb) {
+		ml->flags = lock->lksb->flags;
+		/* send our current lvb */
+		if (ml->type == LKM_EXMODE ||
+		    ml->type == LKM_PRMODE) {
+			/* if it is already set, this had better be a PR
+			 * and it has to match */
+			/* "already set" is judged by the first lvb byte only */
+			if (mres->lvb[0] && (ml->type == LKM_EXMODE ||
+			    memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
+				mlog(ML_ERROR, "mismatched lvbs!\n");
+				__dlm_print_one_lock_resource(lock->lockres);
+				BUG();
+			}
+			memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
+		}
+	}
+	ml->node = lock->ml.node;
+	mres->num_locks++;
+	/* we reached the max, send this network message */
+	if (mres->num_locks == DLM_MAX_MIGRATABLE_LOCKS)
+		return 1;
+	return 0;
+}
+
+
+/*
+ * Send every lock on @res (granted, converting and blocked queues) to
+ * node @send_to, using @mres as the message buffer.
+ *
+ * @flags must contain DLM_MRES_RECOVERY or DLM_MRES_MIGRATION.  Locks
+ * are packed into @mres; each time it fills up it is sent and reset,
+ * and whatever remains is sent after the last queue.  A migration cookie
+ * is only fetched when the lockres holds more than
+ * DLM_MAX_MIGRATABLE_LOCKS locks, i.e. when several messages are needed.
+ *
+ * Any negative result from dlm_send_mig_lockres_msg() currently BUGs
+ * (see the TODOs), so a normal return carries a non-negative value.
+ */
+int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			 struct dlm_migratable_lockres *mres,
+			 u8 send_to, u8 flags)
+{
+	struct list_head *queue, *iter;
+	int total_locks, i;
+	u64 mig_cookie = 0;
+	struct dlm_lock *lock;
+	int ret = 0;
+
+	BUG_ON(!(flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
+
+	mlog(0, "sending to %u\n", send_to);
+
+	total_locks = dlm_num_locks_in_lockres(res);
+	if (total_locks > DLM_MAX_MIGRATABLE_LOCKS) {
+		/* rare, but possible */
+		mlog(0, "argh.  lockres has %d locks.  this will "
+			  "require more than one network packet to "
+			  "migrate\n", total_locks);
+		mig_cookie = dlm_get_next_mig_cookie();
+	}
+
+	dlm_init_migratable_lockres(mres, res->lockname.name,
+				    res->lockname.len, total_locks,
+				    mig_cookie, flags, res->owner);
+
+	/* from here on total_locks counts the locks added so far; the
+	 * overall total now lives in the mres header */
+	total_locks = 0;
+	for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) {
+		queue = dlm_list_idx_to_ptr(res, i);
+		list_for_each(iter, queue) {
+			lock = list_entry (iter, struct dlm_lock, list);
+
+			/* add another lock. */
+			total_locks++;
+			if (!dlm_add_lock_to_array(lock, mres, i))
+				continue;
+
+			/* this filled the lock message,
+			 * we must send it immediately. */
+			ret = dlm_send_mig_lockres_msg(dlm, mres, send_to,
+						       res, total_locks);
+			if (ret < 0) {
+				// TODO
+				mlog(ML_ERROR, "dlm_send_mig_lockres_msg "
+				     "returned %d, TODO\n", ret);
+				BUG();
+			}
+		}
+	}
+	/* flush any remaining locks */
+	ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
+	if (ret < 0) {
+		// TODO
+		mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, "
+		     "TODO\n", ret);
+		BUG();
+	}
+	return ret;
+}
+
+
+
+/*
+ * Network handler for a migratable-lockres message (recovery or
+ * migration).
+ *
+ * this message will contain no more than one page worth of
+ * recovery data, and it will work on only one lockres.
+ * there may be many locks in this page, and we may need to wait
+ * for additional packets to complete all the locks (rare, but
+ * possible).
+ *
+ * The handler finds or creates the lockres named in the message, marks
+ * it RECOVERING or MIGRATING, copies the message and queues a work item
+ * for dlm_mig_lockres_worker.  The lockres reference obtained here is
+ * handed to that work item.
+ *
+ * Returns 0 once the work is queued, -EINVAL if the domain cannot be
+ * grabbed, -ENOMEM on allocation failure, or -EFAULT if a migration
+ * message arrives for a lockres that is marked as recovering.  On every
+ * failure path the buffers allocated here are freed, and the -EFAULT
+ * path also drops the lockres reference taken by the lookup.
+ *
+ * NOTE: the allocation error cases here are scary
+ * we really cannot afford to fail an alloc in recovery
+ * do we spin?  returning an error only delays the problem really
+ */
+
+int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_migratable_lockres *mres =
+		(struct dlm_migratable_lockres *)msg->buf;
+	int ret = 0;
+	u8 real_master;
+	char *buf = NULL;
+	struct dlm_work_item *item = NULL;
+	struct dlm_lock_resource *res = NULL;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
+
+	real_master = mres->master;
+	if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
+		/* cannot migrate a lockres with no master */
+		BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
+	}
+
+	mlog(0, "%s message received from node %u\n",
+		  (mres->flags & DLM_MRES_RECOVERY) ?
+		  "recovery" : "migration", mres->master);
+	if (mres->flags & DLM_MRES_ALL_DONE)
+		mlog(0, "all done flag.  all lockres data received!\n");
+
+	/* allocate everything up front; both results are checked together */
+	ret = -ENOMEM;
+	buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL);
+	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+	if (!buf || !item)
+		goto leave;
+
+	/* lookup the lock to see if we have a secondary queue for this
+	 * already...  just add the locks in and this will have its owner
+	 * and RECOVERY flag changed when it completes. */
+	res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len);
+	if (res) {
+		/* this will get a ref on res */
+		/* mark it as recovering/migrating and hash it */
+		spin_lock(&res->spinlock);
+		if (mres->flags & DLM_MRES_RECOVERY) {
+			res->state |= DLM_LOCK_RES_RECOVERING;
+		} else {
+			if (res->state & DLM_LOCK_RES_MIGRATING) {
+				/* this is at least the second
+				 * lockres message */
+				mlog(0, "lock %.*s is already migrating\n",
+					  mres->lockname_len,
+					  mres->lockname);
+			} else if (res->state & DLM_LOCK_RES_RECOVERING) {
+				/* caller should BUG */
+				mlog(ML_ERROR, "node is attempting to migrate "
+				     "lock %.*s, but marked as recovering!\n",
+				     mres->lockname_len, mres->lockname);
+				ret = -EFAULT;
+				spin_unlock(&res->spinlock);
+				/* no work item will be queued, so nothing
+				 * else will ever drop the lookup's ref */
+				dlm_lockres_put(res);
+				goto leave;
+			}
+			res->state |= DLM_LOCK_RES_MIGRATING;
+		}
+		spin_unlock(&res->spinlock);
+	} else {
+		/* need to allocate, just like if it was
+		 * mastered here normally  */
+		res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);
+		if (!res)
+			goto leave;
+
+		/* to match the ref that we would have gotten if
+		 * dlm_lookup_lockres had succeeded */
+		dlm_lockres_get(res);
+
+		/* mark it as recovering/migrating and hash it */
+		if (mres->flags & DLM_MRES_RECOVERY)
+			res->state |= DLM_LOCK_RES_RECOVERING;
+		else
+			res->state |= DLM_LOCK_RES_MIGRATING;
+
+		spin_lock(&dlm->spinlock);
+		__dlm_insert_lockres(dlm, res);
+		spin_unlock(&dlm->spinlock);
+
+		/* now that the new lockres is inserted,
+		 * make it usable by other processes */
+		spin_lock(&res->spinlock);
+		res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+		spin_unlock(&res->spinlock);
+
+		/* add an extra ref for just-allocated lockres
+		 * otherwise the lockres will be purged immediately */
+		dlm_lockres_get(res);
+
+	}
+
+	/* at this point we have allocated everything we need,
+	 * and we have a hashed lockres with an extra ref and
+	 * the proper res->state flags. */
+	ret = 0;
+	if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) {
+		/* migration cannot have an unknown master */
+		BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
+		mlog(0, "recovery has passed me a lockres with an "
+			  "unknown owner.. will need to requery: "
+			  "%.*s\n", mres->lockname_len, mres->lockname);
+	} else {
+		spin_lock(&res->spinlock);
+		dlm_change_lockres_owner(dlm, res, dlm->node_num);
+		spin_unlock(&res->spinlock);
+	}
+
+	/* queue up work for dlm_mig_lockres_worker */
+	dlm_grab(dlm);  /* get an extra ref for the work item */
+	memcpy(buf, msg->buf, be16_to_cpu(msg->data_len));  /* copy the whole message */
+	dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf);
+	item->u.ml.lockres = res; /* already have a ref */
+	item->u.ml.real_master = real_master;
+	spin_lock(&dlm->work_lock);
+	list_add_tail(&item->list, &dlm->work_list);
+	spin_unlock(&dlm->work_lock);
+	schedule_work(&dlm->dispatched_work);
+
+leave:
+	dlm_put(dlm);
+	if (ret < 0) {
+		/* kfree(NULL) is a no-op, so no guards are needed */
+		kfree(buf);
+		kfree(item);
+	}
+
+	mlog_exit(ret);
+	return ret;
+}
+
+
+/*
+ * Deferred half of dlm_mig_lockres_handler.
+ *
+ * @data is the copy of the network message made by the handler (a
+ * struct dlm_migratable_lockres), not a dlm context; the domain comes
+ * from item->dlm.  If the sender did not know the master, the other
+ * nodes are requeried first: when some node claims the lockres it is
+ * left alone, otherwise the received locks are applied with
+ * dlm_process_recovery_data().  A migration message carrying
+ * DLM_MRES_ALL_DONE additionally completes the migration.
+ *
+ * Frees @data before returning.
+ */
+static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data)
+{
+	struct dlm_ctxt *dlm = item->dlm;
+	struct dlm_migratable_lockres *mres =
+		(struct dlm_migratable_lockres *)data;
+	int ret = 0;
+	struct dlm_lock_resource *res;
+	u8 real_master;
+
+	res = item->u.ml.lockres;
+	real_master = item->u.ml.real_master;
+
+	if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
+		/* this case is super-rare. only occurs if
+		 * node death happens during migration. */
+again:
+		ret = dlm_lockres_master_requery(dlm, res, &real_master);
+		if (ret < 0) {
+			/* retried until the requery succeeds */
+			mlog(0, "dlm_lockres_master_requery failure: %d\n",
+				  ret);
+			goto again;
+		}
+		if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
+			mlog(0, "lockres %.*s not claimed.  "
+				   "this node will take it.\n",
+				   res->lockname.len, res->lockname.name);
+		} else {
+			mlog(0, "master needs to respond to sender "
+				  "that node %u still owns %.*s\n",
+				  real_master, res->lockname.len,
+				  res->lockname.name);
+			/* cannot touch this lockres */
+			goto leave;
+		}
+	}
+
+	ret = dlm_process_recovery_data(dlm, res, mres);
+	if (ret < 0)
+		mlog(0, "dlm_process_recovery_data returned  %d\n", ret);
+	else
+		mlog(0, "dlm_process_recovery_data succeeded\n");
+
+	/* last message of a migration: finish it */
+	if ((mres->flags & (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) ==
+	                   (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) {
+		ret = dlm_finish_migration(dlm, res, mres->master);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+
+leave:
+	kfree(data);
+	mlog_exit(ret);
+}
+
+
+
+/*
+ * Ask the other nodes in the domain map who masters @res.
+ *
+ * *real_master starts out as DLM_LOCK_RES_OWNER_UNKNOWN and is set to
+ * the first answer that names a node; the scan stops there.  A failed
+ * query currently BUGs (see the TODO), so on return ret holds the result
+ * of the last query made, or 0 if no node was queried.
+ */
+static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      u8 *real_master)
+{
+	struct dlm_node_iter iter;
+	int nodenum;
+	int ret = 0;
+
+	*real_master = DLM_LOCK_RES_OWNER_UNKNOWN;
+
+	/* we only reach here if one of the two nodes in a
+	 * migration died while the migration was in progress.
+	 * at this point we need to requery the master.  we
+	 * know that the new_master got as far as creating
+	 * an mle on at least one node, but we do not know
+	 * if any nodes had actually cleared the mle and set
+	 * the master to the new_master.  the old master
+	 * is supposed to set the owner to UNKNOWN in the
+	 * event of a new_master death, so the only possible
+	 * responses that we can get from nodes here are
+	 * that the master is new_master, or that the master
+	 * is UNKNOWN.
+	 * if all nodes come back with UNKNOWN then we know
+	 * the lock needs remastering here.
+	 * if any node comes back with a valid master, check
+	 * to see if that master is the one that we are
+	 * recovering.  if so, then the new_master died and
+	 * we need to remaster this lock.  if not, then the
+	 * new_master survived and that node will respond to
+	 * other nodes about the owner.
+	 * if there is an owner, this node needs to dump this
+	 * lockres and alert the sender that this lockres
+	 * was rejected. */
+	/* initialise the iterator from the domain map under the dlm lock;
+	 * the queries below are sent after the lock is dropped */
+	spin_lock(&dlm->spinlock);
+	dlm_node_iter_init(dlm->domain_map, &iter);
+	spin_unlock(&dlm->spinlock);
+
+	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+		/* do not send to self */
+		if (nodenum == dlm->node_num)
+			continue;
+		ret = dlm_do_master_requery(dlm, res, nodenum, real_master);
+		if (ret < 0) {
+			mlog_errno(ret);
+			BUG();
+			/* TODO: need to figure a way to restart this */
+		}
+		if (*real_master != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			mlog(0, "lock master is %u\n", *real_master);
+			break;
+		}
+	}
+	return ret;
+}
+
+
+/*
+ * Send one master requery for @res to node @nodenum.
+ *
+ * On success the reported owner is stored in *real_master and 0 is
+ * returned.  If the send fails the error is logged and returned, and
+ * *real_master is left untouched.
+ */
+static int dlm_do_master_requery(struct dlm_ctxt *dlm,
+				 struct dlm_lock_resource *res,
+				 u8 nodenum, u8 *real_master)
+{
+	struct dlm_master_requery req;
+	int status = DLM_LOCK_RES_OWNER_UNKNOWN;
+	int ret;
+
+	memset(&req, 0, sizeof(req));
+	req.node_idx = dlm->node_num;
+	req.namelen = res->lockname.len;
+	memcpy(req.name, res->lockname.name, res->lockname.len);
+
+	ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key,
+				 &req, sizeof(req), nodenum, &status);
+	/* XXX: negative status not handled properly here. */
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/* the reply must be a node number or UNKNOWN */
+	BUG_ON(status < 0);
+	BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
+	*real_master = (u8) (status & 0xff);
+	mlog(0, "node %u responded to master requery with %u\n",
+		  nodenum, *real_master);
+	return 0;
+}
+
+
+/*
+ * Network handler for a master requery: reports who this node believes
+ * owns the named lockres.
+ *
+ * this function cannot error, so unless the sending
+ * or receiving of the message failed, the owner can
+ * be trusted
+ *
+ * Returns the owner recorded on the lockres, or
+ * DLM_LOCK_RES_OWNER_UNKNOWN if the domain is gone or the lockres is
+ * not found.  When this node is the owner an assert-master is also
+ * dispatched; failure to dispatch it is logged with the real error code
+ * and then BUGs.
+ */
+int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	int master = DLM_LOCK_RES_OWNER_UNKNOWN;
+	u32 flags = DLM_ASSERT_MASTER_REQUERY;
+
+	if (!dlm_grab(dlm)) {
+		/* since the domain has gone away on this
+		 * node, the proper response is UNKNOWN */
+		return master;
+	}
+
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres(dlm, req->name, req->namelen);
+	if (res) {
+		spin_lock(&res->spinlock);
+		master = res->owner;
+		if (master == dlm->node_num) {
+			int ret = dlm_dispatch_assert_master(dlm, res,
+							     0, 0, flags);
+			if (ret < 0) {
+				/* log what actually went wrong rather
+				 * than assuming -ENOMEM */
+				mlog_errno(ret);
+				/* retry!? */
+				BUG();
+			}
+		}
+		spin_unlock(&res->spinlock);
+	}
+	spin_unlock(&dlm->spinlock);
+
+	dlm_put(dlm);
+	return master;
+}
+
+/*
+ * Map a queue number taken from a migratable lock (ml->list) to the
+ * matching list head on @res.  BUGs on a number outside 0..2.
+ *
+ * Each queue is named explicitly instead of being reached by adding
+ * @list_num to &res->granted, which depended on the three list heads
+ * being adjacent in struct dlm_lock_resource.
+ */
+static inline struct list_head *
+dlm_list_num_to_pointer(struct dlm_lock_resource *res, int list_num)
+{
+	BUG_ON(list_num < 0);
+	BUG_ON(list_num > 2);
+
+	switch (list_num) {
+	case DLM_GRANTED_LIST:
+		return &res->granted;
+	case DLM_CONVERTING_LIST:
+		return &res->converting;
+	case DLM_BLOCKED_LIST:
+		return &res->blocked;
+	}
+
+	/* not reached unless the list constants stop being 0..2 */
+	BUG();
+	return NULL;
+}
+/* TODO: do ast flush business
+ * TODO: do MIGRATING and RECOVERING spinning
+ */
+
+/*
+* NOTE about in-flight requests during migration:
+*
+* Before attempting the migrate, the master has marked the lockres as
+* MIGRATING and then flushed all of its pending ASTS.  So any in-flight
+* requests either got queued before the MIGRATING flag got set, in which
+* case the lock data will reflect the change and a return message is on
+* the way, or the request failed to get in before MIGRATING got set.  In
+* this case, the caller will be told to spin and wait for the MIGRATING
+* flag to be dropped, then recheck the master.
+* This holds true for the convert, cancel and unlock cases, and since lvb
+* updates are tied to these same messages, it applies to lvb updates as
+* well.  For the lock case, there is no way a lock can be on the master
+* queue and not be on the secondary queue since the lock is always added
+* locally first.  This means that the new target node will never be sent
+* a lock that he doesn't already have on the list.
+* In total, this means that the local lock is correct and should not be
+* updated to match the one sent by the master.  Any messages sent back
+* from the master before the MIGRATING flag will bring the lock properly
+* up-to-date, and the change will be ordered properly for the waiter.
+* We will *not* attempt to modify the lock underneath the waiter.
+*/
+
+/*
+ * Apply the locks carried in @mres to @res.
+ *
+ * For each entry in mres->ml[]:
+ *  - a lock belonging to this node (migration only) must already exist
+ *    on the target queue; it is looked up by cookie and moved to the
+ *    tail of that queue without being modified.
+ *  - a lock belonging to another node is recreated with dlm_new_lock(),
+ *    attached to @res, given the sender's convert type and lvb flags,
+ *    and appended to the target queue.
+ *
+ * Returns 0 on success or -ENOMEM if a lock could not be allocated;
+ * entries processed before the failure stay applied.
+ */
+static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_migratable_lockres *mres)
+{
+	struct dlm_migratable_lock *ml;
+	struct list_head *queue;
+	struct dlm_lock *newlock = NULL;
+	struct dlm_lockstatus *lksb = NULL;
+	int ret = 0;
+	int i;
+	struct list_head *iter;
+	struct dlm_lock *lock = NULL;
+
+	mlog(0, "running %d locks for this lockres\n", mres->num_locks);
+	for (i=0; i<mres->num_locks; i++) {
+		ml = &(mres->ml[i]);
+		BUG_ON(ml->highest_blocked != LKM_IVMODE);
+		newlock = NULL;
+		lksb = NULL;
+
+		queue = dlm_list_num_to_pointer(res, ml->list);
+
+		/* if the lock is for the local node it needs to
+		 * be moved to the proper location within the queue.
+		 * do not allocate a new lock structure. */
+		if (ml->node == dlm->node_num) {
+			/* MIGRATION ONLY! */
+			BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
+
+			spin_lock(&res->spinlock);
+			/* start every search from scratch: if the queue is
+			 * empty the loop body never runs, and a match left
+			 * over from an earlier entry must not be mistaken
+			 * for this one */
+			lock = NULL;
+			list_for_each(iter, queue) {
+				lock = list_entry (iter, struct dlm_lock, list);
+				if (lock->ml.cookie != ml->cookie)
+					lock = NULL;
+				else
+					break;
+			}
+
+			/* lock is always created locally first, and
+			 * destroyed locally last.  it must be on the list */
+			if (!lock) {
+				mlog(ML_ERROR, "could not find local lock "
+					       "with cookie %"MLFu64"!\n",
+				     ml->cookie);
+				BUG();
+			}
+			BUG_ON(lock->ml.node != ml->node);
+
+			/* the local lock is deliberately not updated to
+			 * match the copy sent by the master */
+
+			/* move the lock to its proper place */
+			/* do not alter lock refcount.  switching lists. */
+			list_del_init(&lock->list);
+			list_add_tail(&lock->list, queue);
+			spin_unlock(&res->spinlock);
+
+			mlog(0, "just reordered a local lock!\n");
+			continue;
+		}
+
+		/* lock is for another node. */
+		newlock = dlm_new_lock(ml->type, ml->node,
+				       be64_to_cpu(ml->cookie), NULL);
+		if (!newlock) {
+			ret = -ENOMEM;
+			goto leave;
+		}
+		lksb = newlock->lksb;
+		dlm_lock_attach_lockres(newlock, res);
+
+		if (ml->convert_type != LKM_IVMODE) {
+			BUG_ON(queue != &res->converting);
+			newlock->ml.convert_type = ml->convert_type;
+		}
+		lksb->flags |= (ml->flags &
+				(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
+
+		if (mres->lvb[0]) {
+			if (lksb->flags & DLM_LKSB_PUT_LVB) {
+				/* other node was trying to update
+				 * lvb when node died.  recreate the
+				 * lksb with the updated lvb. */
+				memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN);
+			} else {
+				/* otherwise, the node is sending its
+				 * most recent valid lvb info */
+				BUG_ON(ml->type != LKM_EXMODE &&
+				       ml->type != LKM_PRMODE);
+				if (res->lvb[0] && (ml->type == LKM_EXMODE ||
+				    memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
+					mlog(ML_ERROR, "received bad lvb!\n");
+					__dlm_print_one_lock_resource(res);
+					BUG();
+				}
+				memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
+			}
+		}
+
+
+		/* NOTE:
+		 * wrt lock queue ordering and recovery:
+		 *    1. order of locks on granted queue is
+		 *       meaningless.
+		 *    2. order of locks on converting queue is
+		 *       LOST with the node death.  sorry charlie.
+		 *    3. order of locks on the blocked queue is
+		 *       also LOST.
+		 * order of locks does not affect integrity, it
+		 * just means that a lock request may get pushed
+		 * back in line as a result of the node death.
+		 * also note that for a given node the lock order
+		 * for its secondary queue locks is preserved
+		 * relative to each other, but clearly *not*
+		 * preserved relative to locks from other nodes.
+		 */
+		spin_lock(&res->spinlock);
+		dlm_lock_get(newlock);
+		list_add_tail(&newlock->list, queue);
+		spin_unlock(&res->spinlock);
+	}
+	mlog(0, "done running all the locks\n");
+
+leave:
+	if (ret < 0) {
+		mlog_errno(ret);
+		if (newlock)
+			dlm_lock_put(newlock);
+	}
+
+	mlog_exit(ret);
+	return ret;
+}
+
+/*
+ * Mark @res as RECOVERING, put it at the tail of dlm->reco.resources,
+ * and resolve any lock on it that still has an operation pending:
+ *
+ *   convert pending -> dlm_revert_pending_convert()
+ *   lock pending    -> dlm_revert_pending_lock()
+ *   unlock pending  -> dlm_commit_pending_unlock()
+ *   cancel pending  -> dlm_commit_pending_cancel()
+ *
+ * Each case BUGs if the lock is found on a queue other than the one the
+ * pending operation implies.  No lock is taken here.
+ * NOTE(review): presumably callers hold both dlm->spinlock and
+ * res->spinlock -- confirm against the call sites.
+ */
+void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
+				       struct dlm_lock_resource *res)
+{
+	int i;
+	struct list_head *queue, *iter, *iter2;
+	struct dlm_lock *lock;
+
+	res->state |= DLM_LOCK_RES_RECOVERING;
+	/* re-queue at the tail if it is already on a recovering list */
+	if (!list_empty(&res->recovering))
+		list_del_init(&res->recovering);
+	list_add_tail(&res->recovering, &dlm->reco.resources);
+
+	/* find any pending locks and put them back on proper list */
+	/* queues are visited from blocked down to granted */
+	for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) {
+		queue = dlm_list_idx_to_ptr(res, i);
+		list_for_each_safe(iter, iter2, queue) {
+			lock = list_entry (iter, struct dlm_lock, list);
+			/* hold a ref across the helper calls below */
+			dlm_lock_get(lock);
+			if (lock->convert_pending) {
+				/* move converting lock back to granted */
+				BUG_ON(i != DLM_CONVERTING_LIST);
+				mlog(0, "node died with convert pending "
+				     "on %.*s. move back to granted list.\n",
+				     res->lockname.len, res->lockname.name);
+				dlm_revert_pending_convert(res, lock);
+				lock->convert_pending = 0;
+			} else if (lock->lock_pending) {
+				/* remove pending lock requests completely */
+				BUG_ON(i != DLM_BLOCKED_LIST);
+				mlog(0, "node died with lock pending "
+				     "on %.*s. remove from blocked list and skip.\n",
+				     res->lockname.len, res->lockname.name);
+				/* lock will be floating until ref in
+				 * dlmlock_remote is freed after the network
+				 * call returns.  ok for it to not be on any
+				 * list since no ast can be called
+				 * (the master is dead). */
+				dlm_revert_pending_lock(res, lock);
+				lock->lock_pending = 0;
+			} else if (lock->unlock_pending) {
+				/* if an unlock was in progress, treat as
+				 * if this had completed successfully
+				 * before sending this lock state to the
+				 * new master.  note that the dlm_unlock
+				 * call is still responsible for calling
+				 * the unlockast.  that will happen after
+				 * the network call times out.  for now,
+				 * just move lists to prepare the new
+				 * recovery master.  */
+				BUG_ON(i != DLM_GRANTED_LIST);
+				/* NOTE(review): the message below says
+				 * "blocked list" although this branch only
+				 * runs for the granted list -- looks like a
+				 * copy of the lock_pending text */
+				mlog(0, "node died with unlock pending "
+				     "on %.*s. remove from blocked list and skip.\n",
+				     res->lockname.len, res->lockname.name);
+				dlm_commit_pending_unlock(res, lock);
+				lock->unlock_pending = 0;
+			} else if (lock->cancel_pending) {
+				/* if a cancel was in progress, treat as
+				 * if this had completed successfully
+				 * before sending this lock state to the
+				 * new master */
+				BUG_ON(i != DLM_CONVERTING_LIST);
+				mlog(0, "node died with cancel pending "
+				     "on %.*s. move back to granted list.\n",
+				     res->lockname.len, res->lockname.name);
+				dlm_commit_pending_cancel(res, lock);
+				lock->cancel_pending = 0;
+			}
+			dlm_lock_put(lock);
+		}
+	}
+}
+
+
+
+/* removes all recovered locks from the recovery list.
+ * sets the res->owner to the new master.
+ * unsets the RECOVERY flag and wakes waiters.
+ *
+ * Caller must hold dlm->spinlock (asserted).  Two passes are made:
+ * first over dlm->reco.resources for lockreses owned by @dead_node,
+ * then over the whole hash for any lockres still flagged RECOVERING
+ * whose owner is @dead_node or this node. */
+static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
+					      u8 dead_node, u8 new_master)
+{
+	int i;
+	struct list_head *iter, *iter2, *bucket;
+	struct dlm_lock_resource *res;
+
+	mlog_entry_void();
+
+	assert_spin_locked(&dlm->spinlock);
+
+	/* pass 1: everything on the recovery list owned by the dead node */
+	list_for_each_safe(iter, iter2, &dlm->reco.resources) {
+		res = list_entry (iter, struct dlm_lock_resource, recovering);
+		if (res->owner == dead_node) {
+			list_del_init(&res->recovering);
+			spin_lock(&res->spinlock);
+			dlm_change_lockres_owner(dlm, res, new_master);
+			res->state &= ~DLM_LOCK_RES_RECOVERING;
+			__dlm_dirty_lockres(dlm, res);
+			spin_unlock(&res->spinlock);
+			wake_up(&res->wq);
+		}
+	}
+
+	/* this will become unnecessary eventually, but
+	 * for now we need to run the whole hash, clear
+	 * the RECOVERING state and set the owner
+	 * if necessary */
+	for (i=0; i<DLM_HASH_SIZE; i++) {
+		bucket = &(dlm->resources[i]);
+		list_for_each(iter, bucket) {
+			res = list_entry (iter, struct dlm_lock_resource, list);
+			if (res->state & DLM_LOCK_RES_RECOVERING) {
+				/* note: both messages print new_master in
+				 * the "owner=%u" slot, not res->owner */
+				if (res->owner == dead_node) {
+					mlog(0, "(this=%u) res %.*s owner=%u "
+					     "was not on recovering list, but "
+					     "clearing state anyway\n",
+					     dlm->node_num, res->lockname.len,
+					     res->lockname.name, new_master);
+				} else if (res->owner == dlm->node_num) {
+					mlog(0, "(this=%u) res %.*s owner=%u "
+					     "was not on recovering list, "
+					     "owner is THIS node, clearing\n",
+					     dlm->node_num, res->lockname.len,
+					     res->lockname.name, new_master);
+				} else
+					continue;
+
+				/* same finishing steps as in pass 1 */
+				spin_lock(&res->spinlock);
+				dlm_change_lockres_owner(dlm, res, new_master);
+				res->state &= ~DLM_LOCK_RES_RECOVERING;
+				__dlm_dirty_lockres(dlm, res);
+				spin_unlock(&res->spinlock);
+				wake_up(&res->wq);
+			}
+		}
+	}
+}
+
+/*
+ * Decide whether @lock means the lvb can no longer be trusted.
+ *
+ * With @local set, any lock type other than EX or PR invalidates it;
+ * with @local clear, only an EX lock does.  Returns 1 or 0.
+ */
+static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local)
+{
+	if (!local)
+		return lock->ml.type == LKM_EXMODE;
+
+	return lock->ml.type != LKM_EXMODE &&
+	       lock->ml.type != LKM_PRMODE;
+}
+
+/*
+ * Clear the lvb of @res (and of the offending locks' lksbs) when the
+ * death of @dead_node means it can no longer be trusted.
+ *
+ * Caller must hold dlm->spinlock and res->spinlock (both asserted).
+ * Only the granted and converting queues are examined.
+ */
+static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
+			       struct dlm_lock_resource *res, u8 dead_node)
+{
+	struct list_head *iter, *queue;
+	struct dlm_lock *lock;
+	int blank_lvb = 0, local = 0;
+	int i;
+	u8 search_node;
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
+
+	if (res->owner == dlm->node_num)
+		/* if this node owned the lockres, and if the dead node
+		 * had an EX when he died, blank out the lvb */
+		search_node = dead_node;
+	else {
+		/* if this is a secondary lockres, and we had no EX or PR
+		 * locks granted, we can no longer trust the lvb */
+		search_node = dlm->node_num;
+		local = 1;  /* check local state for valid lvb */
+	}
+
+	for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) {
+		queue = dlm_list_idx_to_ptr(res, i);
+		list_for_each(iter, queue) {
+			lock = list_entry (iter, struct dlm_lock, list);
+			if (lock->ml.node == search_node) {
+				if (dlm_lvb_needs_invalidation(lock, local)) {
+					/* zero the lksb lvb and lockres lvb */
+					/* (the lockres lvb is zeroed once,
+					 * after the scan) */
+					blank_lvb = 1;
+					memset(lock->lksb->lvb, 0, DLM_LVB_LEN);
+				}
+			}
+		}
+	}
+
+	if (blank_lvb) {
+		/* note: this text is also printed for the local case, where
+		 * the trigger is a local lock that is neither EX nor PR */
+		mlog(0, "clearing %.*s lvb, dead node %u had EX\n",
+		     res->lockname.len, res->lockname.name, dead_node);
+		memset(res->lvb, 0, DLM_LVB_LEN);
+	}
+}
+
+/*
+ * Remove every lock held by @dead_node from the granted, converting
+ * and blocked queues of @res, dropping one reference per removed lock,
+ * then mark the lockres dirty.
+ *
+ * Caller must hold dlm->spinlock and res->spinlock (both asserted).
+ */
+static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
+				struct dlm_lock_resource *res, u8 dead_node)
+{
+	struct list_head *queues[3];
+	struct list_head *pos, *next;
+	struct dlm_lock *lock;
+	int q;
+
+	/* this node is the lockres master:
+	 * 1) remove any stale locks for the dead node
+	 * 2) if the dead node had an EX when he died, blank out the lvb
+	 */
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
+
+	/* same order as before: granted, converting, blocked */
+	queues[0] = &res->granted;
+	queues[1] = &res->converting;
+	queues[2] = &res->blocked;
+
+	/* TODO: check pending_asts, pending_basts here */
+	for (q = 0; q < 3; q++) {
+		list_for_each_safe(pos, next, queues[q]) {
+			lock = list_entry(pos, struct dlm_lock, list);
+			if (lock->ml.node != dead_node)
+				continue;
+			list_del_init(&lock->list);
+			dlm_lock_put(lock);
+		}
+	}
+
+	/* do not kick thread yet */
+	__dlm_dirty_lockres(dlm, res);
+}
+
+/* if this node is the recovery master, and there are no
+ * locks for a given lockres owned by this node that are in
+ * either PR or EX mode, zero out the lvb before requesting.
+ *
+ */
+
+
+static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct list_head *iter;
+	struct dlm_lock_resource *res;
+	int i;
+	struct list_head *bucket;
+
+
+	/* purge any stale mles */
+	dlm_clean_master_list(dlm, dead_node);
+
+	/*
+	 * now clean up all lock resources.  there are two rules:
+	 *
+	 * 1) if the dead node was the master, move the lockres
+	 *    to the recovering list.  set the RECOVERING flag.
+	 *    this lockres needs to be cleaned up before it can
+	 *    be used further.
+	 *
+	 * 2) if this node was the master, remove all locks from
+	 *    each of the lockres queues that were owned by the
+	 *    dead node.  once recovery finishes, the dlm thread
+	 *    can be kicked again to see if any ASTs or BASTs
+	 *    need to be fired as a result.
+	 */
+	for (i=0; i<DLM_HASH_SIZE; i++) {
+		bucket = &(dlm->resources[i]);
+		list_for_each(iter, bucket) {
+			res = list_entry (iter, struct dlm_lock_resource, list);
+			if (dlm_is_recovery_lock(res->lockname.name,
+						 res->lockname.len))
+				continue;
+			
+			spin_lock(&res->spinlock);
+			/* zero the lvb if necessary */
+			dlm_revalidate_lvb(dlm, res, dead_node);
+			if (res->owner == dead_node)
+				dlm_move_lockres_to_recovery_list(dlm, res);
+			else if (res->owner == dlm->node_num) {
+				dlm_free_dead_locks(dlm, res, dead_node);
+				__dlm_lockres_calc_usage(dlm, res);
+			}
+			spin_unlock(&res->spinlock);
+		}
+	}
+
+}
+
+static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
+{
+	assert_spin_locked(&dlm->spinlock);
+
+	/* check to see if the node is already considered dead */
+	if (!test_bit(idx, dlm->live_nodes_map)) {
+		mlog(0, "for domain %s, node %d is already dead. "
+		     "another node likely did recovery already.\n",
+		     dlm->name, idx);
+		return;
+	}
+
+	/* check to see if we do not care about this node */
+	if (!test_bit(idx, dlm->domain_map)) {
+		/* This also catches the case that we get a node down
+		 * but haven't joined the domain yet. */
+		mlog(0, "node %u already removed from domain!\n", idx);
+		return;
+	}
+
+	clear_bit(idx, dlm->live_nodes_map);
+
+	/* Clean up join state on node death. */
+	if (dlm->joining_node == idx) {
+		mlog(0, "Clearing join state for node %u\n", idx);
+		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+	}
+
+	/* make sure local cleanup occurs before the heartbeat events */
+	if (!test_bit(idx, dlm->recovery_map))
+		dlm_do_local_recovery_cleanup(dlm, idx);
+
+	/* notify anything attached to the heartbeat events */
+	dlm_hb_event_notify_attached(dlm, idx, 0);
+
+	mlog(0, "node %u being removed from domain map!\n", idx);
+	clear_bit(idx, dlm->domain_map);
+	/* wake up migration waiters if a node goes down.
+	 * perhaps later we can genericize this for other waiters. */
+	wake_up(&dlm->migration_wq);
+
+	if (test_bit(idx, dlm->recovery_map))
+		mlog(0, "domain %s, node %u already added "
+		     "to recovery map!\n", dlm->name, idx);
+	else
+		set_bit(idx, dlm->recovery_map);
+}
+
+void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+
+	if (!dlm_grab(dlm))
+		return;
+
+	spin_lock(&dlm->spinlock);
+	__dlm_hb_node_down(dlm, idx);
+	spin_unlock(&dlm->spinlock);
+
+	dlm_put(dlm);
+}
+
+void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+
+	if (!dlm_grab(dlm))
+		return;
+
+	spin_lock(&dlm->spinlock);
+
+	set_bit(idx, dlm->live_nodes_map);
+
+	/* notify any mles attached to the heartbeat events */
+	dlm_hb_event_notify_attached(dlm, idx, 1);
+
+	spin_unlock(&dlm->spinlock);
+
+	dlm_put(dlm);
+}
+
+static void dlm_reco_ast(void *astdata)
+{
+	struct dlm_ctxt *dlm = astdata;
+	mlog(0, "ast for recovery lock fired!, this=%u, dlm=%s\n",
+	     dlm->node_num, dlm->name);
+}
+static void dlm_reco_bast(void *astdata, int blocked_type)
+{
+	struct dlm_ctxt *dlm = astdata;
+	mlog(0, "bast for recovery lock fired!, this=%u, dlm=%s\n",
+	     dlm->node_num, dlm->name);
+}
+static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
+{
+	mlog(0, "unlockast for recovery lock fired!\n");
+}
+
+
+static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
+{
+	enum dlm_status ret;
+	struct dlm_lockstatus lksb;
+	int status = -EINVAL;
+
+	mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
+	     dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
+retry:
+	memset(&lksb, 0, sizeof(lksb));
+
+	ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
+		      DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast);
+
+	if (ret == DLM_NORMAL) {
+		mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
+		     dlm->name, dlm->node_num);
+		/* I am master, send message to all nodes saying
+		 * that I am beginning a recovery session */
+		status = dlm_send_begin_reco_message(dlm,
+					      dlm->reco.dead_node);
+
+		/* recovery lock is a special case.  ast will not get fired,
+		 * so just go ahead and unlock it. */
+		ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm);
+		if (ret != DLM_NORMAL) {
+			/* this would really suck. this could only happen
+			 * if there was a network error during the unlock
+			 * because of node death.  this means the unlock
+			 * is actually "done" and the lock structure is
+			 * even freed.  we can continue, but only
+			 * because this specific lock name is special. */
+			mlog(0, "dlmunlock returned %d\n", ret);
+		}
+
+		if (status < 0) {
+			mlog(0, "failed to send recovery message. "
+				   "must retry with new node map.\n");
+			goto retry;
+		}
+	} else if (ret == DLM_NOTQUEUED) {
+		mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
+		     dlm->name, dlm->node_num);
+		/* another node is master. wait on
+		 * reco.new_master != O2NM_INVALID_NODE_NUM */
+		status = -EEXIST;
+	}
+
+	return status;
+}
+
+/* Send DLM_BEGIN_RECO_MSG to every domain_map node except self and dead_node.
+ * Stops at the first failed send and returns it; otherwise returns >= 0. */
+static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct dlm_begin_reco br;
+	int ret = 0;
+	struct dlm_node_iter iter;
+	int nodenum;
+	int status;
+
+	mlog_entry("%u\n", dead_node);
+
+	mlog(0, "dead node is %u\n", dead_node);
+
+	spin_lock(&dlm->spinlock);
+	dlm_node_iter_init(dlm->domain_map, &iter);
+	spin_unlock(&dlm->spinlock);
+
+	clear_bit(dead_node, iter.node_map);
+
+	memset(&br, 0, sizeof(br));
+	br.node_idx = dlm->node_num;
+	br.dead_node = dead_node;
+
+	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+		ret = 0;
+		if (nodenum == dead_node) {
+			mlog(0, "not sending begin reco to dead node "
+				  "%u\n", dead_node);
+			continue;
+		}
+		if (nodenum == dlm->node_num) {
+			mlog(0, "not sending begin reco to self\n");
+			continue;
+		}
+
+		mlog(0, "attempting to send begin reco msg to %d\n", nodenum);
+		ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key,
+					 &br, sizeof(br), nodenum, &status);
+		/* negative status is handled ok by caller here */
+		if (ret >= 0)
+			ret = status;
+		if (ret < 0) {
+			struct dlm_lock_resource *res;
+			mlog_errno(ret);
+			mlog(ML_ERROR, "begin reco of dlm %s to node %u "
+			    " returned %d\n", dlm->name, nodenum, ret);
+			res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
+						 DLM_RECOVERY_LOCK_NAME_LEN);
+			if (res) {
+				dlm_print_one_lock_resource(res);
+				dlm_lockres_put(res);
+			} else {
+				mlog(ML_ERROR, "recovery lock not found\n");
+			}
+			break;
+		}
+	}
+
+	return ret;
+}
+
+int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf;
+
+	/* ok to return 0, domain has gone away */
+	if (!dlm_grab(dlm))
+		return 0;
+
+	mlog(0, "node %u wants to recover node %u\n",
+		  br->node_idx, br->dead_node);
+
+	dlm_fire_domain_eviction_callbacks(dlm, br->dead_node);
+
+	spin_lock(&dlm->spinlock);
+	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
+		mlog(0, "new_master already set to %u!\n",
+			  dlm->reco.new_master);
+	}
+	if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
+		mlog(0, "dead_node already set to %u!\n",
+			  dlm->reco.dead_node);
+	}
+	dlm->reco.new_master = br->node_idx;
+	dlm->reco.dead_node = br->dead_node;
+	if (!test_bit(br->dead_node, dlm->recovery_map)) {
+		mlog(ML_ERROR, "recovery master %u sees %u as dead, but this "
+		     "node has not yet.  marking %u as dead\n",
+		     br->node_idx, br->dead_node, br->dead_node);
+		__dlm_hb_node_down(dlm, br->dead_node);
+	}
+	spin_unlock(&dlm->spinlock);
+
+	dlm_kick_recovery_thread(dlm);
+	dlm_put(dlm);
+	return 0;
+}
+
+static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
+{
+	int ret = 0;
+	struct dlm_finalize_reco fr;
+	struct dlm_node_iter iter;
+	int nodenum;
+	int status;
+
+	mlog(0, "finishing recovery for node %s:%u\n",
+	     dlm->name, dlm->reco.dead_node);
+
+	spin_lock(&dlm->spinlock);
+	dlm_node_iter_init(dlm->domain_map, &iter);
+	spin_unlock(&dlm->spinlock);
+
+	memset(&fr, 0, sizeof(fr));
+	fr.node_idx = dlm->node_num;
+	fr.dead_node = dlm->reco.dead_node;
+
+	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+		if (nodenum == dlm->node_num)
+			continue;
+		ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key,
+					 &fr, sizeof(fr), nodenum, &status);
+		if (ret >= 0) {
+			ret = status;
+			if (dlm_is_host_down(ret)) {
+				/* this has no effect on this recovery 
+				 * session, so set the status to zero to 
+				 * finish out the last recovery */
+				mlog(ML_ERROR, "node %u went down after this "
+				     "node finished recovery.\n", nodenum);
+				ret = 0;
+			}
+		}
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
+
+	/* ok to return 0, domain has gone away */
+	if (!dlm_grab(dlm))
+		return 0;
+
+	mlog(0, "node %u finalizing recovery of node %u\n",
+	     fr->node_idx, fr->dead_node);
+
+	spin_lock(&dlm->spinlock);
+
+	if (dlm->reco.new_master != fr->node_idx) {
+		mlog(ML_ERROR, "node %u sent recovery finalize msg, but node "
+		     "%u is supposed to be the new master, dead=%u\n",
+		     fr->node_idx, dlm->reco.new_master, fr->dead_node);
+		BUG();
+	}
+	if (dlm->reco.dead_node != fr->dead_node) {
+		mlog(ML_ERROR, "node %u sent recovery finalize msg for dead "
+		     "node %u, but node %u is supposed to be dead\n",
+		     fr->node_idx, fr->dead_node, dlm->reco.dead_node);
+		BUG();
+	}
+
+	dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
+
+	spin_unlock(&dlm->spinlock);
+
+	dlm_reset_recovery(dlm);
+
+	dlm_kick_recovery_thread(dlm);
+	dlm_put(dlm);
+	return 0;
+}
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
new file mode 100644
index 0000000..92cd5cd
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -0,0 +1,695 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmthread.c
+ *
+ * standalone DLM module
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/timer.h>
+#include <linux/kthread.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+#include "dlmdomain.h"
+
+#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD)
+#include "cluster/masklog.h"
+
+extern spinlock_t dlm_domain_lock;
+extern struct list_head dlm_domains;
+
+static int dlm_thread(void *data);
+
+static void dlm_flush_asts(struct dlm_ctxt *dlm);
+
+#define dlm_lock_is_remote(dlm, lock)     ((lock)->ml.node != (dlm)->node_num)
+
+/* sleeps until every bit in flags is clear in res->state.  res->spinlock
+ * must be held on entry; it is dropped across schedule() and held on exit */
+void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	assert_spin_locked(&res->spinlock);
+
+	add_wait_queue(&res->wq, &wait);
+repeat:
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	if (res->state & flags) {
+		spin_unlock(&res->spinlock);
+		schedule();
+		spin_lock(&res->spinlock);
+		goto repeat;
+	}
+	remove_wait_queue(&res->wq, &wait);
+	__set_current_state(TASK_RUNNING);
+}
+
+
+/* a lockres is unused once its granted, converting and blocked queues
+ * are all empty and it is not sitting on the dirty list */
+static int __dlm_lockres_unused(struct dlm_lock_resource *res)
+{
+	return list_empty(&res->granted) &&
+	       list_empty(&res->converting) &&
+	       list_empty(&res->blocked) &&
+	       list_empty(&res->dirty);
+}
+
+
+/* Call whenever you may have added or deleted something from one of
+ * the lockres queues. This will figure out whether it belongs on the
+ * purge list or not and does the appropriate thing. */
+void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
+			      struct dlm_lock_resource *res)
+{
+	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
+
+	if (__dlm_lockres_unused(res)) {
+		if (list_empty(&res->purge)) {
+			mlog(0, "putting lockres %.*s on purge list\n",
+			     res->lockname.len, res->lockname.name);
+			/* stamp it so the purge interval is measured from now */
+			res->last_used = jiffies;
+			list_add_tail(&res->purge, &dlm->purge_list);
+			dlm->purge_count++;
+		}
+	} else if (!list_empty(&res->purge)) {
+		mlog(0, "removing lockres %.*s from purge list\n",
+		     res->lockname.len, res->lockname.name);
+
+		list_del_init(&res->purge);
+		dlm->purge_count--;
+	}
+}
+
+void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
+			    struct dlm_lock_resource *res)
+{
+	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+	spin_lock(&dlm->spinlock);
+	spin_lock(&res->spinlock);
+
+	__dlm_lockres_calc_usage(dlm, res);
+
+	spin_unlock(&res->spinlock);
+	spin_unlock(&dlm->spinlock);
+}
+
+/* TODO: Eventual API: Called with the dlm spinlock held, may drop it
+ * to do migration, but will re-acquire before exit. */
+void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres)
+{
+	int master;
+	int ret;
+
+	spin_lock(&lockres->spinlock);
+	master = lockres->owner == dlm->node_num;
+	spin_unlock(&lockres->spinlock);
+
+	mlog(0, "purging lockres %.*s, master = %d\n", lockres->lockname.len,
+	     lockres->lockname.name, master);
+
+	/* Non master is the easy case -- no migration required, just
+	 * quit. */
+	if (!master)
+		goto finish;
+
+	/* Wheee! Migrate lockres here! */
+	spin_unlock(&dlm->spinlock);
+again:
+
+	ret = dlm_migrate_lockres(dlm, lockres, O2NM_MAX_NODES);
+	if (ret == -ENOTEMPTY) {
+		mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
+		     lockres->lockname.len, lockres->lockname.name);
+
+		BUG();
+	} else if (ret < 0) {
+		mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n",
+		     lockres->lockname.len, lockres->lockname.name);
+		goto again;
+	}
+
+	spin_lock(&dlm->spinlock);
+
+finish:
+	if (!list_empty(&lockres->purge)) {
+		list_del_init(&lockres->purge);
+		dlm->purge_count--;
+	}
+	__dlm_unhash_lockres(lockres);
+}
+
+/* Walk dlm->purge_list from the head and purge unused lockreses.  Unless
+ * purge_now is set, stop at the first lockres whose purge interval has not
+ * elapsed yet.  At most purge_count (sampled on entry) entries are visited. */
+static void dlm_run_purge_list(struct dlm_ctxt *dlm,
+			       int purge_now)
+{
+	unsigned int run_max, unused;
+	unsigned long purge_jiffies;
+	struct dlm_lock_resource *lockres;
+
+	spin_lock(&dlm->spinlock);
+	run_max = dlm->purge_count;
+
+	while (run_max && !list_empty(&dlm->purge_list)) {
+		run_max--;
+
+		lockres = list_entry(dlm->purge_list.next,
+				     struct dlm_lock_resource, purge);
+
+		/* Status of the lockres *might* change so double check.
+		 * If it is unused, holding the dlm spinlock keeps anyone
+		 * from getting more refs, so drop the lockres spinlock. */
+		spin_lock(&lockres->spinlock);
+		unused = __dlm_lockres_unused(lockres);
+		spin_unlock(&lockres->spinlock);
+
+		if (!unused) {
+			/* back in use: rotate it out of the way so the
+			 * entries behind it still get looked at */
+			list_move_tail(&lockres->purge, &dlm->purge_list);
+			continue;
+		}
+
+		purge_jiffies = lockres->last_used +
+			msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
+
+		/* Entries are normally queued at the tail in last_used
+		 * order, so the first one that is too young ends the scan. */
+		if (!purge_now && time_after(purge_jiffies, jiffies))
+			break;
+
+		list_del_init(&lockres->purge);
+		dlm->purge_count--;
+
+		/* This may drop and reacquire the dlm spinlock if it
+		 * has to do migration. */
+		mlog(0, "calling dlm_purge_lockres!\n");
+		dlm_purge_lockres(dlm, lockres);
+		mlog(0, "DONE calling dlm_purge_lockres!\n");
+
+		/* Avoid adding any scheduling latencies */
+		cond_resched_lock(&dlm->spinlock);
+	}
+
+	spin_unlock(&dlm->spinlock);
+}
+
+static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
+			      struct dlm_lock_resource *res)
+{
+	struct dlm_lock *lock, *target;
+	struct list_head *iter;
+	struct list_head *head;
+	int can_grant = 1;
+
+	//mlog(0, "res->lockname.len=%d\n", res->lockname.len);
+	//mlog(0, "res->lockname.name=%p\n", res->lockname.name);
+	//mlog(0, "shuffle res %.*s\n", res->lockname.len,
+	//	  res->lockname.name);
+
+	/* because this function is called with the lockres
+	 * spinlock, and because we know that it is not migrating/
+	 * recovering/in-progress, it is fine to reserve asts and
+	 * basts right before queueing them all throughout */
+	assert_spin_locked(&res->spinlock);
+	BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
+			      DLM_LOCK_RES_RECOVERING|
+			      DLM_LOCK_RES_IN_PROGRESS)));
+
+converting:
+	if (list_empty(&res->converting))
+		goto blocked;
+	mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len,
+	     res->lockname.name);
+
+	target = list_entry(res->converting.next, struct dlm_lock, list);
+	if (target->ml.convert_type == LKM_IVMODE) {
+		mlog(ML_ERROR, "%.*s: converting a lock with no "
+		     "convert_type!\n", res->lockname.len, res->lockname.name);
+		BUG();
+	}
+	head = &res->granted;
+	list_for_each(iter, head) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock==target)
+			continue;
+		if (!dlm_lock_compatible(lock->ml.type,
+					 target->ml.convert_type)) {
+			can_grant = 0;
+			/* queue the BAST if not already */
+			if (lock->ml.highest_blocked == LKM_IVMODE) {
+				__dlm_lockres_reserve_ast(res);
+				dlm_queue_bast(dlm, lock);
+			}
+			/* update the highest_blocked if needed */
+			if (lock->ml.highest_blocked < target->ml.convert_type)
+				lock->ml.highest_blocked =
+					target->ml.convert_type;
+		}
+	}
+	head = &res->converting;
+	list_for_each(iter, head) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock==target)
+			continue;
+		if (!dlm_lock_compatible(lock->ml.type,
+					 target->ml.convert_type)) {
+			can_grant = 0;
+			if (lock->ml.highest_blocked == LKM_IVMODE) {
+				__dlm_lockres_reserve_ast(res);
+				dlm_queue_bast(dlm, lock);
+			}
+			if (lock->ml.highest_blocked < target->ml.convert_type)
+				lock->ml.highest_blocked =
+					target->ml.convert_type;
+		}
+	}
+
+	/* we can convert the lock */
+	if (can_grant) {
+		spin_lock(&target->spinlock);
+		BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
+
+		mlog(0, "calling ast for converting lock: %.*s, have: %d, "
+		     "granting: %d, node: %u\n", res->lockname.len,
+		     res->lockname.name, target->ml.type,
+		     target->ml.convert_type, target->ml.node);
+
+		target->ml.type = target->ml.convert_type;
+		target->ml.convert_type = LKM_IVMODE;
+		list_del_init(&target->list);
+		list_add_tail(&target->list, &res->granted);
+
+		BUG_ON(!target->lksb);
+		target->lksb->status = DLM_NORMAL;
+
+		spin_unlock(&target->spinlock);
+
+		__dlm_lockres_reserve_ast(res);
+		dlm_queue_ast(dlm, target);
+		/* go back and check for more */
+		goto converting;
+	}
+
+blocked:
+	if (list_empty(&res->blocked))
+		goto leave;
+	target = list_entry(res->blocked.next, struct dlm_lock, list);
+
+	head = &res->granted;
+	list_for_each(iter, head) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock==target)
+			continue;
+		if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
+			can_grant = 0;
+			if (lock->ml.highest_blocked == LKM_IVMODE) {
+				__dlm_lockres_reserve_ast(res);
+				dlm_queue_bast(dlm, lock);
+			}
+			if (lock->ml.highest_blocked < target->ml.type)
+				lock->ml.highest_blocked = target->ml.type;
+		}
+	}
+
+	head = &res->converting;
+	list_for_each(iter, head) {
+		lock = list_entry(iter, struct dlm_lock, list);
+		if (lock==target)
+			continue;
+		if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
+			can_grant = 0;
+			if (lock->ml.highest_blocked == LKM_IVMODE) {
+				__dlm_lockres_reserve_ast(res);
+				dlm_queue_bast(dlm, lock);
+			}
+			if (lock->ml.highest_blocked < target->ml.type)
+				lock->ml.highest_blocked = target->ml.type;
+		}
+	}
+
+	/* we can grant the blocked lock (only
+	 * possible if converting list empty) */
+	if (can_grant) {
+		spin_lock(&target->spinlock);
+		BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
+
+		mlog(0, "calling ast for blocked lock: %.*s, granting: %d, "
+		     "node: %u\n", res->lockname.len, res->lockname.name,
+		     target->ml.type, target->ml.node);
+
+		// target->ml.type is already correct
+		list_del_init(&target->list);
+		list_add_tail(&target->list, &res->granted);
+
+		BUG_ON(!target->lksb);
+		target->lksb->status = DLM_NORMAL;
+
+		spin_unlock(&target->spinlock);
+
+		__dlm_lockres_reserve_ast(res);
+		dlm_queue_ast(dlm, target);
+		/* go back and check for more */
+		goto converting;
+	}
+
+leave:
+	return;
+}
+
+/* caller must hold NO locks when res != NULL: both spinlocks are taken here */
+void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+{
+	mlog_entry("dlm=%p, res=%p\n", dlm, res);
+	if (res) {
+		spin_lock(&dlm->spinlock);
+		spin_lock(&res->spinlock);
+		__dlm_dirty_lockres(dlm, res);
+		spin_unlock(&res->spinlock);
+		spin_unlock(&dlm->spinlock);
+	}
+	wake_up(&dlm->dlm_thread_wq);
+}
+
+void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+{
+	mlog_entry("dlm=%p, res=%p\n", dlm, res);
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
+
+	/* don't shuffle secondary queues */
+	if ((res->owner == dlm->node_num) &&
+	    !(res->state & DLM_LOCK_RES_DIRTY)) {
+		list_add_tail(&res->dirty, &dlm->dirty_list);
+		res->state |= DLM_LOCK_RES_DIRTY;
+	}
+}
+
+
+/* Start the dlm_thread kthread for dlm.  Returns 0, or -EINVAL on failure. */
+int dlm_launch_thread(struct dlm_ctxt *dlm)
+{
+	mlog(0, "starting dlm thread...\n");
+
+	dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
+	if (IS_ERR(dlm->dlm_thread_task)) {
+		mlog_errno(PTR_ERR(dlm->dlm_thread_task));
+		dlm->dlm_thread_task = NULL;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+void dlm_complete_thread(struct dlm_ctxt *dlm)
+{
+	if (dlm->dlm_thread_task) {
+		mlog(ML_KTHREAD, "waiting for dlm thread to exit\n");
+		kthread_stop(dlm->dlm_thread_task);
+		dlm->dlm_thread_task = NULL;
+	}
+}
+
+static int dlm_dirty_list_empty(struct dlm_ctxt *dlm)
+{
+	int empty;
+
+	spin_lock(&dlm->spinlock);
+	empty = list_empty(&dlm->dirty_list);
+	spin_unlock(&dlm->spinlock);
+
+	return empty;
+}
+
+/* Deliver every queued ast, then every queued bast.  dlm->ast_lock is
+ * dropped around each delivery, so more work may be queued meanwhile. */
+static void dlm_flush_asts(struct dlm_ctxt *dlm)
+{
+	int ret;
+	struct dlm_lock *lock;
+	struct dlm_lock_resource *res;
+	u8 hi;
+
+	spin_lock(&dlm->ast_lock);
+	while (!list_empty(&dlm->pending_asts)) {
+		lock = list_entry(dlm->pending_asts.next,
+				  struct dlm_lock, ast_list);
+		/* get an extra ref on lock */
+		dlm_lock_get(lock);
+		res = lock->lockres;
+		mlog(0, "delivering an ast for this lockres\n");
+
+		BUG_ON(!lock->ast_pending);
+
+		/* remove from list (including ref) */
+		list_del_init(&lock->ast_list);
+		dlm_lock_put(lock);
+		spin_unlock(&dlm->ast_lock);
+
+		if (lock->ml.node != dlm->node_num) {
+			ret = dlm_do_remote_ast(dlm, res, lock);
+			if (ret < 0)
+				mlog_errno(ret);
+		} else
+			dlm_do_local_ast(dlm, res, lock);
+
+		spin_lock(&dlm->ast_lock);
+
+		/* possible that another ast was queued while
+		 * we were delivering the last one */
+		if (!list_empty(&lock->ast_list)) {
+			mlog(0, "aha another ast got queued while "
+			     "we were finishing the last one.  will "
+			     "keep the ast_pending flag set.\n");
+		} else
+			lock->ast_pending = 0;
+
+		/* drop the extra ref; this may drop it completely. */
+		dlm_lock_put(lock);
+		dlm_lockres_release_ast(dlm, res);
+	}
+
+	while (!list_empty(&dlm->pending_basts)) {
+		lock = list_entry(dlm->pending_basts.next,
+				  struct dlm_lock, bast_list);
+		/* get an extra ref on lock */
+		dlm_lock_get(lock);
+		res = lock->lockres;
+
+		BUG_ON(!lock->bast_pending);
+
+		/* get the highest blocked lock, and reset */
+		spin_lock(&lock->spinlock);
+		BUG_ON(lock->ml.highest_blocked <= LKM_IVMODE);
+		hi = lock->ml.highest_blocked;
+		lock->ml.highest_blocked = LKM_IVMODE;
+		spin_unlock(&lock->spinlock);
+
+		/* remove from list (including ref) */
+		list_del_init(&lock->bast_list);
+		dlm_lock_put(lock);
+		spin_unlock(&dlm->ast_lock);
+
+		mlog(0, "delivering a bast for this lockres "
+		     "(blocked = %d)\n", hi);
+
+		if (lock->ml.node != dlm->node_num) {
+			ret = dlm_send_proxy_bast(dlm, res, lock, hi);
+			if (ret < 0)
+				mlog_errno(ret);
+		} else
+			dlm_do_local_bast(dlm, res, lock, hi);
+
+		spin_lock(&dlm->ast_lock);
+
+		/* possible that another bast was queued while
+		 * we were delivering the last one */
+		if (!list_empty(&lock->bast_list)) {
+			mlog(0, "aha another bast got queued while "
+			     "we were finishing the last one.  will "
+			     "keep the bast_pending flag set.\n");
+		} else
+			lock->bast_pending = 0;
+
+		/* drop the extra ref; this may drop it completely. */
+		dlm_lock_put(lock);
+		dlm_lockres_release_ast(dlm, res);
+	}
+	wake_up(&dlm->ast_wq);
+	spin_unlock(&dlm->ast_lock);
+}
+
+
+#define DLM_THREAD_TIMEOUT_MS (4 * 1000)
+#define DLM_THREAD_MAX_DIRTY  100
+#define DLM_THREAD_MAX_ASTS   10
+
+/* Main loop of the dlm thread: run the purge list, shuffle the queues of
+ * up to DLM_THREAD_MAX_DIRTY dirty lockreses, flush asts/basts, then sleep
+ * until the dirty list is non-empty, the timeout expires or we are stopped. */
+static int dlm_thread(void *data)
+{
+	struct dlm_lock_resource *res;
+	struct dlm_ctxt *dlm = data;
+	unsigned long timeout = msecs_to_jiffies(DLM_THREAD_TIMEOUT_MS);
+
+	mlog(0, "dlm thread running for %s...\n", dlm->name);
+
+	while (!kthread_should_stop()) {
+		int n = DLM_THREAD_MAX_DIRTY;
+
+		/* dlm_shutting_down is very point-in-time, but that doesn't
+		 * matter as we'll just loop back around if we get false on
+		 * the leading edge of a state transition. */
+		dlm_run_purge_list(dlm, dlm_shutting_down(dlm));
+
+		/* We really don't want to hold dlm->spinlock while calling
+		 * dlm_shuffle_lists on each lockres that needs to have its
+		 * queues adjusted and AST/BASTs run.  So let's pull each
+		 * entry off the dirty_list and drop dlm->spinlock ASAP.
+		 * Once off the list, res->spinlock needs to be taken again
+		 * to protect the queues while calling dlm_shuffle_lists.  */
+		spin_lock(&dlm->spinlock);
+		while (!list_empty(&dlm->dirty_list)) {
+			int delay = 0;
+			res = list_entry(dlm->dirty_list.next,
+					 struct dlm_lock_resource, dirty);
+
+			/* peel a lockres off, remove it from the list,
+			 * unset the dirty flag and drop the dlm lock */
+			BUG_ON(!res);
+			dlm_lockres_get(res);
+
+			spin_lock(&res->spinlock);
+			res->state &= ~DLM_LOCK_RES_DIRTY;
+			list_del_init(&res->dirty);
+			spin_unlock(&res->spinlock);
+			spin_unlock(&dlm->spinlock);
+
+			/* lockres can be re-dirtied/re-added to the
+			 * dirty_list in this gap, but that is ok */
+
+			spin_lock(&res->spinlock);
+			if (res->owner != dlm->node_num) {
+				__dlm_print_one_lock_resource(res);
+				mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n",
+				     res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no",
+				     res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no",
+				     res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no",
+				     res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
+			}
+			BUG_ON(res->owner != dlm->node_num);
+
+			/* it is now ok to move lockreses in these states
+			 * to the dirty list, assuming that they will only be
+			 * dirty for a short while. */
+			if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
+					  DLM_LOCK_RES_MIGRATING |
+					  DLM_LOCK_RES_RECOVERING)) {
+				/* move it to the tail and keep going */
+				spin_unlock(&res->spinlock);
+				mlog(0, "delaying list shuffling for in-"
+				     "progress lockres %.*s, state=%d\n",
+				     res->lockname.len, res->lockname.name,
+				     res->state);
+				delay = 1;
+				goto in_progress;
+			}
+
+			/* at this point the lockres is not migrating/
+			 * recovering/in-progress.  we have the lockres
+			 * spinlock and do NOT have the dlm lock.
+			 * safe to reserve/queue asts and run the lists. */
+
+			mlog(0, "calling dlm_shuffle_lists with dlm=%p, "
+			     "res=%p\n", dlm, res);
+
+			/* called while holding lockres lock */
+			dlm_shuffle_lists(dlm, res);
+			spin_unlock(&res->spinlock);
+
+			dlm_lockres_calc_usage(dlm, res);
+
+in_progress:
+
+			spin_lock(&dlm->spinlock);
+			/* if the lock was in-progress, stick it on the back
+			 * of the list, unless it got re-dirtied in the gap */
+			if (delay) {
+				spin_lock(&res->spinlock);
+				__dlm_dirty_lockres(dlm, res);
+				spin_unlock(&res->spinlock);
+			}
+			dlm_lockres_put(res);
+
+			/* unlikely, but we may need to give time to
+			 * other tasks */
+			if (!--n) {
+				mlog(0, "throttling dlm_thread\n");
+				break;
+			}
+		}
+
+		spin_unlock(&dlm->spinlock);
+		dlm_flush_asts(dlm);
+
+		/* yield and continue right away if there is more work to do */
+		if (!n) {
+			yield();
+			continue;
+		}
+
+		wait_event_interruptible_timeout(dlm->dlm_thread_wq,
+						 !dlm_dirty_list_empty(dlm) ||
+						 kthread_should_stop(),
+						 timeout);
+	}
+
+	mlog(0, "quitting DLM thread\n");
+	return 0;
+}
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
new file mode 100644
index 0000000..cec2ce1
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -0,0 +1,672 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmunlock.c
+ *
+ * underlying calls for unlocking locks
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+
+#define MLOG_MASK_PREFIX ML_DLM
+#include "cluster/masklog.h"
+
+#define DLM_UNLOCK_FREE_LOCK           0x00000001
+#define DLM_UNLOCK_CALL_AST            0x00000002
+#define DLM_UNLOCK_REMOVE_LOCK         0x00000004
+#define DLM_UNLOCK_REGRANT_LOCK        0x00000008
+#define DLM_UNLOCK_CLEAR_CONVERT_TYPE  0x00000010
+
+
+static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
+					      struct dlm_lock_resource *res,
+					      struct dlm_lock *lock,
+					      struct dlm_lockstatus *lksb,
+					      int *actions);
+static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
+					      struct dlm_lock_resource *res,
+					      struct dlm_lock *lock,
+					      struct dlm_lockstatus *lksb,
+					      int *actions);
+
+static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
+						 struct dlm_lock_resource *res,
+						 struct dlm_lock *lock,
+						 struct dlm_lockstatus *lksb,
+						 int flags,
+						 u8 owner);
+
+
+/*
+ * according to the spec:
+ * http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf
+ *
+ *  flags & LKM_CANCEL != 0: must be converting or blocked
+ *  flags & LKM_CANCEL == 0: must be granted
+ *
+ * So to unlock a converting lock, you must first cancel the
+ * convert (passing LKM_CANCEL in flags), then call the unlock
+ * again (with no LKM_CANCEL in flags).
+ */
+
+
+/*
+ * cancel or unlock 'lock'; master_node != 0 means this node owns 'res'.
+ * locking: caller needs none; dlm->spinlock, res->spinlock and
+ *          lock->spinlock are taken and dropped; none held on exit
+ * returns: DLM_NORMAL, DLM_BADPARAM, DLM_FORWARD, DLM_RECOVERING, or the
+ *          status from the action helpers / network
+ * all callers should have taken an extra ref on lock coming in
+ */
+static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
+					struct dlm_lock_resource *res,
+					struct dlm_lock *lock,
+					struct dlm_lockstatus *lksb,
+					int flags, int *call_ast,
+					int master_node)
+{
+	enum dlm_status status;
+	int actions = 0;
+	int in_use;
+	u8 owner;
+
+	mlog(0, "master_node = %d, valblk = %d\n", master_node,
+	     flags & LKM_VALBLK);
+
+	if (master_node)
+		BUG_ON(res->owner != dlm->node_num);
+	else
+		BUG_ON(res->owner == dlm->node_num);
+
+	spin_lock(&dlm->spinlock);
+	/* We want to be sure that we're not freeing a lock
+	 * that still has AST's pending... */
+	in_use = !list_empty(&lock->ast_list);
+	spin_unlock(&dlm->spinlock);
+	if (in_use) {
+		mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
+		     "while waiting for an ast!\n", res->lockname.len,
+		     res->lockname.name);
+		return DLM_BADPARAM;
+	}
+
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
+		if (master_node) {
+			mlog(ML_ERROR, "lockres in progress!\n");
+			spin_unlock(&res->spinlock);
+			return DLM_FORWARD;
+		}
+		/* ok for this to sleep if not in a network handler */
+		__dlm_wait_on_lockres(res);
+		res->state |= DLM_LOCK_RES_IN_PROGRESS;
+	}
+	spin_lock(&lock->spinlock);
+
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		status = DLM_RECOVERING;
+		goto leave;
+	}
+
+
+	/* see above for what the spec says about
+	 * LKM_CANCEL and the lock queue state */
+	if (flags & LKM_CANCEL)
+		status = dlm_get_cancel_actions(dlm, res, lock, lksb, &actions);
+	else
+		status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions);
+
+	if (status != DLM_NORMAL)
+		goto leave;
+
+	/* By now this has been masked out of cancel requests. */
+	if (flags & LKM_VALBLK) {
+		/* make the final update to the lvb */
+		if (master_node)
+			memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
+		else
+			flags |= LKM_PUT_LVB; /* let the send function
+					       * handle it. */
+	}
+
+	if (!master_node) {
+		owner = res->owner;
+		/* drop locks and send message */
+		if (flags & LKM_CANCEL)
+			lock->cancel_pending = 1;
+		else
+			lock->unlock_pending = 1;
+		spin_unlock(&lock->spinlock);
+		spin_unlock(&res->spinlock);
+		status = dlm_send_remote_unlock_request(dlm, res, lock, lksb,
+							flags, owner);
+		spin_lock(&res->spinlock);
+		spin_lock(&lock->spinlock);
+		/* if the master told us the lock was already granted,
+		 * let the ast handle all of these actions */
+		if (status == DLM_NORMAL &&
+		    lksb->status == DLM_CANCELGRANT) {
+			actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
+				     DLM_UNLOCK_REGRANT_LOCK|
+				     DLM_UNLOCK_CLEAR_CONVERT_TYPE);
+		}
+		if (flags & LKM_CANCEL)
+			lock->cancel_pending = 0;
+		else
+			lock->unlock_pending = 0;
+
+	}
+
+	/* get an extra ref on lock.  if we are just switching
+	 * lists here, we dont want the lock to go away. */
+	dlm_lock_get(lock);
+
+	if (actions & DLM_UNLOCK_REMOVE_LOCK) {
+		list_del_init(&lock->list);
+		dlm_lock_put(lock);
+	}
+	if (actions & DLM_UNLOCK_REGRANT_LOCK) {
+		dlm_lock_get(lock);
+		list_add_tail(&lock->list, &res->granted);
+	}
+	if (actions & DLM_UNLOCK_CLEAR_CONVERT_TYPE) {
+		mlog(0, "clearing convert_type at %smaster node\n",
+		     master_node ? "" : "non-");
+		lock->ml.convert_type = LKM_IVMODE;
+	}
+
+	/* remove the extra ref on lock */
+	dlm_lock_put(lock);
+
+leave:
+	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+	if (!dlm_lock_on_list(&res->converting, lock))
+		BUG_ON(lock->ml.convert_type != LKM_IVMODE);
+	else
+		BUG_ON(lock->ml.convert_type == LKM_IVMODE);
+	spin_unlock(&lock->spinlock);
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+
+	/* let the caller's final dlm_lock_put handle the actual kfree */
+	if (actions & DLM_UNLOCK_FREE_LOCK) {
+		/* this should always be coupled with list removal */
+		BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK));
+		mlog(0, "lock %"MLFu64" should be gone now! refs=%d\n",
+		     lock->ml.cookie, atomic_read(&lock->lock_refs.refcount)-1);
+		dlm_lock_put(lock);
+	}
+	if (actions & DLM_UNLOCK_CALL_AST)
+		*call_ast = 1;
+
+	/* if cancel or unlock succeeded, lvb work is done */
+	if (status == DLM_NORMAL)
+		lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
+
+	return status;
+}
+
+void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
+			       struct dlm_lock *lock)
+{
+	/* leave DLM_LKSB_PUT_LVB on the lksb so any final
+	 * update of the lvb will be sent to the new master */
+	list_del_init(&lock->list);
+}
+
+void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
+			       struct dlm_lock *lock)
+{
+	list_del_init(&lock->list);
+	list_add_tail(&lock->list, &res->granted);
+	lock->ml.convert_type = LKM_IVMODE;
+}
+
+
+static inline enum dlm_status dlmunlock_master(struct dlm_ctxt *dlm,
+					  struct dlm_lock_resource *res,
+					  struct dlm_lock *lock,
+					  struct dlm_lockstatus *lksb,
+					  int flags,
+					  int *call_ast)
+{
+	return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 1);
+}
+
+static inline enum dlm_status dlmunlock_remote(struct dlm_ctxt *dlm,
+					  struct dlm_lock_resource *res,
+					  struct dlm_lock *lock,
+					  struct dlm_lockstatus *lksb,
+					  int flags, int *call_ast)
+{
+	return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 0);
+}
+
+/*
+ * sends a DLM_UNLOCK_LOCK_MSG for 'lock' to node 'owner', appending the
+ * lvb when LKM_PUT_LVB is set, and records the outcome in lksb->status.
+ * locking: caller needs none; none taken; none held on exit
+ * returns: DLM_NORMAL (also for a DLM_CANCELGRANT reply and when
+ *          dlm_is_host_down() is true), DLM_FORWARD, other network status
+ */
+static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
+						 struct dlm_lock_resource *res,
+						 struct dlm_lock *lock,
+						 struct dlm_lockstatus *lksb,
+						 int flags,
+						 u8 owner)
+{
+	struct dlm_unlock_lock unlock;
+	int tmpret;
+	enum dlm_status ret;
+	int status = 0;
+	struct kvec vec[2];
+	size_t veclen = 1;
+
+	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+
+	memset(&unlock, 0, sizeof(unlock));
+	unlock.node_idx = dlm->node_num;
+	unlock.flags = cpu_to_be32(flags);
+	unlock.cookie = lock->ml.cookie;
+	unlock.namelen = res->lockname.len;
+	memcpy(unlock.name, res->lockname.name, unlock.namelen);
+
+	vec[0].iov_len = sizeof(struct dlm_unlock_lock);
+	vec[0].iov_base = &unlock;
+
+	if (flags & LKM_PUT_LVB) {
+		/* extra data to send if we are updating lvb */
+		vec[1].iov_len = DLM_LVB_LEN;
+		vec[1].iov_base = lock->lksb->lvb;
+		veclen++;
+	}
+
+	tmpret = o2net_send_message_vec(DLM_UNLOCK_LOCK_MSG, dlm->key,
+					vec, veclen, owner, &status);
+	if (tmpret >= 0) {
+		/* successfully sent and received */
+		if (status == DLM_CANCELGRANT)
+			ret = DLM_NORMAL;
+		else if (status == DLM_FORWARD) {
+			mlog(0, "master was in-progress.  retry\n");
+			ret = DLM_FORWARD;
+		} else
+			ret = status;
+		lksb->status = status;
+	} else {
+		mlog_errno(tmpret);
+		if (dlm_is_host_down(tmpret)) {
+			/* NOTE: this seems strange, but it is what we want.
+			 * when the master goes down during a cancel or
+			 * unlock, the recovery code completes the operation
+			 * as if the master had not died, then passes the
+			 * updated state to the recovery master.  this thread
+			 * just needs to finish out the operation and call
+			 * the unlockast. */
+			ret = DLM_NORMAL;
+		} else {
+			/* something bad.  this will BUG in ocfs2 */
+			ret = dlm_err_to_dlm_status(tmpret);
+		}
+		lksb->status = ret;
+	}
+
+	return ret;
+}
+
+/*
+ * processes a dlm_unlock_lock message (unlock or cancel) for a lockres
+ * owned by this node; replies DLM_FORWARD when this node is not the owner.
+ * locking: caller needs none; takes and drops res->spinlock; none held
+ * returns: DLM_BADARGS, DLM_IVBUFLEN, DLM_REJECTED, DLM_FORWARD,
+ *          DLM_RECOVERING, DLM_MIGRATING, DLM_IVLOCKID, else the result of
+ *          dlmunlock_master (lksb->status when that returned DLM_NORMAL)
+ */
+int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	struct list_head *iter;
+	struct dlm_lock *lock = NULL;
+	enum dlm_status status = DLM_NORMAL;
+	int found = 0, i;
+	struct dlm_lockstatus *lksb = NULL;
+	int ignore;
+	u32 flags;
+	struct list_head *queue;
+
+	flags = be32_to_cpu(unlock->flags);
+
+	if (flags & LKM_GET_LVB) {
+		mlog(ML_ERROR, "bad args!  GET_LVB specified on unlock!\n");
+		return DLM_BADARGS;
+	}
+
+	if ((flags & (LKM_PUT_LVB|LKM_CANCEL)) == (LKM_PUT_LVB|LKM_CANCEL)) {
+		mlog(ML_ERROR, "bad args!  cannot modify lvb on a CANCEL "
+		     "request!\n");
+		return DLM_BADARGS;
+	}
+
+	if (unlock->namelen > DLM_LOCKID_NAME_MAX) {
+		mlog(ML_ERROR, "Invalid name length in unlock handler!\n");
+		return DLM_IVBUFLEN;
+	}
+
+	if (!dlm_grab(dlm))
+		return DLM_REJECTED;
+
+	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
+			"Domain %s not fully joined!\n", dlm->name);
+
+	mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : "none");
+
+	res = dlm_lookup_lockres(dlm, unlock->name, unlock->namelen);
+	if (!res) {
+		/* We assume here that a no lock resource simply means
+		 * it was migrated away and destroyed before the other
+		 * node could detect it. */
+		mlog(0, "returning DLM_FORWARD -- res no longer exists\n");
+		status = DLM_FORWARD;
+		goto not_found;
+	}
+
+	queue=&res->granted;
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_RECOVERING) {
+		spin_unlock(&res->spinlock);
+		mlog(0, "returning DLM_RECOVERING\n");
+		status = DLM_RECOVERING;
+		goto leave;
+	}
+
+	if (res->state & DLM_LOCK_RES_MIGRATING) {
+		spin_unlock(&res->spinlock);
+		mlog(0, "returning DLM_MIGRATING\n");
+		status = DLM_MIGRATING;
+		goto leave;
+	}
+
+	if (res->owner != dlm->node_num) {
+		spin_unlock(&res->spinlock);
+		mlog(0, "returning DLM_FORWARD -- not master\n");
+		status = DLM_FORWARD;
+		goto leave;
+	}
+
+	for (i=0; i<3; i++) {
+		list_for_each(iter, queue) {
+			lock = list_entry(iter, struct dlm_lock, list);
+			if (lock->ml.cookie == unlock->cookie &&
+			    lock->ml.node == unlock->node_idx) {
+				dlm_lock_get(lock);
+				found = 1;
+				break;
+			}
+		}
+		if (found)
+			break;
+		/* scan granted -> converting -> blocked queues */
+		queue++;
+	}
+	spin_unlock(&res->spinlock);
+	if (!found) {
+		status = DLM_IVLOCKID;
+		goto not_found;
+	}
+
+	/* lock was found on queue */
+	lksb = lock->lksb;
+	/* unlockast only called on originating node */
+	if (flags & LKM_PUT_LVB) {
+		lksb->flags |= DLM_LKSB_PUT_LVB;
+		memcpy(&lksb->lvb[0], &unlock->lvb[0], DLM_LVB_LEN);
+	}
+
+	/* if this is in-progress, propagate the DLM_FORWARD
+	 * all the way back out */
+	status = dlmunlock_master(dlm, res, lock, lksb, flags, &ignore);
+	if (status == DLM_FORWARD)
+		mlog(0, "lockres is in progress\n");
+
+	if (flags & LKM_PUT_LVB)
+		lksb->flags &= ~DLM_LKSB_PUT_LVB;
+
+	dlm_lockres_calc_usage(dlm, res);
+	dlm_kick_thread(dlm, res);
+
+not_found:
+	if (!found)
+		mlog(ML_ERROR, "failed to find lock to unlock! "
+			       "cookie=%"MLFu64"\n",
+		     unlock->cookie);
+	else {
+		/* lksb->status is only meaningful if the unlock itself ran */
+		if (status == DLM_NORMAL)
+			status = lksb->status;
+		dlm_lock_put(lock);
+	}
+
+leave:
+	if (res)
+		dlm_lockres_put(res);
+
+	dlm_put(dlm);
+
+	return status;
+}
+
+
+static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
+					      struct dlm_lock_resource *res,
+					      struct dlm_lock *lock,
+					      struct dlm_lockstatus *lksb,
+					      int *actions)
+{
+	enum dlm_status status;
+
+	if (dlm_lock_on_list(&res->blocked, lock)) {
+		/* cancel this outright */
+		lksb->status = DLM_NORMAL;
+		status = DLM_NORMAL;
+		*actions = (DLM_UNLOCK_CALL_AST |
+			    DLM_UNLOCK_REMOVE_LOCK);
+	} else if (dlm_lock_on_list(&res->converting, lock)) {
+		/* cancel the request, put back on granted */
+		lksb->status = DLM_NORMAL;
+		status = DLM_NORMAL;
+		*actions = (DLM_UNLOCK_CALL_AST |
+			    DLM_UNLOCK_REMOVE_LOCK |
+			    DLM_UNLOCK_REGRANT_LOCK |
+			    DLM_UNLOCK_CLEAR_CONVERT_TYPE);
+	} else if (dlm_lock_on_list(&res->granted, lock)) {
+		/* too late, already granted.  DLM_CANCELGRANT */
+		lksb->status = DLM_CANCELGRANT;
+		status = DLM_NORMAL;
+		*actions = DLM_UNLOCK_CALL_AST;
+	} else {
+		mlog(ML_ERROR, "lock to cancel is not on any list!\n");
+		lksb->status = DLM_IVLOCKID;
+		status = DLM_IVLOCKID;
+		*actions = 0;
+	}
+	return status;
+}
+
+static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
+					      struct dlm_lock_resource *res,
+					      struct dlm_lock *lock,
+					      struct dlm_lockstatus *lksb,
+					      int *actions)
+{
+	enum dlm_status status;
+
+	/* unlock request */
+	if (!dlm_lock_on_list(&res->granted, lock)) {
+		lksb->status = DLM_DENIED;
+		status = DLM_DENIED;
+		dlm_error(status);
+		*actions = 0;
+	} else {
+		/* unlock granted lock */
+		lksb->status = DLM_NORMAL;
+		status = DLM_NORMAL;
+		*actions = (DLM_UNLOCK_FREE_LOCK |
+			    DLM_UNLOCK_CALL_AST |
+			    DLM_UNLOCK_REMOVE_LOCK);
+	}
+	return status;
+}
+
+/* there seems to be no point in doing this async
+ * since (even for the remote case) there is really
+ * no work to queue up... so just do it and fire the
+ * unlockast by hand when done... */
+enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb,
+			  int flags, dlm_astunlockfunc_t *unlockast, void *data)
+{
+	enum dlm_status status;
+	struct dlm_lock_resource *res;
+	struct dlm_lock *lock = NULL;
+	int call_ast, is_master;
+
+	mlog_entry_void();
+
+	if (!lksb) {
+		dlm_error(DLM_BADARGS);
+		return DLM_BADARGS;
+	}
+
+	if (flags & ~(LKM_CANCEL | LKM_VALBLK | LKM_INVVALBLK)) {
+		dlm_error(DLM_BADPARAM);
+		return DLM_BADPARAM;
+	}
+
+	if ((flags & (LKM_VALBLK | LKM_CANCEL)) == (LKM_VALBLK | LKM_CANCEL)) {
+		mlog(0, "VALBLK given with CANCEL: ignoring VALBLK\n");
+		flags &= ~LKM_VALBLK;
+	}
+
+	if (!lksb->lockid || !lksb->lockid->lockres) {
+		dlm_error(DLM_BADPARAM);
+		return DLM_BADPARAM;
+	}
+
+	lock = lksb->lockid;
+	BUG_ON(!lock);
+	dlm_lock_get(lock);
+
+	res = lock->lockres;
+	BUG_ON(!res);
+	dlm_lockres_get(res);
+retry:
+	call_ast = 0;
+	/* need to retry up here because owner may have changed */
+	mlog(0, "lock=%p res=%p\n", lock, res);
+
+	spin_lock(&res->spinlock);
+	is_master = (res->owner == dlm->node_num);
+	spin_unlock(&res->spinlock);
+
+	if (is_master) {
+		status = dlmunlock_master(dlm, res, lock, lksb, flags,
+					  &call_ast);
+		mlog(0, "done calling dlmunlock_master: returned %d, "
+		     "call_ast is %d\n", status, call_ast);
+	} else {
+		status = dlmunlock_remote(dlm, res, lock, lksb, flags,
+					  &call_ast);
+		mlog(0, "done calling dlmunlock_remote: returned %d, "
+		     "call_ast is %d\n", status, call_ast);
+	}
+
+	if (status == DLM_RECOVERING ||
+	    status == DLM_MIGRATING ||
+	    status == DLM_FORWARD) {
+		/* We want to go away for a tiny bit to allow recovery
+		 * / migration to complete on this resource. I don't
+		 * know of any wait queue we could sleep on as this
+		 * may be happening on another node. Perhaps the
+		 * proper solution is to queue up requests on the
+		 * other end? */
+
+		/* do we want to yield(); ?? */
+		msleep(50);
+
+		mlog(0, "retrying unlock due to pending recovery/"
+		     "migration/in-progress\n");
+		goto retry;
+	}
+
+	if (call_ast) {
+		mlog(0, "calling unlockast(%p, %d)\n", data, lksb->status);
+		if (is_master) {
+			/* it is possible that there is one last bast 
+			 * pending.  make sure it is flushed, then
+			 * call the unlockast.
+			 * not an issue if this is a mastered remotely,
+			 * since this lock has been removed from the
+			 * lockres queues and cannot be found. */
+			dlm_kick_thread(dlm, NULL);
+			wait_event(dlm->ast_wq, 
+				   dlm_lock_basts_flushed(dlm, lock));
+		}
+		(*unlockast)(data, lksb->status);
+	}
+
+	if (status == DLM_NORMAL) {
+		mlog(0, "kicking the thread\n");
+		dlm_kick_thread(dlm, res);
+	} else
+		dlm_error(status);
+
+	dlm_lockres_calc_usage(dlm, res);
+	dlm_lockres_put(res);
+	dlm_lock_put(lock);
+
+	mlog(0, "returning status=%d!\n", status);
+	return status;
+}
+EXPORT_SYMBOL_GPL(dlmunlock);
+
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
new file mode 100644
index 0000000..7ef2653
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmver.c
@@ -0,0 +1,42 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmver.c
+ *
+ * version string
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include "dlmver.h"
+
+#define DLM_BUILD_VERSION "1.3.3"
+
+#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
+
+void dlm_print_version(void)
+{
+	printk(KERN_INFO "%s\n", VERSION_STR);
+}
+
+MODULE_DESCRIPTION(VERSION_STR);
+
+MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h
new file mode 100644
index 0000000..f674aee
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmver.h
@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmver.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef DLM_VER_H
+#define DLM_VER_H
+
+void dlm_print_version(void);
+
+#endif /* DLM_VER_H */
-- 
cgit v1.1


From 8df08c89c668e1bd922a053fdb5ba1fadbecbb38 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 15 Dec 2005 14:31:23 -0800
Subject: [PATCH] OCFS2: The Second Oracle Cluster Filesystem

dlmfs: A minimal dlm userspace interface implemented via a virtual
file system.
Most of the OCFS2 tools make use of this to take cluster locks when
doing operations on the file system.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
---
 Documentation/filesystems/00-INDEX  |   2 +
 Documentation/filesystems/dlmfs.txt | 130 +++++++
 fs/ocfs2/dlm/Makefile               |   4 +-
 fs/ocfs2/dlm/dlmfs.c                | 640 +++++++++++++++++++++++++++++++++++
 fs/ocfs2/dlm/dlmfsver.c             |  42 +++
 fs/ocfs2/dlm/dlmfsver.h             |  31 ++
 fs/ocfs2/dlm/userdlm.c              | 658 ++++++++++++++++++++++++++++++++++++
 fs/ocfs2/dlm/userdlm.h              | 111 ++++++
 8 files changed, 1617 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/filesystems/dlmfs.txt
 create mode 100644 fs/ocfs2/dlm/dlmfs.c
 create mode 100644 fs/ocfs2/dlm/dlmfsver.c
 create mode 100644 fs/ocfs2/dlm/dlmfsver.h
 create mode 100644 fs/ocfs2/dlm/userdlm.c
 create mode 100644 fs/ocfs2/dlm/userdlm.h

diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 628f8a7..d9b0a06 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -18,6 +18,8 @@ cramfs.txt
 	- info on the cram filesystem for small storage (ROMs etc)
 devfs/
 	- directory containing devfs documentation.
+dlmfs.txt
+	- info on the userspace interface to the OCFS2 DLM.
 ext2.txt
 	- info, mount options and specifications for the Ext2 filesystem.
 fat_cvf.txt
diff --git a/Documentation/filesystems/dlmfs.txt b/Documentation/filesystems/dlmfs.txt
new file mode 100644
index 0000000..9afab84
--- /dev/null
+++ b/Documentation/filesystems/dlmfs.txt
@@ -0,0 +1,130 @@
+dlmfs
+==================
+A minimal DLM userspace interface implemented via a virtual file
+system.
+
+dlmfs is built with OCFS2 as it requires most of its infrastructure.
+
+Project web page:    http://oss.oracle.com/projects/ocfs2
+Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+All code copyright 2005 Oracle except when otherwise noted.
+
+CREDITS
+=======
+
+Some code taken from ramfs which is Copyright (C) 2000 Linus Torvalds
+and Transmeta Corp.
+
+Mark Fasheh <mark.fasheh@oracle.com>
+
+Caveats
+=======
+- Right now it only works with the OCFS2 DLM, though support for other
+  DLM implementations should not be a major issue.
+
+Mount options
+=============
+None
+
+Usage
+=====
+
+If you're just interested in OCFS2, then please see ocfs2.txt. The
+rest of this document will be geared towards those who want to use
+dlmfs for easy to set up and easy to use clustered locking in
+userspace.
+
+Setup
+=====
+
+dlmfs requires that the OCFS2 cluster infrastructure be in
+place. Please download ocfs2-tools from the above url and configure a
+cluster.
+
+You'll want to start heartbeating on a volume which all the nodes in
+your lockspace can access. The easiest way to do this is via
+ocfs2_hb_ctl (distributed with ocfs2-tools). Right now it requires
+that an OCFS2 file system be in place so that it can automatically
+find its heartbeat area, though it will eventually support heartbeat
+against raw disks.
+
+Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed
+with ocfs2-tools.
+
+Once you're heartbeating, DLM lock 'domains' can be easily created /
+destroyed and locks within them accessed.
+
+Locking
+=======
+
+Users may access dlmfs via standard file system calls, or they can use
+'libo2dlm' (distributed with ocfs2-tools) which abstracts the file
+system calls and presents a more traditional locking api.
+
+dlmfs handles lock caching automatically for the user, so a lock
+request for an already acquired lock will not generate another DLM
+call. Userspace programs are assumed to handle their own local
+locking.
+
+Two levels of locks are supported - Shared Read, and Exclusive.
+Also supported is a Trylock operation.
+
+For information on the libo2dlm interface, please see o2dlm.h,
+distributed with ocfs2-tools.
+
+Lock value blocks can be read and written to a resource via read(2)
+and write(2) against the fd obtained via your open(2) call. The
+maximum currently supported LVB length is 64 bytes (though that is an
+OCFS2 DLM limitation). Through this mechanism, users of dlmfs can share
+small amounts of data amongst their nodes.
+
+mkdir(2) signals dlmfs to join a domain (which will have the same name
+as the resulting directory)
+
+rmdir(2) signals dlmfs to leave the domain
+
+Locks for a given domain are represented by regular inodes inside the
+domain directory.  Locking against them is done via the open(2) system
+call.
+
+The open(2) call will not return until your lock has been granted or
+an error has occurred, unless it has been instructed to do a trylock
+operation. If the lock succeeds, you'll get an fd.
+
+open(2) with O_CREAT to ensure the resource inode is created - dlmfs does
+not automatically create inodes for existing lock resources.
+
+Open Flag     Lock Request Type
+---------     -----------------
+O_RDONLY      Shared Read
+O_RDWR        Exclusive
+
+Open Flag     Resulting Locking Behavior
+---------     --------------------------
+O_NONBLOCK    Trylock operation
+
+You must provide exactly one of O_RDONLY or O_RDWR.
+
+If O_NONBLOCK is also provided and the trylock operation was valid but
+could not lock the resource then open(2) will return ETXTBSY.
+
+close(2) drops the lock associated with your fd.
+
+Modes passed to mkdir(2) or open(2) are adhered to locally. Chown is
+supported locally as well. This means you can use them to restrict
+access to the resources via dlmfs on your local node only.
+
+The resource LVB may be read from the fd in either Shared Read or
+Exclusive modes via the read(2) system call. It can be written via
+write(2) only when open in Exclusive mode.
+
+Once written, an LVB will be visible to other nodes who obtain Read
+Only or higher level locks on the resource.
+
+See Also
+========
+http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf
+
+For more information on the VMS distributed locking API.
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index 2a5274b..ce3f7c2 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,6 +1,8 @@
 EXTRA_CFLAGS += -Ifs/ocfs2
 
-obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o
+obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o
 
 ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
 	dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
+
+ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
new file mode 100644
index 0000000..dd2d24d
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -0,0 +1,640 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmfs.c
+ *
+ * Code which implements the kernel side of a minimal userspace
+ * interface to our DLM. This file handles the virtual file system
+ * used for communication with userspace. Credit should go to ramfs,
+ * which was a template for the fs side of this module.
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/* Simple VFS hooks based on: */
+/*
+ * Resizable simple ram filesystem for Linux.
+ *
+ * Copyright (C) 2000 Linus Torvalds.
+ *               2000 Transmeta Corp.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+
+#include <asm/uaccess.h>
+
+
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+
+#include "userdlm.h"
+
+#include "dlmfsver.h"
+
+#define MLOG_MASK_PREFIX ML_DLMFS
+#include "cluster/masklog.h"
+
+/* The operation tables are defined further down in this file; they are
+ * declared here so the inode setup code above their definitions can
+ * reference them. */
+static struct super_operations dlmfs_ops;
+static struct file_operations dlmfs_file_operations;
+static struct inode_operations dlmfs_dir_inode_operations;
+static struct inode_operations dlmfs_root_inode_operations;
+static struct inode_operations dlmfs_file_inode_operations;
+/* slab cache from which every struct dlmfs_inode_private is allocated */
+static kmem_cache_t *dlmfs_inode_cache;
+
+/* Workqueue created in init_dlmfs_fs() and destroyed on init failure or
+ * module exit. Not static: presumably referenced from userdlm.c. */
+struct workqueue_struct *user_dlm_worker;
+
+/*
+ * Translate open(2) flags into a dlm lock level and dlm request flags.
+ *
+ * O_WRONLY or O_RDWR set   -> *level = LKM_EXMODE
+ * neither of them set      -> *level = LKM_PRMODE
+ * O_NONBLOCK set           -> *flags = LKM_NOQUEUE, otherwise 0
+ *
+ * Every combination of flags is currently accepted, so this always
+ * returns 0.
+ */
+static int dlmfs_decode_open_flags(int open_flags,
+				   int *level,
+				   int *flags)
+{
+	int wants_write = open_flags & (O_WRONLY|O_RDWR);
+
+	*level = wants_write ? LKM_EXMODE : LKM_PRMODE;
+	*flags = (open_flags & O_NONBLOCK) ? LKM_NOQUEUE : 0;
+
+	return 0;
+}
+
+/*
+ * open(2) on a lock file: take the cluster lock described by the open
+ * flags and remember the level in file->private_data so that release
+ * can drop it again.
+ *
+ * Returns 0 on success, -ENOMEM if the per-file state cannot be
+ * allocated, -ETXTBSY when a trylock (O_NONBLOCK) request got -EAGAIN,
+ * or the error from user_dlm_cluster_lock() otherwise.
+ */
+static int dlmfs_file_open(struct inode *inode,
+			   struct file *file)
+{
+	int err, lock_level, lock_flags;
+	struct dlmfs_filp_private *priv;
+
+	if (S_ISDIR(inode->i_mode))
+		BUG();
+
+	mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino,
+		file->f_flags);
+
+	err = dlmfs_decode_open_flags(file->f_flags, &lock_level,
+				      &lock_flags);
+	if (err < 0)
+		return err;
+
+	/* O_APPEND makes no sense for LVB writes, so never honor it at
+	 * read/write time. */
+	file->f_flags &= ~O_APPEND;
+
+	priv = kmalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+	priv->fp_lock_level = lock_level;
+
+	err = user_dlm_cluster_lock(&DLMFS_I(inode)->ip_lockres,
+				    lock_level, lock_flags);
+	if (err >= 0) {
+		file->private_data = priv;
+		return err;
+	}
+
+	/* -ETXTBSY is an odd choice, but it lets userspace tell a valid
+	 * trylock that simply could not be granted apart from other
+	 * failures. */
+	if ((lock_flags & LKM_NOQUEUE) && err == -EAGAIN)
+		err = -ETXTBSY;
+	kfree(priv);
+	return err;
+}
+
+/*
+ * Final close of a lock file: drop the cluster lock taken at open time
+ * (if any) and free the per-file state. Always returns 0.
+ */
+static int dlmfs_file_release(struct inode *inode,
+			      struct file *file)
+{
+	struct dlmfs_filp_private *priv = file->private_data;
+
+	if (S_ISDIR(inode->i_mode))
+		BUG();
+
+	mlog(0, "close called on inode %lu\n", inode->i_ino);
+
+	/* open never attached any state, so there is nothing to undo */
+	if (!priv)
+		return 0;
+
+	if (priv->fp_lock_level != LKM_IVMODE)
+		user_dlm_cluster_unlock(&DLMFS_I(inode)->ip_lockres,
+					priv->fp_lock_level);
+
+	kfree(priv);
+	file->private_data = NULL;
+
+	return 0;
+}
+
+/*
+ * read(2) on a lock file: copy part of the lock value block (LVB) to
+ * userspace.
+ *
+ * At most i_size - *ppos bytes are returned, starting at offset *ppos
+ * within the LVB. Returns the number of bytes copied (0 at or past the
+ * end of the LVB or for a zero-length request), -EINVAL for a negative
+ * offset, -EFAULT if buf fails access_ok(), or -ENOMEM. *ppos is
+ * advanced by the number of bytes copied.
+ */
+static ssize_t dlmfs_file_read(struct file *filp,
+			       char __user *buf,
+			       size_t count,
+			       loff_t *ppos)
+{
+	int bytes_left;
+	ssize_t readlen;
+	loff_t pos = *ppos;
+	loff_t size;
+	char *lvb_buf;
+	struct inode *inode = filp->f_dentry->d_inode;
+
+	mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
+		inode->i_ino, count, *ppos);
+
+	if (pos < 0)
+		return -EINVAL;
+
+	size = i_size_read(inode);
+	if (pos >= size)
+		return 0;
+
+	if (!count)
+		return 0;
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	/* don't read past the lvb. The length is either what is left of
+	 * the lvb or exactly what was asked for; it must not have the
+	 * offset subtracted from it a second time. */
+	if (count > size - pos)
+		readlen = size - pos;
+	else
+		readlen = count;
+
+	/* user_dlm_read_lvb() takes no offset, so fetch the lvb up to
+	 * the end of the requested range and hand userspace only the
+	 * part that starts at pos. */
+	lvb_buf = kmalloc(pos + readlen, GFP_KERNEL);
+	if (!lvb_buf)
+		return -ENOMEM;
+
+	user_dlm_read_lvb(inode, lvb_buf, pos + readlen);
+	bytes_left = __copy_to_user(buf, lvb_buf + pos, readlen);
+	readlen -= bytes_left;
+
+	kfree(lvb_buf);
+
+	*ppos = pos + readlen;
+
+	mlog(0, "read %zd bytes\n", readlen);
+	return readlen;
+}
+
+/*
+ * write(2) on a lock file: store user data in the lock value block
+ * (LVB).
+ *
+ * At most i_size - *ppos bytes are accepted. Returns the number of
+ * bytes taken from userspace (0 for a zero-length request), -EINVAL for
+ * a negative offset, -ENOSPC when *ppos is at or past the end of the
+ * LVB, -EFAULT if buf fails access_ok(), or -ENOMEM. *ppos is advanced
+ * by the number of bytes written.
+ *
+ * NOTE(review): user_dlm_write_lvb() takes no offset, so the data is
+ * handed to it without regard to *ppos; only the length is limited by
+ * the offset. Confirm whether writes at a non-zero offset are expected
+ * to land at that offset.
+ */
+static ssize_t dlmfs_file_write(struct file *filp,
+				const char __user *buf,
+				size_t count,
+				loff_t *ppos)
+{
+	int bytes_left;
+	ssize_t writelen;
+	loff_t pos = *ppos;
+	loff_t size;
+	char *lvb_buf;
+	struct inode *inode = filp->f_dentry->d_inode;
+
+	mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
+		inode->i_ino, count, *ppos);
+
+	if (pos < 0)
+		return -EINVAL;
+
+	size = i_size_read(inode);
+	if (pos >= size)
+		return -ENOSPC;
+
+	if (!count)
+		return 0;
+
+	if (!access_ok(VERIFY_READ, buf, count))
+		return -EFAULT;
+
+	/* don't write past the lvb. The length is either what is left
+	 * of the lvb or exactly what was asked for; it must not have
+	 * the offset subtracted from it a second time. */
+	if (count > size - pos)
+		writelen = size - pos;
+	else
+		writelen = count;
+
+	lvb_buf = kmalloc(writelen, GFP_KERNEL);
+	if (!lvb_buf)
+		return -ENOMEM;
+
+	/* copy_from_user() returns the number of bytes it could NOT
+	 * copy; only what actually arrived is written to the lvb. */
+	bytes_left = copy_from_user(lvb_buf, buf, writelen);
+	writelen -= bytes_left;
+	if (writelen)
+		user_dlm_write_lvb(inode, lvb_buf, writelen);
+
+	kfree(lvb_buf);
+
+	*ppos = pos + writelen;
+	mlog(0, "wrote %zd bytes\n", writelen);
+	return writelen;
+}
+
+/*
+ * Slab constructor for dlmfs_inode_cache. Only acts when called as a
+ * real constructor (SLAB_CTOR_CONSTRUCTOR set, SLAB_CTOR_VERIFY clear):
+ * clears the dlmfs private fields and runs inode_init_once() on the
+ * embedded VFS inode.
+ */
+static void dlmfs_init_once(void *foo,
+			    kmem_cache_t *cachep,
+			    unsigned long flags)
+{
+	struct dlmfs_inode_private *ip = foo;
+	unsigned long ctor_bits;
+
+	ctor_bits = flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR);
+	if (ctor_bits != SLAB_CTOR_CONSTRUCTOR)
+		return;
+
+	ip->ip_dlm = NULL;
+	ip->ip_parent = NULL;
+
+	inode_init_once(&ip->ip_vfs_inode);
+}
+
+/*
+ * super_operations.alloc_inode: allocate a dlmfs_inode_private from the
+ * slab cache and return its embedded VFS inode, or NULL on allocation
+ * failure.
+ */
+static struct inode *dlmfs_alloc_inode(struct super_block *sb)
+{
+	struct dlmfs_inode_private *ip =
+		kmem_cache_alloc(dlmfs_inode_cache, SLAB_NOFS);
+
+	return ip ? &ip->ip_vfs_inode : NULL;
+}
+
+/* super_operations.destroy_inode: return the dlmfs_inode_private that
+ * contains this VFS inode to the slab cache. */
+static void dlmfs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
+}
+
+/*
+ * super_operations.clear_inode: release the dlm state hanging off an
+ * inode that is going away.
+ *
+ * Regular files destroy their lock and drop the reference they hold on
+ * the parent directory; anything else is treated as a directory and
+ * unregisters its dlm context if it has one. Either way the private
+ * fields are reset before the inode goes back to the cache.
+ */
+static void dlmfs_clear_inode(struct inode *inode)
+{
+	struct dlmfs_inode_private *ip;
+
+	if (!inode)
+		return;
+
+	mlog(0, "inode %lu\n", inode->i_ino);
+
+	ip = DLMFS_I(inode);
+
+	if (S_ISREG(inode->i_mode)) {
+		int status = user_dlm_destroy_lock(&ip->ip_lockres);
+
+		if (status < 0)
+			mlog_errno(status);
+		iput(ip->ip_parent);
+	} else {
+		mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm);
+		/* we must be a directory. If required, lets unregister
+		 * the dlm context now. */
+		if (ip->ip_dlm)
+			user_dlm_unregister_context(ip->ip_dlm);
+	}
+
+	ip->ip_parent = NULL;
+	ip->ip_dlm = NULL;
+}
+
+/* backing_dev_info installed on the mapping of every dlmfs inode:
+ * readahead is disabled and the no-dirty-accounting / no-writeback
+ * capabilities are set. */
+static struct backing_dev_info dlmfs_backing_dev_info = {
+	.ra_pages	= 0,	/* No readahead */
+	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+
+/*
+ * Allocate and initialize the root directory inode of a dlmfs mount.
+ *
+ * The root is a 0755 directory owned by the mounting task's fsuid and
+ * fsgid, and uses dlmfs_root_inode_operations. Returns NULL if no inode
+ * could be allocated.
+ */
+static struct inode *dlmfs_get_root_inode(struct super_block *sb)
+{
+	struct inode *inode = new_inode(sb);
+	int mode = S_IFDIR | 0755;
+
+	if (!inode)
+		return NULL;
+
+	inode->i_mode = mode;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_blksize = PAGE_CACHE_SIZE;
+	inode->i_blocks = 0;
+	inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_nlink++;
+
+	inode->i_fop = &simple_dir_operations;
+	inode->i_op = &dlmfs_root_inode_operations;
+
+	return inode;
+}
+
+/*
+ * Allocate and initialize a new dlmfs inode below 'parent'.
+ *
+ * 'mode' must describe a regular file or a directory; anything else is
+ * a BUG(). Regular files get an initialized lock resource named after
+ * 'dentry' and pin their parent directory; directories get the extra
+ * link count. The new inode starts out with the parent's dlm context.
+ * Returns NULL if no inode could be allocated.
+ */
+static struct inode *dlmfs_get_inode(struct inode *parent,
+				     struct dentry *dentry,
+				     int mode)
+{
+	struct inode *inode = new_inode(parent->i_sb);
+	struct dlmfs_inode_private *ip;
+
+	if (!inode)
+		return NULL;
+
+	inode->i_mode = mode;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_blksize = PAGE_CACHE_SIZE;
+	inode->i_blocks = 0;
+	inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	ip = DLMFS_I(inode);
+	ip->ip_dlm = DLMFS_I(parent)->ip_dlm;
+
+	if (S_ISREG(mode)) {
+		inode->i_op = &dlmfs_file_inode_operations;
+		inode->i_fop = &dlmfs_file_operations;
+
+		i_size_write(inode, DLM_LVB_LEN);
+
+		user_dlm_lock_res_init(&ip->ip_lockres, dentry);
+
+		/* The parent reference is released at clear_inode time.
+		 * Holding it ensures the dlm reference on each lock is
+		 * dropped *before* the unregister code runs for the
+		 * parent directory. */
+		ip->ip_parent = igrab(parent);
+		BUG_ON(!ip->ip_parent);
+	} else if (S_ISDIR(mode)) {
+		inode->i_op = &dlmfs_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+
+		/* directory inodes start off with i_nlink ==
+		 * 2 (for "." entry) */
+		inode->i_nlink++;
+	} else {
+		/* for now we don't support anything other than
+		 * directories and regular files. */
+		BUG();
+	}
+
+	/* setgid parent: inherit its group, and the setgid bit too when
+	 * creating a directory */
+	if (parent->i_mode & S_ISGID) {
+		inode->i_gid = parent->i_gid;
+		if (S_ISDIR(mode))
+			inode->i_mode |= S_ISGID;
+	}
+
+	return inode;
+}
+
+/*
+ * Directory creation. The directory name is used as the name of a dlm
+ * domain: allocate the inode, register the domain and attach the
+ * resulting dlm context to the new directory.
+ *
+ * Returns 0 on success, -EINVAL if the name is too long for a domain,
+ * -ENOMEM if no inode could be allocated, or the error from
+ * user_dlm_register_context().
+ */
+/* SMP-safe */
+static int dlmfs_mkdir(struct inode * dir,
+		       struct dentry * dentry,
+		       int mode)
+{
+	struct qstr *domain = &dentry->d_name;
+	struct inode *inode;
+	struct dlm_ctxt *dlm;
+	int status;
+
+	mlog(0, "mkdir %.*s\n", domain->len, domain->name);
+
+	/* verify that we have a proper domain */
+	if (domain->len >= O2NM_MAX_NAME_LEN) {
+		mlog(ML_ERROR, "invalid domain name for directory.\n");
+		return -EINVAL;
+	}
+
+	inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR);
+	if (!inode) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	dlm = user_dlm_register_context(domain);
+	if (IS_ERR(dlm)) {
+		status = PTR_ERR(dlm);
+		mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
+		     status, domain->len, domain->name);
+		/* the inode was never instantiated; give it back */
+		iput(inode);
+		return status;
+	}
+	DLMFS_I(inode)->ip_dlm = dlm;
+
+	dir->i_nlink++;
+	d_instantiate(dentry, inode);
+	dget(dentry);	/* Extra count - pin the dentry in core */
+
+	return 0;
+}
+
+/*
+ * File creation inside a domain directory: the file name becomes the
+ * name of a lock resource.
+ *
+ * Returns 0 on success, -EINVAL if the name is too long or starts with
+ * '$', or -ENOMEM if no inode could be allocated.
+ */
+static int dlmfs_create(struct inode *dir,
+			struct dentry *dentry,
+			int mode,
+			struct nameidata *nd)
+{
+	struct qstr *name = &dentry->d_name;
+	struct inode *inode;
+
+	mlog(0, "create %.*s\n", name->len, name->name);
+
+	/* verify name is valid and doesn't contain any dlm reserved
+	 * characters */
+	if (name->len >= USER_DLM_LOCK_ID_MAX_LEN ||
+	    name->name[0] == '$') {
+		mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len,
+		     name->name);
+		return -EINVAL;
+	}
+
+	inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG);
+	if (!inode) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	d_instantiate(dentry, inode);
+	dget(dentry);	/* Extra count - pin the dentry in core */
+
+	return 0;
+}
+
+/*
+ * Remove a lock file. The lock resource is destroyed first; only if
+ * that succeeds is the directory entry removed with simple_unlink().
+ * Returns the error from user_dlm_destroy_lock() on failure, otherwise
+ * the result of simple_unlink().
+ */
+static int dlmfs_unlink(struct inode *dir,
+			struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	int status;
+
+	mlog(0, "unlink inode %lu\n", inode->i_ino);
+
+	/* if there are no current holders, or none that are waiting
+	 * to acquire a lock, this basically destroys our lockres. */
+	status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres);
+	if (status >= 0)
+		return simple_unlink(dir, dentry);
+
+	mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n",
+	     dentry->d_name.len, dentry->d_name.name, status);
+	return status;
+}
+
+/*
+ * Fill in a freshly allocated superblock for a dlmfs mount and create
+ * its root directory. Returns 0 on success or -ENOMEM if the root inode
+ * or root dentry cannot be allocated.
+ */
+static int dlmfs_fill_super(struct super_block * sb,
+			    void * data,
+			    int silent)
+{
+	struct inode *root_inode;
+	struct dentry *root_dentry;
+
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = DLMFS_MAGIC;
+	sb->s_op = &dlmfs_ops;
+
+	root_inode = dlmfs_get_root_inode(sb);
+	if (!root_inode)
+		return -ENOMEM;
+
+	root_dentry = d_alloc_root(root_inode);
+	if (!root_dentry) {
+		/* no dentry took ownership of the inode; drop it */
+		iput(root_inode);
+		return -ENOMEM;
+	}
+
+	sb->s_root = root_dentry;
+	return 0;
+}
+
+/* File operations for lock files: open/release take and drop the
+ * cluster lock, read/write access the lock value block. */
+static struct file_operations dlmfs_file_operations = {
+	.open		= dlmfs_file_open,
+	.release	= dlmfs_file_release,
+	.read		= dlmfs_file_read,
+	.write		= dlmfs_file_write,
+};
+
+/* Inode operations for directories below the root (installed by
+ * dlmfs_get_inode()): lock files can be created and unlinked here, but
+ * no mkdir is provided. */
+static struct inode_operations dlmfs_dir_inode_operations = {
+	.create		= dlmfs_create,
+	.lookup		= simple_lookup,
+	.unlink		= dlmfs_unlink,
+};
+
+/* this way we can restrict mkdir to only the toplevel of the fs. */
+static struct inode_operations dlmfs_root_inode_operations = {
+	.lookup		= simple_lookup,
+	.mkdir		= dlmfs_mkdir,
+	.rmdir		= simple_rmdir,
+};
+
+/* Superblock operations: inodes come from and return to
+ * dlmfs_inode_cache, and clear_inode releases the dlm state. */
+static struct super_operations dlmfs_ops = {
+	.statfs		= simple_statfs,
+	.alloc_inode	= dlmfs_alloc_inode,
+	.destroy_inode	= dlmfs_destroy_inode,
+	.clear_inode	= dlmfs_clear_inode,
+	.drop_inode	= generic_delete_inode,
+};
+
+/* Inode operations for lock files: only getattr is provided. */
+static struct inode_operations dlmfs_file_inode_operations = {
+	.getattr	= simple_getattr,
+};
+
+/* file_system_type.get_sb: dlmfs has no backing device (dev_name is
+ * ignored), so get_sb_nodev() is used with dlmfs_fill_super() as the
+ * fill callback. */
+static struct super_block *dlmfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super);
+}
+
+/* Filesystem type registered by init_dlmfs_fs() under the name
+ * "ocfs2_dlmfs". */
+static struct file_system_type dlmfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ocfs2_dlmfs",
+	.get_sb		= dlmfs_get_sb,
+	.kill_sb	= kill_litter_super,
+};
+
+/*
+ * Module init: print the version, create the inode slab cache and the
+ * "user_dlm" workqueue, then register the filesystem. Anything set up
+ * so far is torn down again on failure. Returns 0 on success, -ENOMEM
+ * if the cache or workqueue cannot be created, or the error from
+ * register_filesystem().
+ */
+static int __init init_dlmfs_fs(void)
+{
+	int status;
+
+	dlmfs_print_version();
+
+	dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
+				sizeof(struct dlmfs_inode_private),
+				0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
+				dlmfs_init_once, NULL);
+	if (!dlmfs_inode_cache)
+		return -ENOMEM;
+
+	user_dlm_worker = create_singlethread_workqueue("user_dlm");
+	if (!user_dlm_worker) {
+		kmem_cache_destroy(dlmfs_inode_cache);
+		return -ENOMEM;
+	}
+
+	status = register_filesystem(&dlmfs_fs_type);
+	if (status) {
+		kmem_cache_destroy(dlmfs_inode_cache);
+		destroy_workqueue(user_dlm_worker);
+		return status;
+	}
+
+	printk("OCFS2 User DLM kernel interface loaded\n");
+	return status;
+}
+
+/* Module exit: unregister the filesystem, flush and destroy the
+ * workqueue, then destroy the inode cache (logging if it reports that
+ * objects were still allocated). */
+static void __exit exit_dlmfs_fs(void)
+{
+	unregister_filesystem(&dlmfs_fs_type);
+
+	/* let queued lockres work finish before the queue goes away */
+	flush_workqueue(user_dlm_worker);
+	destroy_workqueue(user_dlm_worker);
+
+	if (kmem_cache_destroy(dlmfs_inode_cache))
+		printk(KERN_INFO "dlmfs_inode_cache: not all structures "
+		       "were freed\n");
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+
+module_init(init_dlmfs_fs)
+module_exit(exit_dlmfs_fs)
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c
new file mode 100644
index 0000000..d2be3ad
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmfsver.c
@@ -0,0 +1,42 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmfsver.c
+ *
+ * version string
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include "dlmfsver.h"
+
+#define DLM_BUILD_VERSION "1.3.3"
+
+#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
+
+/* Print the "OCFS2 DLMFS <version>" banner to the kernel log at
+ * KERN_INFO level. */
+void dlmfs_print_version(void)
+{
+	printk(KERN_INFO "%s\n", VERSION_STR);
+}
+
+MODULE_DESCRIPTION(VERSION_STR);
+
+MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmfsver.h b/fs/ocfs2/dlm/dlmfsver.h
new file mode 100644
index 0000000..f35eadb
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmfsver.h
@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmfsver.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef DLMFS_VER_H
+#define DLMFS_VER_H
+
+void dlmfs_print_version(void);
+
+#endif /* DLMFS_VER_H */
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c
new file mode 100644
index 0000000..e1fdd28
--- /dev/null
+++ b/fs/ocfs2/dlm/userdlm.c
@@ -0,0 +1,658 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * userdlm.c
+ *
+ * Code which implements the kernel side of a minimal userspace
+ * interface to our DLM.
+ *
+ * Many of the functions here are pared down versions of dlmglue.c
+ * functions.
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <asm/signal.h>
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/crc32.h>
+
+
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+
+#include "userdlm.h"
+
+#define MLOG_MASK_PREFIX ML_DLMFS
+#include "cluster/masklog.h"
+
+/* Sample 'flag' in lockres->l_flags under l_lock. Returns non-zero if
+ * any of the requested bits are set. */
+static inline int user_check_wait_flag(struct user_lock_res *lockres,
+				       int flag)
+{
+	int flag_set;
+
+	spin_lock(&lockres->l_lock);
+	flag_set = lockres->l_flags & flag;
+	spin_unlock(&lockres->l_lock);
+
+	return flag_set;
+}
+
+/* Sleep (uninterruptibly, via wait_event) on l_event until
+ * USER_LOCK_BUSY is no longer set on the lockres. */
+static inline void user_wait_on_busy_lock(struct user_lock_res *lockres)
+
+{
+	wait_event(lockres->l_event,
+		   !user_check_wait_flag(lockres, USER_LOCK_BUSY));
+}
+
+/* Sleep (uninterruptibly, via wait_event) on l_event until
+ * USER_LOCK_BLOCKED is no longer set on the lockres. */
+static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
+
+{
+	wait_event(lockres->l_event,
+		   !user_check_wait_flag(lockres, USER_LOCK_BLOCKED));
+}
+
+/* A lockres is embedded in its dlmfs_inode_private as ip_lockres; step
+ * back to the containing structure and return its dlm context. */
+static inline struct dlm_ctxt *
+dlm_ctxt_from_user_lockres(struct user_lock_res *lockres)
+{
+	return container_of(lockres, struct dlmfs_inode_private,
+			    ip_lockres)->ip_dlm;
+}
+
+/* Return the VFS inode that lives in the same dlmfs_inode_private as
+ * this lockres. */
+static struct inode *
+user_dlm_inode_from_user_lockres(struct user_lock_res *lockres)
+{
+	return &container_of(lockres, struct dlmfs_inode_private,
+			     ip_lockres)->ip_vfs_inode;
+}
+
+/* Called after a dlmlock() call has failed: clear USER_LOCK_BUSY under
+ * l_lock. Note that l_event waiters are not woken here. */
+static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
+{
+	spin_lock(&lockres->l_lock);
+	lockres->l_flags &= ~USER_LOCK_BUSY;
+	spin_unlock(&lockres->l_lock);
+}
+
+/* Log a dlm status returned by the dlm call named _func (a string)
+ * against the given lockres. _lockres is parenthesized so that any
+ * pointer expression may be passed, not just a plain identifier. */
+#define user_log_dlm_error(_func, _stat, _lockres) do {		\
+	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "	\
+		"resource %s: %s\n", dlm_errname(_stat), _func,	\
+		(_lockres)->l_name, dlm_errmsg(_stat));		\
+} while (0)
+
+/* Return the highest level we may keep holding while another node
+ * holds 'level': nothing (NL) against EX, PR against PR, and EX
+ * against anything else.
+ *
+ * WARNING: This function lives in a world where the only three lock
+ * levels are EX, PR, and NL. It *will* have to be adjusted when more
+ * lock types are added. */
+static inline int user_highest_compat_lock_level(int level)
+{
+	switch (level) {
+	case LKM_EXMODE:
+		return LKM_NLMODE;
+	case LKM_PRMODE:
+		return LKM_PRMODE;
+	default:
+		return LKM_EXMODE;
+	}
+}
+
+/*
+ * AST callback handed to dlmlock(): a lock or convert request on this
+ * lockres has completed.
+ *
+ * On a DLM_NORMAL status the requested level becomes the granted
+ * level, the lockres is marked attached and no longer busy, and
+ * l_event waiters are woken. If this was a downconvert that satisfies
+ * the blocking level, the blocked state is cleared as well.
+ *
+ * NOTE(review): on any other lksb status this returns after logging,
+ * without clearing USER_LOCK_BUSY or waking l_event waiters - confirm
+ * that is intended.
+ */
+static void user_ast(void *opaque)
+{
+	struct user_lock_res *lockres = opaque;
+	struct dlm_lockstatus *lksb;
+
+	mlog(0, "AST fired for lockres %s\n", lockres->l_name);
+
+	spin_lock(&lockres->l_lock);
+
+	lksb = &(lockres->l_lksb);
+	if (lksb->status != DLM_NORMAL) {
+		mlog(ML_ERROR, "lksb status value of %u on lockres %s\n",
+		     lksb->status, lockres->l_name);
+		spin_unlock(&lockres->l_lock);
+		return;
+	}
+
+	/* we're downconverting. */
+	if (lockres->l_requested < lockres->l_level) {
+		/* the new level is low enough for whoever we were
+		 * blocking, so we are no longer blocked */
+		if (lockres->l_requested <=
+		    user_highest_compat_lock_level(lockres->l_blocking)) {
+			lockres->l_blocking = LKM_NLMODE;
+			lockres->l_flags &= ~USER_LOCK_BLOCKED;
+		}
+	}
+
+	lockres->l_level = lockres->l_requested;
+	lockres->l_requested = LKM_IVMODE;
+	lockres->l_flags |= USER_LOCK_ATTACHED;
+	lockres->l_flags &= ~USER_LOCK_BUSY;
+
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+/* Take a reference on the inode that embeds this lockres; it is a BUG
+ * if igrab() fails. Paired with user_dlm_drop_inode_ref(). */
+static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres)
+{
+	struct inode *inode;
+	inode = user_dlm_inode_from_user_lockres(lockres);
+	if (!igrab(inode))
+		BUG();
+}
+
+static void user_dlm_unblock_lock(void *opaque);
+
+/* Queue user_dlm_unblock_lock() for this lockres on user_dlm_worker
+ * unless it is already queued. An inode reference is taken for the
+ * queued work. The callers in this file hold l_lock. */
+static void __user_dlm_queue_lockres(struct user_lock_res *lockres)
+{
+	if (lockres->l_flags & USER_LOCK_QUEUED)
+		return;
+
+	user_dlm_grab_inode_ref(lockres);
+
+	INIT_WORK(&lockres->l_work, user_dlm_unblock_lock,
+		  lockres);
+
+	queue_work(user_dlm_worker, &lockres->l_work);
+	lockres->l_flags |= USER_LOCK_QUEUED;
+}
+
+/* If the lockres is blocked and the local holders that conflict with
+ * the blocking level are all gone, queue it for the unblock worker.
+ * A blocking level other than EX or PR is a BUG(). */
+static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
+{
+	if (!(lockres->l_flags & USER_LOCK_BLOCKED))
+		return;
+
+	if (lockres->l_blocking == LKM_EXMODE) {
+		/* an EX request conflicts with every local holder */
+		if (lockres->l_ex_holders || lockres->l_ro_holders)
+			return;
+	} else if (lockres->l_blocking == LKM_PRMODE) {
+		/* a PR request only conflicts with EX holders */
+		if (lockres->l_ex_holders)
+			return;
+	} else {
+		BUG();
+		return;
+	}
+
+	__user_dlm_queue_lockres(lockres);
+}
+
+/*
+ * Blocking-AST callback handed to dlmlock(); presumably invoked by the
+ * dlm when a request at 'level' conflicts with the lock we hold.
+ *
+ * Marks the lockres blocked, raises l_blocking to 'level' if that is
+ * higher than what is recorded, queues the lockres for the unblock
+ * worker and wakes l_event waiters.
+ */
+static void user_bast(void *opaque, int level)
+{
+	struct user_lock_res *lockres = opaque;
+
+	mlog(0, "Blocking AST fired for lockres %s. Blocking level %d\n",
+		lockres->l_name, level);
+
+	spin_lock(&lockres->l_lock);
+	lockres->l_flags |= USER_LOCK_BLOCKED;
+	if (level > lockres->l_blocking)
+		lockres->l_blocking = level;
+
+	__user_dlm_queue_lockres(lockres);
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+/*
+ * Unlock-AST callback handed to dlmunlock(), used both for the final
+ * unlock issued by user_dlm_destroy_lock() and for the cancel issued by
+ * user_dlm_unblock_lock().
+ *
+ * In teardown the granted level is reset to LKM_IVMODE. Otherwise this
+ * is treated as a completed cancel: the pending request is forgotten,
+ * USER_LOCK_IN_CANCEL is cleared and the lockres is queued again for
+ * the unblock worker. In both cases USER_LOCK_BUSY is cleared and
+ * l_event waiters are woken. A status other than DLM_NORMAL is only
+ * logged.
+ */
+static void user_unlock_ast(void *opaque, enum dlm_status status)
+{
+	struct user_lock_res *lockres = opaque;
+
+	mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name);
+
+	if (status != DLM_NORMAL)
+		mlog(ML_ERROR, "Dlm returns status %d\n", status);
+
+	spin_lock(&lockres->l_lock);
+	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN)
+		lockres->l_level = LKM_IVMODE;
+	else {
+		lockres->l_requested = LKM_IVMODE; /* cancel an
+						    * upconvert
+						    * request. */
+		lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
+		/* we want the unblock thread to look at it again
+		 * now. */
+		__user_dlm_queue_lockres(lockres);
+	}
+
+	lockres->l_flags &= ~USER_LOCK_BUSY;
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+/* Drop a reference on the inode that embeds this lockres; the
+ * counterpart of user_dlm_grab_inode_ref(). */
+static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
+{
+	iput(user_dlm_inode_from_user_lockres(lockres));
+}
+
+/*
+ * Work function run on user_dlm_worker for a lockres queued by
+ * __user_dlm_queue_lockres(); 'opaque' is the lockres.
+ *
+ * Depending on the lockres state it either does nothing (teardown in
+ * progress, a cancel already in flight, or incompatible local holders
+ * remain), cancels the dlm request that is currently in flight, or
+ * issues a downconvert to the highest level compatible with
+ * l_blocking. The inode reference taken when the work was queued is
+ * dropped on every exit path.
+ */
+static void user_dlm_unblock_lock(void *opaque)
+{
+	int new_level, status;
+	struct user_lock_res *lockres = (struct user_lock_res *) opaque;
+	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+
+	mlog(0, "processing lockres %s\n", lockres->l_name);
+
+	spin_lock(&lockres->l_lock);
+
+	BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
+	BUG_ON(!(lockres->l_flags & USER_LOCK_QUEUED));
+
+	/* notice that we don't clear USER_LOCK_BLOCKED here. That's
+	 * for user_ast to do. */
+	lockres->l_flags &= ~USER_LOCK_QUEUED;
+
+	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
+		mlog(0, "lock is in teardown so we do nothing\n");
+		spin_unlock(&lockres->l_lock);
+		goto drop_ref;
+	}
+
+	/* A dlm request is still outstanding: try to cancel it, unless
+	 * a cancel has been issued already. */
+	if (lockres->l_flags & USER_LOCK_BUSY) {
+		mlog(0, "BUSY flag detected...\n");
+		if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
+			spin_unlock(&lockres->l_lock);
+			goto drop_ref;
+		}
+
+		lockres->l_flags |= USER_LOCK_IN_CANCEL;
+		spin_unlock(&lockres->l_lock);
+
+		status = dlmunlock(dlm,
+				   &lockres->l_lksb,
+				   LKM_CANCEL,
+				   user_unlock_ast,
+				   lockres);
+		if (status == DLM_CANCELGRANT) {
+			/* If we got this, then the ast was fired
+			 * before we could cancel. We cleanup our
+			 * state, and restart the function. */
+			/* NOTE(review): nothing here restarts this
+			 * function; it only clears IN_CANCEL and drops
+			 * the ref - confirm a requeue is not needed. */
+			spin_lock(&lockres->l_lock);
+			lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
+			spin_unlock(&lockres->l_lock);
+		} else if (status != DLM_NORMAL)
+			user_log_dlm_error("dlmunlock", status, lockres);
+		goto drop_ref;
+	}
+
+	/* If there are still incompat holders, we can exit safely
+	 * without worrying about re-queueing this lock as that will
+	 * happen on the last call to user_cluster_unlock. */
+	if ((lockres->l_blocking == LKM_EXMODE)
+	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
+		spin_unlock(&lockres->l_lock);
+		mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n",
+			lockres->l_ro_holders, lockres->l_ex_holders);
+		goto drop_ref;
+	}
+
+	if ((lockres->l_blocking == LKM_PRMODE)
+	    && lockres->l_ex_holders) {
+		spin_unlock(&lockres->l_lock);
+		mlog(0, "can't downconvert for pr: ex = %u\n",
+			lockres->l_ex_holders);
+		goto drop_ref;
+	}
+
+	/* yay, we can downconvert now. */
+	new_level = user_highest_compat_lock_level(lockres->l_blocking);
+	lockres->l_requested = new_level;
+	lockres->l_flags |= USER_LOCK_BUSY;
+	mlog(0, "Downconvert lock from %d to %d\n",
+		lockres->l_level, new_level);
+	spin_unlock(&lockres->l_lock);
+
+	/* need lock downconvert request now... */
+	status = dlmlock(dlm,
+			 new_level,
+			 &lockres->l_lksb,
+			 LKM_CONVERT|LKM_VALBLK,
+			 lockres->l_name,
+			 user_ast,
+			 lockres,
+			 user_bast);
+	if (status != DLM_NORMAL) {
+		user_log_dlm_error("dlmlock", status, lockres);
+		user_recover_from_dlm_error(lockres);
+	}
+
+drop_ref:
+	user_dlm_drop_inode_ref(lockres);
+}
+
+/* Count one more local holder at 'level': EX holders and PR (read-only)
+ * holders are tracked separately. Any other level is a BUG(). */
+static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
+					int level)
+{
+	if (level == LKM_EXMODE)
+		lockres->l_ex_holders++;
+	else if (level == LKM_PRMODE)
+		lockres->l_ro_holders++;
+	else
+		BUG();
+}
+
+/* The lockres is blocked, so it will be dropped to the highest level
+ * compatible with l_blocking on behalf of another node. Return true if
+ * the 'wanted' level will still be covered after that downconvert. May
+ * only be called on a blocked lockres. */
+static inline int
+user_may_continue_on_blocked_lock(struct user_lock_res *lockres,
+				  int wanted)
+{
+	int level_after_downconvert;
+
+	BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
+
+	level_after_downconvert =
+		user_highest_compat_lock_level(lockres->l_blocking);
+
+	return wanted <= level_after_downconvert;
+}
+
+/*
+ * Acquire the cluster lock at 'level' (LKM_EXMODE or LKM_PRMODE) on
+ * behalf of a local holder.
+ *
+ * Loops until the lockres is granted at a sufficient level: it waits
+ * while a dlm request is in flight and the granted level is too low,
+ * waits while the lock is blocked for a downconvert that 'level' would
+ * not survive, and calls dlmlock() when the granted level is too low.
+ * dlmlock() is passed lkm_flags plus LKM_VALBLK, and LKM_CONVERT when
+ * a level is already held. On success the matching holder count is
+ * incremented.
+ *
+ * Returns 0 on success, -EINVAL for an invalid level or a dlmlock()
+ * failure, -EAGAIN when LKM_NOQUEUE was passed and dlmlock() returned
+ * DLM_NOTQUEUED, and -ERESTARTSYS if a signal is pending at the top of
+ * the loop.
+ */
+int user_dlm_cluster_lock(struct user_lock_res *lockres,
+			  int level,
+			  int lkm_flags)
+{
+	int status, local_flags;
+	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+
+	if (level != LKM_EXMODE &&
+	    level != LKM_PRMODE) {
+		mlog(ML_ERROR, "lockres %s: invalid request!\n",
+		     lockres->l_name);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	mlog(0, "lockres %s: asking for %s lock, passed flags = 0x%x\n",
+		lockres->l_name,
+		(level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
+		lkm_flags);
+
+again:
+	/* the waits below are not interruptible; signals are only
+	 * noticed here, each time around the loop */
+	if (signal_pending(current)) {
+		status = -ERESTARTSYS;
+		goto bail;
+	}
+
+	spin_lock(&lockres->l_lock);
+
+	/* We only compare against the currently granted level
+	 * here. If the lock is blocked waiting on a downconvert,
+	 * we'll get caught below. */
+	if ((lockres->l_flags & USER_LOCK_BUSY) &&
+	    (level > lockres->l_level)) {
+		/* is someone sitting in dlm_lock? If so, wait on
+		 * them. */
+		spin_unlock(&lockres->l_lock);
+
+		user_wait_on_busy_lock(lockres);
+		goto again;
+	}
+
+	if ((lockres->l_flags & USER_LOCK_BLOCKED) &&
+	    (!user_may_continue_on_blocked_lock(lockres, level))) {
+		/* is the lock is currently blocked on behalf of
+		 * another node */
+		spin_unlock(&lockres->l_lock);
+
+		user_wait_on_blocked_lock(lockres);
+		goto again;
+	}
+
+	if (level > lockres->l_level) {
+		local_flags = lkm_flags | LKM_VALBLK;
+		/* a level is already held, so this is a conversion
+		 * rather than a new lock */
+		if (lockres->l_level != LKM_IVMODE)
+			local_flags |= LKM_CONVERT;
+
+		lockres->l_requested = level;
+		lockres->l_flags |= USER_LOCK_BUSY;
+		spin_unlock(&lockres->l_lock);
+
+		BUG_ON(level == LKM_IVMODE);
+		BUG_ON(level == LKM_NLMODE);
+
+		mlog(0, "lock %s, get lock from %d to level = %d\n",
+			lockres->l_name, lockres->l_level, level);
+
+		/* call dlm_lock to upgrade lock now */
+		status = dlmlock(dlm,
+				 level,
+				 &lockres->l_lksb,
+				 local_flags,
+				 lockres->l_name,
+				 user_ast,
+				 lockres,
+				 user_bast);
+		if (status != DLM_NORMAL) {
+			if ((lkm_flags & LKM_NOQUEUE) &&
+			    (status == DLM_NOTQUEUED))
+				status = -EAGAIN;
+			else {
+				user_log_dlm_error("dlmlock", status, lockres);
+				status = -EINVAL;
+			}
+			user_recover_from_dlm_error(lockres);
+			goto bail;
+		}
+
+		mlog(0, "lock %s, successfull return from dlmlock\n",
+			lockres->l_name);
+
+		/* user_ast clears BUSY once the request completes; then
+		 * re-evaluate everything from the top */
+		user_wait_on_busy_lock(lockres);
+		goto again;
+	}
+
+	user_dlm_inc_holders(lockres, level);
+	spin_unlock(&lockres->l_lock);
+
+	mlog(0, "lockres %s: Got %s lock!\n", lockres->l_name,
+		(level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE");
+
+	status = 0;
+bail:
+	return status;
+}
+
+/* Count one local holder less at 'level'. Dropping a holder that was
+ * never counted, or passing a level other than EX or PR, is a BUG. */
+static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
+					int level)
+{
+	if (level == LKM_EXMODE) {
+		BUG_ON(!lockres->l_ex_holders);
+		lockres->l_ex_holders--;
+	} else if (level == LKM_PRMODE) {
+		BUG_ON(!lockres->l_ro_holders);
+		lockres->l_ro_holders--;
+	} else {
+		BUG();
+	}
+}
+
+/*
+ * Release a local hold taken with user_dlm_cluster_lock() at 'level'.
+ * The dlm lock itself is kept; only the holder count is dropped, and
+ * the lockres is queued for the unblock worker if it is blocked and
+ * this hold was the last one in the way. Levels other than EX and PR
+ * are logged and ignored.
+ */
+void user_dlm_cluster_unlock(struct user_lock_res *lockres,
+			     int level)
+{
+	if (!(level == LKM_EXMODE || level == LKM_PRMODE)) {
+		mlog(ML_ERROR, "lockres %s: invalid request!\n", lockres->l_name);
+		return;
+	}
+
+	mlog(0, "lockres %s: dropping %s lock\n", lockres->l_name,
+		(level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE");
+
+	spin_lock(&lockres->l_lock);
+	user_dlm_dec_holders(lockres, level);
+	__user_dlm_cond_queue_lockres(lockres);
+	spin_unlock(&lockres->l_lock);
+}
+
+/*
+ * Copy 'len' bytes from 'val' to the start of the inode's lock value
+ * block, under l_lock. 'len' must not exceed DLM_LVB_LEN and the lock
+ * must be held at EX level or above; both are enforced with BUG_ON.
+ */
+void user_dlm_write_lvb(struct inode *inode,
+			const char *val,
+			unsigned int len)
+{
+	struct user_lock_res *res = &DLMFS_I(inode)->ip_lockres;
+	char *dst = res->l_lksb.lvb;
+
+	BUG_ON(len > DLM_LVB_LEN);
+
+	spin_lock(&res->l_lock);
+
+	BUG_ON(res->l_level < LKM_EXMODE);
+	memcpy(dst, val, len);
+
+	spin_unlock(&res->l_lock);
+}
+
+/*
+ * Copy the first 'len' bytes of the inode's lock value block into
+ * 'val', under l_lock. 'len' must not exceed DLM_LVB_LEN and the lock
+ * must be held at PR level or above; both are enforced with BUG_ON.
+ */
+void user_dlm_read_lvb(struct inode *inode,
+		       char *val,
+		       unsigned int len)
+{
+	struct user_lock_res *res = &DLMFS_I(inode)->ip_lockres;
+	char *src = res->l_lksb.lvb;
+
+	BUG_ON(len > DLM_LVB_LEN);
+
+	spin_lock(&res->l_lock);
+
+	BUG_ON(res->l_level < LKM_PRMODE);
+	memcpy(val, src, len);
+
+	spin_unlock(&res->l_lock);
+}
+
+/*
+ * Initialize a lockres for a new lock file: everything is zeroed, the
+ * spinlock and wait queue are set up, all three levels start out as
+ * LKM_IVMODE and the lock is named after the dentry.
+ */
+void user_dlm_lock_res_init(struct user_lock_res *lockres,
+			    struct dentry *dentry)
+{
+	memset(lockres, 0, sizeof(*lockres));
+
+	spin_lock_init(&lockres->l_lock);
+	init_waitqueue_head(&lockres->l_event);
+	lockres->l_level = LKM_IVMODE;
+	lockres->l_requested = LKM_IVMODE;
+	lockres->l_blocking = LKM_IVMODE;
+
+	/* should have been checked before getting here. */
+	BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
+
+	/* the name is shorter than l_name and l_name was zeroed above,
+	 * so the copy ends up NUL terminated */
+	memcpy(lockres->l_name,
+	       dentry->d_name.name,
+	       dentry->d_name.len);
+}
+
+/*
+ * Drop the dlm lock behind this lockres, if there is one.
+ *
+ * Waits for any in-flight dlm request to finish first. Returns -EBUSY
+ * if local holders remain, 0 if the lock was never attached or was
+ * unlocked successfully, and -EINVAL if dlmunlock() fails.
+ *
+ * When dlmunlock() fails the flags changed for the teardown are put
+ * back, so that a later call (e.g. from clear_inode) can try again
+ * instead of sleeping forever on a USER_LOCK_BUSY that nothing will
+ * ever clear.
+ */
+int user_dlm_destroy_lock(struct user_lock_res *lockres)
+{
+	int status = -EBUSY;
+	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+
+	mlog(0, "asked to destroy %s\n", lockres->l_name);
+
+	spin_lock(&lockres->l_lock);
+	while (lockres->l_flags & USER_LOCK_BUSY) {
+		spin_unlock(&lockres->l_lock);
+
+		mlog(0, "lock %s is busy\n", lockres->l_name);
+
+		user_wait_on_busy_lock(lockres);
+
+		spin_lock(&lockres->l_lock);
+	}
+
+	if (lockres->l_ro_holders || lockres->l_ex_holders) {
+		spin_unlock(&lockres->l_lock);
+		mlog(0, "lock %s has holders\n", lockres->l_name);
+		goto bail;
+	}
+
+	status = 0;
+	if (!(lockres->l_flags & USER_LOCK_ATTACHED)) {
+		spin_unlock(&lockres->l_lock);
+		mlog(0, "lock %s is not attached\n", lockres->l_name);
+		goto bail;
+	}
+
+	lockres->l_flags &= ~USER_LOCK_ATTACHED;
+	lockres->l_flags |= USER_LOCK_BUSY;
+	lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
+	spin_unlock(&lockres->l_lock);
+
+	mlog(0, "unlocking lockres %s\n", lockres->l_name);
+	status = dlmunlock(dlm,
+			   &lockres->l_lksb,
+			   LKM_VALBLK,
+			   user_unlock_ast,
+			   lockres);
+	if (status != DLM_NORMAL) {
+		user_log_dlm_error("dlmunlock", status, lockres);
+
+		/* Undo the teardown state set above and let anybody
+		 * waiting on BUSY continue.
+		 * NOTE(review): assumes a failing dlmunlock() never
+		 * delivers user_unlock_ast - confirm. */
+		spin_lock(&lockres->l_lock);
+		lockres->l_flags &= ~(USER_LOCK_BUSY|USER_LOCK_IN_TEARDOWN);
+		lockres->l_flags |= USER_LOCK_ATTACHED;
+		spin_unlock(&lockres->l_lock);
+		wake_up(&lockres->l_event);
+
+		status = -EINVAL;
+		goto bail;
+	}
+
+	/* user_unlock_ast clears BUSY once the unlock has completed */
+	user_wait_on_busy_lock(lockres);
+
+	status = 0;
+bail:
+	return status;
+}
+
+/*
+ * Register the dlm domain named by 'name' (a length-counted string).
+ * The domain key is the crc32 of the name. Returns the dlm context, or
+ * an ERR_PTR: -ENOMEM if the name cannot be copied, otherwise whatever
+ * dlm_register_domain() returned.
+ */
+struct dlm_ctxt *user_dlm_register_context(struct qstr *name)
+{
+	unsigned int buf_len = name->len + 1;
+	struct dlm_ctxt *ctxt;
+	char *domain_name;
+	u32 key;
+
+	domain_name = kmalloc(buf_len, GFP_KERNEL);
+	if (!domain_name) {
+		mlog_errno(-ENOMEM);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	key = crc32_le(0, name->name, name->len);
+
+	/* build a NUL terminated copy of the name for the dlm */
+	snprintf(domain_name, buf_len, "%.*s", name->len, name->name);
+
+	ctxt = dlm_register_domain(domain_name, key);
+	if (IS_ERR(ctxt))
+		mlog_errno(PTR_ERR(ctxt));
+
+	kfree(domain_name);
+	return ctxt;
+}
+
+/* Counterpart of user_dlm_register_context(): hand the dlm context
+ * back via dlm_unregister_domain(). */
+void user_dlm_unregister_context(struct dlm_ctxt *dlm)
+{
+	dlm_unregister_domain(dlm);
+}
diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlm/userdlm.h
new file mode 100644
index 0000000..04178bc
--- /dev/null
+++ b/fs/ocfs2/dlm/userdlm.h
@@ -0,0 +1,111 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * userdlm.h
+ *
+ * Userspace dlm defines
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+
+#ifndef USERDLM_H
+#define USERDLM_H
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+/* user_lock_res->l_flags flags. */
+#define USER_LOCK_ATTACHED      (0x00000001) /* have we initialized
+					       * the lvb */
+#define USER_LOCK_BUSY          (0x00000002) /* we are currently in
+					       * dlm_lock */
+#define USER_LOCK_BLOCKED       (0x00000004) /* blocked waiting to
+					      * downconvert*/
+#define USER_LOCK_IN_TEARDOWN   (0x00000008) /* we're currently
+					      * destroying this
+					      * lock. */
+#define USER_LOCK_QUEUED        (0x00000010) /* lock is on the
+					      * workqueue */
+#define USER_LOCK_IN_CANCEL     (0x00000020)
+
+struct user_lock_res {
+	spinlock_t               l_lock;
+
+	int                      l_flags;
+
+#define USER_DLM_LOCK_ID_MAX_LEN  32
+	char                     l_name[USER_DLM_LOCK_ID_MAX_LEN];
+	int                      l_level;
+	unsigned int             l_ro_holders;
+	unsigned int             l_ex_holders;
+	struct dlm_lockstatus    l_lksb;
+
+	int                      l_requested;
+	int                      l_blocking;
+
+	wait_queue_head_t        l_event;
+
+	struct work_struct       l_work;
+};
+
+extern struct workqueue_struct *user_dlm_worker;
+
+void user_dlm_lock_res_init(struct user_lock_res *lockres,
+			    struct dentry *dentry);
+int user_dlm_destroy_lock(struct user_lock_res *lockres);
+int user_dlm_cluster_lock(struct user_lock_res *lockres,
+			  int level,
+			  int lkm_flags);
+void user_dlm_cluster_unlock(struct user_lock_res *lockres,
+			     int level);
+void user_dlm_write_lvb(struct inode *inode,
+			const char *val,
+			unsigned int len);
+void user_dlm_read_lvb(struct inode *inode,
+		       char *val,
+		       unsigned int len);
+struct dlm_ctxt *user_dlm_register_context(struct qstr *name);
+void user_dlm_unregister_context(struct dlm_ctxt *dlm);
+
+struct dlmfs_inode_private {
+	struct dlm_ctxt             *ip_dlm;
+
+	struct user_lock_res ip_lockres; /* unused for directories. */
+	struct inode         *ip_parent;
+
+	struct inode         ip_vfs_inode;
+};
+
+static inline struct dlmfs_inode_private *
+DLMFS_I(struct inode *inode)
+{
+        return container_of(inode,
+			    struct dlmfs_inode_private,
+			    ip_vfs_inode);
+}
+
+struct dlmfs_filp_private {
+	int                  fp_lock_level;
+};
+
+#define DLMFS_MAGIC	0x76a9f425
+
+#endif /* USERDLM_H */
-- 
cgit v1.1


From ccd979bdbce9fba8412beb3f1de68a9d0171b12c Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 15 Dec 2005 14:31:24 -0800
Subject: [PATCH] OCFS2: The Second Oracle Cluster Filesystem

The OCFS2 file system module.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
---
 Documentation/filesystems/00-INDEX  |    2 +
 Documentation/filesystems/ocfs2.txt |   55 +
 MAINTAINERS                         |    9 +
 fs/ocfs2/Makefile                   |   33 +
 fs/ocfs2/alloc.c                    | 2040 ++++++++++++++++++++++++
 fs/ocfs2/alloc.h                    |   82 +
 fs/ocfs2/aops.c                     |  643 ++++++++
 fs/ocfs2/aops.h                     |   41 +
 fs/ocfs2/buffer_head_io.c           |  232 +++
 fs/ocfs2/buffer_head_io.h           |   73 +
 fs/ocfs2/dcache.c                   |   91 ++
 fs/ocfs2/dcache.h                   |   31 +
 fs/ocfs2/dir.c                      |  618 ++++++++
 fs/ocfs2/dir.h                      |   54 +
 fs/ocfs2/dlmglue.c                  | 2904 +++++++++++++++++++++++++++++++++++
 fs/ocfs2/dlmglue.h                  |  111 ++
 fs/ocfs2/endian.h                   |   45 +
 fs/ocfs2/export.c                   |  248 +++
 fs/ocfs2/export.h                   |   31 +
 fs/ocfs2/extent_map.c               |  994 ++++++++++++
 fs/ocfs2/extent_map.h               |   46 +
 fs/ocfs2/file.c                     | 1237 +++++++++++++++
 fs/ocfs2/file.h                     |   57 +
 fs/ocfs2/heartbeat.c                |  378 +++++
 fs/ocfs2/heartbeat.h                |   67 +
 fs/ocfs2/inode.c                    | 1140 ++++++++++++++
 fs/ocfs2/inode.h                    |  145 ++
 fs/ocfs2/journal.c                  | 1652 ++++++++++++++++++++
 fs/ocfs2/journal.h                  |  457 ++++++
 fs/ocfs2/localalloc.c               |  983 ++++++++++++
 fs/ocfs2/localalloc.h               |   56 +
 fs/ocfs2/mmap.c                     |  102 ++
 fs/ocfs2/mmap.h                     |    6 +
 fs/ocfs2/namei.c                    | 2264 +++++++++++++++++++++++++++
 fs/ocfs2/namei.h                    |   58 +
 fs/ocfs2/ocfs1_fs_compat.h          |  109 ++
 fs/ocfs2/ocfs2.h                    |  464 ++++++
 fs/ocfs2/ocfs2_fs.h                 |  638 ++++++++
 fs/ocfs2/ocfs2_lockid.h             |   73 +
 fs/ocfs2/slot_map.c                 |  303 ++++
 fs/ocfs2/slot_map.h                 |   66 +
 fs/ocfs2/suballoc.c                 | 1651 ++++++++++++++++++++
 fs/ocfs2/suballoc.h                 |  132 ++
 fs/ocfs2/super.c                    | 1733 +++++++++++++++++++++
 fs/ocfs2/super.h                    |   44 +
 fs/ocfs2/symlink.c                  |  180 +++
 fs/ocfs2/symlink.h                  |   42 +
 fs/ocfs2/sysfile.c                  |  131 ++
 fs/ocfs2/sysfile.h                  |   33 +
 fs/ocfs2/uptodate.c                 |  544 +++++++
 fs/ocfs2/uptodate.h                 |   44 +
 fs/ocfs2/ver.c                      |   43 +
 fs/ocfs2/ver.h                      |   31 +
 fs/ocfs2/vote.c                     | 1202 +++++++++++++++
 fs/ocfs2/vote.h                     |   56 +
 55 files changed, 24504 insertions(+)
 create mode 100644 Documentation/filesystems/ocfs2.txt
 create mode 100644 fs/ocfs2/Makefile
 create mode 100644 fs/ocfs2/alloc.c
 create mode 100644 fs/ocfs2/alloc.h
 create mode 100644 fs/ocfs2/aops.c
 create mode 100644 fs/ocfs2/aops.h
 create mode 100644 fs/ocfs2/buffer_head_io.c
 create mode 100644 fs/ocfs2/buffer_head_io.h
 create mode 100644 fs/ocfs2/dcache.c
 create mode 100644 fs/ocfs2/dcache.h
 create mode 100644 fs/ocfs2/dir.c
 create mode 100644 fs/ocfs2/dir.h
 create mode 100644 fs/ocfs2/dlmglue.c
 create mode 100644 fs/ocfs2/dlmglue.h
 create mode 100644 fs/ocfs2/endian.h
 create mode 100644 fs/ocfs2/export.c
 create mode 100644 fs/ocfs2/export.h
 create mode 100644 fs/ocfs2/extent_map.c
 create mode 100644 fs/ocfs2/extent_map.h
 create mode 100644 fs/ocfs2/file.c
 create mode 100644 fs/ocfs2/file.h
 create mode 100644 fs/ocfs2/heartbeat.c
 create mode 100644 fs/ocfs2/heartbeat.h
 create mode 100644 fs/ocfs2/inode.c
 create mode 100644 fs/ocfs2/inode.h
 create mode 100644 fs/ocfs2/journal.c
 create mode 100644 fs/ocfs2/journal.h
 create mode 100644 fs/ocfs2/localalloc.c
 create mode 100644 fs/ocfs2/localalloc.h
 create mode 100644 fs/ocfs2/mmap.c
 create mode 100644 fs/ocfs2/mmap.h
 create mode 100644 fs/ocfs2/namei.c
 create mode 100644 fs/ocfs2/namei.h
 create mode 100644 fs/ocfs2/ocfs1_fs_compat.h
 create mode 100644 fs/ocfs2/ocfs2.h
 create mode 100644 fs/ocfs2/ocfs2_fs.h
 create mode 100644 fs/ocfs2/ocfs2_lockid.h
 create mode 100644 fs/ocfs2/slot_map.c
 create mode 100644 fs/ocfs2/slot_map.h
 create mode 100644 fs/ocfs2/suballoc.c
 create mode 100644 fs/ocfs2/suballoc.h
 create mode 100644 fs/ocfs2/super.c
 create mode 100644 fs/ocfs2/super.h
 create mode 100644 fs/ocfs2/symlink.c
 create mode 100644 fs/ocfs2/symlink.h
 create mode 100644 fs/ocfs2/sysfile.c
 create mode 100644 fs/ocfs2/sysfile.h
 create mode 100644 fs/ocfs2/uptodate.c
 create mode 100644 fs/ocfs2/uptodate.h
 create mode 100644 fs/ocfs2/ver.c
 create mode 100644 fs/ocfs2/ver.h
 create mode 100644 fs/ocfs2/vote.c
 create mode 100644 fs/ocfs2/vote.h

diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index d9b0a06..2580ada 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -36,6 +36,8 @@ ntfs.txt
 	- info and mount options for the NTFS filesystem (Windows NT).
 proc.txt
 	- info on Linux's /proc filesystem.
+ocfs2.txt
+	- info and mount options for the OCFS2 clustered filesystem.
 romfs.txt
 	- Description of the ROMFS filesystem.
 smbfs.txt
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
new file mode 100644
index 0000000..f2595ca
--- /dev/null
+++ b/Documentation/filesystems/ocfs2.txt
@@ -0,0 +1,55 @@
+OCFS2 filesystem
+==================
+OCFS2 is a general purpose extent based shared disk cluster file
+system with many similarities to ext3. It supports 64 bit inode
+numbers, and has automatically extending metadata groups which may
+also make it attractive for non-clustered use.
+
+You'll want to install the ocfs2-tools package in order to at least
+get "mount.ocfs2" and "ocfs2_hb_ctl".
+
+Project web page:    http://oss.oracle.com/projects/ocfs2
+Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+All code copyright 2005 Oracle except when otherwise noted.
+
+CREDITS:
+Lots of code taken from ext3 and other projects.
+
+Authors in alphabetical order:
+Joel Becker   <joel.becker@oracle.com>
+Zach Brown    <zach.brown@oracle.com>
+Mark Fasheh   <mark.fasheh@oracle.com>
+Kurt Hackel   <kurt.hackel@oracle.com>
+Sunil Mushran <sunil.mushran@oracle.com>
+Manish Singh  <manish.singh@oracle.com>
+
+Caveats
+=======
+Features which OCFS2 does not support yet:
+	- sparse files
+	- extended attributes
+	- shared writeable mmap
+	- loopback is supported, but data written will not
+	  be cluster coherent.
+	- quotas
+	- cluster aware flock
+	- Directory change notification (F_NOTIFY)
+	- Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
+	- POSIX ACLs
+	- readpages / writepages (not user visible)
+
+Mount options
+=============
+
+OCFS2 supports the following mount options:
+(*) == default
+
+barrier=1		This enables/disables barriers. barrier=0 disables it,
+			barrier=1 enables it.
+errors=remount-ro(*)	Remount the filesystem read-only on an error.
+errors=panic		Panic and halt the machine if an error occurs.
+intr		(*)	Allow signals to interrupt cluster operations.
+nointr			Do not allow signals to interrupt cluster
+			operations.
diff --git a/MAINTAINERS b/MAINTAINERS
index 86ee06f..1588830 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1905,6 +1905,15 @@ M:	ajoshi@shell.unixbox.com
 L:	linux-nvidia@lists.surfsouth.com
 S:	Maintained
 
+ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
+P:	Mark Fasheh
+M:	mark.fasheh@oracle.com
+P:	Kurt Hackel
+M:	kurt.hackel@oracle.com
+L:	ocfs2-devel@oss.oracle.com
+W:	http://oss.oracle.com/projects/ocfs2/
+S:	Supported	
+
 OLYMPIC NETWORK DRIVER
 P:	Peter De Shrijver
 M:	p2@ace.ulyssis.student.kuleuven.ac.be
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
new file mode 100644
index 0000000..7d3be84
--- /dev/null
+++ b/fs/ocfs2/Makefile
@@ -0,0 +1,33 @@
+EXTRA_CFLAGS += -Ifs/ocfs2
+
+EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
+
+obj-$(CONFIG_OCFS2_FS) += ocfs2.o
+
+ocfs2-objs := \
+	alloc.o 		\
+	aops.o 			\
+	buffer_head_io.o	\
+	dcache.o 		\
+	dir.o 			\
+	dlmglue.o 		\
+	export.o 		\
+	extent_map.o 		\
+	file.o 			\
+	heartbeat.o 		\
+	inode.o 		\
+	journal.o 		\
+	localalloc.o 		\
+	mmap.o 			\
+	namei.o 		\
+	slot_map.o 		\
+	suballoc.o 		\
+	super.o 		\
+	symlink.o 		\
+	sysfile.o 		\
+	uptodate.o		\
+	ver.o 			\
+	vote.o
+
+obj-$(CONFIG_OCFS2_FS) += cluster/
+obj-$(CONFIG_OCFS2_FS) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
new file mode 100644
index 0000000..465f797
--- /dev/null
+++ b/fs/ocfs2/alloc.c
@@ -0,0 +1,2040 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * alloc.c
+ *
+ * Extent allocs and frees
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_DISK_ALLOC
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "suballoc.h"
+#include "sysfile.h"
+#include "file.h"
+#include "super.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+static int ocfs2_extent_contig(struct inode *inode,
+			       struct ocfs2_extent_rec *ext,
+			       u64 blkno);
+
+static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     struct inode *inode,
+				     int wanted,
+				     struct ocfs2_alloc_context *meta_ac,
+				     struct buffer_head *bhs[]);
+
+static int ocfs2_add_branch(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct inode *inode,
+			    struct buffer_head *fe_bh,
+			    struct buffer_head *eb_bh,
+			    struct buffer_head *last_eb_bh,
+			    struct ocfs2_alloc_context *meta_ac);
+
+static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
+				  struct ocfs2_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  struct ocfs2_alloc_context *meta_ac,
+				  struct buffer_head **ret_new_eb_bh);
+
+static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
+				  struct ocfs2_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  u64 blkno,
+				  u32 new_clusters);
+
+static int ocfs2_find_branch_target(struct ocfs2_super *osb,
+				    struct inode *inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head **target_bh);
+
+static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
+				       struct inode *inode,
+				       struct ocfs2_dinode *fe,
+				       unsigned int new_i_clusters,
+				       struct buffer_head *old_last_eb,
+				       struct buffer_head **new_last_eb);
+
+static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
+
+static int ocfs2_extent_contig(struct inode *inode,
+			       struct ocfs2_extent_rec *ext,
+			       u64 blkno)
+{
+	return blkno == (le64_to_cpu(ext->e_blkno) +
+			 ocfs2_clusters_to_blocks(inode->i_sb,
+						  le32_to_cpu(ext->e_clusters)));
+}
+
+/*
+ * How many free extents have we got before we need more meta data?
+ */
+int ocfs2_num_free_extents(struct ocfs2_super *osb,
+			   struct inode *inode,
+			   struct ocfs2_dinode *fe)
+{
+	int retval;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_block *eb;
+	struct buffer_head *eb_bh = NULL;
+
+	mlog_entry_void();
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+		retval = -EIO;
+		goto bail;
+	}
+
+	if (fe->i_last_eb_blk) {
+		retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+					  &eb_bh, OCFS2_BH_CACHED, inode);
+		if (retval < 0) {
+			mlog_errno(retval);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+	} else
+		el = &fe->id2.i_list;
+
+	BUG_ON(el->l_tree_depth != 0);
+
+	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
+bail:
+	if (eb_bh)
+		brelse(eb_bh);
+
+	mlog_exit(retval);
+	return retval;
+}
+
+/* expects array to already be allocated
+ *
+ * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
+ * l_count for you
+ */
+static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     struct inode *inode,
+				     int wanted,
+				     struct ocfs2_alloc_context *meta_ac,
+				     struct buffer_head *bhs[])
+{
+	int count, status, i;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 first_blkno;
+	struct ocfs2_extent_block *eb;
+
+	mlog_entry_void();
+
+	count = 0;
+	while (count < wanted) {
+		status = ocfs2_claim_metadata(osb,
+					      handle,
+					      meta_ac,
+					      wanted - count,
+					      &suballoc_bit_start,
+					      &num_got,
+					      &first_blkno);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		for(i = count;  i < (num_got + count); i++) {
+			bhs[i] = sb_getblk(osb->sb, first_blkno);
+			if (bhs[i] == NULL) {
+				status = -EIO;
+				mlog_errno(status);
+				goto bail;
+			}
+			ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
+
+			status = ocfs2_journal_access(handle, inode, bhs[i],
+						      OCFS2_JOURNAL_ACCESS_CREATE);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+
+			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
+			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
+			/* Ok, setup the minimal stuff here. */
+			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
+			eb->h_blkno = cpu_to_le64(first_blkno);
+			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
+
+#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
+			/* we always use slot zero's suballocator */
+			eb->h_suballoc_slot = 0;
+#else
+			eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
+#endif
+			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+			eb->h_list.l_count =
+				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
+
+			suballoc_bit_start++;
+			first_blkno++;
+
+			/* We'll also be dirtied by the caller, so
+			 * this isn't absolutely necessary. */
+			status = ocfs2_journal_dirty(handle, bhs[i]);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+
+		count += num_got;
+	}
+
+	status = 0;
+bail:
+	if (status < 0) {
+		for(i = 0; i < wanted; i++) {
+			if (bhs[i])
+				brelse(bhs[i]);
+			bhs[i] = NULL;
+		}
+	}
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Add an entire tree branch to our inode. eb_bh is the extent block
+ * to start at, if we don't want to start the branch at the dinode
+ * structure.
+ *
+ * last_eb_bh is required as we have to update its next_leaf pointer
+ * for the new last extent block.
+ *
+ * the new branch will be 'empty' in the sense that every block will
+ * contain a single record with e_clusters == 0.
+ */
+static int ocfs2_add_branch(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct inode *inode,
+			    struct buffer_head *fe_bh,
+			    struct buffer_head *eb_bh,
+			    struct buffer_head *last_eb_bh,
+			    struct ocfs2_alloc_context *meta_ac)
+{
+	int status, new_blocks, i;
+	u64 next_blkno, new_last_eb_blk;
+	struct buffer_head *bh;
+	struct buffer_head **new_eb_bhs = NULL;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *eb_el;
+	struct ocfs2_extent_list  *el;
+
+	mlog_entry_void();
+
+	BUG_ON(!last_eb_bh);
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	if (eb_bh) {
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+	} else
+		el = &fe->id2.i_list;
+
+	/* we never add a branch to a leaf. */
+	BUG_ON(!el->l_tree_depth);
+
+	new_blocks = le16_to_cpu(el->l_tree_depth);
+
+	/* allocate the number of new eb blocks we need */
+	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
+			     GFP_KERNEL);
+	if (!new_eb_bhs) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
+					   meta_ac, new_eb_bhs);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
+	 * linked with the rest of the tree.
+	 * conversely, new_eb_bhs[0] is the new bottommost leaf.
+	 *
+	 * when we leave the loop, new_last_eb_blk will point to the
+	 * newest leaf, and next_blkno will point to the topmost extent
+	 * block. */
+	next_blkno = new_last_eb_blk = 0;
+	for(i = 0; i < new_blocks; i++) {
+		bh = new_eb_bhs[i];
+		eb = (struct ocfs2_extent_block *) bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+		eb_el = &eb->h_list;
+
+		status = ocfs2_journal_access(handle, inode, bh,
+					      OCFS2_JOURNAL_ACCESS_CREATE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		eb->h_next_leaf_blk = 0;
+		eb_el->l_tree_depth = cpu_to_le16(i);
+		eb_el->l_next_free_rec = cpu_to_le16(1);
+		eb_el->l_recs[0].e_cpos = fe->i_clusters;
+		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
+		eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
+		if (!eb_el->l_tree_depth)
+			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
+
+		status = ocfs2_journal_dirty(handle, bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		next_blkno = le64_to_cpu(eb->h_blkno);
+	}
+
+	/* This is a bit hairy. We want to update up to three blocks
+	 * here without leaving any of them in an inconsistent state
+	 * in case of error. We don't have to worry about
+	 * journal_dirty erroring as it won't unless we've aborted the
+	 * handle (in which case we would never be here) so reserving
+	 * the write with journal_access is all we need to do. */
+	status = ocfs2_journal_access(handle, inode, last_eb_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	if (eb_bh) {
+		status = ocfs2_journal_access(handle, inode, eb_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* Link the new branch into the rest of the tree (el will
+	 * either be on the fe, or the extent block passed in). */
+	i = le16_to_cpu(el->l_next_free_rec);
+	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
+	el->l_recs[i].e_cpos = fe->i_clusters;
+	el->l_recs[i].e_clusters = 0;
+	le16_add_cpu(&el->l_next_free_rec, 1);
+
+	/* fe needs a new last extent block pointer, as does the
+	 * next_leaf on the previously last-extent-block. */
+	fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
+
+	eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
+
+	status = ocfs2_journal_dirty(handle, last_eb_bh);
+	if (status < 0)
+		mlog_errno(status);
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0)
+		mlog_errno(status);
+	if (eb_bh) {
+		status = ocfs2_journal_dirty(handle, eb_bh);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+	status = 0;
+bail:
+	if (new_eb_bhs) {
+		for (i = 0; i < new_blocks; i++)
+			if (new_eb_bhs[i])
+				brelse(new_eb_bhs[i]);
+		kfree(new_eb_bhs);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * adds another level to the allocation tree.
+ * returns back the new extent block so you can add a branch to it
+ * after this call.
+ */
+static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
+				  struct ocfs2_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  struct ocfs2_alloc_context *meta_ac,
+				  struct buffer_head **ret_new_eb_bh)
+{
+	int status, i;
+	struct buffer_head *new_eb_bh = NULL;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *fe_el;
+	struct ocfs2_extent_list  *eb_el;
+
+	mlog_entry_void();
+
+	status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
+					   &new_eb_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
+	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+		status = -EIO;
+		goto bail;
+	}
+
+	eb_el = &eb->h_list;
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	fe_el = &fe->id2.i_list;
+
+	status = ocfs2_journal_access(handle, inode, new_eb_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* copy the fe data into the new extent block */
+	eb_el->l_tree_depth = fe_el->l_tree_depth;
+	eb_el->l_next_free_rec = fe_el->l_next_free_rec;
+	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
+		eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
+		eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
+		eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
+	}
+
+	status = ocfs2_journal_dirty(handle, new_eb_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* update fe now */
+	le16_add_cpu(&fe_el->l_tree_depth, 1);
+	fe_el->l_recs[0].e_cpos = 0;
+	fe_el->l_recs[0].e_blkno = eb->h_blkno;
+	fe_el->l_recs[0].e_clusters = fe->i_clusters;
+	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
+		fe_el->l_recs[i].e_cpos = 0;
+		fe_el->l_recs[i].e_clusters = 0;
+		fe_el->l_recs[i].e_blkno = 0;
+	}
+	fe_el->l_next_free_rec = cpu_to_le16(1);
+
+	/* If this is our 1st tree depth shift, then last_eb_blk
+	 * becomes the allocated extent block */
+	if (fe_el->l_tree_depth == cpu_to_le16(1))
+		fe->i_last_eb_blk = eb->h_blkno;
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*ret_new_eb_bh = new_eb_bh;
+	new_eb_bh = NULL;
+	status = 0;
+bail:
+	if (new_eb_bh)
+		brelse(new_eb_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Expects the tree to already have room in the rightmost leaf for the
+ * extent.  Updates all the extent blocks (and the dinode) on the way
+ * down.
+ */
+static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
+				  struct ocfs2_journal_handle *handle,
+				  struct inode *inode,
+				  struct buffer_head *fe_bh,
+				  u64 start_blk,
+				  u32 new_clusters)
+{
+	int status, i, num_bhs = 0;
+	u64 next_blkno;
+	u16 next_free;
+	struct buffer_head **eb_bhs = NULL;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *el;
+
+	mlog_entry_void();
+
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	el = &fe->id2.i_list;
+	if (el->l_tree_depth) {
+		/* This is another operation where we want to be
+		 * careful about our tree updates. An error here means
+		 * none of the previous changes we made should roll
+		 * forward. As a result, we have to record the buffers
+		 * for this part of the tree in an array and reserve a
+		 * journal write to them before making any changes. */
+		num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
+		eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
+				 GFP_KERNEL);
+		if (!eb_bhs) {
+			status = -ENOMEM;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		i = 0;
+		while(el->l_tree_depth) {
+			next_free = le16_to_cpu(el->l_next_free_rec);
+			if (next_free == 0) {
+				ocfs2_error(inode->i_sb,
+					    "Dinode %"MLFu64" has a bad "
+					    "extent list",
+					    OCFS2_I(inode)->ip_blkno);
+				status = -EIO;
+				goto bail;
+			}
+			next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
+
+			BUG_ON(i >= num_bhs);
+			status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
+						  OCFS2_BH_CACHED, inode);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+			eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
+			if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
+								 eb);
+				status = -EIO;
+				goto bail;
+			}
+
+			status = ocfs2_journal_access(handle, inode, eb_bhs[i],
+						      OCFS2_JOURNAL_ACCESS_WRITE);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+
+			el = &eb->h_list;
+			i++;
+			/* When we leave this loop, eb_bhs[num_bhs - 1] will
+			 * hold the bottom-most leaf extent block. */
+		}
+		BUG_ON(el->l_tree_depth);
+
+		el = &fe->id2.i_list;
+		/* If we have tree depth, then the fe update is
+		 * trivial, and we want to switch el out for the
+		 * bottom-most leaf in order to update it with the
+		 * actual extent data below. */
+		next_free = le16_to_cpu(el->l_next_free_rec);
+		if (next_free == 0) {
+			ocfs2_error(inode->i_sb,
+				    "Dinode %"MLFu64" has a bad "
+				    "extent list",
+				    OCFS2_I(inode)->ip_blkno);
+			status = -EIO;
+			goto bail;
+		}
+		le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
+			     new_clusters);
+		/* (num_bhs - 1) to avoid the leaf */
+		for(i = 0; i < (num_bhs - 1); i++) {
+			eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
+			el = &eb->h_list;
+
+			/* finally, make our actual change to the
+			 * intermediate extent blocks. */
+			next_free = le16_to_cpu(el->l_next_free_rec);
+			le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
+				     new_clusters);
+
+			status = ocfs2_journal_dirty(handle, eb_bhs[i]);
+			if (status < 0)
+				mlog_errno(status);
+		}
+		BUG_ON(i != (num_bhs - 1));
+		/* note that the leaf block wasn't touched in
+		 * the loop above */
+		eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
+		el = &eb->h_list;
+		BUG_ON(el->l_tree_depth);
+	}
+
+	/* yay, we can finally add the actual extent now! */
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+	if (le16_to_cpu(el->l_next_free_rec) &&
+	    ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
+		le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
+	} else if (le16_to_cpu(el->l_next_free_rec) &&
+		   (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
+		/* having an empty extent at eof is legal. */
+		if (el->l_recs[i].e_cpos != fe->i_clusters) {
+			ocfs2_error(inode->i_sb,
+				    "Dinode %"MLFu64" trailing extent is bad: "
+				    "cpos (%u) != number of clusters (%u)",
+				    le32_to_cpu(el->l_recs[i].e_cpos),
+				    le32_to_cpu(fe->i_clusters));
+			status = -EIO;
+			goto bail;
+		}
+		el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
+		el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
+	} else {
+		/* No contiguous record, or no empty record at eof, so
+		 * we add a new one. */
+
+		BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
+		       le16_to_cpu(el->l_count));
+		i = le16_to_cpu(el->l_next_free_rec);
+
+		el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
+		el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
+		el->l_recs[i].e_cpos = fe->i_clusters;
+		le16_add_cpu(&el->l_next_free_rec, 1);
+	}
+
+	/*
+	 * extent_map errors are not fatal, so they are ignored outside
+	 * of flushing the thing.
+	 */
+	status = ocfs2_extent_map_append(inode, &el->l_recs[i],
+					 new_clusters);
+	if (status) {
+		mlog_errno(status);
+		ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
+	}
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0)
+		mlog_errno(status);
+	if (fe->id2.i_list.l_tree_depth) {
+		status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+	status = 0;
+bail:
+	if (eb_bhs) {
+		for (i = 0; i < num_bhs; i++)
+			if (eb_bhs[i])
+				brelse(eb_bhs[i]);
+		kfree(eb_bhs);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Should only be called when there is no space left in any of the
+ * leaf nodes. What we want to do is find the lowest tree depth
+ * non-leaf extent block with room for new records. There are three
+ * valid results of this search:
+ *
+ * 1) a lowest extent block is found, then we pass it back in
+ *    *target_bh and return '0'. The buffer carries its own reference
+ *    (taken with get_bh), which the caller is expected to drop.
+ *
+ * 2) the search fails to find anything, but the dinode has room. We
+ *    pass NULL back in *target_bh, but still return '0'
+ *
+ * 3) the search fails to find anything AND the dinode is full, in
+ *    which case we return > 0 (and *target_bh is NULL)
+ *
+ * return status < 0 indicates an error; *target_bh is left NULL and
+ * any candidate buffer found before the failure is released here.
+ */
+static int ocfs2_find_branch_target(struct ocfs2_super *osb,
+				    struct inode *inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head **target_bh)
+{
+	int status = 0, i;
+	u64 blkno;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *el;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *lowest_bh = NULL;
+
+	mlog_entry_void();
+
+	*target_bh = NULL;
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	el = &fe->id2.i_list;
+
+	/* Follow the last used record of each list downwards, stopping
+	 * once the current list has tree depth 1 or less. */
+	while(le16_to_cpu(el->l_tree_depth) > 1) {
+		if (le16_to_cpu(el->l_next_free_rec) == 0) {
+			ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has empty "
+				    "extent list (next_free_rec == 0)",
+				    OCFS2_I(inode)->ip_blkno);
+			status = -EIO;
+			goto bail;
+		}
+		i = le16_to_cpu(el->l_next_free_rec) - 1;
+		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
+		if (!blkno) {
+			ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has extent "
+				    "list where extent # %d has no physical "
+				    "block start",
+				    OCFS2_I(inode)->ip_blkno, i);
+			status = -EIO;
+			goto bail;
+		}
+
+		/* Drop the block read on the previous iteration. */
+		if (bh) {
+			brelse(bh);
+			bh = NULL;
+		}
+
+		status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
+					  inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		eb = (struct ocfs2_extent_block *) bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+		el = &eb->h_list;
+
+		/* This block has a free record; it replaces any candidate
+		 * found higher up. Keep a separate reference on it because
+		 * 'bh' is released as the walk continues. */
+		if (le16_to_cpu(el->l_next_free_rec) <
+		    le16_to_cpu(el->l_count)) {
+			if (lowest_bh)
+				brelse(lowest_bh);
+			lowest_bh = bh;
+			get_bh(lowest_bh);
+		}
+	}
+
+	/* If we didn't find one and the fe doesn't have any room,
+	 * then return '1' */
+	if (!lowest_bh
+	    && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
+		status = 1;
+
+	*target_bh = lowest_bh;
+bail:
+	/* On error the candidate is not handed to the caller, so the
+	 * reference taken with get_bh() above has to be dropped here. */
+	if (status < 0 && lowest_bh)
+		brelse(lowest_bh);
+
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Add an extent of new_clusters clusters starting at disk block
+ * start_blk to the end of the inode's allocation tree.
+ *
+ * If the rightmost leaf list cannot take the extent directly, the tree
+ * is grown first: ocfs2_find_branch_target() picks the block to hang a
+ * new branch from, ocfs2_shift_tree_depth() adds a level when nothing
+ * has room, and ocfs2_add_branch() builds the new branch. meta_ac is
+ * passed through to those two helpers. The record itself is written by
+ * ocfs2_do_insert_extent().
+ *
+ * the caller needs to update fe->i_clusters
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int ocfs2_insert_extent(struct ocfs2_super *osb,
+			struct ocfs2_journal_handle *handle,
+			struct inode *inode,
+			struct buffer_head *fe_bh,
+			u64 start_blk,
+			u32 new_clusters,
+			struct ocfs2_alloc_context *meta_ac)
+{
+	int status, i, shift;
+	struct buffer_head *last_eb_bh = NULL;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list  *el;
+
+	mlog_entry_void();
+
+	mlog(0, "add %u clusters starting at block %"MLFu64" to "
+		"inode %"MLFu64"\n",
+	     new_clusters, start_blk, OCFS2_I(inode)->ip_blkno);
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	el = &fe->id2.i_list;
+
+	if (el->l_tree_depth) {
+		/* jump to end of tree */
+		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+					  &last_eb_bh, OCFS2_BH_CACHED, inode);
+		if (status < 0) {
+			/* Log the error here; the single mlog_exit() for
+			 * this function lives at the bail label. */
+			mlog_errno(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+		/* Same sanity check the other extent block readers in
+		 * this file apply before trusting the list. */
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+		el = &eb->h_list;
+	}
+
+	/* Can we allocate without adding/shifting tree bits? Yes if the
+	 * list is empty, has a free record, ends in an empty record, or
+	 * the new extent is contiguous with the last record. */
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+	if (le16_to_cpu(el->l_next_free_rec) == 0
+	    || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
+	    || le32_to_cpu(el->l_recs[i].e_clusters) == 0
+	    || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
+		goto out_add;
+
+	mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
+	     "tree now.\n");
+
+	shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
+	if (shift < 0) {
+		status = shift;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* We traveled all the way to the bottom of the allocation tree
+	 * and didn't find room for any more extents - we need to add
+	 * another tree level */
+	if (shift) {
+		/* if we hit a leaf, it had better be full */
+		BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
+		       le16_to_cpu(el->l_count));
+		BUG_ON(bh);
+		mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
+		     "(current = %u)\n",
+		     le16_to_cpu(fe->id2.i_list.l_tree_depth));
+
+		/* ocfs2_shift_tree_depth will return us a buffer with
+		 * the new extent block (so we can pass that to
+		 * ocfs2_add_branch). */
+		status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
+						meta_ac, &bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		/* Special case: we have room now if we shifted from
+		 * tree_depth 0 */
+		if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
+			goto out_add;
+	}
+
+	/* call ocfs2_add_branch to add the final part of the tree with
+	 * the new data. */
+	mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh);
+	status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
+				  meta_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+out_add:
+	/* Finally, we can add clusters. */
+	status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
+					start_blk, new_clusters);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	if (bh)
+		brelse(bh);
+
+	if (last_eb_bh)
+		brelse(last_eb_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Returns nonzero when every record of this node's truncate log is in
+ * use (tl_used == tl_count), i.e. nothing can be appended until the log
+ * is flushed. A log claiming more used records than it has is treated
+ * as a bug.
+ */
+static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
+{
+	struct ocfs2_dinode *tl_di;
+	struct ocfs2_truncate_log *tl;
+
+	tl_di = (struct ocfs2_dinode *) osb->osb_tl_bh->b_data;
+	tl = &tl_di->id2.i_dealloc;
+
+	mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
+			"slot %d, invalid truncate log parameters: used = "
+			"%u, count = %u\n", osb->slot_num,
+			le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
+
+	if (le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Returns nonzero when a range starting at cluster new_start begins
+ * exactly where the last used truncate record ends, so the two can be
+ * merged into one record. An empty log never coalesces.
+ */
+static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
+					   unsigned int new_start)
+{
+	unsigned int used = le16_to_cpu(tl->tl_used);
+	unsigned int tail_end;
+	struct ocfs2_truncate_rec *tail;
+
+	if (used == 0)
+		return 0;
+
+	/* One past the final cluster covered by the last record. */
+	tail = &tl->tl_recs[used - 1];
+	tail_end = le32_to_cpu(tail->t_start);
+	tail_end += le32_to_cpu(tail->t_clusters);
+
+	return tail_end == new_start;
+}
+
+/*
+ * Record num_clusters clusters, beginning at disk block start_blk, in
+ * this node's truncate log under the given journal handle. The range is
+ * merged into the last record when it directly follows it, otherwise it
+ * takes the next free record.
+ *
+ * The truncate log inode's i_sem must already be held. Returns 0 on
+ * success, -EIO for an invalid log dinode, -ENOSPC when the log is
+ * full, or the error from the journal helpers.
+ */
+static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     u64 start_blk,
+				     unsigned int num_clusters)
+{
+	int status, slot;
+	unsigned int first_cluster, tl_count;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+
+	mlog_entry("start_blk = %"MLFu64", num_clusters = %u\n", start_blk,
+		   num_clusters);
+
+	/* Being able to take the semaphore means the caller did not. */
+	BUG_ON(!down_trylock(&tl_inode->i_sem));
+
+	first_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
+		status = -EIO;
+		goto bail;
+	}
+
+	tl_count = le16_to_cpu(tl->tl_count);
+	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
+			tl_count == 0,
+			"Truncate record count on #%"MLFu64" invalid ("
+			"wanted %u, actual %u\n", OCFS2_I(tl_inode)->ip_blkno,
+			ocfs2_truncate_recs_per_inode(osb->sb),
+			le16_to_cpu(tl->tl_count));
+
+	/* A full log should have been flushed by the caller beforehand. */
+	slot = le16_to_cpu(tl->tl_used);
+	if (slot >= tl_count) {
+		status = -ENOSPC;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_access(handle, tl_inode, tl_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "Log truncate of %u clusters starting at cluster %u to "
+	     "%"MLFu64" (index = %d)\n", num_clusters, first_cluster,
+	     OCFS2_I(tl_inode)->ip_blkno, slot);
+
+	if (!ocfs2_truncate_log_can_coalesce(tl, first_cluster)) {
+		/* Open a new record in the first unused slot. */
+		tl->tl_recs[slot].t_start = cpu_to_le32(first_cluster);
+		tl->tl_used = cpu_to_le16(slot + 1);
+	} else {
+		/*
+		 * Step back to the record being extended; a successful
+		 * coalesce check means at least one record is in use.
+		 */
+		slot--;
+
+		num_clusters += le32_to_cpu(tl->tl_recs[slot].t_clusters);
+		mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
+		     slot, le32_to_cpu(tl->tl_recs[slot].t_start),
+		     num_clusters);
+	}
+	tl->tl_recs[slot].t_clusters = cpu_to_le32(num_clusters);
+
+	status = ocfs2_journal_dirty(handle, tl_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Walk this node's truncate log from its last used record down to
+ * record 0. For each record, tl_used is first lowered to drop the
+ * record from the log, the transaction is extended, and then the
+ * record's clusters are freed through ocfs2_free_clusters() against
+ * data_alloc_inode / data_alloc_bh.
+ *
+ * Stops at the first failure and returns that negative errno; returns
+ * 0 when every record was processed.
+ */
+static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
+					 struct ocfs2_journal_handle *handle,
+					 struct inode *data_alloc_inode,
+					 struct buffer_head *data_alloc_bh)
+{
+	int status = 0;
+	int i;
+	unsigned int nr_clusters;
+	u64 first_blk;
+	struct ocfs2_truncate_rec cur;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) tl_bh->b_data;
+	struct ocfs2_truncate_log *tl = &di->id2.i_dealloc;
+
+	mlog_entry_void();
+
+	for (i = le16_to_cpu(tl->tl_used) - 1; i >= 0; i--) {
+		/* Caller has given us at least enough credits to
+		 * update the truncate log dinode */
+		status = ocfs2_journal_access(handle, tl_inode, tl_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		/* Record i is no longer counted as used. */
+		tl->tl_used = cpu_to_le16(i);
+
+		status = ocfs2_journal_dirty(handle, tl_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		/* TODO: Perhaps we can calculate the bulk of the
+		 * credits up front rather than extending like
+		 * this. */
+		status = ocfs2_extend_trans(handle,
+					    OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		cur = tl->tl_recs[i];
+		first_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
+						     le32_to_cpu(cur.t_start));
+		nr_clusters = le32_to_cpu(cur.t_clusters);
+
+		/* A record without a start block is ignored as invalid. */
+		if (!first_blk)
+			continue;
+
+		mlog(0, "free record %d, start = %u, clusters = %u\n",
+		     i, le32_to_cpu(cur.t_start), nr_clusters);
+
+		status = ocfs2_free_clusters(handle, data_alloc_inode,
+					     data_alloc_bh, first_blk,
+					     nr_clusters);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Flush this node's truncate log: every logged cluster range is handed
+ * to ocfs2_replay_truncate_records() to be freed against the global
+ * bitmap inode, under one journal handle.
+ *
+ * Expects you to already be holding tl_inode->i_sem (asserted by the
+ * BUG_ON(!down_trylock()) below).
+ *
+ * Steps: validate the truncate log dinode (-EIO if bad), return 0
+ * early when tl_used is zero, allocate a handle, look up
+ * GLOBAL_BITMAP_SYSTEM_INODE, take its meta lock through the handle,
+ * start an OCFS2_TRUNCATE_LOG_UPDATE transaction and replay the
+ * records.
+ *
+ * Cleanup at bail: the handle is passed to ocfs2_commit_trans()
+ * whenever it is non-NULL, which includes failures that happen after
+ * ocfs2_alloc_handle() succeeded but before ocfs2_start_trans() did.
+ * NOTE(review): presumably ocfs2_commit_trans() copes with a handle
+ * that was never started -- confirm in the journal code.
+ *
+ * Returns 0 on success or a negative errno (-EIO, -ENOMEM, -EINVAL or
+ * whatever the helpers returned).
+ */
+static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
+{
+	int status;
+	unsigned int num_to_flush;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct inode *data_alloc_inode = NULL;
+	struct buffer_head *tl_bh = osb->osb_tl_bh;
+	struct buffer_head *data_alloc_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_truncate_log *tl;
+
+	mlog_entry_void();
+
+	BUG_ON(!down_trylock(&tl_inode->i_sem));
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
+		status = -EIO;
+		goto bail;
+	}
+
+	num_to_flush = le16_to_cpu(tl->tl_used);
+	mlog(0, "Flush %u records from truncate log #%"MLFu64"\n",
+	     num_to_flush, OCFS2_I(tl_inode)->ip_blkno);
+	if (!num_to_flush) {
+		status = 0;
+		goto bail;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	data_alloc_inode = ocfs2_get_system_file_inode(osb,
+						       GLOBAL_BITMAP_SYSTEM_INODE,
+						       OCFS2_INVALID_SLOT);
+	if (!data_alloc_inode) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Could not get bitmap inode!\n");
+		goto bail;
+	}
+
+	ocfs2_handle_add_inode(handle, data_alloc_inode);
+	status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_TRUNCATE_LOG_UPDATE);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
+					       data_alloc_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (data_alloc_inode)
+		iput(data_alloc_inode);
+
+	if (data_alloc_bh)
+		brelse(data_alloc_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Locked entry point for flushing the truncate log: takes the truncate
+ * log inode's i_sem around __ocfs2_flush_truncate_log() and returns
+ * its result.
+ */
+int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
+{
+	struct inode *log_inode = osb->osb_tl_inode;
+	int ret;
+
+	down(&log_inode->i_sem);
+	ret = __ocfs2_flush_truncate_log(osb);
+	up(&log_inode->i_sem);
+
+	return ret;
+}
+
+/*
+ * Work item body: flush the truncate log of the ocfs2_super passed in
+ * as 'data'. A failure is only logged, there is nobody to return it to.
+ */
+static void ocfs2_truncate_log_worker(void *data)
+{
+	struct ocfs2_super *osb = data;
+	int status;
+
+	mlog_entry_void();
+
+	status = ocfs2_flush_truncate_log(osb);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+}
+
+/* Delay before a queued truncate log flush runs. */
+#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
+
+/*
+ * Queue the truncate log flush work on ocfs2_wq to run after
+ * OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL. With 'cancel' set, an already
+ * pending flush is cancelled first. Does nothing when no truncate log
+ * inode has been set up.
+ */
+void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
+				       int cancel)
+{
+	if (!osb->osb_tl_inode)
+		return;
+
+	/* We want to push off log flushes while truncates are
+	 * still running. */
+	if (cancel)
+		cancel_delayed_work(&osb->osb_truncate_log_wq);
+
+	queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
+			   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
+}
+
+/*
+ * Look up the truncate log system inode for slot_num and read its
+ * dinode block. On success both are passed back through *tl_inode and
+ * *tl_bh, and the caller takes over the inode and buffer references.
+ * On failure neither output is written.
+ *
+ * Returns -EINVAL when the inode cannot be found, otherwise the result
+ * of ocfs2_read_block().
+ */
+static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
+				       int slot_num,
+				       struct inode **tl_inode,
+				       struct buffer_head **tl_bh)
+{
+	int status;
+	struct inode *log_inode;
+	struct buffer_head *log_bh = NULL;
+
+	log_inode = ocfs2_get_system_file_inode(osb,
+						TRUNCATE_LOG_SYSTEM_INODE,
+						slot_num);
+	if (!log_inode) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Could not get load truncate log inode!\n");
+		goto bail;
+	}
+
+	status = ocfs2_read_block(osb, OCFS2_I(log_inode)->ip_blkno, &log_bh,
+				  OCFS2_BH_CACHED, log_inode);
+	if (status < 0) {
+		/* The inode is not handed out, so give it back. */
+		iput(log_inode);
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*tl_inode = log_inode;
+	*tl_bh    = log_bh;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* Called during the 1st stage of node recovery. We stamp a clean
+ * truncate log for slot_num on disk and pass back a kmalloc'd copy of
+ * the old log dinode for processing later; the caller owns that copy.
+ * If the truncate log does not require processing (tl_used is zero),
+ * or on error, *tl_copy is set to NULL. */
+int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
+				      int slot_num,
+				      struct ocfs2_dinode **tl_copy)
+{
+	int status;
+	struct inode *tl_inode = NULL;
+	struct buffer_head *tl_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_dinode *copy = NULL;
+	struct ocfs2_truncate_log *tl;
+
+	*tl_copy = NULL;
+
+	mlog(0, "recover truncate log from slot %d\n", slot_num);
+
+	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	di = (struct ocfs2_dinode *) tl_bh->b_data;
+	tl = &di->id2.i_dealloc;
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
+		status = -EIO;
+		goto bail;
+	}
+
+	/* Nothing logged, nothing to hand back. */
+	if (!le16_to_cpu(tl->tl_used))
+		goto bail;
+
+	mlog(0, "We'll have %u logs to recover\n",
+	     le16_to_cpu(tl->tl_used));
+
+	copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
+	if (!copy) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* Assuming the write-out below goes well, this copy
+	 * will be passed back to recovery for processing. */
+	memcpy(copy, tl_bh->b_data, tl_bh->b_size);
+
+	/* All we need to do to clear the truncate log is set
+	 * tl_used. */
+	tl->tl_used = 0;
+
+	status = ocfs2_write_block(osb, tl_bh, tl_inode);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	if (tl_inode)
+		iput(tl_inode);
+	if (tl_bh)
+		brelse(tl_bh);
+
+	/* Only a successful run hands the copy to the caller. */
+	if (status < 0)
+		kfree(copy);
+	else
+		*tl_copy = copy;
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Second stage of truncate log recovery: append every record from
+ * tl_copy (a copy of another slot's truncate log dinode) to this
+ * node's own truncate log, one transaction per record, flushing our
+ * log first whenever it is full.
+ *
+ * Takes the local truncate log inode's i_sem for the whole loop.
+ * tl_copy is only read here; it is not freed.
+ *
+ * Returns 0 on success, -EINVAL when tl_copy is this node's own log,
+ * or the first error from the flush / transaction / append helpers.
+ */
+int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
+					 struct ocfs2_dinode *tl_copy)
+{
+	int status = 0;
+	int i;
+	unsigned int clusters, num_recs, start_cluster;
+	u64 start_blk;
+	struct ocfs2_journal_handle *handle;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct ocfs2_truncate_log *tl;
+
+	mlog_entry_void();
+
+	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
+		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
+		/* Leave through bail so mlog_entry/mlog_exit stay paired. */
+		status = -EINVAL;
+		goto bail;
+	}
+
+	tl = &tl_copy->id2.i_dealloc;
+	num_recs = le16_to_cpu(tl->tl_used);
+	/* i_blkno is stored little-endian on disk; convert for printing. */
+	mlog(0, "cleanup %u records from %"MLFu64"\n", num_recs,
+	     le64_to_cpu(tl_copy->i_blkno));
+
+	down(&tl_inode->i_sem);
+	for(i = 0; i < num_recs; i++) {
+		/* Make room in our own log before appending to it. */
+		if (ocfs2_truncate_log_needs_flush(osb)) {
+			status = __ocfs2_flush_truncate_log(osb);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail_up;
+			}
+		}
+
+		handle = ocfs2_start_trans(osb, NULL,
+					   OCFS2_TRUNCATE_LOG_UPDATE);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto bail_up;
+		}
+
+		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
+		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
+		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
+
+		status = ocfs2_truncate_log_append(osb, handle,
+						   start_blk, clusters);
+		/* The handle is committed whether or not the append worked. */
+		ocfs2_commit_trans(handle);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail_up;
+		}
+	}
+
+bail_up:
+	up(&tl_inode->i_sem);
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Tear down the truncate log state on osb: cancel a pending delayed
+ * flush, flush ocfs2_wq, flush the log one last time (a failure is
+ * only logged) and drop the references on the log's buffer and inode.
+ * Does nothing when no truncate log inode was set up.
+ */
+void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
+{
+	int status;
+
+	mlog_entry_void();
+
+	if (!osb->osb_tl_inode)
+		goto out;
+
+	cancel_delayed_work(&osb->osb_truncate_log_wq);
+	flush_workqueue(ocfs2_wq);
+
+	status = ocfs2_flush_truncate_log(osb);
+	if (status < 0)
+		mlog_errno(status);
+
+	brelse(osb->osb_tl_bh);
+	iput(osb->osb_tl_inode);
+
+out:
+	mlog_exit_void();
+}
+
+/*
+ * Set up the truncate log for this node's slot: look up the log inode
+ * and its dinode buffer, initialise the flush work item and store both
+ * pointers in osb. When the lookup fails the error is logged and
+ * returned, and the stored pointers are NULL.
+ */
+int ocfs2_truncate_log_init(struct ocfs2_super *osb)
+{
+	int status;
+	struct inode *log_inode = NULL;
+	struct buffer_head *log_bh = NULL;
+
+	mlog_entry_void();
+
+	status = ocfs2_get_truncate_log_info(osb, osb->slot_num,
+					     &log_inode, &log_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* ocfs2_truncate_log_shutdown keys on the existence of
+	 * osb->osb_tl_inode, and both locals are still NULL here if the
+	 * lookup above failed. */
+	INIT_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker, osb);
+	osb->osb_tl_bh    = log_bh;
+	osb->osb_tl_inode = log_inode;
+
+	mlog_exit(status);
+	return status;
+}
+
+/* This function will figure out whether the currently last extent
+ * block will be deleted, and if it will, what the new last extent
+ * block will be so we can update his h_next_leaf_blk field, as well
+ * as the dinodes i_last_eb_blk
+ *
+ * new_i_clusters is the cluster count the inode will have after the
+ * truncate; old_last_eb is the buffer of the current last extent
+ * block.
+ *
+ * On return *new_last_eb is either NULL (the last extent block does
+ * not change, there is no tree, or this is a truncate to zero) or the
+ * buffer of the new last extent block with an extra reference taken
+ * by get_bh(), which the caller has to release.
+ *
+ * Returns 0 on success, -EIO for an invalid dinode or extent block,
+ * or the error from ocfs2_read_block(). */
+static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
+				       struct inode *inode,
+				       struct ocfs2_dinode *fe,
+				       u32 new_i_clusters,
+				       struct buffer_head *old_last_eb,
+				       struct buffer_head **new_last_eb)
+{
+	int i, status = 0;
+	u64 block = 0;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+	struct buffer_head *bh = NULL;
+
+	*new_last_eb = NULL;
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+		status = -EIO;
+		goto bail;
+	}
+
+	/* we have no tree, so of course, no last_eb. */
+	if (!fe->id2.i_list.l_tree_depth)
+		goto bail;
+
+	/* trunc to zero special case - this makes tree_depth = 0
+	 * regardless of what it is.  */
+	if (!new_i_clusters)
+		goto bail;
+
+	eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
+	el = &(eb->h_list);
+	BUG_ON(!el->l_next_free_rec);
+
+	/* Make sure that this guy will actually be empty after we
+	 * clear away the data. If its first record starts below the
+	 * new cluster count, part of the block survives and it stays
+	 * the last extent block. */
+	if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
+		goto bail;
+
+	/* Ok, at this point, we know that last_eb will definitely
+	 * change, so lets traverse the tree and find the second to
+	 * last extent block. */
+	el = &(fe->id2.i_list);
+	/* go down the tree, at each level following the last record
+	 * whose e_cpos is below new_i_clusters, until a list with
+	 * tree depth 0 has been read. */
+	do {
+		for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
+			if (le32_to_cpu(el->l_recs[i].e_cpos) <
+			    new_i_clusters) {
+				block = le64_to_cpu(el->l_recs[i].e_blkno);
+				break;
+			}
+		}
+		BUG_ON(i < 0);
+
+		if (bh) {
+			brelse(bh);
+			bh = NULL;
+		}
+
+		status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
+					 inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) bh->b_data;
+		el = &eb->h_list;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+	} while (el->l_tree_depth);
+
+	*new_last_eb = bh;
+	get_bh(*new_last_eb);
+	mlog(0, "returning block %"MLFu64"\n", le64_to_cpu(eb->h_blkno));
+bail:
+	if (bh)
+		brelse(bh);
+
+	return status;
+}
+
+/*
+ * Remove clusters_to_del clusters from the end of the inode's
+ * allocation within the transaction 'handle'.
+ *
+ * The last record of the dinode's extent list and of every extent
+ * block below it (following the last record each time) is shrunk by
+ * clusters_to_del. Records that reach zero clusters are cleared, and an
+ * extent block left without records is freed back through the
+ * allocator inode/buffer carried in tc. The freed data range is
+ * recorded with ocfs2_truncate_log_append(). If the truncate changes
+ * which extent block is last, fe->i_last_eb_blk and the new block's
+ * h_next_leaf_blk are updated too.
+ *
+ * old_last_eb_bh is the buffer of the current last extent block; it is
+ * only consulted, not released, here.
+ *
+ * On success the in-memory extent map is truncated to the new cluster
+ * count; on failure it is dropped entirely. Returns 0 or a negative
+ * errno.
+ */
+static int ocfs2_do_truncate(struct ocfs2_super *osb,
+			     unsigned int clusters_to_del,
+			     struct inode *inode,
+			     struct buffer_head *fe_bh,
+			     struct buffer_head *old_last_eb_bh,
+			     struct ocfs2_journal_handle *handle,
+			     struct ocfs2_truncate_context *tc)
+{
+	int status, i, depth;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_block *last_eb = NULL;
+	struct ocfs2_extent_list *el;
+	struct buffer_head *eb_bh = NULL;
+	struct buffer_head *last_eb_bh = NULL;
+	u64 next_eb = 0;
+	u64 delete_blk = 0;
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	/* On success with a non-NULL result we own a reference on
+	 * last_eb_bh, released at bail. */
+	status = ocfs2_find_new_last_ext_blk(osb,
+					     inode,
+					     fe,
+					     le32_to_cpu(fe->i_clusters) -
+							clusters_to_del,
+					     old_last_eb_bh,
+					     &last_eb_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	if (last_eb_bh)
+		last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	el = &(fe->id2.i_list);
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
+				      clusters_to_del;
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	le32_add_cpu(&fe->i_clusters, -clusters_to_del);
+	fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
+	fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
+
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+
+	BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
+	le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
+	/* tree depth zero, we can just delete the clusters, otherwise
+	 * we need to record the offset of the next level extent block
+	 * as we may overwrite it. */
+	if (!el->l_tree_depth)
+		delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
+			+ ocfs2_clusters_to_blocks(osb->sb,
+					le32_to_cpu(el->l_recs[i].e_clusters));
+	else
+		next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
+
+	if (!el->l_recs[i].e_clusters) {
+		/* if we deleted the whole extent record, then clear
+		 * out the other fields and update the extent
+		 * list. For depth > 0 trees, we've already recorded
+		 * the extent block in 'next_eb' */
+		el->l_recs[i].e_cpos = 0;
+		el->l_recs[i].e_blkno = 0;
+		BUG_ON(!el->l_next_free_rec);
+		le16_add_cpu(&el->l_next_free_rec, -1);
+	}
+
+	/* Remember the depth before the trunc-to-zero case resets it;
+	 * the walk below still has to visit every level. */
+	depth = le16_to_cpu(el->l_tree_depth);
+	if (!fe->i_clusters) {
+		/* trunc to zero is a special case. */
+		el->l_tree_depth = 0;
+		fe->i_last_eb_blk = 0;
+	} else if (last_eb)
+		fe->i_last_eb_blk = last_eb->h_blkno;
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (last_eb) {
+		/* If there will be a new last extent block, then by
+		 * definition, there cannot be any leaves to the right of
+		 * him. */
+		status = ocfs2_journal_access(handle, inode, last_eb_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		last_eb->h_next_leaf_blk = 0;
+		status = ocfs2_journal_dirty(handle, last_eb_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* if our tree depth > 0, update all the tree blocks below us. */
+	while (depth) {
+		mlog(0, "traveling tree (depth = %d, next_eb = %"MLFu64")\n",
+		     depth,  next_eb);
+		status = ocfs2_read_block(osb, next_eb, &eb_bh,
+					  OCFS2_BH_CACHED, inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *)eb_bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+		el = &(eb->h_list);
+
+		status = ocfs2_journal_access(handle, inode, eb_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
+		BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));
+
+		i = le16_to_cpu(el->l_next_free_rec) - 1;
+
+		mlog(0, "extent block %"MLFu64", before: record %d: "
+		     "(%u, %u, %"MLFu64"), next = %u\n",
+		     le64_to_cpu(eb->h_blkno), i,
+		     le32_to_cpu(el->l_recs[i].e_cpos),
+		     le32_to_cpu(el->l_recs[i].e_clusters),
+		     le64_to_cpu(el->l_recs[i].e_blkno),
+		     le16_to_cpu(el->l_next_free_rec));
+
+		BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
+		le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
+
+		next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
+		/* bottom-most block requires us to delete data.*/
+		if (!el->l_tree_depth)
+			delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
+				+ ocfs2_clusters_to_blocks(osb->sb,
+					le32_to_cpu(el->l_recs[i].e_clusters));
+		if (!el->l_recs[i].e_clusters) {
+			el->l_recs[i].e_cpos = 0;
+			el->l_recs[i].e_blkno = 0;
+			BUG_ON(!el->l_next_free_rec);
+			le16_add_cpu(&el->l_next_free_rec, -1);
+		}
+		mlog(0, "extent block %"MLFu64", after: record %d: "
+		     "(%u, %u, %"MLFu64"), next = %u\n",
+		     le64_to_cpu(eb->h_blkno), i,
+		     le32_to_cpu(el->l_recs[i].e_cpos),
+		     le32_to_cpu(el->l_recs[i].e_clusters),
+		     le64_to_cpu(el->l_recs[i].e_blkno),
+		     le16_to_cpu(el->l_next_free_rec));
+
+		status = ocfs2_journal_dirty(handle, eb_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (!el->l_next_free_rec) {
+			mlog(0, "deleting this extent block.\n");
+
+			ocfs2_remove_from_cache(inode, eb_bh);
+
+			BUG_ON(eb->h_suballoc_slot);
+			BUG_ON(el->l_recs[0].e_clusters);
+			BUG_ON(el->l_recs[0].e_cpos);
+			BUG_ON(el->l_recs[0].e_blkno);
+			status = ocfs2_free_extent_block(handle,
+							 tc->tc_ext_alloc_inode,
+							 tc->tc_ext_alloc_bh,
+							 eb);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		brelse(eb_bh);
+		eb_bh = NULL;
+		depth--;
+	}
+
+	BUG_ON(!delete_blk);
+	status = ocfs2_truncate_log_append(osb, handle, delete_blk,
+					   clusters_to_del);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	status = 0;
+bail:
+	if (!status)
+		ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
+	else
+		ocfs2_extent_map_drop(inode, 0);
+
+	/* eb_bh is only still set when we bailed out in the middle of
+	 * the tree walk; last_eb_bh holds the reference handed to us by
+	 * ocfs2_find_new_last_ext_blk(). */
+	if (eb_bh)
+		brelse(eb_bh);
+	if (last_eb_bh)
+		brelse(last_eb_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Shrink the inode's allocation down to the number of clusters needed
+ * for its current i_size, one rightmost extent record per pass.
+ *
+ * It is expected, that by the time you call this function,
+ * inode->i_size and fe->i_size have been adjusted.
+ *
+ * Each pass takes the truncate log inode's i_sem, flushes the truncate
+ * log if it is full, starts its own transaction, and lets
+ * ocfs2_do_truncate() remove clusters_to_del clusters; the pass is
+ * repeated (goto start) until fe->i_clusters reaches the target.
+ * ip_alloc_sem is held for writing across all passes. On the way out
+ * a truncate log flush is scheduled.
+ *
+ * Ownership: the buffer in tc->tc_last_eb_bh is taken over (the field
+ * is cleared) and released here.
+ *
+ * A failure of ocfs2_mark_inode_dirty() is only logged; its status is
+ * overwritten by the ocfs2_do_truncate() call that follows.
+ *
+ * Returns 0 on success or a negative errno.
+ *
+ * WARNING: This will kfree the truncate context
+ */
+int ocfs2_commit_truncate(struct ocfs2_super *osb,
+			  struct inode *inode,
+			  struct buffer_head *fe_bh,
+			  struct ocfs2_truncate_context *tc)
+{
+	int status, i, credits, tl_sem = 0;
+	u32 clusters_to_del, target_i_clusters;
+	u64 last_eb = 0;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+	struct buffer_head *last_eb_bh;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct inode *tl_inode = osb->osb_tl_inode;
+
+	mlog_entry_void();
+
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
+						     i_size_read(inode));
+
+	last_eb_bh = tc->tc_last_eb_bh;
+	tc->tc_last_eb_bh = NULL;
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	if (fe->id2.i_list.l_tree_depth) {
+		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+		el = &eb->h_list;
+	} else
+		el = &fe->id2.i_list;
+	last_eb = le64_to_cpu(fe->i_last_eb_blk);
+start:
+	mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, "
+	     "last_eb = %"MLFu64", fe->i_last_eb_blk = %"MLFu64", "
+	     "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
+	     le32_to_cpu(fe->i_clusters), last_eb,
+	     le64_to_cpu(fe->i_last_eb_blk),
+	     le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh);
+
+	if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
+		mlog(0, "last_eb changed!\n");
+		BUG_ON(!fe->id2.i_list.l_tree_depth);
+		last_eb = le64_to_cpu(fe->i_last_eb_blk);
+		/* i_last_eb_blk may have changed, read it if
+		 * necessary. We don't have to worry about the
+		 * truncate to zero case here (where there becomes no
+		 * last_eb) because we never loop back after our work
+		 * is done. */
+		if (last_eb_bh) {
+			brelse(last_eb_bh);
+			last_eb_bh = NULL;
+		}
+
+		status = ocfs2_read_block(osb, last_eb,
+					  &last_eb_bh, OCFS2_BH_CACHED,
+					  inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			status = -EIO;
+			goto bail;
+		}
+		el = &(eb->h_list);
+	}
+
+	/* by now, el will point to the extent list on the bottom most
+	 * portion of this tree. Its last record is removed whole when
+	 * it starts at or beyond the target, otherwise only the part
+	 * past the target is removed. */
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+	if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters)
+		clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
+	else
+		clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
+				   le32_to_cpu(el->l_recs[i].e_cpos)) -
+				  target_i_clusters;
+
+	mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del);
+
+	down(&tl_inode->i_sem);
+	tl_sem = 1;
+	/* ocfs2_truncate_log_needs_flush guarantees us at least one
+	 * record is free for use. If there isn't any, we flush to get
+	 * an empty truncate log.  */
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		status = __ocfs2_flush_truncate_log(osb);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
+						fe, el);
+	handle = ocfs2_start_trans(osb, NULL, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
+				   last_eb_bh, handle, tc);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	up(&tl_inode->i_sem);
+	tl_sem = 0;
+
+	ocfs2_commit_trans(handle);
+	handle = NULL;
+
+	BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters);
+	if (le32_to_cpu(fe->i_clusters) > target_i_clusters)
+		goto start;
+bail:
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+
+	if (tl_sem)
+		up(&tl_inode->i_sem);
+
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (last_eb_bh)
+		brelse(last_eb_bh);
+
+	/* This will drop the ext_alloc cluster lock for us */
+	ocfs2_free_truncate_context(tc);
+
+	mlog_exit(status);
+	return status;
+}
+
+
+/*
+ * Expects the inode to already be locked. Builds a truncate context in
+ * *tc holding the last extent block and, if metadata will be deleted,
+ * the locked extent allocator inode. On failure *tc is set to NULL.
+ */
+int ocfs2_prepare_truncate(struct ocfs2_super *osb,
+			   struct inode *inode,
+			   struct buffer_head *fe_bh,
+			   struct ocfs2_truncate_context **tc)
+{
+	int status, metadata_delete;
+	unsigned int new_i_clusters;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+	struct buffer_head *last_eb_bh = NULL;
+	struct inode *ext_alloc_inode = NULL;
+	struct buffer_head *ext_alloc_bh = NULL;
+
+	mlog_entry_void();
+
+	*tc = NULL;
+
+	new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
+						  i_size_read(inode));
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
+	     "%"MLFu64"\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
+	     le64_to_cpu(fe->i_size));
+
+	if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
+		ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has cluster count "
+			    "%u and size %"MLFu64" whereas struct inode has "
+			    "cluster count %u and size %llu which caused an "
+			    "invalid truncate to %u clusters.",
+			    le64_to_cpu(fe->i_blkno),
+			    le32_to_cpu(fe->i_clusters),
+			    le64_to_cpu(fe->i_size),
+			    OCFS2_I(inode)->ip_clusters, i_size_read(inode),
+			    new_i_clusters);
+		mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
+		status = -EIO;
+		goto bail;
+	}
+
+	*tc = kcalloc(1, sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
+	if (!(*tc)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	metadata_delete = 0;
+	if (fe->id2.i_list.l_tree_depth) {
+		/* If we have a tree, then the truncate may result in
+		 * metadata deletes. Figure this out from the
+		 * rightmost leaf block.*/
+		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+					  &last_eb_bh, OCFS2_BH_CACHED, inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			brelse(last_eb_bh);
+			status = -EIO;
+			goto bail;
+		}
+		el = &(eb->h_list);
+		if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters)
+			metadata_delete = 1;
+	}
+	/* The context now owns the last extent block reference (if any). */
+	(*tc)->tc_last_eb_bh = last_eb_bh;
+
+	if (metadata_delete) {
+		mlog(0, "Will have to delete metadata for this trunc. "
+		     "locking allocator.\n");
+		ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
+		if (!ext_alloc_inode) {
+			status = -ENOMEM;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		down(&ext_alloc_inode->i_sem);
+		(*tc)->tc_ext_alloc_inode = ext_alloc_inode;
+
+		status = ocfs2_meta_lock(ext_alloc_inode,
+					 NULL,
+					 &ext_alloc_bh,
+					 1);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		(*tc)->tc_ext_alloc_bh = ext_alloc_bh;
+		(*tc)->tc_ext_alloc_locked = 1;
+	}
+
+	status = 0;
+bail:
+	if (status < 0) {
+		if (*tc)
+			ocfs2_free_truncate_context(*tc);
+		*tc = NULL;
+	}
+	mlog_exit_void();
+	return status;
+}
+
+static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
+{
+	if (tc->tc_ext_alloc_inode) {
+		if (tc->tc_ext_alloc_locked)
+			ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
+		/* Undo the down() and inode ref from ocfs2_prepare_truncate. */
+		up(&tc->tc_ext_alloc_inode->i_sem);
+		iput(tc->tc_ext_alloc_inode);
+	}
+	/* Drop the buffer references the context still holds, then tc. */
+	if (tc->tc_ext_alloc_bh)
+		brelse(tc->tc_ext_alloc_bh);
+
+	if (tc->tc_last_eb_bh)
+		brelse(tc->tc_last_eb_bh);
+
+	kfree(tc);
+}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
new file mode 100644
index 0000000..12ba897
--- /dev/null
+++ b/fs/ocfs2/alloc.h
@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * alloc.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_ALLOC_H
+#define OCFS2_ALLOC_H
+
+struct ocfs2_alloc_context;
+int ocfs2_insert_extent(struct ocfs2_super *osb,
+			struct ocfs2_journal_handle *handle,
+			struct inode *inode,
+			struct buffer_head *fe_bh,
+			u64 blkno,
+			u32 new_clusters,
+			struct ocfs2_alloc_context *meta_ac);
+int ocfs2_num_free_extents(struct ocfs2_super *osb,
+			   struct inode *inode,
+			   struct ocfs2_dinode *fe);
+/* how many new metadata chunks would an allocation need at maximum? */
+static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
+{
+	/*
+	 * Rather than do all the work of determining how much we need
+	 * (involves a ton of reads and locks), just ask for the
+	 * maximal limit.  That's a tree depth shift.  So, one block for
+	 * each level of the tree (current l_tree_depth), one block for
+	 * the new tree_depth==0 extent_block, and one block at the new
+	 * top of the tree.
+	 */
+	return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
+}
+
+int ocfs2_truncate_log_init(struct ocfs2_super *osb);
+void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);
+void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
+				       int cancel);
+int ocfs2_flush_truncate_log(struct ocfs2_super *osb);
+int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
+				      int slot_num,
+				      struct ocfs2_dinode **tl_copy);
+int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
+					 struct ocfs2_dinode *tl_copy);
+
+struct ocfs2_truncate_context {
+	struct inode *tc_ext_alloc_inode;	/* extent allocator, or NULL */
+	struct buffer_head *tc_ext_alloc_bh;	/* from ocfs2_meta_lock() */
+	int tc_ext_alloc_locked; /* is it cluster locked? */
+	/* these get destroyed once it's passed to ocfs2_commit_truncate. */
+	struct buffer_head *tc_last_eb_bh;	/* rightmost leaf, or NULL */
+};
+
+int ocfs2_prepare_truncate(struct ocfs2_super *osb,
+			   struct inode *inode,
+			   struct buffer_head *fe_bh,
+			   struct ocfs2_truncate_context **tc);
+int ocfs2_commit_truncate(struct ocfs2_super *osb,
+			  struct inode *inode,
+			  struct buffer_head *fe_bh,
+			  struct ocfs2_truncate_context *tc);
+
+#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
new file mode 100644
index 0000000..8f4467a
--- /dev/null
+++ b/fs/ocfs2/aops.c
@@ -0,0 +1,643 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <asm/byteorder.h>
+
+#define MLOG_MASK_PREFIX ML_FILE_IO
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "aops.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "super.h"
+#include "symlink.h"
+
+#include "buffer_head_io.h"
+
+static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	int err = -EIO;
+	int status;
+	struct ocfs2_dinode *fe = NULL;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *buffer_cache_bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	void *kaddr;
+
+	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
+		   (unsigned long long)iblock, bh_result, create);
+
+	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
+	/* Reject block offsets that start beyond PATH_MAX + 1 bytes. */
+	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
+		mlog(ML_ERROR, "block offset > PATH_MAX: %llu\n",
+		     (unsigned long long)iblock);
+		goto bail;
+	}
+
+	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				  OCFS2_I(inode)->ip_blkno,
+				  &bh, OCFS2_BH_CACHED, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	fe = (struct ocfs2_dinode *) bh->b_data;
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
+		     le64_to_cpu(fe->i_blkno), 7, fe->i_signature);
+		goto bail;
+	}
+
+	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
+						    le32_to_cpu(fe->i_clusters))) {
+		mlog(ML_ERROR, "block offset is outside the allocated size: "
+		     "%llu\n", (unsigned long long)iblock);
+		goto bail;
+	}
+
+	/* We don't use the page cache to create symlink data, so if
+	 * need be, copy it over from the buffer cache. */
+	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
+		u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
+			    iblock;
+		buffer_cache_bh = sb_getblk(osb->sb, blkno);
+		if (!buffer_cache_bh) {
+			mlog(ML_ERROR, "couldn't getblock for symlink!\n");
+			goto bail;
+		}
+
+		/* we haven't locked out transactions, so a commit
+		 * could've happened. Since we've got a reference on
+		 * the bh, even if it commits while we're doing the
+		 * copy, the data is still good. */
+		if (buffer_jbd(buffer_cache_bh)
+		    && ocfs2_inode_is_new(inode)) {
+			kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
+			if (!kaddr) {
+				mlog(ML_ERROR, "couldn't kmap!\n");
+				brelse(buffer_cache_bh); /* don't leak the ref */
+				goto bail;
+			}
+			memcpy(kaddr + (bh_result->b_size * iblock),
+			       buffer_cache_bh->b_data,
+			       bh_result->b_size);
+			kunmap_atomic(kaddr, KM_USER0);
+			set_buffer_uptodate(bh_result);
+		}
+		brelse(buffer_cache_bh);
+	}
+
+	map_bh(bh_result, inode->i_sb,
+	       le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
+	err = 0;
+
+bail:
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(err);
+	return err;
+}
+
+static int ocfs2_get_block(struct inode *inode, sector_t iblock,
+			   struct buffer_head *bh_result, int create)
+{
+	int err = 0;
+	u64 p_blkno = 0, past_eof;	/* p_blkno is logged on error */
+
+	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
+		   (unsigned long long)iblock, bh_result, create);
+
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
+		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
+		     inode, inode->i_ino);
+
+	if (S_ISLNK(inode->i_mode)) {
+		/* this always does I/O for some reason. */
+		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
+		goto bail;
+	}
+
+	/* this can happen if another node truncs after our extend! */
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
+					       OCFS2_I(inode)->ip_clusters))
+		err = -EIO;
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	if (err)
+		goto bail;
+
+	err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
+					  NULL);
+	if (err) {
+		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
+		     "%"MLFu64", NULL)\n", err, inode,
+		     (unsigned long long)iblock, p_blkno);
+		goto bail;
+	}
+
+	map_bh(bh_result, inode->i_sb, p_blkno);
+
+	if (bh_result->b_blocknr == 0) {
+		err = -EIO;
+		mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" "
+		     "blkno=(%"MLFu64")\n", (unsigned long long)iblock,
+		     p_blkno, OCFS2_I(inode)->ip_blkno);
+	}
+	/* Blocks at or past i_size are flagged new when creating. */
+	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+	mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof);
+
+	if (create && (iblock >= past_eof))
+		set_buffer_new(bh_result);
+
+bail:
+	if (err < 0)
+		err = -EIO;	/* every failure is reported as -EIO */
+
+	mlog_exit(err);
+	return err;
+}
+
+static int ocfs2_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
+	int ret, unlock = 1;
+
+	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
+
+	ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
+	if (ret != 0) {
+		if (ret == AOP_TRUNCATED_PAGE)
+			unlock = 0;	/* skip unlock_page() at out: */
+		mlog_errno(ret);
+		goto out;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	/*
+	 * i_size might have just been updated as we grabbed the meta lock.  We
+	 * might now be discovering a truncate that hit on another node.
+	 * block_read_full_page->get_block freaks out if it is asked to read
+	 * beyond the end of a file, so we check here.  Callers
+	 * (generic_file_read, fault->nopage) are clever enough to check i_size
+	 * and notice that the page they just read isn't needed.
+	 *
+	 * XXX sys_readahead() seems to get that wrong?
+	 */
+	if (start >= i_size_read(inode)) {
+		char *addr = kmap(page);
+		memset(addr, 0, PAGE_SIZE);
+		flush_dcache_page(page);
+		kunmap(page);
+		SetPageUptodate(page);
+		ret = 0;
+		goto out_alloc;
+	}
+
+	ret = ocfs2_data_lock_with_page(inode, 0, page);
+	if (ret != 0) {
+		if (ret == AOP_TRUNCATED_PAGE)
+			unlock = 0;	/* skip unlock_page() at out: */
+		mlog_errno(ret);
+		goto out_alloc;
+	}
+
+	ret = block_read_full_page(page, ocfs2_get_block);
+	unlock = 0;	/* do not unlock_page() here after the read call */
+
+	ocfs2_data_unlock(inode, 0);
+out_alloc:
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+	ocfs2_meta_unlock(inode, 0);
+out:
+	if (unlock)
+		unlock_page(page);
+	mlog_exit(ret);
+	return ret;
+}
+
+/* Note: Because we don't support holes, our allocation has
+ * already happened (allocation writes zeros to the file data)
+ * so we don't have to worry about ordered writes in
+ * ocfs2_writepage.
+ *
+ * ->writepage is called during the process of invalidating the page cache
+ * during blocked lock processing.  It can't block on any cluster locks
+ * during block mapping.  It's relying on the fact that the block
+ * mapping can't have disappeared under the dirty pages that it is
+ * being asked to write back.
+ */
+static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int ret;
+
+	mlog_entry("(0x%p)\n", page);
+
+	ret = block_write_full_page(page, ocfs2_get_block, wbc);
+
+	mlog_exit(ret);
+
+	return ret;
+}
+
+/*
+ * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
+ * from loopback.  It must be able to perform its own locking around
+ * ocfs2_get_block(): the meta lock (level 0) and ip_alloc_sem for read.
+ */
+int ocfs2_prepare_write(struct file *file, struct page *page,
+			unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	int ret;
+
+	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
+
+	ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ret = block_prepare_write(page, from, to, ocfs2_get_block);
+
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ocfs2_meta_unlock(inode, 0);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+/* Taken from ext3. We don't necessarily need the full blown
+ * functionality yet, but IMHO it's better to cut and paste the whole
+ * thing so we can avoid introducing our own bugs (and easily pick up
+ * their fixes when they happen) --Mark */
+static int walk_page_buffers(	handle_t *handle,
+				struct buffer_head *head,
+				unsigned from,
+				unsigned to,
+				int *partial,
+				int (*fn)(	handle_t *handle,
+						struct buffer_head *bh))
+{
+	struct buffer_head *bh;
+	unsigned block_start, block_end;
+	unsigned blocksize = head->b_size;
+	int err, ret = 0;
+	struct buffer_head *next;
+	/* Walk the page's buffer ring from head; stop at the first fn error. */
+	for (	bh = head, block_start = 0;
+		ret == 0 && (bh != head || !block_start);
+	    	block_start = block_end, bh = next)
+	{
+		next = bh->b_this_page;
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (partial && !buffer_uptodate(bh))
+				*partial = 1;	/* skipped, not uptodate */
+			continue;	/* buffer outside [from, to) */
+		}
+		err = (*fn)(handle, bh);
+		if (!ret)
+			ret = err;
+	}
+	return ret;
+}
+
+struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
+							 struct page *page,
+							 unsigned from,
+							 unsigned to)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_journal_handle *handle = NULL;
+	int ret = 0;
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {	/* start_trans reports failure via ERR_PTR */
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (ocfs2_should_order_data(inode)) {
+		ret = walk_page_buffers(handle->k_handle,
+					page_buffers(page),
+					from, to, NULL,
+					ocfs2_journal_dirty_data);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+out:
+	if (ret) {	/* hand any failure back to the caller as an ERR_PTR */
+		if (!IS_ERR(handle))	/* only commit a real handle */
+			ocfs2_commit_trans(handle);
+		handle = ERR_PTR(ret);
+	}
+	return handle;
+}
+
+static int ocfs2_commit_write(struct file *file, struct page *page,
+			      unsigned from, unsigned to)
+{
+	int ret, extending = 0, locklevel = 0;
+	loff_t new_i_size;
+	struct buffer_head *di_bh = NULL;
+	struct inode *inode = page->mapping->host;
+	struct ocfs2_journal_handle *handle = NULL;
+
+	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
+
+	/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
+	 * us to sample inode->i_size here without the metadata lock:
+	 *
+	 * 1) We're currently holding the inode alloc lock, so no
+	 *    nodes can change it underneath us.
+	 *
+	 * 2) We've had to take the metadata lock at least once
+	 *    already to check for extending writes, hence ensuring
+	 *    that our current copy is also up to date.
+	 */
+	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+	if (new_i_size > i_size_read(inode)) {
+		extending = 1;
+		locklevel = 1;
+	}
+
+	ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_data_lock_with_page(inode, 1, page);
+	if (ret != 0) {
+		mlog_errno(ret);
+		goto out_unlock_meta;
+	}
+	/* Only extending writes update the dinode, so only they journal. */
+	if (extending) {
+		handle = ocfs2_start_walk_page_trans(inode, page, from, to);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			handle = NULL;
+			goto out_unlock_data;
+		}
+
+		/* Mark our buffer early. We'd rather catch this error up here
+		 * as opposed to after a successful commit_write which would
+		 * require us to set back inode->i_size. */
+		ret = ocfs2_journal_access(handle, inode, di_bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	/* might update i_size */
+	ret = generic_commit_write(file, page, from, to);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (extending) {
+		loff_t size = (u64) i_size_read(inode);
+		struct ocfs2_dinode *di =
+			(struct ocfs2_dinode *)di_bh->b_data;
+
+		/* ocfs2_mark_inode_dirty is too heavy to use here. */
+		inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
+		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+		di->i_size = cpu_to_le64(size);
+		di->i_ctime = di->i_mtime = 
+				cpu_to_le64(inode->i_mtime.tv_sec);
+		di->i_ctime_nsec = di->i_mtime_nsec = 
+				cpu_to_le32(inode->i_mtime.tv_nsec);
+
+		ret = ocfs2_journal_dirty(handle, di_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	BUG_ON(extending && (i_size_read(inode) != new_i_size));
+
+out_commit:
+	if (handle)
+		ocfs2_commit_trans(handle);
+out_unlock_data:
+	ocfs2_data_unlock(inode, 1);
+out_unlock_meta:
+	ocfs2_meta_unlock(inode, locklevel);
+out:
+	if (di_bh)
+		brelse(di_bh);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
+{
+	sector_t status;
+	u64 p_blkno = 0;
+	int err = 0;
+	struct inode *inode = mapping->host;
+
+	mlog_entry("(block = %llu)\n", (unsigned long long)block);
+
+	/* We don't need to lock journal system files, since they aren't
+	 * accessed concurrently from multiple nodes.
+	 */
+	if (!INODE_JOURNAL(inode)) {
+		err = ocfs2_meta_lock(inode, NULL, NULL, 0);
+		if (err) {
+			if (err != -ENOENT)
+				mlog_errno(err);
+			goto bail;
+		}
+		down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	}
+
+	err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
+					  NULL);
+
+	if (!INODE_JOURNAL(inode)) {
+		up_read(&OCFS2_I(inode)->ip_alloc_sem);
+		ocfs2_meta_unlock(inode, 0);
+	}
+
+	if (err) {
+		mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
+		     (unsigned long long)block);
+		mlog_errno(err);
+		goto bail;
+	}
+
+	/* Any failure above is reported to the caller as block 0. */
+bail:
+	status = err ? 0 : p_blkno;
+
+	mlog_exit((int)status);
+
+	return status;
+}
+
+/*
+ * TODO: Make this into a generic get_blocks function.
+ *
+ * From do_direct_io in direct-io.c:
+ *  "So what we do is to permit the ->get_blocks function to populate
+ *   bh.b_size with the size of IO which is permitted at this offset and
+ *   this i_blkbits."
+ *
+ * This function is called directly from get_more_blocks in direct-io.c.
+ *
+ * called like this: dio->get_blocks(dio->inode, fs_startblk,
+ * 					fs_count, map_bh, dio->rw == WRITE);
+ */
+static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
+				     unsigned long max_blocks,
+				     struct buffer_head *bh_result, int create)
+{
+	int ret;
+	u64 p_blkno;
+	int contig_blocks;
+	unsigned char blocksize_bits;
+
+	if (!inode || !bh_result) {
+		mlog(ML_ERROR, "inode or bh_result is null\n");
+		return -EIO;
+	}
+
+	blocksize_bits = inode->i_sb->s_blocksize_bits;
+
+	/* This function won't even be called if the request isn't all
+	 * nicely aligned and of the right size, so there's no need
+	 * for us to check any of that. */
+
+	/* Refuse requests that run past the clusters currently allocated
+	 * to the inode; ip_clusters is sampled under ip_lock. */
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	if ((iblock + max_blocks) >
+	    ocfs2_clusters_to_blocks(inode->i_sb,
+				     OCFS2_I(inode)->ip_clusters)) {
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+		ret = -EIO;
+		goto bail;
+	}
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	/* This figures out the size of the next contiguous block, and
+	 * our logical offset */
+	ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
+					  &contig_blocks);
+	if (ret) {
+		mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
+		     (unsigned long long)iblock);
+		ret = -EIO;
+		goto bail;
+	}
+
+	map_bh(bh_result, inode->i_sb, p_blkno);
+
+	/* make sure we don't map more than max_blocks blocks here as
+	   that's all the kernel will handle at this point. */
+	if (max_blocks < contig_blocks)
+		contig_blocks = max_blocks;
+	/* Report the mapped length back to the dio core via b_size. */
+	bh_result->b_size = contig_blocks << blocksize_bits;
+bail:
+	return ret;
+}
+
+/* 
+ * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
+ * particularly interested in the aio/dio case.  Like the core uses
+ * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
+ * truncation on another.
+ */
+static void ocfs2_dio_end_io(struct kiocb *iocb,
+			     loff_t offset,
+			     ssize_t bytes,
+			     void *private)
+{
+	struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
+	/* Clears the rw-locked flag, then drops i_alloc_sem and the rw lock. */
+	/* this io's submitter should not have unlocked this before we could */
+	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+	ocfs2_iocb_clear_rw_locked(iocb);
+	up_read(&inode->i_alloc_sem);
+	ocfs2_rw_unlock(inode, 0);
+}
+
+static ssize_t ocfs2_direct_IO(int rw,
+			       struct kiocb *iocb,
+			       const struct iovec *iov,
+			       loff_t offset,
+			       unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
+	int ret;
+
+	mlog_entry_void();
+	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
+					    inode->i_sb->s_bdev, iov, offset,
+					    nr_segs, 
+					    ocfs2_direct_IO_get_blocks,
+					    ocfs2_dio_end_io);
+	mlog_exit(ret);
+	return ret;
+}
+
+struct address_space_operations ocfs2_aops = {
+	.readpage	= ocfs2_readpage,
+	.writepage	= ocfs2_writepage,
+	.prepare_write	= ocfs2_prepare_write,
+	.commit_write	= ocfs2_commit_write,
+	.bmap		= ocfs2_bmap,
+	.sync_page	= block_sync_page,
+	.direct_IO	= ocfs2_direct_IO
+};
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
new file mode 100644
index 0000000..d40456d
--- /dev/null
+++ b/fs/ocfs2/aops.h
@@ -0,0 +1,41 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_AOPS_H
+#define OCFS2_AOPS_H
+
+int ocfs2_prepare_write(struct file *file, struct page *page,
+			unsigned from, unsigned to);
+
+struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
+							 struct page *page,
+							 unsigned from,
+							 unsigned to);
+
+/* bit 0 of iocb->private: rw lock held; dropped in ocfs2_dio_end_io() */
+#define ocfs2_iocb_is_rw_locked(iocb) \
+	test_bit(0, (unsigned long *)&(iocb)->private)
+#define ocfs2_iocb_set_rw_locked(iocb) \
+	set_bit(0, (unsigned long *)&(iocb)->private)
+#define ocfs2_iocb_clear_rw_locked(iocb) \
+	clear_bit(0, (unsigned long *)&(iocb)->private)
+
+#endif /* OCFS2_AOPS_H */
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
new file mode 100644
index 0000000..d424041
--- /dev/null
+++ b/fs/ocfs2/buffer_head_io.c
@@ -0,0 +1,232 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * buffer_head_io.c
+ *
+ * Buffer cache handling
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "inode.h"
+#include "journal.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
+		      struct inode *inode)
+{
+	int ret = 0;
+
+	mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n",
+		   (unsigned long long)bh->b_blocknr, inode);
+
+	BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
+	BUG_ON(buffer_jbd(bh));
+
+	/* No need to check for a soft readonly file system here. non
+	 * journalled writes are only ever done on system files which
+	 * can get modified during recovery even if read-only. */
+	if (ocfs2_is_hard_readonly(osb)) {
+		ret = -EROFS;
+		goto out;
+	}
+
+	down(&OCFS2_I(inode)->ip_io_sem);
+
+	lock_buffer(bh);
+	set_buffer_uptodate(bh);
+
+	/* remove from dirty list before I/O. */
+	clear_buffer_dirty(bh);
+
+	get_bh(bh); /* for end_buffer_write_sync() */                   
+	bh->b_end_io = end_buffer_write_sync;
+	submit_bh(WRITE, bh);
+	/* Synchronous write: block here until the I/O has completed. */
+	wait_on_buffer(bh);
+
+	if (buffer_uptodate(bh)) {
+		ocfs2_set_buffer_uptodate(inode, bh);
+	} else {
+		/* We don't need to remove the clustered uptodate
+		 * information for this bh as it's not marked locally
+		 * uptodate. */
+		ret = -EIO;
+		brelse(bh);	/* NOTE(review): is this the caller's ref? */
+	}
+
+	up(&OCFS2_I(inode)->ip_io_sem);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
+		      struct buffer_head *bhs[], int flags,
+		      struct inode *inode)
+{
+	int status = 0;
+	struct super_block *sb;
+	int i, ignore_cache = 0;
+	struct buffer_head *bh;
+
+	mlog_entry("(block=(%"MLFu64"), nr=(%d), flags=%d, inode=%p)\n",
+		   block, nr, flags, inode);
+
+	if (osb == NULL || osb->sb == NULL || bhs == NULL) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (nr < 0) {
+		mlog(ML_ERROR, "asked to read %d blocks!\n", nr);
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (nr == 0) {
+		mlog(ML_BH_IO, "No buffers will be read!\n");
+		status = 0;
+		goto bail;
+	}
+
+	sb = osb->sb;
+	/* Without an inode the CACHED flag is dropped: always read. */
+	if (flags & OCFS2_BH_CACHED && !inode)
+		flags &= ~OCFS2_BH_CACHED;
+	/* ip_io_sem is held across both the submit and the wait loops. */
+	if (inode)
+		down(&OCFS2_I(inode)->ip_io_sem);
+	for (i = 0 ; i < nr ; i++) {
+		if (bhs[i] == NULL) {	/* NOTE(review): no block++ otherwise */
+			bhs[i] = sb_getblk(sb, block++);
+			if (bhs[i] == NULL) {
+				if (inode)
+					up(&OCFS2_I(inode)->ip_io_sem);
+				status = -EIO;
+				mlog_errno(status);
+				goto bail;	/* skips the wait loop below */
+			}
+		}
+		bh = bhs[i];
+		ignore_cache = 0;
+
+		if (flags & OCFS2_BH_CACHED &&
+		    !ocfs2_buffer_uptodate(inode, bh)) {
+			mlog(ML_UPTODATE,
+			     "bh (%llu), inode %"MLFu64" not uptodate\n",
+			     (unsigned long long)bh->b_blocknr,
+			     OCFS2_I(inode)->ip_blkno);
+			ignore_cache = 1;
+		}
+
+		/* XXX: Can we ever get this and *not* have the cached
+		 * flag set? */
+		if (buffer_jbd(bh)) {
+			if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
+				mlog(ML_BH_IO, "trying to sync read a jbd "
+					       "managed bh (blocknr = %llu)\n",
+				     (unsigned long long)bh->b_blocknr);
+			continue;
+		}
+
+		if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
+			if (buffer_dirty(bh)) {
+				/* This should probably be a BUG, or
+				 * at least return an error. */
+				mlog(ML_BH_IO, "asking me to sync read a dirty "
+					       "buffer! (blocknr = %llu)\n",
+				     (unsigned long long)bh->b_blocknr);
+				continue;
+			}
+
+			lock_buffer(bh);
+			if (buffer_jbd(bh)) {
+#ifdef CATCH_BH_JBD_RACES
+				mlog(ML_ERROR, "block %llu had the JBD bit set "
+					       "while I was in lock_buffer!",
+				     (unsigned long long)bh->b_blocknr);
+				BUG();
+#else
+				unlock_buffer(bh);
+				continue;
+#endif
+			}
+			clear_buffer_uptodate(bh);
+			get_bh(bh); /* for end_buffer_read_sync() */
+			bh->b_end_io = end_buffer_read_sync;
+			if (flags & OCFS2_BH_READAHEAD)
+				submit_bh(READA, bh);
+			else
+				submit_bh(READ, bh);
+			continue;
+		}
+	}
+	/* Second pass: wait for the reads, walking bhs[] last to first. */
+	status = 0;
+
+	for (i = (nr - 1); i >= 0; i--) {
+		bh = bhs[i];
+
+		/* We know this can't have changed as we hold the
+		 * inode sem. Avoid doing any work on the bh if the
+		 * journal has it. */
+		if (!buffer_jbd(bh))
+			wait_on_buffer(bh);
+
+		if (!buffer_uptodate(bh)) {
+			/* Status won't be cleared from here on out,
+			 * so we can safely record this and loop back
+			 * to cleanup the other buffers. Don't need to
+			 * remove the clustered uptodate information
+			 * for this bh as it's not marked locally
+			 * uptodate. */
+			status = -EIO;
+			brelse(bh);
+			bhs[i] = NULL;
+			continue;
+		}
+
+		if (inode)
+			ocfs2_set_buffer_uptodate(inode, bh);
+	}
+	if (inode)
+		up(&OCFS2_I(inode)->ip_io_sem);
+	/* NOTE(review): block was advanced above; ignore_cache is last bh's. */
+	mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr,
+	     (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");
+
+bail:
+
+	mlog_exit(status);
+	return status;
+}
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
new file mode 100644
index 0000000..6ecb909
--- /dev/null
+++ b/fs/ocfs2/buffer_head_io.h
@@ -0,0 +1,73 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * buffer_head_io.h
+ *
+ * Buffer cache handling functions defined
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_BUFFER_HEAD_IO_H
+#define OCFS2_BUFFER_HEAD_IO_H
+
+#include <linux/buffer_head.h>
+
+void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
+			     int uptodate);
+
+static inline int ocfs2_read_block(struct ocfs2_super          *osb,
+				   u64                  off,
+				   struct buffer_head **bh,
+				   int                  flags,
+				   struct inode        *inode);
+
+int ocfs2_write_block(struct ocfs2_super          *osb,
+		      struct buffer_head  *bh,
+		      struct inode        *inode);
+int ocfs2_read_blocks(struct ocfs2_super          *osb,
+		      u64                  block,
+		      int                  nr,
+		      struct buffer_head  *bhs[],
+		      int                  flags,
+		      struct inode        *inode);
+
+
+#define OCFS2_BH_CACHED            1
+#define OCFS2_BH_READAHEAD         8	/* use this to pass READA down to submit_bh */
+
+static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
+				   struct buffer_head **bh, int flags,
+				   struct inode *inode)
+{
+	int status = 0;
+
+	if (bh == NULL) {
+		printk("ocfs2: bh == NULL\n");
+		status = -EINVAL;
+		goto bail;
+	}
+
+	status = ocfs2_read_blocks(osb, off, 1, bh,
+				   flags, inode);
+
+bail:
+	return status;
+}
+
+#endif /* OCFS2_BUFFER_HEAD_IO_H */
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
new file mode 100644
index 0000000..bd85182
--- /dev/null
+++ b/fs/ocfs2/dcache.c
@@ -0,0 +1,91 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dcache.c
+ *
+ * dentry cache handling code
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+
+#define MLOG_MASK_PREFIX ML_DCACHE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dcache.h"
+#include "file.h"
+#include "inode.h"
+
+static int ocfs2_dentry_revalidate(struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	struct inode *inode = dentry->d_inode;
+	int ret = 0;    /* 0 == not valid; every early exit below returns it */
+	struct ocfs2_super *osb;
+
+	mlog_entry("(0x%p, '%.*s')\n", dentry,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	/* Never trust a negative dentry - force a new lookup. */
+	if (inode == NULL) {
+		mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
+		     dentry->d_name.name);
+		goto bail;
+	}
+
+	osb = OCFS2_SB(inode->i_sb);
+
+	BUG_ON(!osb);
+
+	if (inode != osb->root_inode) {	/* root skips both checks below */
+		spin_lock(&OCFS2_I(inode)->ip_lock);
+		/* did we or someone else delete this inode? */
+		if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
+			spin_unlock(&OCFS2_I(inode)->ip_lock);
+			mlog(0, "inode (%"MLFu64") deleted, returning false\n",
+			     OCFS2_I(inode)->ip_blkno);
+			goto bail;
+		}
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+		if (!inode->i_nlink) {	/* no links left: treat as orphaned */
+			mlog(0, "Inode %"MLFu64" orphaned, returning false "
+			     "dir = %d\n", OCFS2_I(inode)->ip_blkno,
+			     S_ISDIR(inode->i_mode));
+			goto bail;
+		}
+	}
+
+	ret = 1;	/* passed every check above */
+
+bail:
+	mlog_exit(ret);
+
+	return ret;
+}
+
+struct dentry_operations ocfs2_dentry_ops = {
+	.d_revalidate		= ocfs2_dentry_revalidate,
+};
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
new file mode 100644
index 0000000..9007277
--- /dev/null
+++ b/fs/ocfs2/dcache.h
@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dcache.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_DCACHE_H
+#define OCFS2_DCACHE_H
+
+extern struct dentry_operations ocfs2_dentry_ops;
+
+#endif /* OCFS2_DCACHE_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
new file mode 100644
index 0000000..856e20a
--- /dev/null
+++ b/fs/ocfs2/dir.c
@@ -0,0 +1,618 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dir.c
+ *
+ * Creates, reads, walks and deletes directory-nodes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ *  Portions of this code from linux/fs/ext3/dir.c
+ *
+ *  Copyright (C) 1992, 1993, 1994, 1995
+ *  Remy Card (card@masi.ibp.fr)
+ *  Laboratoire MASI - Institut Blaise Pascal
+ *  Universite Pierre et Marie Curie (Paris VI)
+ *
+ *   from
+ *
+ *   linux/fs/minix/dir.c
+ *
+ *   Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_NAMEI
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "namei.h"
+#include "suballoc.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+static unsigned char ocfs2_filetype_table[] = {
+	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static int ocfs2_extend_dir(struct ocfs2_super *osb,
+			    struct inode *dir,
+			    struct buffer_head *parent_fe_bh,
+			    struct buffer_head **new_de_bh);
+/*
+ * ocfs2_readdir() - pass directory entries to filldir while holding the
+ * inode's cluster meta lock; a failed lock attempt is returned as-is.
+ */
+int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
+{
+	int error = 0;
+	unsigned long offset, blk;
+	int i, num, stored;
+	struct buffer_head * bh, * tmp;
+	struct ocfs2_dir_entry * de;
+	int err;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct super_block * sb = inode->i_sb;
+	int have_disk_lock = 0;
+
+	mlog_entry("dirino=%"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	stored = 0;
+	bh = NULL;
+
+	error = ocfs2_meta_lock(inode, NULL, NULL, 0);
+	if (error < 0) {
+		if (error != -ENOENT)
+			mlog_errno(error);
+		/* we haven't got any yet, so propagate the error. */
+		stored = error;
+		goto bail;
+	}
+	have_disk_lock = 1;
+
+	offset = filp->f_pos & (sb->s_blocksize - 1);
+
+	while (!error && !stored && filp->f_pos < i_size_read(inode)) {
+		blk = (filp->f_pos) >> sb->s_blocksize_bits;
+		bh = ocfs2_bread(inode, blk, &err, 0);
+		if (!bh) {
+			mlog(ML_ERROR, "directory #%"MLFu64" contains a hole "
+				       "at offset %lld\n",
+			     OCFS2_I(inode)->ip_blkno, filp->f_pos);
+			filp->f_pos += sb->s_blocksize - offset;
+			offset = 0; /* f_pos is now block aligned */
+			continue;
+		}
+
+		/*
+		 * Do the readahead (8k)
+		 */
+		if (!offset) {
+			for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0;
+			     i > 0; i--) {
+				tmp = ocfs2_bread(inode, ++blk, &err, 1);
+				if (tmp)
+					brelse(tmp);
+			}
+		}
+
+revalidate:
+		/* If the dir block has changed since the last call to
+		 * readdir(2), then we might be pointing to an invalid
+		 * dirent right now.  Scan from the start of the block
+		 * to make sure. */
+		if (filp->f_version != inode->i_version) {
+			for (i = 0; i < sb->s_blocksize && i < offset; ) {
+				de = (struct ocfs2_dir_entry *) (bh->b_data + i);
+				/* It's too expensive to do a full
+				 * dirent test each time round this
+				 * loop, but we do have to test at
+				 * least that it is non-zero.  A
+				 * failure will be detected in the
+				 * dirent test below. */
+				if (le16_to_cpu(de->rec_len) <
+				    OCFS2_DIR_REC_LEN(1))
+					break;
+				i += le16_to_cpu(de->rec_len);
+			}
+			offset = i;
+			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+				| offset;
+			filp->f_version = inode->i_version;
+		}
+
+		while (!error && filp->f_pos < i_size_read(inode)
+		       && offset < sb->s_blocksize) {
+			de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
+			if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
+				/* On error, skip the f_pos to the
+				   next block. */
+				filp->f_pos = (filp->f_pos |
+					       (sb->s_blocksize - 1)) + 1;
+				brelse(bh);
+				goto bail;
+			}
+			offset += le16_to_cpu(de->rec_len);
+			if (le64_to_cpu(de->inode)) {
+				/* We might block in the next section
+				 * if the data destination is
+				 * currently swapped out.  So, use a
+				 * version stamp to detect whether or
+				 * not the directory has been modified
+				 * during the copy operation.
+				 */
+				unsigned long version = filp->f_version;
+				unsigned char d_type = DT_UNKNOWN;
+
+				if (de->file_type < OCFS2_FT_MAX)
+					d_type = ocfs2_filetype_table[de->file_type];
+				error = filldir(dirent, de->name,
+						de->name_len,
+						filp->f_pos,
+						ino_from_blkno(sb, le64_to_cpu(de->inode)),
+						d_type);
+				if (error)
+					break;
+				if (version != filp->f_version)
+					goto revalidate;
+				stored ++;
+			}
+			filp->f_pos += le16_to_cpu(de->rec_len);
+		}
+		offset = 0;
+		brelse(bh);
+	}
+
+	stored = 0;
+bail:
+	if (have_disk_lock)
+		ocfs2_meta_unlock(inode, 0);
+
+	mlog_exit(stored);
+
+	return stored;
+}
+
+/* Look up name in directory inode: 0 on success, -ENOENT if it is not found.
+ * NOTE: this should always be called with parent dir i_sem taken.
+ */
+int ocfs2_find_files_on_disk(const char *name,
+			     int namelen,
+			     u64 *blkno,
+			     struct inode *inode,
+			     struct buffer_head **dirent_bh,
+			     struct ocfs2_dir_entry **dirent)
+{
+	int status = -ENOENT;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry("(osb=%p, parent=%"MLFu64", name='%.*s', blkno=%p, "
+		   "inode=%p)\n",
+		   osb, OCFS2_I(inode)->ip_blkno, namelen, name, blkno, inode);
+
+	*dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
+	if (!*dirent_bh || !*dirent) {
+		status = -ENOENT;
+		goto leave;
+	}
+
+	*blkno = le64_to_cpu((*dirent)->inode);	/* on-disk value is LE */
+
+	status = 0;
+leave:
+	if (status < 0) {	/* hand nothing back to the caller on failure */
+		*dirent = NULL;
+		if (*dirent_bh) {
+			brelse(*dirent_bh);
+			*dirent_bh = NULL;
+		}
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Check for a name within a directory.
+ *
+ * Return 0 if the name does not exist
+ * Return -EEXIST if the directory contains the name
+ *
+ * Callers should have i_sem + a cluster lock on dir
+ */
+int ocfs2_check_dir_for_entry(struct inode *dir,
+			      const char *name,
+			      int namelen)
+{
+	int ret;
+	struct buffer_head *dirent_bh = NULL;
+	struct ocfs2_dir_entry *dirent = NULL;
+
+	mlog_entry("dir %"MLFu64", name '%.*s'\n", OCFS2_I(dir)->ip_blkno,
+		   namelen, name);
+
+	ret = -EEXIST;
+	dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent);
+	if (dirent_bh)
+		goto bail;
+
+	ret = 0;
+bail:
+	if (dirent_bh)
+		brelse(dirent_bh);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+/*
+ * check that the specified directory is empty (for rmdir): 1 if so, else 0
+ */
+int ocfs2_empty_dir(struct inode *inode)
+{
+	unsigned long offset;
+	struct buffer_head * bh;
+	struct ocfs2_dir_entry * de, * de1;
+	struct super_block * sb;
+	int err;
+
+	sb = inode->i_sb;
+	if ((i_size_read(inode) <
+	     (OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) ||
+	    !(bh = ocfs2_bread(inode, 0, &err, 0))) {
+	    	mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
+			       "no data block\n",
+		     OCFS2_I(inode)->ip_blkno);
+		return 1;	/* a bad directory is reported as empty */
+	}
+
+	de = (struct ocfs2_dir_entry *) bh->b_data;	/* expected `.' */
+	de1 = (struct ocfs2_dir_entry *)
+			((char *)de + le16_to_cpu(de->rec_len));
+	if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) ||
+			!le64_to_cpu(de1->inode) ||
+			strcmp(".", de->name) ||
+			strcmp("..", de1->name)) {
+	    	mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
+			       "no `.' or `..'\n",
+		     OCFS2_I(inode)->ip_blkno);
+		brelse(bh);
+		return 1;	/* a bad directory is reported as empty */
+	}
+	offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
+	de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len));
+	while (offset < i_size_read(inode) ) {	/* scan the remaining entries */
+		if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) {
+			brelse(bh);
+			bh = ocfs2_bread(inode,
+					 offset >> sb->s_blocksize_bits, &err, 0);
+			if (!bh) {
+				mlog(ML_ERROR, "directory #%"MLFu64" contains "
+					       "a hole at offset %lu\n",
+				     OCFS2_I(inode)->ip_blkno, offset);
+				offset += sb->s_blocksize;
+				continue;
+			}
+			de = (struct ocfs2_dir_entry *) bh->b_data;
+		}
+		if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
+			brelse(bh);
+			return 1;	/* a failed entry check also returns 1 */
+		}
+		if (le64_to_cpu(de->inode)) {
+			brelse(bh);
+			return 0;	/* found a live entry: not empty */
+		}
+		offset += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *)
+			((char *)de + le16_to_cpu(de->rec_len));
+	}
+	brelse(bh);
+	return 1;	/* no live entry found past `.' and `..' */
+}
+
+/* returns 0 and a bh of the 1st new block in *new_bh, or a negative error. */
+int ocfs2_do_extend_dir(struct super_block *sb,
+			struct ocfs2_journal_handle *handle,
+			struct inode *dir,
+			struct buffer_head *parent_fe_bh,
+			struct ocfs2_alloc_context *data_ac,
+			struct ocfs2_alloc_context *meta_ac,
+			struct buffer_head **new_bh)
+{
+	int status;
+	int extend;
+	u64 p_blkno;
+
+	spin_lock(&OCFS2_I(dir)->ip_lock);
+	extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
+	spin_unlock(&OCFS2_I(dir)->ip_lock);
+
+	if (extend) {	/* i_size fills every allocated cluster: add one */
+		status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
+						    parent_fe_bh, handle,
+						    data_ac, meta_ac, NULL);
+		BUG_ON(status == -EAGAIN);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* i_blocks is shifted down to a block index before the lookup */
+	status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
+						   (sb->s_blocksize_bits - 9)),
+					     1, &p_blkno, NULL);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*new_bh = sb_getblk(sb, p_blkno);
+	if (!*new_bh) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* assumes you already have a cluster lock on the directory. */
+static int ocfs2_extend_dir(struct ocfs2_super *osb,
+			    struct inode *dir,
+			    struct buffer_head *parent_fe_bh,
+			    struct buffer_head **new_de_bh)
+{
+	int status = 0;
+	int credits, num_free_extents;
+	loff_t dir_i_size;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_dir_entry * de;
+	struct super_block *sb = osb->sb;
+
+	mlog_entry_void();
+
+	dir_i_size = i_size_read(dir);
+	mlog(0, "extending dir %"MLFu64" (i_size = %lld)\n",
+	     OCFS2_I(dir)->ip_blkno, dir_i_size);
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* dir->i_size is always block aligned. */
+	spin_lock(&OCFS2_I(dir)->ip_lock);
+	if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
+		spin_unlock(&OCFS2_I(dir)->ip_lock);
+		num_free_extents = ocfs2_num_free_extents(osb, dir, fe);
+		if (num_free_extents < 0) {
+			status = num_free_extents;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (!num_free_extents) {
+			status = ocfs2_reserve_new_metadata(osb, handle,
+							    fe, &meta_ac);
+			if (status < 0) {
+				if (status != -ENOSPC)
+					mlog_errno(status);
+				goto bail;
+			}
+		}
+
+		status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+
+		credits = ocfs2_calc_extend_credits(sb, fe, 1);
+	} else {
+		spin_unlock(&OCFS2_I(dir)->ip_lock);
+		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh,
+				     data_ac, meta_ac, &new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_set_new_buffer_uptodate(dir, new_bh);
+
+	status = ocfs2_journal_access(handle, dir, new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	memset(new_bh->b_data, 0, sb->s_blocksize);
+	de = (struct ocfs2_dir_entry *) new_bh->b_data;
+	de->inode = 0;
+	de->rec_len = cpu_to_le16(sb->s_blocksize);
+	status = ocfs2_journal_dirty(handle, new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	dir_i_size += dir->i_sb->s_blocksize;
+	i_size_write(dir, dir_i_size);
+	dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size);
+	status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	*new_de_bh = new_bh;
+	get_bh(*new_de_bh);
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	if (new_bh)
+		brelse(new_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Search the dir for a good spot, extending it if necessary. The
+ * block containing an appropriate record is returned in ret_de_bh.
+ */
+int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
+				 struct inode *dir,
+				 struct buffer_head *parent_fe_bh,
+				 const char *name,
+				 int namelen,
+				 struct buffer_head **ret_de_bh)
+{
+	unsigned long offset;
+	struct buffer_head * bh = NULL;
+	unsigned short rec_len;
+	struct ocfs2_dinode *fe;
+	struct ocfs2_dir_entry *de;
+	struct super_block *sb;
+	int status;
+
+	mlog_entry_void();
+
+	mlog(0, "getting ready to insert namelen %d into dir %"MLFu64"\n",
+	     namelen, OCFS2_I(dir)->ip_blkno);
+
+	BUG_ON(!S_ISDIR(dir->i_mode));
+	fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir));
+
+	sb = dir->i_sb;
+
+	if (!namelen) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	bh = ocfs2_bread(dir, 0, &status, 0);
+	if (!bh) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	rec_len = OCFS2_DIR_REC_LEN(namelen);
+	offset = 0;
+	de = (struct ocfs2_dir_entry *) bh->b_data;
+	while (1) {
+		if ((char *)de >= sb->s_blocksize + bh->b_data) {
+			brelse(bh);
+			bh = NULL;
+
+			if (i_size_read(dir) <= offset) {
+				status = ocfs2_extend_dir(osb,
+							  dir,
+							  parent_fe_bh,
+							  &bh);
+				if (status < 0) {
+					mlog_errno(status);
+					goto bail;
+				}
+				BUG_ON(!bh);
+				*ret_de_bh = bh;
+				get_bh(*ret_de_bh);
+				goto bail;
+			}
+			bh = ocfs2_bread(dir,
+					 offset >> sb->s_blocksize_bits,
+					 &status,
+					 0);
+			if (!bh) {
+				mlog_errno(status);
+				goto bail;
+			}
+			/* move to next block */
+			de = (struct ocfs2_dir_entry *) bh->b_data;
+		}
+		if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
+			status = -ENOENT;
+			goto bail;
+		}
+		if (ocfs2_match(namelen, name, de)) {
+			status = -EEXIST;
+			goto bail;
+		}
+		if (((le64_to_cpu(de->inode) == 0) &&
+		     (le16_to_cpu(de->rec_len) >= rec_len)) ||
+		    (le16_to_cpu(de->rec_len) >=
+		     (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
+			/* Ok, we found a spot. Return this bh and let
+			 * the caller actually fill it in. */
+			*ret_de_bh = bh;
+			get_bh(*ret_de_bh);
+			status = 0;
+			goto bail;
+		}
+		offset += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
+	}
+
+	status = 0;
+bail:
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
new file mode 100644
index 0000000..5f614ec
--- /dev/null
+++ b/fs/ocfs2/dir.h
@@ -0,0 +1,54 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dir.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_DIR_H
+#define OCFS2_DIR_H
+
+int ocfs2_check_dir_for_entry(struct inode *dir,
+			      const char *name,
+			      int namelen);
+int ocfs2_empty_dir(struct inode *inode);  /* FIXME: to namei.c */
+int ocfs2_find_files_on_disk(const char *name,
+			     int namelen,
+			     u64 *blkno,
+			     struct inode *inode,
+			     struct buffer_head **dirent_bh,
+			     struct ocfs2_dir_entry **dirent);
+int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
+int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
+				 struct inode *dir,
+				 struct buffer_head *parent_fe_bh,
+				 const char *name,
+				 int namelen,
+				 struct buffer_head **ret_de_bh);
+struct ocfs2_alloc_context;
+int ocfs2_do_extend_dir(struct super_block *sb,
+			struct ocfs2_journal_handle *handle,
+			struct inode *dir,
+			struct buffer_head *parent_fe_bh,
+			struct ocfs2_alloc_context *data_ac,
+			struct ocfs2_alloc_context *meta_ac,
+			struct buffer_head **new_bh);
+#endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
new file mode 100644
index 0000000..e971ec2
--- /dev/null
+++ b/fs/ocfs2/dlmglue.c
@@ -0,0 +1,2904 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmglue.c
+ *
+ * Code which implements an OCFS2 specific interface to our DLM.
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/crc32.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <cluster/heartbeat.h>
+#include <cluster/nodemanager.h>
+#include <cluster/tcp.h>
+
+#include <dlm/dlmapi.h>
+
+#define MLOG_MASK_PREFIX ML_DLM_GLUE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "slot_map.h"
+#include "super.h"
+#include "uptodate.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
+struct ocfs2_mask_waiter {
+	struct list_head	mw_item;
+	int			mw_status;
+	struct completion	mw_complete;
+	unsigned long		mw_mask;
+	unsigned long		mw_goal;
+};
+
+static void ocfs2_inode_ast_func(void *opaque);
+static void ocfs2_inode_bast_func(void *opaque,
+				  int level);
+static void ocfs2_super_ast_func(void *opaque);
+static void ocfs2_super_bast_func(void *opaque,
+				  int level);
+static void ocfs2_rename_ast_func(void *opaque);
+static void ocfs2_rename_bast_func(void *opaque,
+				   int level);
+
+/* so far, all locks have gotten along with the same unlock ast */
+static void ocfs2_unlock_ast_func(void *opaque,
+				  enum dlm_status status);
+static int ocfs2_do_unblock_meta(struct inode *inode,
+				 int *requeue);
+static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
+			      int *requeue);
+static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
+			      int *requeue);
+static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
+			      int *requeue);
+static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
+				  int *requeue);
+typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int);
+static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
+				      struct ocfs2_lock_res *lockres,
+				      int *requeue,
+				      ocfs2_convert_worker_t *worker);
+
+struct ocfs2_lock_res_ops {
+	void (*ast)(void *);
+	void (*bast)(void *, int);
+	void (*unlock_ast)(void *, enum dlm_status);
+	int  (*unblock)(struct ocfs2_lock_res *, int *);
+};
+
+static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
+	.ast		= ocfs2_inode_ast_func,
+	.bast		= ocfs2_inode_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_inode_lock,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
+	.ast		= ocfs2_inode_ast_func,
+	.bast		= ocfs2_inode_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_meta,
+};
+
+static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
+				      int blocking);
+
+static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
+	.ast		= ocfs2_inode_ast_func,
+	.bast		= ocfs2_inode_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_data,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_super_lops = {
+	.ast		= ocfs2_super_ast_func,
+	.bast		= ocfs2_super_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_osb_lock,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
+	.ast		= ocfs2_rename_ast_func,
+	.bast		= ocfs2_rename_bast_func,
+	.unlock_ast	= ocfs2_unlock_ast_func,
+	.unblock	= ocfs2_unblock_osb_lock,
+};
+
+static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
+{
+	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
+		lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
+		lockres->l_type == OCFS2_LOCK_TYPE_RW;
+}
+
+static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres)
+{
+	return lockres->l_type == OCFS2_LOCK_TYPE_SUPER;
+}
+
+static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres)
+{
+	return lockres->l_type == OCFS2_LOCK_TYPE_RENAME;
+}
+
+static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres)
+{
+	BUG_ON(!ocfs2_is_super_lock(lockres)
+	       && !ocfs2_is_rename_lock(lockres));
+
+	return (struct ocfs2_super *) lockres->l_priv;
+}
+
+static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
+{
+	BUG_ON(!ocfs2_is_inode_lock(lockres));
+
+	return (struct inode *) lockres->l_priv;
+}
+
+static int ocfs2_lock_create(struct ocfs2_super *osb,
+			     struct ocfs2_lock_res *lockres,
+			     int level,
+			     int dlm_flags);
+static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
+						     int wanted);
+static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
+				 struct ocfs2_lock_res *lockres,
+				 int level);
+static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
+static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
+static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
+static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
+static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
+					struct ocfs2_lock_res *lockres);
+static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
+						int convert);
+#define ocfs2_log_dlm_error(_func, _stat, _lockres) do {	\
+	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "	\
+		"resource %s: %s\n", dlm_errname(_stat), _func,	\
+		_lockres->l_name, dlm_errmsg(_stat));		\
+} while (0)
+static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
+				 struct ocfs2_lock_res *lockres);
+static int ocfs2_meta_lock_update(struct inode *inode,
+				  struct buffer_head **bh);
+static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
+static inline int ocfs2_highest_compat_lock_level(int level);
+static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
+						  struct ocfs2_lock_res *lockres,
+						  int new_level);
+
+static char *ocfs2_lock_type_strings[] = {
+	[OCFS2_LOCK_TYPE_META] = "Meta",
+	[OCFS2_LOCK_TYPE_DATA] = "Data",
+	[OCFS2_LOCK_TYPE_SUPER] = "Super",
+	[OCFS2_LOCK_TYPE_RENAME] = "Rename",
+	/* Need to differentiate from [R]ename.. serializing writes is the
+	 * important job it does, anyway. */
+	[OCFS2_LOCK_TYPE_RW] = "Write/Read",
+};
+
+static char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
+{
+	mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type);
+	return ocfs2_lock_type_strings[type];
+}
+
+/*
+ * Format the resource name for a lock into @name: the type character,
+ * OCFS2_LOCK_ID_PAD, then @blkno and @generation as zero-padded
+ * fields.  At most OCFS2_LOCK_ID_MAX_LEN bytes are written and the
+ * formatted name is required to be exactly OCFS2_LOCK_ID_MAX_LEN - 1
+ * characters long.
+ */
+static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
+				  u64 blkno,
+				  u32 generation,
+				  char *name)
+{
+	int written;
+
+	mlog_entry_void();
+
+	BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
+
+	written = snprintf(name, OCFS2_LOCK_ID_MAX_LEN,
+			   "%c%s%016"MLFx64"%08x",
+			   ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD,
+			   blkno, generation);
+
+	/* Any other length means the name layout is broken. */
+	BUG_ON(written != (OCFS2_LOCK_ID_MAX_LEN - 1));
+
+	mlog(0, "built lock resource with name: %s\n", name);
+
+	mlog_exit_void();
+}
+
+/* Serializes additions to and removals from the lockres tracking list
+ * (the l_debug_list linkage). */
+static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
+
+/* Link @res onto @dlm_debug's d_lockres_tracking list while holding
+ * ocfs2_dlm_tracking_lock. */
+static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
+				       struct ocfs2_dlm_debug *dlm_debug)
+{
+	struct list_head *tracked = &dlm_debug->d_lockres_tracking;
+
+	mlog(0, "Add tracking for lockres %s\n", res->l_name);
+
+	spin_lock(&ocfs2_dlm_tracking_lock);
+	list_add(&res->l_debug_list, tracked);
+	spin_unlock(&ocfs2_dlm_tracking_lock);
+}
+
+/* Unlink @res from the tracking list if it is currently on one; done
+ * under ocfs2_dlm_tracking_lock. */
+static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
+{
+	struct list_head *entry = &res->l_debug_list;
+
+	spin_lock(&ocfs2_dlm_tracking_lock);
+	if (!list_empty(entry))
+		list_del_init(entry);
+	spin_unlock(&ocfs2_dlm_tracking_lock);
+}
+
+/*
+ * Initialization shared by every lock type: build the resource name,
+ * record the type, ops and private pointer, reset the levels and the
+ * pending AST/unlock actions to their invalid values, mark the lockres
+ * initialized and add it to @osb's debug tracking list.
+ */
+static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
+				       struct ocfs2_lock_res *res,
+				       enum ocfs2_lock_type type,
+				       u64 blkno,
+				       u32 generation,
+				       struct ocfs2_lock_res_ops *ops,
+				       void *priv)
+{
+	ocfs2_build_lock_name(type, blkno, generation, res->l_name);
+
+	/* Identity of the resource. */
+	res->l_type = type;
+	res->l_ops = ops;
+	res->l_priv = priv;
+
+	/* Nothing granted, requested or blocking yet. */
+	res->l_level = LKM_IVMODE;
+	res->l_requested = LKM_IVMODE;
+	res->l_blocking = LKM_IVMODE;
+
+	/* No AST or unlock is outstanding. */
+	res->l_action = OCFS2_AST_INVALID;
+	res->l_unlock_action = OCFS2_UNLOCK_INVALID;
+
+	res->l_flags = OCFS2_LOCK_INITIALIZED;
+
+	ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
+}
+
+/* One-time setup of a lockres: zero the whole structure, then set up
+ * its spinlock, wait queue and list heads. */
+void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
+{
+	/* Zeroing everything also wipes the lock status block. */
+	memset(res, 0, sizeof(*res));
+
+	spin_lock_init(&res->l_lock);
+	init_waitqueue_head(&res->l_event);
+
+	INIT_LIST_HEAD(&res->l_blocked_list);
+	INIT_LIST_HEAD(&res->l_mask_waiters);
+}
+
+/*
+ * Initialize one of an inode's lock resources.  @type selects the ops
+ * table (RW, META or DATA); any other type is a bug.  The resource is
+ * named after the inode's block number and generation and keeps
+ * @inode as its private pointer.
+ */
+void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
+			       enum ocfs2_lock_type type,
+			       struct inode *inode)
+{
+	struct ocfs2_lock_res_ops *ops = NULL;
+
+	if (type == OCFS2_LOCK_TYPE_RW)
+		ops = &ocfs2_inode_rw_lops;
+	else if (type == OCFS2_LOCK_TYPE_META)
+		ops = &ocfs2_inode_meta_lops;
+	else if (type == OCFS2_LOCK_TYPE_DATA)
+		ops = &ocfs2_inode_data_lops;
+	else
+		mlog_bug_on_msg(1, "type: %d\n", type);
+
+	ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type,
+				   OCFS2_I(inode)->ip_blkno,
+				   inode->i_generation, ops, inode);
+}
+
+/* Set up the superblock lock resource for @osb. */
+static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res,
+				      struct ocfs2_super *osb)
+{
+	/* This lockres is not slab-allocated, so nothing has run the
+	 * init-once step for it yet; do that by hand first. */
+	ocfs2_lock_res_init_once(res);
+
+	ocfs2_lock_res_init_common(osb, res,
+				   OCFS2_LOCK_TYPE_SUPER,
+				   OCFS2_SUPER_BLOCK_BLKNO,
+				   0,
+				   &ocfs2_super_lops,
+				   osb);
+}
+
+/* Set up the rename lock resource for @osb. */
+static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
+				       struct ocfs2_super *osb)
+{
+	/* This lockres is not slab-allocated, so nothing has run the
+	 * init-once step for it yet; do that by hand first. */
+	ocfs2_lock_res_init_once(res);
+
+	ocfs2_lock_res_init_common(osb, res,
+				   OCFS2_LOCK_TYPE_RENAME,
+				   0,
+				   0,
+				   &ocfs2_rename_lops,
+				   osb);
+}
+
+/*
+ * Tear down a lock resource.  Does nothing if @res was never
+ * initialized (OCFS2_LOCK_INITIALIZED clear).  Otherwise removes it
+ * from debug tracking, asserts that it is idle (not on the blocked
+ * list, no mask waiters, spinlock free, no holders), clears the lock
+ * status block and resets l_flags.
+ */
+void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
+{
+	mlog_entry_void();
+
+	/* Leave through the common exit so the entry/exit trace stays
+	 * balanced even for a lockres that was never set up. */
+	if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
+		goto out;
+
+	ocfs2_remove_lockres_tracking(res);
+
+	mlog_bug_on_msg(!list_empty(&res->l_blocked_list),
+			"Lockres %s is on the blocked list\n",
+			res->l_name);
+	mlog_bug_on_msg(!list_empty(&res->l_mask_waiters),
+			"Lockres %s has mask waiters pending\n",
+			res->l_name);
+	mlog_bug_on_msg(spin_is_locked(&res->l_lock),
+			"Lockres %s is locked\n",
+			res->l_name);
+	mlog_bug_on_msg(res->l_ro_holders,
+			"Lockres %s has %u ro holders\n",
+			res->l_name, res->l_ro_holders);
+	mlog_bug_on_msg(res->l_ex_holders,
+			"Lockres %s has %u ex holders\n",
+			res->l_name, res->l_ex_holders);
+
+	/* Need to clear out the lock status block for the dlm */
+	memset(&res->l_lksb, 0, sizeof(res->l_lksb));
+
+	res->l_flags = 0UL;
+out:
+	mlog_exit_void();
+}
+
+/* Count one more holder of @lockres at @level.  Only LKM_EXMODE and
+ * LKM_PRMODE are valid; anything else is a bug. */
+static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
+				     int level)
+{
+	mlog_entry_void();
+
+	BUG_ON(!lockres);
+
+	if (level == LKM_EXMODE)
+		lockres->l_ex_holders++;
+	else if (level == LKM_PRMODE)
+		lockres->l_ro_holders++;
+	else
+		BUG();
+
+	mlog_exit_void();
+}
+
+/* Drop one holder of @lockres at @level.  Only LKM_EXMODE and
+ * LKM_PRMODE are valid, and the matching count must be non-zero. */
+static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
+				     int level)
+{
+	mlog_entry_void();
+
+	BUG_ON(!lockres);
+
+	if (level == LKM_EXMODE) {
+		BUG_ON(!lockres->l_ex_holders);
+		lockres->l_ex_holders--;
+	} else if (level == LKM_PRMODE) {
+		BUG_ON(!lockres->l_ro_holders);
+		lockres->l_ro_holders--;
+	} else {
+		BUG();
+	}
+
+	mlog_exit_void();
+}
+
+/*
+ * Return the highest level that can be held alongside @level:
+ * EX tolerates only NL, PR tolerates PR, anything else tolerates EX.
+ *
+ * WARNING: this assumes the only three lock levels are EX, PR and NL.
+ * It *will* have to be adjusted when more lock types are added.
+ */
+static inline int ocfs2_highest_compat_lock_level(int level)
+{
+	switch (level) {
+	case LKM_EXMODE:
+		return LKM_NLMODE;
+	case LKM_PRMODE:
+		return LKM_PRMODE;
+	default:
+		return LKM_EXMODE;
+	}
+}
+
+/*
+ * Replace l_flags with @newflags and complete every mask waiter whose
+ * masked flags now equal its goal.  Completed waiters are taken off
+ * the list with mw_status set to 0.  Caller must hold l_lock.
+ */
+static void lockres_set_flags(struct ocfs2_lock_res *lockres,
+			      unsigned long newflags)
+{
+	struct list_head *cur, *next;
+	struct ocfs2_mask_waiter *waiter;
+
+	assert_spin_locked(&lockres->l_lock);
+
+	lockres->l_flags = newflags;
+
+	list_for_each_safe(cur, next, &lockres->l_mask_waiters) {
+		waiter = list_entry(cur, struct ocfs2_mask_waiter, mw_item);
+
+		if ((lockres->l_flags & waiter->mw_mask) == waiter->mw_goal) {
+			list_del_init(&waiter->mw_item);
+			waiter->mw_status = 0;
+			complete(&waiter->mw_complete);
+		}
+	}
+}
+/* Set the bits in @or on top of the current flags, via
+ * lockres_set_flags(). */
+static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
+{
+	unsigned long updated = lockres->l_flags | or;
+
+	lockres_set_flags(lockres, updated);
+}
+/* Clear the bits in @clear from the current flags, via
+ * lockres_set_flags(). */
+static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
+				unsigned long clear)
+{
+	unsigned long updated = lockres->l_flags & ~clear;
+
+	lockres_set_flags(lockres, updated);
+}
+
+/*
+ * AST handling for a completed downconvert.  The lock must be busy,
+ * attached and blocked, with a blocking level above NL.  The granted
+ * level becomes the requested one; if it is now compatible with the
+ * blocking level, the blocked state is cleared.  BUSY is always
+ * cleared last.
+ */
+static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
+{
+	int compat;
+
+	mlog_entry_void();
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+	BUG_ON(lockres->l_blocking <= LKM_NLMODE);
+
+	lockres->l_level = lockres->l_requested;
+
+	compat = ocfs2_highest_compat_lock_level(lockres->l_blocking);
+	if (lockres->l_level <= compat) {
+		/* We dropped far enough to stop blocking the other node. */
+		lockres->l_blocking = LKM_NLMODE;
+		lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
+	}
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+
+	mlog_exit_void();
+}
+
+/*
+ * AST handling for a completed (up)convert.  The lock must be busy
+ * and attached.  The granted level becomes the requested one and BUSY
+ * is cleared.
+ */
+static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
+{
+	int was_null;
+
+	mlog_entry_void();
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
+
+	/* Going from RO to EX needs nothing extra because what we have
+	 * is already up to date.  Coming up from NL to *anything*,
+	 * however, means our information must be refreshed. */
+	was_null = (lockres->l_level == LKM_NLMODE);
+	if (was_null)
+		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+
+	lockres->l_level = lockres->l_requested;
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+
+	mlog_exit_void();
+}
+
+/*
+ * AST handling for a completed initial attach.  The lock must be busy
+ * and not yet attached.  A lock requested above NL that is not marked
+ * OCFS2_LOCK_LOCAL is flagged as needing a refresh.  The granted level
+ * becomes the requested one, ATTACHED is set and BUSY is cleared.
+ */
+static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
+{
+	mlog_entry_void();
+
+	/* The negation must cover the masked value: '!' binds tighter
+	 * than '&', so "!l_flags & BUSY" would test the wrong thing. */
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
+	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
+
+	if (lockres->l_requested > LKM_NLMODE &&
+	    !(lockres->l_flags & OCFS2_LOCK_LOCAL))
+		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+
+	lockres->l_level = lockres->l_requested;
+	lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+
+	mlog_exit_void();
+}
+
+/*
+ * AST callback for inode lock resources (@opaque is the lockres).
+ * Under l_lock, a non-normal lksb status is logged and nothing else
+ * is done.  Otherwise the pending l_action is dispatched to the
+ * matching generic handler (attach additionally clears
+ * OCFS2_LOCK_LOCAL), non-META locks drop NEEDS_REFRESH, l_action is
+ * reset and waiters on l_event are woken.
+ */
+static void ocfs2_inode_ast_func(void *opaque)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	struct inode *inode;
+	struct dlm_lockstatus *lksb;
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	/* Check the lock type before the lockres is treated as an
+	 * inode lock, as ocfs2_inode_bast_func() does. */
+	BUG_ON(!ocfs2_is_inode_lock(lockres));
+
+	inode = ocfs2_lock_res_inode(lockres);
+
+	mlog(0, "AST fired for inode %"MLFu64", l_action = %u, type = %s\n",
+	     OCFS2_I(inode)->ip_blkno, lockres->l_action,
+	     ocfs2_lock_type_string(lockres->l_type));
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	lksb = &(lockres->l_lksb);
+	if (lksb->status != DLM_NORMAL) {
+		mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u "
+		     "on inode %"MLFu64"\n", lksb->status,
+		     OCFS2_I(inode)->ip_blkno);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		mlog_exit_void();
+		return;
+	}
+
+	switch(lockres->l_action) {
+	case OCFS2_AST_ATTACH:
+		ocfs2_generic_handle_attach_action(lockres);
+		lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
+		break;
+	case OCFS2_AST_CONVERT:
+		ocfs2_generic_handle_convert_action(lockres);
+		break;
+	case OCFS2_AST_DOWNCONVERT:
+		ocfs2_generic_handle_downconvert_action(lockres);
+		break;
+	default:
+		mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
+		     "lockres flags = 0x%lx, unlock action: %u\n",
+		     lockres->l_name, lockres->l_action, lockres->l_flags,
+		     lockres->l_unlock_action);
+
+		BUG();
+	}
+
+	/* data and rw locking ignores refresh flag for now. */
+	if (lockres->l_type != OCFS2_LOCK_TYPE_META)
+		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+
+	/* set it to something invalid so if we get called again we
+	 * can catch it. */
+	lockres->l_action = OCFS2_AST_INVALID;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	wake_up(&lockres->l_event);
+
+	mlog_exit_void();
+}
+
+/*
+ * Record that another node wants @lockres at @level.  Marks the lock
+ * BLOCKED and, if @level is above the current blocking level, raises
+ * l_blocking to it.  Returns 1 when a new downconvert has to be
+ * scheduled, 0 otherwise.  Caller must hold l_lock.
+ */
+static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
+				     int level)
+{
+	int schedule = 0;
+
+	mlog_entry_void();
+
+	assert_spin_locked(&lockres->l_lock);
+
+	lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
+
+	if (level > lockres->l_blocking) {
+		int wanted = ocfs2_highest_compat_lock_level(level);
+		int planned =
+			ocfs2_highest_compat_lock_level(lockres->l_blocking);
+
+		/* A downconvert is only needed when the one already
+		 * planned does not go low enough for the new blocking
+		 * level.  Duplicate BASTs fall out here as well. */
+		if (wanted < planned)
+			schedule = 1;
+
+		lockres->l_blocking = level;
+	}
+
+	mlog_exit(schedule);
+	return schedule;
+}
+
+/*
+ * Common BAST handling: under l_lock, note the blocking request and
+ * schedule the lockres as blocked when a downconvert is needed; then
+ * kick the vote thread and wake waiters on l_event.  @level must be
+ * above NL.
+ */
+static void ocfs2_generic_bast_func(struct ocfs2_super *osb,
+				    struct ocfs2_lock_res *lockres,
+				    int level)
+{
+	unsigned long irq_flags;
+
+	mlog_entry_void();
+
+	BUG_ON(level <= LKM_NLMODE);
+
+	spin_lock_irqsave(&lockres->l_lock, irq_flags);
+	if (ocfs2_generic_handle_bast(lockres, level))
+		ocfs2_schedule_blocked_lock(osb, lockres);
+	spin_unlock_irqrestore(&lockres->l_lock, irq_flags);
+
+	ocfs2_kick_vote_thread(osb);
+
+	wake_up(&lockres->l_event);
+	mlog_exit_void();
+}
+
+/* BAST callback for inode lock resources (@opaque is the lockres):
+ * logs the event and hands off to ocfs2_generic_bast_func(). */
+static void ocfs2_inode_bast_func(void *opaque, int level)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	struct ocfs2_super *osb;
+	struct inode *inode;
+
+	mlog_entry_void();
+
+	BUG_ON(!ocfs2_is_inode_lock(lockres));
+
+	inode = ocfs2_lock_res_inode(lockres);
+	osb = OCFS2_SB(inode->i_sb);
+
+	mlog(0, "BAST fired for inode %"MLFu64", blocking = %d, level = %d "
+	     "type = %s\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     level,
+	     lockres->l_level,
+	     ocfs2_lock_type_string(lockres->l_type));
+
+	ocfs2_generic_bast_func(osb, lockres, level);
+
+	mlog_exit_void();
+}
+
+/*
+ * Common AST handling for non-inode locks.  Under l_lock, a non-normal
+ * lksb status is logged and nothing else happens (no wakeup).
+ * Otherwise the pending l_action is dispatched to its generic handler,
+ * NEEDS_REFRESH is dropped when @ignore_refresh is set, l_action is
+ * invalidated and waiters on l_event are woken.
+ */
+static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres,
+				   int ignore_refresh)
+{
+	unsigned long irq_flags;
+	struct dlm_lockstatus *lksb = &lockres->l_lksb;
+
+	spin_lock_irqsave(&lockres->l_lock, irq_flags);
+
+	if (lksb->status != DLM_NORMAL) {
+		mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
+		     lockres->l_name, lksb->status);
+		spin_unlock_irqrestore(&lockres->l_lock, irq_flags);
+		return;
+	}
+
+	switch (lockres->l_action) {
+	case OCFS2_AST_ATTACH:
+		ocfs2_generic_handle_attach_action(lockres);
+		break;
+	case OCFS2_AST_CONVERT:
+		ocfs2_generic_handle_convert_action(lockres);
+		break;
+	case OCFS2_AST_DOWNCONVERT:
+		ocfs2_generic_handle_downconvert_action(lockres);
+		break;
+	default:
+		BUG();
+	}
+
+	if (ignore_refresh)
+		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+
+	/* Leave an invalid action behind so that a second, unexpected
+	 * AST hits the BUG() above. */
+	lockres->l_action = OCFS2_AST_INVALID;
+	spin_unlock_irqrestore(&lockres->l_lock, irq_flags);
+
+	wake_up(&lockres->l_event);
+}
+
+/* AST callback for the superblock lock; the refresh flag is kept
+ * (ignore_refresh = 0). */
+static void ocfs2_super_ast_func(void *opaque)
+{
+	struct ocfs2_lock_res *super_res = opaque;
+
+	mlog_entry_void();
+	mlog(0, "Superblock AST fired\n");
+
+	BUG_ON(!ocfs2_is_super_lock(super_res));
+
+	ocfs2_generic_ast_func(super_res, 0);
+
+	mlog_exit_void();
+}
+
+/* BAST callback for the superblock lock: hands off to
+ * ocfs2_generic_bast_func() with the owning super. */
+static void ocfs2_super_bast_func(void *opaque,
+				  int level)
+{
+	struct ocfs2_lock_res *super_res = opaque;
+
+	mlog_entry_void();
+	mlog(0, "Superblock BAST fired\n");
+
+	BUG_ON(!ocfs2_is_super_lock(super_res));
+
+	ocfs2_generic_bast_func(ocfs2_lock_res_super(super_res),
+				super_res, level);
+
+	mlog_exit_void();
+}
+
+/* AST callback for the rename lock; the refresh flag is dropped
+ * (ignore_refresh = 1). */
+static void ocfs2_rename_ast_func(void *opaque)
+{
+	struct ocfs2_lock_res *rename_res = opaque;
+
+	mlog_entry_void();
+	mlog(0, "Rename AST fired\n");
+
+	BUG_ON(!ocfs2_is_rename_lock(rename_res));
+
+	ocfs2_generic_ast_func(rename_res, 1);
+
+	mlog_exit_void();
+}
+
+/* BAST callback for the rename lock: hands off to
+ * ocfs2_generic_bast_func() with the owning super. */
+static void ocfs2_rename_bast_func(void *opaque,
+				   int level)
+{
+	struct ocfs2_lock_res *rename_res = opaque;
+
+	mlog_entry_void();
+	mlog(0, "Rename BAST fired\n");
+
+	BUG_ON(!ocfs2_is_rename_lock(rename_res));
+
+	ocfs2_generic_bast_func(ocfs2_lock_res_super(rename_res),
+				rename_res, level);
+
+	mlog_exit_void();
+}
+
+/*
+ * Undo the bookkeeping for a DLM call that failed: clear BUSY and
+ * invalidate the pending AST action (@convert non-zero) or the
+ * pending unlock action (@convert zero), then wake l_event waiters.
+ */
+static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
+						int convert)
+{
+	unsigned long irq_flags;
+
+	mlog_entry_void();
+
+	spin_lock_irqsave(&lockres->l_lock, irq_flags);
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+	if (!convert)
+		lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+	else
+		lockres->l_action = OCFS2_AST_INVALID;
+	spin_unlock_irqrestore(&lockres->l_lock, irq_flags);
+
+	wake_up(&lockres->l_event);
+
+	mlog_exit_void();
+}
+
+/*
+ * Create (attach) the DLM lock for @lockres at @level.
+ *
+ * Note: If the lock is already attached, or we detect another process
+ * working on it (i.e., OCFS2_LOCK_BUSY), we bail out returning 0
+ * without calling the dlm. It's up to the caller to do the right
+ * thing in that case.
+ *
+ * Otherwise the lockres is marked BUSY with a pending ATTACH action
+ * and dlmlock() is called.  Returns 0 when dlmlock() reports
+ * DLM_NORMAL and -EINVAL for any other status, after the busy state
+ * has been rolled back.
+ */
+static int ocfs2_lock_create(struct ocfs2_super *osb,
+			     struct ocfs2_lock_res *lockres,
+			     int level,
+			     int dlm_flags)
+{
+	int ret = 0;
+	enum dlm_status status;
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level,
+	     dlm_flags);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) ||
+	    (lockres->l_flags & OCFS2_LOCK_BUSY)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		goto bail;
+	}
+
+	lockres->l_action = OCFS2_AST_ATTACH;
+	lockres->l_requested = level;
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	status = dlmlock(osb->dlm,
+			 level,
+			 &lockres->l_lksb,
+			 dlm_flags,
+			 lockres->l_name,
+			 lockres->l_ops->ast,
+			 lockres,
+			 lockres->l_ops->bast);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmlock", status, lockres);
+		ret = -EINVAL;
+		ocfs2_recover_from_dlm_error(lockres, 1);
+		/* Do not fall through to the success message below. */
+		goto bail;
+	}
+
+	mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name);
+
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+/* Sample l_flags under l_lock and return the bits selected by @flag
+ * (zero when none of them is set). */
+static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres,
+					int flag)
+{
+	unsigned long irq_flags;
+	int masked;
+
+	spin_lock_irqsave(&lockres->l_lock, irq_flags);
+	masked = lockres->l_flags & flag;
+	spin_unlock_irqrestore(&lockres->l_lock, irq_flags);
+
+	return masked;
+}
+
+/* Sleep on l_event until OCFS2_LOCK_BUSY is no longer set. */
+static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres)
+{
+	wait_event(lockres->l_event,
+		   ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY) == 0);
+}
+
+/* Sleep on l_event until OCFS2_LOCK_REFRESHING is no longer set. */
+static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres)
+{
+	wait_event(lockres->l_event,
+		   ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING) == 0);
+}
+
+/* Predict the level we will drop to on behalf of another node and
+ * report whether @wanted is still compatible with it.  Only valid
+ * while the lock is flagged OCFS2_LOCK_BLOCKED. */
+static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
+						     int wanted)
+{
+	int predicted;
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+
+	predicted = ocfs2_highest_compat_lock_level(lockres->l_blocking);
+
+	return wanted <= predicted;
+}
+
+/* Prepare a mask waiter for use: fresh completion, unlinked list
+ * item. */
+static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
+{
+	init_completion(&mw->mw_complete);
+	INIT_LIST_HEAD(&mw->mw_item);
+}
+
+/* Block until @mw is completed, re-arm it, and return its
+ * mw_status. */
+static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
+{
+	int status;
+
+	wait_for_completion(&mw->mw_complete);
+
+	/* Reset the completion so the same waiter can be reused. */
+	INIT_COMPLETION(mw->mw_complete);
+
+	status = mw->mw_status;
+	return status;
+}
+
+/*
+ * Queue @mw on @lockres so that it is completed once
+ * (l_flags & @mask) == @goal.  @mw must not already be queued and the
+ * caller must hold l_lock.
+ */
+static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
+				    struct ocfs2_mask_waiter *mw,
+				    unsigned long mask,
+				    unsigned long goal)
+{
+	BUG_ON(!list_empty(&mw->mw_item));
+
+	assert_spin_locked(&lockres->l_lock);
+
+	mw->mw_mask = mask;
+	mw->mw_goal = goal;
+	list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
+}
+
+/*
+ * Take @mw back off @lockres.  Returns 0 if the waiter had already
+ * been satisfied (or the flags match its goal right now), -EBUSY if
+ * it was still queued and the mask had not reached its goal.
+ */
+static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
+				      struct ocfs2_mask_waiter *mw)
+{
+	unsigned long irq_flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&lockres->l_lock, irq_flags);
+
+	/* Already dequeued: nothing to undo. */
+	if (list_empty(&mw->mw_item))
+		goto out_unlock;
+
+	if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
+		ret = -EBUSY;
+
+	list_del_init(&mw->mw_item);
+	init_completion(&mw->mw_complete);
+
+out_unlock:
+	spin_unlock_irqrestore(&lockres->l_lock, irq_flags);
+
+	return ret;
+}
+
+/*
+ * Take @lockres at @level for the caller, bumping the matching holder
+ * count on success.
+ *
+ * The function loops through the "again" label until the lock is held
+ * at a sufficient level.  On each pass it:
+ *   - waits for OCFS2_LOCK_BUSY to clear if a higher level is needed,
+ *   - creates the lock at NL if it is not attached yet,
+ *   - waits for OCFS2_LOCK_BLOCKED to clear if @level is not
+ *     compatible with the pending downconvert,
+ *   - issues a dlmlock() convert if @level is above the granted level.
+ *
+ * @lkm_flags is passed on to dlmlock() for the convert.  @arg_flags
+ * may contain OCFS2_LOCK_NONBLOCK, in which case the function backs
+ * off instead of sleeping on a busy or blocked lock.
+ *
+ * Returns 0 on success, -ERESTARTSYS if a signal is pending (signals
+ * are ignored with OCFS2_MOUNT_NOINTR and after a convert has been
+ * sent), -EAGAIN for a LKM_NOQUEUE convert that was not queued or a
+ * NONBLOCK request that would have had to wait, -EINVAL for other
+ * dlmlock() failures, or the error from ocfs2_lock_create().
+ */
+static int ocfs2_cluster_lock(struct ocfs2_super *osb,
+			      struct ocfs2_lock_res *lockres,
+			      int level,
+			      int lkm_flags,
+			      int arg_flags)
+{
+	struct ocfs2_mask_waiter mw;
+	enum dlm_status status;
+	int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
+	int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	ocfs2_init_mask_waiter(&mw);
+
+again:
+	wait = 0;
+
+	if (catch_signals && signal_pending(current)) {
+		ret = -ERESTARTSYS;
+		goto out;
+	}
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
+			"Cluster lock called on freeing lockres %s! flags "
+			"0x%lx\n", lockres->l_name, lockres->l_flags);
+
+	/* We only compare against the currently granted level
+	 * here. If the lock is blocked waiting on a downconvert,
+	 * we'll get caught below. */
+	if (lockres->l_flags & OCFS2_LOCK_BUSY &&
+	    level > lockres->l_level) {
+		/* is someone sitting in dlm_lock? If so, wait on
+		 * them. */
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+		wait = 1;
+		goto unlock;
+	}
+
+	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+		/* lock has not been created yet. */
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		goto again;
+	}
+
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
+	    !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
+		/* the lock is currently blocked on behalf of another
+		 * node; wait for that to clear */
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
+		wait = 1;
+		goto unlock;
+	}
+
+	if (level > lockres->l_level) {
+		if (lockres->l_action != OCFS2_AST_INVALID)
+			mlog(ML_ERROR, "lockres %s has action %u pending\n",
+			     lockres->l_name, lockres->l_action);
+
+		lockres->l_action = OCFS2_AST_CONVERT;
+		lockres->l_requested = level;
+		lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		BUG_ON(level == LKM_IVMODE);
+		BUG_ON(level == LKM_NLMODE);
+
+		mlog(0, "lock %s, convert from %d to level = %d\n",
+		     lockres->l_name, lockres->l_level, level);
+
+		/* call dlm_lock to upgrade lock now */
+		status = dlmlock(osb->dlm,
+				 level,
+				 &lockres->l_lksb,
+				 lkm_flags|LKM_CONVERT|LKM_VALBLK,
+				 lockres->l_name,
+				 lockres->l_ops->ast,
+				 lockres,
+				 lockres->l_ops->bast);
+		if (status != DLM_NORMAL) {
+			if ((lkm_flags & LKM_NOQUEUE) &&
+			    (status == DLM_NOTQUEUED))
+				ret = -EAGAIN;
+			else {
+				ocfs2_log_dlm_error("dlmlock", status,
+						    lockres);
+				ret = -EINVAL;
+			}
+			ocfs2_recover_from_dlm_error(lockres, 1);
+			goto out;
+		}
+
+		mlog(0, "lock %s, successfull return from dlmlock\n",
+		     lockres->l_name);
+
+		/* At this point we've gone inside the dlm and need to
+		 * complete our work regardless. */
+		catch_signals = 0;
+
+		/* wait for busy to clear and carry on */
+		goto again;
+	}
+
+	/* Ok, if we get here then we're good to go. */
+	ocfs2_inc_holders(lockres, level);
+
+	ret = 0;
+unlock:
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+out:
+	/*
+	 * This is helping work around a lock inversion between the page lock
+	 * and dlm locks.  One path holds the page lock while calling aops
+	 * which block acquiring dlm locks.  The voting thread holds dlm
+	 * locks while acquiring page locks while down converting data locks.
+	 * This block is helping an aop path notice the inversion and back
+	 * off to unlock its page lock before trying the dlm lock again.
+	 *
+	 * If the waiter turns out to be satisfied already we simply retry;
+	 * otherwise the caller gets -EAGAIN.
+	 */
+	if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
+	    mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
+		wait = 0;
+		if (lockres_remove_mask_waiter(lockres, &mw))
+			ret = -EAGAIN;
+		else
+			goto again;
+	}
+	/* Sleep until the queued mask waiter is completed, then retry. */
+	if (wait) {
+		ret = ocfs2_wait_for_mask(&mw);
+		if (ret == 0)
+			goto again;
+		mlog_errno(ret);
+	}
+
+	mlog_exit(ret);
+	return ret;
+}
+
+/* Release one hold on @lockres at @level and, still under l_lock,
+ * let ocfs2_vote_on_unlock() decide whether the vote thread needs a
+ * kick. */
+static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
+				 struct ocfs2_lock_res *lockres,
+				 int level)
+{
+	unsigned long irq_flags;
+
+	mlog_entry_void();
+
+	spin_lock_irqsave(&lockres->l_lock, irq_flags);
+	ocfs2_dec_holders(lockres, level);
+	ocfs2_vote_on_unlock(osb, lockres);
+	spin_unlock_irqrestore(&lockres->l_lock, irq_flags);
+
+	mlog_exit_void();
+}
+
+/* Flag @lockres as OCFS2_LOCK_LOCAL (it must not be attached yet) and
+ * create it at EX with LKM_LOCAL.  Returns ocfs2_lock_create()'s
+ * result. */
+static int ocfs2_create_new_inode_lock(struct inode *inode,
+				       struct ocfs2_lock_res *lockres)
+{
+	unsigned long irq_flags;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	spin_lock_irqsave(&lockres->l_lock, irq_flags);
+	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
+	lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
+	spin_unlock_irqrestore(&lockres->l_lock, irq_flags);
+
+	return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL);
+}
+
+/*
+ * Create the RW, META and DATA lock resources of a brand new inode at
+ * EX, skipping the normal cluster directory lookup.  Use this ONLY on
+ * newly created inodes which other nodes can't possibly see and which
+ * haven't been hashed in the inode hash yet.  Skipping the network
+ * broadcast normally associated with creating a new lock resource can
+ * be a good performance win.
+ *
+ * Returns 0 on success or the first error from
+ * ocfs2_create_new_inode_lock(); later locks are not attempted after
+ * a failure.
+ */
+int ocfs2_create_new_inode_locks(struct inode *inode)
+{
+	int ret;
+
+	BUG_ON(!inode);
+	BUG_ON(!ocfs2_inode_is_new(inode));
+
+	mlog_entry_void();
+
+	mlog(0, "Inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	/* NOTE: no holder counts are incremented and nothing is added
+	 * to a journal handle.  The cluster doesn't know about this
+	 * inode yet, so there is no need to.  For LVB purposes this
+	 * is like acquiring an EX lock on a resource with an invalid
+	 * LVB -- it gets set valid when the EX is released. */
+
+	ret = ocfs2_create_new_inode_lock(inode,
+					  &OCFS2_I(inode)->ip_rw_lockres);
+	if (!ret)
+		ret = ocfs2_create_new_inode_lock(inode,
+					&OCFS2_I(inode)->ip_meta_lockres);
+	if (!ret)
+		ret = ocfs2_create_new_inode_lock(inode,
+					&OCFS2_I(inode)->ip_data_lockres);
+	if (ret)
+		mlog_errno(ret);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+/* Take the inode's RW cluster lock: EX when @write is set, PR
+ * otherwise.  Returns ocfs2_cluster_lock()'s result. */
+int ocfs2_rw_lock(struct inode *inode, int write)
+{
+	int status;
+	int level;
+
+	BUG_ON(!inode);
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" take %s RW lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	if (write)
+		level = LKM_EXMODE;
+	else
+		level = LKM_PRMODE;
+
+	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb),
+				    &OCFS2_I(inode)->ip_rw_lockres,
+				    level, 0, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Drop the inode's RW cluster lock taken at EX (@write set) or PR. */
+void ocfs2_rw_unlock(struct inode *inode, int write)
+{
+	struct ocfs2_lock_res *rw_res = &OCFS2_I(inode)->ip_rw_lockres;
+	int level;
+
+	if (write)
+		level = LKM_EXMODE;
+	else
+		level = LKM_PRMODE;
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" drop %s RW lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), rw_res, level);
+
+	mlog_exit_void();
+}
+
+/*
+ * Take the inode's DATA cluster lock: EX when @write is set, PR
+ * otherwise.  @arg_flags is handed to ocfs2_cluster_lock().
+ *
+ * On a hard read-only device no cluster lock is taken: reads succeed
+ * with 0 and writes fail with -EROFS.  -EAGAIN from the cluster lock
+ * is returned without being logged.
+ */
+int ocfs2_data_lock_full(struct inode *inode,
+			 int write,
+			 int arg_flags)
+{
+	int status = 0;
+
+	BUG_ON(!inode);
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" take %s DATA lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
+		/* A read-only data lock is faked for ro devices. */
+		if (write) {
+			status = -EROFS;
+			mlog_errno(status);
+		}
+	} else {
+		status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb),
+					    &OCFS2_I(inode)->ip_data_lockres,
+					    write ? LKM_EXMODE : LKM_PRMODE,
+					    0, arg_flags);
+		if (status < 0 && status != -EAGAIN)
+			mlog_errno(status);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Non-blocking data lock for callers holding @page locked; see
+ * ocfs2_meta_lock_with_page().  If the lock would block (-EAGAIN),
+ * the page is unlocked, the data lock is taken and dropped again in
+ * blocking mode, and AOP_TRUNCATED_PAGE is returned.  Any other
+ * result of ocfs2_data_lock_full() is returned unchanged.
+ */
+int ocfs2_data_lock_with_page(struct inode *inode,
+			      int write,
+			      struct page *page)
+{
+	int ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
+
+	if (ret != -EAGAIN)
+		return ret;
+
+	unlock_page(page);
+	if (ocfs2_data_lock(inode, write) == 0)
+		ocfs2_data_unlock(inode, write);
+
+	return AOP_TRUNCATED_PAGE;
+}
+
+/*
+ * If another node is known to be waiting on this lock, kick the vote
+ * thread pre-emptively as soon as a release condition is reached:
+ * no holders at all when blocking EX, no EX holders when blocking PR.
+ * Any other blocking level on a BLOCKED lock is a bug.
+ */
+static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
+				 struct ocfs2_lock_res *lockres)
+{
+	int released = 0;
+
+	mlog_entry_void();
+
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
+		if (lockres->l_blocking == LKM_EXMODE) {
+			if (!lockres->l_ex_holders && !lockres->l_ro_holders)
+				released = 1;
+		} else if (lockres->l_blocking == LKM_PRMODE) {
+			if (!lockres->l_ex_holders)
+				released = 1;
+		} else {
+			BUG();
+		}
+	}
+
+	if (released)
+		ocfs2_kick_vote_thread(osb);
+
+	mlog_exit_void();
+}
+
+/* Drop the inode's DATA cluster lock taken at EX (@write set) or PR.
+ * Nothing is released on a hard read-only device. */
+void ocfs2_data_unlock(struct inode *inode,
+		       int write)
+{
+	struct ocfs2_lock_res *data_res = &OCFS2_I(inode)->ip_data_lockres;
+	int level;
+
+	if (write)
+		level = LKM_EXMODE;
+	else
+		level = LKM_PRMODE;
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" drop %s DATA lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
+		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), data_res, level);
+
+	mlog_exit_void();
+}
+
+/* Layout of a packed 64-bit timestamp: seconds occupy the top
+ * OCFS2_SEC_BITS bits, nanoseconds the low OCFS2_SEC_SHIFT bits.  The
+ * shift is derived from the bit count so the two cannot drift apart. */
+#define OCFS2_SEC_BITS   34
+#define OCFS2_SEC_SHIFT  (64 - OCFS2_SEC_BITS)
+#define OCFS2_NSEC_MASK  ((1ULL << OCFS2_SEC_SHIFT) - 1)
+
+/* The LVB only has room for 64 bits of time, so pack @spec: seconds
+ * are shifted up by OCFS2_SEC_SHIFT and the nanoseconds, masked with
+ * OCFS2_NSEC_MASK, fill the low bits. */
+static u64 ocfs2_pack_timespec(struct timespec *spec)
+{
+	u64 secs = spec->tv_sec;
+	u32 nsecs = spec->tv_nsec;
+
+	return (secs << OCFS2_SEC_SHIFT) | (nsecs & OCFS2_NSEC_MASK);
+}
+
+/*
+ * Copy the inode's current attributes into the meta lock's LVB in
+ * big-endian form, stamped with OCFS2_LVB_VERSION.
+ *
+ * Call this with the lockres locked.  ip_lock is believed to be
+ * unnecessary here, since anyone who would change these values is
+ * supposed to be blocked in ocfs2_meta_lock right now.
+ */
+static void __ocfs2_stuff_meta_lvb(struct inode *inode)
+{
+	struct ocfs2_inode_info *info = OCFS2_I(inode);
+	struct ocfs2_lock_res *meta_res = &info->ip_meta_lockres;
+	struct ocfs2_meta_lvb *lvb;
+	u64 atime, ctime, mtime;
+
+	mlog_entry_void();
+
+	lvb = (struct ocfs2_meta_lvb *) meta_res->l_lksb.lvb;
+
+	lvb->lvb_version = cpu_to_be32(OCFS2_LVB_VERSION);
+
+	/* Size and allocation. */
+	lvb->lvb_isize = cpu_to_be64(i_size_read(inode));
+	lvb->lvb_iclusters = cpu_to_be32(info->ip_clusters);
+
+	/* Ownership, mode and link count. */
+	lvb->lvb_iuid = cpu_to_be32(inode->i_uid);
+	lvb->lvb_igid = cpu_to_be32(inode->i_gid);
+	lvb->lvb_imode = cpu_to_be16(inode->i_mode);
+	lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
+
+	/* Timestamps, each packed into 64 bits. */
+	atime = ocfs2_pack_timespec(&inode->i_atime);
+	lvb->lvb_iatime_packed = cpu_to_be64(atime);
+	ctime = ocfs2_pack_timespec(&inode->i_ctime);
+	lvb->lvb_ictime_packed = cpu_to_be64(ctime);
+	mtime = ocfs2_pack_timespec(&inode->i_mtime);
+	lvb->lvb_imtime_packed = cpu_to_be64(mtime);
+
+	mlog_meta_lvb(0, meta_res);
+
+	mlog_exit_void();
+}
+
+/* Split a packed 64-bit time back into @spec: the bits above
+ * OCFS2_SEC_SHIFT are the seconds, the bits under OCFS2_NSEC_MASK the
+ * nanoseconds. */
+static void ocfs2_unpack_timespec(struct timespec *spec,
+				  u64 packed_time)
+{
+	spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
+	spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
+}
+
+/*
+ * Update the in-memory inode from the meta lock's LVB: cluster count,
+ * size, block count, ownership, mode, link count and timestamps.  The
+ * update is done under ip_lock.
+ */
+static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
+{
+	struct ocfs2_inode_info *info = OCFS2_I(inode);
+	struct ocfs2_lock_res *meta_res = &info->ip_meta_lockres;
+	struct ocfs2_meta_lvb *lvb;
+
+	mlog_entry_void();
+
+	mlog_meta_lvb(0, meta_res);
+
+	lvb = (struct ocfs2_meta_lvb *) meta_res->l_lksb.lvb;
+
+	/* The lockres lock is not needed for this... */
+	spin_lock(&info->ip_lock);
+
+	info->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
+	i_size_write(inode, be64_to_cpu(lvb->lvb_isize));
+
+	/* Fast symlinks (no clusters allocated) use no blocks; this
+	 * check looks at the mode from before the refresh below. */
+	if (!S_ISLNK(inode->i_mode) || info->ip_clusters)
+		inode->i_blocks =
+			ocfs2_align_bytes_to_sectors(i_size_read(inode));
+	else
+		inode->i_blocks = 0;
+
+	inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
+	inode->i_gid = be32_to_cpu(lvb->lvb_igid);
+	inode->i_mode = be16_to_cpu(lvb->lvb_imode);
+	inode->i_nlink = be16_to_cpu(lvb->lvb_inlink);
+
+	ocfs2_unpack_timespec(&inode->i_atime,
+			      be64_to_cpu(lvb->lvb_iatime_packed));
+	ocfs2_unpack_timespec(&inode->i_mtime,
+			      be64_to_cpu(lvb->lvb_imtime_packed));
+	ocfs2_unpack_timespec(&inode->i_ctime,
+			      be64_to_cpu(lvb->lvb_ictime_packed));
+
+	spin_unlock(&info->ip_lock);
+
+	mlog_exit_void();
+}
+
+/* Return 1 if the LVB of @lockres carries OCFS2_LVB_VERSION, else
+ * 0. */
+static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_meta_lvb *lvb;
+
+	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+
+	return be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION;
+}
+
+/*
+ * Decide whether @lockres needs refreshing and pick the one caller
+ * that does it.
+ *
+ *   0 means no refresh is needed.
+ *
+ *   > 0 means the caller has to refresh and MUST call
+ *   ocfs2_complete_lock_res_refresh afterwards.
+ *
+ * If someone else is refreshing already, wait for them and check
+ * again.
+ */
+static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
+{
+	unsigned long irq_flags;
+	int status = 0;
+
+	mlog_entry_void();
+
+	for (;;) {
+		spin_lock_irqsave(&lockres->l_lock, irq_flags);
+
+		if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
+			spin_unlock_irqrestore(&lockres->l_lock, irq_flags);
+			break;
+		}
+
+		if (!(lockres->l_flags & OCFS2_LOCK_REFRESHING)) {
+			/* Nobody has claimed the refresh; it is ours. */
+			lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
+			spin_unlock_irqrestore(&lockres->l_lock, irq_flags);
+			status = 1;
+			break;
+		}
+
+		spin_unlock_irqrestore(&lockres->l_lock, irq_flags);
+		ocfs2_wait_on_refreshing_lock(lockres);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Finish a refresh: REFRESHING is always cleared, NEEDS_REFRESH only
+ * when @status is zero.  Waiters on l_event are woken afterwards. */
+static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
+						   int status)
+{
+	unsigned long irq_flags;
+
+	mlog_entry_void();
+
+	spin_lock_irqsave(&lockres->l_lock, irq_flags);
+	lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
+	if (status == 0)
+		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
+	spin_unlock_irqrestore(&lockres->l_lock, irq_flags);
+
+	wake_up(&lockres->l_event);
+
+	mlog_exit_void();
+}
+
+/* *bh is filled in only if the refresh had to read the inode from disk. */
+static int ocfs2_meta_lock_update(struct inode *inode,
+				  struct buffer_head **bh)
+{
+	int status = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_dinode *fe;
+
+	mlog_entry_void();
+
+	spin_lock(&oi->ip_lock);
+	if (oi->ip_flags & OCFS2_INODE_DELETED) {
+		mlog(0, "Orphaned inode %"MLFu64" was deleted while we "
+		     "were waiting on a lock. ip_flags = 0x%x\n",
+		     oi->ip_blkno, oi->ip_flags);
+		spin_unlock(&oi->ip_lock);
+		status = -ENOENT;
+		goto bail;
+	}
+	spin_unlock(&oi->ip_lock);
+
+	lockres = &oi->ip_meta_lockres;
+
+	if (!ocfs2_should_refresh_lock_res(lockres))
+		goto bail;
+
+	/* This will discard any caching information we might have had
+	 * for the inode metadata. */
+	ocfs2_metadata_cache_purge(inode);
+
+	/* will do nothing for inode types that don't use the extent
+	 * map (directories, bitmap files, etc) */
+	ocfs2_extent_map_trunc(inode, 0);
+
+	if (ocfs2_meta_lvb_is_trustable(lockres)) {
+		mlog(0, "Trusting LVB on inode %"MLFu64"\n",
+		     oi->ip_blkno);
+		ocfs2_refresh_inode_from_lvb(inode);
+	} else {
+		/* Boo, we have to go to disk. */
+		/* read bh, cast, ocfs2_refresh_inode */
+		status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
+					  bh, OCFS2_BH_CACHED, inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail_refresh;
+		}
+		fe = (struct ocfs2_dinode *) (*bh)->b_data;
+
+		/* This is a good chance to make sure we're not
+		 * locking an invalid object.
+		 *
+		 * We bug on a stale inode here because we checked
+		 * above whether it was wiped from disk. The wiping
+		 * node provides a guarantee that we receive that
+		 * message and can mark the inode before dropping any
+		 * locks associated with it. */
+		if (!OCFS2_IS_VALID_DINODE(fe)) {
+			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+			status = -EIO;
+			goto bail_refresh;
+		}
+		mlog_bug_on_msg(inode->i_generation !=
+				le32_to_cpu(fe->i_generation),
+				"Invalid dinode %"MLFu64" disk generation: %u "
+				"inode->i_generation: %u\n",
+				oi->ip_blkno, le32_to_cpu(fe->i_generation),
+				inode->i_generation);
+		mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
+				!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
+				"Stale dinode %"MLFu64" dtime: %"MLFu64" "
+				"flags: 0x%x\n", oi->ip_blkno,
+				le64_to_cpu(fe->i_dtime),
+				le32_to_cpu(fe->i_flags));
+
+		ocfs2_refresh_inode(inode, fe);
+	}
+
+	status = 0;
+bail_refresh:
+	ocfs2_complete_lock_res_refresh(lockres, status);
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_assign_bh(struct inode *inode,
+			   struct buffer_head **ret_bh,
+			   struct buffer_head *passed_bh)
+{
+	int status;
+
+	if (passed_bh) {
+		/* Ok, the update went to disk for us, use the
+		 * returned bh. */
+		*ret_bh = passed_bh;
+		get_bh(*ret_bh);
+
+		return 0;
+	}
+
+	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				  OCFS2_I(inode)->ip_blkno,
+				  ret_bh,
+				  OCFS2_BH_CACHED,
+				  inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+/*
+ * Takes the cluster meta lock on inode (EX if ex, else PR), refreshing the
+ * inode if needed. Returns 0, or < 0 with the lock and any *ret_bh released.
+ */
+int ocfs2_meta_lock_full(struct inode *inode,
+			 struct ocfs2_journal_handle *handle,
+			 struct buffer_head **ret_bh,
+			 int ex,
+			 int arg_flags)
+{
+	int status, level, dlm_flags, acquired;
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_super *osb;
+	struct buffer_head *local_bh = NULL;
+
+	BUG_ON(!inode);
+	osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64", take %s META lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     ex ? "EXMODE" : "PRMODE");
+
+	status = 0;
+	acquired = 0;
+	/* We'll allow faking a readonly metadata lock for
+	 * rodevices. */
+	if (ocfs2_is_hard_readonly(osb)) {
+		if (ex)
+			status = -EROFS;
+		goto bail;
+	}
+
+	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
+		wait_event(osb->recovery_event,
+			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+
+	lockres = &OCFS2_I(inode)->ip_meta_lockres;
+	level = ex ? LKM_EXMODE : LKM_PRMODE;
+	dlm_flags = 0;
+	if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
+		dlm_flags |= LKM_NOQUEUE;
+
+	status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
+	if (status < 0) {
+		if (status != -EAGAIN && status != -EIOCBRETRY)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	/* Notify the error cleanup path to drop the cluster lock. */
+	acquired = 1;
+
+	/* We wait twice because a node may have died while we were in
+	 * the lower dlm layers. The second time though, we've
+	 * committed to owning this lock so we don't allow signals to
+	 * abort the operation. */
+	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
+		wait_event(osb->recovery_event,
+			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+
+	/* This is fun. The caller may want a bh back, or it may
+	 * not. ocfs2_meta_lock_update definitely wants one in, but
+	 * may or may not read one, depending on what's in the
+	 * LVB. The result of all of this is that we've *only* gone to
+	 * disk if we have to, so the complexity is worthwhile. */
+	status = ocfs2_meta_lock_update(inode, &local_bh);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	if (ret_bh) {
+		status = ocfs2_assign_bh(inode, ret_bh, local_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	if (handle) {
+		status = ocfs2_handle_add_lock(handle, inode);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+bail:
+	if (status < 0) {
+		if (ret_bh && (*ret_bh)) {
+			brelse(*ret_bh);
+			*ret_bh = NULL;
+		}
+		if (acquired)
+			ocfs2_meta_unlock(inode, ex);
+	}
+
+	if (local_bh)
+		brelse(local_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * This is working around a lock inversion between tasks acquiring DLM locks
+ * while holding a page lock and the vote thread, which blocks DLM lock
+ * acquisition while acquiring page locks.
+ *
+ * ** These _with_page variants are only intended to be called from aop
+ * methods that hold page locks and return a very specific *positive* error
+ * code that aop methods pass up to the VFS -- test for errors with != 0. **
+ *
+ * The DLM is called such that it returns -EAGAIN if it would have blocked
+ * waiting for the vote thread.  In that case we unlock our page so the vote
+ * thread can make progress.  Once we've done this we have to return
+ * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
+ * into the VFS, which will then immediately retry the aop call.
+ *
+ * We do a blocking lock and immediate unlock before returning, though, so that
+ * the lock has a great chance of being cached on this node by the time the VFS
+ * calls back to retry the aop.  This has a potential to livelock as nodes
+ * ping locks back and forth, but that's a risk we're willing to take to avoid
+ * the lock inversion simply.
+ */
+int ocfs2_meta_lock_with_page(struct inode *inode,
+			      struct ocfs2_journal_handle *handle,
+			      struct buffer_head **ret_bh,
+			      int ex,
+			      struct page *page)
+{
+	int ret;
+
+	ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex,
+				   OCFS2_LOCK_NONBLOCK);
+	if (ret == -EAGAIN) {
+		unlock_page(page);
+		if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0)
+			ocfs2_meta_unlock(inode, ex);
+		ret = AOP_TRUNCATED_PAGE;
+	}
+
+	return ret;
+}
+
+void ocfs2_meta_unlock(struct inode *inode,
+		       int ex)
+{
+	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
+
+	mlog_entry_void();
+
+	mlog(0, "inode %"MLFu64" drop %s META lock\n",
+	     OCFS2_I(inode)->ip_blkno,
+	     ex ? "EXMODE" : "PRMODE");
+
+	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
+		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+
+	mlog_exit_void();
+}
+
+int ocfs2_super_lock(struct ocfs2_super *osb,
+		     int ex)
+{
+	int status;
+	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
+	struct buffer_head *bh;
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	mlog_entry_void();
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* The super block lock path is really in the best position to
+	 * know when resources covered by the lock need to be
+	 * refreshed, so we do it here. Of course, making sense of
+	 * everything is up to the caller :) */
+	status = ocfs2_should_refresh_lock_res(lockres);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	if (status) {
+		bh = si->si_bh;
+		status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
+					  si->si_inode);
+		if (status == 0)
+			ocfs2_update_slot_info(si);
+
+		ocfs2_complete_lock_res_refresh(lockres, status);
+
+		if (status < 0)
+			mlog_errno(status);
+	}
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_super_unlock(struct ocfs2_super *osb,
+			int ex)
+{
+	int level = ex ? LKM_EXMODE : LKM_PRMODE;
+	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
+
+	ocfs2_cluster_unlock(osb, lockres, level);
+}
+
+int ocfs2_rename_lock(struct ocfs2_super *osb)
+{
+	int status;
+	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+void ocfs2_rename_unlock(struct ocfs2_super *osb)
+{
+	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
+
+	ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
+}
+
+/* Reference counting of the dlm debug structure. We want this because
+ * open references on the debug inodes can live on after a mount, so
+ * we can't rely on the ocfs2_super to always exist. */
+static void ocfs2_dlm_debug_free(struct kref *kref)
+{
+	struct ocfs2_dlm_debug *dlm_debug;
+
+	dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt);
+
+	kfree(dlm_debug);
+}
+
+void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug)
+{
+	if (dlm_debug)
+		kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free);
+}
+
+static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug)
+{
+	kref_get(&debug->d_refcnt);
+}
+
+struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
+{
+	struct ocfs2_dlm_debug *dlm_debug;
+
+	dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL);
+	if (!dlm_debug) {
+		mlog_errno(-ENOMEM);
+		goto out;
+	}
+
+	kref_init(&dlm_debug->d_refcnt);
+	INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
+	dlm_debug->d_locking_state = NULL;
+out:
+	return dlm_debug;
+}
+
+/* Access to this is arbitrated for us via seq_file->sem. */
+struct ocfs2_dlm_seq_priv {
+	struct ocfs2_dlm_debug *p_dlm_debug;
+	struct ocfs2_lock_res p_iter_res;
+	struct ocfs2_lock_res p_tmp_res;
+};
+
+static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start,
+						 struct ocfs2_dlm_seq_priv *priv)
+{
+	struct ocfs2_lock_res *iter, *ret = NULL;
+	struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug;
+
+	assert_spin_locked(&ocfs2_dlm_tracking_lock);
+
+	list_for_each_entry(iter, &start->l_debug_list, l_debug_list) {
+		/* discover the head of the list */
+		if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) {
+			mlog(0, "End of list found, %p\n", ret);
+			break;
+		}
+
+		/* We track our "dummy" iteration lockres' by a NULL
+		 * l_ops field. */
+		if (iter->l_ops != NULL) {
+			ret = iter;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos)
+{
+	struct ocfs2_dlm_seq_priv *priv = m->private;
+	struct ocfs2_lock_res *iter;
+
+	spin_lock(&ocfs2_dlm_tracking_lock);
+	iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv);
+	if (iter) {
+		/* Since lockres' have the lifetime of their container
+		 * (which can be inodes, ocfs2_supers, etc) we want to
+		 * copy this out to a temporary lockres while still
+		 * under the spinlock. Obviously after this we can't
+		 * trust any pointers on the copy returned, but that's
+		 * ok as the information we want isn't typically held
+		 * in them. */
+		priv->p_tmp_res = *iter;
+		iter = &priv->p_tmp_res;
+	}
+	spin_unlock(&ocfs2_dlm_tracking_lock);
+
+	return iter;
+}
+
+static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ocfs2_dlm_seq_priv *priv = m->private;
+	struct ocfs2_lock_res *iter = v;
+	struct ocfs2_lock_res *dummy = &priv->p_iter_res;
+
+	spin_lock(&ocfs2_dlm_tracking_lock);
+	iter = ocfs2_dlm_next_res(iter, priv);
+	list_del_init(&dummy->l_debug_list);
+	if (iter) {
+		list_add(&dummy->l_debug_list, &iter->l_debug_list);
+		priv->p_tmp_res = *iter;
+		iter = &priv->p_tmp_res;
+	}
+	spin_unlock(&ocfs2_dlm_tracking_lock);
+
+	return iter;
+}
+
+/* So that debugfs.ocfs2 can determine which format is being used */
+#define OCFS2_DLM_DEBUG_STR_VERSION 1
+static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
+{
+	int i;
+	char *lvb;
+	struct ocfs2_lock_res *lockres = v;
+
+	if (!lockres)
+		return -EINVAL;
+
+	seq_printf(m, "0x%x\t"
+		   "%.*s\t"
+		   "%d\t"
+		   "0x%lx\t"
+		   "0x%x\t"
+		   "0x%x\t"
+		   "%u\t"
+		   "%u\t"
+		   "%d\t"
+		   "%d\t",
+		   OCFS2_DLM_DEBUG_STR_VERSION,
+		   OCFS2_LOCK_ID_MAX_LEN, lockres->l_name,
+		   lockres->l_level,
+		   lockres->l_flags,
+		   lockres->l_action,
+		   lockres->l_unlock_action,
+		   lockres->l_ro_holders,
+		   lockres->l_ex_holders,
+		   lockres->l_requested,
+		   lockres->l_blocking);
+
+	/* Dump the raw LVB */
+	lvb = lockres->l_lksb.lvb;
+	for(i = 0; i < DLM_LVB_LEN; i++)
+		seq_printf(m, "0x%x\t", lvb[i]);
+
+	/* End the line */
+	seq_printf(m, "\n");
+	return 0;
+}
+
+static struct seq_operations ocfs2_dlm_seq_ops = {
+	.start =	ocfs2_dlm_seq_start,
+	.stop =		ocfs2_dlm_seq_stop,
+	.next =		ocfs2_dlm_seq_next,
+	.show =		ocfs2_dlm_seq_show,
+};
+
+static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = (struct seq_file *) file->private_data;
+	struct ocfs2_dlm_seq_priv *priv = seq->private;
+	struct ocfs2_lock_res *res = &priv->p_iter_res;
+
+	ocfs2_remove_lockres_tracking(res);
+	ocfs2_put_dlm_debug(priv->p_dlm_debug);
+	return seq_release_private(inode, file);
+}
+
+/* open() for the debugfs locking_state file. The dlm_debug reference taken
+ * here is dropped again if seq_open() fails, since release() won't run. */
+static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
+{
+	int ret;
+	struct ocfs2_dlm_seq_priv *priv;
+	struct seq_file *seq;
+	struct ocfs2_super *osb;
+
+	priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
+	if (!priv) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+	osb = (struct ocfs2_super *) inode->u.generic_ip;
+	ocfs2_get_dlm_debug(osb->osb_dlm_debug);
+	priv->p_dlm_debug = osb->osb_dlm_debug;
+	INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
+
+	ret = seq_open(file, &ocfs2_dlm_seq_ops);
+	if (ret) {
+		ocfs2_put_dlm_debug(priv->p_dlm_debug);
+		kfree(priv);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	seq = (struct seq_file *) file->private_data;
+	seq->private = priv;
+	ocfs2_add_lockres_tracking(&priv->p_iter_res, priv->p_dlm_debug);
+out:
+	return ret;
+}
+
+static struct file_operations ocfs2_dlm_debug_fops = {
+	.open =		ocfs2_dlm_debug_open,
+	.release =	ocfs2_dlm_debug_release,
+	.read =		seq_read,
+	.llseek =	seq_lseek,
+};
+
+static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
+{
+	int ret = 0;
+	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
+
+	dlm_debug->d_locking_state = debugfs_create_file("locking_state",
+							 S_IFREG|S_IRUSR,
+							 osb->osb_debug_root,
+							 osb,
+							 &ocfs2_dlm_debug_fops);
+	if (!dlm_debug->d_locking_state) {
+		ret = -EINVAL;
+		mlog(ML_ERROR,
+		     "Unable to create locking state debugfs file.\n");
+		goto out;
+	}
+
+	ocfs2_get_dlm_debug(dlm_debug);
+out:
+	return ret;
+}
+
+static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
+{
+	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
+
+	if (dlm_debug) {
+		debugfs_remove(dlm_debug->d_locking_state);
+		ocfs2_put_dlm_debug(dlm_debug);
+	}
+}
+
+int ocfs2_dlm_init(struct ocfs2_super *osb)
+{
+	int status;
+	u32 dlm_key;
+	struct dlm_ctxt *dlm;
+
+	mlog_entry_void();
+
+	status = ocfs2_dlm_init_debug(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* launch vote thread */
+	osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote-%d",
+				     osb->osb_id);
+	if (IS_ERR(osb->vote_task)) {
+		status = PTR_ERR(osb->vote_task);
+		osb->vote_task = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* used by the dlm code to make message headers unique, each
+	 * node in this domain must agree on this. */
+	dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
+
+	/* for now, uuid == domain */
+	dlm = dlm_register_domain(osb->uuid_str, dlm_key);
+	if (IS_ERR(dlm)) {
+		status = PTR_ERR(dlm);
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
+	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
+
+	dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);
+
+	osb->dlm = dlm;
+
+	status = 0;
+bail:
+	if (status < 0) {
+		ocfs2_dlm_shutdown_debug(osb);
+		if (osb->vote_task)
+			kthread_stop(osb->vote_task);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
+{
+	mlog_entry_void();
+
+	dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
+
+	ocfs2_drop_osb_locks(osb);
+
+	if (osb->vote_task) {
+		kthread_stop(osb->vote_task);
+		osb->vote_task = NULL;
+	}
+
+	ocfs2_lock_res_free(&osb->osb_super_lockres);
+	ocfs2_lock_res_free(&osb->osb_rename_lockres);
+
+	dlm_unregister_domain(osb->dlm);
+	osb->dlm = NULL;
+
+	ocfs2_dlm_shutdown_debug(osb);
+
+	mlog_exit_void();
+}
+
+static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status)
+{
+	struct ocfs2_lock_res *lockres = opaque;
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
+	     lockres->l_unlock_action);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	/* We tried to cancel a convert request, but it was already
+	 * granted. All we want to do here is clear our unlock
+	 * state. The wake_up call done at the bottom is redundant
+	 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
+	 * hurt anything anyway */
+	if (status == DLM_CANCELGRANT &&
+	    lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
+		mlog(0, "Got cancelgrant for %s\n", lockres->l_name);
+
+		/* We don't clear the busy flag in this case as it
+		 * should have been cleared by the ast which the dlm
+		 * has called. */
+		goto complete_unlock;
+	}
+
+	if (status != DLM_NORMAL) {
+		mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
+		     "unlock_action %d\n", status, lockres->l_name,
+		     lockres->l_unlock_action);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		return;
+	}
+
+	switch(lockres->l_unlock_action) {
+	case OCFS2_UNLOCK_CANCEL_CONVERT:
+		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
+		lockres->l_action = OCFS2_AST_INVALID;
+		break;
+	case OCFS2_UNLOCK_DROP_LOCK:
+		lockres->l_level = LKM_IVMODE;
+		break;
+	default:
+		BUG();
+	}
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+complete_unlock:
+	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	wake_up(&lockres->l_event);
+
+	mlog_exit_void();
+}
+
+typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *);
+
+struct drop_lock_cb {
+	ocfs2_pre_drop_cb_t	*drop_func;
+	void			*drop_data;
+};
+
+static int ocfs2_drop_lock(struct ocfs2_super *osb,
+			   struct ocfs2_lock_res *lockres,
+			   struct drop_lock_cb *dcb)
+{
+	enum dlm_status status;
+	unsigned long flags;
+
+	/* We didn't get anywhere near actually using this lockres. */
+	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
+		goto out;
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
+			"lockres %s, flags 0x%lx\n",
+			lockres->l_name, lockres->l_flags);
+
+	while (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
+		     "%u, unlock_action = %u\n",
+		     lockres->l_name, lockres->l_flags, lockres->l_action,
+		     lockres->l_unlock_action);
+
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		/* XXX: Today we just wait on any busy
+		 * locks... Perhaps we need to cancel converts in the
+		 * future? */
+		ocfs2_wait_on_busy_lock(lockres);
+
+		spin_lock_irqsave(&lockres->l_lock, flags);
+	}
+
+	if (dcb)
+		dcb->drop_func(lockres, dcb->drop_data);
+
+	if (lockres->l_flags & OCFS2_LOCK_BUSY)
+		mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
+		     lockres->l_name);
+	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+		mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);
+
+	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		goto out;
+	}
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);
+
+	/* make sure we never get here while waiting for an ast to
+	 * fire. */
+	BUG_ON(lockres->l_action != OCFS2_AST_INVALID);
+
+	/* is this necessary? */
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+	lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	mlog(0, "lock %s\n", lockres->l_name);
+
+	status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK,
+			   lockres->l_ops->unlock_ast, lockres);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmunlock", status, lockres);
+		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
+		dlm_print_one_lock(lockres->l_lksb.lockid);
+		BUG();
+	}
+	mlog(0, "lock %s, successfull return from dlmunlock\n",
+	     lockres->l_name);
+
+	ocfs2_wait_on_busy_lock(lockres);
+out:
+	mlog_exit(0);
+	return 0;
+}
+
+/* Mark the lockres as being dropped. It will no longer be
+ * queued if blocking, but we still may have to wait on it
+ * being dequeued from the vote thread before we can consider
+ * it safe to drop.
+ *
+ * You can *not* attempt to call cluster_lock on this lockres anymore. */
+void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
+{
+	int status;
+	struct ocfs2_mask_waiter mw;
+	unsigned long flags;
+
+	ocfs2_init_mask_waiter(&mw);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	lockres->l_flags |= OCFS2_LOCK_FREEING;
+	while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
+		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+		mlog(0, "Waiting on lockres %s\n", lockres->l_name);
+
+		status = ocfs2_wait_for_mask(&mw);
+		if (status)
+			mlog_errno(status);
+
+		spin_lock_irqsave(&lockres->l_lock, flags);
+	}
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+}
+
+static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
+{
+	int status;
+
+	mlog_entry_void();
+
+	ocfs2_mark_lockres_freeing(&osb->osb_super_lockres);
+
+	status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres);
+
+	status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+}
+
+static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data)
+{
+	struct inode *inode = data;
+
+	/* the metadata lock requires a bit more work as we have an
+	 * LVB to worry about. */
+	if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
+	    lockres->l_level == LKM_EXMODE &&
+	    !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
+		__ocfs2_stuff_meta_lvb(inode);
+}
+
+int ocfs2_drop_inode_locks(struct inode *inode)
+{
+	int status, err;
+	struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, };
+
+	mlog_entry_void();
+
+	/* No need to call ocfs2_mark_lockres_freeing here -
+	 * ocfs2_clear_inode has done it for us. */
+
+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_data_lockres,
+			      NULL);
+	if (err < 0)
+		mlog_errno(err);
+
+	status = err;
+
+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_meta_lockres,
+			      &meta_dcb);
+	if (err < 0)
+		mlog_errno(err);
+	if (err < 0 && !status)
+		status = err;
+
+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_rw_lockres,
+			      NULL);
+	if (err < 0)
+		mlog_errno(err);
+	if (err < 0 && !status)
+		status = err;
+
+	mlog_exit(status);
+	return status;
+}
+
+static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+				      int new_level)
+{
+	assert_spin_locked(&lockres->l_lock);
+
+	BUG_ON(lockres->l_blocking <= LKM_NLMODE);
+
+	if (lockres->l_level <= new_level) {
+		mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
+		     lockres->l_level, new_level);
+		BUG();
+	}
+
+	mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
+	     lockres->l_name, new_level, lockres->l_blocking);
+
+	lockres->l_action = OCFS2_AST_DOWNCONVERT;
+	lockres->l_requested = new_level;
+	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+}
+
+static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
+				  struct ocfs2_lock_res *lockres,
+				  int new_level,
+				  int lvb)
+{
+	int ret, dlm_flags = LKM_CONVERT;
+	enum dlm_status status;
+
+	mlog_entry_void();
+
+	if (lvb)
+		dlm_flags |= LKM_VALBLK;
+
+	status = dlmlock(osb->dlm,
+			 new_level,
+			 &lockres->l_lksb,
+			 dlm_flags,
+			 lockres->l_name,
+			 lockres->l_ops->ast,
+			 lockres,
+			 lockres->l_ops->bast);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmlock", status, lockres);
+		ret = -EINVAL;
+		ocfs2_recover_from_dlm_error(lockres, 1);
+		goto bail;
+	}
+
+	ret = 0;
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+/* returns 1 when the caller should unlock and call dlmunlock */
+static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
+				        struct ocfs2_lock_res *lockres)
+{
+	assert_spin_locked(&lockres->l_lock);
+
+	mlog_entry_void();
+	mlog(0, "lock %s\n", lockres->l_name);
+
+	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
+		/* If we're already trying to cancel a lock conversion
+		 * then just drop the spinlock and allow the caller to
+		 * requeue this lock. */
+
+		mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
+		return 0;
+	}
+
+	/* were we in a convert when we got the bast fire? */
+	BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
+	       lockres->l_action != OCFS2_AST_DOWNCONVERT);
+	/* set things up for the unlockast to know to just
+	 * clear out the ast_action and unset busy, etc. */
+	lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;
+
+	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
+			"lock %s, invalid flags: 0x%lx\n",
+			lockres->l_name, lockres->l_flags);
+
+	return 1;
+}
+
+static int ocfs2_cancel_convert(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres)
+{
+	int ret;
+	enum dlm_status status;
+
+	mlog_entry_void();
+	mlog(0, "lock %s\n", lockres->l_name);
+
+	ret = 0;
+	status = dlmunlock(osb->dlm,
+			   &lockres->l_lksb,
+			   LKM_CANCEL,
+			   lockres->l_ops->unlock_ast,
+			   lockres);
+	if (status != DLM_NORMAL) {
+		ocfs2_log_dlm_error("dlmunlock", status, lockres);
+		ret = -EINVAL;
+		ocfs2_recover_from_dlm_error(lockres, 0);
+	}
+
+	mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
+						  struct ocfs2_lock_res *lockres,
+						  int new_level)
+{
+	int ret;
+
+	mlog_entry_void();
+
+	BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);
+
+	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
+		ret = 0;
+		mlog(0, "lockres %s currently being refreshed -- backing "
+		     "off!\n", lockres->l_name);
+	} else if (new_level == LKM_PRMODE)
+		ret = !lockres->l_ex_holders &&
+			ocfs2_inode_fully_checkpointed(inode);
+	else /* Must be NLMODE we're converting to. */
+		ret = !lockres->l_ro_holders && !lockres->l_ex_holders &&
+			ocfs2_inode_fully_checkpointed(inode);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+static int ocfs2_do_unblock_meta(struct inode *inode,
+				 int *requeue)
+{
+	int new_level;
+	int set_lvb = 0;
+	int ret = 0;
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
+	unsigned long flags;
+
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry_void();
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+
+	mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level,
+	     lockres->l_blocking);
+
+	BUG_ON(lockres->l_level != LKM_EXMODE &&
+	       lockres->l_level != LKM_PRMODE);
+
+	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		*requeue = 1;
+		ret = ocfs2_prepare_cancel_convert(osb, lockres);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		if (ret) {
+			ret = ocfs2_cancel_convert(osb, lockres);
+			if (ret < 0)
+				mlog_errno(ret);
+		}
+		goto leave;
+	}
+
+	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
+
+	mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n",
+	     lockres->l_level, lockres->l_blocking, new_level);
+
+	if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) {
+		if (lockres->l_level == LKM_EXMODE)
+			set_lvb = 1;
+
+		/* If the lock hasn't been refreshed yet (rare), then
+		 * our in-memory inode values are old and we skip
+		 * stuffing the lvb. There's no need to actually clear
+		 * out the lvb here as its value is still valid. */
+		if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
+			if (set_lvb)
+				__ocfs2_stuff_meta_lvb(inode);
+		} else
+			mlog(0, "lockres %s: downconverting stale lock!\n",
+			     lockres->l_name);
+
+		mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, "
+		     "l_blocking=%d, new_level=%d\n",
+		     lockres->l_level, lockres->l_blocking, new_level);
+
+		ocfs2_prepare_downconvert(lockres, new_level);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
+		goto leave;
+	}
+	if (!ocfs2_inode_fully_checkpointed(inode))
+		ocfs2_start_checkpoint(osb);
+
+	*requeue = 1;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	ret = 0;
+leave:
+	mlog_exit(ret);
+	return ret;
+}
+
+/* Downconvert a lockres that is flagged OCFS2_LOCK_BLOCKED. *requeue is set
+ * to 1 when the attempt has to be retried later and to 0 once a downconvert
+ * is issued; @worker, if non-NULL, is called first with l_lock dropped. */
+static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
+				      struct ocfs2_lock_res *lockres,
+				      int *requeue,
+				      ocfs2_convert_worker_t *worker)
+{
+	unsigned long flags;
+	int blocking;
+	int new_level;
+	int ret = 0;
+
+	mlog_entry_void();
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+
+	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
+
+recheck:
+	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+		*requeue = 1;
+		ret = ocfs2_prepare_cancel_convert(osb, lockres);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		if (ret) {
+			ret = ocfs2_cancel_convert(osb, lockres);
+			if (ret < 0)
+				mlog_errno(ret);
+		}
+		goto leave;
+	}
+
+	/* Blocking an EX request: requeue while we have *any* holders. */
+	if ((lockres->l_blocking == LKM_EXMODE)
+	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		*requeue = 1;
+		ret = 0;
+		goto leave;
+	}
+
+	/* Blocking a PR request: requeue only while we have EX holders. */
+	if (lockres->l_blocking == LKM_PRMODE &&
+	    lockres->l_ex_holders) {
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		*requeue = 1;
+		ret = 0;
+		goto leave;
+	}
+
+	/* If we get here, then we know that there are no more
+	 * incompatible holders (and anyone asking for an incompatible
+	 * lock is blocked). We can now downconvert the lock */
+	if (!worker)
+		goto downconvert;
+
+	/* Some lockres types want to do a bit of work before
+	 * downconverting a lock. Allow that here. The worker function
+	 * may sleep, so we save off a copy of what we're blocking as
+	 * it may change while we're not holding the spin lock. */
+	blocking = lockres->l_blocking;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	worker(lockres, blocking);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (blocking != lockres->l_blocking) {
+		/* l_blocking changed underneath us: can't drop it just yet. */
+		goto recheck;
+	}
+
+downconvert:
+	*requeue = 0;
+	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
+
+	ocfs2_prepare_downconvert(lockres, new_level);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+	ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0);
+leave:
+	mlog_exit(ret);
+	return ret;
+}
+
+/* Convert worker for data locks: flush the inode's mapping before a
+ * downconvert and, if an EX request is being blocked, drop its pages. */
+static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
+				      int blocking)
+{
+	struct inode *inode;
+	struct address_space *mapping;
+
+	mlog_entry_void();
+
+	inode = ocfs2_lock_res_inode(lockres);
+	mapping = inode->i_mapping;
+
+	if (filemap_fdatawrite(mapping)) {
+		mlog(ML_ERROR, "Could not sync inode %"MLFu64" for "
+		     "downconvert!\n", OCFS2_I(inode)->ip_blkno);
+	}
+	sync_mapping_buffers(mapping);
+	if (blocking == LKM_EXMODE) {
+		truncate_inode_pages(mapping, 0);
+		unmap_mapping_range(mapping, 0, 0, 0);
+	} else {
+		/* Only wait on the I/O here: the EX branch gets that from
+		 * truncate_inode_pages, and for anything < EXMODE we want
+		 * to keep the pages around rather than truncate them. */
+		filemap_fdatawait(mapping);
+	}
+
+	mlog_exit_void();
+}
+
+/* Unblock an inode data lockres: hand it to the generic unblock path with
+ * ocfs2_data_convert_worker as the convert worker, tracing the inode block
+ * number before the call and the resulting *requeue value after it. */
+int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
+		       int *requeue)
+{
+	int ret;
+	struct inode *inode;
+
+	mlog_entry_void();
+
+	inode = ocfs2_lock_res_inode(lockres);
+
+	mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	/* A negative return is logged here and still passed to the caller. */
+	ret = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb), lockres,
+					 requeue, ocfs2_data_convert_worker);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	mlog(0, "inode %"MLFu64", requeue = %d\n",
+	     OCFS2_I(inode)->ip_blkno, *requeue);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+/* Unblock a lockres that ocfs2_lock_res_inode() maps to an inode: resolve
+ * that inode's ocfs2_super and run the generic unblock path with no convert
+ * worker. Returns the generic path's status; negative values are logged. */
+static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
+				    int *requeue)
+{
+	int ret;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	mlog(0, "Unblock lockres %s\n", lockres->l_name);
+
+	osb = OCFS2_SB(ocfs2_lock_res_inode(lockres)->i_sb);
+
+	ret = ocfs2_generic_unblock_lock(osb, lockres, requeue, NULL);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+
+/* Unblock an inode metadata lockres by way of ocfs2_do_unblock_meta(). */
+int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
+		       int *requeue)
+{
+	int ret;
+	struct inode *inode;
+
+	mlog_entry_void();
+
+	inode = ocfs2_lock_res_inode(lockres);
+	mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	ret = ocfs2_do_unblock_meta(inode, requeue);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	mlog(0, "inode %"MLFu64", requeue = %d\n",
+	     OCFS2_I(inode)->ip_blkno, *requeue);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+/* Generic unblock function for any lockres whose private data is an
+ * ocfs2_super pointer. The super block is fetched with
+ * ocfs2_lock_res_super(); the status of the generic unblock path is
+ * returned, with negative values logged. */
+static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
+				  int *requeue)
+{
+	int ret;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	mlog(0, "Unblock lockres %s\n", lockres->l_name);
+
+	osb = ocfs2_lock_res_super(lockres);
+
+	/* NULL worker: no extra work is requested before the downconvert. */
+	ret = ocfs2_generic_unblock_lock(osb, lockres, requeue, NULL);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+/* Vote-thread entry point: run lockres->l_ops->unblock() on a blocked lockres,
+ * then either requeue it or clear OCFS2_LOCK_QUEUED (always if FREEING). */
+void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres)
+{
+	int status;
+	int requeue = 0;
+	unsigned long flags;
+
+	/* Our reference to the lockres in this function can be considered
+	 * valid until we remove the OCFS2_LOCK_QUEUED flag. */
+
+	mlog_entry_void();
+
+	BUG_ON(!lockres);
+	BUG_ON(!lockres->l_ops);
+	BUG_ON(!lockres->l_ops->unblock);
+
+	mlog(0, "lockres %s blocked.\n", lockres->l_name);
+
+	/* Detect whether a lock has been marked as going away while the
+	 * vote thread was processing other things. A lock can still be
+	 * marked with OCFS2_LOCK_FREEING after this check, but short
+	 * circuiting here will still save us some performance. */
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (lockres->l_flags & OCFS2_LOCK_FREEING)
+		goto unqueue;
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	status = lockres->l_ops->unblock(lockres, &requeue);
+	if (status < 0)
+		mlog_errno(status);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+unqueue:
+	if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) {
+		lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
+	} else
+		ocfs2_schedule_blocked_lock(osb, lockres);
+
+	mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
+	     requeue ? "yes" : "no");
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	mlog_exit_void();
+}
+
+/* Queue lockres on osb->blocked_lock_list; the caller must hold l_lock. */
+static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
+					struct ocfs2_lock_res *lockres)
+{
+	mlog_entry_void();
+
+	assert_spin_locked(&lockres->l_lock);
+
+	if (lockres->l_flags & OCFS2_LOCK_FREEING) {
+		/* Never queue a lock that is on its way to destruction -
+		 * any nodes wanting the resource will get it soon. */
+		mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
+		     lockres->l_name, lockres->l_flags);
+		goto out;
+	}
+
+	lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
+
+	spin_lock(&osb->vote_task_lock);
+	if (list_empty(&lockres->l_blocked_list)) {
+		list_add_tail(&lockres->l_blocked_list,
+			      &osb->blocked_lock_list);
+		osb->blocked_lock_count++;
+	}
+	spin_unlock(&osb->vote_task_lock);
+out:
+	mlog_exit_void();
+}
+
+/* Debug aid for suspect LVBs: logs every ocfs2_meta_lvb field at 'level'. */
+void ocfs2_dump_meta_lvb_info(u64 level,
+			      const char *function,
+			      unsigned int line,
+			      struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+	/* Fields are stored big-endian, so each is converted for printing. */
+	mlog(level, "LVB information for %s (called from %s:%u):\n",
+	     lockres->l_name, function, line);
+	mlog(level, "version: %u, clusters: %u\n",
+	     be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters));
+	mlog(level, "size: %"MLFu64", uid %u, gid %u, mode 0x%x\n",
+	     be64_to_cpu(lvb->lvb_isize), be32_to_cpu(lvb->lvb_iuid),
+	     be32_to_cpu(lvb->lvb_igid), be16_to_cpu(lvb->lvb_imode));
+	mlog(level, "nlink %u, atime_packed 0x%"MLFx64", "
+	     "ctime_packed 0x%"MLFx64", mtime_packed 0x%"MLFx64"\n",
+	     be16_to_cpu(lvb->lvb_inlink), be64_to_cpu(lvb->lvb_iatime_packed),
+	     be64_to_cpu(lvb->lvb_ictime_packed),
+	     be64_to_cpu(lvb->lvb_imtime_packed));
+}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
new file mode 100644
index 0000000..8f2d1db
--- /dev/null
+++ b/fs/ocfs2/dlmglue.h
@@ -0,0 +1,111 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmglue.h
+ *
+ * Cluster lock (dlmglue) prototypes and the inode metadata LVB layout
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+
+#ifndef DLMGLUE_H
+#define DLMGLUE_H
+
+#define OCFS2_LVB_VERSION 2
+/* Inode metadata as carried in a lock value block; fields are big-endian. */
+struct ocfs2_meta_lvb {
+	__be32       lvb_version;
+	__be32       lvb_iclusters;
+	__be32       lvb_iuid;
+	__be32       lvb_igid;
+	__be64       lvb_iatime_packed;
+	__be64       lvb_ictime_packed;
+	__be64       lvb_imtime_packed;
+	__be64       lvb_isize;
+	__be16       lvb_imode;
+	__be16       lvb_inlink;
+	__be32       lvb_reserved[3];
+};
+
+/* Bit flags for the 'arg_flags' argument of the *_lock_full() calls below. */
+/* Don't wait on recovery. */
+#define OCFS2_META_LOCK_RECOVERY	(0x01)
+/* Instruct the dlm not to queue ourselves on the other node. */
+#define OCFS2_META_LOCK_NOQUEUE		(0x02)
+/* Don't block waiting for the vote thread; return -EAGAIN instead. */
+#define OCFS2_LOCK_NONBLOCK		(0x04)
+
+int ocfs2_dlm_init(struct ocfs2_super *osb);
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb);
+void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
+void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
+			       enum ocfs2_lock_type type,
+			       struct inode *inode);
+void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
+int ocfs2_create_new_inode_locks(struct inode *inode);
+int ocfs2_drop_inode_locks(struct inode *inode);
+int ocfs2_data_lock_full(struct inode *inode,
+			 int write,
+			 int arg_flags);
+#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
+int ocfs2_data_lock_with_page(struct inode *inode,
+			      int write,
+			      struct page *page);
+void ocfs2_data_unlock(struct inode *inode,
+		       int write);
+int ocfs2_rw_lock(struct inode *inode, int write);
+void ocfs2_rw_unlock(struct inode *inode, int write);
+int ocfs2_meta_lock_full(struct inode *inode,
+			 struct ocfs2_journal_handle *handle,
+			 struct buffer_head **ret_bh,
+			 int ex,
+			 int arg_flags);
+int ocfs2_meta_lock_with_page(struct inode *inode,
+			      struct ocfs2_journal_handle *handle,
+			      struct buffer_head **ret_bh,
+			      int ex,
+			      struct page *page);
+/* 99% of the time we don't want to supply any additional flags --
+ * those are for very specific cases only. */
+#define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_full(i, h, b, e, 0)
+void ocfs2_meta_unlock(struct inode *inode,
+		       int ex);
+int ocfs2_super_lock(struct ocfs2_super *osb,
+		     int ex);
+void ocfs2_super_unlock(struct ocfs2_super *osb,
+			int ex);
+int ocfs2_rename_lock(struct ocfs2_super *osb);
+void ocfs2_rename_unlock(struct ocfs2_super *osb);
+void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
+
+/* for the vote thread */
+void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres);
+
+struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
+void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
+
+/* LVB debugging aid; mlog_meta_lvb() passes the caller's function and line. */
+void ocfs2_dump_meta_lvb_info(u64 level,
+			      const char *function,
+			      unsigned int line,
+			      struct ocfs2_lock_res *lockres);
+#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
+
+#endif	/* DLMGLUE_H */
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
new file mode 100644
index 0000000..f226b22
--- /dev/null
+++ b/fs/ocfs2/endian.h
@@ -0,0 +1,45 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_ENDIAN_H
+#define OCFS2_ENDIAN_H
+/* In-place add/and helpers for values kept in a fixed byte order. */
+static inline void le16_add_cpu(__le16 *var, u16 val)
+{
+	*var = cpu_to_le16(le16_to_cpu(*var) + val);
+}
+
+static inline void le32_add_cpu(__le32 *var, u32 val)
+{
+	*var = cpu_to_le32(le32_to_cpu(*var) + val);
+}
+
+static inline void le32_and_cpu(__le32 *var, u32 val)
+{
+	*var = cpu_to_le32(le32_to_cpu(*var) & val);
+}
+
+static inline void be32_add_cpu(__be32 *var, u32 val)
+{
+	*var = cpu_to_be32(be32_to_cpu(*var) + val);
+}
+
+#endif /* OCFS2_ENDIAN_H */
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
new file mode 100644
index 0000000..5810160
--- /dev/null
+++ b/fs/ocfs2/export.c
@@ -0,0 +1,248 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * export.c
+ *
+ * Functions to facilitate NFS exporting
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+
+#define MLOG_MASK_PREFIX ML_EXPORT
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "dir.h"
+#include "dlmglue.h"
+#include "export.h"
+#include "inode.h"
+
+#include "buffer_head_io.h"
+
+struct ocfs2_inode_handle
+{
+	u64 ih_blkno;		/* inode block number; 0 is rejected as stale */
+	u32 ih_generation;	/* must equal inode->i_generation */
+};
+
+/* export_operations.get_dentry: turn a decoded ocfs2_inode_handle into an
+ * anonymous dentry. Fails with -ESTALE on a zero blkno or generation
+ * mismatch, with the ocfs2_iget() error, or with -ENOMEM. */
+static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp)
+{
+	struct ocfs2_inode_handle *handle = vobjp;
+	struct inode *inode;
+	struct dentry *result = ERR_PTR(-ESTALE);
+
+	mlog_entry("(0x%p, 0x%p)\n", sb, handle);
+
+	if (handle->ih_blkno == 0)
+		goto bail_errno;
+
+	inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno);
+	if (IS_ERR(inode)) {
+		result = (void *)inode;
+		goto bail_errno;
+	}
+
+	if (handle->ih_generation != inode->i_generation) {
+		iput(inode);
+		goto bail_errno;
+	}
+
+	result = d_alloc_anon(inode);
+	if (result)
+		goto bail;
+	iput(inode);
+	result = ERR_PTR(-ENOMEM);
+bail_errno:
+	mlog_errno(PTR_ERR(result));
+bail:
+	/* Every path leaves through here so mlog_entry() is always paired. */
+	mlog_exit_ptr(result);
+	return result;
+}
+
+/* export_operations.get_parent: look up ".." in the child directory and
+ * return an anonymous dentry for that inode, or an ERR_PTR() on failure. */
+static struct dentry *ocfs2_get_parent(struct dentry *child)
+{
+	int status;
+	u64 blkno;
+	struct dentry *parent;
+	struct inode *inode;
+	struct inode *dir = child->d_inode;
+	struct buffer_head *dirent_bh = NULL;
+	struct ocfs2_dir_entry *dirent;
+
+	mlog_entry("(0x%p, '%.*s')\n", child,
+		   child->d_name.len, child->d_name.name);
+
+	mlog(0, "find parent of directory %"MLFu64"\n",
+	     OCFS2_I(dir)->ip_blkno);
+
+	status = ocfs2_meta_lock(dir, NULL, NULL, 0);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		parent = ERR_PTR(status);
+		goto bail;
+	}
+
+	status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh,
+					  &dirent);
+	if (status < 0) {
+		parent = ERR_PTR(-ENOENT);
+		goto bail_unlock;
+	}
+
+	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
+	if (IS_ERR(inode)) {
+		mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
+		parent = ERR_PTR(-EACCES);
+		goto bail_unlock;
+	}
+
+	parent = d_alloc_anon(inode);
+	if (!parent) {
+		iput(inode);
+		parent = ERR_PTR(-ENOMEM);
+	}
+
+bail_unlock:
+	ocfs2_meta_unlock(dir, 0);
+	if (dirent_bh)
+		brelse(dirent_bh);
+
+bail:
+	mlog_exit_ptr(parent);
+	return parent;
+}
+
+/* export_operations.encode_fh: store (blkno, generation) of the inode, and of
+ * its parent for a connectable non-directory, as little-endian 32-bit words.
+ * Returns fh type 1 or 2 and sets *max_len; 255 if the buffer is too small. */
+static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
+			   int connectable)
+{
+	struct inode *inode = dentry->d_inode;
+	/* Words are stored little-endian; tell sparse so explicitly. */
+	__le32 *le_fh = (__force __le32 *)fh;
+	int len = *max_len;
+	int type = 1;
+	u64 blkno;
+	u32 generation;
+
+	mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
+		   dentry->d_name.len, dentry->d_name.name,
+		   fh, len, connectable);
+
+	if (len < 3 || (connectable && len < 6)) {
+		mlog(ML_ERROR, "fh buffer is too small for encoding\n");
+		type = 255;
+		goto bail;
+	}
+
+	blkno = OCFS2_I(inode)->ip_blkno;
+	generation = inode->i_generation;
+
+	mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n",
+	     blkno, generation);
+
+	len = 3;
+	le_fh[0] = cpu_to_le32((u32)(blkno >> 32));
+	le_fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
+	le_fh[2] = cpu_to_le32(generation);
+
+	if (connectable && !S_ISDIR(inode->i_mode)) {
+		struct inode *parent;
+
+		spin_lock(&dentry->d_lock);
+		parent = dentry->d_parent->d_inode;
+		blkno = OCFS2_I(parent)->ip_blkno;
+		generation = parent->i_generation;
+		le_fh[3] = cpu_to_le32((u32)(blkno >> 32));
+		le_fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
+		le_fh[5] = cpu_to_le32(generation);
+		spin_unlock(&dentry->d_lock);
+
+		len = 6;
+		type = 2;
+		mlog(0, "Encoding parent: blkno: %"MLFu64", generation: %u\n",
+		     blkno, generation);
+	}
+
+	*max_len = len;
+bail:
+	mlog_exit(type);
+	return type;
+}
+
+/* export_operations.decode_fh: unpack the words written by ocfs2_encode_fh()
+ * and resolve them via find_exported_dentry(); NULL if fh is short/unknown. */
+static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh,
+				      int fh_len, int fileid_type,
+				      int (*acceptable)(void *context,
+							struct dentry *de),
+				      void *context)
+{
+	/* Zeroed so a type-1 fh never hands stack garbage on as a parent. */
+	struct ocfs2_inode_handle handle, parent = { 0, 0 };
+	__le32 *le_fh = (__force __le32 *)fh;
+	struct dentry *ret = NULL;
+
+	mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n",
+		   sb, fh, fh_len, fileid_type, acceptable, context);
+
+	if (fh_len < 3 || fileid_type > 2)
+		goto bail;
+
+	if (fileid_type == 2) {
+		if (fh_len < 6)
+			goto bail;
+		parent.ih_blkno = (u64)le32_to_cpu(le_fh[3]) << 32;
+		parent.ih_blkno |= (u64)le32_to_cpu(le_fh[4]);
+		parent.ih_generation = le32_to_cpu(le_fh[5]);
+		mlog(0, "Decoding parent: blkno: %"MLFu64", generation: %u\n",
+		     parent.ih_blkno, parent.ih_generation);
+	}
+
+	handle.ih_blkno = (u64)le32_to_cpu(le_fh[0]) << 32;
+	handle.ih_blkno |= (u64)le32_to_cpu(le_fh[1]);
+	handle.ih_generation = le32_to_cpu(le_fh[2]);
+	mlog(0, "Decoding fh: blkno: %"MLFu64", generation: %u\n",
+	     handle.ih_blkno, handle.ih_generation);
+
+	ret = ocfs2_export_ops.find_exported_dentry(sb, &handle, &parent,
+						    acceptable, context);
+bail:
+	mlog_exit_ptr(ret);
+	return ret;
+}
+
+struct export_operations ocfs2_export_ops = {
+	.decode_fh	= ocfs2_decode_fh,
+	.encode_fh	= ocfs2_encode_fh,
+	/* NOTE(review): decode_fh relies on find_exported_dentry, unset here */
+	.get_parent	= ocfs2_get_parent,
+	.get_dentry	= ocfs2_get_dentry,
+};
diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h
new file mode 100644
index 0000000..5b77ee7
--- /dev/null
+++ b/fs/ocfs2/export.h
@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * export.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_EXPORT_H
+#define OCFS2_EXPORT_H
+
+extern struct export_operations ocfs2_export_ops;
+
+#endif /* OCFS2_EXPORT_H */
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
new file mode 100644
index 0000000..f2fb40c
--- /dev/null
+++ b/fs/ocfs2/extent_map.c
@@ -0,0 +1,994 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * extent_map.c
+ *
+ * In-memory extent map for OCFS2.  Man, this code was prettier in
+ * the library.
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+
+#define MLOG_MASK_PREFIX ML_EXTENT_MAP
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "extent_map.h"
+#include "inode.h"
+#include "super.h"
+
+#include "buffer_head_io.h"
+
+
+/*
+ * SUCK SUCK SUCK
+ * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
+ */
+
+struct ocfs2_extent_map_entry {
+	struct rb_node e_node;		/* node in em->em_extents */
+	int e_tree_depth;		/* el->l_tree_depth it came from */
+	struct ocfs2_extent_rec e_rec;	/* record, fields little-endian */
+};
+
+struct ocfs2_em_insert_context {
+	int need_left;		/* old_ent starts before the new record */
+	int need_right;		/* old_ent ends after the new record */
+	struct ocfs2_extent_map_entry *new_ent;		/* entry to insert */
+	struct ocfs2_extent_map_entry *old_ent;		/* erased old entry */
+	struct ocfs2_extent_map_entry *left_ent;	/* left remainder */
+	struct ocfs2_extent_map_entry *right_ent;	/* right remainder */
+};
+
+static kmem_cache_t *ocfs2_em_ent_cachep = NULL;
+
+
+static struct ocfs2_extent_map_entry *
+ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
+			u32 cpos, u32 clusters,
+			struct rb_node ***ret_p,
+			struct rb_node **ret_parent);
+static int ocfs2_extent_map_insert(struct inode *inode,
+				   struct ocfs2_extent_rec *rec,
+				   int tree_depth);
+static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
+					 struct ocfs2_extent_map_entry *ent);
+static int ocfs2_extent_map_find_leaf(struct inode *inode,
+				      u32 cpos, u32 clusters,
+				      struct ocfs2_extent_list *el);
+static int ocfs2_extent_map_lookup_read(struct inode *inode,
+					u32 cpos, u32 clusters,
+					struct ocfs2_extent_map_entry **ret_ent);
+static int ocfs2_extent_map_try_insert(struct inode *inode,
+				       struct ocfs2_extent_rec *rec,
+				       int tree_depth,
+				       struct ocfs2_em_insert_context *ctxt);
+
+/* Returns 1 only when [cpos, cpos + clusters) lies entirely inside rec, i.e.
+ * rec starts at or before cpos and rec's end (e_cpos + e_clusters) is at or
+ * past the requested end; returns 0 otherwise. */
+static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
+					      u32 cpos, u32 clusters)
+{
+	u32 rec_start = le32_to_cpu(rec->e_cpos);
+	u32 rec_end = rec_start + le32_to_cpu(rec->e_clusters);
+
+	if (rec_start > cpos || cpos + clusters > rec_end)
+		return 0;
+	return 1;
+}
+
+
+/*
+ * Search the extent map for an entry overlapping [cpos, cpos + clusters).
+ * An entry that only partially covers the range still counts as a hit;
+ * callers enforce whatever boundary conditions they need.
+ *
+ * ip_lock must be held by the caller.  The hit need not be a
+ * tree_depth 0 entry, so an unlocked lookup could race with inserts.
+ *
+ * When ret_p/ret_parent are non-NULL they receive the rb-tree link and
+ * parent where the walk stopped, letting an insert reuse the search.
+ * Plain lookups pass NULL for both.
+ */
+static struct ocfs2_extent_map_entry *
+ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
+			u32 cpos, u32 clusters,
+			struct rb_node ***ret_p,
+			struct rb_node **ret_parent)
+{
+	struct rb_node **link = &em->em_extents.rb_node;
+	struct rb_node *up = NULL;
+	struct ocfs2_extent_map_entry *hit = NULL;
+	u32 start, end;
+
+	while (*link && !hit) {
+		up = *link;
+		hit = rb_entry(up, struct ocfs2_extent_map_entry, e_node);
+		start = le32_to_cpu(hit->e_rec.e_cpos);
+		end = start + le32_to_cpu(hit->e_rec.e_clusters);
+
+		if ((cpos + clusters) <= start) {
+			link = &up->rb_left;
+			hit = NULL;
+		} else if (cpos >= end) {
+			link = &up->rb_right;
+			hit = NULL;
+		}
+	}
+
+	if (ret_p)
+		*ret_p = link;
+	if (ret_parent)
+		*ret_parent = up;
+	return hit;
+}
+
+/*
+ * Find the leaf containing the interval we want.  While we're on our
+ * way down the tree, fill in every record we see at any depth, because
+ * we might want it later.  Returns 0, -EBADR/-ESRCH when the records
+ * don't fit the request, -EIO on a bad extent block, or a callee error.
+ *
+ * This runs without ip_lock because it sleeps while reading.  If someone
+ * is also filling the extent list at the same time, we might restart.
+ */
+static int ocfs2_extent_map_find_leaf(struct inode *inode,
+				      u32 cpos, u32 clusters,
+				      struct ocfs2_extent_list *el)
+{
+	int i, ret;
+	struct buffer_head *eb_bh = NULL;
+	u64 blkno;
+	u32 rec_end;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec;
+
+	/*
+	 * The bh data containing the el cannot change here, because
+	 * we hold alloc_sem.  So we can do this without other
+	 * locks.
+	 */
+	while (el->l_tree_depth)
+	{
+		blkno = 0;
+		for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+			rec = &el->l_recs[i];
+			rec_end = (le32_to_cpu(rec->e_cpos) +
+				   le32_to_cpu(rec->e_clusters));
+
+			ret = -EBADR;
+			if (rec_end > OCFS2_I(inode)->ip_clusters) {
+				mlog_errno(ret);
+				goto out_free;
+			}
+			/* Records wholly outside the range are cached, then skipped. */
+			if (rec_end <= cpos) {
+				ret = ocfs2_extent_map_insert(inode, rec,
+						le16_to_cpu(el->l_tree_depth));
+				if (ret && (ret != -EEXIST)) {
+					mlog_errno(ret);
+					goto out_free;
+				}
+				continue;
+			}
+			if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
+				ret = ocfs2_extent_map_insert(inode, rec,
+						le16_to_cpu(el->l_tree_depth));
+				if (ret && (ret != -EEXIST)) {
+					mlog_errno(ret);
+					goto out_free;
+				}
+				continue;
+			}
+
+			/*
+			 * We've found a record that matches our
+			 * interval.  We don't insert it because we're
+			 * about to traverse it.
+			 */
+
+			/* Check to see if we're straddling */
+			ret = -ESRCH;
+			if (!ocfs2_extent_rec_contains_clusters(rec,
+							        cpos,
+								clusters)) {
+				mlog_errno(ret);
+				goto out_free;
+			}
+
+			/*
+			 * If we've already found a record, the el has
+			 * two records covering the same interval.
+			 * EEEK!
+			 */
+			ret = -EBADR;
+			if (blkno) {
+				mlog_errno(ret);
+				goto out_free;
+			}
+
+			blkno = le64_to_cpu(rec->e_blkno);
+		}
+
+		/*
+		 * We don't support holes, and we're still up
+		 * in the branches, so we'd better have found someone
+		 */
+		ret = -EBADR;
+		if (!blkno) {
+			mlog_errno(ret);
+			goto out_free;
+		}
+
+		if (eb_bh) {
+			brelse(eb_bh);
+			eb_bh = NULL;
+		}
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				       blkno, &eb_bh, OCFS2_BH_CACHED,
+				       inode);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_free;
+		}
+		eb = (struct ocfs2_extent_block *)eb_bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			ret = -EIO;
+			goto out_free;
+		}
+		el = &eb->h_list;
+	}
+
+	if (el->l_tree_depth)
+		BUG();
+	/* At the leaf: cache every record; even -EEXIST is fatal here. */
+	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		rec = &el->l_recs[i];
+		ret = ocfs2_extent_map_insert(inode, rec,
+					      le16_to_cpu(el->l_tree_depth));
+		if (ret) {
+			mlog_errno(ret);
+			goto out_free;
+		}
+	}
+
+	ret = 0;
+
+out_free:
+	if (eb_bh)
+		brelse(eb_bh);
+
+	return ret;
+}
+
+/*
+ * This lookup actually will read from disk.  It has one invariant:
+ * It will never re-traverse blocks.  This means that all inserts should
+ * be new regions or more granular regions (both allowed by insert).
+ */
+static int ocfs2_extent_map_lookup_read(struct inode *inode,
+					u32 cpos,
+					u32 clusters,
+					struct ocfs2_extent_map_entry **ret_ent)
+{
+	int ret;
+	u64 blkno;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_dinode *di;
+	struct ocfs2_extent_list *el;
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
+	if (ent) {
+		if (!ent->e_tree_depth) {
+			spin_unlock(&OCFS2_I(inode)->ip_lock);
+			*ret_ent = ent;
+			return 0;
+		}
+		blkno = le64_to_cpu(ent->e_rec.e_blkno);
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
+				       OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			brelse(bh);
+			return ret;
+		}
+		eb = (struct ocfs2_extent_block *)bh->b_data;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			brelse(bh);
+			return -EIO;
+		}
+		el = &eb->h_list;
+	} else {
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				       OCFS2_I(inode)->ip_blkno, &bh,
+				       OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			brelse(bh);
+			return ret;
+		}
+		di = (struct ocfs2_dinode *)bh->b_data;
+		if (!OCFS2_IS_VALID_DINODE(di)) {
+			/* Report before brelse(): di points into bh's data. */
+			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
+			brelse(bh);
+			return -EIO;
+		}
+		el = &di->id2.i_list;
+	}
+
+	ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
+	brelse(bh);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/* ocfs2_extent_map_lookup() must be called with ip_lock held. */
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	if (!ent) {
+		ret = -ESRCH;
+		mlog_errno(ret);
+		return ret;
+	}
+	if (ent->e_tree_depth)
+		BUG();  /* FIXME: Make sure this isn't a corruption */
+
+	*ret_ent = ent;
+	return 0;
+}
+
+/*
+ * Link ent into the extent map's rb-tree.  Returns -EEXIST, leaving the
+ * tree untouched, when ent overlaps an entry that is already present.
+ *
+ * Callers must hold ip_lock.  This can insert pieces of the tree,
+ * thus racing lookup if the lock weren't held.
+ */
+static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
+					 struct ocfs2_extent_map_entry *ent)
+{
+	struct rb_node **link, *up;
+	u32 start = le32_to_cpu(ent->e_rec.e_cpos);
+	u32 len = le32_to_cpu(ent->e_rec.e_clusters);
+
+	if (ocfs2_extent_map_lookup(em, start, len, &link, &up))
+		return -EEXIST;
+
+	rb_link_node(&ent->e_node, up, link);
+	rb_insert_color(&ent->e_node, &em->em_extents);
+	return 0;
+}
+
+
+/*
+ * One locked insertion attempt.  On any return code other than -EAGAIN,
+ * the caller frees anything left in the insert_context.
+ */
+static int ocfs2_extent_map_try_insert(struct inode *inode,
+				       struct ocfs2_extent_rec *rec,
+				       int tree_depth,
+				       struct ocfs2_em_insert_context *ctxt)
+{
+	int ret;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *old_ent;
+
+	ctxt->need_left = 0;
+	ctxt->need_right = 0;
+	ctxt->old_ent = NULL;
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
+	if (!ret) {
+		ctxt->new_ent = NULL;	/* linked in; caller must not free it */
+		goto out_unlock;
+	}
+
+	old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
+					  le32_to_cpu(rec->e_clusters), NULL,
+					  NULL);
+
+	if (!old_ent)
+		BUG();	/* the failed insert implies an entry exists */
+
+	ret = -EEXIST;
+	if (old_ent->e_tree_depth < tree_depth)
+		goto out_unlock;
+
+	if (old_ent->e_tree_depth == tree_depth) {
+		if (!memcmp(rec, &old_ent->e_rec,
+			    sizeof(struct ocfs2_extent_rec)))
+			ret = 0;	/* the very same record is already there */
+
+		/* FIXME: Should this be ESRCH/EBADR??? */
+		goto out_unlock;
+	}
+
+	/*
+	 * We do it in this order specifically so that no actual tree
+	 * changes occur until we have all the pieces we need.  We
+	 * don't want malloc failures to leave an inconsistent tree.
+	 * Whenever we drop the lock, another process could be
+	 * inserting.  Also note that, if another process just beat us
+	 * to an insert, we might not need the same pieces we needed
+	 * the first go round.  In the end, the pieces we need will
+	 * be used, and the pieces we don't will be freed.
+	 */
+	ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
+			     le32_to_cpu(old_ent->e_rec.e_cpos));
+	ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
+			       le32_to_cpu(old_ent->e_rec.e_clusters)) >
+			      (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
+	ret = -EAGAIN;	/* returned while a needed piece is still unallocated */
+	if (ctxt->need_left) {
+		if (!ctxt->left_ent)
+			goto out_unlock;
+		*(ctxt->left_ent) = *old_ent;	/* then trim it to end at rec */
+		ctxt->left_ent->e_rec.e_clusters =
+			cpu_to_le32(le32_to_cpu(rec->e_cpos) -
+				    le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
+	}
+	if (ctxt->need_right) {
+		if (!ctxt->right_ent)
+			goto out_unlock;
+		*(ctxt->right_ent) = *old_ent;	/* then start it after rec */
+		ctxt->right_ent->e_rec.e_cpos =
+			cpu_to_le32(le32_to_cpu(rec->e_cpos) +
+				    le32_to_cpu(rec->e_clusters));
+		ctxt->right_ent->e_rec.e_clusters =
+			cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
+				     le32_to_cpu(old_ent->e_rec.e_clusters)) -
+				    le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
+	}
+
+	rb_erase(&old_ent->e_node, &em->em_extents);
+	/* Now that he's erased, set him up for deletion */
+	ctxt->old_ent = old_ent;
+
+	if (ctxt->need_left) {
+		ret = ocfs2_extent_map_insert_entry(em,
+						    ctxt->left_ent);
+		if (ret)
+			goto out_unlock;
+		ctxt->left_ent = NULL;	/* linked in; caller must not free it */
+	}
+
+	if (ctxt->need_right) {
+		ret = ocfs2_extent_map_insert_entry(em,
+						    ctxt->right_ent);
+		if (ret)
+			goto out_unlock;
+		ctxt->right_ent = NULL;	/* linked in; caller must not free it */
+	}
+
+	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
+
+	if (!ret)
+		ctxt->new_ent = NULL;
+
+out_unlock:
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	return ret;
+}
+
+/* Add @rec to the inode's extent map, allocating the entries this needs. */
+static int ocfs2_extent_map_insert(struct inode *inode,
+				   struct ocfs2_extent_rec *rec,
+				   int tree_depth)
+{
+	int ret;
+	struct ocfs2_em_insert_context ctxt = {0, };
+
+	if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
+	    OCFS2_I(inode)->ip_map.em_clusters) {
+		ret = -EBADR;	/* record ends beyond what the map covers */
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/* Zero e_clusters means a truncated tail record.  It better be EOF */
+	if (!rec->e_clusters) {
+		if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
+		    OCFS2_I(inode)->ip_map.em_clusters) {
+			ret = -EBADR;
+			mlog_errno(ret);
+			return ret;
+		}
+
+		/* Ignore the truncated tail */
+		return 0;
+	}
+
+	ret = -ENOMEM;
+	ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
+					GFP_KERNEL);
+	if (!ctxt.new_ent) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ctxt.new_ent->e_rec = *rec;
+	ctxt.new_ent->e_tree_depth = tree_depth;
+
+	do {
+		ret = -ENOMEM;	/* result if either allocation below fails */
+		if (ctxt.need_left && !ctxt.left_ent) {
+			ctxt.left_ent =
+				kmem_cache_alloc(ocfs2_em_ent_cachep,
+						 GFP_KERNEL);
+			if (!ctxt.left_ent)
+				break;
+		}
+		if (ctxt.need_right && !ctxt.right_ent) {
+			ctxt.right_ent =
+				kmem_cache_alloc(ocfs2_em_ent_cachep,
+						 GFP_KERNEL);
+			if (!ctxt.right_ent)
+				break;
+		}
+
+		ret = ocfs2_extent_map_try_insert(inode, rec,
+						  tree_depth, &ctxt);
+	} while (ret == -EAGAIN);	/* try_insert flagged pieces it needs */
+
+	if (ret < 0)
+		mlog_errno(ret);
+
+	if (ctxt.left_ent)	/* entries still in ctxt were not linked in */
+		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
+	if (ctxt.right_ent)
+		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
+	if (ctxt.old_ent)	/* try_insert erased this one from the tree */
+		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
+	if (ctxt.new_ent)
+		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
+
+	return ret;
+}
+
+/*
+ * Append this record to the tail of the extent map.  It must be
+ * tree_depth 0.  The record might be an extension of an existing
+ * record, and as such that needs to be handled.  e.g.:
+ *
+ * Existing record in the extent map:
+ *
+ *	cpos = 10, len = 10
+ * 	|---------|
+ *
+ * New Record:
+ *
+ *	cpos = 10, len = 20
+ * 	|------------------|
+ *
+ * The passed record is the new on-disk record, and new_clusters is how
+ * many clusters were added to the file.  For a contiguous append,
+ * new_clusters has already been added to rec->e_clusters; for an entirely
+ * new extent, rec->e_clusters == new_clusters.  Returns 0 on success or a
+ * negative errno from ocfs2_extent_map_insert().
+ */
+int ocfs2_extent_map_append(struct inode *inode,
+			    struct ocfs2_extent_rec *rec,
+			    u32 new_clusters)
+{
+	int ret;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+	struct ocfs2_extent_rec *old;
+
+	BUG_ON(!new_clusters);
+	BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
+
+	if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
+		/*
+		 * Size changed underneath us on disk.  Drop any
+		 * straddling records and update our idea of
+		 * i_clusters
+		 */
+		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
+		em->em_clusters = OCFS2_I(inode)->ip_clusters;
+	}
+
+	mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
+			 le32_to_cpu(rec->e_clusters)) !=
+			(em->em_clusters + new_clusters),
+			"Inode %"MLFu64":\n"
+			"rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
+			"em->em_clusters = %u + new_clusters = %u = %u\n",
+			OCFS2_I(inode)->ip_blkno,
+			le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
+			le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
+			em->em_clusters, new_clusters,
+			em->em_clusters + new_clusters);
+
+	em->em_clusters += new_clusters;
+
+	ret = -ENOENT;	/* means "insert rec as a new entry" further down */
+	if (le32_to_cpu(rec->e_clusters) > new_clusters) {
+		/* This is a contiguous append */
+		ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
+					      NULL, NULL);
+		if (ent) {
+			old = &ent->e_rec;
+			BUG_ON((le32_to_cpu(rec->e_cpos) +
+				le32_to_cpu(rec->e_clusters)) !=
+				 (le32_to_cpu(old->e_cpos) +
+				  le32_to_cpu(old->e_clusters) +
+				  new_clusters));
+			if (ent->e_tree_depth == 0) {
+				BUG_ON(le32_to_cpu(old->e_cpos) !=
+				       le32_to_cpu(rec->e_cpos));
+				BUG_ON(le64_to_cpu(old->e_blkno) !=
+				       le64_to_cpu(rec->e_blkno));
+				ret = 0;	/* growing the cached leaf is enough */
+			}
+			/*
+			 * Let non-leafs fall through as -ENOENT to
+			 * force insertion of the new leaf.
+			 */
+			le32_add_cpu(&old->e_clusters, new_clusters);
+		}
+	}
+
+	if (ret == -ENOENT)
+		ret = ocfs2_extent_map_insert(inode, rec, 0);
+	if (ret < 0)
+		mlog_errno(ret);
+	return ret;
+}
+
+#if 0
+/* Code here is included but defined out as it completes the extent
+ * map api and may be used in the future. */
+
+/*
+ * Look up the record containing this cluster offset.  This record is
+ * part of the extent map.  Do not free it.  Any changes you make to
+ * it will reflect in the extent map.  So, if your last extent
+ * is (cpos = 10, clusters = 10) and you truncate the file by 5
+ * clusters, you can do:
+ *
+ * ret = ocfs2_extent_map_get_rec(inode, orig_size - 5, &rec, NULL);
+ * rec->e_clusters -= 5;
+ *
+ * The lookup does not read from disk.  If the map isn't filled in for
+ * an entry, you won't find it (-ENOENT, with *rec left NULL).
+ *
+ * Also note that the returned record is valid until alloc_sem is
+ * dropped.  After that, truncate and extend can happen.  Caveat Emptor.
+ */
+int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
+			     struct ocfs2_extent_rec **rec,
+			     int *tree_depth)
+{
+	int ret = -ENOENT;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+
+	*rec = NULL;
+
+	if (cpos >= OCFS2_I(inode)->ip_clusters)
+		return -EINVAL;	/* offset lies beyond the inode's clusters */
+
+	if (cpos >= em->em_clusters) {
+		/*
+		 * Size changed underneath us on disk.  Drop any
+		 * straddling records and update our idea of
+		 * i_clusters
+		 */
+		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
+		em->em_clusters = OCFS2_I(inode)->ip_clusters ;
+	}
+
+	ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
+				      NULL, NULL);
+
+	if (ent) {
+		*rec = &ent->e_rec;
+		if (tree_depth)	/* optional out parameter */
+			*tree_depth = ent->e_tree_depth;
+		ret = 0;
+	}
+
+	return ret;
+}
+
+int ocfs2_extent_map_get_clusters(struct inode *inode,
+				  u32 v_cpos, int count,
+				  u32 *p_cpos, int *ret_count)
+{
+	int ret;
+	u32 coff, ccount;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent = NULL;
+
+	*p_cpos = ccount = 0;
+
+	if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
+		return -EINVAL;
+
+	if ((v_cpos + count) > em->em_clusters) {
+		/*
+		 * Size changed underneath us on disk.  Drop any
+		 * straddling records and update our idea of
+		 * i_clusters
+		 */
+		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
+		em->em_clusters = OCFS2_I(inode)->ip_clusters;
+	}
+
+	/* Find the entry for the requested virtual cluster range */
+	ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
+	if (ret)
+		return ret;
+
+	if (ent) {
+		/* We should never find ourselves straddling an interval */
+		if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
+							v_cpos,
+							count))
+			return -ESRCH;
+
+		coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
+		*p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+				le64_to_cpu(ent->e_rec.e_blkno)) +
+			  coff;
+
+		if (ret_count)	/* clusters left in the record from v_cpos on */
+			*ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
+
+		return 0;
+	}
+
+	/* The lookup succeeded but produced no entry */
+	return -ENOENT;
+}
+
+#endif  /*  0  */
+
+int ocfs2_extent_map_get_blocks(struct inode *inode,
+				u64 v_blkno, int count,
+				u64 *p_blkno, int *ret_count)
+{
+	int status;
+	u64 blk_off;
+	u32 first, nr;
+	struct super_block *sb = inode->i_sb;
+	int bpc = ocfs2_clusters_to_blocks(sb, 1);
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_extent_map_entry *ent = NULL;
+	struct ocfs2_extent_rec *rec;
+
+	*p_blkno = 0;
+
+	/* Convert the block range into the cluster range that covers it */
+	first = ocfs2_blocks_to_clusters(sb, v_blkno);
+	nr = ocfs2_blocks_to_clusters(sb, (u64)count + bpc - 1);
+	if ((first + nr) > oi->ip_clusters) {
+		status = -EINVAL;
+		mlog_errno(status);
+		return status;
+	}
+
+	/*
+	 * Size changed underneath us on disk.  Forget any straddling
+	 * record and pick up the inode's current cluster count.
+	 */
+	if ((first + nr) > oi->ip_map.em_clusters) {
+		ocfs2_extent_map_drop(inode, oi->ip_map.em_clusters - 1);
+		oi->ip_map.em_clusters = oi->ip_clusters;
+	}
+
+	status = ocfs2_extent_map_lookup_read(inode, first, nr, &ent);
+	if (status) {
+		mlog_errno(status);
+		return status;
+	}
+
+	/* The lookup succeeded but produced no entry */
+	if (!ent)
+		return -ENOENT;
+
+	rec = &ent->e_rec;
+
+	/* We should never find ourselves straddling an interval */
+	if (!ocfs2_extent_rec_contains_clusters(rec, first, nr)) {
+		status = -ESRCH;
+		mlog_errno(status);
+		return status;
+	}
+
+	/* Whole clusters into the record, plus blocks into the cluster */
+	blk_off = ocfs2_clusters_to_blocks(sb,
+					   first - le32_to_cpu(rec->e_cpos));
+	blk_off += (v_blkno & (u64)(bpc - 1));
+	*p_blkno = le64_to_cpu(rec->e_blkno) + blk_off;
+
+	/* Optionally report how many blocks remain in the record */
+	if (ret_count)
+		*ret_count = ocfs2_clusters_to_blocks(sb,
+				le32_to_cpu(rec->e_clusters)) - blk_off;
+
+	return 0;
+}
+
+int ocfs2_extent_map_init(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	/* A fresh map: empty tree, no clusters accounted for yet */
+	oi->ip_map.em_clusters = 0;
+	oi->ip_map.em_extents = RB_ROOT;
+	return 0;
+}
+
+/* Needs the lock.  Chains every entry at or past new_clusters on *free_head */
+static void __ocfs2_extent_map_drop(struct inode *inode,
+				    u32 new_clusters,
+				    struct rb_node **free_head,
+				    struct ocfs2_extent_map_entry **tail_ent)
+{
+	struct rb_node *node, *next;
+	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	struct ocfs2_extent_map_entry *ent;
+
+	*free_head = NULL;
+
+	ent = NULL;
+	node = rb_last(&em->em_extents);	/* walk from the last node back */
+	while (node)
+	{
+		next = rb_prev(node);	/* fetch before node is erased */
+
+		ent = rb_entry(node, struct ocfs2_extent_map_entry,
+			       e_node);
+		if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
+			break;	/* ent stays set: the tail candidate below */
+
+		rb_erase(&ent->e_node, &em->em_extents);
+
+		node->rb_right = *free_head;	/* rb_right reused as list link */
+		*free_head = node;
+
+		ent = NULL;
+		node = next;
+	}
+
+	/* Do we have an entry straddling new_clusters? */
+	if (tail_ent) {
+		if (ent &&
+		    ((le32_to_cpu(ent->e_rec.e_cpos) +
+		      le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
+			*tail_ent = ent;	/* still linked in the tree */
+		else
+			*tail_ent = NULL;
+	}
+}
+
+static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
+{
+	struct rb_node *cur, *following;
+	struct ocfs2_extent_map_entry *victim;
+
+	/* The erased nodes are chained through their rb_right pointers */
+	for (cur = free_head; cur; cur = following) {
+		/* Grab the link first; the free below invalidates cur */
+		following = cur->rb_right;
+		victim = rb_entry(cur, struct ocfs2_extent_map_entry,
+				  e_node);
+		kmem_cache_free(ocfs2_em_ent_cachep, victim);
+	}
+}
+
+/*
+ * Cache forget: throw away every entry that starts at or beyond
+ * new_clusters, and also the entry that contains new_clusters, if any.
+ *
+ * To clip the last extent by some number of clusters instead, call
+ * ocfs2_extent_map_trunc().
+ * This code does not check or modify ip_clusters.
+ */
+int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_extent_map_entry *straddler;
+	struct rb_node *doomed = NULL;
+
+	spin_lock(&oi->ip_lock);
+
+	__ocfs2_extent_map_drop(inode, new_clusters, &doomed, &straddler);
+
+	/* Push the straddling entry onto the same to-be-freed chain */
+	if (straddler) {
+		rb_erase(&straddler->e_node, &oi->ip_map.em_extents);
+		straddler->e_node.rb_right = doomed;
+		doomed = &straddler->e_node;
+	}
+
+	spin_unlock(&oi->ip_lock);
+
+	/* Free outside the spinlock; an empty chain is a no-op */
+	__ocfs2_extent_map_drop_cleanup(doomed);
+	return 0;
+}
+
+/*
+ * Drop every entry past new_clusters and shorten the entry straddling
+ * new_clusters, if there is one, so that it ends there.  This does not
+ * check or modify ip_clusters.
+ */
+int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_extent_map_entry *tail = NULL;
+	struct rb_node *doomed = NULL;
+	u32 tail_start;
+
+	spin_lock(&oi->ip_lock);
+
+	__ocfs2_extent_map_drop(inode, new_clusters, &doomed, &tail);
+
+	if (tail) {
+		tail_start = le32_to_cpu(tail->e_rec.e_cpos);
+		tail->e_rec.e_clusters = cpu_to_le32(new_clusters - tail_start);
+	}
+
+	oi->ip_map.em_clusters = new_clusters;
+	spin_unlock(&oi->ip_lock);
+
+	__ocfs2_extent_map_drop_cleanup(doomed);
+	return 0;
+}
+
+int __init init_ocfs2_extent_maps(void)
+{
+	size_t ent_size = sizeof(struct ocfs2_extent_map_entry);
+
+	/* Slab cache backing every extent map entry allocation */
+	ocfs2_em_ent_cachep = kmem_cache_create("ocfs2_em_ent", ent_size, 0,
+						SLAB_HWCACHE_ALIGN,
+						NULL, NULL);
+
+	return ocfs2_em_ent_cachep ? 0 : -ENOMEM;
+}
+
+void __exit exit_ocfs2_extent_maps(void)
+{
+	kmem_cache_destroy(ocfs2_em_ent_cachep);	/* made by init_ocfs2_extent_maps() */
+}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
new file mode 100644
index 0000000..fa3745e
--- /dev/null
+++ b/fs/ocfs2/extent_map.h
@@ -0,0 +1,46 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * extent_map.h
+ *
+ * In-memory file extent mappings for OCFS2.
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef _EXTENT_MAP_H
+#define _EXTENT_MAP_H
+
+int init_ocfs2_extent_maps(void);	/* create the map entry slab cache */
+void exit_ocfs2_extent_maps(void);	/* destroy that cache */
+
+/*
+ * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
+ * to be held.  The allocation cannot change at all while the map is
+ * in the process of being updated.
+ */
+int ocfs2_extent_map_init(struct inode *inode);
+int ocfs2_extent_map_append(struct inode *inode,
+			    struct ocfs2_extent_rec *rec,
+			    u32 new_clusters);
+int ocfs2_extent_map_get_blocks(struct inode *inode,
+				u64 v_blkno, int count,
+				u64 *p_blkno, int *ret_count);
+int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);	/* forget */
+int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);	/* clip */
+
+#endif  /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
new file mode 100644
index 0000000..72ae9e3
--- /dev/null
+++ b/fs/ocfs2/file.c
@@ -0,0 +1,1237 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * file.c
+ *
+ * File open, close, extend, truncate
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "aops.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "sysfile.h"
+#include "inode.h"
+#include "journal.h"
+#include "mmap.h"
+#include "suballoc.h"
+#include "super.h"
+
+#include "buffer_head_io.h"
+
+static int ocfs2_sync_inode(struct inode *inode)
+{
+	filemap_fdatawrite(inode->i_mapping);	/* NOTE(review): result ignored */
+	return sync_mapping_buffers(inode->i_mapping);	/* only this is reported */
+}
+
+static int ocfs2_file_open(struct inode *inode, struct file *file)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct dentry *dentry = file->f_dentry;
+	int status = -ENOENT;
+
+	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	spin_lock(&oi->ip_lock);
+
+	/*
+	 * Another node may have wiped this inode from disk.  If the
+	 * deleted flag is clear, we stay safe by holding the spin lock
+	 * until the open count has been bumped.
+	 */
+	if (!(oi->ip_flags & OCFS2_INODE_DELETED)) {
+		/* Note on the inode that someone opened it with O_DIRECT */
+		if (file->f_flags & O_DIRECT)
+			oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
+
+		oi->ip_open_count++;
+
+		status = 0;
+	}
+
+	spin_unlock(&oi->ip_lock);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_file_release(struct inode *inode, struct file *file)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
+		   file->f_dentry->d_name.len, file->f_dentry->d_name.name);
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_open_count--;
+	/* Once the open count reaches zero, clear the O_DIRECT marker */
+	if (oi->ip_open_count == 0)
+		oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
+	spin_unlock(&oi->ip_lock);
+
+	mlog_exit(0);
+	return 0;
+}
+
+static int ocfs2_sync_file(struct file *file,
+			   struct dentry *dentry,
+			   int datasync)
+{
+	int err;
+	struct inode *inode = dentry->d_inode;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	/* Write out the inode's pages and buffers first ... */
+	err = ocfs2_sync_inode(inode);
+	/* ... and only if that reported nothing, force a journal commit */
+	if (!err)
+		err = journal_force_commit(osb->journal->j_journal);
+
+	mlog_exit(err);
+
+	/* Any negative result is reported to the caller as -EIO */
+	if (err < 0)
+		return -EIO;
+	return 0;
+}
+
+int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
+			 struct inode *inode,
+			 struct buffer_head *fe_bh,
+			 u64 new_i_size)
+{
+	int status;
+
+	mlog_entry_void();
+	/* Update the in-memory inode: size, block count and timestamps */
+	i_size_write(inode, new_i_size);
+	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
+	inode->i_mtime = CURRENT_TIME;
+	inode->i_ctime = inode->i_mtime;
+
+	/* Then mark the inode dirty under the caller's handle */
+	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Set i_size to new_i_size in a transaction of its own; 0 or -errno. */
+static int ocfs2_simple_size_update(struct inode *inode,
+				    struct buffer_head *di_bh,
+				    u64 new_i_size)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_journal_handle *handle = NULL;
+
+	/* Failure is tested with IS_ERR(), as the other callers here do */
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_set_inode_size(handle, inode, di_bh, new_i_size);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	ocfs2_commit_trans(handle);	/* committed even if the update failed */
+out:
+	return ret;
+}
+
+static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
+				     struct inode *inode,
+				     struct buffer_head *fe_bh,
+				     u64 new_i_size)
+{
+	struct ocfs2_journal_handle *trans;
+	int status;
+
+	mlog_entry_void();
+
+	/* TODO: This needs to actually orphan the inode in this
+	 * transaction. */
+	trans = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(trans)) {
+		/* Only an error pointer came back, so nothing to commit */
+		status = PTR_ERR(trans);
+		mlog_errno(status);
+	} else {
+		/* For now, only the new i_size is recorded */
+		status = ocfs2_set_inode_size(trans, inode, fe_bh, new_i_size);
+		if (status < 0)
+			mlog_errno(status);
+
+		ocfs2_commit_trans(trans);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Shrink @inode to new_i_size; a larger new_i_size fails with -EINVAL. */
+static int ocfs2_truncate_file(struct inode *inode,
+			       struct buffer_head *di_bh,
+			       u64 new_i_size)
+{
+	int status = 0;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_truncate_context *tc = NULL;
+
+	mlog_entry("(inode = %"MLFu64", new_i_size = %"MLFu64"\n",
+		   OCFS2_I(inode)->ip_blkno, new_i_size);
+
+	truncate_inode_pages(inode->i_mapping, new_i_size);
+
+	fe = (struct ocfs2_dinode *) di_bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+		status = -EIO;
+		goto bail;
+	}
+
+	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
+			"Inode %"MLFu64", inode i_size = %lld != di "
+			"i_size = %"MLFu64", i_flags = 0x%x\n",
+			OCFS2_I(inode)->ip_blkno,
+			i_size_read(inode),
+			le64_to_cpu(fe->i_size), le32_to_cpu(fe->i_flags));
+
+	if (new_i_size > le64_to_cpu(fe->i_size)) {
+		mlog(0, "asked to truncate file with size (%"MLFu64") "
+		     "to size (%"MLFu64")!\n",
+		     le64_to_cpu(fe->i_size), new_i_size);
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "inode %"MLFu64", i_size = %"MLFu64", new_i_size = %"MLFu64"\n",
+	     le64_to_cpu(fe->i_blkno), le64_to_cpu(fe->i_size), new_i_size);
+
+	/* lets handle the simple truncate cases before doing any more
+	 * cluster locking. */
+	if (new_i_size == le64_to_cpu(fe->i_size))
+		goto bail;
+
+	if (le32_to_cpu(fe->i_clusters) ==
+	    ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
+		/* fe is the little-endian disk inode: convert before printing */
+		mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
+		     le32_to_cpu(fe->i_clusters));
+		/* No allocation change is required, so lets fast path
+		 * this truncate. */
+		status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
+		if (status < 0)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	/* This forces other nodes to sync and drop their pages */
+	status = ocfs2_data_lock(inode, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_data_unlock(inode, 1);
+
+	/* alright, we're going to need to do a full blown alloc size
+	 * change. Orphan the inode so that recovery can complete the
+	 * truncate if necessary. This does the task of marking
+	 * i_size. */
+	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* TODO: orphan dir cleanup here. */
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * extend allocation only here.
+ * we'll update all the disk stuff, and ip_clusters
+ *
+ * expect stuff to be locked, a transaction started and enough data /
+ * metadata reservations in the contexts.
+ *
+ * Will return -EAGAIN, and a reason if a restart is needed.
+ * If passed in, *reason_ret will always be set, even in error.
+ */
+int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
+			       struct inode *inode,
+			       u32 clusters_to_add,
+			       struct buffer_head *fe_bh,
+			       struct ocfs2_journal_handle *handle,
+			       struct ocfs2_alloc_context *data_ac,
+			       struct ocfs2_alloc_context *meta_ac,
+			       enum ocfs2_alloc_restarted *reason_ret)
+{
+	int status = 0;
+	int free_extents;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	enum ocfs2_alloc_restarted reason = RESTART_NONE;
+	u32 bit_off, num_bits;
+	u64 block;
+
+	BUG_ON(!clusters_to_add);
+
+	free_extents = ocfs2_num_free_extents(osb, inode, fe);
+	if (free_extents < 0) {
+		status = free_extents;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* there are two cases which could cause us to EAGAIN in the
+	 * we-need-more-metadata case:
+	 * 1) we haven't reserved *any*
+	 * 2) we are so fragmented, we've needed to add metadata too
+	 *    many times. */
+	if (!free_extents && !meta_ac) {
+		mlog(0, "we haven't reserved any metadata!\n");
+		status = -EAGAIN;
+		reason = RESTART_META;
+		goto leave;
+	} else if ((!free_extents)
+		   && (ocfs2_alloc_context_bits_left(meta_ac)
+		       < ocfs2_extend_meta_needed(fe))) {
+		mlog(0, "filesystem is really fragmented...\n");
+		status = -EAGAIN;
+		reason = RESTART_META;
+		goto leave;
+	}
+
+	status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
+				      &bit_off, &num_bits);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	BUG_ON(num_bits > clusters_to_add);
+
+	/* reserve our write early -- insert_extent may update the inode */
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	mlog(0, "Allocating %u clusters at block %u for inode %"MLFu64"\n",
+	     num_bits, bit_off, OCFS2_I(inode)->ip_blkno);
+	status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,
+				     num_bits, meta_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	le32_add_cpu(&fe->i_clusters, num_bits);
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	clusters_to_add -= num_bits;
+
+	if (clusters_to_add) {	/* this call claimed fewer than requested */
+		mlog(0, "need to alloc once more, clusters = %u, wanted = "
+		     "%u\n", le32_to_cpu(fe->i_clusters), clusters_to_add);
+		status = -EAGAIN;
+		reason = RESTART_TRANS;
+	}
+
+leave:
+	mlog_exit(status);
+	if (reason_ret)
+		*reason_ret = reason;
+	return status;
+}
+
+/* Add clusters_to_add clusters to @inode, restarting the transaction or
+ * the whole reservation sequence as ocfs2_do_extend_allocation() asks. */
+static int ocfs2_extend_allocation(struct inode *inode,
+				   u32 clusters_to_add)
+{
+	int status = 0;
+	int restart_func = 0;
+	int drop_alloc_sem = 0;
+	int credits, num_free_extents;
+	u32 prev_clusters;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	enum ocfs2_alloc_restarted why;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
+
+	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
+				  OCFS2_BH_CACHED, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	fe = (struct ocfs2_dinode *) bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+		status = -EIO;
+		goto leave;
+	}
+
+restart_all:
+	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
+
+	mlog(0, "extend inode %"MLFu64", i_size = %lld, fe->i_clusters = %u, "
+	     "clusters_to_add = %u\n",
+	     OCFS2_I(inode)->ip_blkno, i_size_read(inode),
+	     le32_to_cpu(fe->i_clusters), clusters_to_add);
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	num_free_extents = ocfs2_num_free_extents(osb, inode, fe);
+	if (num_free_extents < 0) {
+		status = num_free_extents;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (!num_free_extents) {
+		status = ocfs2_reserve_new_metadata(osb,
+						    handle,
+						    fe,
+						    &meta_ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	status = ocfs2_reserve_clusters(osb,
+					handle,
+					clusters_to_add,
+					&data_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	/* blocks people in read/write from reading our allocation
+	 * until we're done changing it. We depend on i_sem to block
+	 * other extend/truncate calls while we're here. Ordering wrt
+	 * start_trans is important here -- always do it before! */
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+	drop_alloc_sem = 1;
+
+	credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
+	handle = ocfs2_start_trans(osb, handle, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+restarted_transaction:
+	/* reserve a write to the file entry early on - that way if we
+	 * run out of credits in the allocation path, we can still
+	 * update i_size. */
+	status = ocfs2_journal_access(handle, inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	prev_clusters = OCFS2_I(inode)->ip_clusters;
+
+	status = ocfs2_do_extend_allocation(osb,
+					    inode,
+					    clusters_to_add,
+					    bh,
+					    handle,
+					    data_ac,
+					    meta_ac,
+					    &why);
+	if ((status < 0) && (status != -EAGAIN)) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	if (why != RESTART_NONE && clusters_to_add) {
+		if (why == RESTART_META) {
+			mlog(0, "restarting function.\n");
+			restart_func = 1;
+		} else {
+			BUG_ON(why != RESTART_TRANS);
+
+			mlog(0, "restarting transaction.\n");
+			/* TODO: This can be more intelligent. */
+			credits = ocfs2_calc_extend_credits(osb->sb,
+							    fe,
+							    clusters_to_add);
+			status = ocfs2_extend_trans(handle, credits);
+			if (status < 0) {
+				/* handle still has to be committed at
+				 * this point. */
+				status = -ENOMEM;
+				mlog_errno(status);
+				goto leave;
+			}
+			goto restarted_transaction;
+		}
+	}
+
+	mlog(0, "fe: i_clusters = %u, i_size=%"MLFu64"\n",
+	     le32_to_cpu(fe->i_clusters), le64_to_cpu(fe->i_size));
+	mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
+	     OCFS2_I(inode)->ip_clusters, i_size_read(inode));
+
+leave:
+	if (drop_alloc_sem) {
+		up_write(&OCFS2_I(inode)->ip_alloc_sem);
+		drop_alloc_sem = 0;
+	}
+	if (handle) {
+		ocfs2_commit_trans(handle);
+		handle = NULL;
+	}
+	if (data_ac) {
+		ocfs2_free_alloc_context(data_ac);
+		data_ac = NULL;
+	}
+	if (meta_ac) {
+		ocfs2_free_alloc_context(meta_ac);
+		meta_ac = NULL;
+	}
+	if ((!status) && restart_func) {
+		restart_func = 0;
+		goto restart_all;
+	}
+	if (bh) {
+		brelse(bh);
+		bh = NULL;
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Run an empty prepare/commit write at byte offset @size.  Parts taken
+ * from generic_cont_expand, which was too fragile for us without having
+ * to worry about recursive locking in ->commit_write(). */
+static int ocfs2_write_zero_page(struct inode *inode,
+				 u64 size)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+	unsigned long index;
+	unsigned int offset;
+	struct ocfs2_journal_handle *handle = NULL;
+	int ret;
+
+	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
+	/*
+	 * Ugh.  In prepare/commit_write, if from==to==start of block, the
+	 * prepare is skipped.  Never send an offset at the start of a block.
+	 */
+	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
+		offset++;
+	}
+	index = size >> PAGE_CACHE_SHIFT;	/* page cache index holding @size */
+
+	page = grab_cache_page(mapping, index);
+	if (!page) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_prepare_write(NULL, page, offset, offset);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	if (ocfs2_should_order_data(inode)) {
+		handle = ocfs2_start_walk_page_trans(inode, page, offset,
+						     offset);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			handle = NULL;	/* so the commit below is skipped */
+			goto out_unlock;
+		}
+	}
+
+	/* must not update i_size! */
+	ret = block_commit_write(page, offset, offset);
+	if (ret < 0)
+		mlog_errno(ret);
+	else
+		ret = 0;	/* any non-negative result is reported as 0 */
+
+	if (handle)
+		ocfs2_commit_trans(handle);
+out_unlock:
+	unlock_page(page);
+	page_cache_release(page);
+out:
+	return ret;
+}
+
+static int ocfs2_zero_extend(struct inode *inode,
+			     u64 zero_to_size)
+{
+	int ret = 0;
+	u64 start_off;
+	struct super_block *sb = inode->i_sb;
+
+	start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
+	while (start_off < zero_to_size) {
+		ret = ocfs2_write_zero_page(inode, start_off);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		start_off += sb->s_blocksize;
+	}
+
+out:
+	return ret;
+}
+
+static int ocfs2_extend_file(struct inode *inode,
+			     struct buffer_head *di_bh,
+			     u64 new_i_size)
+{
+	int ret = 0;
+	u32 clusters_to_add;
+
+	/* setattr sometimes calls us like this. */
+	if (new_i_size == 0)
+		goto out;
+
+	if (i_size_read(inode) == new_i_size)
+  		goto out;
+	BUG_ON(new_i_size < i_size_read(inode));
+
+	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 
+		OCFS2_I(inode)->ip_clusters;
+
+	if (clusters_to_add) {
+		ret = ocfs2_extend_allocation(inode, clusters_to_add);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_zero_extend(inode, new_i_size);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	} 
+
+	/* No allocation required, we just use this helper to
+	 * do a trivial update of i_size. */
+	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+out:
+	return ret;
+}
+
+int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	int status = 0, size_change;
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	struct buffer_head *bh = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+
+	mlog_entry("(0x%p, '%.*s')\n", dentry,
+	           dentry->d_name.len, dentry->d_name.name);
+
+	if (attr->ia_valid & ATTR_MODE)
+		mlog(0, "mode change: %d\n", attr->ia_mode);
+	if (attr->ia_valid & ATTR_UID)
+		mlog(0, "uid change: %d\n", attr->ia_uid);
+	if (attr->ia_valid & ATTR_GID)
+		mlog(0, "gid change: %d\n", attr->ia_gid);
+	if (attr->ia_valid & ATTR_SIZE)
+		mlog(0, "size change...\n");
+	if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
+		mlog(0, "time change...\n");
+
+#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
+			   | ATTR_GID | ATTR_UID | ATTR_MODE)
+	if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
+		mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
+		return 0;
+	}
+
+	status = inode_change_ok(inode, attr);
+	if (status)
+		return status;
+
+	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
+	if (size_change) {
+		status = ocfs2_rw_lock(inode, 1);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = ocfs2_meta_lock(inode, NULL, &bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail_unlock_rw;
+	}
+
+	if (size_change && attr->ia_size != i_size_read(inode)) {
+		if (i_size_read(inode) > attr->ia_size)
+			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
+		else
+			status = ocfs2_extend_file(inode, bh, attr->ia_size);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			status = -ENOSPC;
+			goto bail_unlock;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto bail_unlock;
+	}
+
+	status = inode_setattr(inode, attr);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	status = ocfs2_mark_inode_dirty(handle, inode, bh);
+	if (status < 0)
+		mlog_errno(status);
+
+bail_commit:
+	ocfs2_commit_trans(handle);
+bail_unlock:
+	ocfs2_meta_unlock(inode, 1);
+bail_unlock_rw:
+	if (size_change)
+		ocfs2_rw_unlock(inode, 1);
+bail:
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_getattr(struct vfsmount *mnt,
+		  struct dentry *dentry,
+		  struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = dentry->d_inode->i_sb;
+	struct ocfs2_super *osb = sb->s_fs_info;
+	int err;
+
+	mlog_entry_void();
+
+	err = ocfs2_inode_revalidate(dentry);
+	if (err) {
+		if (err != -ENOENT)
+			mlog_errno(err);
+		goto bail;
+	}
+
+	generic_fillattr(inode, stat);
+
+	/* We set the blksize from the cluster size for performance */
+	stat->blksize = osb->s_clustersize;
+
+bail:
+	mlog_exit(err);
+
+	return err;
+}
+
+static int ocfs2_write_remove_suid(struct inode *inode)
+{
+	int ret;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_journal_handle *handle;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di;
+
+	mlog_entry("(Inode %"MLFu64", mode 0%o)\n", oi->ip_blkno,
+		   inode->i_mode);
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {	/* failure is an ERR_PTR, as in ocfs2_setattr */
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_trans;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_bh;
+	}
+
+	inode->i_mode &= ~S_ISUID;
+	if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
+		inode->i_mode &= ~S_ISGID;	/* sgid only with group-exec */
+
+	di = (struct ocfs2_dinode *) bh->b_data;
+	di->i_mode = cpu_to_le16(inode->i_mode);	/* mirror into dinode */
+
+	ret = ocfs2_journal_dirty(handle, bh);
+	if (ret < 0)
+		mlog_errno(ret);
+out_bh:
+	brelse(bh);
+out_trans:
+	ocfs2_commit_trans(handle);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+static inline int ocfs2_write_should_remove_suid(struct inode *inode)
+{
+	mode_t mode = inode->i_mode;
+
+	if (!capable(CAP_FSETID)) {
+		if (unlikely(mode & S_ISUID))
+			return 1;
+
+		if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
+			return 1;
+	}
+	return 0;
+}
+
+static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
+				    const char __user *buf,
+				    size_t count,
+				    loff_t pos)
+{
+	struct iovec local_iov = { .iov_base = (void __user *)buf,
+				   .iov_len = count };
+	int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0;
+	u32 clusters;
+	struct file *filp = iocb->ki_filp;
+	struct inode *inode = filp->f_dentry->d_inode;
+	loff_t newsize, saved_pos;
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+#endif
+
+	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
+		   (unsigned int)count,
+		   filp->f_dentry->d_name.len,
+		   filp->f_dentry->d_name.name);
+
+	/* happy write of zero bytes */
+	if (count == 0)
+		return 0;
+
+	if (!inode) {
+		mlog(0, "bad inode\n");
+		return -EIO;
+	}
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	/* ugh, work around some applications which open everything O_DIRECT +
+	 * O_APPEND and really don't mean to use O_DIRECT. */
+	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
+	    (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT)) 
+		filp->f_flags &= ~O_DIRECT;
+#endif
+
+	down(&inode->i_sem);
+	/* to match setattr's i_sem -> i_alloc_sem -> rw_lock ordering */
+	if (filp->f_flags & O_DIRECT) {
+		have_alloc_sem = 1;
+		down_read(&inode->i_alloc_sem);
+	}
+
+	/* concurrent O_DIRECT writes are allowed */
+	rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
+	ret = ocfs2_rw_lock(inode, rw_level);
+	if (ret < 0) {
+		rw_level = -1;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* 
+	 * We sample i_size under a read level meta lock to see if our write
+	 * is extending the file, if it is we back off and get a write level
+	 * meta lock.
+	 */
+	meta_level = (filp->f_flags & O_APPEND) ? 1 : 0;
+	for(;;) {
+		ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level);
+		if (ret < 0) {
+			meta_level = -1;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/* Clear suid / sgid if necessary. We do this here
+		 * instead of later in the write path because
+		 * remove_suid() calls ->setattr without any hint that
+		 * we may have already done our cluster locking. Since
+		 * ocfs2_setattr() *must* take cluster locks to
+		 * proceed, this will lead us to recursively lock the
+		 * inode. There's also the dinode i_size state which
+		 * can be lost via setattr during extending writes (we
+		 * set inode->i_size at the end of a write). */
+		if (ocfs2_write_should_remove_suid(inode)) {
+			if (meta_level == 0) {
+				ocfs2_meta_unlock(inode, meta_level);
+				meta_level = 1;
+				continue;
+			}
+
+			ret = ocfs2_write_remove_suid(inode);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		/* work on a copy of the position until we're sure that we
+		 * won't have to recalculate it due to relocking. */
+		if (filp->f_flags & O_APPEND) {
+			saved_pos = i_size_read(inode);
+			mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
+		} else {
+			saved_pos = iocb->ki_pos;
+		}
+		newsize = count + saved_pos;
+
+		mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n",
+		     saved_pos, newsize, i_size_read(inode));
+
+		/* No need for a higher level metadata lock if we're
+		 * never going past i_size. */
+		if (newsize <= i_size_read(inode))
+			break;
+
+		if (meta_level == 0) {
+			ocfs2_meta_unlock(inode, meta_level);
+			meta_level = 1;
+			continue;
+		}
+
+		spin_lock(&OCFS2_I(inode)->ip_lock);
+		clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
+			OCFS2_I(inode)->ip_clusters;
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+		mlog(0, "Writing at EOF, may need more allocation: "
+		     "i_size = %lld, newsize = %"MLFu64", need %u clusters\n",
+		     i_size_read(inode), newsize, clusters);
+
+		/* We only want to continue the rest of this loop if
+		 * our extend will actually require more
+		 * allocation. */
+		if (!clusters)
+			break;
+
+		ret = ocfs2_extend_allocation(inode, clusters);
+		if (ret < 0) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+
+		/* Fill any holes which would've been created by this
+		 * write. If we're O_APPEND, this will wind up
+		 * (correctly) being a noop. */
+		ret = ocfs2_zero_extend(inode, (u64) newsize - count);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		break;
+	}
+
+	/* ok, we're done with i_size and alloc work */
+	iocb->ki_pos = saved_pos;
+	ocfs2_meta_unlock(inode, meta_level);
+	meta_level = -1;
+
+	/* communicate with ocfs2_dio_end_io */
+	ocfs2_iocb_set_rw_locked(iocb);
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
+	    filp->f_flags & O_DIRECT) {
+		unsigned int saved_flags = filp->f_flags;
+		int sector_size = 1 << osb->s_sectsize_bits;
+
+		if ((saved_pos & (sector_size - 1)) ||
+		    (count & (sector_size - 1)) ||
+		    ((unsigned long)buf & (sector_size - 1))) {
+			filp->f_flags |= O_SYNC;
+			filp->f_flags &= ~O_DIRECT;
+		}
+
+		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+						    &iocb->ki_pos);
+
+		filp->f_flags = saved_flags;
+	} else
+#endif
+		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+						    &iocb->ki_pos);
+
+	/* buffered aio wouldn't have proper lock coverage today */
+	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
+
+	/* 
+	 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
+	 * function pointer which is called when o_direct io completes so that
+	 * it can unlock our rw lock.  (it's the clustered equivalent of
+	 * i_alloc_sem; protects truncate from racing with pending ios).
+	 * Unfortunately there are error cases which call end_io and others
+	 * that don't.  so we don't have to unlock the rw_lock if either an
+	 * async dio is going to do it in the future or an end_io after an
+	 * error has already done it.
+	 */
+	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
+		rw_level = -1;
+		have_alloc_sem = 0;
+	}
+
+out:
+	if (meta_level != -1)
+		ocfs2_meta_unlock(inode, meta_level);
+	if (have_alloc_sem)
+		up_read(&inode->i_alloc_sem);
+	if (rw_level != -1) 
+		ocfs2_rw_unlock(inode, rw_level);
+	up(&inode->i_sem);
+
+	mlog_exit(ret);
+	return ret;
+}
+
+static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
+				   char __user *buf,
+				   size_t count,
+				   loff_t pos)
+{
+	int ret = 0, rw_level = -1, have_alloc_sem = 0;
+	struct file *filp = iocb->ki_filp;
+	struct inode *inode = filp->f_dentry->d_inode;
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+#endif
+
+	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
+		   (unsigned int)count,
+		   filp->f_dentry->d_name.len,
+		   filp->f_dentry->d_name.name);
+
+	if (!inode) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		goto bail;
+	}
+
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
+		if (filp->f_flags & O_DIRECT) {
+			int sector_size = 1 << osb->s_sectsize_bits;
+
+			if ((pos & (sector_size - 1)) ||
+			    (count & (sector_size - 1)) ||
+			    ((unsigned long)buf & (sector_size - 1)) ||
+			    (i_size_read(inode) & (sector_size -1))) {
+				filp->f_flags &= ~O_DIRECT;
+			}
+		}
+	}
+#endif
+
+	/* 
+	 * buffered reads protect themselves in ->readpage().  O_DIRECT reads
+	 * need locks to protect pending reads from racing with truncate.
+	 */
+	if (filp->f_flags & O_DIRECT) {
+		down_read(&inode->i_alloc_sem);
+		have_alloc_sem = 1;
+
+		ret = ocfs2_rw_lock(inode, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto bail;
+		}
+		rw_level = 0;
+		/* communicate with ocfs2_dio_end_io */
+		ocfs2_iocb_set_rw_locked(iocb);
+	}
+
+	ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
+	if (ret == -EINVAL)
+		mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
+
+	/* buffered aio wouldn't have proper lock coverage today */
+	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
+
+	/* see ocfs2_file_aio_write */
+	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
+		rw_level = -1;
+		have_alloc_sem = 0;
+	}
+
+bail:
+	if (have_alloc_sem)
+		up_read(&inode->i_alloc_sem);
+	if (rw_level != -1) 
+		ocfs2_rw_unlock(inode, rw_level);
+	mlog_exit(ret);
+
+	return ret;
+}
+
+struct inode_operations ocfs2_file_iops = {
+	.setattr	= ocfs2_setattr,
+	.getattr	= ocfs2_getattr,
+};
+
+struct inode_operations ocfs2_special_file_iops = {
+	.setattr	= ocfs2_setattr,
+	.getattr	= ocfs2_getattr,
+};
+
+struct file_operations ocfs2_fops = {
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+	.sendfile	= generic_file_sendfile,
+	.mmap		= ocfs2_mmap,
+	.fsync		= ocfs2_sync_file,
+	.release	= ocfs2_file_release,
+	.open		= ocfs2_file_open,
+	.aio_read	= ocfs2_file_aio_read,
+	.aio_write	= ocfs2_file_aio_write,
+};
+
+struct file_operations ocfs2_dops = {
+	.read		= generic_read_dir,
+	.readdir	= ocfs2_readdir,
+	.fsync		= ocfs2_sync_file,
+};
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
new file mode 100644
index 0000000..a5ea33b
--- /dev/null
+++ b/fs/ocfs2/file.h
@@ -0,0 +1,57 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * file.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_FILE_H
+#define OCFS2_FILE_H
+
+extern struct file_operations ocfs2_fops;
+extern struct file_operations ocfs2_dops;
+extern struct inode_operations ocfs2_file_iops;
+extern struct inode_operations ocfs2_special_file_iops;
+struct ocfs2_alloc_context;
+
+enum ocfs2_alloc_restarted {
+	RESTART_NONE = 0,
+	RESTART_TRANS,
+	RESTART_META
+};
+int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
+			       struct inode *inode,
+			       u32 clusters_to_add,
+			       struct buffer_head *fe_bh,
+			       struct ocfs2_journal_handle *handle,
+			       struct ocfs2_alloc_context *data_ac,
+			       struct ocfs2_alloc_context *meta_ac,
+			       enum ocfs2_alloc_restarted *reason);
+int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
+int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		  struct kstat *stat);
+
+int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
+			 struct inode *inode,
+			 struct buffer_head *fe_bh,
+			 u64 new_i_size);
+
+#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
new file mode 100644
index 0000000..0bbd22f
--- /dev/null
+++ b/fs/ocfs2/heartbeat.c
@@ -0,0 +1,378 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.c
+ *
+ * Register ourselves with the heartbeat service, keep our node maps
+ * up to date, and fire off recovery when needed.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/kmod.h>
+
+#include <cluster/heartbeat.h>
+#include <cluster/nodemanager.h>
+
+#include <dlm/dlmapi.h>
+
+#define MLOG_MASK_PREFIX ML_SUPER
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
+#define OCFS2_HB_NODE_DOWN_PRI     (0x0000002)
+#define OCFS2_HB_NODE_UP_PRI	   OCFS2_HB_NODE_DOWN_PRI
+
+static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
+					    int bit);
+static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
+					      int bit);
+static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);
+static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
+				 struct ocfs2_node_map *from);
+static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
+				 struct ocfs2_node_map *from);
+
+void ocfs2_init_node_maps(struct ocfs2_super *osb)
+{
+	spin_lock_init(&osb->node_map_lock);
+	ocfs2_node_map_init(&osb->mounted_map);
+	ocfs2_node_map_init(&osb->recovery_map);
+	ocfs2_node_map_init(&osb->umount_map);
+}
+
+static void ocfs2_do_node_down(int node_num,
+			       struct ocfs2_super *osb)
+{
+	BUG_ON(osb->node_num == node_num);
+
+	mlog(0, "ocfs2: node down event for %d\n", node_num);
+
+	if (!osb->dlm) {
+		/*
+		 * No DLM means we're not even ready to participate yet.
+		 * We check the slots after the DLM comes up, so we will
+		 * notice the node death then.  We can safely ignore it
+		 * here.
+		 */
+		return;
+	}
+
+	if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
+		/* If a node is in the umount map, then we've been
+		 * expecting him to go down and we know ahead of time
+		 * that recovery is not necessary. */
+		ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
+		return;
+	}
+
+	ocfs2_recovery_thread(osb, node_num);
+
+	ocfs2_remove_node_from_vote_queues(osb, node_num);
+}
+
+static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
+				  int node_num,
+				  void *data)
+{
+	ocfs2_do_node_down(node_num, (struct ocfs2_super *) data);
+}
+
+/* Called from the dlm when it's about to evict a node. We may also
+ * get a heartbeat callback later. */
+static void ocfs2_dlm_eviction_cb(int node_num,
+				  void *data)
+{
+	struct ocfs2_super *osb = (struct ocfs2_super *) data;
+	struct super_block *sb = osb->sb;
+
+	mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n",
+	     MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num);
+
+	ocfs2_do_node_down(node_num, osb);
+}
+
+static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
+				int node_num,
+				void *data)
+{
+	struct ocfs2_super *osb = data;
+
+	BUG_ON(osb->node_num == node_num);
+
+	mlog(0, "node up event for %d\n", node_num);
+	ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
+}
+
+void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
+{
+	o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
+			    ocfs2_hb_node_down_cb, osb,
+			    OCFS2_HB_NODE_DOWN_PRI);
+
+	o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
+			    ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
+
+	/* Not exactly a heartbeat callback, but leads to essentially
+	 * the same path so we set it up here. */
+	dlm_setup_eviction_cb(&osb->osb_eviction_cb,
+			      ocfs2_dlm_eviction_cb,
+			      osb);
+}
+
+/* Most functions here are just stubs for now... */
+int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
+{
+	int status;
+
+	status = o2hb_register_callback(&osb->osb_hb_down);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = o2hb_register_callback(&osb->osb_hb_up);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	return status;
+}
+
+void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
+{
+	int status;
+
+	status = o2hb_unregister_callback(&osb->osb_hb_down);
+	if (status < 0)
+		mlog_errno(status);
+
+	status = o2hb_unregister_callback(&osb->osb_hb_up);
+	if (status < 0)
+		mlog_errno(status);
+}
+
+void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
+{
+	int ret;
+	char *argv[5], *envp[3];
+
+	if (!osb->uuid_str) {
+		/* This can happen if we don't get far enough in mount... */
+		mlog(0, "No UUID with which to stop heartbeat!\n\n");
+		return;
+	}
+
+	argv[0] = (char *)o2nm_get_hb_ctl_path();
+	argv[1] = "-K";
+	argv[2] = "-u";
+	argv[3] = osb->uuid_str;
+	argv[4] = NULL;
+
+	mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
+
+	/* minimal command environment taken from cpu_run_sbin_hotplug */
+	envp[0] = "HOME=/";
+	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+	envp[2] = NULL;
+
+	ret = call_usermodehelper(argv[0], argv, envp, 1);
+	if (ret < 0)
+		mlog_errno(ret);
+}
+
+/* special case -1 for now
+ * TODO: should *really* make sure the calling func never passes -1!!  */
+void ocfs2_node_map_init(struct ocfs2_node_map *map)
+{
+	map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
+	memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
+	       sizeof(unsigned long));
+}
+
+static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
+					    int bit)
+{
+	set_bit(bit, map->map);
+}
+
+void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit)
+{
+	if (bit==-1)
+		return;
+	BUG_ON(bit >= map->num_nodes);
+	spin_lock(&osb->node_map_lock);
+	__ocfs2_node_map_set_bit(map, bit);
+	spin_unlock(&osb->node_map_lock);
+}
+
+static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
+					      int bit)
+{
+	clear_bit(bit, map->map);
+}
+
+void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
+			      struct ocfs2_node_map *map,
+			      int bit)
+{
+	if (bit==-1)
+		return;
+	BUG_ON(bit >= map->num_nodes);
+	spin_lock(&osb->node_map_lock);
+	__ocfs2_node_map_clear_bit(map, bit);
+	spin_unlock(&osb->node_map_lock);
+}
+
+int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit)
+{
+	int ret;
+	if (bit >= map->num_nodes) {
+		mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes);
+		BUG();
+	}
+	spin_lock(&osb->node_map_lock);
+	ret = test_bit(bit, map->map);
+	spin_unlock(&osb->node_map_lock);
+	return ret;
+}
+
+static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map)
+{
+	int bit;
+	bit = find_next_bit(map->map, map->num_nodes, 0);
+	if (bit < map->num_nodes)
+		return 0;
+	return 1;
+}
+
+int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map)
+{
+	int ret;
+	BUG_ON(map->num_nodes == 0);
+	spin_lock(&osb->node_map_lock);
+	ret = __ocfs2_node_map_is_empty(map);
+	spin_unlock(&osb->node_map_lock);
+	return ret;
+}
+
+static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
+				 struct ocfs2_node_map *from)
+{
+	BUG_ON(from->num_nodes == 0);
+	ocfs2_node_map_init(target);
+	__ocfs2_node_map_set(target, from);
+}
+
+/* returns 1 if bit is the only bit set in target, 0 otherwise */
+int ocfs2_node_map_is_only(struct ocfs2_super *osb,
+			   struct ocfs2_node_map *target,
+			   int bit)
+{
+	struct ocfs2_node_map temp;
+	int ret;
+
+	spin_lock(&osb->node_map_lock);
+	__ocfs2_node_map_dup(&temp, target);
+	__ocfs2_node_map_clear_bit(&temp, bit);
+	ret = __ocfs2_node_map_is_empty(&temp);
+	spin_unlock(&osb->node_map_lock);
+
+	return ret;
+}
+
+static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
+				 struct ocfs2_node_map *from)
+{
+	int num_longs, i;
+
+	BUG_ON(target->num_nodes != from->num_nodes);
+	BUG_ON(target->num_nodes == 0);
+
+	num_longs = BITS_TO_LONGS(target->num_nodes);
+	for (i = 0; i < num_longs; i++)
+		target->map[i] = from->map[i];
+}
+
+/* Returns whether the recovery bit was actually set - it may not be
+ * if a node is still marked as needing recovery */
+int ocfs2_recovery_map_set(struct ocfs2_super *osb,
+			   int num)
+{
+	int set = 0;
+
+	spin_lock(&osb->node_map_lock);
+
+	__ocfs2_node_map_clear_bit(&osb->mounted_map, num);
+
+	if (!test_bit(num, osb->recovery_map.map)) {
+	    __ocfs2_node_map_set_bit(&osb->recovery_map, num);
+	    set = 1;
+	}
+
+	spin_unlock(&osb->node_map_lock);
+
+	return set;
+}
+
+void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
+			      int num)
+{
+	ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num);
+}
+
+int ocfs2_node_map_iterate(struct ocfs2_super *osb,
+			   struct ocfs2_node_map *map,
+			   int idx)
+{
+	int i = idx;
+
+	idx = O2NM_INVALID_NODE_NUM;
+	spin_lock(&osb->node_map_lock);
+	if ((i != O2NM_INVALID_NODE_NUM) &&
+	    (i >= 0) &&
+	    (i < map->num_nodes)) {
+		while(i < map->num_nodes) {
+			if (test_bit(i, map->map)) {
+				idx = i;
+				break;
+			}
+			i++;
+		}
+	}
+	spin_unlock(&osb->node_map_lock);
+	return idx;
+}
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
new file mode 100644
index 0000000..e8fb079
--- /dev/null
+++ b/fs/ocfs2/heartbeat.h
@@ -0,0 +1,67 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * heartbeat.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_HEARTBEAT_H
+#define OCFS2_HEARTBEAT_H
+
+void ocfs2_init_node_maps(struct ocfs2_super *osb);
+
+void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
+int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
+void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
+void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
+
+/* node map functions - used to keep track of mounted and in-recovery
+ * nodes. */
+void ocfs2_node_map_init(struct ocfs2_node_map *map);
+int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map);
+void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit);
+void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
+			      struct ocfs2_node_map *map,
+			      int bit);
+int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
+			    struct ocfs2_node_map *map,
+			    int bit);
+int ocfs2_node_map_iterate(struct ocfs2_super *osb,
+			   struct ocfs2_node_map *map,
+			   int idx);
+static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
+					       struct ocfs2_node_map *map)
+{
+	return ocfs2_node_map_iterate(osb, map, 0);
+}
+int ocfs2_recovery_map_set(struct ocfs2_super *osb,
+			   int num);
+void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
+			      int num);
+/* returns 1 if bit is the only bit set in target, 0 otherwise */
+int ocfs2_node_map_is_only(struct ocfs2_super *osb,
+			   struct ocfs2_node_map *target,
+			   int bit);
+
+#endif /* OCFS2_HEARTBEAT_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
new file mode 100644
index 0000000..a91ba4d
--- /dev/null
+++ b/fs/ocfs2/inode.c
@@ -0,0 +1,1140 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * inode.c
+ *
+ * vfs' aops, fops, dops and iops
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+
+#include <asm/byteorder.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "namei.h"
+#include "suballoc.h"
+#include "super.h"
+#include "symlink.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
+#define OCFS2_FI_FLAG_NOWAIT	0x1	/* set by ocfs2_ilookup_for_vote() */
+#define OCFS2_FI_FLAG_DELETE	0x2	/* lookup is for a delete vote */
+struct ocfs2_find_inode_args
+{
+	u64		fi_blkno;	/* block number matched on ip_blkno */
+	unsigned long	fi_ino;		/* ino_from_blkno() of fi_blkno */
+	unsigned int	fi_flags;	/* OCFS2_FI_FLAG_* bits */
+};
+
+static int ocfs2_read_locked_inode(struct inode *inode,
+				   struct ocfs2_find_inode_args *args);
+static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
+static int ocfs2_find_actor(struct inode *inode, void *opaque);
+static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
+				    struct inode *inode,
+				    struct buffer_head *fe_bh);
+
+struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
+				     u64 blkno,
+				     int delete_vote)
+{
+	struct ocfs2_find_inode_args lookup;
+	unsigned int flags = OCFS2_FI_FLAG_NOWAIT;
+
+	/* Being called from anywhere but the vote thread is a bug. */
+	BUG_ON(current != osb->vote_task);
+
+	if (delete_vote)
+		flags |= OCFS2_FI_FLAG_DELETE;
+	lookup.fi_flags = flags;
+	lookup.fi_blkno = blkno;
+	lookup.fi_ino = ino_from_blkno(osb->sb, blkno);
+	return ilookup5(osb->sb, lookup.fi_ino, ocfs2_find_actor, &lookup);
+}
+
+struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno)
+{
+	struct inode *inode = NULL;
+	struct super_block *sb = osb->sb;
+	struct ocfs2_find_inode_args args;
+
+	mlog_entry("(blkno = %"MLFu64")\n", blkno);
+
+	/* Sanity check the caller's block number first: 0 is rejected
+	 * with -EINVAL before the inode cache is consulted at all.
+	 * Every failure path returns an ERR_PTR, never NULL. */
+	if (blkno == 0) {
+		inode = ERR_PTR(-EINVAL);
+		mlog_errno(PTR_ERR(inode));
+		goto bail;
+	}
+
+	args.fi_blkno = blkno;
+	args.fi_flags = 0;
+	args.fi_ino = ino_from_blkno(sb, blkno);
+
+	inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
+			     ocfs2_init_locked_inode, &args);
+	/* I_NEW means the inode was *not* in the inode cache, so we
+	 * must read it from disk ourselves and unlock it afterwards.
+	 * A failed read is detected by is_bad_inode() below. */
+	if (inode && inode->i_state & I_NEW) {
+		mlog(0, "Inode was not in inode cache, reading it.\n");
+		ocfs2_read_locked_inode(inode, &args);
+		unlock_new_inode(inode);
+	}
+	if (inode == NULL) {
+		inode = ERR_PTR(-ENOMEM);
+		mlog_errno(PTR_ERR(inode));
+		goto bail;
+	}
+	if (is_bad_inode(inode)) {
+		iput(inode);
+		inode = ERR_PTR(-ESTALE);
+		mlog_errno(PTR_ERR(inode));
+		goto bail;
+	}
+
+bail:
+	if (!IS_ERR(inode)) {
+		mlog(0, "returning inode with number %"MLFu64"\n",
+		     OCFS2_I(inode)->ip_blkno);
+		mlog_exit_ptr(inode);
+	} else
+		mlog_errno(PTR_ERR(inode));
+
+	return inode;
+}
+
+
+/*
+ * How inodes get read from disk:
+ * iget5_locked -> find actor callback -> ocfs2_find_actor
+ *   returns 1 (match) : the in-memory inode is returned
+ *   returns 0 for all : get_new_inode -> ocfs2_init_locked_inode
+ */
+
+static int ocfs2_find_actor(struct inode *inode, void *opaque)
+{
+	struct ocfs2_find_inode_args *args = opaque;
+	struct ocfs2_inode_info *oi;
+	int ret = 0;
+
+	/* Check before inode is dereferenced by anything below. */
+	mlog_bug_on_msg(!inode, "No inode in find actor!\n");
+
+	mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque);
+	oi = OCFS2_I(inode);
+
+	if (oi->ip_blkno != args->fi_blkno)
+		goto bail;
+
+	/* OCFS2_FI_FLAG_NOWAIT is *only* set from
+	 * ocfs2_ilookup_for_vote which won't create an inode for one
+	 * that isn't found. The vote thread doesn't want to get
+	 * an inode which is in the process of going away - otherwise
+	 * the call to __wait_on_freeing_inode in find_inode_fast will
+	 * cause it to deadlock on an inode which may be waiting on a
+	 * vote (or lock release) in delete_inode */
+	if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) &&
+	    (inode->i_state & (I_FREEING|I_CLEAR))) {
+		/* As stated above, we're not going to return an
+		 * inode.  In the case of a delete vote, the voting
+		 * code is going to signal the other node to go
+		 * ahead. Mark that state here, so this freeing inode
+		 * has the state when it gets to delete_inode. */
+		if (args->fi_flags & OCFS2_FI_FLAG_DELETE) {
+			spin_lock(&oi->ip_lock);
+			ocfs2_mark_inode_remotely_deleted(inode);
+			spin_unlock(&oi->ip_lock);
+		}
+		goto bail;
+	}
+
+	ret = 1;
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+/*
+ * Initialize the new inode from the lookup args (i_ino and ip_blkno);
+ * must not do anything that would cause us to sleep.
+ * Always returns 0 - there is no failure path here.
+ */
+static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
+{
+	struct ocfs2_find_inode_args *args = opaque;
+
+	mlog_entry("inode = %p, opaque = %p\n", inode, opaque);
+
+	inode->i_ino = args->fi_ino;
+	OCFS2_I(inode)->ip_blkno = args->fi_blkno;
+
+	mlog_exit(0);
+	return 0;
+}
+
+/* Fill in @inode from the on-disk dinode @fe. Returns 0, or -EINVAL if
+ * @fe fails OCFS2_IS_VALID_DINODE(), lacks OCFS2_VALID_FL or carries a
+ * foreign fs generation. A set @create_ino also derives i_ino from @fe. */
+int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+			 int create_ino)
+{
+	struct super_block *sb;
+	struct ocfs2_super *osb;
+	int status = -EINVAL;
+	u64 blkno = le64_to_cpu(fe->i_blkno);	/* disk fields are LE */
+
+	mlog_entry("(0x%p, size:%"MLFu64")\n", inode,
+		   le64_to_cpu(fe->i_size));
+
+	sb = inode->i_sb;
+	osb = OCFS2_SB(sb);
+
+	/* this means that read_inode cannot create a superblock inode
+	 * today.  change if needed. */
+	if (!OCFS2_IS_VALID_DINODE(fe) ||
+	    !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+		mlog(ML_ERROR, "Invalid dinode: i_ino=%lu, i_blkno=%"MLFu64", "
+		     "signature = %.*s, flags = 0x%x\n",
+		     inode->i_ino, blkno, 7,
+		     fe->i_signature, le32_to_cpu(fe->i_flags));
+		goto bail;
+	}
+
+	if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) {
+		mlog(ML_ERROR, "file entry generation does not match "
+		     "superblock! osb->fs_generation=%x, "
+		     "fe->i_fs_generation=%x\n",
+		     osb->fs_generation, le32_to_cpu(fe->i_fs_generation));
+		goto bail;
+	}
+
+	inode->i_version = 1;
+	inode->i_generation = le32_to_cpu(fe->i_generation);
+	inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
+	inode->i_mode = le16_to_cpu(fe->i_mode);
+	inode->i_uid = le32_to_cpu(fe->i_uid);
+	inode->i_gid = le32_to_cpu(fe->i_gid);
+	inode->i_blksize = (u32)osb->s_clustersize;
+
+	/* Fast symlinks will have i_size but no allocated clusters. */
+	if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
+		inode->i_blocks = 0;
+	else
+		inode->i_blocks =
+			ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size));
+	inode->i_mapping->a_ops = &ocfs2_aops;
+	inode->i_flags |= S_NOATIME;
+	inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
+	inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
+	inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
+	inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
+	inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
+	inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
+
+	if (OCFS2_I(inode)->ip_blkno != blkno)
+		mlog(ML_ERROR, "ip_blkno %"MLFu64" != i_blkno %"MLFu64"!\n",
+		     OCFS2_I(inode)->ip_blkno, blkno);
+
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
+
+	if (create_ino)
+		inode->i_ino = ino_from_blkno(sb, blkno);
+	mlog(0, "blkno = %"MLFu64", ino = %lu, create_ino = %s\n",
+	     blkno, inode->i_ino, create_ino ? "true" : "false");
+	inode->i_nlink = le16_to_cpu(fe->i_links_count);
+
+	if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
+		mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
+	} else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
+	} else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
+		mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
+		/* we can't actually hit this as read_inode can't
+		 * handle superblocks today ;-) */
+		BUG();
+	}
+
+	switch (inode->i_mode & S_IFMT) {
+	    case S_IFREG:
+		    inode->i_fop = &ocfs2_fops;
+		    inode->i_op = &ocfs2_file_iops;
+		    i_size_write(inode, le64_to_cpu(fe->i_size));
+		    break;
+	    case S_IFDIR:
+		    inode->i_op = &ocfs2_dir_iops;
+		    inode->i_fop = &ocfs2_dops;
+		    i_size_write(inode, le64_to_cpu(fe->i_size));
+		    break;
+	    case S_IFLNK:
+		    if (ocfs2_inode_is_fast_symlink(inode))
+			inode->i_op = &ocfs2_fast_symlink_inode_operations;
+		    else
+			inode->i_op = &ocfs2_symlink_inode_operations;
+		    i_size_write(inode, le64_to_cpu(fe->i_size));
+		    break;
+	    default:
+		    inode->i_op = &ocfs2_special_file_iops;
+		    init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		    break;
+	}
+
+	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
+				  OCFS2_LOCK_TYPE_RW, inode);
+	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
+				  OCFS2_LOCK_TYPE_META, inode);
+	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
+				  OCFS2_LOCK_TYPE_DATA, inode);
+
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* Read the dinode at args->fi_blkno and populate the new, locked @inode
+ * from it. Returns 0 or a negative errno; on any failure after the
+ * initial inode check the inode is also marked bad. */
+static int ocfs2_read_locked_inode(struct inode *inode,
+				   struct ocfs2_find_inode_args *args)
+{
+	struct super_block *sb;
+	struct ocfs2_super *osb;
+	struct ocfs2_dinode *fe;
+	struct buffer_head *bh = NULL;
+	int status = -EINVAL;
+	int sysfile = 0;
+
+	mlog_entry("(0x%p, 0x%p)\n", inode, args);
+
+	if (inode == NULL || inode->i_sb == NULL) {
+		mlog(ML_ERROR, "bad inode\n");
+		goto bail;
+	}
+	sb = inode->i_sb;
+	osb = OCFS2_SB(sb);
+
+	if (!args) {
+		mlog(ML_ERROR, "bad inode args\n");
+		make_bad_inode(inode);
+		goto bail;
+	}
+
+	/* Read the FE off disk. This is safe because the kernel only
+	 * reads a new inode once, and if it doesn't exist yet then
+	 * nobody can be working on it! */
+	status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, NULL);
+	if (status < 0) {
+		mlog_errno(status);
+		make_bad_inode(inode);
+		goto bail;
+	}
+
+	status = -EINVAL;	/* every failure below reports this */
+	fe = (struct ocfs2_dinode *) bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
+		     le64_to_cpu(fe->i_blkno), 7, fe->i_signature);
+		make_bad_inode(inode);
+		goto bail;
+	}
+
+	if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
+		sysfile = 1;
+
+	if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
+	    S_ISBLK(le16_to_cpu(fe->i_mode)))
+		inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
+
+	if (ocfs2_populate_inode(inode, fe, 0) < 0) {
+		mlog(ML_ERROR, "populate inode failed! i_blkno=%"MLFu64", "
+		     "i_ino=%lu\n", le64_to_cpu(fe->i_blkno), inode->i_ino);
+		make_bad_inode(inode);
+		goto bail;
+	}
+
+	BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
+
+	if (sysfile)
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
+
+	status = 0;
+bail:
+	brelse(bh);	/* NULL-safe; bh is only set once args was checked */
+
+	mlog_exit(status);
+	return status;
+}
+
+void ocfs2_sync_blockdev(struct super_block *sb)
+{
+	sync_blockdev(sb->s_bdev);	/* the block device behind sb */
+}
+
+static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
+				     struct inode *inode,
+				     struct buffer_head *fe_bh)
+{
+	int status = 0;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_truncate_context *tc = NULL;
+	struct ocfs2_dinode *fe;
+
+	mlog_entry_void();
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	/* no clusters allocated means there is nothing to truncate */
+	if (!fe->i_clusters)
+		goto bail;
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;	/* nothing for bail to commit */
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_commit_trans(handle);
+	handle = NULL;	/* committed; bail must not commit it again */
+
+	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Delete from the orphan dir, set dtime/flags and free the dinode. */
+static int ocfs2_remove_inode(struct inode *inode,
+			      struct buffer_head *di_bh,
+			      struct inode *orphan_dir_inode,
+			      struct buffer_head *orphan_dir_bh)
+{
+	int status;
+	struct inode *inode_alloc_inode = NULL;
+	struct buffer_head *inode_alloc_bh = NULL;
+	struct ocfs2_journal_handle *handle;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+
+	inode_alloc_inode =
+		ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
+					    le16_to_cpu(di->i_suballoc_slot));
+	if (!inode_alloc_inode) {
+		status = -EEXIST;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	down(&inode_alloc_inode->i_sem);
+	status = ocfs2_meta_lock(inode_alloc_inode, NULL, &inode_alloc_bh, 1);
+	if (status < 0) {
+		up(&inode_alloc_inode->i_sem);
+		mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, NULL, OCFS2_DELETE_INODE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto bail_unlock;
+	}
+
+	status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
+				  orphan_dir_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	/* set the inode's dtime and clear its valid/orphaned flags */
+	status = ocfs2_journal_access(handle, inode, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
+	le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
+
+	status = ocfs2_journal_dirty(handle, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_commit;
+	}
+
+	ocfs2_remove_from_cache(inode, di_bh);
+
+	status = ocfs2_free_dinode(handle, inode_alloc_inode,
+				   inode_alloc_bh, di);
+	if (status < 0)
+		mlog_errno(status);
+
+bail_commit:
+	ocfs2_commit_trans(handle);
+bail_unlock:
+	ocfs2_meta_unlock(inode_alloc_inode, 1);
+	up(&inode_alloc_inode->i_sem);
+	brelse(inode_alloc_bh);
+bail:
+	iput(inode_alloc_inode);
+
+	return status;
+}
+
+/* Truncate and remove the inode while holding its orphan dir lock. */
+static int ocfs2_wipe_inode(struct inode *inode,
+			    struct buffer_head *di_bh)
+{
+	int status, orphaned_slot;
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/* We've already voted on this so it should be readonly - no
+	 * spinlock needed. */
+	orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       orphaned_slot);
+	if (!orphan_dir_inode) {
+		status = -EEXIST;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* Lock the orphan dir. The lock will be held for the entire
+	 * delete_inode operation. We do this now to avoid races with
+	 * recovery completion on other nodes. */
+	down(&orphan_dir_inode->i_sem);
+	status = ocfs2_meta_lock(orphan_dir_inode, NULL, &orphan_dir_bh, 1);
+	if (status < 0) {
+		up(&orphan_dir_inode->i_sem);
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* we do this while holding the orphan dir lock because we
+	 * don't want recovery being run from another node to vote for
+	 * an inode delete on us -- this will result in two nodes
+	 * truncating the same file! */
+	status = ocfs2_truncate_for_delete(osb, inode, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_dir;
+	}
+
+	status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
+				    orphan_dir_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+bail_unlock_dir:
+	ocfs2_meta_unlock(orphan_dir_inode, 1);
+	up(&orphan_dir_inode->i_sem);
+	brelse(orphan_dir_bh);
+bail:
+	iput(orphan_dir_inode);
+
+	return status;
+}
+
+/* There is a series of simple checks that should be done before a
+ * vote is even considered. Returns 1 if all of them pass, else 0. */
+static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
+{
+	int ret = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/* We shouldn't be getting here for the root directory
+	 * inode.. */
+	if (inode == osb->root_inode) {
+		mlog(ML_ERROR, "Skipping delete of root inode.\n");
+		goto bail;
+	}
+
+	/* If we're coming from process_vote we can't go into our own
+	 * voting [hello, deadlock city!], so unfortunately we just
+	 * have to skip deleting this guy. That's OK though because
+	 * the node that is doing the actual deleting should handle it
+	 * anyway. */
+	if (current == osb->vote_task) {
+		mlog(0, "Skipping delete of %lu because we're currently "
+		     "in process_vote\n", inode->i_ino);
+		goto bail;
+	}
+
+	spin_lock(&oi->ip_lock);
+	/* OCFS2 *never* deletes system files. This should technically
+	 * never get here as system file inodes should always have a
+	 * positive link count. */
+	if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+		mlog(ML_ERROR, "Skipping delete of system file %"MLFu64".\n",
+		     oi->ip_blkno);
+		goto bail_unlock;
+	}
+
+	/* If we have voted "yes" on the wipe of this inode for
+	 * another node, it will be marked here so we can safely skip
+	 * it. Recovery will clean up any inodes we might inadvertently
+	 * skip here. */
+	if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
+		mlog(0, "Skipping delete of %lu because another node "
+		     "has done this for us.\n", inode->i_ino);
+		goto bail_unlock;
+	}
+
+	ret = 1;
+bail_unlock:
+	spin_unlock(&oi->ip_lock);
+bail:
+	return ret;
+}
+
+/* Query the cluster to determine whether we should wipe an inode from
+ * disk or not. *wipe is set to 1 only if the wipe may proceed; the
+ * return value is 0 or a negative errno (an -EBUSY vote maps to 0).
+ * Requires the inode to have the cluster lock. */
+static int ocfs2_query_inode_wipe(struct inode *inode,
+				  struct buffer_head *di_bh,
+				  int *wipe)
+{
+	int status = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di;
+
+	*wipe = 0;
+
+	/* While we were waiting for the cluster lock in
+	 * ocfs2_delete_inode, another node might have asked to delete
+	 * the inode. Recheck our flags to catch this. */
+	if (!ocfs2_inode_is_valid_to_delete(inode)) {
+		mlog(0, "Skipping delete of %"MLFu64" because flags changed\n",
+		     oi->ip_blkno);
+		goto bail;
+	}
+
+	/* Now that we have an up to date inode, we can double check
+	 * the link count. */
+	if (inode->i_nlink) {
+		mlog(0, "Skipping delete of %"MLFu64" because nlink = %u\n",
+		     oi->ip_blkno, inode->i_nlink);
+		goto bail;
+	}
+
+	/* Do some basic inode verification... */
+	di = (struct ocfs2_dinode *) di_bh->b_data;
+	if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
+		status = -EEXIST;	/* for lack of a better error? */
+		mlog(ML_ERROR,
+		     "Inode %"MLFu64" (on-disk %"MLFu64") not orphaned! "
+		     "Disk flags  0x%x, inode flags 0x%x\n",
+		     oi->ip_blkno, le64_to_cpu(di->i_blkno),
+		     le32_to_cpu(di->i_flags), oi->ip_flags);
+		goto bail;
+	}
+
+	/* has someone already deleted us?! baaad... */
+	if (di->i_dtime) {
+		status = -EEXIST;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_request_delete_vote(inode);
+	/* -EBUSY means that other nodes are still using the
+	 * inode. We're done here though, so avoid doing anything on
+	 * disk and let them worry about deleting it. */
+	if (status == -EBUSY) {
+		status = 0;
+		mlog(0, "Skipping delete of %"MLFu64" because it is in use on"
+		     " other nodes\n", oi->ip_blkno);
+		goto bail;
+	}
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	spin_lock(&oi->ip_lock);
+	if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
+		/* Nobody knew which slot this inode was orphaned
+		 * into. This may happen during node death and
+		 * recovery knows how to clean it up so we can safely
+		 * ignore this inode from now on. */
+		mlog(0, "Nobody knew where inode %"MLFu64" was orphaned!\n",
+		     oi->ip_blkno);
+	} else {
+		*wipe = 1;
+
+		mlog(0, "Inode %"MLFu64" is ok to wipe from orphan dir %d\n",
+		     oi->ip_blkno, oi->ip_orphaned_slot);
+	}
+	spin_unlock(&oi->ip_lock);
+
+bail:
+	return status;
+}
+
+/* Support function for ocfs2_delete_inode. Will help us keep the
+ * inode data in a consistent state for clear_inode. Always truncates
+ * pages; if sync_data is set, writes the inode out first. */
+static void ocfs2_cleanup_delete_inode(struct inode *inode,
+				       int sync_data)
+{
+	mlog(0, "Cleanup inode %"MLFu64", sync = %d\n",
+	     OCFS2_I(inode)->ip_blkno, sync_data);
+	if (sync_data)
+		write_inode_now(inode, 1);
+	truncate_inode_pages(&inode->i_data, 0);
+}
+
+void ocfs2_delete_inode(struct inode *inode)
+{
+	int wipe, status;
+	sigset_t blocked, oldset;
+	struct buffer_head *di_bh = NULL;
+
+	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
+
+	if (is_bad_inode(inode)) {
+		mlog(0, "Skipping delete of bad inode\n");
+		goto bail;
+	}
+
+	if (!ocfs2_inode_is_valid_to_delete(inode)) {
+		/* It's probably not necessary to truncate_inode_pages
+		 * here but we do it for safety anyway (it will most
+		 * likely be a no-op anyway) */
+		ocfs2_cleanup_delete_inode(inode, 0);
+		goto bail;
+	}
+
+	/* We want to block signals in delete_inode as the lock and
+	 * messaging paths may return us -ERESTARTSYS. Which would
+	 * cause us to exit early, resulting in inodes being orphaned
+	 * forever. */
+	sigfillset(&blocked);
+	status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
+	if (status < 0) {
+		mlog_errno(status);
+		ocfs2_cleanup_delete_inode(inode, 1);
+		goto bail;
+	}
+
+	/* Lock down the inode. This gives us an up to date view of
+	 * its metadata (for verification), and allows us to
+	 * serialize delete_inode votes.
+	 *
+	 * Even though we might be doing a truncate, we don't take the
+	 * allocation lock here as it won't be needed - nobody will
+	 * have the file open.
+	 */
+	status = ocfs2_meta_lock(inode, NULL, &di_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		ocfs2_cleanup_delete_inode(inode, 0);
+		goto bail_unblock;
+	}
+
+	/* Query the cluster. This will be the final decision made
+	 * before we go ahead and wipe the inode. */
+	status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
+	if (!wipe || status < 0) {
+		/* Error and inode busy vote both mean we won't be
+		 * removing the inode, so they take almost the same
+		 * path. */
+		if (status < 0)
+			mlog_errno(status);
+
+		/* Someone in the cluster has voted to not wipe this
+		 * inode, or it was never completely orphaned. Write
+		 * out the pages and exit now. */
+		ocfs2_cleanup_delete_inode(inode, 1);
+		goto bail_unlock_inode;
+	}
+
+	ocfs2_cleanup_delete_inode(inode, 0);
+
+	status = ocfs2_wipe_inode(inode, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_inode;
+	}
+
+	/* Mark the inode as successfully deleted. This is important
+	 * for ocfs2_clear_inode as it will check this flag and skip
+	 * any checkpointing work */
+	OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
+
+bail_unlock_inode:
+	ocfs2_meta_unlock(inode, 1);
+	brelse(di_bh);
+bail_unblock:
+	status = sigprocmask(SIG_SETMASK, &oldset, NULL);
+	if (status < 0)
+		mlog_errno(status);
+bail:
+	clear_inode(inode);
+	mlog_exit_void();
+}
+
+void ocfs2_clear_inode(struct inode *inode)
+{
+	int status;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	mlog_entry_void();
+
+	if (!inode)
+		goto bail;
+
+	mlog(0, "Clearing inode: %"MLFu64", nlink = %u\n",
+	     OCFS2_I(inode)->ip_blkno, inode->i_nlink);
+
+	mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
+			"Inode=%lu\n", inode->i_ino);
+
+	/* Do these before all the other work so that we don't bounce
+	 * the vote thread while waiting to destroy the locks. */
+	ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
+	ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
+	ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
+
+	/* We very well may get a clear_inode before all of an inode's
+	 * metadata has hit disk. Of course, we can't drop any cluster
+	 * locks until the journal has finished with it. The only
+	 * exception here are successfully wiped inodes - their
+	 * metadata can now be considered to be part of the system
+	 * inodes from which it came. */
+	if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED))
+		ocfs2_checkpoint_inode(inode);
+
+	mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
+			"Clear inode of %"MLFu64", inode has io markers\n",
+			oi->ip_blkno);
+
+	ocfs2_extent_map_drop(inode, 0);
+	ocfs2_extent_map_init(inode);
+
+	status = ocfs2_drop_inode_locks(inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	ocfs2_lock_res_free(&oi->ip_rw_lockres);
+	ocfs2_lock_res_free(&oi->ip_meta_lockres);
+	ocfs2_lock_res_free(&oi->ip_data_lockres);
+
+	ocfs2_metadata_cache_purge(inode);
+
+	mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached,
+			"Clear inode of %"MLFu64", inode has %u cache items\n",
+			oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached);
+
+	mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
+			"Clear inode of %"MLFu64", inode has a bad flag\n",
+			oi->ip_blkno);
+
+	mlog_bug_on_msg(spin_is_locked(&oi->ip_lock),
+			"Clear inode of %"MLFu64", inode is locked\n",
+			oi->ip_blkno);
+
+	mlog_bug_on_msg(down_trylock(&oi->ip_io_sem),
+			"Clear inode of %"MLFu64", io_sem is locked\n",
+			oi->ip_blkno);
+	up(&oi->ip_io_sem);
+
+	/*
+	 * down_trylock() returns 0, down_write_trylock() returns 1
+	 * kernel 1, world 0
+	 */
+	mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem),
+			"Clear inode of %"MLFu64", alloc_sem is locked\n",
+			oi->ip_blkno);
+	up_write(&oi->ip_alloc_sem);
+
+	mlog_bug_on_msg(oi->ip_open_count,
+			"Clear inode of %"MLFu64" has open count %d\n",
+			oi->ip_blkno, oi->ip_open_count);
+	mlog_bug_on_msg(!list_empty(&oi->ip_handle_list),
+			"Clear inode of %"MLFu64" has non empty handle list\n",
+			oi->ip_blkno);
+	mlog_bug_on_msg(oi->ip_handle,
+			"Clear inode of %"MLFu64" has non empty handle pointer\n",
+			oi->ip_blkno);
+
+	/* Reset to just OCFS2_INODE_CACHE_INLINE; clear all other flags. */
+	oi->ip_flags = OCFS2_INODE_CACHE_INLINE;
+	oi->ip_created_trans = 0;
+	oi->ip_last_trans = 0;
+	oi->ip_dir_start_lookup = 0;
+	oi->ip_blkno = 0ULL;
+
+bail:
+	mlog_exit_void();
+}
+
+/* Called under inode_lock, with no more references on the
+ * struct inode, so it's safe here to check the flags field
+ * and to manipulate i_nlink without any other locks. */
+void ocfs2_drop_inode(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	mlog_entry_void();
+
+	mlog(0, "Drop inode %"MLFu64", nlink = %u, ip_flags = 0x%x\n",
+	     oi->ip_blkno, inode->i_nlink, oi->ip_flags);
+
+	/* Testing ip_orphaned_slot here wouldn't work because we may
+	 * not have gotten a delete_inode vote from any other nodes
+	 * yet, so OCFS2_INODE_MAYBE_ORPHANED is checked instead. */
+	if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) {
+		mlog(0, "Inode was orphaned on another node, clearing nlink.\n");
+		inode->i_nlink = 0;
+	}
+
+	generic_drop_inode(inode);
+
+	mlog_exit_void();
+}
+
+/*
+ * Read logical block @block of @inode. Returns the buffer_head with
+ * *err = 0, or NULL with *err = -EIO if mapping or reading the block
+ * failed. A @block at or beyond i_size is only legal for readahead
+ * (@reada != 0) and yields NULL with *err = 0.
+ *
+ * TODO: this should probably be merged into ocfs2_get_block, but note
+ * that ocfs2_get_block pretty much expects never to extend.
+ */
+struct buffer_head *ocfs2_bread(struct inode *inode,
+				int block, int *err, int reada)
+{
+	struct buffer_head *bh = NULL;
+	int tmperr;
+	u64 p_blkno;
+	int readflags = OCFS2_BH_CACHED;
+
+#if 0
+	/* only turn this on if we know we can deal with read_block
+	 * returning nothing */
+	if (reada)
+		readflags |= OCFS2_BH_READAHEAD;
+#endif
+
+	if (((u64)block << inode->i_sb->s_blocksize_bits) >=
+	    i_size_read(inode)) {
+		BUG_ON(!reada);
+		/* Not an error: there is simply nothing to read ahead. */
+		*err = 0;
+		return NULL;
+	}
+
+	tmperr = ocfs2_extent_map_get_blocks(inode, block, 1,
+					     &p_blkno, NULL);
+	if (tmperr < 0) {
+		mlog_errno(tmperr);
+		goto fail;
+	}
+
+	tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh,
+				  readflags, inode);
+	if (tmperr < 0)
+		goto fail;
+
+	*err = 0;
+	return bh;
+
+fail:
+	/* brelse() ignores NULL; callers only ever see -EIO here. */
+	brelse(bh);
+	*err = -EIO;
+	return NULL;
+}
+
+/*
+ * Called from our getattr; -ENOENT means the inode is missing/deleted.
+ */
+int ocfs2_inode_revalidate(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	int status = -ENOENT;
+	int deleted;
+
+	mlog_entry("(inode = 0x%p, ino = %"MLFu64")\n", inode,
+		   inode ? OCFS2_I(inode)->ip_blkno : 0ULL);
+
+	if (!inode) {
+		mlog(0, "eep, no inode!\n");
+		goto bail;
+	}
+
+	/* Sample the flag under ip_lock, report outside of it. */
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	deleted = !!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	if (deleted) {
+		mlog(0, "inode deleted!\n");
+		goto bail;
+	}
+
+	/* Taking the meta lock is what brings our struct inode up to
+	 * date, so it is dropped again as soon as it was acquired. */
+	status = ocfs2_meta_lock(inode, NULL, NULL, 0);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_meta_unlock(inode, 0);
+bail:
+	mlog_exit(status);
+
+	return status;
+}
+
+/*
+ * Updates a disk inode (the dinode in @bh) from a struct inode and
+ * dirties @bh in @handle's transaction.
+ * Only takes ip_lock.
+ */
+int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
+			   struct inode *inode,
+			   struct buffer_head *bh)
+{
+	int status;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
+
+	mlog_entry("(inode %"MLFu64")\n", OCFS2_I(inode)->ip_blkno);
+
+	status = ocfs2_journal_access(handle, inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	fe->i_size = cpu_to_le64(i_size_read(inode));
+	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	fe->i_uid = cpu_to_le32(inode->i_uid);
+	fe->i_gid = cpu_to_le32(inode->i_gid);
+	fe->i_mode = cpu_to_le16(inode->i_mode);
+	fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
+	fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
+	fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	status = 0;	/* NOTE: a journal_dirty failure is logged, not returned */
+leave:
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Updates a struct inode from a disk inode (@fe), converting the
+ * little-endian on-disk fields to CPU byte order.
+ * Does no i/o, only takes ip_lock.
+ */
+void ocfs2_refresh_inode(struct inode *inode,
+			 struct ocfs2_dinode *fe)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	i_size_write(inode, le64_to_cpu(fe->i_size));
+	inode->i_nlink = le16_to_cpu(fe->i_links_count);
+	inode->i_uid = le32_to_cpu(fe->i_uid);
+	inode->i_gid = le32_to_cpu(fe->i_gid);
+	inode->i_mode = le16_to_cpu(fe->i_mode);
+	inode->i_blksize = (u32) osb->s_clustersize;
+	if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
+		inode->i_blocks = 0;
+	else
+		inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode));
+	inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
+	inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
+	inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
+	inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
+	inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
+	inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
+
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
new file mode 100644
index 0000000..9b01774
--- /dev/null
+++ b/fs/ocfs2/inode.h
@@ -0,0 +1,145 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * inode.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_INODE_H
+#define OCFS2_INODE_H
+
+/* OCFS2 Inode Private Data */
+struct ocfs2_inode_info
+{
+	u64			ip_blkno;
+
+	struct ocfs2_lock_res		ip_rw_lockres;
+	struct ocfs2_lock_res		ip_meta_lockres;
+	struct ocfs2_lock_res		ip_data_lockres;
+
+	/* protects allocation changes on this inode. */
+	struct rw_semaphore		ip_alloc_sem;
+
+	/* These fields are protected by ip_lock */
+	spinlock_t			ip_lock;
+	u32				ip_open_count;
+	u32				ip_clusters;
+	struct ocfs2_extent_map		ip_map;
+	struct list_head		ip_io_markers;
+	int				ip_orphaned_slot;
+
+	struct semaphore		ip_io_sem;
+
+	/* Used by the journalling code to attach an inode to a
+	 * handle.  These are protected by ip_io_sem in order to lock
+	 * out other I/O to the inode until we either commit or
+	 * abort. */
+	struct list_head		ip_handle_list;
+	struct ocfs2_journal_handle	*ip_handle;
+
+	u32				ip_flags; /* see below */
+
+	/* protected by recovery_lock. */
+	struct inode			*ip_next_orphan;
+
+	u32				ip_dir_start_lookup;
+
+	/* next two are protected by trans_inc_lock */
+	/* which transaction were we created on? Zero if none. */
+	unsigned long			ip_created_trans;
+	/* last transaction we were a part of. */
+	unsigned long			ip_last_trans;
+
+	struct ocfs2_caching_info	ip_metadata_cache;
+
+	struct inode			vfs_inode;
+};
+
+/*
+ * Flags for the ip_flags field
+ */
+/* System file inodes  */
+#define OCFS2_INODE_SYSTEM_FILE		0x00000001
+#define OCFS2_INODE_JOURNAL		0x00000002
+#define OCFS2_INODE_BITMAP		0x00000004
+/* This inode has been wiped from disk */
+#define OCFS2_INODE_DELETED		0x00000008
+/* Another node is deleting, so our delete is a nop */
+#define OCFS2_INODE_SKIP_DELETE		0x00000010
+/* Has the inode been orphaned on another node?
+ *
+ * This hints to ocfs2_drop_inode that it should clear i_nlink before
+ * continuing.
+ *
+ * We *only* set this on unlink vote from another node. If the inode
+ * was locally orphaned, then we're sure of the state and don't need
+ * to twiddle i_nlink later - it's either zero or not depending on
+ * whether our unlink succeeded. Otherwise we got this from a node
+ * whose intention was to orphan the inode, however he may have
+ * crashed, failed etc, so we let ocfs2_drop_inode zero the value and
+ * rely on ocfs2_delete_inode to sort things out under the proper
+ * cluster locks.
+ */
+#define OCFS2_INODE_MAYBE_ORPHANED	0x00000020
+/* Does someone have the file open O_DIRECT */
+#define OCFS2_INODE_OPEN_DIRECT		0x00000040
+/* Indicates that the metadata cache should be used as an array. */
+#define OCFS2_INODE_CACHE_INLINE	0x00000080
+
+static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
+{
+	return container_of(inode, struct ocfs2_inode_info, vfs_inode);
+}
+
+#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
+#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)
+
+extern kmem_cache_t *ocfs2_inode_cache;
+
+extern struct address_space_operations ocfs2_aops;
+
+struct buffer_head *ocfs2_bread(struct inode *inode, int block,
+				int *err, int reada);
+void ocfs2_clear_inode(struct inode *inode);
+void ocfs2_delete_inode(struct inode *inode);
+void ocfs2_drop_inode(struct inode *inode);
+struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff);
+struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
+				     u64 blkno,
+				     int delete_vote);
+int ocfs2_inode_init_private(struct inode *inode);
+int ocfs2_inode_revalidate(struct dentry *dentry);
+int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+			 int create_ino);
+void ocfs2_read_inode(struct inode *inode);
+void ocfs2_read_inode2(struct inode *inode, void *opaque);
+ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
+			size_t size, loff_t *offp);
+void ocfs2_sync_blockdev(struct super_block *sb);
+void ocfs2_refresh_inode(struct inode *inode,
+			 struct ocfs2_dinode *fe);
+int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
+			   struct inode *inode,
+			   struct buffer_head *bh);
+int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
+int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
+
+#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
new file mode 100644
index 0000000..0442804
--- /dev/null
+++ b/fs/ocfs2/journal.c
@@ -0,0 +1,1652 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * journal.c
+ *
+ * Defines functions of journalling api
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/kthread.h>
+
+#define MLOG_MASK_PREFIX ML_JOURNAL
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "namei.h"
+#include "slot_map.h"
+#include "super.h"
+#include "vote.h"
+#include "sysfile.h"
+
+#include "buffer_head_io.h"
+
+spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED;
+
+static int ocfs2_force_read_journal(struct inode *inode);
+static int ocfs2_recover_node(struct ocfs2_super *osb,
+			      int node_num);
+static int __ocfs2_recovery_thread(void *arg);
+static int ocfs2_commit_cache(struct ocfs2_super *osb);
+static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
+static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal,
+				       struct ocfs2_journal_handle *handle);
+static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle);
+static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
+				      int dirty);
+static int ocfs2_trylock_journal(struct ocfs2_super *osb,
+				 int slot_num);
+static int ocfs2_recover_orphans(struct ocfs2_super *osb,
+				 int slot);
+static int ocfs2_commit_thread(void *arg);
+
+static int ocfs2_commit_cache(struct ocfs2_super *osb)
+{
+	int status = 0;
+	unsigned int flushed;
+	unsigned long old_id;
+	struct ocfs2_journal *journal = NULL;
+
+	mlog_entry_void();
+
+	journal = osb->journal;
+
+	/* Flush all pending commits and checkpoint the journal. */
+	down_write(&journal->j_trans_barrier);
+
+	if (atomic_read(&journal->j_num_trans) == 0) {
+		up_write(&journal->j_trans_barrier);
+		mlog(0, "No transactions for me to flush!\n");
+		goto finally;
+	}
+
+	journal_lock_updates(journal->j_journal);
+	status = journal_flush(journal->j_journal);
+	journal_unlock_updates(journal->j_journal);
+	if (status < 0) {
+		up_write(&journal->j_trans_barrier);
+		mlog_errno(status);
+		goto finally;
+	}
+
+	old_id = ocfs2_inc_trans_id(journal);
+
+	flushed = atomic_read(&journal->j_num_trans);
+	atomic_set(&journal->j_num_trans, 0);
+	up_write(&journal->j_trans_barrier);
+
+	mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
+	     journal->j_trans_id, flushed);
+
+	ocfs2_kick_vote_thread(osb);
+	wake_up(&journal->j_checkpointed);
+finally:
+	mlog_exit(status);
+	return status;
+}
+
+struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb)
+{
+	struct ocfs2_journal_handle *retval = NULL;
+
+	retval = kcalloc(1, sizeof(*retval), GFP_KERNEL);
+	if (!retval) {
+		mlog(ML_ERROR, "Failed to allocate memory for journal "
+		     "handle!\n");
+		return NULL;
+	}
+
+	retval->max_buffs = 0;
+	retval->num_locks = 0;
+	retval->k_handle = NULL;
+
+	INIT_LIST_HEAD(&retval->locks);
+	INIT_LIST_HEAD(&retval->inode_list);
+	retval->journal = osb->journal;
+
+	return retval;
+}
+
+/* pass it NULL and it will allocate a new handle object for you.  If
+ * you pass it a handle however, it may still return error, in which
+ * case it has free'd the passed handle for you. */
+struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
+					       struct ocfs2_journal_handle *handle,
+					       int max_buffs)
+{
+	int ret;
+	journal_t *journal;
+
+	mlog_entry("(max_buffs = %d)\n", max_buffs);
+
+	BUG_ON(!osb || !osb->journal->j_journal);
+	journal = osb->journal->j_journal; /* only after osb is validated */
+
+	if (ocfs2_is_hard_readonly(osb)) {
+		ret = -EROFS;
+		goto done_free;
+	}
+
+	BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
+	BUG_ON(max_buffs <= 0);
+
+	/* JBD might support this, but our journalling code doesn't yet. */
+	if (journal_current_handle()) {
+		mlog(ML_ERROR, "Recursive transaction attempted!\n");
+		BUG();
+	}
+
+	if (!handle)
+		handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		ret = -ENOMEM;
+		mlog(ML_ERROR, "Failed to allocate memory for journal "
+		     "handle!\n");
+		goto done_free;
+	}
+
+	handle->max_buffs = max_buffs;
+
+	down_read(&osb->journal->j_trans_barrier);
+
+	/* actually start the transaction now */
+	handle->k_handle = journal_start(journal, max_buffs);
+	if (IS_ERR(handle->k_handle)) {
+		up_read(&osb->journal->j_trans_barrier);
+
+		ret = PTR_ERR(handle->k_handle);
+		handle->k_handle = NULL;
+		mlog_errno(ret);
+
+		if (is_journal_aborted(journal)) {
+			ocfs2_abort(osb->sb, "Detected aborted journal");
+			ret = -EROFS;
+		}
+		goto done_free;
+	}
+
+	atomic_inc(&(osb->journal->j_num_trans));
+	handle->flags |= OCFS2_HANDLE_STARTED;
+
+	mlog_exit_ptr(handle);
+	return handle;
+
+done_free:
+	if (handle)
+		ocfs2_commit_unstarted_handle(handle); /* will kfree handle */
+
+	mlog_exit(ret);
+	return ERR_PTR(ret);
+}
+
+void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
+			    struct inode *inode)
+{
+	BUG_ON(!handle);
+	BUG_ON(!inode);
+
+	atomic_inc(&inode->i_count);
+
+	/* we're obviously changing it... */
+	down(&inode->i_sem);
+
+	/* sanity check */
+	BUG_ON(OCFS2_I(inode)->ip_handle);
+	BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list));
+
+	OCFS2_I(inode)->ip_handle = handle;
+	list_del(&(OCFS2_I(inode)->ip_handle_list));
+	list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list));
+}
+
+static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle)
+{
+	struct list_head *p, *n;
+	struct inode *inode;
+	struct ocfs2_inode_info *oi;
+
+	list_for_each_safe(p, n, &handle->inode_list) {
+		oi = list_entry(p, struct ocfs2_inode_info,
+				ip_handle_list);
+		inode = &oi->vfs_inode;
+
+		OCFS2_I(inode)->ip_handle = NULL;
+		list_del_init(&OCFS2_I(inode)->ip_handle_list);
+
+		up(&inode->i_sem);
+		iput(inode);
+	}
+}
+
+/* This is trivial so we do it out of the main commit
+ * paths. Beware, it can be called from start_trans too! */
+static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle)
+{
+	mlog_entry_void();
+
+	BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
+
+	ocfs2_handle_unlock_inodes(handle);
+	/* You are allowed to add journal locks before the transaction
+	 * has started. */
+	ocfs2_handle_cleanup_locks(handle->journal, handle);
+
+	kfree(handle);
+
+	mlog_exit_void();
+}
+
+void ocfs2_commit_trans(struct ocfs2_journal_handle *handle)
+{
+	handle_t *jbd_handle;
+	int retval;
+	struct ocfs2_journal *journal;
+
+	mlog_entry_void();
+	BUG_ON(!handle);
+	journal = handle->journal; /* only after handle is validated */
+
+	if (!(handle->flags & OCFS2_HANDLE_STARTED)) {
+		ocfs2_commit_unstarted_handle(handle);
+		mlog_exit_void();
+		return;
+	}
+
+	/* release inode semaphores we took during this transaction */
+	ocfs2_handle_unlock_inodes(handle);
+
+	/* ocfs2_extend_trans may have had to call journal_restart
+	 * which will always commit the transaction, but may return
+	 * error for any number of reasons. If this is the case, we
+	 * clear k_handle as it's not valid any more. */
+	if (handle->k_handle) {
+		jbd_handle = handle->k_handle;
+
+		if (handle->flags & OCFS2_HANDLE_SYNC)
+			jbd_handle->h_sync = 1;
+		else
+			jbd_handle->h_sync = 0;
+
+		/* actually stop the transaction. if we've set h_sync,
+		 * it'll have been committed when we return */
+		retval = journal_stop(jbd_handle);
+		if (retval < 0) {
+			mlog_errno(retval);
+			mlog(ML_ERROR, "Could not commit transaction\n");
+			BUG();
+		}
+
+		handle->k_handle = NULL; /* it's been free'd in journal_stop */
+	}
+
+	ocfs2_handle_cleanup_locks(journal, handle);
+
+	up_read(&journal->j_trans_barrier);
+
+	kfree(handle);
+	mlog_exit_void();
+}
+
+/*
+ * 'nblocks' is what you want to add to the current
+ * transaction. extend_trans will either extend the current handle by
+ * nblocks, or commit it and start a new one with nblocks credits.
+ *
+ * WARNING: This will not release any semaphores or disk locks taken
+ * during the transaction, so make sure they were taken *before*
+ * start_trans or we'll have ordering deadlocks.
+ *
+ * WARNING2: Note that we do *not* drop j_trans_barrier here. This is
+ * good because transaction ids haven't yet been recorded on the
+ * cluster locks associated with this handle.
+ */
+int ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
+		       int nblocks)
+{
+	int status;
+
+	BUG_ON(!handle);
+	BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
+	BUG_ON(!nblocks);
+
+	mlog_entry_void();
+
+	mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
+
+	status = journal_extend(handle->k_handle, nblocks);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (status > 0) {
+		mlog(0, "journal_extend failed, trying journal_restart\n");
+		status = journal_restart(handle->k_handle, nblocks);
+		if (status < 0) {
+			handle->k_handle = NULL;
+			mlog_errno(status);
+			goto bail;
+		}
+		handle->max_buffs = nblocks;
+	} else
+		handle->max_buffs += nblocks;
+
+	status = 0;
+bail:
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
+			 struct inode *inode,
+			 struct buffer_head *bh,
+			 int type)
+{
+	int status;
+
+	BUG_ON(!inode);
+	BUG_ON(!handle);
+	BUG_ON(!bh);
+	BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
+
+	mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %hu\n",
+		   (unsigned long long)bh->b_blocknr, type,
+		   (type == OCFS2_JOURNAL_ACCESS_CREATE) ?
+		   "OCFS2_JOURNAL_ACCESS_CREATE" :
+		   "OCFS2_JOURNAL_ACCESS_WRITE",
+		   bh->b_size);
+
+	/* we can safely remove this assertion after testing. */
+	if (!buffer_uptodate(bh)) {
+		mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
+		mlog(ML_ERROR, "b_blocknr=%llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		BUG();
+	}
+
+	/* Set the current transaction information on the inode so
+	 * that the locking code knows whether it can drop its locks
+	 * on this inode or not. We're protected from the commit
+	 * thread updating the current transaction id until
+	 * ocfs2_commit_trans() because ocfs2_start_trans() took
+	 * j_trans_barrier for us. */
+	ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);
+
+	down(&OCFS2_I(inode)->ip_io_sem);
+	switch (type) {
+	case OCFS2_JOURNAL_ACCESS_CREATE:
+	case OCFS2_JOURNAL_ACCESS_WRITE:
+		status = journal_get_write_access(handle->k_handle, bh);
+		break;
+
+	case OCFS2_JOURNAL_ACCESS_UNDO:
+		status = journal_get_undo_access(handle->k_handle, bh);
+		break;
+
+	default:
+		status = -EINVAL;
+		mlog(ML_ERROR, "Unknown access type!\n");
+	}
+	up(&OCFS2_I(inode)->ip_io_sem);
+
+	if (status < 0)
+		mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
+		     status, type);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
+			struct buffer_head *bh)
+{
+	int status;
+
+	BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
+
+	mlog_entry("(bh->b_blocknr=%llu)\n",
+		   (unsigned long long)bh->b_blocknr);
+
+	status = journal_dirty_metadata(handle->k_handle, bh);
+	if (status < 0)
+		mlog(ML_ERROR, "Could not dirty metadata buffer. "
+		     "(bh->b_blocknr=%llu)\n",
+		     (unsigned long long)bh->b_blocknr);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_journal_dirty_data(handle_t *handle,
+			     struct buffer_head *bh)
+{
+	int err = journal_dirty_data(handle, bh);
+	if (err)
+		mlog_errno(err);
+	/* TODO: When we can handle it, abort the handle and go RO on
+	 * error here. */
+
+	return err;
+}
+
+/* We always assume you're adding a metadata lock at level 'ex' */
+int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
+			  struct inode *inode)
+{
+	int status;
+	struct ocfs2_journal_lock *lock;
+
+	BUG_ON(!inode);
+
+	lock = kmem_cache_alloc(ocfs2_lock_cache, GFP_NOFS);
+	if (!lock) {
+		status = -ENOMEM;
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
+	if (!igrab(inode))
+		BUG();
+	lock->jl_inode = inode;
+
+	list_add_tail(&(lock->jl_lock_list), &(handle->locks));
+	handle->num_locks++;
+
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal,
+				       struct ocfs2_journal_handle *handle)
+{
+	struct list_head *p, *n;
+	struct ocfs2_journal_lock *lock;
+	struct inode *inode;
+
+	list_for_each_safe(p, n, &(handle->locks)) {
+		lock = list_entry(p, struct ocfs2_journal_lock,
+				  jl_lock_list);
+		list_del(&lock->jl_lock_list);
+		handle->num_locks--;
+
+		inode = lock->jl_inode;
+		ocfs2_meta_unlock(inode, 1);
+		if (atomic_read(&inode->i_count) == 1)
+			mlog(ML_ERROR,
+			     "Inode %"MLFu64", I'm doing a last iput for!\n",
+			     OCFS2_I(inode)->ip_blkno);
+		iput(inode);
+		kmem_cache_free(ocfs2_lock_cache, lock);
+	}
+}
+
+#define OCFS2_DEFAULT_COMMIT_INTERVAL 	(HZ * 5)
+
+void ocfs2_set_journal_params(struct ocfs2_super *osb)
+{
+	journal_t *journal = osb->journal->j_journal;
+
+	spin_lock(&journal->j_state_lock);
+	journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
+	if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
+		journal->j_flags |= JFS_BARRIER;
+	else
+		journal->j_flags &= ~JFS_BARRIER;
+	spin_unlock(&journal->j_state_lock);
+}
+
+int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
+{
+	int status = -1;
+	struct inode *inode = NULL; /* the journal inode */
+	journal_t *j_journal = NULL;
+	struct ocfs2_dinode *di = NULL;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_super *osb;
+	int meta_lock = 0;
+
+	mlog_entry_void();
+
+	BUG_ON(!journal);
+
+	osb = journal->j_osb;
+
+	/* already have the inode for our journal */
+	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+					    osb->slot_num);
+	if (inode == NULL) {
+		status = -EACCES;
+		mlog_errno(status);
+		goto done;
+	}
+	if (is_bad_inode(inode)) {
+		mlog(ML_ERROR, "access error (bad inode)\n");
+		iput(inode);
+		inode = NULL;
+		status = -EACCES;
+		goto done;
+	}
+
+	SET_INODE_JOURNAL(inode);
+	OCFS2_I(inode)->ip_open_count++;
+
+	status = ocfs2_meta_lock(inode, NULL, &bh, 1);
+	if (status < 0) {
+		if (status != -ERESTARTSYS)
+			mlog(ML_ERROR, "Could not get lock on journal!\n");
+		goto done;
+	}
+
+	meta_lock = 1;
+	di = (struct ocfs2_dinode *)bh->b_data;
+
+	if (inode->i_size <  OCFS2_MIN_JOURNAL_SIZE) {
+		mlog(ML_ERROR, "Journal file size (%lld) is too small!\n",
+		     inode->i_size);
+		status = -EINVAL;
+		goto done;
+	}
+
+	mlog(0, "inode->i_size = %lld\n", inode->i_size);
+	mlog(0, "inode->i_blocks = %lu\n", inode->i_blocks);
+	mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
+
+	/* call the kernel's journal init function now */
+	j_journal = journal_init_inode(inode);
+	if (j_journal == NULL) {
+		mlog(ML_ERROR, "Linux journal layer error\n");
+		status = -EINVAL;
+		goto done;
+	}
+
+	mlog(0, "Returned from journal_init_inode\n");
+	mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
+
+	*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
+		  OCFS2_JOURNAL_DIRTY_FL);
+
+	journal->j_journal = j_journal;
+	journal->j_inode = inode;
+	journal->j_bh = bh;
+
+	ocfs2_set_journal_params(osb);
+
+	journal->j_state = OCFS2_JOURNAL_LOADED;
+
+	status = 0;
+done:
+	if (status < 0) {
+		if (meta_lock)
+			ocfs2_meta_unlock(inode, 1);
+		if (bh != NULL)
+			brelse(bh);
+		if (inode) {
+			OCFS2_I(inode)->ip_open_count--;
+			iput(inode);
+		}
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
+				      int dirty)
+{
+	int status;
+	unsigned int flags;
+	struct ocfs2_journal *journal = osb->journal;
+	struct buffer_head *bh = journal->j_bh;
+	struct ocfs2_dinode *fe;
+
+	mlog_entry_void();
+
+	fe = (struct ocfs2_dinode *)bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		/* This is called from startup/shutdown which will
+		 * handle the errors in a specific manner, so no need
+		 * to call ocfs2_error() here. */
+		mlog(ML_ERROR, "Journal dinode %"MLFu64"  has invalid "
+		     "signature: %.*s\n", fe->i_blkno, 7, fe->i_signature);
+		status = -EIO;
+		goto out;
+	}
+
+	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+	if (dirty)
+		flags |= OCFS2_JOURNAL_DIRTY_FL;
+	else
+		flags &= ~OCFS2_JOURNAL_DIRTY_FL;
+	fe->id1.journal1.ij_flags = cpu_to_le32(flags);
+
+	status = ocfs2_write_block(osb, bh, journal->j_inode);
+	if (status < 0)
+		mlog_errno(status);
+
+out:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * If the journal has been kmalloc'd it needs to be freed after this
+ * call.
+ */
+void ocfs2_journal_shutdown(struct ocfs2_super *osb)
+{
+	struct ocfs2_journal *journal = NULL;
+	int status = 0;
+	struct inode *inode = NULL;
+	int num_running_trans = 0;
+
+	mlog_entry_void();
+
+	if (!osb)
+		BUG();
+
+	journal = osb->journal;
+	if (!journal)
+		goto done;
+
+	inode = journal->j_inode;
+
+	if (journal->j_state != OCFS2_JOURNAL_LOADED)
+		goto done;
+
+	/* need to inc inode use count as journal_destroy will iput. */
+	if (!igrab(inode))
+		BUG();
+
+	num_running_trans = atomic_read(&(osb->journal->j_num_trans));
+	if (num_running_trans > 0)
+		mlog(0, "Shutting down journal: must wait on %d "
+		     "running transactions!\n",
+		     num_running_trans);
+
+	/* Do a commit_cache here. It will flush our journal, *and*
+	 * release any locks that are still held.
+	 * set the SHUTDOWN flag and release the trans lock.
+	 * the commit thread will take the trans lock for us below. */
+	journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN;
+
+	/* The OCFS2_JOURNAL_IN_SHUTDOWN will signal to commit_cache to not
+	 * drop the trans_lock (which we want to hold until we
+	 * completely destroy the journal). */
+	if (osb->commit_task) {
+		/* Wait for the commit thread */
+		mlog(0, "Waiting for ocfs2commit to exit....\n");
+		kthread_stop(osb->commit_task);
+		osb->commit_task = NULL;
+	}
+
+	BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
+
+	status = ocfs2_journal_toggle_dirty(osb, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* Shutdown the kernel journal system */
+	journal_destroy(journal->j_journal);
+
+	OCFS2_I(inode)->ip_open_count--;
+
+	/* unlock our journal */
+	ocfs2_meta_unlock(inode, 1);
+
+	brelse(journal->j_bh);
+	journal->j_bh = NULL;
+
+	journal->j_state = OCFS2_JOURNAL_FREE;
+
+//	up_write(&journal->j_trans_barrier);
+done:
+	if (inode)
+		iput(inode);
+	mlog_exit_void();
+}
+
+static void ocfs2_clear_journal_error(struct super_block *sb,
+				      journal_t *journal,
+				      int slot)
+{
+	int olderr;
+
+	olderr = journal_errno(journal);
+	if (olderr) {
+		mlog(ML_ERROR, "File system error %d recorded in "
+		     "journal %u.\n", olderr, slot);
+		mlog(ML_ERROR, "File system on device %s needs checking.\n",
+		     sb->s_id);
+
+		journal_ack_err(journal);
+		journal_clear_err(journal);
+	}
+}
+
+int ocfs2_journal_load(struct ocfs2_journal *journal)
+{
+	int status = 0;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	if (!journal)
+		BUG();
+
+	osb = journal->j_osb;
+
+	status = journal_load(journal->j_journal);
+	if (status < 0) {
+		mlog(ML_ERROR, "Failed to load journal!\n");
+		goto done;
+	}
+
+	ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
+
+	status = ocfs2_journal_toggle_dirty(osb, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto done;
+	}
+
+	/* Launch the commit thread */
+	osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt-%d",
+				       osb->osb_id);
+	if (IS_ERR(osb->commit_task)) {
+		status = PTR_ERR(osb->commit_task);
+		osb->commit_task = NULL;
+		mlog(ML_ERROR, "unable to launch ocfs2commit thread, "
+		     "error=%d\n", status);
+		goto done;
+	}
+
+done:
+	mlog_exit(status);
+	return status;
+}
+
+
+/* 'full' flag tells us whether we clear out all blocks or if we just
+ * mark the journal clean */
+int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
+{
+	int status;
+
+	mlog_entry_void();
+
+	if (!journal)
+		BUG();
+
+	status = journal_wipe(journal->j_journal, full);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_toggle_dirty(journal->j_osb, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * JBD might read a cached version of another node's journal file. We
+ * don't want this as this file changes often and we get no
+ * notification on those changes. The only way to be sure that we've
+ * got the most up to date version of those blocks then is to force
+ * read them off disk. Just searching through the buffer cache won't
+ * work as there may be pages backing this file which are still marked
+ * up to date. We know things can't change on this file underneath us
+ * as we have the lock by now :)
+ */
+static int ocfs2_force_read_journal(struct inode *inode)
+{
+	int status = 0;
+	int i, p_blocks;
+	u64 v_blkno, p_blkno;
+#define CONCURRENT_JOURNAL_FILL 32
+	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
+
+	mlog_entry_void();
+
+	BUG_ON(inode->i_blocks !=
+		     ocfs2_align_bytes_to_sectors(i_size_read(inode)));
+
+	memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
+
+	mlog(0, "Force reading %lu blocks\n",
+	     (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9)));
+
+	v_blkno = 0;
+	while (v_blkno <
+	       (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
+
+		status = ocfs2_extent_map_get_blocks(inode, v_blkno,
+						     1, &p_blkno,
+						     &p_blocks);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (p_blocks > CONCURRENT_JOURNAL_FILL)
+			p_blocks = CONCURRENT_JOURNAL_FILL;
+
+		status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
+					   p_blkno, p_blocks, bhs, 0,
+					   inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		for(i = 0; i < p_blocks; i++) {
+			brelse(bhs[i]);
+			bhs[i] = NULL;
+		}
+
+		v_blkno += p_blocks;
+	}
+
+bail:
+	for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
+		if (bhs[i])
+			brelse(bhs[i]);
+	mlog_exit(status);
+	return status;
+}
+
+struct ocfs2_la_recovery_item {
+	struct list_head	lri_list;
+	int			lri_slot;
+	struct ocfs2_dinode	*lri_la_dinode;
+	struct ocfs2_dinode	*lri_tl_dinode;
+};
+
+/* Does the second half of the recovery process. By this point, the
+ * node is marked clean and can actually be considered recovered,
+ * hence it's no longer in the recovery map, but there's still some
+ * cleanup we can do which shouldn't happen within the recovery thread
+ * as locking in that context becomes very difficult if we are to take
+ * recovering nodes into account.
+ *
+ * NOTE: This function can and will sleep on recovery of other nodes
+ * during cluster locking, just like any other ocfs2 process.
+ */
+void ocfs2_complete_recovery(void *data)
+{
+	int ret;
+	struct ocfs2_super *osb = data;
+	struct ocfs2_journal *journal = osb->journal;
+	struct ocfs2_dinode *la_dinode, *tl_dinode;
+	struct ocfs2_la_recovery_item *item;
+	struct list_head *p, *n;
+	LIST_HEAD(tmp_la_list);
+
+	mlog_entry_void();
+
+	mlog(0, "completing recovery from keventd\n");
+
+	spin_lock(&journal->j_lock);
+	list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
+	spin_unlock(&journal->j_lock);
+
+	list_for_each_safe(p, n, &tmp_la_list) {
+		item = list_entry(p, struct ocfs2_la_recovery_item, lri_list);
+		list_del_init(&item->lri_list);
+
+		mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
+
+		la_dinode = item->lri_la_dinode;
+		if (la_dinode) {
+			mlog(0, "Clean up local alloc %"MLFu64"\n",
+			     la_dinode->i_blkno);
+
+			ret = ocfs2_complete_local_alloc_recovery(osb,
+								  la_dinode);
+			if (ret < 0)
+				mlog_errno(ret);
+
+			kfree(la_dinode);
+		}
+
+		tl_dinode = item->lri_tl_dinode;
+		if (tl_dinode) {
+			mlog(0, "Clean up truncate log %"MLFu64"\n",
+			     tl_dinode->i_blkno);
+
+			ret = ocfs2_complete_truncate_log_recovery(osb,
+								   tl_dinode);
+			if (ret < 0)
+				mlog_errno(ret);
+
+			kfree(tl_dinode);
+		}
+
+		ret = ocfs2_recover_orphans(osb, item->lri_slot);
+		if (ret < 0)
+			mlog_errno(ret);
+
+		kfree(item);
+	}
+
+	mlog(0, "Recovery completion\n");
+	mlog_exit_void();
+}
+
+/* NOTE: This function always eats your references to la_dinode and
+ * tl_dinode, either manually on error, or by passing them to
+ * ocfs2_complete_recovery */
+static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
+					    int slot_num,
+					    struct ocfs2_dinode *la_dinode,
+					    struct ocfs2_dinode *tl_dinode)
+{
+	struct ocfs2_la_recovery_item *item;
+
+	item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_KERNEL);
+	if (!item) {
+		/* Though we wish to avoid it, we are in fact safe in
+		 * skipping local alloc cleanup as fsck.ocfs2 is more
+		 * than capable of reclaiming unused space. */
+		if (la_dinode)
+			kfree(la_dinode);
+
+		if (tl_dinode)
+			kfree(tl_dinode);
+
+		mlog_errno(-ENOMEM);
+		return;
+	}
+
+	INIT_LIST_HEAD(&item->lri_list);
+	item->lri_la_dinode = la_dinode;
+	item->lri_slot = slot_num;
+	item->lri_tl_dinode = tl_dinode;
+
+	spin_lock(&journal->j_lock);
+	list_add_tail(&item->lri_list, &journal->j_la_cleanups);
+	queue_work(ocfs2_wq, &journal->j_recovery_work);
+	spin_unlock(&journal->j_lock);
+}
+
+/* Called by the mount code to queue the last part of recovery
+ * for its own slot. */
+void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
+{
+	struct ocfs2_journal *journal = osb->journal;
+
+	if (osb->dirty) {
+		/* No need to queue up our truncate_log as regular
+		 * cleanup will catch that. */
+		ocfs2_queue_recovery_completion(journal,
+						osb->slot_num,
+						osb->local_alloc_copy,
+						NULL);
+		ocfs2_schedule_truncate_log_flush(osb, 0);
+
+		osb->local_alloc_copy = NULL;
+		osb->dirty = 0;
+	}
+}
+
+static int __ocfs2_recovery_thread(void *arg)
+{
+	int status, node_num;
+	struct ocfs2_super *osb = arg;
+	/* Thread body: recover each node set in osb->recovery_map. */
+	mlog_entry_void();
+
+	status = ocfs2_wait_on_mount(osb);
+	if (status < 0) {
+		goto bail;
+	}
+	/* Re-entered from bail when status is 0 and the map is non-empty. */
+restart:
+	status = ocfs2_super_lock(osb, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
+		node_num = ocfs2_node_map_first_set_bit(osb,
+							&osb->recovery_map);
+		if (node_num == O2NM_INVALID_NODE_NUM) {
+			mlog(0, "Out of nodes to recover.\n");
+			break;
+		}
+
+		status = ocfs2_recover_node(osb, node_num);
+		if (status < 0) {
+			mlog(ML_ERROR,
+			     "Error %d recovering node %d on device (%u,%u)!\n",
+			     status, node_num,
+			     MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+			mlog(ML_ERROR, "Volume requires unmount.\n");
+			continue; /* NOTE(review): map bit is left set here */
+		}
+
+		ocfs2_recovery_map_clear(osb, node_num);
+	}
+	ocfs2_super_unlock(osb, 1);
+
+	/* We always run recovery on our own orphan dir - the dead
+	 * node(s) may have voted "no" on an inode delete earlier. A
+	 * revote is therefore required. */
+	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
+					NULL);
+
+bail:
+	down(&osb->recovery_lock);
+	if (!status &&
+	    !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
+		up(&osb->recovery_lock);
+		goto restart;
+	}
+	/* Finished: clear our task pointer, then wake recovery_event waiters. */
+	osb->recovery_thread_task = NULL;
+	mb(); /* sync with ocfs2_recovery_thread_running */
+	wake_up(&osb->recovery_event);
+
+	up(&osb->recovery_lock);
+
+	mlog_exit(status);
+	/* no one is calling kthread_stop() for us so the kthread() api
+	 * requires that we call do_exit().  And it isn't exported, but
+	 * complete_and_exit() seems to be a minimal wrapper around it. */
+	complete_and_exit(NULL, status);
+	return status;
+}
+
+/* Mark node_num in the recovery map and start the recovery thread if
+ * none is running. Does neither when osb->disable_recovery is set. */
+void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
+{
+	mlog_entry("(node_num=%d, osb->node_num = %d)\n",
+		   node_num, osb->node_num);
+
+	down(&osb->recovery_lock);
+	if (!osb->disable_recovery) {
+		/* People waiting on recovery will wait on
+		 * the recovery map to empty. */
+		if (!ocfs2_recovery_map_set(osb, node_num))
+			mlog(0, "node %d already be in recovery.\n", node_num);
+
+		mlog(0, "starting recovery thread...\n");
+
+		/* An existing thread will pick the new bit up itself. */
+		if (!osb->recovery_thread_task) {
+			osb->recovery_thread_task =
+				kthread_run(__ocfs2_recovery_thread, osb,
+					    "ocfs2rec-%d", osb->osb_id);
+			if (IS_ERR(osb->recovery_thread_task)) {
+				mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
+				osb->recovery_thread_task = NULL;
+			}
+		}
+	}
+	up(&osb->recovery_lock);
+	wake_up(&osb->recovery_event);
+
+	mlog_exit_void();
+}
+
+/* Does the actual journal replay for slot_num and marks the journal inode
+ * as clean. Will only replay if the journal inode is marked dirty. */
+static int ocfs2_replay_journal(struct ocfs2_super *osb,
+				int node_num,
+				int slot_num)
+{
+	int status;
+	int got_lock = 0;
+	unsigned int flags;
+	struct inode *inode = NULL;
+	struct ocfs2_dinode *fe;
+	journal_t *journal = NULL;
+	struct buffer_head *bh = NULL;
+	/* Look up the journal system inode of the slot being recovered. */
+	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+					    slot_num);
+	if (inode == NULL) {
+		status = -EACCES;
+		mlog_errno(status);
+		goto done;
+	}
+	if (is_bad_inode(inode)) {
+		status = -EACCES;
+		iput(inode);
+		inode = NULL;
+		mlog_errno(status);
+		goto done;
+	}
+	SET_INODE_JOURNAL(inode);
+	/* Lock the journal inode; bh is then read as its on-disk dinode. */
+	status = ocfs2_meta_lock_full(inode, NULL, &bh, 1,
+				      OCFS2_META_LOCK_RECOVERY);
+	if (status < 0) {
+		mlog(0, "status returned from ocfs2_meta_lock=%d\n", status);
+		if (status != -ERESTARTSYS)
+			mlog(ML_ERROR, "Could not lock journal!\n");
+		goto done;
+	}
+	got_lock = 1;
+
+	fe = (struct ocfs2_dinode *) bh->b_data;
+
+	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+
+	if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
+		mlog(0, "No recovery required for node %d\n", node_num);
+		goto done;
+	}
+
+	mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
+	     node_num, slot_num,
+	     MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+
+	status = ocfs2_force_read_journal(inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto done;
+	}
+
+	mlog(0, "calling journal_init_inode\n");
+	journal = journal_init_inode(inode);
+	if (journal == NULL) {
+		mlog(ML_ERROR, "Linux journal layer error\n");
+		status = -EIO;
+		goto done;
+	}
+
+	status = journal_load(journal);
+	if (status < 0) {
+		mlog_errno(status);
+		if (!igrab(inode))
+			BUG();	/* ref for journal_destroy()? -- confirm */
+		journal_destroy(journal);
+		goto done;
+	}
+
+	ocfs2_clear_journal_error(osb->sb, journal, slot_num);
+
+	/* wipe the journal */
+	mlog(0, "flushing the journal.\n");
+	journal_lock_updates(journal);
+	status = journal_flush(journal);
+	journal_unlock_updates(journal);
+	if (status < 0)
+		mlog_errno(status);	/* logged only; we carry on */
+
+	/* This will mark the node clean */
+	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+	flags &= ~OCFS2_JOURNAL_DIRTY_FL;
+	fe->id1.journal1.ij_flags = cpu_to_le32(flags);
+	/* Write the updated dinode back; this status is what we return. */
+	status = ocfs2_write_block(osb, bh, inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	if (!igrab(inode))
+		BUG();	/* ref for journal_destroy()? -- confirm */
+
+	journal_destroy(journal);
+
+done:
+	/* drop the lock on this nodes journal */
+	if (got_lock)
+		ocfs2_meta_unlock(inode, 1);
+
+	if (inode)
+		iput(inode);
+
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Do the most important parts of node recovery:
+ *  - Replay its journal
+ *  - Stamp a clean local allocator file
+ *  - Stamp a clean truncate log
+ *  - Mark the node clean
+ *
+ * If this function completes without error, a node in OCFS2 can be
+ * said to have been safely recovered. As a result, failure during the
+ * second part of a node's recovery process (local alloc recovery) is
+ * far less concerning.
+ */
+static int ocfs2_recover_node(struct ocfs2_super *osb,
+			      int node_num)
+{
+	int ret = 0;
+	int slot;
+	struct ocfs2_slot_info *slots = osb->slot_info;
+	struct ocfs2_dinode *la_dinode = NULL;
+	struct ocfs2_dinode *tl_dinode = NULL;
+
+	mlog_entry("(node_num=%d, osb->node_num = %d)\n",
+		   node_num, osb->node_num);
+
+	mlog(0, "checking node %d\n", node_num);
+
+	/* Should not ever be called to recover ourselves -- in that
+	 * case we should've called ocfs2_journal_load instead. */
+	BUG_ON(osb->node_num == node_num);
+
+	slot = ocfs2_node_num_to_slot(slots, node_num);
+	if (slot == OCFS2_INVALID_SLOT) {
+		/* ret is still 0: having nothing to do is not an error. */
+		mlog(0, "no slot for this node, so no recovery required.\n");
+		goto out;
+	}
+
+	mlog(0, "node %d was using slot %d\n", node_num, slot);
+
+	ret = ocfs2_replay_journal(osb, node_num, slot);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Stamp a clean local alloc file AFTER recovering the journal... */
+	ret = ocfs2_begin_local_alloc_recovery(osb, slot, &la_dinode);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* An error from begin_truncate_log_recovery is not
+	 * serious enough to warrant halting the rest of
+	 * recovery. */
+	ret = ocfs2_begin_truncate_log_recovery(osb, slot, &tl_dinode);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	/* Likewise, this would be a strange but ultimately not so
+	 * harmful place to get an error... */
+	ocfs2_clear_slot(slots, slot);
+	ret = ocfs2_update_disk_slots(osb, slots);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	/* This will kfree the memory pointed to by la_dinode and tl_dinode */
+	ocfs2_queue_recovery_completion(osb->journal, slot, la_dinode,
+					tl_dinode);
+
+	/* The two logged-only errors above are deliberately not returned. */
+	ret = 0;
+out:
+
+	mlog_exit(ret);
+	return ret;
+}
+
+/*
+ * Test node liveness by trylocking the journal of slot_num. If we get
+ * the lock, we drop it again before returning.
+ *
+ * Returns 0 if we got the lock, -EAGAIN if the node is still alive
+ * (we couldn't get the lock), -EACCES if the journal inode is missing
+ * or bad, and another value < 0 on any other locking error. -EAGAIN
+ * is expected and is the only failure that is not logged here.
+ */
+static int ocfs2_trylock_journal(struct ocfs2_super *osb,
+				 int slot_num)
+{
+	int ret;
+	struct inode *jinode;
+
+	jinode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+					     slot_num);
+	if (!jinode) {
+		mlog(ML_ERROR, "access error\n");
+		return -EACCES;
+	}
+	if (is_bad_inode(jinode)) {
+		mlog(ML_ERROR, "access error (bad inode)\n");
+		iput(jinode);
+		return -EACCES;
+	}
+	SET_INODE_JOURNAL(jinode);
+
+	ret = ocfs2_meta_lock_full(jinode, NULL, NULL, 1,
+				   OCFS2_META_LOCK_RECOVERY |
+				   OCFS2_META_LOCK_NOQUEUE);
+	/* On success the lock is released again straight away. */
+	if (ret >= 0)
+		ocfs2_meta_unlock(jinode, 1);
+	else if (ret != -EAGAIN)
+		mlog_errno(ret);
+
+	iput(jinode);
+	return ret;
+}
+
+/* Start recovery for slots whose node looks dead. Call this underneath
+ * ocfs2_super_lock; assumes slot info has been updated from disk. */
+int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
+{
+	int status, i, node_num;
+	struct ocfs2_slot_info *si = osb->slot_info;
+
+	/* This is called with the super block cluster lock, so we
+	 * know that the slot map can't change underneath us. */
+
+	spin_lock(&si->si_lock);
+	for(i = 0; i < si->si_num_slots; i++) {
+		if (i == osb->slot_num)	/* skip our own slot */
+			continue;
+		if (ocfs2_is_empty_slot(si, i))
+			continue;
+
+		node_num = si->si_global_node_nums[i];
+		if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
+			continue;
+		spin_unlock(&si->si_lock); /* retaken at the end of the loop body */
+
+		/* Ok, we have a slot occupied by another node which
+		 * is not in the recovery map. We trylock his journal
+		 * file here to test if he's alive. */
+		status = ocfs2_trylock_journal(osb, i);
+		if (!status) {
+			/* Since we're called from mount, we know that
+			 * the recovery thread can't race us on
+			 * setting / checking the recovery bits. */
+			ocfs2_recovery_thread(osb, node_num);
+		} else if ((status < 0) && (status != -EAGAIN)) {
+			mlog_errno(status);
+			goto bail; /* si_lock is not held here */
+		}
+
+		spin_lock(&si->si_lock);
+	}
+	spin_unlock(&si->si_lock);
+
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Queue every inode in the orphan directory of @slot, then iput() each
+ * one with OCFS2_INODE_MAYBE_ORPHANED set to get it going into
+ * ocfs2_delete_inode.
+ *
+ * A failure while scanning the directory is returned to the caller,
+ * but the inodes queued before the failure are still processed so
+ * that their ocfs2_iget() references are not leaked.
+ */
+static int ocfs2_recover_orphans(struct ocfs2_super *osb,
+				 int slot)
+{
+	int status = 0;
+	struct inode *inode = NULL;
+	struct inode *iter;
+	struct inode *orphan_dir_inode = NULL;
+	unsigned long offset, blk, local;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dir_entry *de;
+	struct super_block *sb = osb->sb;
+	struct ocfs2_inode_info *oi;
+
+	mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
+
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       slot);
+	if  (!orphan_dir_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto out;
+	}
+
+	down(&orphan_dir_inode->i_sem);
+	status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0);
+	if (status < 0) {
+		up(&orphan_dir_inode->i_sem);
+		mlog_errno(status);
+		goto out_put;
+	}
+
+	offset = 0;
+	while(offset < i_size_read(orphan_dir_inode)) {
+		blk = offset >> sb->s_blocksize_bits;
+
+		bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0);
+		if (!bh)
+			status = -EINVAL;
+		if (status < 0) {
+			if (bh)
+				brelse(bh);
+			mlog_errno(status);
+			goto out_unlock;
+		}
+
+		local = 0;
+		while(offset < i_size_read(orphan_dir_inode)
+		      && local < sb->s_blocksize) {
+			de = (struct ocfs2_dir_entry *) (bh->b_data + local);
+
+			if (!ocfs2_check_dir_entry(orphan_dir_inode,
+						  de, bh, local)) {
+				status = -EINVAL;
+				mlog_errno(status);
+				brelse(bh);
+				goto out_unlock;
+			}
+
+			local += le16_to_cpu(de->rec_len);
+			offset += le16_to_cpu(de->rec_len);
+
+			/* I guess we silently fail on no inode? */
+			if (!le64_to_cpu(de->inode))
+				continue;
+			if (de->file_type > OCFS2_FT_MAX) {
+				mlog(ML_ERROR,
+				     "block %llu contains invalid de: "
+				     "inode = %"MLFu64", rec_len = %u, "
+				     "name_len = %u, file_type = %u, "
+				     "name='%.*s'\n",
+				     (unsigned long long)bh->b_blocknr,
+				     le64_to_cpu(de->inode),
+				     le16_to_cpu(de->rec_len),
+				     de->name_len,
+				     de->file_type,
+				     de->name_len,
+				     de->name);
+				continue;
+			}
+			if (de->name_len == 1 && !strncmp(".", de->name, 1))
+				continue;
+			if (de->name_len == 2 && !strncmp("..", de->name, 2))
+				continue;
+
+			iter = ocfs2_iget(osb, le64_to_cpu(de->inode));
+			if (IS_ERR(iter))
+				continue;
+
+			mlog(0, "queue orphan %"MLFu64"\n",
+			     OCFS2_I(iter)->ip_blkno);
+			OCFS2_I(iter)->ip_next_orphan = inode;
+			inode = iter;
+		}
+		brelse(bh);
+	}
+
+out_unlock:
+	up(&orphan_dir_inode->i_sem);
+	ocfs2_meta_unlock(orphan_dir_inode, 0);
+out_put:
+	iput(orphan_dir_inode);
+
+	/* Error here should be noted, but we want to continue with as
+	 * many queued inodes as we've got: each one holds a reference
+	 * from ocfs2_iget() that only the iput() below releases. */
+	while (inode) {
+		oi = OCFS2_I(inode);
+		mlog(0, "iput orphan %"MLFu64"\n", oi->ip_blkno);
+
+		iter = oi->ip_next_orphan;
+
+		spin_lock(&oi->ip_lock);
+		/* Delete voting may have set these on the assumption
+		 * that the other node would wipe them successfully.
+		 * If they are still in the node's orphan dir, we need
+		 * to reset that state. */
+		oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
+
+		/* Set the proper information to get us going into
+		 * ocfs2_delete_inode. */
+		oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
+		oi->ip_orphaned_slot = slot;
+		spin_unlock(&oi->ip_lock);
+
+		iput(inode);
+
+		inode = iter;
+	}
+
+out:
+	return status;
+}
+
+/* Wait for mount to settle: 0 if mounted, -EBUSY if the mount failed. */
+static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
+{
+	/* This check is good because ocfs2 will wait on our recovery
+	 * thread before changing it to something other than MOUNTED
+	 * or DISABLED. */
+	wait_event(osb->osb_mount_event,
+		   atomic_read(&osb->vol_state) == VOLUME_MOUNTED ||
+		   atomic_read(&osb->vol_state) == VOLUME_DISABLED);
+
+	/* If there's an error on mount, then we may never get to the
+	 * MOUNTED flag, but this is set right before
+	 * dismount_volume() so we can trust it. */
+	if (atomic_read(&osb->vol_state) != VOLUME_DISABLED)
+		return 0;
+
+	mlog(0, "mount error, exiting!\n");
+	return -EBUSY;
+}
+
+/* Thread body: call ocfs2_commit_cache() repeatedly until told to stop. */
+static int ocfs2_commit_thread(void *arg)
+{
+	int ret;
+	struct ocfs2_super *osb = arg;
+	struct ocfs2_journal *j = osb->journal;
+
+	/* we can trust j_num_trans here because _should_stop() is only set in
+	 * shutdown and nobody other than ourselves should be able to start
+	 * transactions.  committing on shutdown might take a few iterations
+	 * as final transactions put deleted inodes on the list */
+	while (!kthread_should_stop() ||
+	       atomic_read(&j->j_num_trans) != 0) {
+		wait_event_interruptible_timeout(osb->checkpoint_event,
+						 atomic_read(&j->j_num_trans)
+						 || kthread_should_stop(),
+						 OCFS2_CHECKPOINT_INTERVAL);
+
+		ret = ocfs2_commit_cache(osb);
+		if (ret < 0)
+			mlog_errno(ret);
+
+		if (kthread_should_stop() && atomic_read(&j->j_num_trans)) {
+			mlog(ML_KTHREAD,
+			     "commit_thread: %u transactions pending on "
+			     "shutdown\n",
+			     atomic_read(&j->j_num_trans));
+		}
+	}
+
+	return 0;
+}
+
+/* Look for a dirty journal without taking any cluster locks. Used for
+ * hard readonly access to determine whether the file system journals
+ * require recovery. Returns -EROFS if any slot's journal is dirty. */
+int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
+{
+	int ret = 0;
+	unsigned int slot;
+	struct buffer_head *di_bh;
+	struct ocfs2_dinode *di;
+	struct inode *journal = NULL;
+
+	for(slot = 0; slot < osb->max_slots; slot++) {
+		journal = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+						      slot);
+		if (!journal || is_bad_inode(journal)) {
+			ret = -EACCES;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		di_bh = NULL;
+		ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh,
+				       0, journal);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		di = (struct ocfs2_dinode *) di_bh->b_data;
+		if (le32_to_cpu(di->id1.journal1.ij_flags) &
+		    OCFS2_JOURNAL_DIRTY_FL)
+			ret = -EROFS;
+
+		brelse(di_bh);
+		iput(journal);	/* one reference per slot; drop it each pass */
+		journal = NULL;
+		if (ret)
+			break;
+	}
+
+out:
+	if (journal)
+		iput(journal);
+
+	return ret;
+}
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
new file mode 100644
index 0000000..7d0a816
--- /dev/null
+++ b/fs/ocfs2/journal.h
@@ -0,0 +1,457 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * journal.h
+ *
+ * Defines journalling api and structures.
+ *
+ * Copyright (C) 2003, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_JOURNAL_H
+#define OCFS2_JOURNAL_H
+
+#include <linux/fs.h>
+#include <linux/jbd.h>
+
+#define OCFS2_CHECKPOINT_INTERVAL        (8 * HZ)
+
+enum ocfs2_journal_state {
+	OCFS2_JOURNAL_FREE = 0,		/* zero value of the enum */
+	OCFS2_JOURNAL_LOADED,		/* presumably set by ocfs2_journal_load -- confirm */
+	OCFS2_JOURNAL_IN_SHUTDOWN,	/* presumably set by ocfs2_journal_shutdown -- confirm */
+};
+
+struct ocfs2_super;
+struct ocfs2_dinode;
+struct ocfs2_journal_handle;
+
+struct ocfs2_journal {
+	enum ocfs2_journal_state   j_state;    /* Journal's current state  */
+
+	journal_t                 *j_journal; /* The kernel's journal type */
+	struct inode              *j_inode;   /* Kernel inode pointing to
+					       * this journal             */
+	struct ocfs2_super        *j_osb;     /* pointer to the super
+					       * block for the node
+					       * we're currently
+					       * running on -- not
+					       * necessarily the super
+					       * block from the node
+					       * which we usually run
+					       * from (recovery,
+					       * etc)                     */
+	struct buffer_head        *j_bh;      /* Journal disk inode block */
+	atomic_t                  j_num_trans; /* Number of transactions
+					        * currently in the system. */
+	unsigned long             j_trans_id; /* see ocfs2_inc_trans_id() */
+	struct rw_semaphore       j_trans_barrier;
+	wait_queue_head_t         j_checkpointed; /* ocfs2_checkpoint_inode() waits here */
+
+	spinlock_t                j_lock; /* held while adding to j_la_cleanups */
+	struct list_head          j_la_cleanups; /* queued ocfs2_la_recovery_item's */
+	struct work_struct        j_recovery_work; /* queued on ocfs2_wq per item */
+};
+
+extern spinlock_t trans_inc_lock;
+
+/* Bump j_trans_id, skipping zero on wrap. Returns the pre-increment id. */
+static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j)
+{
+	unsigned long prev;
+	spin_lock(&trans_inc_lock);
+	prev = j->j_trans_id;
+	if (unlikely(++j->j_trans_id == 0))
+		j->j_trans_id = 1;
+	spin_unlock(&trans_inc_lock);
+	return prev;
+}
+
+static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal,
+					      struct inode *inode)
+{
+	spin_lock(&trans_inc_lock);	/* this header reads j_trans_id under it */
+	OCFS2_I(inode)->ip_last_trans = journal->j_trans_id; /* stamp inode */
+	spin_unlock(&trans_inc_lock);
+}
+
+/* Tells whether it is safe to drop a metadata lock on an inode:
+ * returns true once the journal's transaction id has moved past the
+ * inode's ip_last_trans, i.e. all of its changes have been
+ * checkpointed. Hold the spinlock on the metadata lock while calling
+ * so that nobody can take the lock and put it on another transaction. */
+static inline int ocfs2_inode_fully_checkpointed(struct inode *inode)
+{
+	int done;
+	struct ocfs2_journal *j = OCFS2_SB(inode->i_sb)->journal;
+
+	spin_lock(&trans_inc_lock);
+	done = time_after(j->j_trans_id, OCFS2_I(inode)->ip_last_trans);
+	spin_unlock(&trans_inc_lock);
+	return done;
+}
+
+/* Returns 1 while the inode is still new (has never hit disk). Once it
+ * has been checkpointed, returns 0 and zeroes ip_created_trans. */
+static inline int ocfs2_inode_is_new(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	int is_new = 1;
+	/* System files are never "new" as they're written out by
+	 * mkfs. This helps us early during mount, before we have the
+	 * journal open and j_trans_id could be junk. */
+	if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE)
+		return 0;
+	spin_lock(&trans_inc_lock);
+	if (time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id,
+		       oi->ip_created_trans)) {
+		oi->ip_created_trans = 0;
+		is_new = 0;
+	}
+	spin_unlock(&trans_inc_lock);
+	return is_new;
+}
+
+static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
+				       struct inode *inode)
+{
+	spin_lock(&trans_inc_lock);	/* stamp creation with current trans id */
+	OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id;
+	spin_unlock(&trans_inc_lock);	/* ocfs2_inode_is_new() compares it */
+}
+
+extern kmem_cache_t *ocfs2_lock_cache;
+
+struct ocfs2_journal_lock {
+	struct inode     *jl_inode;	/* presumably the locked inode -- confirm */
+	struct list_head  jl_lock_list;	/* presumably on a handle's locks list -- confirm */
+};
+
+struct ocfs2_journal_handle {
+	handle_t            *k_handle; /* kernel handle.                */
+	struct ocfs2_journal        *journal;
+	u32                 flags;     /* OCFS2_HANDLE_* bits below.    */
+	int                 max_buffs; /* Buffs reserved by this handle */
+
+	/* The following two fields are for ocfs2_handle_add_lock */
+	int                 num_locks;
+	struct list_head    locks;     /* A bunch of locks to
+					* release on commit. This
+					* should be a list_head */
+
+	struct list_head     inode_list; /* presumably for ocfs2_handle_add_inode -- confirm */
+};
+
+#define OCFS2_HANDLE_STARTED			1
+/* should we sync-commit this handle? */
+#define OCFS2_HANDLE_SYNC			2
+static inline int ocfs2_handle_started(struct ocfs2_journal_handle *handle)
+{
+	return handle->flags & OCFS2_HANDLE_STARTED;	/* nonzero if the bit is set */
+}
+
+static inline void ocfs2_handle_set_sync(struct ocfs2_journal_handle *handle, int sync)
+{
+	/* Start from the flags with SYNC cleared, then set it on request. */
+	u32 flags = handle->flags & ~OCFS2_HANDLE_SYNC;
+	if (sync)
+		flags |= OCFS2_HANDLE_SYNC;
+	handle->flags = flags;
+}
+
+/* Exported only for the journal struct init code in super.c. Do not call. */
+void ocfs2_complete_recovery(void *data);
+
+/*
+ *  Journal Control:
+ *  Initialize, Load, Shutdown, Wipe a journal.
+ *
+ *  ocfs2_journal_init     - Initialize journal structures in the OSB.
+ *  ocfs2_journal_load     - Load the given journal off disk. Replay it if
+ *                          there's transactions still in there.
+ *  ocfs2_journal_shutdown - Shutdown a journal, this will flush all
+ *                          uncommitted, uncheckpointed transactions.
+ *  ocfs2_journal_wipe     - Wipe transactions from a journal. Optionally
+ *                          zero out each block.
+ *  ocfs2_recovery_thread  - Perform recovery on a node. osb is our own osb.
+ *  ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat
+ *                          event on.
+ *  ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
+ */
+void   ocfs2_set_journal_params(struct ocfs2_super *osb);
+int    ocfs2_journal_init(struct ocfs2_journal *journal,
+			  int *dirty);
+void   ocfs2_journal_shutdown(struct ocfs2_super *osb);
+int    ocfs2_journal_wipe(struct ocfs2_journal *journal,
+			  int full);
+int    ocfs2_journal_load(struct ocfs2_journal *journal);
+int    ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
+void   ocfs2_recovery_thread(struct ocfs2_super *osb,
+			     int node_num);
+int    ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
+void   ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
+
+static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
+{
+	atomic_set(&osb->needs_checkpoint, 1);	/* flag a checkpoint request */
+	wake_up(&osb->checkpoint_event);	/* ocfs2_commit_thread sleeps on this */
+}
+
+static inline void ocfs2_checkpoint_inode(struct inode *inode)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/* Nothing to wait for if the inode is already checkpointed. */
+	if (ocfs2_inode_fully_checkpointed(inode))
+		return;
+
+	/* WARNING: This only kicks off a single checkpoint. If someone
+	 * races you and adds more metadata to the journal, you won't
+	 * know, and will wind up waiting *a lot* longer than necessary.
+	 * Right now we only use this in clear_inode so that's OK. */
+	ocfs2_start_checkpoint(osb);
+
+	wait_event(osb->journal->j_checkpointed,
+		   ocfs2_inode_fully_checkpointed(inode));
+}
+
+/*
+ *  Transaction Handling:
+ *  Manage the lifetime of a transaction handle.
+ *
+ *  ocfs2_alloc_handle     - Only allocate a handle so we can start putting
+ *                          cluster locks on it. To actually change blocks,
+ *                          call ocfs2_start_trans with the handle returned
+ *                          from this function. You may call ocfs2_commit_trans
+ *                           at any time in the lifetime of a handle.
+ *  ocfs2_start_trans      - Begin a transaction. Give it an upper estimate of
+ *                          the number of blocks that will be changed during
+ *                          this handle.
+ *  ocfs2_commit_trans     - Complete a handle.
+ *  ocfs2_extend_trans     - Extend a handle by nblocks credits. This may
+ *                          commit the handle to disk in the process, but will
+ *                          not release any locks taken during the transaction.
+ *  ocfs2_journal_access   - Notify the handle that we want to journal this
+ *                          buffer. Will have to call ocfs2_journal_dirty once
+ *                          we've dirtied it. Type is OCFS2_JOURNAL_ACCESS_*.
+ *  ocfs2_journal_dirty    - Mark a journalled buffer as having dirty data.
+ *  ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
+ *                             the current handle commits.
+ *  ocfs2_handle_add_lock  - Sometimes we need to delay lock release
+ *                          until after a transaction has been completed. Use
+ *                          ocfs2_handle_add_lock to indicate that a lock needs
+ *                          to be released at the end of that handle. Locks
+ *                          will be released in the order that they are added.
+ *  ocfs2_handle_add_inode - Add a locked inode to a transaction.
+ */
+
+/* You must always start_trans with a number of buffs > 0, but it's
+ * perfectly legal to go through an entire transaction without having
+ * dirtied any buffers. */
+struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb);
+struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
+					       struct ocfs2_journal_handle *handle,
+					       int max_buffs);
+void			     ocfs2_commit_trans(struct ocfs2_journal_handle *handle);
+int			     ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
+						int nblocks);
+
+/*
+ * Create access is for when we get a newly created buffer and we're
+ * not gonna read it off disk, but rather fill it ourselves.  Right
+ * now, we don't do anything special with this (it turns into a write
+ * request), but this is a good placeholder in case we do...
+ *
+ * Write access is for when we read a block off disk and are going to
+ * modify it. This way the journalling layer knows it may need to make
+ * a copy of that block (if it's part of another, uncommitted
+ * transaction) before we do so.
+ */
+#define OCFS2_JOURNAL_ACCESS_CREATE 0
+#define OCFS2_JOURNAL_ACCESS_WRITE  1
+#define OCFS2_JOURNAL_ACCESS_UNDO   2
+
+int                  ocfs2_journal_access(struct ocfs2_journal_handle *handle,
+					  struct inode *inode,
+					  struct buffer_head *bh,
+					  int type);
+/*
+ * A word about the journal_access/journal_dirty "dance". It is
+ * entirely legal to journal_access a buffer more than once (as long
+ * as the access type is the same -- I'm not sure what will happen if
+ * access type is different but this should never happen anyway) It is
+ * also legal to journal_dirty a buffer more than once. In fact, you
+ * can even journal_access a buffer after you've done a
+ * journal_access/journal_dirty pair. The only thing you cannot do
+ * however, is journal_dirty a buffer which you haven't yet passed to
+ * journal_access at least once.
+ *
+ * That said, 99% of the time this doesn't matter and this is what the
+ * path looks like:
+ *
+ *	<read a bh>
+ *	ocfs2_journal_access(handle, inode, bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ *	<modify the bh>
+ * 	ocfs2_journal_dirty(handle, bh);
+ */
+int                  ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
+					 struct buffer_head *bh);
+int                  ocfs2_journal_dirty_data(handle_t *handle,
+					      struct buffer_head *bh);
+int                  ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
+					   struct inode *inode);
+/*
+ * Use this to protect from other processes reading buffer state while
+ * it's in flight.
+ */
+void                 ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
+					    struct inode *inode);
+
+/*
+ *  Credit Macros:
+ *  Convenience macros to calculate number of credits needed.
+ *
+ *  For convenience sake, I have a set of macros here which calculate
+ *  the *maximum* number of sectors which will be changed for various
+ *  metadata updates.
+ */
+
+/* simple file updates like chmod, etc. */
+#define OCFS2_INODE_UPDATE_CREDITS 1
+
+/* get one bit out of a suballocator: dinode + group descriptor +
+ * prev. group desc. if we relink. */
+#define OCFS2_SUBALLOC_ALLOC (3)
+
+/* dinode + group descriptor update. We don't relink on free yet. */
+#define OCFS2_SUBALLOC_FREE  (2)
+
+#define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS
+#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE 		      \
+					 + OCFS2_TRUNCATE_LOG_UPDATE)
+
+/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
+ * bitmap block for the new bit) */
+#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
+
+/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
+ * group descriptor + mkdir/symlink blocks */
+#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC                         \
+			    + OCFS2_DIR_LINK_ADDITIONAL_CREDITS)
+
+/* local alloc metadata change + main bitmap updates */
+#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS                 \
+				  + OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE)
+
+/* used when we don't need an allocation change for a dir extend. One
+ * for the dinode, one for the new block. */
+#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
+
+/* file update (nlink, etc) + dir entry block */
+#define OCFS2_LINK_CREDITS  (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
+ * dir inode link */
+#define OCFS2_UNLINK_CREDITS  (2 * OCFS2_INODE_UPDATE_CREDITS + 1             \
+			      + OCFS2_LINK_CREDITS)
+
+/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
+ * inode alloc group descriptor */
+#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1)
+
+/* dinode update, old dir dinode update, new dir dinode update, old
+ * dir dir entry, new dir dir entry, dir entry update for renaming
+ * directory + target unlink */
+#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3              \
+			     + OCFS2_UNLINK_CREDITS)
+
+/* Worst-case journal credits for extending the allocation of fe.
+ * sb and bits_wanted are not used by the calculation. */
+static inline int ocfs2_calc_extend_credits(struct super_block *sb,
+					    struct ocfs2_dinode *fe,
+					    u32 bits_wanted)
+{
+	/* bitmap dinode, group desc. + relinked group. */
+	int credits = OCFS2_SUBALLOC_ALLOC;
+
+	/* we might need to shift tree depth so lets assume an
+	 * absolute worst case of complete fragmentation.  Even with
+	 * that, we only need one update for the dinode, and then
+	 * however many metadata chunks needed * a remaining suballoc
+	 * alloc. */
+	credits += 1 +
+		(OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe);
+
+	/* this does not include *new* metadata blocks, which are
+	 * accounted for in the sysfile bitmap credits above. fe +
+	 * prev. last_eb_blk + blocks along edge of tree.
+	 * calc_symlink_credits passes because we just need 1
+	 * credit for the dinode there. */
+	credits += 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth);
+
+	return credits;
+}
+
+static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
+{
+	/* Start from the credits needed to create the inode itself. */
+	int blocks = OCFS2_MKNOD_CREDITS;
+
+	/* links can be longer than one block so we may update many
+	 * within our single allocated extent. */
+	blocks = blocks + ocfs2_clusters_to_blocks(sb, 1);
+	return blocks;
+}
+
+static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
+						 unsigned int cpg)
+{
+	/* bitmap blocks affected */
+	int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1;
+
+	/* parent inode update + new block group header + bitmap inode
+	 * update + bitmap blocks affected */
+	return 1 + 1 + 1 + bitmap_blocks;
+}
+
+/* Journal credits for one truncate pass ending at last_el. sb is unused. */
+static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
+						unsigned int clusters_to_del,
+						struct ocfs2_dinode *fe,
+						struct ocfs2_extent_list *last_el)
+{
+	u16 depth = le16_to_cpu(fe->id2.i_list.l_tree_depth);
+	u16 used = le16_to_cpu(last_el->l_next_free_rec);
+	int last = used - 1;
+	/* for dinode + all headers in this pass + update to next leaf */
+	int credits = 1 + depth + 1;
+
+	BUG_ON(last < 0);
+
+	/* We may be deleting metadata blocks, so metadata alloc dinode +
+	   one desc. block for each possible delete. */
+	if (depth && used == 1 &&
+	    le32_to_cpu(last_el->l_recs[last].e_clusters) == clusters_to_del)
+		credits += 1 + depth;
+
+	/* update to the truncate log. */
+	credits += OCFS2_TRUNCATE_LOG_UPDATE;
+
+	return credits;
+}
+
+#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
new file mode 100644
index 0000000..fe373a2
--- /dev/null
+++ b/fs/ocfs2/localalloc.c
@@ -0,0 +1,983 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * localalloc.c
+ *
+ * Node local data allocation
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/bitops.h>
+
+#define MLOG_MASK_PREFIX ML_DISK_ALLOC
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "suballoc.h"
+#include "super.h"
+#include "sysfile.h"
+
+#include "buffer_head_io.h"
+
+#define OCFS2_LOCAL_ALLOC(dinode)	(&((dinode)->id2.i_lab))
+/* Forward declarations; each static helper is defined further down. */
+static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb);
+
+static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
+
+static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
+					     struct ocfs2_dinode *alloc,
+					     u32 numbits);
+
+static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
+
+static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
+				    struct ocfs2_journal_handle *handle,
+				    struct ocfs2_dinode *alloc,
+				    struct inode *main_bm_inode,
+				    struct buffer_head *main_bm_bh);
+
+static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
+						struct ocfs2_journal_handle *handle,
+						struct ocfs2_alloc_context **ac,
+						struct inode **bitmap_inode,
+						struct buffer_head **bitmap_bh);
+
+static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
+					struct ocfs2_journal_handle *handle,
+					struct ocfs2_alloc_context *ac);
+
+static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
+					  struct inode *local_alloc_inode);
+
+/*
+ * Size of the local alloc window, in bits. These values (and the
+ * behavior in ocfs2_alloc_should_use_local) have been chosen so that
+ * most allocations, including new block groups, go through local
+ * alloc.
+ */
+static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
+{
+	int shift = osb->s_clustersize_bits - 12;
+
+	BUG_ON(shift < 0);
+	return 2048 >> shift;
+}
+
+/*
+ * Decide whether an allocation of the given size may be served from
+ * the local alloc file; if not, it has to go to the main bitmap.
+ */
+int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
+{
+	int window = ocfs2_local_alloc_window_bits(osb);
+	int half_window = window / 2;
+
+	/*
+	 * The window should be at least twice the size (in clusters)
+	 * of a new block group. We want to be sure block group
+	 * allocations go through the local alloc, so an allocation
+	 * may take up to half the bitmap.
+	 *
+	 * None of that applies unless the local alloc is enabled.
+	 */
+	return osb->local_alloc_state == OCFS2_LA_ENABLED &&
+	       bits <= half_window;
+}
+
+int ocfs2_load_local_alloc(struct ocfs2_super *osb)
+{
+	int status = 0;
+	struct ocfs2_dinode *alloc = NULL;
+	struct buffer_head *alloc_bh = NULL;
+	u32 num_used;
+	struct inode *inode = NULL;
+	struct ocfs2_local_alloc *la;
+
+	mlog_entry_void();
+	/* Read this slot's local alloc inode off disk.  Once it passes
+	 * the checks below, its buffer is kept on osb for later use. */
+	inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
+					    osb->slot_num);
+	if (!inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
+				  &alloc_bh, 0, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+	/* NOTE(review): an inode with only one of the two flags passes */
+	if (!(le32_to_cpu(alloc->i_flags) &
+	    (OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) {
+		mlog(ML_ERROR, "Invalid local alloc inode, %"MLFu64"\n",
+		     OCFS2_I(inode)->ip_blkno);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	if ((la->la_size == 0) ||
+	    (le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) {
+		mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n",
+		     le16_to_cpu(la->la_size));
+		status = -EINVAL;
+		goto bail;
+	}
+
+	/* do a little verification. */
+	num_used = ocfs2_local_alloc_count_bits(alloc);
+
+	/* hopefully the local alloc has always been recovered before
+	 * we load it. Leftovers are only logged, the load still succeeds. */
+	if (num_used
+	    || alloc->id1.bitmap1.i_used
+	    || alloc->id1.bitmap1.i_total
+	    || la->la_bm_off)
+		mlog(ML_ERROR, "Local alloc hasn't been recovered!\n"
+		     "found = %u, set = %u, taken = %u, off = %u\n",
+		     num_used, le32_to_cpu(alloc->id1.bitmap1.i_used),
+		     le32_to_cpu(alloc->id1.bitmap1.i_total),
+		     le32_to_cpu(la->la_bm_off));
+
+	osb->local_alloc_bh = alloc_bh;
+	osb->local_alloc_state = OCFS2_LA_ENABLED;
+
+bail:
+	if (status < 0)
+		if (alloc_bh)
+			brelse(alloc_bh);
+	if (inode)
+		iput(inode);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * return any unused bits to the bitmap and write out a clean
+ * local_alloc. Does nothing if the state is already OCFS2_LA_UNUSED.
+ *
+ * On success the buffer cached in osb->local_alloc_bh is brelse'd
+ * and NULL'd out, and the state is set back to OCFS2_LA_UNUSED.
+ */
+void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
+{
+	int status;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct inode *local_alloc_inode = NULL;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+	struct ocfs2_dinode *alloc_copy = NULL;
+	struct ocfs2_dinode *alloc = NULL;
+
+	mlog_entry_void();
+
+	if (osb->local_alloc_state == OCFS2_LA_UNUSED)
+		goto bail;
+
+	local_alloc_inode =
+		ocfs2_get_system_file_inode(osb,
+					    LOCAL_ALLOC_SYSTEM_INODE,
+					    osb->slot_num);
+	if (!local_alloc_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	osb->local_alloc_state = OCFS2_LA_DISABLED;
+
+	handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	main_bm_inode = ocfs2_get_system_file_inode(osb,
+						    GLOBAL_BITMAP_SYSTEM_INODE,
+						    OCFS2_INVALID_SLOT);
+	if (!main_bm_inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_handle_add_inode(handle, main_bm_inode);
+	status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* WINDOW_MOVE_CREDITS is a bit heavy... */
+	handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
+	if (IS_ERR(handle)) {
+		mlog_errno(PTR_ERR(handle));
+		handle = NULL;
+		goto bail;
+	}
+
+	bh = osb->local_alloc_bh;
+	alloc = (struct ocfs2_dinode *) bh->b_data;
+	/* a transaction is running: the allocation must not enter the fs */
+	alloc_copy = kmalloc(bh->b_size, GFP_NOFS);
+	if (!alloc_copy) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	memcpy(alloc_copy, alloc, bh->b_size);
+
+	status = ocfs2_journal_access(handle, local_alloc_inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_clear_local_alloc(alloc);
+
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	/* drop the reference that osb->local_alloc_bh was holding */
+	brelse(bh);
+	osb->local_alloc_bh = NULL;
+	osb->local_alloc_state = OCFS2_LA_UNUSED;
+	/* free the unused bits of the old window, recorded in the copy */
+	status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
+					  main_bm_inode, main_bm_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (main_bm_bh)
+		brelse(main_bm_bh);
+
+	if (main_bm_inode)
+		iput(main_bm_inode);
+
+	if (local_alloc_inode)
+		iput(local_alloc_inode);
+
+	if (alloc_copy)
+		kfree(alloc_copy);
+
+	mlog_exit_void();
+}
+
+/*
+ * We want to free the bitmap bits outside of any recovery context as
+ * we'll need a cluster lock to do so, but we must clear the local
+ * alloc before giving up the recovered node's journal. To solve this,
+ * we kmalloc a copy of the local alloc before it's changed, for the
+ * caller to process with ocfs2_complete_local_alloc_recovery.
+ */
+int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
+				     int slot_num,
+				     struct ocfs2_dinode **alloc_copy)
+{
+	int status = 0;
+	struct buffer_head *alloc_bh = NULL;
+	struct inode *inode = NULL;
+	struct ocfs2_dinode *alloc;
+
+	mlog_entry("(slot_num = %d)\n", slot_num);
+
+	*alloc_copy = NULL;
+
+	inode = ocfs2_get_system_file_inode(osb,
+					    LOCAL_ALLOC_SYSTEM_INODE,
+					    slot_num);
+	if (!inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+	/* hold i_sem across the read / clear / write-back below */
+	down(&inode->i_sem);
+
+	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
+				  &alloc_bh, 0, inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	/* snapshot the block first; on any failure *alloc_copy ends up NULL */
+	*alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL);
+	if (!(*alloc_copy)) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size);
+	/* now wipe the in-memory local alloc and write the clean block out */
+	alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
+	ocfs2_clear_local_alloc(alloc);
+
+	status = ocfs2_write_block(osb, alloc_bh, inode);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	if ((status < 0) && (*alloc_copy)) {
+		kfree(*alloc_copy);
+		*alloc_copy = NULL;
+	}
+
+	if (alloc_bh)
+		brelse(alloc_bh);
+
+	if (inode) {	/* i_sem was taken right after we got the inode */
+		up(&inode->i_sem);
+		iput(inode);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Step 2: By now, we've completed the journal recovery, we've stamped
+ * a clean local alloc on disk and dropped the node out of the
+ * recovery map. Dlm locks will no longer stall, so let's clear out the
+ * main bitmap. The alloc copy passed in is not freed here.
+ */
+int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
+					struct ocfs2_dinode *alloc)
+{
+	int status;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+
+	mlog_entry_void();
+
+	handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	main_bm_inode = ocfs2_get_system_file_inode(osb,
+						    GLOBAL_BITMAP_SYSTEM_INODE,
+						    OCFS2_INVALID_SLOT);
+	if (!main_bm_inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+	/* lock the main bitmap before syncing into it */
+	ocfs2_handle_add_inode(handle, main_bm_inode);
+	status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* we want the bitmap change to be recorded on disk asap */
+	ocfs2_handle_set_sync(handle, 1);
+	/* give the clear (unused) bits of the copy back to the main bitmap */
+	status = ocfs2_sync_local_to_main(osb, handle, alloc,
+					  main_bm_inode, main_bm_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (main_bm_bh)
+		brelse(main_bm_bh);
+
+	if (main_bm_inode)
+		iput(main_bm_inode);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * make sure we've got at least bits_wanted contiguous bits in the
+ * local alloc. You lose them when you drop i_sem. Returns 0 with ac
+ * set up for OCFS2_AC_USE_LOCAL, or a negative errno.
+ * We will add ourselves to the transaction passed in, but may start
+ * our own in order to shift windows.
+ */
+int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
+				   struct ocfs2_journal_handle *passed_handle,
+				   u32 bits_wanted,
+				   struct ocfs2_alloc_context *ac)
+{
+	int status;
+	struct ocfs2_dinode *alloc;
+	struct inode *local_alloc_inode;
+	unsigned int free_bits;
+
+	mlog_entry_void();
+
+	BUG_ON(!passed_handle);
+	BUG_ON(!ac);
+	BUG_ON(passed_handle->flags & OCFS2_HANDLE_STARTED);
+
+	local_alloc_inode =
+		ocfs2_get_system_file_inode(osb,
+					    LOCAL_ALLOC_SYSTEM_INODE,
+					    osb->slot_num);
+	if (!local_alloc_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_handle_add_inode(passed_handle, local_alloc_inode);
+
+	if (osb->local_alloc_state != OCFS2_LA_ENABLED) {
+		status = -ENOSPC;
+		goto bail;
+	}
+
+	if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) {
+		mlog(0, "Asking for more than my max window size!\n");
+		status = -ENOSPC;
+		goto bail;
+	}
+
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	/* i_used must agree with the bits actually set in the bitmap */
+	if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
+	    ocfs2_local_alloc_count_bits(alloc)) {
+		ocfs2_error(osb->sb, "local alloc inode %"MLFu64" says it has "
+			    "%u used bits, but a count shows %u",
+			    le64_to_cpu(alloc->i_blkno),
+			    le32_to_cpu(alloc->id1.bitmap1.i_used),
+			    ocfs2_local_alloc_count_bits(alloc));
+		status = -EIO;
+		goto bail;
+	}
+
+	free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
+		le32_to_cpu(alloc->id1.bitmap1.i_used);
+	if (bits_wanted > free_bits) {
+		/* uhoh, window change time. */
+		status =
+			ocfs2_local_alloc_slide_window(osb, local_alloc_inode);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+	}
+	/* hand the context its own references to the inode and the bh */
+	ac->ac_inode = igrab(local_alloc_inode);
+	get_bh(osb->local_alloc_bh);
+	ac->ac_bh = osb->local_alloc_bh;
+	ac->ac_which = OCFS2_AC_USE_LOCAL;
+	status = 0;
+bail:
+	if (local_alloc_inode)
+		iput(local_alloc_inode);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
+				 struct ocfs2_journal_handle *handle,
+				 struct ocfs2_alloc_context *ac,
+				 u32 min_bits,	/* not used by this function */
+				 u32 *bit_off,
+				 u32 *num_bits)
+{
+	int status, start;
+	struct inode *local_alloc_inode;
+	u32 bits_wanted;
+	void *bitmap;
+	struct ocfs2_dinode *alloc;
+	struct ocfs2_local_alloc *la;
+
+	mlog_entry_void();
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
+	/* take everything the context still wants, as one contiguous run */
+	bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
+	local_alloc_inode = ac->ac_inode;
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+	/* start is the first bit of a run of bits_wanted zero bits, or -1 */
+	start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
+	if (start == -1) {
+		/* TODO: Shouldn't we just BUG here? */
+		status = -ENOSPC;
+		mlog_errno(status);
+		goto bail;
+	}
+	/* the caller gets la_bm_off + start and the full run length */
+	bitmap = la->la_bitmap;
+	*bit_off = le32_to_cpu(la->la_bm_off) + start;
+	/* local alloc is always contiguous by nature -- we never
+	 * delete bits from it! */
+	*num_bits = bits_wanted;
+
+	status = ocfs2_journal_access(handle, local_alloc_inode,
+				      osb->local_alloc_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	/* mark the run as used, then account for it in i_used */
+	while(bits_wanted--)
+		ocfs2_set_bit(start++, bitmap);
+
+	alloc->id1.bitmap1.i_used = cpu_to_le32(*num_bits +
+				le32_to_cpu(alloc->id1.bitmap1.i_used));
+
+	status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
+{
+	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
+	u8 *cur = la->la_bitmap;
+	u8 *end = cur + le16_to_cpu(la->la_size);
+	u32 nr_set = 0;
+
+	mlog_entry_void();
+
+	/* population count over all la_size bytes of the bitmap */
+	for (; cur < end; cur++)
+		nr_set += hweight8(*cur);
+
+	mlog_exit(nr_set);
+	return nr_set;
+}
+
+static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
+					     struct ocfs2_dinode *alloc,
+					     u32 numbits)
+{
+	int numfound, bitoff, left, startoff, lastzero;
+	void *bitmap = NULL;
+	/* Returns the first bit of a run of numbits clear bits, or -1. */
+	mlog_entry("(numbits wanted = %u)\n", numbits);
+
+	if (!alloc->id1.bitmap1.i_total) {
+		mlog(0, "No bits in my window!\n");
+		bitoff = -1;
+		goto bail;
+	}
+
+	bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
+	/* only the first i_total bits of the bitmap are searched */
+	numfound = bitoff = startoff = 0;
+	lastzero = -1;
+	left = le32_to_cpu(alloc->id1.bitmap1.i_total);
+	while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) {
+		if (bitoff == left) {
+			/* mlog(0, "bitoff (%d) == left", bitoff); */
+			break;
+		}
+		/* mlog(0, "Found a zero: bitoff = %d, startoff = %d, "
+		   "numfound = %d\n", bitoff, startoff, numfound);*/
+
+		/* Ok, we found a zero bit... is it contig. or do we
+		 * start over?*/
+		if (bitoff == startoff) {
+			/* contiguous with the run so far: extend it */
+			numfound++;
+			startoff++;
+		} else {
+			/* got a zero after some ones: a new run starts here */
+			numfound = 1;
+			startoff = bitoff+1;
+		}
+		/* we got everything we needed */
+		if (numfound == numbits) {
+			/* mlog(0, "Found it all!\n"); */
+			break;
+		}
+	}
+
+	mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
+	     numfound);
+	/* startoff is one past the run's last bit, so back up to its start */
+	if (numfound == numbits)
+		bitoff = startoff - numfound;
+	else
+		bitoff = -1;
+
+bail:
+	mlog_exit(bitoff);
+	return bitoff;
+}
+
+static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc)
+{
+	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
+
+	mlog_entry_void();
+
+	/* zero the offset, both counters and all la_size bitmap bytes */
+	la->la_bm_off = 0;
+	alloc->id1.bitmap1.i_used = 0;
+	alloc->id1.bitmap1.i_total = 0;
+	memset(la->la_bitmap, 0, le16_to_cpu(la->la_size));
+
+	mlog_exit_void();
+}
+
+#if 0
+/* Debug aid, compiled out: BUG() if any of count bits from start is set. */
+static void ocfs2_verify_zero_bits(unsigned long *bitmap,
+				   unsigned int start,
+				   unsigned int count)
+{
+	unsigned int tmp = count;
+	while(tmp--) {
+		if (ocfs2_test_bit(start + tmp, bitmap)) {
+			printk("ocfs2_verify_zero_bits: start = %u, count = "
+			       "%u\n", start, count);
+			printk("ocfs2_verify_zero_bits: bit %u is set!",
+			       start + tmp);
+			BUG();
+		}
+	}
+}
+#endif
+
+/*
+ * sync the local alloc to main bitmap: every run of clear bits in
+ * the window described by alloc is freed via ocfs2_free_clusters.
+ * assumes you've already locked the main bitmap -- the bitmap inode
+ * passed is used for caching.
+ */
+static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
+				    struct ocfs2_journal_handle *handle,
+				    struct ocfs2_dinode *alloc,
+				    struct inode *main_bm_inode,
+				    struct buffer_head *main_bm_bh)
+{
+	int status = 0;
+	int bit_off, left, count, start;
+	u64 la_start_blk;
+	u64 blkno;
+	void *bitmap;
+	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
+
+	mlog_entry("total = %u, COUNT = %u, used = %u\n",
+		   le32_to_cpu(alloc->id1.bitmap1.i_total),
+		   ocfs2_local_alloc_count_bits(alloc),
+		   le32_to_cpu(alloc->id1.bitmap1.i_used));
+
+	if (!alloc->id1.bitmap1.i_total) {
+		mlog(0, "nothing to sync!\n");
+		goto bail;
+	}
+
+	if (le32_to_cpu(alloc->id1.bitmap1.i_used) ==
+	    le32_to_cpu(alloc->id1.bitmap1.i_total)) {
+		mlog(0, "all bits were taken!\n");
+		goto bail;
+	}
+	/* block number corresponding to cluster offset la_bm_off */
+	la_start_blk = ocfs2_clusters_to_blocks(osb->sb,
+						le32_to_cpu(la->la_bm_off));
+	bitmap = la->la_bitmap;
+	start = count = bit_off = 0;
+	left = le32_to_cpu(alloc->id1.bitmap1.i_total);
+
+	while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start))
+	       != -1) {
+		if ((bit_off < left) && (bit_off == start)) {
+			count++;	/* extend the current run of clear bits */
+			start++;
+			continue;
+		}
+		if (count) {	/* a run just ended: give it back */
+			blkno = la_start_blk +
+				ocfs2_clusters_to_blocks(osb->sb,
+							 start - count);
+
+			mlog(0, "freeing %u bits starting at local "
+			     "alloc bit %u (la_start_blk = %"MLFu64", "
+			     "blkno = %"MLFu64")\n", count, start - count,
+			     la_start_blk, blkno);
+
+			status = ocfs2_free_clusters(handle, main_bm_inode,
+						     main_bm_bh, blkno, count);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		if (bit_off >= left)
+			break;	/* no clear bit left inside the window */
+		count = 1;	/* bit_off opens a new run */
+		start = bit_off + 1;
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
+						struct ocfs2_journal_handle *handle,
+						struct ocfs2_alloc_context **ac,
+						struct inode **bitmap_inode,
+						struct buffer_head **bitmap_bh)
+{
+	int status;
+	/* On any failure *ac ends up NULL. */
+	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+	/* reserve a full window's worth of bits from the cluster bitmap */
+	(*ac)->ac_handle = handle;
+	(*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb);
+
+	status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+	/* give the caller its own references to the bitmap inode and bh */
+	*bitmap_inode = (*ac)->ac_inode;
+	igrab(*bitmap_inode);
+	*bitmap_bh = (*ac)->ac_bh;
+	get_bh(*bitmap_bh);
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs2_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Claim a new window's worth of clusters using the reservation in ac
+ * and point the local alloc at them. No journal access is taken
+ * here; ocfs2_local_alloc_slide_window does that before calling. */
+static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
+					struct ocfs2_journal_handle *handle,
+					struct ocfs2_alloc_context *ac)
+{
+	int status = 0;
+	u32 cluster_off, cluster_count;
+	struct ocfs2_dinode *alloc = NULL;
+	struct ocfs2_local_alloc *la;
+
+	mlog_entry_void();
+
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+
+	if (alloc->id1.bitmap1.i_total)
+		mlog(0, "asking me to alloc a new window over a non-empty "
+		     "one\n");
+
+	mlog(0, "Allocating %u clusters for a new window.\n",
+	     ocfs2_local_alloc_window_bits(osb));
+	/* we used the generic suballoc reserve function, but we set
+	 * everything up nicely, so there's no reason why we can't use
+	 * the more specific cluster api to claim bits. */
+	status = ocfs2_claim_clusters(osb, handle, ac,
+				      ocfs2_local_alloc_window_bits(osb),
+				      &cluster_off, &cluster_count);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	la->la_bm_off = cpu_to_le32(cluster_off);
+	alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count);
+	/* just in case... In the future when we find space ourselves,
+	 * we don't have to get all contiguous -- but we'll have to
+	 * set all previously used bits in bitmap and update
+	 * la_bits_set before setting the bits in the main bitmap. */
+	alloc->id1.bitmap1.i_used = 0;
+	memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
+	       le16_to_cpu(la->la_size));
+
+	mlog(0, "New window allocated:\n");
+	mlog(0, "window la_bm_off = %u\n",
+	     le32_to_cpu(la->la_bm_off));
+	mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total));
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* Note that we do *NOT* lock the local alloc inode here as
+ * it's been locked already for us. */
+static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
+					  struct inode *local_alloc_inode)
+{
+	int status = 0;
+	struct buffer_head *main_bm_bh = NULL;
+	struct inode *main_bm_inode = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_dinode *alloc;
+	struct ocfs2_dinode *alloc_copy = NULL;
+	struct ocfs2_alloc_context *ac = NULL;
+
+	mlog_entry_void();
+
+	handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* This will lock the main bitmap for us. */
+	status = ocfs2_local_alloc_reserve_for_window(osb,
+						      handle,
+						      &ac,
+						      &main_bm_inode,
+						      &main_bm_bh);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	/* We want to clear the local alloc before doing anything
+	 * else, so that if we error later during this operation,
+	 * local alloc shutdown won't try to double free main bitmap
+	 * bits. Make a copy so the sync function knows which bits to
+	 * free. GFP_NOFS: we are inside a transaction, so reclaim must
+	 * not call back into the filesystem. */
+	alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_NOFS);
+	if (!alloc_copy) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+	memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
+
+	status = ocfs2_journal_access(handle, local_alloc_inode,
+				      osb->local_alloc_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_clear_local_alloc(alloc);
+
+	status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	/* return the old window's unused bits, then claim a new window */
+	status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
+					  main_bm_inode, main_bm_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_local_alloc_new_window(osb, handle, ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	atomic_inc(&osb->alloc_stats.moves);
+
+	status = 0;
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (main_bm_bh)
+		brelse(main_bm_bh);
+
+	if (main_bm_inode)
+		iput(main_bm_inode);
+
+	if (alloc_copy)
+		kfree(alloc_copy);
+
+	if (ac)
+		ocfs2_free_alloc_context(ac);
+
+	mlog_exit(status);
+	return status;
+}
+
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
new file mode 100644
index 0000000..30f88ce
--- /dev/null
+++ b/fs/ocfs2/localalloc.h
@@ -0,0 +1,56 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * localalloc.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_LOCALALLOC_H
+#define OCFS2_LOCALALLOC_H
+
+int ocfs2_load_local_alloc(struct ocfs2_super *osb);
+/* return unused bits to the main bitmap, write a clean local alloc */
+void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
+/* recovery step 1: copy, then clear, the local alloc of slot_num */
+int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
+				     int slot_num,
+				     struct ocfs2_dinode **alloc_copy);
+/* recovery step 2: give the copy's unused bits back to the main bitmap */
+int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
+					struct ocfs2_dinode *alloc);
+/* nonzero if an allocation of this size may use the local alloc */
+int ocfs2_alloc_should_use_local(struct ocfs2_super *osb,
+				 u64 bits);
+
+struct ocfs2_alloc_context;
+int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
+				   struct ocfs2_journal_handle *passed_handle,
+				   u32 bits_wanted,
+				   struct ocfs2_alloc_context *ac);
+/* mark the bits still wanted by ac as used in the local alloc */
+int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
+				 struct ocfs2_journal_handle *handle,
+				 struct ocfs2_alloc_context *ac,
+				 u32 min_bits,
+				 u32 *bit_off,
+				 u32 *num_bits);
+
+#endif /* OCFS2_LOCALALLOC_H */
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
new file mode 100644
index 0000000..afdeec4
--- /dev/null
+++ b/fs/ocfs2/mmap.c
@@ -0,0 +1,102 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * mmap.c
+ *
+ * Code to deal with the mess that is clustered mmap.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/signal.h>
+#include <linux/rbtree.h>
+
+#define MLOG_MASK_PREFIX ML_FILE_IO
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "dlmglue.h"
+#include "file.h"
+#include "inode.h"
+#include "mmap.h"
+
+static struct page *ocfs2_nopage(struct vm_area_struct * area,
+				 unsigned long address,
+				 int *type)
+{
+	struct inode *inode = area->vm_file->f_dentry->d_inode;
+	struct page *result = NOPAGE_SIGBUS;
+	sigset_t all_sigs, saved_sigs;
+	int err;
+
+	mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address);
+
+	/* Block every signal up front for the duration of the fault,
+	 * rather than allowing the locking paths to return
+	 * -ERESTARTSYS.  Blocking should technically never fail; if
+	 * it does, the fault is answered with NOPAGE_SIGBUS. */
+	sigfillset(&all_sigs);
+
+	err = sigprocmask(SIG_BLOCK, &all_sigs, &saved_sigs);
+	if (err >= 0) {
+		result = filemap_nopage(area, address, type);
+
+		/* put the previous signal mask back */
+		err = sigprocmask(SIG_SETMASK, &saved_sigs, NULL);
+	}
+
+	/* a failure of either sigprocmask call is logged here; a
+	 * failed restore does not change the page we return */
+	if (err < 0)
+		mlog_errno(err);
+
+	mlog_exit_ptr(result);
+	return result;
+}
+
+static struct vm_operations_struct ocfs2_file_vm_ops = {
+	.nopage = ocfs2_nopage,	/* filemap_nopage with all signals blocked */
+};
+
+int ocfs2_mmap(struct file *file,
+	       struct vm_area_struct *vma)
+{
+	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
+	int shared = (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) != 0;
+	int writable = (vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) != 0;
+
+	/* Shared writable mappings are not supported yet.  Refuse with
+	 * -EINVAL because generic_file_readonly_mmap returns it in a
+	 * similar situation. */
+	if (shared && writable) {
+		mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
+		return -EINVAL;
+	}
+
+	update_atime(inode);
+	vma->vm_ops = &ocfs2_file_vm_ops;
+	return 0;
+}
+
diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h
new file mode 100644
index 0000000..1274ee0
--- /dev/null
+++ b/fs/ocfs2/mmap.h
@@ -0,0 +1,6 @@
+#ifndef OCFS2_MMAP_H
+#define OCFS2_MMAP_H
+/* Set up a mapping of file; shared writable mappings get -EINVAL. */
+int ocfs2_mmap(struct file *file, struct vm_area_struct *vma);
+
+#endif  /* OCFS2_MMAP_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
new file mode 100644
index 0000000..f6b77ff
--- /dev/null
+++ b/fs/ocfs2/namei.c
@@ -0,0 +1,2264 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * namei.c
+ *
+ * Create and rename file, directory, symlinks
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ *  Portions of this code from linux/fs/ext3/dir.c
+ *
+ *  Copyright (C) 1992, 1993, 1994, 1995
+ *  Remy Card (card@masi.ibp.fr)
+ *  Laboratoire MASI - Institut Blaise Pascal
+ *  Universite Pierre et Marie Curie (Paris VI)
+ *
+ *   from
+ *
+ *   linux/fs/minix/dir.c
+ *
+ *   Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_NAMEI
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dcache.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "namei.h"
+#include "suballoc.h"
+#include "symlink.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
+#define NAMEI_RA_CHUNKS  2
+#define NAMEI_RA_BLOCKS  4
+#define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
+
+static int inline ocfs2_search_dirblock(struct buffer_head *bh,
+					struct inode *dir,
+					const char *name, int namelen,
+					unsigned long offset,
+					struct ocfs2_dir_entry **res_dir);
+
+static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle,
+			      struct inode *dir,
+			      struct ocfs2_dir_entry *de_del,
+			      struct buffer_head *bh);
+
+static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle,
+			     struct inode *dir,
+			     const char *name, int namelen,
+			     struct inode *inode, u64 blkno,
+			     struct buffer_head *parent_fe_bh,
+			     struct buffer_head *insert_bh);
+
+static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+			      struct inode *dir,
+			      struct dentry *dentry, int mode,
+			      dev_t dev,
+			      struct buffer_head **new_fe_bh,
+			      struct buffer_head *parent_fe_bh,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode **ret_inode,
+			      struct ocfs2_alloc_context *inode_ac);
+
+static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode *parent,
+			      struct inode *inode,
+			      struct buffer_head *fe_bh,
+			      struct ocfs2_alloc_context *data_ac);
+
+static int ocfs2_double_lock(struct ocfs2_super *osb,
+			     struct ocfs2_journal_handle *handle,
+			     struct buffer_head **bh1,
+			     struct inode *inode1,
+			     struct buffer_head **bh2,
+			     struct inode *inode2);
+
+static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
+				    struct ocfs2_journal_handle *handle,
+				    struct inode *inode,
+				    char *name,
+				    struct buffer_head **de_bh);
+
+static int ocfs2_orphan_add(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct inode *inode,
+			    struct ocfs2_dinode *fe,
+			    char *name,
+			    struct buffer_head *de_bh);
+
+static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     struct inode *inode,
+				     const char *symname);
+
+static inline int ocfs2_add_entry(struct ocfs2_journal_handle *handle,
+				  struct dentry *dentry, struct inode *inode,
+				  u64 blkno, struct buffer_head *parent_fe_bh,
+				  struct buffer_head *insert_bh)
+{
+	struct inode *dir = dentry->d_parent->d_inode; /* parent directory */
+	return __ocfs2_add_entry(handle, dir, dentry->d_name.name,
+				 dentry->d_name.len, inode, blkno,
+				 parent_fe_bh, insert_bh);
+}
+
+/* An orphan dir name is an 8 byte value, printed as a hex string */
+#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
+
+static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	int status;
+	u64 blkno;
+	struct buffer_head *dirent_bh = NULL;
+	struct inode *inode = NULL;
+	struct dentry *ret;
+	struct ocfs2_dir_entry *dirent;
+	struct ocfs2_inode_info *oi;
+	/* Look dentry's name up in dir and splice the result into the dcache. */
+	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) {
+		ret = ERR_PTR(-ENAMETOOLONG);
+		goto bail;
+	}
+
+	mlog(0, "find name %.*s in directory %"MLFu64"\n", dentry->d_name.len,
+	     dentry->d_name.name, OCFS2_I(dir)->ip_blkno);
+
+	status = ocfs2_meta_lock(dir, NULL, NULL, 0);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		ret = ERR_PTR(status);
+		goto bail;
+	}
+	/* Any failure of the search leaves inode NULL and goes to bail_add. */
+	status = ocfs2_find_files_on_disk(dentry->d_name.name,
+					  dentry->d_name.len, &blkno,
+					  dir, &dirent_bh, &dirent);
+	if (status < 0)
+		goto bail_add;
+
+	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
+	if (IS_ERR(inode)) {
+		mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
+		ret = ERR_PTR(-EACCES);	/* iget's own error code is dropped */
+		goto bail_unlock;
+	}
+
+	oi = OCFS2_I(inode);
+	/* Clear any orphaned state... If we were able to look up the
+	 * inode from a directory, it certainly can't be orphaned. We
+	 * might have the bad state from a node which intended to
+	 * orphan this inode but crashed before it could commit the
+	 * unlink. */
+	spin_lock(&oi->ip_lock);
+	oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
+	oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
+	spin_unlock(&oi->ip_lock);
+
+bail_add:
+
+	dentry->d_op = &ocfs2_dentry_ops;
+	ret = d_splice_alias(inode, dentry);
+
+bail_unlock:
+	/* Don't drop the cluster lock until *after* d_splice_alias --
+	 * unlink on another node will message us to remove that
+	 * dentry under this lock so otherwise we can race this with
+	 * the vote thread and have a stale dentry. */
+	ocfs2_meta_unlock(dir, 0);
+
+bail:
+	if (dirent_bh)
+		brelse(dirent_bh);
+
+	mlog_exit_ptr(ret);
+
+	return ret;
+}
+
+static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode *parent,
+			      struct inode *inode,
+			      struct buffer_head *fe_bh,
+			      struct ocfs2_alloc_context *data_ac)
+{
+	int status;
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_dir_entry *de = NULL;
+	/* Build the first block of new directory 'inode': "." and "..". */
+	mlog_entry_void();
+
+	status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
+				     data_ac, NULL, &new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_set_new_buffer_uptodate(inode, new_bh);
+
+	status = ocfs2_journal_access(handle, inode, new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	memset(new_bh->b_data, 0, osb->sb->s_blocksize);
+	/* First entry is ".", naming the new directory itself. */
+	de = (struct ocfs2_dir_entry *) new_bh->b_data;
+	de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
+	de->name_len = 1;
+	de->rec_len =
+		cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
+	strcpy(de->name, ".");
+	ocfs2_set_de_type(de, S_IFDIR);
+	de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
+	de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
+	de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize -
+				  OCFS2_DIR_REC_LEN(1)); /* rest of the block */
+	de->name_len = 2;
+	strcpy(de->name, "..");
+	ocfs2_set_de_type(de, S_IFDIR);
+
+	status = ocfs2_journal_dirty(handle, new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	/* The directory is one block long and starts with two links. */
+	i_size_write(inode, inode->i_sb->s_blocksize);
+	inode->i_nlink = 2;
+	inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize);
+	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if (new_bh)
+		brelse(new_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_mknod(struct inode *dir,
+		       struct dentry *dentry,
+		       int mode,
+		       dev_t dev)
+{
+	int status = 0;
+	struct buffer_head *parent_fe_bh = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_super *osb;
+	struct ocfs2_dinode *dirfe;
+	struct buffer_head *new_fe_bh = NULL;
+	struct buffer_head *de_bh = NULL;
+	struct inode *inode = NULL;
+	struct ocfs2_alloc_context *inode_ac = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	/* Create a new inode of type 'mode' and enter it into dir as dentry. */
+	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
+		   (unsigned long)dev, dentry->d_name.len,
+		   dentry->d_name.name);
+
+	/* get our super block */
+	osb = OCFS2_SB(dir->i_sb);
+
+	if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
+		mlog(ML_ERROR, "inode %"MLFu64" has i_nlink of %u\n",
+		     OCFS2_I(dir)->ip_blkno, dir->i_nlink);
+		status = -EMLINK;
+		goto leave;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+	/* Lock the parent; parent_fe_bh receives its dinode block. */
+	status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	if (!dirfe->i_links_count) {
+		/* can't make a file in a deleted directory. */
+		status = -ENOENT;
+		goto leave;
+	}
+
+	status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+					   dentry->d_name.len);
+	if (status)
+		goto leave;
+
+	/* get a spot inside the dir. */
+	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
+					      dentry->d_name.name,
+					      dentry->d_name.len, &de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* reserve an inode spot */
+	status = ocfs2_reserve_new_inode(osb, handle, &inode_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	/* are we making a directory? If so, reserve a cluster for his
+	 * 1st extent. */
+	if (S_ISDIR(mode)) {
+		status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_MKNOD_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* do the real work now. */
+	status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev,
+				    &new_fe_bh, parent_fe_bh, handle,
+				    &inode, inode_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+	/* A new subdirectory adds one link to its parent. */
+	if (S_ISDIR(mode)) {
+		status = ocfs2_fill_new_dir(osb, handle, dir, inode,
+					    new_fe_bh, data_ac);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+
+		status = ocfs2_journal_access(handle, dir, parent_fe_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+		le16_add_cpu(&dirfe->i_links_count, 1);
+		status = ocfs2_journal_dirty(handle, parent_fe_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+		dir->i_nlink++;
+	}
+
+	status = ocfs2_add_entry(handle, dentry, inode,
+				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
+				 de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+	/* Entry is in place: hash the inode and attach it to the dentry. */
+	insert_inode_hash(inode);
+	dentry->d_op = &ocfs2_dentry_ops;
+	d_instantiate(dentry, inode);
+	status = 0;
+leave:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (status == -ENOSPC)
+		mlog(0, "Disk is full\n");
+
+	if (new_fe_bh)
+		brelse(new_fe_bh);
+
+	if (de_bh)
+		brelse(de_bh);
+
+	if (parent_fe_bh)
+		brelse(parent_fe_bh);
+
+	if ((status < 0) && inode)
+		iput(inode);
+
+	if (inode_ac)
+		ocfs2_free_alloc_context(inode_ac);
+
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+
+	mlog_exit(status);
+
+	return status;
+}
+
+/* Claim an inode from inode_ac and journal its fresh dinode through handle;
+ * on success *ret_inode and *new_fe_bh are set, on failure both end up NULL. */
+static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+			      struct inode *dir,
+			      struct dentry *dentry, int mode,
+			      dev_t dev,
+			      struct buffer_head **new_fe_bh,
+			      struct buffer_head *parent_fe_bh,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode **ret_inode,
+			      struct ocfs2_alloc_context *inode_ac)
+{
+	int status = 0;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_extent_list *fel;
+	u64 fe_blkno = 0;
+	u16 suballoc_bit;
+	struct inode *inode = NULL;
+
+	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
+		   (unsigned long)dev, dentry->d_name.len,
+		   dentry->d_name.name);
+
+	*new_fe_bh = NULL;
+	*ret_inode = NULL;
+
+	status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
+				       &fe_blkno);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* new_inode() signals failure with NULL, never with an ERR_PTR. */
+	inode = new_inode(dir->i_sb);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog(ML_ERROR, "new_inode failed!\n");
+		goto leave;
+	}
+
+	/* populate as many fields early on as possible - many of
+	 * these are used by the support functions here and in
+	 * callers. */
+	inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
+	OCFS2_I(inode)->ip_blkno = fe_blkno;
+	inode->i_nlink = S_ISDIR(mode) ? 2 : 1;
+	inode->i_mode = mode;
+	spin_lock(&osb->osb_lock);
+	inode->i_generation = osb->s_next_generation++;
+	spin_unlock(&osb->osb_lock);
+
+	*new_fe_bh = sb_getblk(osb->sb, fe_blkno);
+	if (!*new_fe_bh) {
+		status = -EIO;
+		mlog_errno(status);
+		goto leave;
+	}
+	ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
+
+	status = ocfs2_journal_access(handle, inode, *new_fe_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data;
+	memset(fe, 0, osb->sb->s_blocksize);
+
+	fe->i_generation = cpu_to_le32(inode->i_generation);
+	fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
+	fe->i_blkno = cpu_to_le64(fe_blkno);
+	fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
+	fe->i_suballoc_slot = cpu_to_le16(osb->slot_num);
+	fe->i_uid = cpu_to_le32(current->fsuid);
+	if (dir->i_mode & S_ISGID) {
+		fe->i_gid = cpu_to_le32(dir->i_gid);
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else
+		fe->i_gid = cpu_to_le32(current->fsgid);
+	fe->i_mode = cpu_to_le16(mode);
+	if (S_ISCHR(mode) || S_ISBLK(mode))
+		fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
+
+	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+
+	fe->i_last_eb_blk = 0;
+	strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
+	le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL);
+	fe->i_atime = fe->i_ctime = fe->i_mtime =
+		cpu_to_le64(CURRENT_TIME.tv_sec);
+	fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
+		cpu_to_le32(CURRENT_TIME.tv_nsec);
+	fe->i_dtime = 0;
+
+	fel = &fe->id2.i_list;
+	fel->l_tree_depth = 0;
+	fel->l_next_free_rec = 0;
+	fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
+
+	status = ocfs2_journal_dirty(handle, *new_fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (ocfs2_populate_inode(inode, fe, 1) < 0) {
+		mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
+		     "i_blkno=%"MLFu64", i_ino=%lu\n",
+		     (unsigned long long) (*new_fe_bh)->b_blocknr,
+		     le64_to_cpu(fe->i_blkno), inode->i_ino);
+		BUG();
+	}
+
+	ocfs2_inode_set_new(osb, inode);
+	status = ocfs2_create_new_inode_locks(inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* error in ocfs2_create_new_inode_locks is not critical */
+	status = 0;
+
+	*ret_inode = inode;
+leave:
+	if (status < 0) {
+		if (*new_fe_bh) {
+			brelse(*new_fe_bh);
+			*new_fe_bh = NULL;
+		}
+		if (inode)
+			iput(inode);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/* mkdir is mknod with S_IFDIR or'ed into the mode and a device of 0. */
+static int ocfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	int status;
+
+	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	status = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* create is mknod with S_IFREG or'ed into the mode; nd is not used here. */
+static int ocfs2_create(struct inode *dir, struct dentry *dentry, int mode,
+			struct nameidata *nd)
+{
+	int status;
+
+	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	status = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_link(struct dentry *old_dentry,
+		      struct inode *dir,
+		      struct dentry *dentry)
+{
+	struct ocfs2_journal_handle *handle = NULL;
+	struct inode *inode = old_dentry->d_inode;
+	int err;
+	struct buffer_head *fe_bh = NULL;
+	struct buffer_head *parent_fe_bh = NULL;
+	struct buffer_head *de_bh = NULL;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	/* Enter dentry into dir as one more name for old_dentry's inode. */
+	mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
+		   old_dentry->d_name.len, old_dentry->d_name.name,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	if (S_ISDIR(inode->i_mode)) {
+		err = -EPERM;
+		goto bail;
+	}
+	/* Early check only; repeated on the dinode once it is locked. */
+	if (inode->i_nlink >= OCFS2_LINK_MAX) {
+		err = -EMLINK;
+		goto bail;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		err = -ENOMEM;
+		goto bail;
+	}
+
+	err = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
+	if (err < 0) {
+		if (err != -ENOENT)
+			mlog_errno(err);
+		goto bail;
+	}
+
+	err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+					dentry->d_name.len);
+	if (err)
+		goto bail;
+
+	err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
+					   dentry->d_name.name,
+					   dentry->d_name.len, &de_bh);
+	if (err < 0) {
+		mlog_errno(err);
+		goto bail;
+	}
+
+	err = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
+	if (err < 0) {
+		if (err != -ENOENT)
+			mlog_errno(err);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) {
+		err = -EMLINK;
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_LINK_CREDITS);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(err);
+		goto bail;
+	}
+
+	err = ocfs2_journal_access(handle, inode, fe_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (err < 0) {
+		mlog_errno(err);
+		goto bail;
+	}
+	/* Bump the link count in memory and on disk; undone below on error. */
+	inode->i_nlink++;
+	inode->i_ctime = CURRENT_TIME;
+	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+	err = ocfs2_journal_dirty(handle, fe_bh);
+	if (err < 0) {
+		le16_add_cpu(&fe->i_links_count, -1);
+		inode->i_nlink--;
+		mlog_errno(err);
+		goto bail;
+	}
+
+	err = ocfs2_add_entry(handle, dentry, inode,
+			      OCFS2_I(inode)->ip_blkno,
+			      parent_fe_bh, de_bh);
+	if (err) {
+		le16_add_cpu(&fe->i_links_count, -1);
+		inode->i_nlink--;
+		mlog_errno(err);
+		goto bail;
+	}
+	/* Take an extra inode reference before attaching it to dentry. */
+	atomic_inc(&inode->i_count);
+	dentry->d_op = &ocfs2_dentry_ops;
+	d_instantiate(dentry, inode);
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+	if (de_bh)
+		brelse(de_bh);
+	if (fe_bh)
+		brelse(fe_bh);
+	if (parent_fe_bh)
+		brelse(parent_fe_bh);
+
+	mlog_exit(err);
+
+	return err;
+}
+
+static int ocfs2_unlink(struct inode *dir,
+			struct dentry *dentry)
+{
+	int status;
+	unsigned int saved_nlink = 0;
+	struct inode *inode = dentry->d_inode;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	u64 blkno;
+	struct ocfs2_dinode *fe = NULL;
+	struct buffer_head *fe_bh = NULL;
+	struct buffer_head *parent_node_bh = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_dir_entry *dirent = NULL;
+	struct buffer_head *dirent_bh = NULL;
+	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
+	struct buffer_head *orphan_entry_bh = NULL;
+	/* Remove dentry's name from dir; a link count of 0 orphans the inode. */
+	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	BUG_ON(dentry->d_parent->d_inode != dir);
+
+	mlog(0, "ino = %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	if (inode == osb->root_inode) {
+		mlog(0, "Cannot delete the root directory\n");
+		status = -EPERM;
+		goto leave;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_meta_lock(dir, handle, &parent_node_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_find_files_on_disk(dentry->d_name.name,
+					  dentry->d_name.len, &blkno,
+					  dir, &dirent_bh, &dirent);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+	/* The name must still refer to the inode the dentry carries. */
+	if (OCFS2_I(inode)->ip_blkno != blkno) {
+		status = -ENOENT;
+
+		mlog(0, "ip_blkno (%"MLFu64") != dirent blkno (%"MLFu64") "
+		     "ip_flags = %x\n", OCFS2_I(inode)->ip_blkno, blkno,
+		     OCFS2_I(inode)->ip_flags);
+		goto leave;
+	}
+
+	status = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+	/* Only empty directories with exactly two links may be removed. */
+	if (S_ISDIR(inode->i_mode)) {
+	       	if (!ocfs2_empty_dir(inode)) {
+			status = -ENOTEMPTY;
+			goto leave;
+		} else if (inode->i_nlink != 2) {
+			status = -ENOTEMPTY;
+			goto leave;
+		}
+	}
+
+	/* There are still a few steps left until we can consider the
+	 * unlink to have succeeded. Save off nlink here before
+	 * modification so we can set it back in case we hit an issue
+	 * before commit. */
+	saved_nlink = inode->i_nlink;
+	if (S_ISDIR(inode->i_mode))
+		inode->i_nlink = 0;
+	else
+		inode->i_nlink--;
+
+	status = ocfs2_request_unlink_vote(inode, dentry,
+					   (unsigned int) inode->i_nlink);
+	if (status < 0) {
+		/* This vote should succeed under all normal
+		 * circumstances. */
+		mlog_errno(status);
+		goto leave;
+	}
+	/* Last link is going away: prepare its orphan dir entry up front. */
+	if (!inode->i_nlink) {
+		status = ocfs2_prepare_orphan_dir(osb, handle, inode,
+						  orphan_name,
+						  &orphan_entry_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_UNLINK_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	if (!inode->i_nlink) {
+		status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
+					  orphan_entry_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	/* delete the name from the parent dir */
+	status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* We can set nlink on the dinode now. clear the saved version
+	 * so that it doesn't get set later. */
+	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	saved_nlink = 0;
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+	/* Removing a subdirectory drops the parent's link count too. */
+	if (S_ISDIR(inode->i_mode)) {
+		dir->i_nlink--;
+		status = ocfs2_mark_inode_dirty(handle, dir,
+						parent_node_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			dir->i_nlink++;
+		}
+	}
+
+leave:
+	if (status < 0 && saved_nlink)
+		inode->i_nlink = saved_nlink;
+
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (fe_bh)
+		brelse(fe_bh);
+
+	if (dirent_bh)
+		brelse(dirent_bh);
+
+	if (parent_node_bh)
+		brelse(parent_node_bh);
+
+	if (orphan_entry_bh)
+		brelse(orphan_entry_bh);
+
+	mlog_exit(status);
+
+	return status;
+}
+
+/*
+ * The only place this should be used is rename!
+ * If both inodes have the same id, then the 1st one is the only one locked.
+ */
+static int ocfs2_double_lock(struct ocfs2_super *osb,
+			     struct ocfs2_journal_handle *handle,
+			     struct buffer_head **bh1,
+			     struct inode *inode1,
+			     struct buffer_head **bh2,
+			     struct inode *inode2)
+{
+	int status;
+	struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
+	struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
+	struct buffer_head **tmpbh;
+	struct inode *tmpinode;
+
+	mlog_entry("(inode1 = %"MLFu64", inode2 = %"MLFu64")\n",
+		   oi1->ip_blkno, oi2->ip_blkno);
+
+	BUG_ON(!handle);
+	/* Stale bh pointers from the caller are cleared, not released. */
+	if (*bh1)
+		*bh1 = NULL;
+	if (*bh2)
+		*bh2 = NULL;
+
+	/* we always want to lock the one with the lower lockid first. */
+	if (oi1->ip_blkno != oi2->ip_blkno) {
+		if (oi1->ip_blkno < oi2->ip_blkno) {
+			/* switch id1 and id2 around */
+			mlog(0, "switching them around...\n");
+			tmpbh = bh2;
+			bh2 = bh1;
+			bh1 = tmpbh;
+
+			tmpinode = inode2;
+			inode2 = inode1;
+			inode1 = tmpinode;
+		}
+		/* lock id2, which by now is the inode with the lower blkno */
+		status = ocfs2_meta_lock(inode2, handle, bh2, 1);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			goto bail;
+		}
+	}
+	/* lock id1; when both ids match this is the only lock taken */
+	status = ocfs2_meta_lock(inode1, handle, bh1, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+#define PARENT_INO(buffer) /* inode field of the 2nd dirent in buffer */ \
+	((struct ocfs2_dir_entry *) \
+	 ((char *)(buffer) + \
+	  le16_to_cpu(((struct ocfs2_dir_entry *)(buffer))->rec_len)))->inode
+
+static int ocfs2_rename(struct inode *old_dir,
+			struct dentry *old_dentry,
+			struct inode *new_dir,
+			struct dentry *new_dentry)
+{
+	int status = 0, rename_lock = 0;
+	struct inode *old_inode = old_dentry->d_inode;
+	struct inode *new_inode = new_dentry->d_inode;
+	struct ocfs2_dinode *newfe = NULL;
+	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
+	struct buffer_head *orphan_entry_bh = NULL;
+	struct buffer_head *newfe_bh = NULL;
+	struct buffer_head *insert_entry_bh = NULL;
+	struct ocfs2_super *osb = NULL;
+	u64 newfe_blkno;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct buffer_head *old_dir_bh = NULL;
+	struct buffer_head *new_dir_bh = NULL;
+	struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry
+							       // and new_dentry
+	struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
+	struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
+						    // this is the 1st dirent bh
+	nlink_t old_dir_nlink = old_dir->i_nlink, new_dir_nlink = new_dir->i_nlink;
+	unsigned int links_count;
+
+	/* At some point it might be nice to break this function up a
+	 * bit. */
+
+	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n",
+		   old_dir, old_dentry, new_dir, new_dentry,
+		   old_dentry->d_name.len, old_dentry->d_name.name,
+		   new_dentry->d_name.len, new_dentry->d_name.name);
+
+	osb = OCFS2_SB(old_dir->i_sb);
+
+	if (new_inode) {
+		if (!igrab(new_inode))
+			BUG();
+	}
+
+	if (atomic_read(&old_dentry->d_count) > 2) {
+		shrink_dcache_parent(old_dentry);
+		if (atomic_read(&old_dentry->d_count) > 2) {
+			status = -EBUSY;
+			goto bail;
+		}
+	}
+
+	/* Assume a directory hierarchy thusly:
+	 * a/b/c
+	 * a/d
+	 * a,b,c, and d are all directories.
+	 *
+	 * from cwd of 'a' on both nodes:
+	 * node1: mv b/c d
+	 * node2: mv d   b/c
+	 *
+	 * And that's why, just like the VFS, we need a file system
+	 * rename lock. */
+	if (old_dentry != new_dentry) {
+		status = ocfs2_rename_lock(osb);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		rename_lock = 1;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* if old and new are the same, this'll just do one lock. */
+	status = ocfs2_double_lock(osb, handle,
+				  &old_dir_bh, old_dir,
+				  &new_dir_bh, new_dir);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* make sure both dirs have bhs
+	 * get an extra ref on old_dir_bh if old==new */
+	if (!new_dir_bh) {
+		if (old_dir_bh) {
+			new_dir_bh = old_dir_bh;
+			get_bh(new_dir_bh);
+		} else {
+			mlog(ML_ERROR, "no old_dir_bh!\n");
+			status = -EIO;
+			goto bail;
+		}
+	}
+
+	if (S_ISDIR(old_inode->i_mode)) {
+		/* Directories actually require metadata updates to
+		 * the directory info so we can't get away with not
+		 * doing node locking on it. */
+		status = ocfs2_meta_lock(old_inode, handle, NULL, 1);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			goto bail;
+		}
+
+		status = ocfs2_request_rename_vote(old_inode, old_dentry);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		status = -EIO;
+		old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0);
+		if (!old_inode_de_bh)
+			goto bail;
+
+		status = -EIO;
+		if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) !=
+		    OCFS2_I(old_dir)->ip_blkno)
+			goto bail;
+		status = -EMLINK;
+		if (!new_inode && new_dir!=old_dir &&
+		    new_dir->i_nlink >= OCFS2_LINK_MAX)
+			goto bail;
+	} else {
+		/* Ah, the simple case - we're a file so just send a
+		 * message. */
+		status = ocfs2_request_rename_vote(old_inode, old_dentry);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = -ENOENT;
+	old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
+				     old_dentry->d_name.len,
+				     old_dir, &old_de);
+	if (!old_de_bh)
+		goto bail;
+
+	/*
+	 *  Check for inode number is _not_ due to possible IO errors.
+	 *  We might rmdir the source, keep it as pwd of some process
+	 *  and merrily kill the link to whatever was created under the
+	 *  same name. Goodbye sticky bit ;-<
+	 */
+	if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno)
+		goto bail;
+
+	/* check if the target already exists (in which case we need
+	 * to delete it */
+	status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
+					  new_dentry->d_name.len,
+					  &newfe_blkno, new_dir, &new_de_bh,
+					  &new_de);
+	/* The only error we allow here is -ENOENT because the new
+	 * file not existing is perfectly valid. */
+	if ((status < 0) && (status != -ENOENT)) {
+		/* If we cannot find the file specified we should just */
+		/* return the error... */
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (!new_de && new_inode)
+		mlog(ML_ERROR, "inode %lu does not exist in it's parent "
+		     "directory!", new_inode->i_ino);
+
+	/* In case we need to overwrite an existing file, we blow it
+	 * away first */
+	if (new_de) {
+		/* VFS didn't think there existed an inode here, but
+		 * someone else in the cluster must have raced our
+		 * rename to create one. Today we error cleanly, in
+		 * the future we should consider calling iget to build
+		 * a new struct inode for this entry. */
+		if (!new_inode) {
+			status = -EACCES;
+
+			mlog(0, "We found an inode for name %.*s but VFS "
+			     "didn't give us one.\n", new_dentry->d_name.len,
+			     new_dentry->d_name.name);
+			goto bail;
+		}
+
+		if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
+			status = -EACCES;
+
+			mlog(0, "Inode blkno (%"MLFu64") and dir (%"MLFu64") "
+			     "disagree. ip_flags = %x\n",
+			     OCFS2_I(new_inode)->ip_blkno, newfe_blkno,
+			     OCFS2_I(new_inode)->ip_flags);
+			goto bail;
+		}
+
+		status = ocfs2_meta_lock(new_inode, handle, &newfe_bh, 1);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			goto bail;
+		}
+
+		if (S_ISDIR(new_inode->i_mode))
+			links_count = 0;
+		else
+			links_count = (unsigned int) (new_inode->i_nlink - 1);
+
+		status = ocfs2_request_unlink_vote(new_inode, new_dentry,
+						   links_count);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
+
+		mlog(0, "aha rename over existing... new_de=%p "
+		     "new_blkno=%"MLFu64" newfebh=%p bhblocknr=%llu\n",
+		     new_de, newfe_blkno, newfe_bh, newfe_bh ?
+		     (unsigned long long)newfe_bh->b_blocknr : 0ULL);
+
+		if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
+			status = ocfs2_prepare_orphan_dir(osb, handle,
+							  new_inode,
+							  orphan_name,
+							  &orphan_entry_bh);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+	} else {
+		BUG_ON(new_dentry->d_parent->d_inode != new_dir);
+
+		status = ocfs2_check_dir_for_entry(new_dir,
+						   new_dentry->d_name.name,
+						   new_dentry->d_name.len);
+		if (status)
+			goto bail;
+
+		status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
+						      new_dentry->d_name.name,
+						      new_dentry->d_name.len,
+						      &insert_entry_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_RENAME_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (new_de) {
+		if (S_ISDIR(new_inode->i_mode)) {
+			if (!ocfs2_empty_dir(new_inode) ||
+			    new_inode->i_nlink != 2) {
+				status = -ENOTEMPTY;
+				goto bail;
+			}
+		}
+		status = ocfs2_journal_access(handle, new_inode, newfe_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (S_ISDIR(new_inode->i_mode) ||
+		    (newfe->i_links_count == cpu_to_le16(1))){
+			status = ocfs2_orphan_add(osb, handle, new_inode,
+						  newfe, orphan_name,
+						  orphan_entry_bh);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+
+		/* change the dirent to point to the correct inode */
+		status = ocfs2_journal_access(handle, new_dir, new_de_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		new_de->inode = cpu_to_le64(OCFS2_I(old_inode)->ip_blkno);
+		new_de->file_type = old_de->file_type;
+		new_dir->i_version++;
+		status = ocfs2_journal_dirty(handle, new_de_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (S_ISDIR(new_inode->i_mode))
+			newfe->i_links_count = 0;
+		else
+			le16_add_cpu(&newfe->i_links_count, -1);
+
+		status = ocfs2_journal_dirty(handle, newfe_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	} else {
+		/* if the name was not found in new_dir, add it now */
+		status = ocfs2_add_entry(handle, new_dentry, old_inode,
+					 OCFS2_I(old_inode)->ip_blkno,
+					 new_dir_bh, insert_entry_bh);
+	}
+
+	old_inode->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(old_inode);
+
+	/* now that the name has been added to new_dir, remove the old name */
+	status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (new_inode) {
+		new_inode->i_nlink--;
+		new_inode->i_ctime = CURRENT_TIME;
+	}
+	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+	if (old_inode_de_bh) {
+		status = ocfs2_journal_access(handle, old_inode,
+					     old_inode_de_bh,
+					     OCFS2_JOURNAL_ACCESS_WRITE);
+		PARENT_INO(old_inode_de_bh->b_data) =
+			cpu_to_le64(OCFS2_I(new_dir)->ip_blkno);
+		status = ocfs2_journal_dirty(handle, old_inode_de_bh);
+		old_dir->i_nlink--;
+		if (new_inode) {
+			new_inode->i_nlink--;
+		} else {
+			new_dir->i_nlink++;
+			mark_inode_dirty(new_dir);
+		}
+	}
+	mark_inode_dirty(old_dir);
+	if (new_inode)
+		mark_inode_dirty(new_inode);
+
+	if (old_dir != new_dir)
+		if (new_dir_nlink != new_dir->i_nlink) {
+			if (!new_dir_bh) {
+				mlog(ML_ERROR, "need to change nlink for new "
+				     "dir %"MLFu64" from %d to %d but bh is "
+				     "NULL\n", OCFS2_I(new_dir)->ip_blkno,
+				     (int)new_dir_nlink, new_dir->i_nlink);
+			} else {
+				struct ocfs2_dinode *fe;
+				status = ocfs2_journal_access(handle,
+							      new_dir,
+							      new_dir_bh,
+							      OCFS2_JOURNAL_ACCESS_WRITE);
+				fe = (struct ocfs2_dinode *) new_dir_bh->b_data;
+				fe->i_links_count = cpu_to_le16(new_dir->i_nlink);
+				status = ocfs2_journal_dirty(handle, new_dir_bh);
+			}
+		}
+
+	if (old_dir_nlink != old_dir->i_nlink) {
+		if (!old_dir_bh) {
+			mlog(ML_ERROR, "need to change nlink for old dir "
+			     "%"MLFu64" from %d to %d but bh is NULL!\n",
+			     OCFS2_I(old_dir)->ip_blkno,
+			     (int)old_dir_nlink,
+			     old_dir->i_nlink);
+		} else {
+			struct ocfs2_dinode *fe;
+			status = ocfs2_journal_access(handle, old_dir,
+						      old_dir_bh,
+						      OCFS2_JOURNAL_ACCESS_WRITE);
+			fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
+			fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
+			status = ocfs2_journal_dirty(handle, old_dir_bh);
+		}
+	}
+
+	status = 0;
+bail:
+	if (rename_lock)
+		ocfs2_rename_unlock(osb);
+
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (new_inode)
+		sync_mapping_buffers(old_inode->i_mapping);
+
+	if (new_inode)
+		iput(new_inode);
+	if (newfe_bh)
+		brelse(newfe_bh);
+	if (old_dir_bh)
+		brelse(old_dir_bh);
+	if (new_dir_bh)
+		brelse(new_dir_bh);
+	if (new_de_bh)
+		brelse(new_de_bh);
+	if (old_de_bh)
+		brelse(old_de_bh);
+	if (old_inode_de_bh)
+		brelse(old_inode_de_bh);
+	if (orphan_entry_bh)
+		brelse(orphan_entry_bh);
+	if (insert_entry_bh)
+		brelse(insert_entry_bh);
+
+	mlog_exit(status);
+
+	return status;
+}
+
+/*
+ * Copy symname, null terminator included, into the file data of a
+ * symlink inode. We expect i_size = strlen(symname). Returns 0 or -errno.
+ */
+static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     struct inode *inode,
+				     const char *symname)
+{
+	struct buffer_head **bhs = NULL;
+	const char *c;
+	struct super_block *sb = osb->sb;
+	u64 p_blkno;
+	int p_blocks;
+	int virtual, blocks, status, i, bytes_left;
+
+	bytes_left = i_size_read(inode) + 1;	/* + 1: the terminator */
+	/* we can't trust i_blocks because we're actually going to
+	 * write i_size + 1 bytes. */
+	blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+
+	mlog_entry("i_blocks = %lu, i_size = %llu, blocks = %d\n",
+		       inode->i_blocks, i_size_read(inode), blocks);
+
+	/* Sanity check -- make sure we're going to fit. */
+	if (bytes_left >
+	    ocfs2_clusters_to_bytes(sb, OCFS2_I(inode)->ip_clusters)) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	bhs = kcalloc(blocks, sizeof(struct buffer_head *), GFP_KERNEL);
+	if (!bhs) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno,
+					     &p_blocks);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* links can never be larger than one cluster so we know this
+	 * is all going to be contiguous, but do a sanity check
+	 * anyway. */
+	if ((p_blocks << sb->s_blocksize_bits) < bytes_left) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	virtual = 0;
+	while(bytes_left > 0) {
+		c = &symname[virtual * sb->s_blocksize];
+
+		bhs[virtual] = sb_getblk(sb, p_blkno);
+		if (!bhs[virtual]) {
+			status = -ENOMEM;
+			mlog_errno(status);
+			goto bail;
+		}
+		ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]);
+
+		status = ocfs2_journal_access(handle, inode, bhs[virtual],
+					      OCFS2_JOURNAL_ACCESS_CREATE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		memset(bhs[virtual]->b_data, 0, sb->s_blocksize);
+		/* at most one block per pass; the tail stays zeroed */
+		memcpy(bhs[virtual]->b_data, c,
+		       (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
+		       bytes_left);
+
+		status = ocfs2_journal_dirty(handle, bhs[virtual]);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		virtual++;
+		p_blkno++;	/* blocks are physically contiguous */
+		bytes_left -= sb->s_blocksize;
+	}
+
+	status = 0;
+bail:
+	/* unused bhs[] slots are NULL thanks to kcalloc */
+	if (bhs) {
+		for(i = 0; i < blocks; i++)
+			if (bhs[i])
+				brelse(bhs[i]);
+		kfree(bhs);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_symlink(struct inode *dir,
+			 struct dentry *dentry,
+			 const char *symname)
+{
+	int status, l, credits;
+	u64 newsize;
+	struct ocfs2_super *osb = NULL;
+	struct inode *inode = NULL;
+	struct super_block *sb;
+	struct buffer_head *new_fe_bh = NULL;
+	struct buffer_head *de_bh = NULL;
+	struct buffer_head *parent_fe_bh = NULL;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_dinode *dirfe;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_alloc_context *inode_ac = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	/* ->symlink: create 'dentry' in 'dir' as a link to symname */
+	mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
+		   dentry, symname, dentry->d_name.len, dentry->d_name.name);
+
+	sb = dir->i_sb;
+	osb = OCFS2_SB(sb);
+
+	l = strlen(symname) + 1;	/* length incl. the terminator */
+
+	credits = ocfs2_calc_symlink_credits(sb);
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* lock the parent directory */
+	status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	if (!dirfe->i_links_count) {
+		/* can't make a file in a deleted directory. */
+		status = -ENOENT;
+		goto bail;
+	}
+
+	status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+					   dentry->d_name.len);
+	if (status)
+		goto bail;
+
+	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
+					      dentry->d_name.name,
+					      dentry->d_name.len, &de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_reserve_new_inode(osb, handle, &inode_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	/* don't reserve bitmap space for fast symlinks. */
+	if (l > ocfs2_fast_symlink_chars(sb)) {
+		status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, handle, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_mknod_locked(osb, dir, dentry,
+				    S_IFLNK | S_IRWXUGO, 0,
+				    &new_fe_bh, parent_fe_bh, handle,
+				    &inode, inode_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
+	inode->i_rdev = 0;
+	newsize = l - 1;	/* i_size excludes the terminator */
+	if (l > ocfs2_fast_symlink_chars(sb)) {
+		inode->i_op = &ocfs2_symlink_inode_operations;
+		status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh,
+						    handle, data_ac, NULL,
+						    NULL);
+		if (status < 0) {
+			if (status != -ENOSPC && status != -EINTR) {
+				mlog(ML_ERROR, "Failed to extend file to "
+					       "%"MLFu64"\n",
+				     newsize);
+				mlog_errno(status);
+				status = -ENOSPC;
+			}
+			goto bail;
+		}
+		i_size_write(inode, newsize);
+		inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize);
+	} else {	/* fast symlink: target lives in the dinode */
+		inode->i_op = &ocfs2_fast_symlink_inode_operations;
+		memcpy((char *) fe->id2.i_symlink, symname, l);
+		i_size_write(inode, newsize);
+		inode->i_blocks = 0;
+	}
+
+	status = ocfs2_mark_inode_dirty(handle, inode, new_fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (!ocfs2_inode_is_fast_symlink(inode)) {
+		status = ocfs2_create_symlink_data(osb, handle, inode,
+						   symname);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = ocfs2_add_entry(handle, dentry, inode,
+				 le64_to_cpu(fe->i_blkno), parent_fe_bh,
+				 de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	/* entry is in place: hash the inode and attach it to the dentry */
+	insert_inode_hash(inode);
+	dentry->d_op = &ocfs2_dentry_ops;
+	d_instantiate(dentry, inode);
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+	if (new_fe_bh)
+		brelse(new_fe_bh);
+	if (parent_fe_bh)
+		brelse(parent_fe_bh);
+	if (de_bh)
+		brelse(de_bh);
+	if (inode_ac)
+		ocfs2_free_alloc_context(inode_ac);
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if ((status < 0) && inode)
+		iput(inode);
+
+	mlog_exit(status);
+
+	return status;
+}
+
+/* Sanity-check one dirent inside bh: 1 if it is sane, 0 (and logged) if not */
+int ocfs2_check_dir_entry(struct inode * dir, struct ocfs2_dir_entry * de,
+			  struct buffer_head * bh, unsigned long offset)
+{
+	const int reclen = le16_to_cpu(de->rec_len);
+	const char *why;
+
+	if (reclen < OCFS2_DIR_REC_LEN(1))
+		why = "rec_len is smaller than minimal";
+	else if (reclen % 4 != 0)
+		why = "rec_len % 4 != 0";
+	else if (reclen < OCFS2_DIR_REC_LEN(de->name_len))
+		why = "rec_len is too small for name_len";
+	else if (((char *) de - bh->b_data) + reclen > dir->i_sb->s_blocksize)
+		why = "directory entry across blocks";
+	else
+		return 1;
+
+	mlog(ML_ERROR, "bad entry in directory #%"MLFu64": %s - "
+	     "offset=%lu, inode=%"MLFu64", rec_len=%d, name_len=%d\n",
+	     OCFS2_I(dir)->ip_blkno, why, offset,
+	     le64_to_cpu(de->inode), reclen, de->name_len);
+	return 0;
+}
+
+/* we don't always have a dentry for what we want to add, so people
+ * like orphan dir can call this instead.
+ *
+ * Only insert_bh is searched; the record goes in its first slot with
+ * enough room. Returns 0, -EINVAL, -ENOENT (bad entry), -EEXIST or a
+ * journal error. The block is only modified once journal access succeeds. */
+static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle,
+			     struct inode *dir,
+			     const char *name, int namelen,
+			     struct inode *inode, u64 blkno,
+			     struct buffer_head *parent_fe_bh,
+			     struct buffer_head *insert_bh)
+{
+	unsigned long offset;
+	unsigned short rec_len;
+	struct ocfs2_dir_entry *de, *de1;
+	struct super_block *sb;
+	int retval;
+
+	mlog_entry_void();
+
+	sb = dir->i_sb;
+
+	if (!namelen) {
+		retval = -EINVAL;
+		goto bail;
+	}
+
+	rec_len = OCFS2_DIR_REC_LEN(namelen);
+	offset = 0;
+	de = (struct ocfs2_dir_entry *) insert_bh->b_data;
+	while (1) {
+		BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data);
+		/* These checks should've already been passed by the
+		 * prepare function, but I guess we can leave them
+		 * here anyway. */
+		if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
+			retval = -ENOENT;
+			goto bail;
+		}
+		if (ocfs2_match(namelen, name, de)) {
+			retval = -EEXIST;
+			goto bail;
+		}
+		if (((le64_to_cpu(de->inode) == 0) &&
+		     (le16_to_cpu(de->rec_len) >= rec_len)) ||
+		    (le16_to_cpu(de->rec_len) >=
+		     (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
+			retval = ocfs2_journal_access(handle, dir, insert_bh,
+						      OCFS2_JOURNAL_ACCESS_WRITE);
+			if (retval < 0) {
+				mlog_errno(retval);
+				goto bail;
+			}
+			if (le64_to_cpu(de->inode)) {
+				de1 = (struct ocfs2_dir_entry *)((char *) de +
+					OCFS2_DIR_REC_LEN(de->name_len));
+				de1->rec_len =
+					cpu_to_le16(le16_to_cpu(de->rec_len) -
+					OCFS2_DIR_REC_LEN(de->name_len));
+				de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
+				de = de1;
+			}
+			de->file_type = OCFS2_FT_UNKNOWN;
+			if (blkno) {
+				de->inode = cpu_to_le64(blkno);
+				ocfs2_set_de_type(de, inode->i_mode);
+			} else
+				de->inode = 0;
+			de->name_len = namelen;
+			memcpy(de->name, name, namelen);
+
+			dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+			dir->i_version++;
+			retval = ocfs2_journal_dirty(handle, insert_bh);
+			goto bail;
+		}
+		offset += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
+	}
+
+bail:
+
+	mlog_exit(retval);
+	return retval;
+}
+
+
+/*
+ * ocfs2_delete_entry deletes de_del from bh by merging it with the previous
+ * entry, or zeroing its inode if it is first in bh. -ENOENT if not found.
+ */
+static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle,
+			      struct inode *dir,
+			      struct ocfs2_dir_entry *de_del,
+			      struct buffer_head *bh)
+{
+	struct ocfs2_dir_entry *de, *pde;
+	int i, status = -ENOENT;
+
+	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
+
+	i = 0;		/* byte offset of de inside bh */
+	pde = NULL;	/* entry preceding de, if any */
+	de = (struct ocfs2_dir_entry *) bh->b_data;
+	while (i < bh->b_size) {
+		if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
+			status = -EIO;
+			mlog_errno(status);
+			goto bail;
+		}
+		if (de == de_del)  {
+			status = ocfs2_journal_access(handle, dir, bh,
+						      OCFS2_JOURNAL_ACCESS_WRITE);
+			if (status < 0) {
+				status = -EIO;
+				mlog_errno(status);
+				goto bail;
+			}
+			if (pde)
+				pde->rec_len =
+					cpu_to_le16(le16_to_cpu(pde->rec_len) +
+						    le16_to_cpu(de->rec_len));
+			else
+				de->inode = 0;	/* first in block */
+			dir->i_version++;
+			status = ocfs2_journal_dirty(handle, bh);
+			goto bail;
+		}
+		i += le16_to_cpu(de->rec_len);
+		pde = de;
+		de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
+	}
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Returns 0 if not found, -1 on a bad entry, 1 on success (*res_dir is set)
+ */
+static inline int ocfs2_search_dirblock(struct buffer_head *bh,
+					struct inode *dir,
+					const char *name, int namelen,
+					unsigned long offset,
+					struct ocfs2_dir_entry **res_dir)
+{
+	struct ocfs2_dir_entry *de;
+	char *dlimit, *de_buf;
+	int de_len;
+	int ret = 0;
+
+	mlog_entry_void();
+
+	de_buf = bh->b_data;
+	dlimit = de_buf + dir->i_sb->s_blocksize;
+
+	while (de_buf < dlimit) {
+		/* this code is executed quadratically often */
+		/* do minimal checking `by hand' */
+
+		de = (struct ocfs2_dir_entry *) de_buf;
+
+		if (de_buf + namelen <= dlimit &&
+		    ocfs2_match(namelen, name, de)) {
+			/* found a match - just to be sure, do a full check */
+			if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
+				ret = -1;
+				goto bail;
+			}
+			*res_dir = de;
+			ret = 1;
+			goto bail;
+		}
+
+		/* prevent looping on a bad block */
+		de_len = le16_to_cpu(de->rec_len);
+		if (de_len <= 0) {
+			ret = -1;
+			goto bail;
+		}
+
+		de_buf += de_len;
+		offset += de_len;
+	}
+
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
+				     struct inode *dir,
+				     struct ocfs2_dir_entry **res_dir)
+{
+	struct super_block *sb;
+	struct buffer_head *bh_use[NAMEI_RA_SIZE];
+	struct buffer_head *bh, *ret = NULL;
+	unsigned long start, block, b;
+	int ra_max = 0;		/* Number of bh's in the readahead
+				   buffer, bh_use[] */
+	int ra_ptr = 0;		/* Current index into readahead
+				   buffer */
+	int num = 0;
+	int nblocks, i, err;
+
+	mlog_entry_void();
+	/* on a hit, returns the dir block's bh and points *res_dir into it */
+	*res_dir = NULL;
+	sb = dir->i_sb;
+
+	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
+	start = OCFS2_I(dir)->ip_dir_start_lookup;
+	if (start >= nblocks)
+		start = 0;
+	block = start;	/* start where the last lookup hit */
+
+restart:
+	do {
+		/*
+		 * We deal with the read-ahead logic here.
+		 */
+		if (ra_ptr >= ra_max) {
+			/* Refill the readahead buffer */
+			ra_ptr = 0;
+			b = block;
+			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
+				/*
+				 * Terminate if we reach the end of the
+				 * directory and must wrap, or if our
+				 * search has finished at this block.
+				 */
+				if (b >= nblocks || (num && block == start)) {
+					bh_use[ra_max] = NULL;
+					break;
+				}
+				num++;
+
+				/* XXX: questionable readahead stuff here */
+				bh = ocfs2_bread(dir, b++, &err, 1);
+				bh_use[ra_max] = bh;
+#if 0		// ???
+				if (bh)
+					ll_rw_block(READ, 1, &bh);
+#endif
+			}
+		}
+		if ((bh = bh_use[ra_ptr++]) == NULL)
+			goto next;
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh)) {
+			/* read error, skip block & hope for the best */
+			brelse(bh);
+			goto next;
+		}
+		i = ocfs2_search_dirblock(bh, dir, name, namelen,
+					  block << sb->s_blocksize_bits,
+					  res_dir);
+		if (i == 1) {
+			OCFS2_I(dir)->ip_dir_start_lookup = block;
+			ret = bh;
+			goto cleanup_and_exit;
+		} else {
+			brelse(bh);
+			if (i < 0)
+				goto cleanup_and_exit;
+		}
+	next:
+		if (++block >= nblocks)
+			block = 0;	/* wrap to the first block */
+	} while (block != start);
+
+	/*
+	 * If the directory has grown while we were searching, then
+	 * search the last part of the directory before giving up.
+	 */
+	block = nblocks;
+	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
+	if (block < nblocks) {
+		start = 0;
+		goto restart;
+	}
+
+cleanup_and_exit:
+	/* Clean up the read-ahead blocks */
+	for (; ra_ptr < ra_max; ra_ptr++)
+		brelse(bh_use[ra_ptr]);
+
+	mlog_exit_ptr(ret);
+	return ret;
+}
+
+/*
+ * Render blkno as the fixed-width, zero-padded hex name used for orphan
+ * dir entries. name needs OCFS2_ORPHAN_NAMELEN + 1 bytes. 0 or -errno.
+ */
+static int ocfs2_blkno_stringify(u64 blkno, char *name)
+{
+	int written, status = 0;
+
+	mlog_entry_void();
+
+	written = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016"MLFx64,
+			   blkno);
+
+	if (written < 0)
+		status = written;
+	else if (written != OCFS2_ORPHAN_NAMELEN)
+		status = -EINVAL;
+
+	if (status) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name,
+	     written);
+
+out:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
+				    struct ocfs2_journal_handle *handle,
+				    struct inode *inode,
+				    char *name,
+				    struct buffer_head **de_bh)
+{
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	int status = 0;
+	/* build inode's orphan name, lock our slot's orphan dir, set *de_bh */
+	status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       osb->slot_num);
+	if (!orphan_dir_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	ocfs2_handle_add_inode(handle, orphan_dir_inode);
+	status = ocfs2_meta_lock(orphan_dir_inode, handle, &orphan_dir_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
+					      orphan_dir_bh, name,
+					      OCFS2_ORPHAN_NAMELEN, de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+leave:
+	if (orphan_dir_inode)
+		iput(orphan_dir_inode);
+
+	if (orphan_dir_bh)
+		brelse(orphan_dir_bh);
+	/* NOTE(review): mlog_exit() here has no matching mlog_entry() */
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_orphan_add(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct inode *inode,
+			    struct ocfs2_dinode *fe,
+			    char *name,
+			    struct buffer_head *de_bh)
+{
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	int status = 0;
+	struct ocfs2_dinode *orphan_fe;
+
+	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
+	/* link inode into this slot's orphan dir and flag its dinode (fe) */
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       osb->slot_num);
+	if (!orphan_dir_inode) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_read_block(osb,
+				  OCFS2_I(orphan_dir_inode)->ip_blkno,
+				  &orphan_dir_bh, OCFS2_BH_CACHED,
+				  orphan_dir_inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* we're a cluster, and nlink can change on disk from
+	 * underneath us... */
+	orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
+	if (S_ISDIR(inode->i_mode))
+		le16_add_cpu(&orphan_fe->i_links_count, 1);
+	orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
+
+	status = ocfs2_journal_dirty(handle, orphan_dir_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
+				   OCFS2_ORPHAN_NAMELEN, inode,
+				   OCFS2_I(inode)->ip_blkno,
+				   orphan_dir_bh, de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+	/* OR the flag in: unlike an add, this is safe if it's already set */
+	fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
+
+	/* Record which orphan dir our inode now resides
+	 * in. delete_inode will use this to determine which orphan
+	 * dir to lock. */
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	mlog(0, "Inode %"MLFu64" orphaned in slot %d\n",
+	     OCFS2_I(inode)->ip_blkno, osb->slot_num);
+
+leave:
+	if (orphan_dir_inode)
+		iput(orphan_dir_inode);
+
+	if (orphan_dir_bh)
+		brelse(orphan_dir_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* unlike orphan_add, we expect the orphan dir to already be locked here. */
+int ocfs2_orphan_del(struct ocfs2_super *osb,
+		     struct ocfs2_journal_handle *handle,
+		     struct inode *orphan_dir_inode,
+		     struct inode *inode,
+		     struct buffer_head *orphan_dir_bh)
+{
+	char name[OCFS2_ORPHAN_NAMELEN + 1];
+	struct ocfs2_dinode *orphan_fe;
+	int status = 0;
+	struct buffer_head *target_de_bh = NULL;
+	struct ocfs2_dir_entry *target_de = NULL;
+
+	mlog_entry_void();
+
+	status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	mlog(0, "removing '%s' from orphan dir %"MLFu64" (namelen=%d)\n",
+	     name, OCFS2_I(orphan_dir_inode)->ip_blkno, OCFS2_ORPHAN_NAMELEN);
+
+	/* find its spot in the orphan directory */
+	target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN,
+					orphan_dir_inode, &target_de);
+	if (!target_de_bh) {
+		status = -ENOENT;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* remove it from the orphan directory */
+	status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de,
+				    target_de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_access(handle,orphan_dir_inode,  orphan_dir_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* an orphaned directory holds a link on the orphan dir: drop it */
+	orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
+	if (S_ISDIR(inode->i_mode))
+		le16_add_cpu(&orphan_fe->i_links_count, -1);
+	orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
+
+	status = ocfs2_journal_dirty(handle, orphan_dir_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+leave:
+	if (target_de_bh)
+		brelse(target_de_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+struct inode_operations ocfs2_dir_iops = {	/* ops for directory inodes */
+	.create		= ocfs2_create,
+	.lookup		= ocfs2_lookup,
+	.link		= ocfs2_link,
+	.unlink		= ocfs2_unlink,
+	.rmdir		= ocfs2_unlink,	/* rmdir shares the unlink path */
+	.symlink	= ocfs2_symlink,
+	.mkdir		= ocfs2_mkdir,
+	.mknod		= ocfs2_mknod,
+	.rename		= ocfs2_rename,
+	.setattr	= ocfs2_setattr,
+	.getattr	= ocfs2_getattr,
+};
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
new file mode 100644
index 0000000..deaaa97
--- /dev/null
+++ b/fs/ocfs2/namei.h
@@ -0,0 +1,58 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * namei.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_NAMEI_H
+#define OCFS2_NAMEI_H
+
+extern struct inode_operations ocfs2_dir_iops;
+
+struct dentry *ocfs2_get_parent(struct dentry *child);
+
+int ocfs2_check_dir_entry (struct inode *dir,
+			   struct ocfs2_dir_entry *de,
+			   struct buffer_head *bh,
+			   unsigned long offset);
+struct buffer_head *ocfs2_find_entry(const char *name,
+				     int namelen,
+				     struct inode *dir,
+				     struct ocfs2_dir_entry **res_dir);
+int ocfs2_orphan_del(struct ocfs2_super *osb,
+		     struct ocfs2_journal_handle *handle,
+		     struct inode *orphan_dir_inode,
+		     struct inode *inode,
+		     struct buffer_head *orphan_dir_bh);
+
+/* Nonzero iff de is in use (inode != 0) and its name equals name[0..len) */
+static inline int ocfs2_match(int len, const char * const name,
+			      struct ocfs2_dir_entry *de)
+{
+	/* cheap rejections first: wrong length, or an unused slot */
+	if (len != de->name_len || !de->inode)
+		return 0;
+
+	return memcmp(name, de->name, len) == 0;
+}
+
+#endif /* OCFS2_NAMEI_H */
diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h
new file mode 100644
index 0000000..0b499bc
--- /dev/null
+++ b/fs/ocfs2/ocfs1_fs_compat.h
@@ -0,0 +1,109 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs1_fs_compat.h
+ *
+ * OCFS1 volume header definitions.  OCFS2 creates valid but unmountable
+ * OCFS1 volume headers on the first two sectors of an OCFS2 volume.
+ * This allows an OCFS1 volume to see the partition and cleanly fail to
+ * mount it.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef _OCFS1_FS_COMPAT_H
+#define _OCFS1_FS_COMPAT_H
+
+#define OCFS1_MAX_VOL_SIGNATURE_LEN          128
+#define OCFS1_MAX_MOUNT_POINT_LEN            128
+#define OCFS1_MAX_VOL_ID_LENGTH               16
+#define OCFS1_MAX_VOL_LABEL_LEN               64
+#define OCFS1_MAX_CLUSTER_NAME_LEN            64
+
+#define OCFS1_MAJOR_VERSION              (2)
+#define OCFS1_MINOR_VERSION              (0)
+#define OCFS1_VOLUME_SIGNATURE		 "OracleCFS"
+
+/*
+ * OCFS1 superblock.  Lives at sector 0.  Hex byte offsets are noted at left.
+ */
+struct ocfs1_vol_disk_hdr
+{
+/*00*/	__u32 minor_version;
+	__u32 major_version;
+/*08*/	__u8 signature[OCFS1_MAX_VOL_SIGNATURE_LEN];
+/*88*/	__u8 mount_point[OCFS1_MAX_MOUNT_POINT_LEN];
+/*108*/	__u64 serial_num;
+/*110*/	__u64 device_size;
+	__u64 start_off;
+/*120*/	__u64 bitmap_off;
+	__u64 publ_off;
+/*130*/	__u64 vote_off;
+	__u64 root_bitmap_off;
+/*140*/	__u64 data_start_off;
+	__u64 root_bitmap_size;
+/*150*/	__u64 root_off;
+	__u64 root_size;
+/*160*/	__u64 cluster_size;
+	__u64 num_nodes;
+/*170*/	__u64 num_clusters;
+	__u64 dir_node_size;
+/*180*/	__u64 file_node_size;
+	__u64 internal_off;
+/*190*/	__u64 node_cfg_off;
+	__u64 node_cfg_size;
+/*1A0*/	__u64 new_cfg_off;
+	__u32 prot_bits;
+	__s32 excl_mount;
+/*1B0*/
+};
+
+
+struct ocfs1_disk_lock
+{
+/*00*/	__u32 curr_master;
+	__u8 file_lock;
+	__u8 compat_pad[3];  /* Not in original definition.  Used to
+				make the already existing alignment
+				explicit */
+	__u64 last_write_time;
+/*10*/	__u64 last_read_time;
+	__u32 writer_node_num;
+	__u32 reader_node_num;
+/*20*/	__u64 oin_node_map;
+	__u64 dlock_seq_num;
+/*30*/
+};
+
+/*
+ * OCFS1 volume label.  Lives at sector 1.
+ */
+struct ocfs1_vol_label
+{
+/*00*/	struct ocfs1_disk_lock disk_lock;
+/*30*/	__u8 label[OCFS1_MAX_VOL_LABEL_LEN];
+/*70*/	__u16 label_len;
+/*72*/	__u8 vol_id[OCFS1_MAX_VOL_ID_LENGTH];
+/*82*/	__u16 vol_id_len;
+/*84*/	__u8 cluster_name[OCFS1_MAX_CLUSTER_NAME_LEN];
+/*C4*/	__u16 cluster_name_len;
+/*C6*/
+};
+
+
+#endif /* _OCFS1_FS_COMPAT_H */
+
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
new file mode 100644
index 0000000..f468c60
--- /dev/null
+++ b/fs/ocfs2/ocfs2.h
@@ -0,0 +1,464 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2.h
+ *
+ * Defines macros and structures used in OCFS2
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_H
+#define OCFS2_H
+
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/workqueue.h>
+#include <linux/kref.h>
+
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
+
+#include "dlm/dlmapi.h"
+
+#include "ocfs2_fs.h"
+#include "endian.h"
+#include "ocfs2_lockid.h"
+
+struct ocfs2_extent_map {
+	u32		em_clusters;
+	struct rb_root	em_extents;
+};
+
+/* Most user visible OCFS2 inodes will have very few pieces of
+ * metadata, but larger files (including bitmaps, etc) must be taken
+ * into account when designing an access scheme. We allow a small
+ * amount of inlined blocks to be stored on an array and grow the
+ * structure into a rb tree when necessary. */
+#define OCFS2_INODE_MAX_CACHE_ARRAY 2
+
+struct ocfs2_caching_info {
+	unsigned int		ci_num_cached;
+	union {
+		sector_t	ci_array[OCFS2_INODE_MAX_CACHE_ARRAY];
+		struct rb_root	ci_tree;
+	} ci_cache;
+};
+
+/* this limits us to 256 nodes
+ * if we need more, we can do a kmalloc for the map */
+#define OCFS2_NODE_MAP_MAX_NODES    256
+struct ocfs2_node_map {
+	u16 num_nodes;
+	unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)];
+};
+
+enum ocfs2_ast_action {
+	OCFS2_AST_INVALID = 0,
+	OCFS2_AST_ATTACH,
+	OCFS2_AST_CONVERT,
+	OCFS2_AST_DOWNCONVERT,
+};
+
+/* actions for an unlockast function to take. */
+enum ocfs2_unlock_action {
+	OCFS2_UNLOCK_INVALID = 0,
+	OCFS2_UNLOCK_CANCEL_CONVERT,
+	OCFS2_UNLOCK_DROP_LOCK,
+};
+
+/* ocfs2_lock_res->l_flags flags. */
+#define OCFS2_LOCK_ATTACHED      (0x00000001) /* have we initialized
+					       * the lvb */
+#define OCFS2_LOCK_BUSY          (0x00000002) /* we are currently in
+					       * dlm_lock */
+#define OCFS2_LOCK_BLOCKED       (0x00000004) /* blocked waiting to
+					       * downconvert*/
+#define OCFS2_LOCK_LOCAL         (0x00000008) /* newly created inode */
+#define OCFS2_LOCK_NEEDS_REFRESH (0x00000010)
+#define OCFS2_LOCK_REFRESHING    (0x00000020)
+#define OCFS2_LOCK_INITIALIZED   (0x00000040) /* track initialization
+					       * for shutdown paths */
+#define OCFS2_LOCK_FREEING       (0x00000080) /* help dlmglue track
+					       * when to skip queueing
+					       * a lock because it's
+					       * about to be
+					       * dropped. */
+#define OCFS2_LOCK_QUEUED        (0x00000100) /* queued for downconvert */
+
+struct ocfs2_lock_res_ops;
+
+typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
+
+struct ocfs2_lock_res {
+	void                    *l_priv;
+	struct ocfs2_lock_res_ops *l_ops;
+	spinlock_t               l_lock;
+
+	struct list_head         l_blocked_list;
+	struct list_head         l_mask_waiters;
+
+	enum ocfs2_lock_type     l_type;
+	unsigned long		 l_flags;
+	char                     l_name[OCFS2_LOCK_ID_MAX_LEN];
+	int                      l_level;
+	unsigned int             l_ro_holders;
+	unsigned int             l_ex_holders;
+	struct dlm_lockstatus    l_lksb;
+
+	/* used from AST/BAST funcs. */
+	enum ocfs2_ast_action    l_action;
+	enum ocfs2_unlock_action l_unlock_action;
+	int                      l_requested;
+	int                      l_blocking;
+
+	wait_queue_head_t        l_event;
+
+	struct list_head         l_debug_list;
+};
+
+struct ocfs2_dlm_debug {
+	struct kref d_refcnt;
+	struct dentry *d_locking_state;
+	struct list_head d_lockres_tracking;
+};
+
+enum ocfs2_vol_state
+{
+	VOLUME_INIT = 0,
+	VOLUME_MOUNTED,
+	VOLUME_DISMOUNTED,
+	VOLUME_DISABLED
+};
+
+struct ocfs2_alloc_stats
+{
+	atomic_t moves;
+	atomic_t local_data;
+	atomic_t bitmap_data;
+	atomic_t bg_allocs;
+	atomic_t bg_extends;
+};
+
+enum ocfs2_local_alloc_state
+{
+	OCFS2_LA_UNUSED = 0,
+	OCFS2_LA_ENABLED,
+	OCFS2_LA_DISABLED
+};
+
+enum ocfs2_mount_options
+{
+	OCFS2_MOUNT_HB_LOCAL   = 1 << 0, /* Heartbeat started in local mode */
+	OCFS2_MOUNT_BARRIER = 1 << 1,	/* Use block barriers */
+	OCFS2_MOUNT_NOINTR  = 1 << 2,   /* Don't catch signals */
+	OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
+	OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
+#ifdef OCFS2_ORACORE_WORKAROUNDS
+	OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */
+#endif
+};
+
+#define OCFS2_OSB_SOFT_RO	0x0001
+#define OCFS2_OSB_HARD_RO	0x0002
+#define OCFS2_OSB_ERROR_FS	0x0004
+
+struct ocfs2_journal;
+struct ocfs2_journal_handle;
+struct ocfs2_super
+{
+	u32 osb_id;		/* id used by the proc interface */
+	struct task_struct *commit_task;
+	struct super_block *sb;
+	struct inode *root_inode;
+	struct inode *sys_root_inode;
+	struct inode *system_inodes[NUM_SYSTEM_INODES];
+
+	struct ocfs2_slot_info *slot_info;
+
+	spinlock_t node_map_lock;
+	struct ocfs2_node_map mounted_map;
+	struct ocfs2_node_map recovery_map;
+	struct ocfs2_node_map umount_map;
+
+	u32 num_clusters;
+	u64 root_blkno;
+	u64 system_dir_blkno;
+	u64 bitmap_blkno;
+	u32 bitmap_cpg;
+	u8 *uuid;
+	char *uuid_str;
+	u8 *vol_label;
+	u64 first_cluster_group_blkno;
+	u32 fs_generation;
+
+	u32 s_feature_compat;
+	u32 s_feature_incompat;
+	u32 s_feature_ro_compat;
+
+	/* Protects s_next_generation, osb_flags. Could protect more on
+	 * osb as it's very short lived. */
+	spinlock_t osb_lock;
+	u32 s_next_generation;
+	unsigned long osb_flags;
+
+	unsigned long s_mount_opt;
+
+	u16 max_slots;
+	u16 num_nodes;
+	s16 node_num;
+	s16 slot_num;
+	int s_sectsize_bits;
+	int s_clustersize;
+	int s_clustersize_bits;
+	struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */
+
+	atomic_t vol_state;
+	struct semaphore recovery_lock;
+	struct task_struct *recovery_thread_task;
+	int disable_recovery;
+	wait_queue_head_t checkpoint_event;
+	atomic_t needs_checkpoint;
+	struct ocfs2_journal *journal;
+
+	enum ocfs2_local_alloc_state local_alloc_state;
+	struct buffer_head *local_alloc_bh;
+
+	/* Next two fields are for local node slot recovery during
+	 * mount. */
+	int dirty;
+	struct ocfs2_dinode *local_alloc_copy;
+
+	struct ocfs2_alloc_stats alloc_stats;
+	char dev_str[20];		/* "major,minor" of the device */
+
+	struct dlm_ctxt *dlm;
+	struct ocfs2_lock_res osb_super_lockres;
+	struct ocfs2_lock_res osb_rename_lockres;
+	struct dlm_eviction_cb osb_eviction_cb;
+	struct ocfs2_dlm_debug *osb_dlm_debug;
+
+	struct dentry *osb_debug_root;
+
+	wait_queue_head_t recovery_event;
+
+	spinlock_t vote_task_lock;
+	struct task_struct *vote_task;
+	wait_queue_head_t vote_event;
+	unsigned long vote_wake_sequence;
+	unsigned long vote_work_sequence;
+
+	struct list_head blocked_lock_list;
+	unsigned long blocked_lock_count;
+
+	struct list_head vote_list;
+	int vote_count;
+
+	u32 net_key;
+	spinlock_t net_response_lock;
+	unsigned int net_response_ids;
+	struct list_head net_response_list;
+
+	struct o2hb_callback_func osb_hb_up;
+	struct o2hb_callback_func osb_hb_down;
+
+	struct list_head	osb_net_handlers;
+
+	wait_queue_head_t		osb_mount_event;
+
+	/* Truncate log info */
+	struct inode			*osb_tl_inode;
+	struct buffer_head		*osb_tl_bh;
+	struct work_struct		osb_truncate_log_wq;
+};
+
+#define OCFS2_SB(sb)	    ((struct ocfs2_super *)(sb)->s_fs_info)
+#define OCFS2_MAX_OSB_ID             65536
+
+static inline int ocfs2_should_order_data(struct inode *inode)
+{
+	/* Yes (1) only for a regular file on a volume whose mount options
+	 * do not include OCFS2_MOUNT_DATA_WRITEBACK; otherwise no (0). */
+	return S_ISREG(inode->i_mode) &&
+	       !(OCFS2_SB(inode->i_sb)->s_mount_opt &
+		 OCFS2_MOUNT_DATA_WRITEBACK);
+}
+
+/* set / clear functions because cluster events can make these happen
+ * in parallel so we want the transitions to be atomic. this also
+ * means that any future flags osb_flags must be protected by spinlock
+ * too! */
+static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
+				      unsigned long flag)
+{
+	spin_lock(&osb->osb_lock);
+	osb->osb_flags |= flag;
+	spin_unlock(&osb->osb_lock);
+}
+
+static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
+				     int hard)
+{
+	/* Exactly one of the two read-only bits is left set afterwards. */
+	unsigned long ro_bit = hard ? OCFS2_OSB_HARD_RO : OCFS2_OSB_SOFT_RO;
+
+	spin_lock(&osb->osb_lock);
+	osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO);
+	osb->osb_flags |= ro_bit;
+	spin_unlock(&osb->osb_lock);
+}
+
+static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb)
+{
+	int ret;
+
+	spin_lock(&osb->osb_lock);
+	ret = osb->osb_flags & OCFS2_OSB_HARD_RO;
+	spin_unlock(&osb->osb_lock);
+
+	return ret;
+}
+
+static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
+{
+	int ret;
+
+	spin_lock(&osb->osb_lock);
+	ret = osb->osb_flags & OCFS2_OSB_SOFT_RO;
+	spin_unlock(&osb->osb_lock);
+
+	return ret;
+}
+
+#define OCFS2_IS_VALID_DINODE(ptr)					\
+	(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
+
+#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di)	do {			\
+	typeof(__di) ____di = (__di);	/* evaluate __di once */	\
+	ocfs2_error((__sb),						\
+		"Dinode # %"MLFu64" has bad signature %.*s",		\
+		le64_to_cpu((____di)->i_blkno), 7, /* __le64 on disk */	\
+		(____di)->i_signature);					\
+} while (0)	/* no ';' here: the caller's own ';' ends the statement */
+
+#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr)				\
+	(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
+
+#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb)	do {		\
+	typeof(__eb) ____eb = (__eb);	/* evaluate __eb once */	\
+	ocfs2_error((__sb),						\
+		"Extent Block # %"MLFu64" has bad signature %.*s",	\
+		le64_to_cpu((____eb)->h_blkno), 7, /* __le64 on disk */	\
+		(____eb)->h_signature);					\
+} while (0)	/* no ';' here: the caller's own ';' ends the statement */
+
+#define OCFS2_IS_VALID_GROUP_DESC(ptr)					\
+	(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
+
+#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd)	do {		\
+	typeof(__gd) ____gd = (__gd);	/* evaluate __gd once */	\
+	ocfs2_error((__sb),						\
+		"Group Descriptor # %"MLFu64" has bad signature %.*s",	\
+		le64_to_cpu((____gd)->bg_blkno), 7, /* __le64 on disk */ \
+		(____gd)->bg_signature);				\
+} while (0)	/* no ';' here: the caller's own ';' ends the statement */
+
+static inline unsigned long ino_from_blkno(struct super_block *sb,
+					   u64 blkno)
+{
+	return (unsigned long)(blkno & (u64)ULONG_MAX);
+}
+
+static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
+					   u32 clusters)
+{
+	int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits -
+		sb->s_blocksize_bits;
+
+	return (u64)clusters << c_to_b_bits;
+}
+
+static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb,
+					   u64 blocks)
+{
+	int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits -
+		sb->s_blocksize_bits;
+
+	return (u32)(blocks >> b_to_c_bits);
+}
+
+static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
+						    u64 bytes)
+{
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	/* Adding one byte less than a full cluster first makes the
+	 * shift below round up instead of down. */
+	bytes += osb->s_clustersize - 1;
+
+	/* OCFS2 just cannot have enough clusters to overflow this */
+	return (unsigned int)(bytes >> osb->s_clustersize_bits);
+}
+
+static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
+					 u64 bytes)
+{
+	bytes += sb->s_blocksize - 1;
+	return bytes >> sb->s_blocksize_bits;
+}
+
+static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
+					  u32 clusters)
+{
+	return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
+}
+
+static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
+						u64 bytes)
+{
+	/* Round up to whole clusters, then scale back to a byte count. */
+	unsigned int nr_clusters = ocfs2_clusters_for_bytes(sb, bytes);
+	int shift = OCFS2_SB(sb)->s_clustersize_bits;
+
+	return (u64)nr_clusters << shift;
+}
+
+static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb,
+					      u64 bytes)
+{
+	/* Round up to whole blocks, then scale back to a byte count. */
+	u64 nr_blocks = ocfs2_blocks_for_bytes(sb, bytes);
+
+	return nr_blocks << sb->s_blocksize_bits;
+}
+
+static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
+{
+	return (unsigned long)((bytes + 511) >> 9); /* sectors, rounded up */
+}
+
+#define ocfs2_set_bit ext2_set_bit
+#define ocfs2_clear_bit ext2_clear_bit
+#define ocfs2_test_bit ext2_test_bit
+#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
+#endif  /* OCFS2_H */
+
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
new file mode 100644
index 0000000..dfb8a5b
--- /dev/null
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -0,0 +1,638 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_fs.h
+ *
+ * On-disk structures for OCFS2.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2,  as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef _OCFS2_FS_H
+#define _OCFS2_FS_H
+
+/* Version */
+#define OCFS2_MAJOR_REV_LEVEL		0
+#define OCFS2_MINOR_REV_LEVEL          	90
+
+/*
+ * An OCFS2 volume starts this way:
+ * Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS.
+ * Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS.
+ * Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock.
+ *
+ * All other structures are found from the superblock information.
+ *
+ * OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors.  eg, for a
+ * blocksize of 2K, it is 4096 bytes into disk.
+ */
+#define OCFS2_SUPER_BLOCK_BLKNO		2
+
+/*
+ * Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could
+ * grow if needed.
+ */
+#define OCFS2_MIN_CLUSTERSIZE		4096
+#define OCFS2_MAX_CLUSTERSIZE		1048576
+
+/*
+ * Blocks cannot be bigger than clusters, so the maximum blocksize is the
+ * minimum cluster size.
+ */
+#define OCFS2_MIN_BLOCKSIZE		512
+#define OCFS2_MAX_BLOCKSIZE		OCFS2_MIN_CLUSTERSIZE
+
+/* Filesystem magic number */
+#define OCFS2_SUPER_MAGIC		0x7461636f
+
+/* Object signatures */
+#define OCFS2_SUPER_BLOCK_SIGNATURE	"OCFSV2"
+#define OCFS2_INODE_SIGNATURE		"INODE01"
+#define OCFS2_EXTENT_BLOCK_SIGNATURE	"EXBLK01"
+#define OCFS2_GROUP_DESC_SIGNATURE      "GROUP01"
+
+/* Compatibility flags */
+#define OCFS2_HAS_COMPAT_FEATURE(sb,mask)			\
+	( OCFS2_SB(sb)->s_feature_compat & (mask) )
+#define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask)			\
+	( OCFS2_SB(sb)->s_feature_ro_compat & (mask) )
+#define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask)			\
+	( OCFS2_SB(sb)->s_feature_incompat & (mask) )
+#define OCFS2_SET_COMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_compat |= (mask)
+#define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_ro_compat |= (mask)
+#define OCFS2_SET_INCOMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_incompat |= (mask)
+#define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_compat &= ~(mask)
+#define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask)
+#define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask)			\
+	OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
+
+#define OCFS2_FEATURE_COMPAT_SUPP	0
+#define OCFS2_FEATURE_INCOMPAT_SUPP	0
+#define OCFS2_FEATURE_RO_COMPAT_SUPP	0
+
+/*
+ * Heartbeat-only devices are missing journals and other files.  The
+ * filesystem driver can't load them, but the library can.  Never put
+ * this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*.
+ */
+#define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV	0x0002
+
+
+/*
+ * Flags on ocfs2_dinode.i_flags
+ */
+#define OCFS2_VALID_FL		(0x00000001)	/* Inode is valid */
+#define OCFS2_UNUSED2_FL	(0x00000002)
+#define OCFS2_ORPHANED_FL	(0x00000004)	/* On the orphan list */
+#define OCFS2_UNUSED3_FL	(0x00000008)
+/* System inode flags */
+#define OCFS2_SYSTEM_FL		(0x00000010)	/* System inode */
+#define OCFS2_SUPER_BLOCK_FL	(0x00000020)	/* Super block */
+#define OCFS2_LOCAL_ALLOC_FL	(0x00000040)	/* Slot local alloc bitmap */
+#define OCFS2_BITMAP_FL		(0x00000080)	/* Allocation bitmap */
+#define OCFS2_JOURNAL_FL	(0x00000100)	/* Slot local journal */
+#define OCFS2_HEARTBEAT_FL	(0x00000200)	/* Heartbeat area */
+#define OCFS2_CHAIN_FL		(0x00000400)	/* Chain allocator */
+#define OCFS2_DEALLOC_FL	(0x00000800)	/* Truncate log */
+
+/*
+ * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
+ */
+#define OCFS2_JOURNAL_DIRTY_FL	(0x00000001)	/* Journal needs recovery */
+
+/*
+ * superblock s_state flags
+ */
+#define OCFS2_ERROR_FS		(0x00000001)	/* FS saw errors */
+
+/* Limit of space in ocfs2_dir_entry */
+#define OCFS2_MAX_FILENAME_LEN		255
+
+/* Maximum slots on an ocfs2 file system */
+#define OCFS2_MAX_SLOTS			255
+
+/* Slot map indicator for an empty slot */
+#define OCFS2_INVALID_SLOT		-1
+
+#define OCFS2_VOL_UUID_LEN		16
+#define OCFS2_MAX_VOL_LABEL_LEN		64
+
+/* Journal limits (in bytes) */
+#define OCFS2_MIN_JOURNAL_SIZE		(4 * 1024 * 1024)
+#define OCFS2_MAX_JOURNAL_SIZE		(500 * 1024 * 1024)
+
+struct ocfs2_system_inode_info {
+	char	*si_name;
+	int	si_iflags;
+	int	si_mode;
+};
+
+/* System file index */
+enum {
+	BAD_BLOCK_SYSTEM_INODE = 0,
+	GLOBAL_INODE_ALLOC_SYSTEM_INODE,
+	SLOT_MAP_SYSTEM_INODE,
+#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
+	HEARTBEAT_SYSTEM_INODE,
+	GLOBAL_BITMAP_SYSTEM_INODE,
+#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE
+	ORPHAN_DIR_SYSTEM_INODE,
+	EXTENT_ALLOC_SYSTEM_INODE,
+	INODE_ALLOC_SYSTEM_INODE,
+	JOURNAL_SYSTEM_INODE,
+	LOCAL_ALLOC_SYSTEM_INODE,
+	TRUNCATE_LOG_SYSTEM_INODE,
+	NUM_SYSTEM_INODES
+};
+
+static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
+	/* Global system inodes (single copy) */
+	/* The first two are only used from userspace mkfs/tunefs */
+	[BAD_BLOCK_SYSTEM_INODE]		= { "bad_blocks", 0, S_IFREG | 0644 },
+	[GLOBAL_INODE_ALLOC_SYSTEM_INODE] 	= { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
+
+	/* These are used by the running filesystem */
+	[SLOT_MAP_SYSTEM_INODE]			= { "slot_map", 0, S_IFREG | 0644 },
+	[HEARTBEAT_SYSTEM_INODE]		= { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
+	[GLOBAL_BITMAP_SYSTEM_INODE]		= { "global_bitmap", 0, S_IFREG | 0644 },
+
+	/* Slot-specific system inodes (one copy per slot) */
+	[ORPHAN_DIR_SYSTEM_INODE]		= { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
+	[EXTENT_ALLOC_SYSTEM_INODE]		= { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
+	[INODE_ALLOC_SYSTEM_INODE]		= { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
+	[JOURNAL_SYSTEM_INODE]			= { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
+	[LOCAL_ALLOC_SYSTEM_INODE]		= { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
+	[TRUNCATE_LOG_SYSTEM_INODE]		= { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }
+};
+
+/* Parameter passed from mount.ocfs2 to module */
+#define OCFS2_HB_NONE			"heartbeat=none"
+#define OCFS2_HB_LOCAL			"heartbeat=local"
+
+/*
+ * OCFS2 directory file types.  Only the low 3 bits are used.  The
+ * other bits are reserved for now.
+ */
+#define OCFS2_FT_UNKNOWN	0
+#define OCFS2_FT_REG_FILE	1
+#define OCFS2_FT_DIR		2
+#define OCFS2_FT_CHRDEV		3
+#define OCFS2_FT_BLKDEV		4
+#define OCFS2_FT_FIFO		5
+#define OCFS2_FT_SOCK		6
+#define OCFS2_FT_SYMLINK	7
+
+#define OCFS2_FT_MAX		8
+
+/*
+ * OCFS2_DIR_PAD defines the directory entries boundaries
+ *
+ * NOTE: It must be a multiple of 4
+ */
+#define OCFS2_DIR_PAD			4
+#define OCFS2_DIR_ROUND			(OCFS2_DIR_PAD - 1)
+#define OCFS2_DIR_MEMBER_LEN 		offsetof(struct ocfs2_dir_entry, name)
+#define OCFS2_DIR_REC_LEN(name_len)	(((name_len) + OCFS2_DIR_MEMBER_LEN + \
+                                          OCFS2_DIR_ROUND) & \
+					 ~OCFS2_DIR_ROUND)
+
+#define OCFS2_LINK_MAX		32000
+
+#define S_SHIFT			12
+static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]  = OCFS2_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]  = OCFS2_FT_DIR,
+	[S_IFCHR >> S_SHIFT]  = OCFS2_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]  = OCFS2_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]  = OCFS2_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]  = OCFS2_FT_SYMLINK,
+};
+
+
+/*
+ * Convenience casts
+ */
+#define OCFS2_RAW_SB(dinode)		(&((dinode)->id2.i_super))
+
+/*
+ * On disk extent record for OCFS2
+ * It describes a range of clusters on disk.
+ */
+struct ocfs2_extent_rec {
+/*00*/	__le32 e_cpos;		/* Offset into the file, in clusters */
+	__le32 e_clusters;	/* Clusters covered by this extent */
+	__le64 e_blkno;		/* Physical disk offset, in blocks */
+/*10*/
+};
+
+struct ocfs2_chain_rec {
+	__le32 c_free;	/* Number of free bits in this chain. */
+	__le32 c_total;	/* Number of total bits in this chain */
+	__le64 c_blkno;	/* Physical disk offset (blocks) of 1st group */
+};
+
+struct ocfs2_truncate_rec {
+	__le32 t_start;		/* 1st cluster in this log */
+	__le32 t_clusters;	/* Number of total clusters covered */
+};
+
+/*
+ * On disk extent list for OCFS2 (node in the tree).  Note that this
+ * is contained inside ocfs2_dinode or ocfs2_extent_block, so the
+ * offsets are relative to ocfs2_dinode.id2.i_list or
+ * ocfs2_extent_block.h_list, respectively.
+ */
+struct ocfs2_extent_list {
+/*00*/	__le16 l_tree_depth;		/* Extent tree depth from this
+					   point.  0 means data extents
+					   hang directly off this
+					   header (a leaf) */
+	__le16 l_count;			/* Number of extent records */
+	__le16 l_next_free_rec;		/* Next unused extent slot */
+	__le16 l_reserved1;
+	__le64 l_reserved2;		/* Pad to
+					   sizeof(ocfs2_extent_rec) */
+/*10*/	struct ocfs2_extent_rec l_recs[0];	/* Extent records */
+};
+
+/*
+ * On disk allocation chain list for OCFS2.  Note that this is
+ * contained inside ocfs2_dinode, so the offsets are relative to
+ * ocfs2_dinode.id2.i_chain.
+ */
+struct ocfs2_chain_list {
+/*00*/	__le16 cl_cpg;			/* Clusters per Block Group */
+	__le16 cl_bpc;			/* Bits per cluster */
+	__le16 cl_count;		/* Total chains in this list */
+	__le16 cl_next_free_rec;	/* Next unused chain slot */
+	__le64 cl_reserved1;
+/*10*/	struct ocfs2_chain_rec cl_recs[0];	/* Chain records */
+};
+
+/*
+ * On disk deallocation log for OCFS2.  Note that this is
+ * contained inside ocfs2_dinode, so the offsets are relative to
+ * ocfs2_dinode.id2.i_dealloc.
+ */
+struct ocfs2_truncate_log {
+/*00*/	__le16 tl_count;		/* Total records in this log */
+	__le16 tl_used;			/* Number of records in use */
+	__le32 tl_reserved1;
+/*08*/	struct ocfs2_truncate_rec tl_recs[0];	/* Truncate records */
+};
+
+/*
+ * On disk extent block (indirect block) for OCFS2
+ */
+struct ocfs2_extent_block
+{
+/*00*/	__u8 h_signature[8];		/* Signature for verification */
+	__le64 h_reserved1;
+/*10*/	__le16 h_suballoc_slot;		/* Slot suballocator this
+					   extent_header belongs to */
+	__le16 h_suballoc_bit;		/* Bit offset in suballocator
+					   block group */
+	__le32 h_fs_generation;		/* Must match super block */
+	__le64 h_blkno;			/* Offset on disk, in blocks */
+/*20*/	__le64 h_reserved3;
+	__le64 h_next_leaf_blk;		/* Offset on disk, in blocks,
+					   of next leaf header pointing
+					   to data */
+/*30*/	struct ocfs2_extent_list h_list;	/* Extent record list */
+/* Actual on-disk size is one block */
+};
+
+/*
+ * On disk superblock for OCFS2
+ * Note that it is contained inside an ocfs2_dinode, so all offsets
+ * are relative to the start of ocfs2_dinode.id2.
+ */
+struct ocfs2_super_block {
+/*00*/	__le16 s_major_rev_level;
+	__le16 s_minor_rev_level;
+	__le16 s_mnt_count;
+	__le16 s_max_mnt_count;
+	__le16 s_state;			/* File system state */
+	__le16 s_errors;			/* Behaviour when detecting errors */
+	__le32 s_checkinterval;		/* Max time between checks */
+/*10*/	__le64 s_lastcheck;		/* Time of last check */
+	__le32 s_creator_os;		/* OS */
+	__le32 s_feature_compat;		/* Compatible feature set */
+/*20*/	__le32 s_feature_incompat;	/* Incompatible feature set */
+	__le32 s_feature_ro_compat;	/* Readonly-compatible feature set */
+	__le64 s_root_blkno;		/* Offset, in blocks, of root directory
+					   dinode */
+/*30*/	__le64 s_system_dir_blkno;	/* Offset, in blocks, of system
+					   directory dinode */
+	__le32 s_blocksize_bits;		/* Blocksize for this fs */
+	__le32 s_clustersize_bits;	/* Clustersize for this fs */
+/*40*/	__le16 s_max_slots;		/* Max number of simultaneous mounts
+					   before tunefs required */
+	__le16 s_reserved1;
+	__le32 s_reserved2;
+	__le64 s_first_cluster_group;	/* Block offset of 1st cluster
+					 * group header */
+/*50*/	__u8  s_label[OCFS2_MAX_VOL_LABEL_LEN];	/* Label for mounting, etc. */
+/*90*/	__u8  s_uuid[OCFS2_VOL_UUID_LEN];	/* 128-bit uuid */
+/*A0*/
+};
+
+/*
+ * Local allocation bitmap for OCFS2 slots
+ * Note that it exists inside an ocfs2_dinode, so all offsets are
+ * relative to the start of ocfs2_dinode.id2.
+ */
+struct ocfs2_local_alloc
+{
+/*00*/	__le32 la_bm_off;	/* Starting bit offset in main bitmap */
+	__le16 la_size;		/* Size of included bitmap, in bytes */
+	__le16 la_reserved1;
+	__le64 la_reserved2;
+/*10*/	__u8   la_bitmap[0];
+};
+
+/*
+ * On disk inode for OCFS2
+ */
+struct ocfs2_dinode {
+/*00*/	__u8 i_signature[8];		/* Signature for validation */
+	__le32 i_generation;		/* Generation number */
+	__le16 i_suballoc_slot;		/* Slot suballocator this inode
+					   belongs to */
+	__le16 i_suballoc_bit;		/* Bit offset in suballocator
+					   block group */
+/*10*/	__le32 i_reserved0;
+	__le32 i_clusters;		/* Cluster count */
+	__le32 i_uid;			/* Owner UID */
+	__le32 i_gid;			/* Owning GID */
+/*20*/	__le64 i_size;			/* Size in bytes */
+	__le16 i_mode;			/* File mode */
+	__le16 i_links_count;		/* Links count */
+	__le32 i_flags;			/* File flags */
+/*30*/	__le64 i_atime;			/* Access time */
+	__le64 i_ctime;			/* Creation time */
+/*40*/	__le64 i_mtime;			/* Modification time */
+	__le64 i_dtime;			/* Deletion time */
+/*50*/	__le64 i_blkno;			/* Offset on disk, in blocks */
+	__le64 i_last_eb_blk;		/* Pointer to last extent
+					   block */
+/*60*/	__le32 i_fs_generation;		/* Generation per fs-instance */
+	__le32 i_atime_nsec;
+	__le32 i_ctime_nsec;
+	__le32 i_mtime_nsec;
+/*70*/	__le64 i_reserved1[9];
+/*B8*/	union {
+		__le64 i_pad1;		/* Generic way to refer to this
+					   64bit union */
+		struct {
+			__le64 i_rdev;	/* Device number */
+		} dev1;
+		struct {		/* Info for bitmap system
+					   inodes */
+			__le32 i_used;	/* Bits (ie, clusters) used  */
+			__le32 i_total;	/* Total bits (clusters)
+					   available */
+		} bitmap1;
+		struct {		/* Info for journal system
+					   inodes */
+			__le32 ij_flags;	/* Mounted, version, etc. */
+			__le32 ij_pad;
+		} journal1;
+	} id1;				/* Inode type dependant 1 */
+/*C0*/	union {
+		struct ocfs2_super_block	i_super;
+		struct ocfs2_local_alloc	i_lab;
+		struct ocfs2_chain_list		i_chain;
+		struct ocfs2_extent_list	i_list;
+		struct ocfs2_truncate_log	i_dealloc;
+		__u8               		i_symlink[0];
+	} id2;
+/* Actual on-disk size is one block */
+};
+
+/*
+ * On-disk directory entry structure for OCFS2
+ *
+ * Packed as this structure could be accessed unaligned on 64-bit platforms
+ */
+struct ocfs2_dir_entry {
+/*00*/	__le64   inode;                  /* Inode number */
+	__le16   rec_len;                /* Directory entry length */
+	__u8    name_len;               /* Name length */
+	__u8    file_type;
+/*0C*/	char    name[OCFS2_MAX_FILENAME_LEN];   /* File name */
+/* Actual on-disk length specified by rec_len */
+} __attribute__ ((packed));
+
+/*
+ * On disk allocator group structure for OCFS2
+ */
+struct ocfs2_group_desc
+{
+/*00*/	__u8    bg_signature[8];        /* Signature for validation */
+	__le16   bg_size;                /* Size of included bitmap in
+					   bytes. */
+	__le16   bg_bits;                /* Bits represented by this
+					   group. */
+	__le16	bg_free_bits_count;     /* Free bits count */
+	__le16   bg_chain;               /* What chain I am in. */
+/*10*/	__le32   bg_generation;
+	__le32	bg_reserved1;
+	__le64   bg_next_group;          /* Next group in my list, in
+					   blocks */
+/*20*/	__le64   bg_parent_dinode;       /* dinode which owns me, in
+					   blocks */
+	__le64   bg_blkno;               /* Offset on disk, in blocks */
+/*30*/	__le64   bg_reserved2[2];
+/*40*/	__u8    bg_bitmap[0];
+};
+
+#ifdef __KERNEL__
+static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
+{
+	return  sb->s_blocksize -
+		 offsetof(struct ocfs2_dinode, id2.i_symlink);
+}
+
+static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
+
+	return size / sizeof(struct ocfs2_chain_rec);
+}
+
+static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_extent_block, h_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
+{
+	u16 size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+
+	return size;
+}
+
+static inline int ocfs2_group_bitmap_size(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_group_desc, bg_bitmap);
+
+	return size;
+}
+
+static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
+
+	return size / sizeof(struct ocfs2_truncate_rec);
+}
+#else
+static inline int ocfs2_fast_symlink_chars(int blocksize)
+{
+	return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
+}
+
+static inline int ocfs2_extent_recs_per_inode(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_chain_recs_per_inode(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
+
+	return size / sizeof(struct ocfs2_chain_rec);
+}
+
+static inline int ocfs2_extent_recs_per_eb(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_extent_block, h_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_local_alloc_size(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+
+	return size;
+}
+
+static inline int ocfs2_group_bitmap_size(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_group_desc, bg_bitmap);
+
+	return size;
+}
+
+static inline int ocfs2_truncate_recs_per_inode(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
+
+	return size / sizeof(struct ocfs2_truncate_rec);
+}
+#endif  /* __KERNEL__ */
+
+
+static inline int ocfs2_system_inode_is_global(int type)
+{
+	return ((type >= 0) &&
+		(type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE));
+}
+
+static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
+						  int type, int slot)
+{
+	int chars;
+
+	/*
+	 * Global system inodes have a single copy; their names are
+	 * emitted through "%s" so they are never parsed as a format.
+	 * Everything after OCFS2_LAST_GLOBAL_SYSTEM_INODE has a copy
+	 * per slot, and its si_name is a format consuming @slot.
+	 */
+	if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)
+		chars = snprintf(buf, len, "%s",
+				 ocfs2_system_inodes[type].si_name);
+	else
+		chars = snprintf(buf, len,
+				 ocfs2_system_inodes[type].si_name, slot);
+
+	return chars;
+}
+
+static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
+				    umode_t mode)
+{
+	de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
+#endif  /* _OCFS2_FS_H */
+
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
new file mode 100644
index 0000000..7dd9e1e
--- /dev/null
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -0,0 +1,73 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_lockid.h
+ *
+ * Defines OCFS2 lockid bits.
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_LOCKID_H
+#define OCFS2_LOCKID_H
+
+/* lock ids are made up in the following manner:
+ * name[0]     --> type
+ * name[1-6]   --> 6 pad characters, reserved for now
+ * name[7-22]  --> block number, expressed in hex as 16 chars
+ * name[23-30] --> i_generation, expressed in hex as 8 chars
+ * name[31]    --> '\0' */
+#define OCFS2_LOCK_ID_MAX_LEN  32	/* 1 + 6 + 16 + 8 chars and the '\0' */
+#define OCFS2_LOCK_ID_PAD "000000"	/* the 6 reserved pad characters */
+
+enum ocfs2_lock_type {
+	OCFS2_LOCK_TYPE_META = 0,	/* 'M' in ocfs2_lock_type_char() */
+	OCFS2_LOCK_TYPE_DATA,		/* 'D' */
+	OCFS2_LOCK_TYPE_SUPER,		/* 'S' */
+	OCFS2_LOCK_TYPE_RENAME,		/* 'R' */
+	OCFS2_LOCK_TYPE_RW,		/* 'W' */
+	OCFS2_NUM_LOCK_TYPES		/* one past the last type; has no char */
+};
+
+/*
+ * Map a lock type to the single character used for it (name[0] in the
+ * lock id layout).  Values outside the known types, OCFS2_NUM_LOCK_TYPES
+ * included, map to '\0'.
+ */
+static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
+{
+	switch (type) {
+	case OCFS2_LOCK_TYPE_META:
+		return 'M';
+	case OCFS2_LOCK_TYPE_DATA:
+		return 'D';
+	case OCFS2_LOCK_TYPE_SUPER:
+		return 'S';
+	case OCFS2_LOCK_TYPE_RENAME:
+		return 'R';
+	case OCFS2_LOCK_TYPE_RW:
+		return 'W';
+	default:
+		break;
+	}
+
+	/* not one of the lock types handled above */
+	return '\0';
+}
+
+#endif  /* OCFS2_LOCKID_H */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
new file mode 100644
index 0000000..8716279
--- /dev/null
+++ b/fs/ocfs2/slot_map.c
@@ -0,0 +1,303 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * slot_map.c
+ *
+ * Keeps track of which node number occupies which slot of the slot map.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/smp_lock.h>
+
+#define MLOG_MASK_PREFIX ML_SUPER
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "slot_map.h"
+#include "super.h"
+#include "sysfile.h"
+
+#include "buffer_head_io.h"
+
+static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+				    s16 global);
+static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
+			      s16 slot_num,
+			      s16 node_num);
+
+/* Set a bit in osb->mounted_map for every node recorded in the slot
+ * table.  Caller should hold an EX on the super block with slot info up
+ * to date; called *after* we take a slot so our own node is included. */
+void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
+{
+	struct ocfs2_slot_info *si = osb->slot_info;
+	int slot;
+
+	spin_lock(&si->si_lock);
+	for (slot = 0; slot < si->si_size; slot++) {
+		s16 node = si->si_global_node_nums[slot];
+
+		if (node == OCFS2_INVALID_SLOT)
+			continue;
+		ocfs2_node_map_set_bit(osb, &osb->mounted_map, node);
+	}
+	spin_unlock(&si->si_lock);
+}
+
+/* Refresh the in-memory slot table from the slot block held in si_bh. */
+void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
+{
+	int slot;
+	__le16 *on_disk;
+
+	/* No block read is issued here: ocfs2_super_lock should already
+	 * have made sure si_bh holds the most recent copy. */
+	spin_lock(&si->si_lock);
+
+	on_disk = (__le16 *) si->si_bh->b_data;
+	for (slot = 0; slot < si->si_size; slot++)
+		si->si_global_node_nums[slot] = le16_to_cpu(on_disk[slot]);
+
+	spin_unlock(&si->si_lock);
+}
+
+/* Copy our in-memory slot table into its destination bh and write that
+ * block out.  Returns the status of ocfs2_write_block(). */
+int ocfs2_update_disk_slots(struct ocfs2_super *osb,
+			    struct ocfs2_slot_info *si)
+{
+	int slot, ret;
+	__le16 *on_disk = (__le16 *) si->si_bh->b_data;
+
+	spin_lock(&si->si_lock);
+	for (slot = 0; slot < si->si_size; slot++)
+		on_disk[slot] = cpu_to_le16(si->si_global_node_nums[slot]);
+	spin_unlock(&si->si_lock);
+	/* the block write itself happens after si_lock is dropped */
+	ret = ocfs2_write_block(osb, si->si_bh, si->si_inode);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	return ret;
+}
+
+/* Look up which slot holds node number 'global'.  Returns the index of
+ * the first matching slot, or OCFS2_INVALID_SLOT when none of the first
+ * si_num_slots slots holds it.  Takes no lock itself. */
+static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+				    s16 global)
+{
+	int slot;
+
+	for (slot = 0; slot < si->si_num_slots; slot++) {
+		/* first match wins */
+		if (si->si_global_node_nums[slot] == global)
+			return (s16) slot;
+	}
+
+	return OCFS2_INVALID_SLOT;
+}
+
+/* Return the lowest-numbered slot holding OCFS2_INVALID_SLOT, or
+ * OCFS2_INVALID_SLOT itself if none of the si_num_slots slots does. */
+static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si)
+{
+	int slot = 0;
+
+	while (slot < si->si_num_slots) {
+		if (si->si_global_node_nums[slot] == OCFS2_INVALID_SLOT)
+			return (s16) slot;
+		slot++;
+	}
+	return OCFS2_INVALID_SLOT;
+}
+
+/* Locked wrapper: find the slot holding node 'global' under si_lock. */
+s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, s16 global)
+{
+	s16 slot;
+
+	spin_lock(&si->si_lock);
+	slot = __ocfs2_node_num_to_slot(si, global);
+	spin_unlock(&si->si_lock);
+	return slot;
+}
+
+static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
+			      s16 slot_num,
+			      s16 node_num)
+{
+	BUG_ON(slot_num == OCFS2_INVALID_SLOT);
+	BUG_ON(slot_num >= si->si_num_slots);
+	BUG_ON((node_num != O2NM_INVALID_NODE_NUM) &&
+	       (node_num >= O2NM_MAX_NODES));
+	/* checks passed: record node_num as the occupant of slot_num */
+	si->si_global_node_nums[slot_num] = node_num;
+}
+
+/* Mark slot_num as unoccupied in the in-memory table, under si_lock. */
+void ocfs2_clear_slot(struct ocfs2_slot_info *si, s16 slot_num)
+{
+	spin_lock(&si->si_lock);
+	__ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT);
+	spin_unlock(&si->si_lock);
+}
+
+/* Allocate osb->slot_info and read the slot map block into it.  Returns a
+ * negative errno on failure, after releasing si and its inode reference. */
+int ocfs2_init_slot_info(struct ocfs2_super *osb)
+{
+	int status, i;
+	u64 blkno;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_slot_info *si;
+
+	si = kcalloc(1, sizeof(struct ocfs2_slot_info), GFP_KERNEL);
+	if (!si) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	spin_lock_init(&si->si_lock);
+	si->si_num_slots = osb->max_slots;
+	si->si_size = OCFS2_MAX_SLOTS;
+	for(i = 0; i < si->si_num_slots; i++)
+		si->si_global_node_nums[i] = OCFS2_INVALID_SLOT;
+
+	/* keep the inode in si so every failure below iput()s it at bail */
+	si->si_inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
+						   OCFS2_INVALID_SLOT);
+	if (!si->si_inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_extent_map_get_blocks(si->si_inode, 0ULL, 1,
+					     &blkno, NULL);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	si->si_bh = bh;
+	osb->slot_info = si;
+bail:
+	if (status < 0 && si)
+		ocfs2_free_slot_info(si);
+	return status;
+}
+
+/* Drop the inode and buffer references held by si (either may still be
+ * unset) and free si itself.  si must not be NULL. */
+void ocfs2_free_slot_info(struct ocfs2_slot_info *si)
+{
+	iput(si->si_inode);	/* iput() ignores a NULL inode */
+	brelse(si->si_bh);	/* brelse() ignores a NULL bh */
+	kfree(si);
+}
+
+int ocfs2_find_slot(struct ocfs2_super *osb)
+{
+	int status;
+	s16 slot;
+	struct ocfs2_slot_info *si;
+	/* Claim a slot for osb->node_num and write the slot map back out. */
+	mlog_entry_void();
+
+	si = osb->slot_info;
+	/* refresh the in-memory table from the slot block before searching */
+	ocfs2_update_slot_info(si);
+
+	spin_lock(&si->si_lock);
+	/* search for ourselves first and take the slot if it already
+	 * exists. Perhaps we need to mark this in a variable for our
+	 * own journal recovery? Possibly not, though we certainly
+	 * need to warn the user */
+	slot = __ocfs2_node_num_to_slot(si, osb->node_num);
+	if (slot == OCFS2_INVALID_SLOT) {
+		/* if no slot yet, then just take 1st available
+		 * one. */
+		slot = __ocfs2_find_empty_slot(si);
+		if (slot == OCFS2_INVALID_SLOT) {
+			spin_unlock(&si->si_lock);
+			mlog(ML_ERROR, "no free slots available!\n");
+			status = -EINVAL;
+			goto bail;
+		}
+	} else
+		mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
+		     slot);
+	/* record ourselves in the chosen slot while still holding si_lock */
+	__ocfs2_fill_slot(si, slot, osb->node_num);
+	osb->slot_num = slot;
+	spin_unlock(&si->si_lock);
+
+	mlog(ML_NOTICE, "taking node slot %d\n", osb->slot_num);
+	/* write the updated table out through the slot block */
+	status = ocfs2_update_disk_slots(osb, si);
+	if (status < 0)
+		mlog_errno(status);
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* Give up this node's slot: clear it in the slot table, write the table
+ * out, then tear down osb->slot_info.  No-op if slot info was never set. */
+void ocfs2_put_slot(struct ocfs2_super *osb)
+{
+	struct ocfs2_slot_info *si = osb->slot_info;
+	int ret;
+
+	if (si == NULL)
+		return;
+
+	ocfs2_update_slot_info(si);
+
+	spin_lock(&si->si_lock);
+	__ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT);
+	osb->slot_num = OCFS2_INVALID_SLOT;
+	spin_unlock(&si->si_lock);
+
+	/* a failed write is only logged; teardown happens regardless */
+	ret = ocfs2_update_disk_slots(osb, si);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	osb->slot_info = NULL;
+	ocfs2_free_slot_info(si);
+}
+
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
new file mode 100644
index 0000000..d8c8cee
--- /dev/null
+++ b/fs/ocfs2/slot_map.h
@@ -0,0 +1,66 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * slot_map.h
+ *
+ * Slot info structure and prototypes for the functions in slot_map.c.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+
+#ifndef SLOTMAP_H
+#define SLOTMAP_H
+
+struct ocfs2_slot_info {
+	spinlock_t si_lock;		/* taken around si_global_node_nums[] use */
+
+       	struct inode *si_inode;		/* SLOT_MAP_SYSTEM_INODE system file */
+	struct buffer_head *si_bh;	/* block the slot table is copied to/from */
+	unsigned int si_num_slots;	/* osb->max_slots: slots searched and filled */
+	unsigned int si_size;		/* OCFS2_MAX_SLOTS: entries copied to/from si_bh */
+	s16 si_global_node_nums[OCFS2_MAX_SLOTS];	/* node number held by each slot */
+};
+
+int ocfs2_init_slot_info(struct ocfs2_super *osb);
+void ocfs2_free_slot_info(struct ocfs2_slot_info *si);
+
+int ocfs2_find_slot(struct ocfs2_super *osb);
+void ocfs2_put_slot(struct ocfs2_super *osb);
+
+void ocfs2_update_slot_info(struct ocfs2_slot_info *si);
+int ocfs2_update_disk_slots(struct ocfs2_super *osb,
+			    struct ocfs2_slot_info *si);
+
+s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+			   s16 global);
+void ocfs2_clear_slot(struct ocfs2_slot_info *si,
+		      s16 slot_num);
+
+void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
+
+static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
+				      int slot_num)
+{
+	BUG_ON(slot_num == OCFS2_INVALID_SLOT);
+	assert_spin_locked(&si->si_lock);	/* caller must hold si_lock */
+	/* nonzero when slot_num holds OCFS2_INVALID_SLOT, i.e. no node */
+	return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT;
+}
+
+#endif
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
new file mode 100644
index 0000000..c46c164
--- /dev/null
+++ b/fs/ocfs2/suballoc.c
@@ -0,0 +1,1651 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * suballoc.c
+ *
+ * metadata alloc and free
+ * Inspired by ext3 block groups.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_DISK_ALLOC
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "suballoc.h"
+#include "super.h"
+#include "sysfile.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
+static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
+static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
+static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
+				  struct inode *alloc_inode,
+				  struct buffer_head *bg_bh,
+				  u64 group_blkno,
+				  u16 my_chain,
+				  struct ocfs2_chain_list *cl);
+static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
+				   struct inode *alloc_inode,
+				   struct buffer_head *bh);
+
+static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
+				       struct ocfs2_alloc_context *ac);
+
+static int ocfs2_cluster_group_search(struct inode *inode,
+				      struct buffer_head *group_bh,
+				      u32 bits_wanted, u32 min_bits,
+				      u16 *bit_off, u16 *bits_found);
+static int ocfs2_block_group_search(struct inode *inode,
+				    struct buffer_head *group_bh,
+				    u32 bits_wanted, u32 min_bits,
+				    u16 *bit_off, u16 *bits_found);
+static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
+			      u32 bits_wanted,
+			      u32 min_bits,
+			      u16 *bit_off,
+			      unsigned int *num_bits,
+			      u64 *bg_blkno);
+static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
+				     struct ocfs2_alloc_context *ac,
+				     u32 bits_wanted,
+				     u32 min_bits,
+				     u16 *bit_off,
+				     unsigned int *num_bits,
+				     u64 *bg_blkno);
+static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
+					 int nr);
+static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
+					     struct buffer_head *bg_bh,
+					     unsigned int bits_wanted,
+					     u16 *bit_off,
+					     u16 *bits_found);
+static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
+					     struct inode *alloc_inode,
+					     struct ocfs2_group_desc *bg,
+					     struct buffer_head *group_bh,
+					     unsigned int bit_off,
+					     unsigned int num_bits);
+static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
+					       struct inode *alloc_inode,
+					       struct ocfs2_group_desc *bg,
+					       struct buffer_head *group_bh,
+					       unsigned int bit_off,
+					       unsigned int num_bits);
+
+static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
+				    struct inode *alloc_inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head *bg_bh,
+				    struct buffer_head *prev_bg_bh,
+				    u16 chain);
+static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
+						     u32 wanted);
+static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
+				    struct inode *alloc_inode,
+				    struct buffer_head *alloc_bh,
+				    unsigned int start_bit,
+				    u64 bg_blkno,
+				    unsigned int count);
+static inline u64 ocfs2_which_suballoc_group(u64 block,
+					     unsigned int bit);
+static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
+						   u64 bg_blkno,
+						   u16 bg_bit_off);
+static inline u64 ocfs2_which_cluster_group(struct inode *inode,
+					    u32 cluster);
+static inline void ocfs2_block_to_cluster_group(struct inode *inode,
+						u64 data_blkno,
+						u64 *bg_blkno,
+						u16 *bg_bit_off);
+
+/* Drop the inode and buffer references held by ac (either may be unset)
+ * and free ac itself.  ac must not be NULL. */
+void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
+{
+	iput(ac->ac_inode);	/* iput() ignores a NULL inode */
+	brelse(ac->ac_bh);	/* brelse() ignores a NULL bh */
+	kfree(ac);
+}
+
+static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
+{	/* cl_cpg times cl_bpc, each converted from le16 and widened to u32 */
+	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
+}
+
+static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
+				  struct inode *alloc_inode,
+				  struct buffer_head *bg_bh,
+				  u64 group_blkno,
+				  u16 my_chain,
+				  struct ocfs2_chain_list *cl)
+{
+	int status = 0;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+	struct super_block * sb = alloc_inode->i_sb;
+	/* Set up bg_bh as the descriptor of a new group on chain my_chain. */
+	mlog_entry_void();
+	/* the buffer handed in must be the block we were told to set up */
+	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
+		ocfs2_error(alloc_inode->i_sb, "group block (%"MLFu64") "
+			    "!= b_blocknr (%llu)", group_blkno,
+			    (unsigned long long) bg_bh->b_blocknr);
+		status = -EIO;
+		goto bail;
+	}
+
+	status = ocfs2_journal_access(handle,
+				      alloc_inode,
+				      bg_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	/* wipe the whole block, then fill in the descriptor fields */
+	memset(bg, 0, sb->s_blocksize);
+	strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
+	bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
+	bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
+	bg->bg_chain = cpu_to_le16(my_chain);
+	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
+	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
+	bg->bg_blkno = cpu_to_le64(group_blkno);
+	/* set the 1st bit in the bitmap to account for the descriptor block */
+	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
+	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
+
+	status = ocfs2_journal_dirty(handle, bg_bh);
+	if (status < 0)
+		mlog_errno(status);
+
+	/* There is no need to zero out or otherwise initialize the
+	 * other blocks in a group - All valid FS metadata in a block
+	 * group stores the superblock fs_generation value at
+	 * allocation time. */
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* Index of the chain record with the lowest c_total among all cl_count
+ * records; the earliest such record wins ties.  0 if cl_count is 0. */
+static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
+{
+	u16 i, smallest = 0;
+
+	for (i = 0; i < le16_to_cpu(cl->cl_count); i++) {
+		if (le32_to_cpu(cl->cl_recs[i].c_total) <
+		    le32_to_cpu(cl->cl_recs[smallest].c_total))
+			smallest = i;
+	}
+	return smallest;
+}
+
+/* Grow alloc_inode by one block group built from newly claimed
+ * clusters; bh holds the allocator's dinode.  We expect the block
+ * group allocator to already be locked. */
+static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
+				   struct inode *alloc_inode,
+				   struct buffer_head *bh)
+{
+	int status, credits;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
+	struct ocfs2_chain_list *cl;
+	struct ocfs2_alloc_context *ac = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	u32 bit_off, num_bits;
+	u16 alloc_rec;
+	u64 bg_blkno;
+	struct buffer_head *bg_bh = NULL;
+	struct ocfs2_group_desc *bg;
+
+	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
+
+	mlog_entry_void();
+
+	handle = ocfs2_alloc_handle(osb);
+	if (!handle) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+	/* reserve one group's worth of clusters (cl_cpg) up front */
+	cl = &fe->id2.i_chain;
+	status = ocfs2_reserve_clusters(osb,
+					handle,
+					le16_to_cpu(cl->cl_cpg),
+					&ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	credits = ocfs2_calc_group_alloc_credits(osb->sb,
+						 le16_to_cpu(cl->cl_cpg));
+	handle = ocfs2_start_trans(osb, handle, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto bail;
+	}
+	/* with the transaction started, claim the reserved clusters */
+	status = ocfs2_claim_clusters(osb,
+				      handle,
+				      ac,
+				      le16_to_cpu(cl->cl_cpg),
+				      &bit_off,
+				      &num_bits);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+	/* the new group goes onto the chain with the smallest c_total */
+	alloc_rec = ocfs2_find_smallest_chain(cl);
+
+	/* setup the group */
+	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	mlog(0, "new descriptor, record %u, at block %"MLFu64"\n",
+	     alloc_rec, bg_blkno);
+
+	bg_bh = sb_getblk(osb->sb, bg_blkno);
+	if (!bg_bh) {
+		status = -EIO;
+		mlog_errno(status);
+		goto bail;
+	}
+	ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
+
+	status = ocfs2_block_group_fill(handle,
+					alloc_inode,
+					bg_bh,
+					bg_blkno,
+					alloc_rec,
+					cl);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+
+	status = ocfs2_journal_access(handle, alloc_inode,
+				      bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	/* account for the new group in the chain record and the dinode */
+	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
+		     le16_to_cpu(bg->bg_free_bits_count));
+	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
+	cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg_blkno);
+	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
+		le16_add_cpu(&cl->cl_next_free_rec, 1);
+
+	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
+					le16_to_cpu(bg->bg_free_bits_count));
+	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
+	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
+
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	/* mirror the new cluster count and size into the in-memory inode */
+	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
+	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
+					     le32_to_cpu(fe->i_clusters)));
+	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
+	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
+	alloc_inode->i_blocks =
+		ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
+
+	status = 0;
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (ac)
+		ocfs2_free_alloc_context(ac);
+
+	if (bg_bh)
+		brelse(bg_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
+				       struct ocfs2_alloc_context *ac)
+{
+	int status;
+	u32 bits_wanted = ac->ac_bits_wanted;
+	struct inode *alloc_inode = ac->ac_inode;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_journal_handle *handle = ac->ac_handle;
+	struct ocfs2_dinode *fe;
+	u32 free_bits;
+	/* Lock ac->ac_inode and make sure it can supply ac_bits_wanted. */
+	mlog_entry_void();
+
+	BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
+
+	ocfs2_handle_add_inode(handle, alloc_inode);
+	status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
+		status = -EIO;
+		goto bail;
+	}
+	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
+		ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator "
+			    "# %"MLFu64, le64_to_cpu(fe->i_blkno));
+		status = -EIO;
+		goto bail;
+	}
+	/* bits this allocator has that are not yet in use */
+	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
+		le32_to_cpu(fe->id1.bitmap1.i_used);
+
+	if (bits_wanted > free_bits) {
+		/* cluster bitmap never grows */
+		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
+			mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
+			     bits_wanted, free_bits);
+			status = -ENOSPC;
+			goto bail;
+		}
+		/* any other allocator gets one more block group added */
+		status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+		atomic_inc(&osb->alloc_stats.bg_extends);
+
+		/* You should never ask for this much metadata */
+		BUG_ON(bits_wanted >
+		       (le32_to_cpu(fe->id1.bitmap1.i_total)
+			- le32_to_cpu(fe->id1.bitmap1.i_used)));
+	}
+	/* success: ac keeps its own reference to the dinode buffer */
+	get_bh(bh);
+	ac->ac_bh = bh;
+bail:
+	if (bh)
+		brelse(bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
+			       struct ocfs2_journal_handle *handle,
+			       struct ocfs2_dinode *fe,
+			       struct ocfs2_alloc_context **ac)
+{
+	int status;
+	struct inode *alloc_inode = NULL;
+	/* Set up *ac for ocfs2_extend_meta_needed(fe) bits of metadata. */
+	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
+	(*ac)->ac_handle = handle;
+	(*ac)->ac_which = OCFS2_AC_USE_META;
+	/* use slot 0's extent allocator unless per-slot ones are enabled */
+#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
+	alloc_inode = ocfs2_get_system_file_inode(osb,
+						  EXTENT_ALLOC_SYSTEM_INODE,
+						  0);
+#else
+	alloc_inode = ocfs2_get_system_file_inode(osb,
+						  EXTENT_ALLOC_SYSTEM_INODE,
+						  osb->slot_num);
+#endif
+	if (!alloc_inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+	/* ac takes its own inode reference; ours is dropped at bail */
+	(*ac)->ac_inode = igrab(alloc_inode);
+	(*ac)->ac_group_search = ocfs2_block_group_search;
+
+	status = ocfs2_reserve_suballoc_bits(osb, (*ac));
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs2_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	if (alloc_inode)
+		iput(alloc_inode);
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct ocfs2_alloc_context **ac)
+{
+	int status;
+	struct inode *alloc_inode = NULL;
+	/* Set up *ac to reserve one bit in this slot's inode allocator. */
+	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_bits_wanted = 1;
+	(*ac)->ac_handle = handle;
+	(*ac)->ac_which = OCFS2_AC_USE_INODE;
+
+	alloc_inode = ocfs2_get_system_file_inode(osb,
+						  INODE_ALLOC_SYSTEM_INODE,
+						  osb->slot_num);
+	if (!alloc_inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+	/* ac takes its own inode reference; ours is dropped at bail */
+	(*ac)->ac_inode = igrab(alloc_inode);
+	(*ac)->ac_group_search = ocfs2_block_group_search;
+
+	status = ocfs2_reserve_suballoc_bits(osb, *ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs2_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	if (alloc_inode)
+		iput(alloc_inode);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Point ac at the global bitmap inode and reserve its bits there.  The
+ * local alloc code has to do the same thing, so it is shared here. */
+int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
+				      struct ocfs2_alloc_context *ac)
+{
+	int status;
+
+	ac->ac_inode = ocfs2_get_system_file_inode(osb,
+						   GLOBAL_BITMAP_SYSTEM_INODE,
+						   OCFS2_INVALID_SLOT);
+	if (!ac->ac_inode) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Could not get bitmap inode!\n");
+		goto bail;
+	}
+	ac->ac_which = OCFS2_AC_USE_MAIN;
+	ac->ac_group_search = ocfs2_cluster_group_search;
+	/* -ENOSPC is passed back to the caller without being logged */
+	status = ocfs2_reserve_suballoc_bits(osb, ac);
+	if (status < 0 && status != -ENOSPC)
+		mlog_errno(status);
+bail:
+	return status;
+}
+
+/* Callers don't need to care which bitmap (local alloc or main) to
+ * use so we figure it out for them, but unfortunately this clutters
+ * things a bit. */
+int ocfs2_reserve_clusters(struct ocfs2_super *osb,
+			   struct ocfs2_journal_handle *handle,
+			   u32 bits_wanted,
+			   struct ocfs2_alloc_context **ac)
+{
+	int status;
+
+	mlog_entry_void();
+
+	BUG_ON(!handle);
+
+	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+	if (!(*ac)) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	(*ac)->ac_bits_wanted = bits_wanted;
+	(*ac)->ac_handle = handle;
+	/* preset so that skipping local alloc lands in the main bitmap */
+	status = -ENOSPC;
+	if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
+		status = ocfs2_reserve_local_alloc_bits(osb,
+							handle,
+							bits_wanted,
+							*ac);
+		if ((status < 0) && (status != -ENOSPC)) {
+			mlog_errno(status);
+			goto bail;
+		} else if (status == -ENOSPC) {
+			/* reserve_local_bits will return enospc with
+			 * the local alloc inode still locked, so we
+			 * can change this safely here. */
+			mlog(0, "Disabling local alloc\n");
+			/* We set to OCFS2_LA_DISABLED so that umount
+			 * can clean up what's left of the local
+			 * allocation */
+			osb->local_alloc_state = OCFS2_LA_DISABLED;
+		}
+	}
+	/* local alloc unused or out of space: reserve from the main bitmap */
+	if (status == -ENOSPC) {
+		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = 0;
+bail:
+	if ((status < 0) && *ac) {
+		ocfs2_free_alloc_context(*ac);
+		*ac = NULL;
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * More or less lifted from ext3. I'll leave their description below:
+ *
+ * "For ext3 allocations, we must not reuse any blocks which are
+ * allocated in the bitmap buffer's "last committed data" copy.  This
+ * prevents deletes from freeing up the page for reuse until we have
+ * committed the delete transaction.
+ *
+ * If we didn't do this, then deleting something and reallocating it as
+ * data would allow the old block to be overwritten before the
+ * transaction committed (because we force data to disk before commit).
+ * This would lead to corruption if we crashed between overwriting the
+ * data and committing the delete.
+ *
+ * @@@ We may want to make this allocation behaviour conditional on
+ * data-writes at some point, and disable it for metadata allocations or
+ * sync-data inodes."
+ *
+ * Note: OCFS2 already does this differently for metadata vs data
+ * allocations, as those bitmaps are separate and undo access is never
+ * called on a metadata group descriptor.
+ */
+static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
+					 int nr)
+{
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+	/* set in the current bitmap: not allocatable */
+	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
+		return 0;
+	if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
+		return 1;
+	/* otherwise the bit must also be clear in the committed copy */
+	bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
+	return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+}
+
+static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
+					     struct buffer_head *bg_bh,
+					     unsigned int bits_wanted,
+					     u16 *bit_off,
+					     u16 *bits_found)
+{
+	void *bitmap;
+	u16 best_offset, best_size;
+	int offset, start, found, status = 0;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+	/* Find bits_wanted contiguous free bits, else the longest free run. */
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
+		return -EIO;
+	}
+
+	found = start = best_offset = best_size = 0;
+	bitmap = bg->bg_bitmap;
+	/* walk the zero bits, tracking the current run and the best one */
+	while((offset = ocfs2_find_next_zero_bit(bitmap,
+						 le16_to_cpu(bg->bg_bits),
+						 start)) != -1) {
+		if (offset == le16_to_cpu(bg->bg_bits))
+			break;
+
+		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
+			/* We found a zero, but we can't use it as it
+			 * hasn't been put to disk yet! */
+			found = 0;
+			start = offset + 1;
+		} else if (offset == start) {
+			/* we found a zero */
+			found++;
+			/* move start to the next bit to test */
+			start++;
+		} else {
+			/* got a zero after some ones */
+			found = 1;
+			start = offset + 1;
+		}
+		if (found > best_size) {
+			best_size = found;
+			best_offset = start - found;
+		}
+		/* we got everything we needed */
+		if (found == bits_wanted) {
+			/* mlog(0, "Found it all!\n"); */
+			break;
+		}
+	}
+
+	/* XXX: I think the first clause is equivalent to the second
+	 * 	- jlbec */
+	if (found == bits_wanted) {
+		*bit_off = start - found;
+		*bits_found = found;
+	} else if (best_size) {
+		*bit_off = best_offset;
+		*bits_found = best_size;
+	} else {
+		status = -ENOSPC;
+		/* No error log here -- see the comment above
+		 * ocfs2_test_bg_bit_allocatable */
+	}
+
+	return status;
+}
+
+static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
+					     struct inode *alloc_inode,
+					     struct ocfs2_group_desc *bg,
+					     struct buffer_head *group_bh,
+					     unsigned int bit_off,
+					     unsigned int num_bits)
+{
+	int status;
+	void *bitmap = bg->bg_bitmap;
+	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
+
+	mlog_entry_void();
+	/* Mark num_bits bits starting at bit_off as allocated in this group. */
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+		status = -EIO;
+		goto bail;
+	}
+	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
+
+	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
+	     num_bits);
+	/* The cluster bitmap is journalled with undo access. */
+	if (ocfs2_is_cluster_bitmap(alloc_inode))
+		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
+
+	status = ocfs2_journal_access(handle,
+				      alloc_inode,
+				      group_bh,
+				      journal_type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	/* Debit the free count, then set each bit in the group bitmap. */
+	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
+
+	while(num_bits--)
+		ocfs2_set_bit(bit_off++, bitmap);
+
+	status = ocfs2_journal_dirty(handle,
+				     group_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* Pick the chain record that currently has the most free bits. */
+static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
+{
+	u16 i, victim = 0;
+
+	BUG_ON(!cl->cl_next_free_rec);
+
+	/* Strict '>' means the lowest-numbered chain wins any tie. */
+	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
+		if (le32_to_cpu(cl->cl_recs[i].c_free) >
+		    le32_to_cpu(cl->cl_recs[victim].c_free))
+			victim = i;
+	}
+
+	/* Sanity check: the winner must be a record that is in use. */
+	BUG_ON(victim >= le16_to_cpu(cl->cl_next_free_rec));
+	return victim;
+}
+
+static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
+				    struct inode *alloc_inode,
+				    struct buffer_head *fe_bh,
+				    struct buffer_head *bg_bh,
+				    struct buffer_head *prev_bg_bh,
+				    u16 chain)
+{
+	int status;
+	/* Original link values, restored if a journal call fails so the
+	 * blocks are never left inconsistent with each other. */
+	u64 fe_ptr, bg_ptr, prev_bg_ptr;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
+		status = -EIO;
+		goto out;
+	}
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+		status = -EIO;
+		goto out;
+	}
+	if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
+		status = -EIO;
+		goto out;
+	}
+	/* Unlink bg from behind prev_bg and make it the head of the chain. */
+	mlog(0, "In suballoc %"MLFu64", chain %u, move group %"MLFu64" to "
+	     "top, prev = %"MLFu64"\n", le64_to_cpu(fe->i_blkno), chain,
+	     le64_to_cpu(bg->bg_blkno), le64_to_cpu(prev_bg->bg_blkno));
+
+	fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
+	bg_ptr = le64_to_cpu(bg->bg_next_group);
+	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
+	/* Step 1: prev_bg now skips over bg. */
+	status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	prev_bg->bg_next_group = bg->bg_next_group;
+
+	status = ocfs2_journal_dirty(handle, prev_bg_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+	/* Step 2: bg points at the old head of the chain. */
+	status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
+
+	status = ocfs2_journal_dirty(handle, bg_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+	/* Step 3: the chain record now starts at bg. */
+	status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_rollback;
+	}
+
+	status = 0;
+out_rollback:
+	if (status < 0) {
+		fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
+		bg->bg_next_group = cpu_to_le64(bg_ptr);
+		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
+	}
+out:
+	mlog_exit(status);
+	return status;
+}
+
+static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
+						     u32 wanted)
+{
+	return wanted < le16_to_cpu(bg->bg_free_bits_count); /* strictly more */
+}
+
+/* Group search for the cluster bitmap: returns 0 on success, -ENOSPC
+ * to keep searching, and any other value < 0 on error. */
+static int ocfs2_cluster_group_search(struct inode *inode,
+				      struct buffer_head *group_bh,
+				      u32 bits_wanted, u32 min_bits,
+				      u16 *bit_off, u16 *bits_found)
+{
+	int status;
+	u16 run_start, run_len;
+	struct ocfs2_group_desc *gd;
+
+	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+
+	/* A group with no free bits at all cannot help us. */
+	gd = (struct ocfs2_group_desc *) group_bh->b_data;
+	if (!gd->bg_free_bits_count)
+		return -ENOSPC;
+
+	status = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
+						   group_bh, bits_wanted,
+						   &run_start, &run_len);
+	if (status)
+		return status;
+
+	/* ocfs2_block_group_find_clear_bits() might return success
+	 * with a run shorter than min_bits; that is still -ENOSPC
+	 * for us, and the caller's outputs are left untouched. */
+	if (run_len < min_bits)
+		return -ENOSPC;
+
+	*bit_off = run_start;
+	*bits_found = run_len;
+	return 0;
+}
+
+static int ocfs2_block_group_search(struct inode *inode,
+				    struct buffer_head *group_bh,
+				    u32 bits_wanted, u32 min_bits,
+				    u16 *bit_off, u16 *bits_found)
+{
+	struct ocfs2_group_desc *gd;
+
+	/* Only single-bit minimums, and never on the cluster bitmap. */
+	BUG_ON(min_bits != 1);
+	BUG_ON(ocfs2_is_cluster_bitmap(inode));
+
+	gd = (struct ocfs2_group_desc *) group_bh->b_data;
+	if (!gd->bg_free_bits_count)
+		return -ENOSPC;	/* nothing free in this group */
+	return ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
+						 group_bh, bits_wanted,
+						 bit_off, bits_found);
+}
+
+static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
+			      u32 bits_wanted,
+			      u32 min_bits,
+			      u16 *bit_off,
+			      unsigned int *num_bits,
+			      u64 *bg_blkno)
+{
+	int status;
+	u16 chain = ac->ac_chain, tmp_bits;
+	u32 tmp_used;
+	u64 next_group;
+	struct ocfs2_journal_handle *handle = ac->ac_handle;
+	struct inode *alloc_inode = ac->ac_inode;
+	struct buffer_head *group_bh = NULL;
+	struct buffer_head *prev_group_bh = NULL;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
+	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
+	struct ocfs2_group_desc *bg;
+
+	/* Find a group in ac->ac_chain with room and claim bits from it. */
+	mlog(0, "trying to alloc %u bits from chain %u, inode %"MLFu64"\n",
+	     bits_wanted, chain, OCFS2_I(alloc_inode)->ip_blkno);
+
+	status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
+				  le64_to_cpu(cl->cl_recs[chain].c_blkno),
+				  &group_bh, OCFS2_BH_CACHED, alloc_inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	bg = (struct ocfs2_group_desc *) group_bh->b_data;
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+		status = -EIO;
+		goto bail;
+	}
+
+	/* For now, the chain search is a bit simplistic: we just use the
+	 * first group for which the search succeeds, following
+	 * bg_next_group links while it keeps reporting -ENOSPC. */
+	while ((status = ac->ac_group_search(alloc_inode, group_bh,
+					     bits_wanted, min_bits, bit_off,
+					     &tmp_bits)) == -ENOSPC) {
+		if (!bg->bg_next_group)
+			break;
+
+		if (prev_group_bh) {
+			brelse(prev_group_bh);
+			prev_group_bh = NULL;
+		}
+		next_group = le64_to_cpu(bg->bg_next_group);
+		prev_group_bh = group_bh;
+		group_bh = NULL;
+		status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
+					  next_group, &group_bh,
+					  OCFS2_BH_CACHED, alloc_inode);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		bg = (struct ocfs2_group_desc *) group_bh->b_data;
+		if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+			OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+			status = -EIO;
+			goto bail;
+		}
+	}
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "alloc succeeds: we give %u bits from block group %"MLFu64"\n",
+	     tmp_bits, le64_to_cpu(bg->bg_blkno));
+
+	*num_bits = tmp_bits;
+
+	BUG_ON(*num_bits == 0);
+
+	/*
+	 * Keep track of previous block descriptor read. When
+	 * we find a target, if we have read more than X
+	 * number of descriptors, and the target is reasonably
+	 * empty, relink him to top of his chain.
+	 *
+	 * We've read 0 extra blocks and only send one more to
+	 * the transaction, yet the next guy to search has a
+	 * much easier time.
+	 *
+	 * Do this *after* figuring out how many bits we're taking out
+	 * of our target group.
+	 */
+	if (ac->ac_allow_chain_relink &&
+	    (prev_group_bh) &&
+	    (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
+		status = ocfs2_relink_block_group(handle, alloc_inode,
+						  ac->ac_bh, group_bh,
+						  prev_group_bh, chain);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* Ok, claim our bits now: set the info on dinode, chainlist
+	 * and then the group */
+	status = ocfs2_journal_access(handle,
+				      alloc_inode,
+				      ac->ac_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
+	fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
+	le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
+
+	status = ocfs2_journal_dirty(handle,
+				     ac->ac_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_block_group_set_bits(handle,
+					    alloc_inode,
+					    bg,
+					    group_bh,
+					    *bit_off,
+					    *num_bits);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "Allocated %u bits from suballocator %"MLFu64"\n",
+	     *num_bits, le64_to_cpu(fe->i_blkno));
+
+	*bg_blkno = le64_to_cpu(bg->bg_blkno);
+bail:
+	if (group_bh)
+		brelse(group_bh);
+	if (prev_group_bh)
+		brelse(prev_group_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Claims up to bits_wanted contiguous bits; -ENOSPC if no chain can. */
+static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
+				     struct ocfs2_alloc_context *ac,
+				     u32 bits_wanted,
+				     u32 min_bits,
+				     u16 *bit_off,
+				     unsigned int *num_bits,
+				     u64 *bg_blkno)
+{
+	int status;
+	u16 victim, i;
+	struct ocfs2_chain_list *cl;
+	struct ocfs2_dinode *fe;
+
+	mlog_entry_void();
+
+	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
+	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
+	BUG_ON(!ac->ac_bh);
+
+	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
+		status = -EIO;
+		goto bail;
+	}
+	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
+	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
+		ocfs2_error(osb->sb, "Chain allocator dinode %"MLFu64" has %u "
+			    "used bits but only %u total.",
+			    le64_to_cpu(fe->i_blkno),
+			    le32_to_cpu(fe->id1.bitmap1.i_used),
+			    le32_to_cpu(fe->id1.bitmap1.i_total));
+		status = -EIO;
+		goto bail;
+	}
+
+	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
+	/* Try the chain with the most free bits first. */
+	victim = ocfs2_find_victim_chain(cl);
+	ac->ac_chain = victim;
+	ac->ac_allow_chain_relink = 1;
+
+	status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off,
+				    num_bits, bg_blkno);
+	if (!status)
+		goto bail;
+	if (status < 0 && status != -ENOSPC) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "Search of victim chain %u came up with nothing, "
+	     "trying all chains now.\n", victim);
+
+	/* If we didn't pick a good victim, then just default to
+	 * searching each chain in order. Don't allow chain relinking
+	 * because we only calculate enough journal credits for one
+	 * relink per alloc. */
+	ac->ac_allow_chain_relink = 0;
+	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
+		if (i == victim)
+			continue;
+		if (!cl->cl_recs[i].c_free)
+			continue;
+
+		ac->ac_chain = i;
+		status = ocfs2_search_chain(ac, bits_wanted, min_bits,
+					    bit_off, num_bits,
+					    bg_blkno);
+		if (!status)
+			break;
+		if (status < 0 && status != -ENOSPC) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+bail:
+
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_claim_metadata(struct ocfs2_super *osb,
+			 struct ocfs2_journal_handle *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 bits_wanted,
+			 u16 *suballoc_bit_start,
+			 unsigned int *num_bits,
+			 u64 *blkno_start)
+{
+	int ret;
+	u64 group_blkno;
+
+	/* The context must be a metadata one made for this handle,
+	 * with room left for the bits being requested. */
+	BUG_ON(!ac);
+	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
+	BUG_ON(ac->ac_handle != handle);
+
+	ret = ocfs2_claim_suballoc_bits(osb, ac, bits_wanted, 1,
+					suballoc_bit_start, num_bits,
+					&group_blkno);
+	if (ret < 0) {
+		mlog_errno(ret);
+	} else {
+		atomic_inc(&osb->alloc_stats.bg_allocs);
+
+		/* The first claimed block is the group's block
+		 * number plus the starting bit offset. */
+		*blkno_start = group_blkno + (u64) *suballoc_bit_start;
+		ac->ac_bits_given += (*num_bits);
+		ret = 0;
+	}
+
+	mlog_exit(ret);
+	return ret;
+}
+
+int ocfs2_claim_new_inode(struct ocfs2_super *osb,
+			  struct ocfs2_journal_handle *handle,
+			  struct ocfs2_alloc_context *ac,
+			  u16 *suballoc_bit,
+			  u64 *fe_blkno)
+{
+	int ret;
+	u64 group_blkno;
+	unsigned int claimed;
+
+	mlog_entry_void();
+
+	/* An inode context wants exactly one bit, has given none
+	 * yet, and must be used with the handle stored in it. */
+	BUG_ON(!ac);
+	BUG_ON(ac->ac_bits_given != 0);
+	BUG_ON(ac->ac_bits_wanted != 1);
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
+	BUG_ON(ac->ac_handle != handle);
+
+	ret = ocfs2_claim_suballoc_bits(osb, ac, 1, 1, suballoc_bit,
+					&claimed, &group_blkno);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	atomic_inc(&osb->alloc_stats.bg_allocs);
+
+	BUG_ON(claimed != 1);
+
+	/* The new inode's block is the group's block number plus
+	 * the bit we were given. */
+	*fe_blkno = group_blkno + (u64) (*suballoc_bit);
+	ac->ac_bits_given++;
+	ret = 0;
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+/* Convert a cluster group descriptor's block number plus a bit offset
+ * within that group's bitmap into a disk cluster offset. */
+static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
+						   u64 bg_blkno,
+						   u16 bg_bit_off)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u32 group_start;
+
+	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+
+	/* The first cluster group is treated as starting at cluster 0. */
+	group_start = (bg_blkno == osb->first_cluster_group_blkno) ? 0 :
+		      ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
+	return group_start + (u32) bg_bit_off;
+}
+
+/* Return the block number of the cluster group descriptor that covers
+ * the given cluster offset. */
+static inline u64 ocfs2_which_cluster_group(struct inode *inode,
+					    u32 cluster)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u32 group_index;
+
+	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+
+	group_index = cluster / osb->bitmap_cpg;
+	if (group_index == 0)
+		return osb->first_cluster_group_blkno;
+	return ocfs2_clusters_to_blocks(inode->i_sb,
+					osb->bitmap_cpg * group_index);
+}
+
+/* Map the block number at which a cluster starts to the cluster group
+ * holding it and to that cluster's bit offset in the group bitmap. */
+static inline void ocfs2_block_to_cluster_group(struct inode *inode,
+						u64 data_blkno,
+						u64 *bg_blkno,
+						u16 *bg_bit_off)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u32 cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
+	u64 group_blkno;
+
+	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
+
+	*bg_blkno = group_blkno = ocfs2_which_cluster_group(inode, cluster);
+
+	/* Bits in the first group are plain cluster numbers. */
+	if (group_blkno != osb->first_cluster_group_blkno)
+		cluster = ocfs2_blocks_to_clusters(osb->sb,
+						   data_blkno - group_blkno);
+	*bg_bit_off = (u16) cluster;
+}
+
+/*
+ * Claim clusters for the reservation in ac.  min_clusters is the
+ * minimum contiguous chunk we can handle: set it to what was asked
+ * for originally for a fully contiguous allocation, or to '1' to
+ * indicate we can deal with extents of any size.
+ */
+int ocfs2_claim_clusters(struct ocfs2_super *osb,
+			 struct ocfs2_journal_handle *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 min_clusters,
+			 u32 *cluster_start,
+			 u32 *num_clusters)
+{
+	int status;
+	unsigned int bits_wanted;	/* set once ac has been checked */
+	u64 bg_blkno;
+	u16 bg_bit_off;
+
+	mlog_entry_void();
+
+	BUG_ON(!ac);
+	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
+	       && ac->ac_which != OCFS2_AC_USE_MAIN);
+	BUG_ON(ac->ac_handle != handle);
+	bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
+
+	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
+		status = ocfs2_claim_local_alloc_bits(osb,
+						      handle,
+						      ac,
+						      bits_wanted,
+						      cluster_start,
+						      num_clusters);
+		if (!status)
+			atomic_inc(&osb->alloc_stats.local_data);
+	} else {
+		if (min_clusters > (osb->bitmap_cpg - 1)) {
+			/* The only paths asking for contiguousness
+			 * should know about this already. */
+			mlog(ML_ERROR, "minimum allocation requested exceeds "
+				       "group bitmap size!\n");
+			status = -ENOSPC;
+			goto bail;
+		}
+		/* clamp the current request down to a realistic size. */
+		if (bits_wanted > (osb->bitmap_cpg - 1))
+			bits_wanted = osb->bitmap_cpg - 1;
+
+		status = ocfs2_claim_suballoc_bits(osb,
+						   ac,
+						   bits_wanted,
+						   min_clusters,
+						   &bg_bit_off,
+						   num_clusters,
+						   &bg_blkno);
+		if (!status) {
+			*cluster_start =
+				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
+								 bg_blkno,
+								 bg_bit_off);
+			atomic_inc(&osb->alloc_stats.bitmap_data);
+		}
+	}
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	ac->ac_bits_given += *num_clusters;
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
+					       struct inode *alloc_inode,
+					       struct ocfs2_group_desc *bg,
+					       struct buffer_head *group_bh,
+					       unsigned int bit_off,
+					       unsigned int num_bits)
+{
+	int status;
+	unsigned int tmp;
+	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
+	struct ocfs2_group_desc *undo_bg = NULL;
+
+	mlog_entry_void();
+	/* Clear num_bits bits starting at bit_off and credit the free count. */
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+		status = -EIO;
+		goto bail;
+	}
+
+	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
+
+	if (ocfs2_is_cluster_bitmap(alloc_inode))
+		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
+
+	status = ocfs2_journal_access(handle, alloc_inode, group_bh,
+				      journal_type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	/* Freed cluster bits are also set in the committed copy. */
+	if (ocfs2_is_cluster_bitmap(alloc_inode))
+		undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
+
+	for (tmp = 0; tmp < num_bits; tmp++) {
+		ocfs2_clear_bit((bit_off + tmp),
+				(unsigned long *) bg->bg_bitmap);
+		if (ocfs2_is_cluster_bitmap(alloc_inode))
+			ocfs2_set_bit(bit_off + tmp,
+				      (unsigned long *) undo_bg->bg_bitmap);
+	}
+	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
+
+	status = ocfs2_journal_dirty(handle, group_bh);
+	if (status < 0)
+		mlog_errno(status);
+bail:
+	/* Pair the mlog_entry_void() above on every return path. */
+	mlog_exit(status);
+	return status;
+}
+
+/* Return count bits, starting at start_bit of the group at bg_blkno,
+ * to the allocator described by alloc_bh.  Expects the suballoc inode
+ * to already be locked. */
+static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
+				    struct inode *alloc_inode,
+				    struct buffer_head *alloc_bh,
+				    unsigned int start_bit,
+				    u64 bg_blkno,
+				    unsigned int count)
+{
+	int status = 0;
+	u32 tmp_used;
+	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
+	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
+	struct buffer_head *group_bh = NULL;
+	struct ocfs2_group_desc *group;
+
+	mlog_entry_void();
+
+	if (!OCFS2_IS_VALID_DINODE(fe)) {
+		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
+		status = -EIO;
+		goto bail;
+	}
+	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
+
+	mlog(0, "suballocator %"MLFu64": freeing %u bits from group %"MLFu64
+	     ", starting at %u\n",
+	     OCFS2_I(alloc_inode)->ip_blkno, count, bg_blkno,
+	     start_bit);
+
+	status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
+				  alloc_inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	group = (struct ocfs2_group_desc *) group_bh->b_data;
+	if (!OCFS2_IS_VALID_GROUP_DESC(group)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, group);
+		status = -EIO;
+		goto bail;
+	}
+	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
+	/* Clear the bits in the group, then fix up the allocator inode. */
+	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
+					      group, group_bh,
+					      start_bit, count);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	/* Credit the group's chain record and debit the inode's used count. */
+	le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
+		     count);
+	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
+	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
+
+	status = ocfs2_journal_dirty(handle, alloc_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+bail:
+	if (group_bh)
+		brelse(group_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
+{
+	/* The group's block number is the given block number minus
+	 * the block's bit offset within that group. */
+	return block - (u64) bit;
+}
+
+int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
+		      struct inode *inode_alloc_inode,
+		      struct buffer_head *inode_alloc_bh,
+		      struct ocfs2_dinode *di)
+{
+	u16 bit = le16_to_cpu(di->i_suballoc_bit);
+	u64 group = ocfs2_which_suballoc_group(le64_to_cpu(di->i_blkno), bit);
+
+	/* Exactly one bit is released for the inode. */
+	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
+					inode_alloc_bh, bit, group, 1);
+}
+
+int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
+			    struct inode *eb_alloc_inode,
+			    struct buffer_head *eb_alloc_bh,
+			    struct ocfs2_extent_block *eb)
+{
+	u16 bit = le16_to_cpu(eb->h_suballoc_bit);
+	u64 group = ocfs2_which_suballoc_group(le64_to_cpu(eb->h_blkno), bit);
+
+	/* Exactly one bit is released for the extent block. */
+	return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
+					bit, group, 1);
+}
+
+/* Free num_clusters clusters of the cluster bitmap, the first of which
+ * starts at block start_blk (start_blk must be cluster aligned). */
+int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
+			struct inode *bitmap_inode,
+			struct buffer_head *bitmap_bh,
+			u64 start_blk,
+			unsigned int num_clusters)
+{
+	int status;
+	u16 bg_start_bit;
+	u64 bg_blkno;
+
+	/* You can't ever have a contiguous set of clusters
+	 * bigger than a block group bitmap so we never have to worry
+	 * about looping on them. */
+
+	mlog_entry_void();
+
+	/* This is expensive. We can safely remove once this stuff has
+	 * gotten tested really well. */
+	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb,
+		ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
+
+	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
+				     &bg_start_bit);
+
+	mlog(0, "want to free %u clusters starting at block %"MLFu64"\n",
+	     num_clusters, start_blk);
+	mlog(0, "bg_blkno = %"MLFu64", bg_start_bit = %u\n",
+	     bg_blkno, bg_start_bit);
+
+	status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
+					  bg_start_bit, bg_blkno,
+					  num_clusters);
+	if (status < 0)
+		mlog_errno(status);
+
+	mlog_exit(status);
+	return status;
+}
+
+static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
+{
+	printk("Block Group:\n");	/* on-disk fields are little-endian */
+	printk("bg_signature:       %s\n", bg->bg_signature);
+	printk("bg_size:            %u\n", le16_to_cpu(bg->bg_size));
+	printk("bg_bits:            %u\n", le16_to_cpu(bg->bg_bits));
+	printk("bg_free_bits_count: %u\n", le16_to_cpu(bg->bg_free_bits_count));
+	printk("bg_chain:           %u\n", le16_to_cpu(bg->bg_chain));
+	printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
+	printk("bg_next_group:      %"MLFu64"\n", le64_to_cpu(bg->bg_next_group));
+	printk("bg_parent_dinode:   %"MLFu64"\n", le64_to_cpu(bg->bg_parent_dinode));
+	printk("bg_blkno:           %"MLFu64"\n", le64_to_cpu(bg->bg_blkno));
+}
+
+static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
+{
+	int i;
+	struct ocfs2_chain_list *cl = &fe->id2.i_chain;	/* fields are LE */
+
+	printk("Suballoc Inode %"MLFu64":\n", le64_to_cpu(fe->i_blkno));
+	printk("i_signature:                  %s\n", fe->i_signature);
+	printk("i_size:                       %"MLFu64"\n", le64_to_cpu(fe->i_size));
+	printk("i_clusters:                   %u\n", le32_to_cpu(fe->i_clusters));
+	printk("i_generation:                 %u\n", le32_to_cpu(fe->i_generation));
+	printk("id1.bitmap1.i_used:           %u\n",
+	       le32_to_cpu(fe->id1.bitmap1.i_used));
+	printk("id1.bitmap1.i_total:          %u\n",
+	       le32_to_cpu(fe->id1.bitmap1.i_total));
+	printk("id2.i_chain.cl_cpg:           %u\n", le16_to_cpu(cl->cl_cpg));
+	printk("id2.i_chain.cl_bpc:           %u\n", le16_to_cpu(cl->cl_bpc));
+	printk("id2.i_chain.cl_count:         %u\n", le16_to_cpu(cl->cl_count));
+	printk("id2.i_chain.cl_next_free_rec: %u\n",
+	       le16_to_cpu(cl->cl_next_free_rec));
+	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
+		printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
+		       le32_to_cpu(cl->cl_recs[i].c_free));
+		printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
+		       le32_to_cpu(cl->cl_recs[i].c_total));
+		printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %"MLFu64"\n", i,
+		       le64_to_cpu(cl->cl_recs[i].c_blkno));
+	}
+}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
new file mode 100644
index 0000000..a76c82a
--- /dev/null
+++ b/fs/ocfs2/suballoc.h
@@ -0,0 +1,132 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * suballoc.h
+ *
+ * Defines sub allocator api
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef _CHAINALLOC_H_
+#define _CHAINALLOC_H_
+
+typedef int (group_search_t)(struct inode *,
+			     struct buffer_head *,
+			     u32,
+			     u32,
+			     u16 *,
+			     u16 *);
+
+struct ocfs2_alloc_context {
+	struct inode *ac_inode;    /* which bitmap are we allocating from? */
+	struct buffer_head *ac_bh; /* file entry bh */
+	u32    ac_bits_wanted;     /* NOTE(review): presumably total reserved */
+	u32    ac_bits_given;      /* bits handed out so far */
+#define OCFS2_AC_USE_LOCAL 1
+#define OCFS2_AC_USE_MAIN  2
+#define OCFS2_AC_USE_INODE 3
+#define OCFS2_AC_USE_META  4
+	u32    ac_which;           /* one of the OCFS2_AC_USE_* values above */
+	struct ocfs2_journal_handle *ac_handle; /* claims must pass this handle */
+
+	/* these are used by the chain search */
+	u16    ac_chain;              /* index of the chain being searched */
+	int    ac_allow_chain_relink; /* nonzero: search may relink a group */
+	group_search_t *ac_group_search; /* called once per group searched */
+};
+
+void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
+static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
+{
+	return ac->ac_bits_wanted - ac->ac_bits_given; /* wanted minus given */
+}
+
+int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
+			       struct ocfs2_journal_handle *handle,
+			       struct ocfs2_dinode *fe,
+			       struct ocfs2_alloc_context **ac);
+int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct ocfs2_alloc_context **ac);
+int ocfs2_reserve_clusters(struct ocfs2_super *osb,
+			   struct ocfs2_journal_handle *handle,
+			   u32 bits_wanted,
+			   struct ocfs2_alloc_context **ac);
+
+int ocfs2_claim_metadata(struct ocfs2_super *osb,
+			 struct ocfs2_journal_handle *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 bits_wanted,
+			 u16 *suballoc_bit_start,
+			 u32 *num_bits,
+			 u64 *blkno_start);
+int ocfs2_claim_new_inode(struct ocfs2_super *osb,
+			  struct ocfs2_journal_handle *handle,
+			  struct ocfs2_alloc_context *ac,
+			  u16 *suballoc_bit,
+			  u64 *fe_blkno);
+int ocfs2_claim_clusters(struct ocfs2_super *osb,
+			 struct ocfs2_journal_handle *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 min_clusters,
+			 u32 *cluster_start,
+			 u32 *num_clusters);
+
+int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
+		      struct inode *inode_alloc_inode,
+		      struct buffer_head *inode_alloc_bh,
+		      struct ocfs2_dinode *di);
+int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
+			    struct inode *eb_alloc_inode,
+			    struct buffer_head *eb_alloc_bh,
+			    struct ocfs2_extent_block *eb);
+int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
+			struct inode *bitmap_inode,
+			struct buffer_head *bitmap_bh,
+			u64 start_blk,
+			unsigned int num_clusters);
+
+static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb,
+					  u64 bg_blkno)
+{
+	u32 first_cluster = 0;
+
+	/* This should work for all block group descriptors.  Only
+	 * the first group descriptor of the cluster bitmap is
+	 * special and maps to cluster 0; every other descriptor
+	 * sits at the beginning of its group's first cluster, so
+	 * its block number translates directly. */
+	if (bg_blkno != osb->first_cluster_group_blkno)
+		first_cluster = ocfs2_blocks_to_clusters(osb->sb, bg_blkno);
+
+	return first_cluster;
+}
+
+static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
+{
+	/* True when this inode's block number is osb->bitmap_blkno. */
+	return OCFS2_I(inode)->ip_blkno == OCFS2_SB(inode->i_sb)->bitmap_blkno;
+}
+
+/* This is for local alloc ONLY. Others should use the task-specific
+ * apis above. */
+int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
+				      struct ocfs2_alloc_context *ac);
+
+#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
new file mode 100644
index 0000000..48bf7f0
--- /dev/null
+++ b/fs/ocfs2/super.c
@@ -0,0 +1,1733 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * super.c
+ *
+ * load/unload driver, mount/dismount volumes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/random.h>
+#include <linux/statfs.h>
+#include <linux/moduleparam.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/parser.h>
+#include <linux/crc32.h>
+#include <linux/debugfs.h>
+
+#include <cluster/nodemanager.h>
+
+#define MLOG_MASK_PREFIX ML_SUPER
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+/* this should be the only file to include a version 1 header */
+#include "ocfs1_fs_compat.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "export.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "namei.h"
+#include "slot_map.h"
+#include "super.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "ver.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
+/*
+ * Globals
+ */
+static spinlock_t ocfs2_globals_lock = SPIN_LOCK_UNLOCKED;
+
+static u32 osb_id;             /* Keeps track of next available OSB Id */
+
+static kmem_cache_t *ocfs2_inode_cachep = NULL;
+
+kmem_cache_t *ocfs2_lock_cache = NULL;
+
+/* OCFS2 needs to schedule several different types of work which
+ * require cluster locking, disk I/O, recovery waits, etc. Since these
+ * types of work tend to be heavy we avoid using the kernel events
+ * workqueue and schedule on our own. */
+struct workqueue_struct *ocfs2_wq = NULL;
+
+static struct dentry *ocfs2_debugfs_root = NULL;
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+
+static int ocfs2_parse_options(struct super_block *sb, char *options,
+			       unsigned long *mount_opt, int is_remount);
+static void ocfs2_put_super(struct super_block *sb);
+static int ocfs2_mount_volume(struct super_block *sb);
+static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
+static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err);
+static int ocfs2_initialize_mem_caches(void);
+static void ocfs2_free_mem_caches(void);
+static void ocfs2_delete_osb(struct ocfs2_super *osb);
+
+static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf);
+
+static int ocfs2_sync_fs(struct super_block *sb, int wait);
+
+static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
+static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
+static int ocfs2_release_system_inodes(struct ocfs2_super *osb);
+static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
+static int ocfs2_check_volume(struct ocfs2_super *osb);
+static int ocfs2_verify_volume(struct ocfs2_dinode *di,
+			       struct buffer_head *bh,
+			       u32 sectsize);
+static int ocfs2_initialize_super(struct super_block *sb,
+				  struct buffer_head *bh,
+				  int sector_size);
+static int ocfs2_get_sector(struct super_block *sb,
+			    struct buffer_head **bh,
+			    int block,
+			    int sect_size);
+static void ocfs2_write_super(struct super_block *sb);
+static struct inode *ocfs2_alloc_inode(struct super_block *sb);
+static void ocfs2_destroy_inode(struct inode *inode);
+
+static unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
+
+static struct super_operations ocfs2_sops = {	/* installed as sb->s_op */
+	.statfs		= ocfs2_statfs,
+	.alloc_inode	= ocfs2_alloc_inode,
+	.destroy_inode	= ocfs2_destroy_inode,
+	.drop_inode	= ocfs2_drop_inode,
+	.clear_inode	= ocfs2_clear_inode,
+	.delete_inode	= ocfs2_delete_inode,
+	.sync_fs	= ocfs2_sync_fs,
+	.write_super	= ocfs2_write_super,
+	.put_super	= ocfs2_put_super,
+	.remount_fs	= ocfs2_remount,
+};
+
+enum {	/* mount option ids, one per pattern in tokens[] */
+	Opt_barrier,
+	Opt_err_panic,
+	Opt_err_ro,
+	Opt_intr,
+	Opt_nointr,
+	Opt_hb_none,
+	Opt_hb_local,
+	Opt_data_ordered,
+	Opt_data_writeback,
+	Opt_err,	/* paired with the NULL pattern that ends tokens[] */
+};
+
+static match_table_t tokens = {	/* matched by ocfs2_parse_options() */
+	{Opt_barrier, "barrier=%u"},
+	{Opt_err_panic, "errors=panic"},
+	{Opt_err_ro, "errors=remount-ro"},
+	{Opt_intr, "intr"},
+	{Opt_nointr, "nointr"},
+	{Opt_hb_none, OCFS2_HB_NONE},
+	{Opt_hb_local, OCFS2_HB_LOCAL},
+	{Opt_data_ordered, "data=ordered"},
+	{Opt_data_writeback, "data=writeback"},
+	{Opt_err, NULL}
+};
+
+/*
+ * write_super and sync_fs ripped right out of ext3.
+ */
+static void ocfs2_write_super(struct super_block *sb)
+{
+	if (down_trylock(&sb->s_lock) == 0)	/* took it => it was not held */
+		BUG();
+	sb->s_dirt = 0;	/* nothing is written here; only the flag is reset */
+}
+
+static int ocfs2_sync_fs(struct super_block *sb, int wait)
+{
+	int status = 0;
+	tid_t target;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	sb->s_dirt = 0;
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+	/* Flush the truncate log now if waiting, else just schedule it. */
+	if (wait) {
+		status = ocfs2_flush_truncate_log(osb);
+		if (status < 0)
+			mlog_errno(status);
+	} else {
+		ocfs2_schedule_truncate_log_flush(osb, 0);
+	}
+	/* Start a journal commit; only a waiting caller blocks on it. */
+	if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) {
+		if (wait)
+			log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
+					target);
+	}
+	return 0;	/* a truncate log flush error is logged, not returned */
+}
+
+/* Load the root inode, the system dir inode and the global system inodes. */
+static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
+{
+	struct inode *new = NULL;
+	int status = 0, i;
+
+	mlog_entry_void();
+
+	new = ocfs2_iget(osb, osb->root_blkno);
+	if (IS_ERR(new)) {
+		status = PTR_ERR(new);
+		mlog_errno(status);
+		goto bail;
+	}
+	osb->root_inode = new;
+
+	new = ocfs2_iget(osb, osb->system_dir_blkno);
+	if (IS_ERR(new)) {
+		status = PTR_ERR(new);
+		mlog_errno(status);
+		goto bail;
+	}
+	osb->sys_root_inode = new;
+
+	for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
+	     i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
+		new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
+		if (!new) {
+			ocfs2_release_system_inodes(osb);
+			status = -EINVAL;
+			mlog_errno(status);
+			/* FIXME: Should ERROR_RO_FS */
+			mlog(ML_ERROR, "Unable to load system inode %d, "
+			     "possibly corrupt fs?\n", i);
+			goto bail;
+		}
+		/* the array now has one ref, so drop this one */
+		iput(new);
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
+{
+	struct inode *new = NULL;
+	int status = 0;
+	int i;
+
+	mlog_entry_void();
+	/* walk the system inodes that follow the last global one */
+	for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
+	     i < NUM_SYSTEM_INODES;
+	     i++) {
+		new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
+		if (!new) {
+			ocfs2_release_system_inodes(osb);	/* undo all loads */
+			status = -EINVAL;
+			mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n",
+			     status, i, osb->slot_num);
+			goto bail;
+		}
+		/* the array now has one ref, so drop this one */
+		iput(new);
+	}
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* Drop the refs held on all cached system inodes and both root inodes. */
+static int ocfs2_release_system_inodes(struct ocfs2_super *osb)
+{
+	int idx;
+	int status = 0;
+
+	mlog_entry_void();
+
+	for (idx = 0; idx < NUM_SYSTEM_INODES; idx++) {
+		if (!osb->system_inodes[idx])
+			continue;
+		iput(osb->system_inodes[idx]);
+		osb->system_inodes[idx] = NULL;
+	}
+
+	/* every pointer is cleared once put, so calling this a second
+	 * time finds nothing left to release */
+	if (osb->sys_root_inode) {
+		iput(osb->sys_root_inode);
+		osb->sys_root_inode = NULL;
+	}
+
+	if (osb->root_inode) {
+		iput(osb->root_inode);
+		osb->root_inode = NULL;
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+/* Allocate an ocfs2 inode from the slab and hand its embedded VFS inode
+ * back.  These are fs objects, so the allocation uses SLAB_NOFS. */
+static struct inode *ocfs2_alloc_inode(struct super_block *sb)
+{
+	struct ocfs2_inode_info *info;
+
+	info = kmem_cache_alloc(ocfs2_inode_cachep, SLAB_NOFS);
+	if (info)
+		return &info->vfs_inode;
+	return NULL;
+}
+
+static void ocfs2_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));	/* back to slab */
+}
+
+/* From xfs_super.c:xfs_max_file_offset
+ * Copyright (c) 2000-2004 Silicon Graphics, Inc.
+ */
+static unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
+{
+	unsigned int pagefactor = 1;
+	unsigned int bitshift = BITS_PER_LONG - 1;
+
+	/* Figure out maximum filesize, on Linux this can depend on
+	 * the filesystem blocksize (on 32 bit platforms).
+	 * __block_prepare_write does this in an [unsigned] long...
+	 *      page->index << (PAGE_CACHE_SHIFT - bbits)
+	 * So, for page sized blocks (4K on 32 bit platforms),
+	 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
+	 *      (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
+	 * but for smaller blocksizes it is less (bbits = log2 bsize).
+	 * Note1: get_block_t takes a long (implicit cast from above)
+	 * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
+	 * can optionally convert the [unsigned] long from above into
+	 * an [unsigned] long long.
+	 */
+	/* Only 32-bit builds change the two defaults set above. */
+#if BITS_PER_LONG == 32
+# if defined(CONFIG_LBD)
+	BUG_ON(sizeof(sector_t) != 8);
+	pagefactor = PAGE_CACHE_SIZE;
+	bitshift = BITS_PER_LONG;
+# else
+	pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
+# endif
+#endif
+
+	return (((unsigned long long)pagefactor) << bitshift) - 1;
+}
+
+static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
+{
+	int incompat_features;
+	int ret = 0;
+	unsigned long parsed_options;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
+		ret = -EINVAL;
+		goto out;
+	}
+	/* The heartbeat and data modes cannot be changed by a remount. */
+	if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) !=
+	    (parsed_options & OCFS2_MOUNT_HB_LOCAL)) {
+		ret = -EINVAL;
+		mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
+		goto out;
+	}
+
+	if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) !=
+	    (parsed_options & OCFS2_MOUNT_DATA_WRITEBACK)) {
+		ret = -EINVAL;
+		mlog(ML_ERROR, "Cannot change data mode on remount\n");
+		goto out;
+	}
+
+	/* We're going to/from readonly mode. */
+	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+		/* Lock here so the check of HARD_RO and the potential
+		 * setting of SOFT_RO is atomic. */
+		spin_lock(&osb->osb_lock);
+		if (osb->osb_flags & OCFS2_OSB_HARD_RO) {
+			mlog(ML_ERROR, "Remount on readonly device is forbidden.\n");
+			ret = -EROFS;
+			goto unlock_osb;
+		}
+
+		if (*flags & MS_RDONLY) {
+			mlog(0, "Going to ro mode.\n");
+			sb->s_flags |= MS_RDONLY;
+			osb->osb_flags |= OCFS2_OSB_SOFT_RO;
+		} else {
+			mlog(0, "Making ro filesystem writeable.\n");
+			/* refuse rw after an fs error or with unknown ro-compat bits */
+			if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
+				mlog(ML_ERROR, "Cannot remount RDWR "
+				     "filesystem due to previous errors.\n");
+				ret = -EROFS;
+				goto unlock_osb;
+			}
+			incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP);
+			if (incompat_features) {
+				mlog(ML_ERROR, "Cannot remount RDWR because "
+				     "of unsupported optional features "
+				     "(%x).\n", incompat_features);
+				ret = -EINVAL;
+				goto unlock_osb;
+			}
+			sb->s_flags &= ~MS_RDONLY;
+			osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
+		}
+unlock_osb:
+		spin_unlock(&osb->osb_lock);
+	}
+
+	if (!ret) {
+		if (!ocfs2_is_hard_readonly(osb))
+			ocfs2_set_journal_params(osb);
+
+		/* Only save off the new mount options in case of a successful
+		 * remount. */
+		osb->s_mount_opt = parsed_options;
+	}
+out:
+	return ret;
+}
+
+static int ocfs2_sb_probe(struct super_block *sb,
+			  struct buffer_head **bh,
+			  int *sector_size)
+{
+	int status = 0, tmpstat;
+	struct ocfs1_vol_disk_hdr *hdr;
+	struct ocfs2_dinode *di;
+	int blksize;
+
+	*bh = NULL;	/* stays NULL unless the probe succeeds */
+
+	/* hardware sector size; may be > 512 */
+	*sector_size = bdev_hardsect_size(sb->s_bdev);
+	if (*sector_size > OCFS2_MAX_BLOCKSIZE) {
+		mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n",
+		     *sector_size, OCFS2_MAX_BLOCKSIZE);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	/* Can this really happen? */
+	if (*sector_size < OCFS2_MIN_BLOCKSIZE)
+		*sector_size = OCFS2_MIN_BLOCKSIZE;
+
+	/* check block zero for old format; either test below rejects it */
+	status = ocfs2_get_sector(sb, bh, 0, *sector_size);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data;
+	if (hdr->major_version == OCFS1_MAJOR_VERSION) {
+		mlog(ML_ERROR, "incompatible version: %u.%u\n",
+		     hdr->major_version, hdr->minor_version);
+		status = -EINVAL;
+	}
+	if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE,
+		   strlen(OCFS1_VOLUME_SIGNATURE)) == 0) {
+		mlog(ML_ERROR, "incompatible volume signature: %8s\n",
+		     hdr->signature);
+		status = -EINVAL;
+	}
+	brelse(*bh);
+	*bh = NULL;
+	if (status < 0) {
+		mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be "
+		     "upgraded before mounting with ocfs v2\n");
+		goto bail;
+	}
+
+	/*
+	 * Now check at magic offset for 512, 1024, 2048, 4096
+	 * blocksizes.  4096 is the maximum blocksize because it is
+	 * the minimum clustersize.
+	 */
+	status = -EINVAL;
+	for (blksize = *sector_size;
+	     blksize <= OCFS2_MAX_BLOCKSIZE;
+	     blksize <<= 1) {
+		tmpstat = ocfs2_get_sector(sb, bh,
+					   OCFS2_SUPER_BLOCK_BLKNO,
+					   blksize);
+		if (tmpstat < 0) {
+			status = tmpstat;
+			mlog_errno(status);
+			goto bail;
+		}
+		di = (struct ocfs2_dinode *) (*bh)->b_data;
+		status = ocfs2_verify_volume(di, *bh, blksize);
+		if (status >= 0)	/* verified: *bh stays held for the caller */
+			goto bail;
+		brelse(*bh);
+		*bh = NULL;
+		if (status != -EAGAIN)	/* only -EAGAIN tries the next size */
+			break;
+	}
+
+bail:
+	return status;
+}
+
+static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct dentry *root;
+	int status, sector_size;
+	unsigned long parsed_opt;
+	struct inode *inode = NULL;
+	struct ocfs2_super *osb = NULL;
+	struct buffer_head *bh = NULL;
+
+	mlog_entry("%p, %p, %i", sb, data, silent);
+
+	/* for now we only have one cluster/node, make sure we see it
+	 * in the heartbeat universe */
+	if (!o2hb_check_local_node_heartbeating()) {
+		status = -EINVAL;
+		goto read_super_error;
+	}
+
+	/* probe for superblock */
+	status = ocfs2_sb_probe(sb, &bh, &sector_size);
+	if (status < 0) {
+		mlog(ML_ERROR, "superblock probe failed!\n");
+		goto read_super_error;
+	}
+
+	status = ocfs2_initialize_super(sb, bh, sector_size);
+	osb = OCFS2_SB(sb);	/* set before the check so the error path sees it */
+	if (status < 0) {
+		mlog_errno(status);
+		goto read_super_error;
+	}
+	brelse(bh);
+	bh = NULL;
+
+	if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) {
+		status = -EINVAL;
+		goto read_super_error;
+	}
+	osb->s_mount_opt = parsed_opt;
+
+	sb->s_magic = OCFS2_SUPER_MAGIC;
+
+	/* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
+	 * heartbeat=none */
+	if (bdev_read_only(sb->s_bdev)) {
+		if (!(sb->s_flags & MS_RDONLY)) {
+			status = -EACCES;
+			mlog(ML_ERROR, "Readonly device detected but readonly "
+			     "mount was not specified.\n");
+			goto read_super_error;
+		}
+
+		/* You should not be able to start a local heartbeat
+		 * on a readonly device. */
+		if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+			status = -EROFS;
+			mlog(ML_ERROR, "Local heartbeat specified on readonly "
+			     "device.\n");
+			goto read_super_error;
+		}
+
+		status = ocfs2_check_journals_nolocks(osb);
+		if (status < 0) {
+			if (status == -EROFS)
+				mlog(ML_ERROR, "Recovery required on readonly "
+				     "file system, but write access is "
+				     "unavailable.\n");
+			else
+				mlog_errno(status);			
+			goto read_super_error;
+		}
+
+		ocfs2_set_ro_flag(osb, 1);
+
+		printk(KERN_NOTICE "Readonly device detected. No cluster "
+		       "services will be utilized for this mount. Recovery "
+		       "will be skipped.\n");
+	}
+
+	if (!ocfs2_is_hard_readonly(osb)) {
+		/* If this isn't a hard readonly mount, then we need
+		 * to make sure that heartbeat is in a valid state,
+		 * and that we mark ourselves soft readonly if -o ro
+		 * was specified. */
+		if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
+			mlog(ML_ERROR, "No heartbeat for device (%s)\n",
+			     sb->s_id);
+			status = -EINVAL;
+			goto read_super_error;
+		}
+
+		if (sb->s_flags & MS_RDONLY)
+			ocfs2_set_ro_flag(osb, 0);
+	}
+
+	osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
+						 ocfs2_debugfs_root);
+	if (!osb->osb_debug_root) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n");
+		goto read_super_error;
+	}
+
+	status = ocfs2_mount_volume(sb);
+	if (osb->root_inode)
+		inode = igrab(osb->root_inode);
+	/* the grab happens even on failure; the error path puts it again */
+	if (status < 0)
+		goto read_super_error;
+
+	if (!inode) {
+		status = -EIO;
+		mlog_errno(status);
+		goto read_super_error;
+	}
+
+	root = d_alloc_root(inode);
+	if (!root) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto read_super_error;
+	}
+
+	sb->s_root = root;
+
+	ocfs2_complete_mount_recovery(osb);
+
+	printk("ocfs2: Mounting device (%u,%u) on (node %d, slot %d) with %s "
+	       "data mode.\n",
+	       MAJOR(sb->s_dev), MINOR(sb->s_dev), osb->node_num,
+	       osb->slot_num,
+	       osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
+	       "ordered");
+
+	atomic_set(&osb->vol_state, VOLUME_MOUNTED);
+	wake_up(&osb->osb_mount_event);
+
+	mlog_exit(status);
+	return status;
+
+read_super_error:
+	if (bh != NULL)
+		brelse(bh);
+
+	if (inode)
+		iput(inode);
+	/* osb is still NULL if we failed before or while allocating it */
+	if (osb) {
+		atomic_set(&osb->vol_state, VOLUME_DISABLED);
+		wake_up(&osb->osb_mount_event);
+		ocfs2_dismount_volume(sb, 1);	/* mnt_err: heartbeat not stopped */
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+static struct super_block *ocfs2_get_sb(struct file_system_type *fs_type,
+					int flags,
+					const char *dev_name,
+					void *data)
+{	/* block-device backed; ocfs2_fill_super() does the real setup */
+	return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
+}
+
+static struct file_system_type ocfs2_fs_type = {
+	.owner          = THIS_MODULE,
+	.name           = "ocfs2",
+	.get_sb         = ocfs2_get_sb, /* wraps get_sb_bdev() with
+					* ocfs2_fill_super() */
+	.kill_sb        = kill_block_super, /* set to the generic one
+					     * right now, but do we
+					     * need to change that? */
+	.fs_flags       = FS_REQUIRES_DEV,
+	.next           = NULL
+};
+
+static int ocfs2_parse_options(struct super_block *sb,
+			       char *options,
+			       unsigned long *mount_opt,
+			       int is_remount)
+{
+	int status;
+	char *p;
+
+	mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
+		   options ? options : "(none)");
+	/* returns 1 on success, 0 on a bad option; flags start from zero */
+	*mount_opt = 0;
+
+	if (!options) {
+		status = 1;
+		goto bail;
+	}
+	/* strsep() consumes @options in place, one comma-separated field */
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token, option;
+		substring_t args[MAX_OPT_ARGS];
+
+		if (!*p)	/* skip empty fields such as ",," */
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_hb_local:
+			*mount_opt |= OCFS2_MOUNT_HB_LOCAL;
+			break;
+		case Opt_hb_none:
+			*mount_opt &= ~OCFS2_MOUNT_HB_LOCAL;
+			break;
+		case Opt_barrier:
+			if (match_int(&args[0], &option)) {
+				status = 0;
+				goto bail;
+			}
+			if (option)
+				*mount_opt |= OCFS2_MOUNT_BARRIER;
+			else
+				*mount_opt &= ~OCFS2_MOUNT_BARRIER;
+			break;
+		case Opt_intr:
+			*mount_opt &= ~OCFS2_MOUNT_NOINTR;
+			break;
+		case Opt_nointr:
+			*mount_opt |= OCFS2_MOUNT_NOINTR;
+			break;
+		case Opt_err_panic:
+			*mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
+			break;
+		case Opt_err_ro:
+			*mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
+			break;
+		case Opt_data_ordered:
+			*mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
+			break;
+		case Opt_data_writeback:
+			*mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
+			break;
+		default:
+			mlog(ML_ERROR,
+			     "Unrecognized mount option \"%s\" "
+			     "or missing value\n", p);
+			status = 0;
+			goto bail;
+		}
+	}
+
+	status = 1;
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* Module init: set up state shared by all mounts, then register ocfs2. */
+static int __init ocfs2_init(void)
+{
+	int status;
+
+	mlog_entry_void();
+	ocfs2_print_version();
+
+	if (init_ocfs2_extent_maps())
+		return -ENOMEM;
+
+	status = init_ocfs2_uptodate_cache();
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_initialize_mem_caches();
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
+	if (!ocfs2_wq) {
+		status = -ENOMEM;
+		goto leave;
+	}
+
+	spin_lock(&ocfs2_globals_lock);
+	osb_id = 0;
+	spin_unlock(&ocfs2_globals_lock);
+
+	ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
+	if (!ocfs2_debugfs_root) {
+		status = -EFAULT;
+		mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
+	}
+
+leave:
+	if (status < 0) {
+		/* the workqueue already exists if only debugfs setup failed */
+		if (ocfs2_wq)
+			destroy_workqueue(ocfs2_wq);
+		ocfs2_wq = NULL;
+		ocfs2_free_mem_caches();
+		exit_ocfs2_uptodate_cache();
+		exit_ocfs2_extent_maps();
+	}
+	mlog_exit(status);
+	/* hand back the real errno rather than a bare -1 (-EPERM) */
+	return status < 0 ? status : register_filesystem(&ocfs2_fs_type);
+}
+
+static void __exit ocfs2_exit(void)
+{
+	mlog_entry_void();
+	/* undo ocfs2_init(); queued work is flushed before the queue dies */
+	if (ocfs2_wq) {
+		flush_workqueue(ocfs2_wq);
+		destroy_workqueue(ocfs2_wq);
+	}
+
+	debugfs_remove(ocfs2_debugfs_root);
+
+	ocfs2_free_mem_caches();
+	/* NOTE(review): the fs type is unregistered only after its caches go */
+	unregister_filesystem(&ocfs2_fs_type);
+
+	exit_ocfs2_extent_maps();
+
+	exit_ocfs2_uptodate_cache();
+
+	mlog_exit_void();
+}
+
+static void ocfs2_put_super(struct super_block *sb)
+{
+	mlog_entry("(0x%p)\n", sb);
+	/* sync the block device, then dismount with mnt_err == 0 */
+	ocfs2_sync_blockdev(sb);
+	ocfs2_dismount_volume(sb, 0);
+
+	mlog_exit_void();
+}
+
+static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
+{
+	struct ocfs2_super *osb;
+	u32 numbits, freebits;
+	int status;
+	struct ocfs2_dinode *bm_lock;
+	struct buffer_head *bh = NULL;
+	struct inode *inode = NULL;
+
+	mlog_entry("(%p, %p)\n", sb, buf);
+
+	osb = OCFS2_SB(sb);
+	/* all figures reported below come from the global bitmap inode */
+	inode = ocfs2_get_system_file_inode(osb,
+					    GLOBAL_BITMAP_SYSTEM_INODE,
+					    OCFS2_INVALID_SLOT);
+	if (!inode) {
+		mlog(ML_ERROR, "failed to get bitmap inode\n");
+		status = -EIO;
+		goto bail;
+	}
+
+	status = ocfs2_meta_lock(inode, NULL, &bh, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	bm_lock = (struct ocfs2_dinode *) bh->b_data;
+	/* the on-disk counters are little endian; free = total - used */
+	numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total);
+	freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used);
+
+	buf->f_type = OCFS2_SUPER_MAGIC;
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_namelen = OCFS2_MAX_FILENAME_LEN;
+	buf->f_blocks = ((sector_t) numbits) *
+			(osb->s_clustersize >> osb->sb->s_blocksize_bits);
+	buf->f_bfree = ((sector_t) freebits) *
+		       (osb->s_clustersize >> osb->sb->s_blocksize_bits);
+	buf->f_bavail = buf->f_bfree;
+	buf->f_files = numbits;
+	buf->f_ffree = freebits;
+
+	brelse(bh);
+
+	ocfs2_meta_unlock(inode, 0);
+	status = 0;
+bail:
+	if (inode)
+		iput(inode);
+
+	mlog_exit(status);
+
+	return status;
+}
+
+static void ocfs2_inode_init_once(void *data,
+				  kmem_cache_t *cachep,
+				  unsigned long flags)
+{
+	struct ocfs2_inode_info *oi = data;
+	/* act on a real constructor call only, not a SLAB_CTOR_VERIFY pass */
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR) {
+		oi->ip_flags = 0;
+		oi->ip_open_count = 0;
+		spin_lock_init(&oi->ip_lock);
+		ocfs2_extent_map_init(&oi->vfs_inode);
+		INIT_LIST_HEAD(&oi->ip_handle_list);
+		INIT_LIST_HEAD(&oi->ip_io_markers);
+		oi->ip_handle = NULL;
+		oi->ip_created_trans = 0;
+		oi->ip_last_trans = 0;
+		oi->ip_dir_start_lookup = 0;
+
+		init_rwsem(&oi->ip_alloc_sem);
+		init_MUTEX(&(oi->ip_io_sem));
+
+		oi->ip_blkno = 0ULL;
+		oi->ip_clusters = 0;
+		/* one lock resource each for rw, metadata and data */
+		ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
+		ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
+		ocfs2_lock_res_init_once(&oi->ip_data_lockres);
+
+		ocfs2_metadata_cache_init(&oi->vfs_inode);
+
+		inode_init_once(&oi->vfs_inode);
+	}
+}
+
+static int ocfs2_initialize_mem_caches(void)
+{
+	ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache",
+					       sizeof(struct ocfs2_inode_info),
+					       0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
+					       ocfs2_inode_init_once, NULL);
+	if (!ocfs2_inode_cachep)
+		return -ENOMEM;
+	/* a failure below leaves the inode cache for the caller to free */
+	ocfs2_lock_cache = kmem_cache_create("ocfs2_lock",
+					     sizeof(struct ocfs2_journal_lock),
+					     0,
+					     SLAB_NO_REAP|SLAB_HWCACHE_ALIGN,
+					     NULL, NULL);
+	if (!ocfs2_lock_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void ocfs2_free_mem_caches(void)
+{
+	if (ocfs2_inode_cachep)
+		kmem_cache_destroy(ocfs2_inode_cachep);
+	if (ocfs2_lock_cache)
+		kmem_cache_destroy(ocfs2_lock_cache);
+	/* forget both so that a second call finds nothing to destroy */
+	ocfs2_inode_cachep = NULL;
+	ocfs2_lock_cache = NULL;
+}
+
+static int ocfs2_get_sector(struct super_block *sb, struct buffer_head **bh,
+			    int block, int sect_size)
+{
+	if (!sb_set_blocksize(sb, sect_size)) {	/* read at this blocksize */
+		mlog(ML_ERROR, "unable to set blocksize\n");
+		return -EIO;
+	}
+	*bh = sb_getblk(sb, block);
+	if (*bh) {
+		lock_buffer(*bh);
+		if (!buffer_dirty(*bh))
+			clear_buffer_uptodate(*bh);
+		unlock_buffer(*bh);
+		ll_rw_block(READ, 1, bh);
+		wait_on_buffer(*bh);
+		if (buffer_uptodate(*bh))	/* the read really completed */
+			return 0;
+		brelse(*bh);	/* I/O error: never hand back a stale buffer */
+		*bh = NULL;
+	}
+	mlog_errno(-EIO);	/* no buffer, or the read failed */
+	return -EIO;
+}
+
+/*
+ * ocfs2 1.0 only allows one cluster and node identity per kernel image,
+ * so the local node number is simply whatever o2nm_this_node() reports.
+ *
+ * Returns 0, or -ENOENT when this host has no node number.
+ */
+static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
+{
+	/* XXX hold a ref on the node while mounted?  easy enough, if
+	 * desirable. */
+	osb->node_num = o2nm_this_node();
+
+	if (osb->node_num == O2NM_MAX_NODES) {
+		mlog(ML_ERROR, "could not find this host's node number\n");
+		return -ENOENT;
+	}
+
+	mlog(ML_NOTICE, "I am node %d\n", osb->node_num);
+	return 0;
+}
+
+static int ocfs2_mount_volume(struct super_block *sb)
+{
+	int status = 0;
+	int unlock_super = 0;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	mlog_entry_void();
+	/* a hard readonly mount skips every step below and returns 0 */
+	if (ocfs2_is_hard_readonly(osb))
+		goto leave;
+
+	status = ocfs2_fill_local_node_info(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_register_hb_callbacks(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_dlm_init(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* requires vote_thread to be running. */
+	status = ocfs2_register_net_handlers(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_super_lock(osb, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+	unlock_super = 1;	/* from here on "leave" must drop the lock */
+
+	/* This will load up the node map and add ourselves to it. */
+	status = ocfs2_find_slot(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	ocfs2_populate_mounted_map(osb);
+
+	/* load all node-local system inodes */
+	status = ocfs2_init_local_system_inodes(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_check_volume(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_truncate_log_init(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* This should be sent *after* we recovered our journal as it
+	 * will cause other nodes to unmark us as needing
+	 * recovery. However, we need to send it *before* dropping the
+	 * super block lock as otherwise their recovery threads might
+	 * try to clean us up while we're live! */
+	status = ocfs2_request_mount_vote(osb);
+	if (status < 0)
+		mlog_errno(status);
+
+leave:
+	if (unlock_super)
+		ocfs2_super_unlock(osb, 1);
+
+	mlog_exit(status);
+	return status;
+}
+
+/* we can't grab the goofy sem lock from inside wait_event, so we use
+ * memory barriers to make sure that we'll see the null task before
+ * being woken up */
+static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
+{
+	mb();	/* full barrier before sampling the task pointer */
+	return osb->recovery_thread_task != NULL;
+}
+
+static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
+{
+	int tmp;
+	struct ocfs2_super *osb = NULL;
+
+	mlog_entry("(0x%p)\n", sb);
+
+	BUG_ON(!sb);
+	osb = OCFS2_SB(sb);
+	BUG_ON(!osb);
+
+	ocfs2_shutdown_local_alloc(osb);
+
+	ocfs2_truncate_log_shutdown(osb);
+
+	/* disable any new recovery threads and wait for any currently
+	 * running ones to exit. Do this before setting the vol_state. */
+	down(&osb->recovery_lock);
+	osb->disable_recovery = 1;
+	up(&osb->recovery_lock);
+	wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
+
+	/* At this point, we know that no more recovery threads can be
+	 * launched, so wait for any recovery completion work to
+	 * complete. */
+	flush_workqueue(ocfs2_wq);
+
+	ocfs2_journal_shutdown(osb);
+
+	ocfs2_sync_blockdev(sb);
+
+	/* No dlm means we've failed during mount, so skip all the
+	 * steps which depended on that to complete. */
+	if (osb->dlm) {
+		tmp = ocfs2_super_lock(osb, 1);
+		if (tmp < 0) {
+			mlog_errno(tmp);
+			return;	/* NOTE(review): skips the remaining teardown */
+		}
+
+		tmp = ocfs2_request_umount_vote(osb);
+		if (tmp < 0)
+			mlog_errno(tmp);
+
+		if (osb->slot_num != OCFS2_INVALID_SLOT)
+			ocfs2_put_slot(osb);
+
+		ocfs2_super_unlock(osb, 1);
+	}
+
+	ocfs2_release_system_inodes(osb);
+
+	if (osb->dlm) {
+		ocfs2_unregister_net_handlers(osb);
+
+		ocfs2_dlm_shutdown(osb);
+	}
+
+	ocfs2_clear_hb_callbacks(osb);
+
+	debugfs_remove(osb->osb_debug_root);
+	/* on the failed-mount path (mnt_err != 0) heartbeat is left alone */
+	if (!mnt_err)
+		ocfs2_stop_heartbeat(osb);
+
+	atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
+
+	printk("ocfs2: Unmounting device (%u,%u) on (node %d)\n",
+	       MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev), osb->node_num);
+
+	ocfs2_delete_osb(osb);
+	kfree(osb);
+	sb->s_dev = 0;
+	sb->s_fs_info = NULL;
+}
+
+static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid,
+				unsigned uuid_bytes)
+{
+	int i, ret;
+	char *ptr;
+
+	BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN);
+	/* room for two hex digits per uuid byte plus the trailing NUL */
+	osb->uuid_str = kcalloc(1, OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL);
+	if (osb->uuid_str == NULL)
+		return -ENOMEM;
+
+	memcpy(osb->uuid, uuid, OCFS2_VOL_UUID_LEN);
+
+	for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) {
+		/* print with null */
+		ret = snprintf(ptr, 3, "%02X", uuid[i]);
+		if (ret != 2) /* drop super cleans up */
+			return -EINVAL;
+		/* then only advance past the last char */
+		ptr += 2;
+	}
+
+	return 0;
+}
+
+static int ocfs2_initialize_super(struct super_block *sb,
+				  struct buffer_head *bh,
+				  int sector_size)
+{
+	int status = 0;
+	int i;
+	struct ocfs2_dinode *di = NULL;
+	struct inode *inode = NULL;
+	struct buffer_head *bitmap_bh = NULL;
+	struct ocfs2_journal *journal;
+	__le32 uuid_net_key;
+	struct ocfs2_super *osb;
+
+	mlog_entry_void();
+
+	osb = kcalloc(1, sizeof(struct ocfs2_super), GFP_KERNEL);
+	if (!osb) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	sb->s_fs_info = osb;
+	sb->s_op = &ocfs2_sops;
+	sb->s_export_op = &ocfs2_export_ops;
+	sb->s_flags |= MS_NOATIME;
+	/* this is needed to support O_LARGEFILE */
+	sb->s_maxbytes = ocfs2_max_file_offset(sb->s_blocksize_bits);
+
+	osb->sb = sb;
+	/* Save off for ocfs2_rw_direct */
+	osb->s_sectsize_bits = blksize_bits(sector_size);
+	if (!osb->s_sectsize_bits)
+		BUG();
+
+	osb->net_response_ids = 0;
+	spin_lock_init(&osb->net_response_lock);
+	INIT_LIST_HEAD(&osb->net_response_list);
+
+	INIT_LIST_HEAD(&osb->osb_net_handlers);
+	init_waitqueue_head(&osb->recovery_event);
+	spin_lock_init(&osb->vote_task_lock);
+	init_waitqueue_head(&osb->vote_event);
+	osb->vote_work_sequence = 0;
+	osb->vote_wake_sequence = 0;
+	INIT_LIST_HEAD(&osb->blocked_lock_list);
+	osb->blocked_lock_count = 0;
+	INIT_LIST_HEAD(&osb->vote_list);
+	spin_lock_init(&osb->osb_lock);
+
+	atomic_set(&osb->alloc_stats.moves, 0);
+	atomic_set(&osb->alloc_stats.local_data, 0);
+	atomic_set(&osb->alloc_stats.bitmap_data, 0);
+	atomic_set(&osb->alloc_stats.bg_allocs, 0);
+	atomic_set(&osb->alloc_stats.bg_extends, 0);
+
+	ocfs2_init_node_maps(osb);
+
+	snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
+		 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+
+	init_MUTEX(&osb->recovery_lock);
+
+	osb->disable_recovery = 0;
+	osb->recovery_thread_task = NULL;
+
+	init_waitqueue_head(&osb->checkpoint_event);
+	atomic_set(&osb->needs_checkpoint, 0);
+
+	osb->node_num = O2NM_INVALID_NODE_NUM;
+	osb->slot_num = OCFS2_INVALID_SLOT;
+
+	osb->local_alloc_state = OCFS2_LA_UNUSED;
+	osb->local_alloc_bh = NULL;
+
+	ocfs2_setup_hb_callbacks(osb);
+
+	init_waitqueue_head(&osb->osb_mount_event);
+
+	osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
+	if (!osb->vol_label) {
+		mlog(ML_ERROR, "unable to alloc vol label\n");
+		status = -ENOMEM;
+		goto bail;
+	}
+
+	osb->uuid = kmalloc(OCFS2_VOL_UUID_LEN, GFP_KERNEL);
+	if (!osb->uuid) {
+		mlog(ML_ERROR, "unable to alloc uuid\n");
+		status = -ENOMEM;
+		goto bail;
+	}
+
+	di = (struct ocfs2_dinode *)bh->b_data;
+
+	osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
+	if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
+		mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
+		     osb->max_slots);
+		status = -EINVAL;
+		goto bail;
+	}
+	mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots);
+
+	osb->s_feature_compat =
+		le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
+	osb->s_feature_ro_compat =
+		le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat);
+	osb->s_feature_incompat =
+		le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat);
+
+	if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) {
+		mlog(ML_ERROR, "couldn't mount because of unsupported "
+		     "optional features (%x).\n", i);
+		status = -EINVAL;
+		goto bail;
+	}
+	if (!(osb->sb->s_flags & MS_RDONLY) &&
+	    (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) {
+		mlog(ML_ERROR, "couldn't mount RDWR because of "
+		     "unsupported optional features (%x).\n", i);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	get_random_bytes(&osb->s_next_generation, sizeof(u32));
+
+	/* FIXME
+	 * This should be done in ocfs2_journal_init(), but unknown
+	 * ordering issues will cause the filesystem to crash.
+	 * If anyone wants to figure out what part of the code
+	 * refers to osb->journal before ocfs2_journal_init() is run,
+	 * be my guest.
+	 */
+	/* initialize our journal structure */
+
+	journal = kcalloc(1, sizeof(struct ocfs2_journal), GFP_KERNEL);
+	if (!journal) {
+		mlog(ML_ERROR, "unable to alloc journal\n");
+		status = -ENOMEM;
+		goto bail;
+	}
+	osb->journal = journal;
+	journal->j_osb = osb;
+
+	atomic_set(&journal->j_num_trans, 0);
+	init_rwsem(&journal->j_trans_barrier);
+	init_waitqueue_head(&journal->j_checkpointed);
+	spin_lock_init(&journal->j_lock);
+	journal->j_trans_id = (unsigned long) 1;
+	INIT_LIST_HEAD(&journal->j_la_cleanups);
+	INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery, osb);
+	journal->j_state = OCFS2_JOURNAL_FREE;
+
+	/* get some pseudo constants for clustersize bits */
+	osb->s_clustersize_bits =
+		le32_to_cpu(di->id2.i_super.s_clustersize_bits);
+	osb->s_clustersize = 1 << osb->s_clustersize_bits;
+	mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits);
+
+	if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE ||
+	    osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) {
+		mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n",
+		     osb->s_clustersize);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1)
+	    > (u32)~0UL) {
+		mlog(ML_ERROR, "Volume might try to write to blocks beyond "
+		     "what jbd can address in 32 bits.\n");
+		status = -EINVAL;
+		goto bail;
+	}
+
+	if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid,
+				 sizeof(di->id2.i_super.s_uuid))) {
+		mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n");
+		status = -ENOMEM;
+		goto bail;
+	}
+
+	memcpy(&uuid_net_key, osb->uuid, sizeof(uuid_net_key));
+	osb->net_key = le32_to_cpu(uuid_net_key);
+
+	strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
+	osb->vol_label[63] = '\0';
+	osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
+	osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
+	osb->first_cluster_group_blkno =
+		le64_to_cpu(di->id2.i_super.s_first_cluster_group);
+	osb->fs_generation = le32_to_cpu(di->i_fs_generation);
+	mlog(0, "vol_label: %s\n", osb->vol_label);
+	mlog(0, "uuid: %s\n", osb->uuid_str);
+	mlog(0, "root_blkno=%"MLFu64", system_dir_blkno=%"MLFu64"\n",
+	     osb->root_blkno, osb->system_dir_blkno);
+
+	osb->osb_dlm_debug = ocfs2_new_dlm_debug();
+	if (!osb->osb_dlm_debug) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	atomic_set(&osb->vol_state, VOLUME_INIT);
+
+	/* load root, system_dir, and all global system inodes */
+	status = ocfs2_init_global_system_inodes(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/*
+	 * global bitmap
+	 */
+	inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
+					    OCFS2_INVALID_SLOT);
+	if (!inode) {
+		status = -EINVAL;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
+
+	status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
+				  inode);
+	iput(inode);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	di = (struct ocfs2_dinode *) bitmap_bh->b_data;
+	osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
+	osb->num_clusters = le32_to_cpu(di->id1.bitmap1.i_total);
+	brelse(bitmap_bh);
+	mlog(0, "cluster bitmap inode: %"MLFu64", clusters per group: %u\n",
+	     osb->bitmap_blkno, osb->bitmap_cpg);
+
+	status = ocfs2_init_slot_info(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* Hand out the next volume id; the osb_id counter lives outside this
+	 * function, so it is read and bumped under ocfs2_globals_lock. */
+	spin_lock(&ocfs2_globals_lock);
+	osb->osb_id = osb_id;
+	if (osb_id < OCFS2_MAX_OSB_ID)
+		osb_id++;
+	else {
+		mlog(ML_ERROR, "Too many volumes mounted\n");
+		status = -ENOMEM;
+	}
+	spin_unlock(&ocfs2_globals_lock);
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * will return: -EAGAIN if no superblock signature was found (keep searching)
+ *              -EINVAL if the signature matches but a field check fails
+ *              0 on success
+ */
+static int ocfs2_verify_volume(struct ocfs2_dinode *di,
+			       struct buffer_head *bh,
+			       u32 blksz)
+{
+	int status = -EAGAIN;
+
+	mlog_entry_void();
+
+	if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
+		   strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
+		status = -EINVAL;
+		if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
+			mlog(ML_ERROR, "found superblock with incorrect block "
+			     "size: found %u, should be %u\n",
+			     1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits),
+			     blksz);
+		} else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) !=
+			   OCFS2_MAJOR_REV_LEVEL ||
+			   le16_to_cpu(di->id2.i_super.s_minor_rev_level) !=
+			   OCFS2_MINOR_REV_LEVEL) {
+			mlog(ML_ERROR, "found superblock with bad version: "
+			     "found %u.%u, should be %u.%u\n",
+			     le16_to_cpu(di->id2.i_super.s_major_rev_level),
+			     le16_to_cpu(di->id2.i_super.s_minor_rev_level),
+			     OCFS2_MAJOR_REV_LEVEL, OCFS2_MINOR_REV_LEVEL);
+		} else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) {
+			mlog(ML_ERROR, "bad block number on superblock: "
+			     "found %"MLFu64", should be %llu\n",
+			     le64_to_cpu(di->i_blkno),
+			     (unsigned long long)bh->b_blocknr);
+		} else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 ||
+			    le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) {
+			mlog(ML_ERROR, "bad cluster size found: %u\n",
+			     1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits));
+		} else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) {
+			mlog(ML_ERROR, "bad root_blkno: 0\n");
+		} else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) {
+			mlog(ML_ERROR, "bad system_dir_blkno: 0\n");
+		} else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) {
+			mlog(ML_ERROR,
+			     "Superblock slots found greater than file system "
+			     "maximum: found %u, max %u\n",
+			     le16_to_cpu(di->id2.i_super.s_max_slots),
+			     OCFS2_MAX_SLOTS);
+		} else {
+			/* found it! */
+			status = 0;
+		}
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_check_volume(struct ocfs2_super *osb)
+{
+	int status = 0;
+	int dirty;
+	struct ocfs2_dinode *local_alloc = NULL; /* for self-recovery only */
+
+	mlog_entry_void();
+
+	/* Init our journal object. */
+	status = ocfs2_journal_init(osb->journal, &dirty);
+	if (status < 0) {
+		mlog(ML_ERROR, "Could not initialize journal!\n");
+		goto finally;
+	}
+
+	/* If the journal was unmounted cleanly then we don't want to
+	 * recover anything. Otherwise, journal_load will do that
+	 * dirty work for us :) */
+	if (!dirty) {
+		status = ocfs2_journal_wipe(osb->journal, 0);
+		if (status < 0) {
+			mlog_errno(status);
+			goto finally;
+		}
+	} else {
+		mlog(ML_NOTICE, "File system was not unmounted cleanly, "
+		     "recovering volume.\n");
+	}
+
+	/* play back anything left in the journal; a failure is fatal here. */
+	status = ocfs2_journal_load(osb->journal);
+	if (status < 0) {
+		mlog_errno(status);
+		goto finally;
+	}
+
+	if (dirty) {
+		/* recover my local alloc if we didn't unmount cleanly. */
+		status = ocfs2_begin_local_alloc_recovery(osb,
+							  osb->slot_num,
+							  &local_alloc);
+		if (status < 0) {
+			mlog_errno(status);
+			goto finally;
+		}
+		/* recovery completes after we've marked ourselves mounted. */
+	}
+
+	mlog(0, "Journal loaded.\n");
+
+	status = ocfs2_load_local_alloc(osb);
+	if (status < 0) {
+		mlog_errno(status);
+		goto finally;
+	}
+
+	if (dirty) {
+		/* Recovery will be completed after we've mounted the
+		 * rest of the volume. */
+		osb->dirty = 1;
+		osb->local_alloc_copy = local_alloc;
+		local_alloc = NULL;
+	}
+
+	/* go through each journal, trylock it and if you get the
+	 * lock, and it's marked as dirty, set the bit in the recover
+	 * map and launch a recovery thread for it. */
+	status = ocfs2_mark_dead_nodes(osb);
+	if (status < 0)
+		mlog_errno(status);
+
+finally:
+	kfree(local_alloc);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * The routine gets called from dismount or close whenever a dismount on
+ * volume is requested and the osb open count becomes 1.
+ * It frees the slot info, journal, local alloc copy and uuid string, drops
+ * the dlm debug reference and zeroes the osb; the osb itself is not freed.
+ */
+static void ocfs2_delete_osb(struct ocfs2_super *osb)
+{
+	mlog_entry_void();
+
+	/* This function assumes that the caller has the main osb resource */
+
+	if (osb->slot_info)
+		ocfs2_free_slot_info(osb->slot_info);
+
+	/* FIXME
+	 * This belongs in journal shutdown, but because osb->journal
+	 * has to be allocated early, in ocfs2_initialize_super(),
+	 * we free it here.
+	 */
+	kfree(osb->journal);
+	if (osb->local_alloc_copy)
+		kfree(osb->local_alloc_copy);
+	kfree(osb->uuid_str);
+	ocfs2_put_dlm_debug(osb->osb_dlm_debug);
+	memset(osb, 0, sizeof(struct ocfs2_super));
+
+	mlog_exit_void();
+}
+
+/* Put OCFS2 into a readonly state, or (if the user specifies it),
+ * panic(). We do not support continue-on-error operation. */
+static void ocfs2_handle_error(struct super_block *sb)
+{
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
+		panic("OCFS2: (device %s): panic forced after error\n",
+		      sb->s_id);
+
+	ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
+
+	if (sb->s_flags & MS_RDONLY &&
+	    (ocfs2_is_soft_readonly(osb) ||
+	     ocfs2_is_hard_readonly(osb)))
+		return;
+
+	printk(KERN_CRIT "File system is now read-only due to the potential "
+	       "of on-disk corruption. Please run fsck.ocfs2 once the file "
+	       "system is unmounted.\n");
+	sb->s_flags |= MS_RDONLY;
+	ocfs2_set_ro_flag(osb, 0);
+}
+
+static char error_buf[1024];
+
+void __ocfs2_error(struct super_block *sb,
+		   const char *function,
+		   const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vsnprintf(error_buf, sizeof(error_buf), fmt, args);
+	va_end(args);
+
+	/* Not using mlog here because we want to show the actual
+	 * function the error came from. Output is capped at error_buf. */
+	printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n",
+	       sb->s_id, function, error_buf);
+
+	ocfs2_handle_error(sb);
+}
+
+/* Handle critical errors. This is intentionally more drastic than
+ * ocfs2_handle_error, so we only use for things like journal errors,
+ * etc. The formatted message is truncated to fit error_buf. */
+void __ocfs2_abort(struct super_block* sb,
+		   const char *function,
+		   const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vsnprintf(error_buf, sizeof(error_buf), fmt, args);
+	va_end(args);
+
+	printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n",
+	       sb->s_id, function, error_buf);
+
+	/* We don't have the cluster support yet to go straight to
+	 * hard readonly in here. Until then, we want to keep
+	 * ocfs2_abort() so that we can at least mark critical
+	 * errors.
+	 *
+	 * TODO: This should abort the journal and alert other nodes
+	 * that our slot needs recovery. */
+
+	/* Force a panic(). This stinks, but it's better than letting
+	 * things continue without having a proper hard readonly
+	 * here. */
+	OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
+	ocfs2_handle_error(sb);
+}
+
+module_init(ocfs2_init);
+module_exit(ocfs2_exit);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
new file mode 100644
index 0000000..c564177
--- /dev/null
+++ b/fs/ocfs2/super.h
@@ -0,0 +1,44 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * super.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_SUPER_H
+#define OCFS2_SUPER_H
+
+extern struct workqueue_struct *ocfs2_wq;
+
+int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
+				  int node_num);
+
+void __ocfs2_error(struct super_block *sb,
+		   const char *function,
+		   const char *fmt, ...);
+#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
+
+void __ocfs2_abort(struct super_block *sb,
+		   const char *function,
+		   const char *fmt, ...);
+#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
+
+#endif /* OCFS2_SUPER_H */
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
new file mode 100644
index 0000000..f6986bd
--- /dev/null
+++ b/fs/ocfs2/symlink.c
@@ -0,0 +1,180 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ *  linux/cluster/ssi/cfs/symlink.c
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License as
+ *	published by the Free Software Foundation; either version 2 of
+ *	the License, or (at your option) any later version.
+ *
+ *	This program is distributed in the hope that it will be useful,
+ *	but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *	MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE
+ *	or NON INFRINGEMENT.  See the GNU General Public License for more
+ *	details.
+ *
+ * 	You should have received a copy of the GNU General Public License
+ * 	along with this program; if not, write to the Free Software
+ * 	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ *	Questions/Comments/Bugfixes to ssic-linux-devel@lists.sourceforge.net
+ *
+ *  Copyright (C) 1992  Rick Sladkey
+ *
+ *  Optimization changes Copyright (C) 1994 Florian La Roche
+ *
+ *  Jun 7 1999, cache symlink lookups in the page cache.  -DaveM
+ *
+ *  Portions Copyright (C) 2001 Compaq Computer Corporation
+ *
+ *  ocfs2 symlink handling code.
+ *
+ *  Copyright (C) 2004, 2005 Oracle.
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/utsname.h>
+
+#define MLOG_MASK_PREFIX ML_NAMEI
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "symlink.h"
+
+#include "buffer_head_io.h"
+
+static char *ocfs2_page_getlink(struct dentry * dentry,
+				struct page **ppage);
+static char *ocfs2_fast_symlink_getlink(struct inode *inode,
+					struct buffer_head **bh);
+
+/* get the link contents into pagecache */
+static char *ocfs2_page_getlink(struct dentry * dentry,
+				struct page **ppage)
+{
+	struct page * page;
+	struct address_space *mapping = dentry->d_inode->i_mapping;
+	page = read_cache_page(mapping, 0,
+			       (filler_t *)mapping->a_ops->readpage, NULL);
+	if (IS_ERR(page))
+		goto sync_fail;
+	wait_on_page_locked(page);
+	if (!PageUptodate(page))
+		goto async_fail;
+	*ppage = page;
+	return kmap(page);
+
+async_fail:
+	page_cache_release(page);
+	return ERR_PTR(-EIO);
+
+sync_fail:
+	return (char*)page;
+}
+
+static char *ocfs2_fast_symlink_getlink(struct inode *inode,
+					struct buffer_head **bh)
+{
+	int status;
+	char *link = NULL;
+	struct ocfs2_dinode *fe;
+
+	mlog_entry_void();
+
+	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				  OCFS2_I(inode)->ip_blkno,
+				  bh,
+				  OCFS2_BH_CACHED,
+				  inode);
+	if (status < 0) {
+		mlog_errno(status);
+		link = ERR_PTR(status);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) (*bh)->b_data;
+	link = (char *) fe->id2.i_symlink;
+bail:
+	mlog_exit(status);
+
+	return link;
+}
+
+static int ocfs2_readlink(struct dentry *dentry,
+			  char __user *buffer,
+			  int buflen)
+{
+	int ret;
+	char *link;
+	struct buffer_head *bh = NULL;
+	struct inode *inode = dentry->d_inode;
+
+	mlog_entry_void();
+
+	link = ocfs2_fast_symlink_getlink(inode, &bh);
+	if (IS_ERR(link)) {
+		ret = PTR_ERR(link);
+		goto out;
+	}
+
+	ret = vfs_readlink(dentry, buffer, buflen, link);
+
+	brelse(bh);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+static void *ocfs2_follow_link(struct dentry *dentry,
+			       struct nameidata *nd)
+{
+	int status;
+	char *link;
+	struct inode *inode = dentry->d_inode;
+	struct page *page = NULL;
+	struct buffer_head *bh = NULL;
+	
+	if (ocfs2_inode_is_fast_symlink(inode))
+		link = ocfs2_fast_symlink_getlink(inode, &bh);
+	else
+		link = ocfs2_page_getlink(dentry, &page);
+	if (IS_ERR(link)) {
+		status = PTR_ERR(link);
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = vfs_follow_link(nd, link);
+	if (status)
+		mlog_errno(status);
+bail:
+	if (page) {
+		kunmap(page);
+		page_cache_release(page);
+	}
+	if (bh)
+		brelse(bh);
+
+	return ERR_PTR(status);
+}
+
+struct inode_operations ocfs2_symlink_inode_operations = {
+	.readlink	= page_readlink,
+	.follow_link	= ocfs2_follow_link,
+	.getattr	= ocfs2_getattr,
+};
+struct inode_operations ocfs2_fast_symlink_inode_operations = {
+	.readlink	= ocfs2_readlink,
+	.follow_link	= ocfs2_follow_link,
+	.getattr	= ocfs2_getattr,
+};
diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h
new file mode 100644
index 0000000..1ea9e4d
--- /dev/null
+++ b/fs/ocfs2/symlink.h
@@ -0,0 +1,42 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * symlink.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_SYMLINK_H
+#define OCFS2_SYMLINK_H
+
+extern struct inode_operations ocfs2_symlink_inode_operations;
+extern struct inode_operations ocfs2_fast_symlink_inode_operations;
+
+/*
+ * Test whether an inode is a fast symlink.
+ */
+static inline int ocfs2_inode_is_fast_symlink(struct inode *inode)
+{
+	return (S_ISLNK(inode->i_mode) &&
+		inode->i_blocks == 0);
+}
+
+
+#endif /* OCFS2_SYMLINK_H */
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
new file mode 100644
index 0000000..600a8bc
--- /dev/null
+++ b/fs/ocfs2/sysfile.c
@@ -0,0 +1,131 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * sysfile.c
+ *
+ * Initialize, read, write, etc. system files.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#include "ocfs2.h"
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "alloc.h"
+#include "dir.h"
+#include "inode.h"
+#include "journal.h"
+#include "sysfile.h"
+
+#include "buffer_head_io.h"
+
+static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
+						   int type,
+						   u32 slot);
+
+static inline int is_global_system_inode(int type);
+static inline int is_in_system_inode_array(struct ocfs2_super *osb,
+					   int type,
+					   u32 slot);
+
+static inline int is_global_system_inode(int type)
+{
+	return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE &&
+		type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
+}
+
+static inline int is_in_system_inode_array(struct ocfs2_super *osb,
+					   int type,
+					   u32 slot)
+{
+	return slot == osb->slot_num || is_global_system_inode(type);
+}
+
+struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
+					  int type,
+					  u32 slot)
+{
+	struct inode *inode = NULL;
+	struct inode **arr = NULL;
+
+	/* avoid the lookup if cached in local system file array */
+	if (is_in_system_inode_array(osb, type, slot))
+		arr = &(osb->system_inodes[type]);
+
+	if (arr && ((inode = *arr) != NULL)) {
+		/* get a ref in addition to the array ref */
+		inode = igrab(inode);
+		if (!inode)
+			BUG();
+
+		return inode;
+	}
+
+	/* this gets one ref thru iget */
+	inode = _ocfs2_get_system_file_inode(osb, type, slot);
+
+	/* add one more if putting into array for first time */
+	if (arr && inode) {
+		*arr = igrab(inode);
+		if (!*arr)
+			BUG();
+	}
+	return inode;
+}
+
+static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
+						   int type,
+						   u32 slot)
+{
+	char namebuf[40];
+	struct inode *inode = NULL;
+	u64 blkno;
+	struct buffer_head *dirent_bh = NULL;
+	struct ocfs2_dir_entry *de = NULL;
+	int status = 0;
+
+	ocfs2_sprintf_system_inode_name(namebuf,
+					sizeof(namebuf),
+					type, slot);
+
+	status = ocfs2_find_files_on_disk(namebuf, strlen(namebuf),
+					  &blkno, osb->sys_root_inode,
+					  &dirent_bh, &de);
+	if (status < 0) {
+		goto bail;
+	}
+
+	inode = ocfs2_iget(osb, blkno);
+	if (IS_ERR(inode)) {
+		mlog_errno(PTR_ERR(inode));
+		inode = NULL;
+		goto bail;
+	}
+bail:
+	if (dirent_bh)
+		brelse(dirent_bh);
+	return inode;
+}
+
diff --git a/fs/ocfs2/sysfile.h b/fs/ocfs2/sysfile.h
new file mode 100644
index 0000000..cc9ea66
--- /dev/null
+++ b/fs/ocfs2/sysfile.h
@@ -0,0 +1,33 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * sysfile.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_SYSFILE_H
+#define OCFS2_SYSFILE_H
+
+struct inode * ocfs2_get_system_file_inode(struct ocfs2_super *osb,
+					   int type,
+					   u32 slot);
+
+#endif /* OCFS2_SYSFILE_H */
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
new file mode 100644
index 0000000..3a0458f
--- /dev/null
+++ b/fs/ocfs2/uptodate.c
@@ -0,0 +1,544 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * uptodate.c
+ *
+ * Tracking the up-to-date-ness of a local buffer_head with respect to
+ * the cluster.
+ *
+ * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Standard buffer head caching flags (uptodate, etc) are insufficient
+ * in a clustered environment - a buffer may be marked up to date on
+ * our local node but could have been modified by another cluster
+ * member. As a result an additional (and performant) caching scheme
+ * is required. A further requirement is that we consume as little
+ * memory as possible - we never pin buffer_head structures in order
+ * to cache them.
+ *
+ * We track the existence of up to date buffers on the inodes which
+ * are associated with them. Because we don't want to pin
+ * buffer_heads, this is only a (strong) hint and several other checks
+ * are made in the I/O path to ensure that we don't use a stale or
+ * invalid buffer without going to disk:
+ *	- buffer_jbd is used liberally - if a bh is in the journal on
+ *	  this node then it *must* be up to date.
+ *	- the standard buffer_uptodate() macro is used to detect buffers
+ *	  which may be invalid (even if we have an up to date tracking
+ * 	  item for them)
+ *
+ * For a full understanding of how this code works together, one
+ * should read the callers in dlmglue.c, the I/O functions in
+ * buffer_head_io.c and ocfs2_journal_access in journal.c
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/buffer_head.h>
+#include <linux/rbtree.h>
+#include <linux/jbd.h>
+
+#define MLOG_MASK_PREFIX ML_UPTODATE
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "inode.h"
+#include "uptodate.h"
+
+struct ocfs2_meta_cache_item {
+	struct rb_node	c_node;
+	sector_t	c_block;
+};
+
+static kmem_cache_t *ocfs2_uptodate_cachep = NULL;
+
+void ocfs2_metadata_cache_init(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+
+	oi->ip_flags |= OCFS2_INODE_CACHE_INLINE;
+	ci->ci_num_cached = 0;
+}
+
+/* No lock taken here as 'root' is not expected to be visible to other
+ * processes. */
+static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
+{
+	unsigned int purged = 0;
+	struct rb_node *node;
+	struct ocfs2_meta_cache_item *item;
+
+	while ((node = rb_last(root)) != NULL) {
+		item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);
+
+		mlog(0, "Purge item %llu\n",
+		     (unsigned long long) item->c_block);
+
+		rb_erase(&item->c_node, root);
+		kmem_cache_free(ocfs2_uptodate_cachep, item);
+
+		purged++;
+	}
+	return purged;
+}
+
+/* Called from locking and called from ocfs2_clear_inode. Dump the
+ * cache for a given inode.
+ *
+ * This function is a few more lines longer than necessary due to some
+ * accounting done here, but I think it's worth tracking down those
+ * bugs sooner -- Mark */
+void ocfs2_metadata_cache_purge(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	unsigned int tree, to_purge, purged;
+	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+	struct rb_root root = RB_ROOT;
+
+	spin_lock(&oi->ip_lock);
+	tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
+	to_purge = ci->ci_num_cached;
+
+	mlog(0, "Purge %u %s items from Inode %"MLFu64"\n", to_purge,
+	     tree ? "tree" : "array", oi->ip_blkno);
+
+	/* If we're a tree, save off the root so that we can safely
+	 * initialize the cache. We do the work to free tree members
+	 * without the spinlock. */
+	if (tree)
+		root = ci->ci_cache.ci_tree;
+
+	ocfs2_metadata_cache_init(inode);
+	spin_unlock(&oi->ip_lock);
+
+	purged = ocfs2_purge_copied_metadata_tree(&root);
+	/* If possible, track the number wiped so that we can more
+	 * easily detect counting errors. Unfortunately, this is only
+	 * meaningful for trees. */
+	if (tree && purged != to_purge)
+		mlog(ML_ERROR, "Inode %"MLFu64", count = %u, purged = %u\n",
+		     oi->ip_blkno, to_purge, purged);
+}
+
+/* Returns the index in the cache array, -1 if not found.
+ * Requires ip_lock. */
+static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci,
+				    sector_t item)
+{
+	int i;
+
+	for (i = 0; i < ci->ci_num_cached; i++) {
+		if (item == ci->ci_cache.ci_array[i])
+			return i;
+	}
+
+	return -1;
+}
+
+/* Returns the cache item if found, otherwise NULL.
+ * Requires ip_lock. */
+static struct ocfs2_meta_cache_item *
+ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
+			sector_t block)
+{
+	struct rb_node * n = ci->ci_cache.ci_tree.rb_node;
+	struct ocfs2_meta_cache_item *item = NULL;
+
+	while (n) {
+		item = rb_entry(n, struct ocfs2_meta_cache_item, c_node);
+
+		if (block < item->c_block)
+			n = n->rb_left;
+		else if (block > item->c_block)
+			n = n->rb_right;
+		else
+			return item;
+	}
+
+	return NULL;
+}
+
+static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
+			       struct buffer_head *bh)
+{
+	int index = -1;
+	struct ocfs2_meta_cache_item *item = NULL;
+
+	spin_lock(&oi->ip_lock);
+
+	mlog(0, "Inode %"MLFu64", query block %llu (inline = %u)\n",
+	     oi->ip_blkno, (unsigned long long) bh->b_blocknr,
+	     !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE));
+
+	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE)
+		index = ocfs2_search_cache_array(&oi->ip_metadata_cache,
+						 bh->b_blocknr);
+	else
+		item = ocfs2_search_cache_tree(&oi->ip_metadata_cache,
+					       bh->b_blocknr);
+
+	spin_unlock(&oi->ip_lock);
+
+	mlog(0, "index = %d, item = %p\n", index, item);
+
+	return (index != -1) || (item != NULL);
+}
+
+/* Warning: even if it returns true, this does *not* guarantee that
+ * the block is stored in our inode metadata cache. */
+int ocfs2_buffer_uptodate(struct inode *inode,
+			  struct buffer_head *bh)
+{
+	/* Doesn't matter if the bh is in our cache or not -- if it's
+	 * not marked uptodate then we know it can't have correct
+	 * data. */
+	if (!buffer_uptodate(bh))
+		return 0;
+
+	/* OCFS2 does not allow multiple nodes to be changing the same
+	 * block at the same time. */
+	if (buffer_jbd(bh))
+		return 1;
+
+	/* Ok, locally the buffer is marked as up to date, now search
+	 * our cache to see if we can trust that. */
+	return ocfs2_buffer_cached(OCFS2_I(inode), bh);
+}
+
+/* Requires ip_lock */
+static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
+				     sector_t block)
+{
+	BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY);
+
+	mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
+	     ci->ci_num_cached);
+
+	ci->ci_cache.ci_array[ci->ci_num_cached] = block;
+	ci->ci_num_cached++;
+}
+
+/* By now the caller should have checked that the item does *not*
+ * exist in the tree.
+ * Requires ip_lock. */
+static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
+				      struct ocfs2_meta_cache_item *new)
+{
+	sector_t block = new->c_block;
+	struct rb_node *parent = NULL;
+	struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
+	struct ocfs2_meta_cache_item *tmp;
+
+	mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block,
+	     ci->ci_num_cached);
+
+	while(*p) {
+		parent = *p;
+
+		tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node);
+
+		if (block < tmp->c_block)
+			p = &(*p)->rb_left;
+		else if (block > tmp->c_block)
+			p = &(*p)->rb_right;
+		else {
+			/* This should never happen! */
+			mlog(ML_ERROR, "Duplicate block %llu cached!\n",
+			     (unsigned long long) block);
+			BUG();
+		}
+	}
+
+	rb_link_node(&new->c_node, parent, p);
+	rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree);
+	ci->ci_num_cached++;
+}
+
+static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi,
+					     struct ocfs2_caching_info *ci)
+{
+	assert_spin_locked(&oi->ip_lock);
+
+	return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) &&
+		(ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY);
+}
+
+/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the
+ * pointers in tree after we use them - this allows caller to detect
+ * when to free in case of error. */
+static void ocfs2_expand_cache(struct ocfs2_inode_info *oi,
+			       struct ocfs2_meta_cache_item **tree)
+{
+	int i;
+	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+
+	mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY,
+			"Inode %"MLFu64", num cached = %u, should be %u\n",
+			oi->ip_blkno, ci->ci_num_cached,
+			OCFS2_INODE_MAX_CACHE_ARRAY);
+	mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
+			"Inode %"MLFu64" not marked as inline anymore!\n",
+			oi->ip_blkno);
+	assert_spin_locked(&oi->ip_lock);
+
+	/* Be careful to initialize the tree members *first* because
+	 * once the ci_tree is used, the array is junk... */
+	for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
+		tree[i]->c_block = ci->ci_cache.ci_array[i];
+
+	oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE;
+	ci->ci_cache.ci_tree = RB_ROOT;
+	/* this will be set again by __ocfs2_insert_cache_tree */
+	ci->ci_num_cached = 0;
+
+	for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
+		__ocfs2_insert_cache_tree(ci, tree[i]);
+		tree[i] = NULL;
+	}
+
+	mlog(0, "Expanded %"MLFu64" to a tree cache: flags 0x%x, num = %u\n",
+	     oi->ip_blkno, oi->ip_flags, ci->ci_num_cached);
+}
+
+/* Slow path function - memory allocation is necessary. See the
+ * comment above ocfs2_set_buffer_uptodate for more information. */
+static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
+					sector_t block,
+					int expand_tree)
+{
+	int i;
+	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+	struct ocfs2_meta_cache_item *new = NULL;
+	struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] =
+		{ NULL, };
+
+	mlog(0, "Inode %"MLFu64", block %llu, expand = %d\n",
+	     oi->ip_blkno, (unsigned long long) block, expand_tree);
+
+	new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_KERNEL);
+	if (!new) {
+		mlog_errno(-ENOMEM);
+		return;
+	}
+	new->c_block = block;
+
+	if (expand_tree) {
+		/* Do *not* allocate an array here - the removal code
+		 * has no way of tracking that. */
+		for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
+			tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
+						   GFP_KERNEL);
+			if (!tree[i]) {
+				mlog_errno(-ENOMEM);
+				goto out_free;
+			}
+
+			/* These are initialized in ocfs2_expand_cache! */
+		}
+	}
+
+	spin_lock(&oi->ip_lock);
+	if (ocfs2_insert_can_use_array(oi, ci)) {
+		mlog(0, "Someone cleared the tree underneath us\n");
+		/* Ok, items were removed from the cache in between
+		 * locks. Detect this and revert back to the fast path */
+		ocfs2_append_cache_array(ci, block);
+		spin_unlock(&oi->ip_lock);
+		goto out_free;
+	}
+
+	if (expand_tree)
+		ocfs2_expand_cache(oi, tree);
+
+	__ocfs2_insert_cache_tree(ci, new);
+	spin_unlock(&oi->ip_lock);
+
+	new = NULL;
+out_free:
+	if (new)
+		kmem_cache_free(ocfs2_uptodate_cachep, new);
+
+	/* If these were used, then ocfs2_expand_cache re-set them to
+	 * NULL for us. */
+	if (tree[0]) {
+		for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
+			if (tree[i])
+				kmem_cache_free(ocfs2_uptodate_cachep,
+						tree[i]);
+	}
+}
+
+/* Item insertion is guarded by ip_io_sem, so the insertion path takes
+ * advantage of this by not rechecking for a duplicate insert during
+ * the slow case. Additionally, if the cache needs to be bumped up to
+ * a tree, the code will not recheck after acquiring the lock --
+ * multiple paths cannot be expanding to a tree at the same time.
+ *
+ * The slow path takes into account that items can be removed
+ * (including the whole tree wiped and reset) when this process is out
+ * allocating memory. In those cases, it reverts back to the fast
+ * path.
+ *
+ * Note that this function may actually fail to insert the block if
+ * memory cannot be allocated. This is not fatal however (but may
+ * result in a performance penalty) */
+void ocfs2_set_buffer_uptodate(struct inode *inode,
+			       struct buffer_head *bh)
+{
+	int expand;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+
+	/* The block may very well exist in our cache already, so avoid
+	 * doing any more work in that case. */
+	if (ocfs2_buffer_cached(oi, bh))
+		return;
+
+	mlog(0, "Inode %"MLFu64", inserting block %llu\n", oi->ip_blkno,
+	     (unsigned long long) bh->b_blocknr);
+
+	/* No need to recheck under spinlock - insertion is guarded by
+	 * ip_io_sem */
+	spin_lock(&oi->ip_lock);
+	if (ocfs2_insert_can_use_array(oi, ci)) {
+		/* Fast case - it's an array and there's a free
+		 * spot. */
+		ocfs2_append_cache_array(ci, bh->b_blocknr);
+		spin_unlock(&oi->ip_lock);
+		return;
+	}
+
+	expand = 0;
+	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
+		/* We need to bump things up to a tree. */
+		expand = 1;
+	}
+	spin_unlock(&oi->ip_lock);
+
+	__ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand);
+}
+
+/* Called against a newly allocated buffer. Most likely nobody should
+ * be able to read this sort of metadata while it's still being
+ * allocated, but this is careful to take ip_io_sem anyway. */
+void ocfs2_set_new_buffer_uptodate(struct inode *inode,
+				   struct buffer_head *bh)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	/* This should definitely *not* exist in our cache */
+	BUG_ON(ocfs2_buffer_cached(oi, bh));
+
+	set_buffer_uptodate(bh);
+
+	down(&oi->ip_io_sem);
+	ocfs2_set_buffer_uptodate(inode, bh);
+	up(&oi->ip_io_sem);
+}
+
+/* Requires ip_lock. */
+static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
+					int index)
+{
+	sector_t *array = ci->ci_cache.ci_array;
+	int bytes;
+
+	BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY);
+	BUG_ON(index >= ci->ci_num_cached);
+	BUG_ON(!ci->ci_num_cached);
+
+	mlog(0, "remove index %d (num_cached = %u)\n", index,
+	     ci->ci_num_cached);
+
+	ci->ci_num_cached--;
+
+	/* don't need to copy if the array is now empty, or if we
+	 * removed at the tail */
+	if (ci->ci_num_cached && index < ci->ci_num_cached) {
+		bytes = sizeof(sector_t) * (ci->ci_num_cached - index);
+		memmove(&array[index], &array[index + 1], bytes);
+	}
+}
+
+/* Requires ip_lock. */
+static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
+				       struct ocfs2_meta_cache_item *item)
+{
+	mlog(0, "remove block %llu from tree\n",
+	     (unsigned long long) item->c_block);
+
+	rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
+	ci->ci_num_cached--;
+}
+
+/* Called when we remove a chunk of metadata from an inode. We don't
+ * bother reverting things to an inlined array in the case of a remove
+ * which moves us back under the limit. */
+void ocfs2_remove_from_cache(struct inode *inode,
+			     struct buffer_head *bh)
+{
+	int index;
+	sector_t block = bh->b_blocknr;
+	struct ocfs2_meta_cache_item *item = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+
+	spin_lock(&oi->ip_lock);
+	mlog(0, "Inode %"MLFu64", remove %llu, items = %u, array = %u\n",
+	     oi->ip_blkno, (unsigned long long) block, ci->ci_num_cached,
+	     oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
+
+	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
+		index = ocfs2_search_cache_array(ci, block);
+		if (index != -1)
+			ocfs2_remove_metadata_array(ci, index);
+	} else {
+		item = ocfs2_search_cache_tree(ci, block);
+		if (item)
+			ocfs2_remove_metadata_tree(ci, item);
+	}
+	spin_unlock(&oi->ip_lock);
+
+	if (item)
+		kmem_cache_free(ocfs2_uptodate_cachep, item);
+}
+
+int __init init_ocfs2_uptodate_cache(void)
+{
+	ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
+				  sizeof(struct ocfs2_meta_cache_item),
+				  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!ocfs2_uptodate_cachep)
+		return -ENOMEM;
+
+	mlog(0, "%u inlined cache items per inode.\n",
+	     OCFS2_INODE_MAX_CACHE_ARRAY);
+
+	return 0;
+}
+
+void __exit exit_ocfs2_uptodate_cache(void)
+{
+	if (ocfs2_uptodate_cachep)
+		kmem_cache_destroy(ocfs2_uptodate_cachep);
+}
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
new file mode 100644
index 0000000..e5aacdf
--- /dev/null
+++ b/fs/ocfs2/uptodate.h
@@ -0,0 +1,44 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * uptodate.h
+ *
+ * Cluster uptodate tracking
+ *
+ * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_UPTODATE_H
+#define OCFS2_UPTODATE_H
+
+int __init init_ocfs2_uptodate_cache(void);
+void __exit exit_ocfs2_uptodate_cache(void);
+
+void ocfs2_metadata_cache_init(struct inode *inode);
+void ocfs2_metadata_cache_purge(struct inode *inode);
+
+int ocfs2_buffer_uptodate(struct inode *inode,
+			  struct buffer_head *bh);
+void ocfs2_set_buffer_uptodate(struct inode *inode,
+			       struct buffer_head *bh);
+void ocfs2_set_new_buffer_uptodate(struct inode *inode,
+				   struct buffer_head *bh);
+void ocfs2_remove_from_cache(struct inode *inode,
+			     struct buffer_head *bh);
+
+#endif /* OCFS2_UPTODATE_H */
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
new file mode 100644
index 0000000..5405ce1
--- /dev/null
+++ b/fs/ocfs2/ver.c
@@ -0,0 +1,43 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ver.c
+ *
+ * version string
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+
+#include "ver.h"
+
+#define OCFS2_BUILD_VERSION "1.3.3"
+
+#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
+
+void ocfs2_print_version(void)
+{
+	printk(KERN_INFO "%s\n", VERSION_STR);
+}
+
+MODULE_DESCRIPTION(VERSION_STR);
+
+MODULE_VERSION(OCFS2_BUILD_VERSION);
diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h
new file mode 100644
index 0000000..d7395cb
--- /dev/null
+++ b/fs/ocfs2/ver.h
@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ver.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef OCFS2_VER_H
+#define OCFS2_VER_H
+
+void ocfs2_print_version(void);
+
+#endif /* OCFS2_VER_H */
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
new file mode 100644
index 0000000..021978e
--- /dev/null
+++ b/fs/ocfs2/vote.c
@@ -0,0 +1,1202 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * vote.c
+ *
+ * Inter-node vote messages (delete, unlink, rename, mount, umount).
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/smp_lock.h>
+#include <linux/kthread.h>
+
+#include <cluster/heartbeat.h>
+#include <cluster/nodemanager.h>
+#include <cluster/tcp.h>
+
+#include <dlm/dlmapi.h>
+
+#define MLOG_MASK_PREFIX ML_VOTE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "heartbeat.h"
+#include "inode.h"
+#include "journal.h"
+#include "slot_map.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
+#define OCFS2_MESSAGE_TYPE_VOTE     (0x1)
+#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
+struct ocfs2_msg_hdr
+{
+	__be32 h_response_id; /* used to lookup message handle on sending
+			    * node. */
+	__be32 h_request;
+	__be64 h_blkno;
+	__be32 h_generation;
+	__be32 h_node_num;    /* node sending this particular message. */
+};
+
+/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
+ * for the network. */
+#define OCFS2_VOTE_FILENAME_LEN 256
+struct ocfs2_vote_msg
+{
+	struct ocfs2_msg_hdr v_hdr;
+	union {
+		__be32 v_generic1;
+		__be32 v_orphaned_slot;	/* Used during delete votes */
+		__be32 v_nlink;		/* Used during unlink votes */
+	} md1;				/* Message type dependent 1 */
+	__be32 v_unlink_namelen;
+	__be64 v_unlink_parent;
+	u8  v_unlink_dirent[OCFS2_VOTE_FILENAME_LEN];
+};
+
+/* Responses are given these values to maintain backwards
+ * compatibility with older ocfs2 versions */
+#define OCFS2_RESPONSE_OK		(0)
+#define OCFS2_RESPONSE_BUSY		(-16)
+#define OCFS2_RESPONSE_BAD_MSG		(-22)
+
+struct ocfs2_response_msg
+{
+	struct ocfs2_msg_hdr r_hdr;
+	__be32 r_response;
+	__be32 r_orphaned_slot;
+};
+
+struct ocfs2_vote_work {
+	struct list_head   w_list;
+	struct ocfs2_vote_msg w_msg;
+};
+
+enum ocfs2_vote_request {
+	OCFS2_VOTE_REQ_INVALID = 0,
+	OCFS2_VOTE_REQ_DELETE,
+	OCFS2_VOTE_REQ_UNLINK,
+	OCFS2_VOTE_REQ_RENAME,
+	OCFS2_VOTE_REQ_MOUNT,
+	OCFS2_VOTE_REQ_UMOUNT,
+	OCFS2_VOTE_REQ_LAST
+};
+
+static inline int ocfs2_is_valid_vote_request(int request)
+{
+	return OCFS2_VOTE_REQ_INVALID < request &&
+		request < OCFS2_VOTE_REQ_LAST;
+}
+
+typedef void (*ocfs2_net_response_callback)(void *priv,
+					    struct ocfs2_response_msg *resp);
+struct ocfs2_net_response_cb {
+	ocfs2_net_response_callback	rc_cb;
+	void				*rc_priv;
+};
+
+struct ocfs2_net_wait_ctxt {
+	struct list_head        n_list;
+	u32                     n_response_id;
+	wait_queue_head_t       n_event;
+	struct ocfs2_node_map   n_node_map;
+	int                     n_response; /* an aggregate response. 0 if
+					     * all nodes are go, < 0 on any
+					     * negative response from any
+					     * node or network error. */
+	struct ocfs2_net_response_cb *n_callback;
+};
+
+static void ocfs2_process_mount_request(struct ocfs2_super *osb,
+					unsigned int node_num)
+{
+	mlog(0, "MOUNT vote from node %u\n", node_num);
+	/* The other node only sends us this message when he has an EX
+	 * on the superblock, so our recovery threads (if having been
+	 * launched) are waiting on it.*/
+	ocfs2_recovery_map_clear(osb, node_num);
+	ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num);
+
+	/* We clear the umount map here because a node may have been
+	 * previously mounted, safely unmounted but never stopped
+	 * heartbeating - in which case we'd have a stale entry. */
+	ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
+}
+
+static void ocfs2_process_umount_request(struct ocfs2_super *osb,
+					 unsigned int node_num)
+{
+	mlog(0, "UMOUNT vote from node %u\n", node_num);
+	ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num);
+	ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
+}
+
+void ocfs2_mark_inode_remotely_deleted(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	assert_spin_locked(&oi->ip_lock);
+	/* We set the SKIP_DELETE flag on the inode so we don't try to
+	 * delete it in delete_inode ourselves, thus avoiding
+	 * unnecessary lock pinging. If the other node failed to wipe
+	 * the inode as a result of a crash, then recovery will pick
+	 * up the slack. */
+	oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE;
+}
+
+static int ocfs2_process_delete_request(struct inode *inode,
+					int *orphaned_slot)
+{
+	int response = OCFS2_RESPONSE_BUSY;
+
+	mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n",
+	     inode->i_ino, inode->i_nlink, *orphaned_slot);
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+
+	/* Whatever our vote response is, we want to make sure that
+	 * the orphaned slot is recorded properly on this node *and*
+	 * on the requesting node. Technically, if the requesting node
+	 * did not know which slot the inode is orphaned in but we
+	 * respond with BUSY he doesn't actually need the orphaned
+	 * slot, but it doesn't hurt to do it here anyway. */
+	if ((*orphaned_slot) != OCFS2_INVALID_SLOT) {
+		mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot !=
+				OCFS2_INVALID_SLOT &&
+				OCFS2_I(inode)->ip_orphaned_slot !=
+				(*orphaned_slot),
+				"Inode %"MLFu64": This node thinks it's "
+				"orphaned in slot %d, messaged it's in %d\n",
+				OCFS2_I(inode)->ip_blkno,
+				OCFS2_I(inode)->ip_orphaned_slot,
+				*orphaned_slot);
+
+		mlog(0, "Setting orphaned slot for inode %"MLFu64" to %d\n",
+		     OCFS2_I(inode)->ip_blkno, *orphaned_slot);
+
+		OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot;
+	} else {
+		mlog(0, "Sending back orphaned slot %d for inode %"MLFu64"\n",
+		     OCFS2_I(inode)->ip_orphaned_slot,
+		     OCFS2_I(inode)->ip_blkno);
+
+		*orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+	}
+
+	/* vote no if the file is still open. */
+	if (OCFS2_I(inode)->ip_open_count) {
+		mlog(0, "open count = %u\n",
+		     OCFS2_I(inode)->ip_open_count);
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+		goto done;
+	}
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	/* directories are a bit ugly... What if someone is sitting in
+	 * it? We want to make sure the inode is removed completely as
+	 * a result of the iput in process_vote. */
+	if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
+		mlog(0, "i_count = %u\n", atomic_read(&inode->i_count));
+		goto done;
+	}
+
+	if (filemap_fdatawrite(inode->i_mapping)) {
+		mlog(ML_ERROR, "Could not sync inode %"MLFu64" for delete!\n",
+		     OCFS2_I(inode)->ip_blkno);
+		goto done;
+	}
+	sync_mapping_buffers(inode->i_mapping);
+	truncate_inode_pages(inode->i_mapping, 0);
+	ocfs2_extent_map_trunc(inode, 0);
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	/* double check open count - someone might have raced this
+	 * thread into ocfs2_file_open while we were writing out
+	 * data. If we're to allow a wipe of this inode now, we *must*
+	 * hold the spinlock until we've marked it. */
+	if (OCFS2_I(inode)->ip_open_count) {
+		mlog(0, "Raced to wipe! open count = %u\n",
+		     OCFS2_I(inode)->ip_open_count);
+		spin_unlock(&OCFS2_I(inode)->ip_lock);
+		goto done;
+	}
+
+	/* Mark the inode as being wiped from disk. */
+	ocfs2_mark_inode_remotely_deleted(inode);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	/* Not sure this is necessary anymore. */
+	d_prune_aliases(inode);
+
+	/* If we get here, then we're voting 'yes', so commit the
+	 * delete on our side. */
+	response = OCFS2_RESPONSE_OK;
+done:
+	return response;
+}
+
+static int ocfs2_match_dentry(struct dentry *dentry,
+			      u64 parent_blkno,
+			      unsigned int namelen,
+			      const char *name)
+{
+	struct inode *parent;
+
+	if (!dentry->d_parent) {
+		mlog(0, "Detached from parent.\n");
+		return 0;
+	}
+
+	parent = dentry->d_parent->d_inode;
+	/* Negative parent dentry? */
+	if (!parent)
+		return 0;
+
+	/* Name is in a different directory. */
+	if (OCFS2_I(parent)->ip_blkno != parent_blkno)
+		return 0;
+
+	if (dentry->d_name.len != namelen)
+		return 0;
+
+	/* comparison above guarantees this is safe. */
+	if (memcmp(dentry->d_name.name, name, namelen))
+		return 0;
+
+	return 1;
+}
+
+static void ocfs2_process_dentry_request(struct inode *inode,
+					 int rename,
+					 unsigned int new_nlink,
+					 u64 parent_blkno,
+					 unsigned int namelen,
+					 const char *name)
+{
+	struct dentry *dentry = NULL;
+	struct list_head *p;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	mlog(0, "parent %"MLFu64", namelen = %u, name = %.*s\n", parent_blkno,
+	     namelen, namelen, name);
+
+	spin_lock(&dcache_lock);
+
+	/* Another node is removing this name from the system. It is
+	 * up to us to find the corresponding dentry and if it exists,
+	 * unhash it from the dcache. */
+	list_for_each(p, &inode->i_dentry) {
+		dentry = list_entry(p, struct dentry, d_alias);
+
+		if (ocfs2_match_dentry(dentry, parent_blkno, namelen, name)) {
+			mlog(0, "dentry found: %.*s\n",
+			     dentry->d_name.len, dentry->d_name.name);
+
+			dget_locked(dentry);
+			break;
+		}
+
+		dentry = NULL;
+	}
+
+	spin_unlock(&dcache_lock);
+
+	if (dentry) {
+		d_delete(dentry);
+		dput(dentry);
+	}
+
+	/* rename votes don't send link counts */
+	if (!rename) {
+		mlog(0, "new_nlink = %u\n", new_nlink);
+
+		/* We don't have the proper locks here to directly
+		 * change i_nlink and besides, the vote is sent
+		 * *before* the operation so it may have failed on the
+		 * other node. This passes a hint to ocfs2_drop_inode
+		 * to force ocfs2_delete_inode, who will take the
+		 * proper cluster locks to sort things out. */
+		if (new_nlink == 0) {
+			spin_lock(&oi->ip_lock);
+			oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
+			spin_unlock(&oi->ip_lock);
+		}
+	}
+}
+
+static void ocfs2_process_vote(struct ocfs2_super *osb,
+			       struct ocfs2_vote_msg *msg)
+{
+	int net_status, vote_response;
+	int orphaned_slot = 0;
+	int rename = 0;
+	unsigned int node_num, generation, new_nlink, namelen;
+	u64 blkno, parent_blkno;
+	enum ocfs2_vote_request request;
+	struct inode *inode = NULL;
+	struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
+	struct ocfs2_response_msg response;
+
+	/* decode the network mumbo jumbo into local variables. */
+	request = be32_to_cpu(hdr->h_request);
+	blkno = be64_to_cpu(hdr->h_blkno);
+	generation = be32_to_cpu(hdr->h_generation);
+	node_num = be32_to_cpu(hdr->h_node_num);
+	if (request == OCFS2_VOTE_REQ_DELETE)
+		orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot);
+
+	mlog(0, "processing vote: request = %u, blkno = %"MLFu64", "
+	     "generation = %u, node_num = %u, priv1 = %u\n", request,
+	     blkno, generation, node_num, be32_to_cpu(msg->md1.v_generic1));
+
+	if (!ocfs2_is_valid_vote_request(request)) {
+		mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
+		     request, node_num);
+		vote_response = OCFS2_RESPONSE_BAD_MSG;
+		goto respond;
+	}
+
+	vote_response = OCFS2_RESPONSE_OK;
+
+	switch (request) {
+	case OCFS2_VOTE_REQ_UMOUNT:
+		ocfs2_process_umount_request(osb, node_num);
+		goto respond;
+	case OCFS2_VOTE_REQ_MOUNT:
+		ocfs2_process_mount_request(osb, node_num);
+		goto respond;
+	default:
+		/* avoids a gcc warning */
+		break;
+	}
+
+	/* We cannot process the remaining message types before we're
+	 * fully mounted. It's perfectly safe however to send a 'yes'
+	 * response as we can't possibly have any of the state they're
+	 * asking us to modify yet. */
+	if (atomic_read(&osb->vol_state) == VOLUME_INIT)
+		goto respond;
+
+	/* If we get here, then the request is against an inode. */
+	inode = ocfs2_ilookup_for_vote(osb, blkno,
+				       request == OCFS2_VOTE_REQ_DELETE);
+
+	/* Not finding the inode is perfectly valid - it means we're
+	 * not interested in what the other node is about to do to it
+	 * so in those cases we automatically respond with an
+	 * affirmative. Cluster locking ensures that we won't race
+	 * interest in the inode with this vote request. */
+	if (!inode)
+		goto respond;
+
+	/* Check generation values. It's possible for us to get a
+	 * request against a stale inode. If so then we proceed as if
+	 * we had not found an inode in the first place. */
+	if (inode->i_generation != generation) {
+		mlog(0, "generation passed %u != inode generation = %u, "
+		     "ip_flags = %x, ip_blkno = %"MLFu64", msg %"MLFu64", "
+		     "i_count = %u, message type = %u\n",
+		     generation, inode->i_generation, OCFS2_I(inode)->ip_flags,
+		     OCFS2_I(inode)->ip_blkno, blkno,
+		     atomic_read(&inode->i_count), request);
+		iput(inode);
+		inode = NULL;
+		goto respond;
+	}
+
+	switch (request) {
+	case OCFS2_VOTE_REQ_DELETE:
+		vote_response = ocfs2_process_delete_request(inode,
+							     &orphaned_slot);
+		break;
+	case OCFS2_VOTE_REQ_RENAME:
+		rename = 1;
+		/* fall through */
+	case OCFS2_VOTE_REQ_UNLINK:
+		parent_blkno = be64_to_cpu(msg->v_unlink_parent);
+		namelen = be32_to_cpu(msg->v_unlink_namelen);
+		/* new_nlink will be ignored in case of a rename vote */
+		new_nlink = be32_to_cpu(msg->md1.v_nlink);
+		ocfs2_process_dentry_request(inode, rename, new_nlink,
+					     parent_blkno, namelen,
+					     msg->v_unlink_dirent);
+		break;
+	default:
+		mlog(ML_ERROR, "node %u, invalid request: %u\n",
+		     node_num, request);
+		vote_response = OCFS2_RESPONSE_BAD_MSG;
+	}
+
+respond:
+	/* Response structure is small so we just put it on the stack
+	 * and stuff it inline. */
+	memset(&response, 0, sizeof(struct ocfs2_response_msg));
+	response.r_hdr.h_response_id = hdr->h_response_id;
+	response.r_hdr.h_blkno = hdr->h_blkno;
+	response.r_hdr.h_generation = hdr->h_generation;
+	response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
+	response.r_response = cpu_to_be32(vote_response);
+	response.r_orphaned_slot = cpu_to_be32(orphaned_slot);
+
+	net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
+					osb->net_key,
+					&response,
+					sizeof(struct ocfs2_response_msg),
+					node_num,
+					NULL);
+	/* We still want to error print for ENOPROTOOPT here. The
+	 * sending node shouldn't have unregistered his net handler
+	 * without sending an unmount vote 1st */
+	if (net_status < 0
+	    && net_status != -ETIMEDOUT
+	    && net_status != -ENOTCONN)
+		mlog(ML_ERROR, "message to node %u fails with error %d!\n",
+		     node_num, net_status);
+
+	if (inode)
+		iput(inode);
+}
+
+static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
+{
+	unsigned long processed;
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_vote_work *work;
+
+	mlog_entry_void();
+
+	spin_lock(&osb->vote_task_lock);
+	/* grab this early so we know to try again if a state change and
+	 * wake happens part-way through our work  */
+	osb->vote_work_sequence = osb->vote_wake_sequence;
+
+	processed = osb->blocked_lock_count;
+	while (processed) {
+		BUG_ON(list_empty(&osb->blocked_lock_list));
+
+		lockres = list_entry(osb->blocked_lock_list.next,
+				     struct ocfs2_lock_res, l_blocked_list);
+		list_del_init(&lockres->l_blocked_list);
+		osb->blocked_lock_count--;
+		spin_unlock(&osb->vote_task_lock);
+
+		BUG_ON(!processed);
+		processed--;
+
+		ocfs2_process_blocked_lock(osb, lockres);
+
+		spin_lock(&osb->vote_task_lock);
+	}
+
+	while (osb->vote_count) {
+		BUG_ON(list_empty(&osb->vote_list));
+		work = list_entry(osb->vote_list.next,
+				  struct ocfs2_vote_work, w_list);
+		list_del(&work->w_list);
+		osb->vote_count--;
+		spin_unlock(&osb->vote_task_lock);
+
+		ocfs2_process_vote(osb, &work->w_msg);
+		kfree(work);
+
+		spin_lock(&osb->vote_task_lock);
+	}
+	spin_unlock(&osb->vote_task_lock);
+
+	mlog_exit_void();
+}
+
+static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb)
+{
+	int empty = 0;
+
+	spin_lock(&osb->vote_task_lock);
+	if (list_empty(&osb->blocked_lock_list) &&
+	    list_empty(&osb->vote_list))
+		empty = 1;
+
+	spin_unlock(&osb->vote_task_lock);
+	return empty;
+}
+
+static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb)
+{
+	int should_wake = 0;
+
+	spin_lock(&osb->vote_task_lock);
+	if (osb->vote_work_sequence != osb->vote_wake_sequence)
+		should_wake = 1;
+	spin_unlock(&osb->vote_task_lock);
+
+	return should_wake;
+}
+
+int ocfs2_vote_thread(void *arg)
+{
+	int status = 0;
+	struct ocfs2_super *osb = arg;
+
+	/* only quit once we've been asked to stop and there is no more
+	 * work available */
+	while (!(kthread_should_stop() &&
+		 ocfs2_vote_thread_lists_empty(osb))) {
+
+		wait_event_interruptible(osb->vote_event,
+					 ocfs2_vote_thread_should_wake(osb) ||
+					 kthread_should_stop());
+
+		mlog(0, "vote_thread: awoken\n");
+
+		ocfs2_vote_thread_do_work(osb);
+	}
+
+	osb->vote_task = NULL;
+	return status;
+}
+
+static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id)
+{
+	struct ocfs2_net_wait_ctxt *w;
+
+	w = kcalloc(1, sizeof(*w), GFP_KERNEL);
+	if (!w) {
+		mlog_errno(-ENOMEM);
+		goto bail;
+	}
+
+	INIT_LIST_HEAD(&w->n_list);
+	init_waitqueue_head(&w->n_event);
+	ocfs2_node_map_init(&w->n_node_map);
+	w->n_response_id = response_id;
+	w->n_callback = NULL;
+bail:
+	return w;
+}
+
+static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb)
+{
+	unsigned int ret;
+
+	spin_lock(&osb->net_response_lock);
+	ret = ++osb->net_response_ids;
+	spin_unlock(&osb->net_response_lock);
+
+	return ret;
+}
+
+static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb,
+					struct ocfs2_net_wait_ctxt *w)
+{
+	spin_lock(&osb->net_response_lock);
+	list_del(&w->n_list);
+	spin_unlock(&osb->net_response_lock);
+}
+
+static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb,
+				      struct ocfs2_net_wait_ctxt *w)
+{
+	spin_lock(&osb->net_response_lock);
+	list_add_tail(&w->n_list,
+		      &osb->net_response_list);
+	spin_unlock(&osb->net_response_lock);
+}
+
+static void __ocfs2_mark_node_responded(struct ocfs2_super *osb,
+					struct ocfs2_net_wait_ctxt *w,
+					int node_num)
+{
+	assert_spin_locked(&osb->net_response_lock);
+
+	ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num);
+	if (ocfs2_node_map_is_empty(osb, &w->n_node_map))
+		wake_up(&w->n_event);
+}
+
+/* Intended to be called from the node down callback, we fake remove
+ * the node from all our response contexts */
+void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
+					int node_num)
+{
+	struct list_head *p;
+	struct ocfs2_net_wait_ctxt *w = NULL;
+
+	spin_lock(&osb->net_response_lock);
+
+	list_for_each(p, &osb->net_response_list) {
+		w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
+
+		__ocfs2_mark_node_responded(osb, w, node_num);
+	}
+
+	spin_unlock(&osb->net_response_lock);
+}
+
+static int ocfs2_broadcast_vote(struct ocfs2_super *osb,
+				struct ocfs2_vote_msg *request,
+				unsigned int response_id,
+				int *response,
+				struct ocfs2_net_response_cb *callback)
+{
+	int status, i, remote_err;
+	struct ocfs2_net_wait_ctxt *w = NULL;
+	int dequeued = 0;
+
+	mlog_entry_void();
+
+	w = ocfs2_new_net_wait_ctxt(response_id);
+	if (!w) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+	w->n_callback = callback;
+
+	/* we're pretty much ready to go at this point, and this fills
+	 * in n_response which we need anyway... */
+	ocfs2_queue_net_wait_ctxt(osb, w);
+
+	i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0);
+
+	while (i != O2NM_INVALID_NODE_NUM) {
+		if (i != osb->node_num) {
+			mlog(0, "trying to send request to node %i\n", i);
+			ocfs2_node_map_set_bit(osb, &w->n_node_map, i);
+
+			remote_err = 0;
+			status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE,
+						    osb->net_key,
+						    request,
+						    sizeof(*request),
+						    i,
+						    &remote_err);
+			if (status == -ETIMEDOUT) {
+				mlog(0, "remote node %d timed out!\n", i);
+				status = -EAGAIN;
+				goto bail;
+			}
+			if (remote_err < 0) {
+				status = remote_err;
+				mlog(0, "remote error %d on node %d!\n",
+				     remote_err, i);
+				mlog_errno(status);
+				goto bail;
+			}
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		i++;
+		i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i);
+		mlog(0, "next is %d, i am %d\n", i, osb->node_num);
+	}
+	mlog(0, "done sending, now waiting on responses...\n");
+
+	wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map));
+
+	ocfs2_dequeue_net_wait_ctxt(osb, w);
+	dequeued = 1;
+
+	*response = w->n_response;
+	status = 0;
+bail:
+	if (w) {
+		if (!dequeued)
+			ocfs2_dequeue_net_wait_ctxt(osb, w);
+		kfree(w);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
+						      u64 blkno,
+						      unsigned int generation,
+						      enum ocfs2_vote_request type,
+						      u32 priv)
+{
+	struct ocfs2_vote_msg *request;
+	struct ocfs2_msg_hdr *hdr;
+
+	BUG_ON(!ocfs2_is_valid_vote_request(type));
+
+	request = kcalloc(1, sizeof(*request), GFP_KERNEL);
+	if (!request) {
+		mlog_errno(-ENOMEM);
+	} else {
+		hdr = &request->v_hdr;
+		hdr->h_node_num = cpu_to_be32(osb->node_num);
+		hdr->h_request = cpu_to_be32(type);
+		hdr->h_blkno = cpu_to_be64(blkno);
+		hdr->h_generation = cpu_to_be32(generation);
+
+		request->md1.v_generic1 = cpu_to_be32(priv);
+	}
+
+	return request;
+}
+
+/* Complete the buildup of a new vote request and process the
+ * broadcast return value. */
+static int ocfs2_do_request_vote(struct ocfs2_super *osb,
+				 struct ocfs2_vote_msg *request,
+				 struct ocfs2_net_response_cb *callback)
+{
+	int status, response;
+	unsigned int response_id;
+	struct ocfs2_msg_hdr *hdr;
+
+	response_id = ocfs2_new_response_id(osb);
+
+	hdr = &request->v_hdr;
+	hdr->h_response_id = cpu_to_be32(response_id);
+
+	status = ocfs2_broadcast_vote(osb, request, response_id, &response,
+				      callback);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = response;
+bail:
+
+	return status;
+}
+
+static int ocfs2_request_vote(struct inode *inode,
+			      struct ocfs2_vote_msg *request,
+			      struct ocfs2_net_response_cb *callback)
+{
+	int status;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (ocfs2_inode_is_new(inode))
+		return 0;
+
+	status = -EAGAIN;
+	while (status == -EAGAIN) {
+		if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
+		    signal_pending(current))
+			return -ERESTARTSYS;
+
+		status = ocfs2_super_lock(osb, 0);
+		if (status < 0) {
+			mlog_errno(status);
+			break;
+		}
+
+		status = 0;
+		if (!ocfs2_node_map_is_only(osb, &osb->mounted_map,
+					   osb->node_num))
+			status = ocfs2_do_request_vote(osb, request, callback);
+
+		ocfs2_super_unlock(osb, 0);
+	}
+	return status;
+}
+
+static void ocfs2_delete_response_cb(void *priv,
+				     struct ocfs2_response_msg *resp)
+{
+	int orphaned_slot, node;
+	struct inode *inode = priv;
+
+	orphaned_slot = be32_to_cpu(resp->r_orphaned_slot);
+	node = be32_to_cpu(resp->r_hdr.h_node_num);
+	mlog(0, "node %d tells us that inode %"MLFu64" is orphaned in slot "
+	     "%d\n", node, OCFS2_I(inode)->ip_blkno, orphaned_slot);
+
+	/* The other node may not actually know which slot the inode
+	 * is orphaned in. */
+	if (orphaned_slot == OCFS2_INVALID_SLOT)
+		return;
+
+	/* Ok, the responding node knows which slot this inode is
+	 * orphaned in. We verify that the information is correct and
+	 * then record this in the inode. ocfs2_delete_inode will use
+	 * this information to determine which lock to take. */
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot &&
+			OCFS2_I(inode)->ip_orphaned_slot
+			!= OCFS2_INVALID_SLOT, "Inode %"MLFu64": Node %d "
+			"says it's orphaned in slot %d, we think it's in %d\n",
+			OCFS2_I(inode)->ip_blkno,
+			be32_to_cpu(resp->r_hdr.h_node_num),
+			orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot);
+
+	OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot;
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+}
+
+int ocfs2_request_delete_vote(struct inode *inode)
+{
+	int orphaned_slot, status;
+	struct ocfs2_net_response_cb delete_cb;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_vote_msg *request;
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	delete_cb.rc_cb = ocfs2_delete_response_cb;
+	delete_cb.rc_priv = inode;
+
+	mlog(0, "Inode %"MLFu64", we start thinking orphaned slot is %d\n",
+	     OCFS2_I(inode)->ip_blkno, orphaned_slot);
+
+	status = -ENOMEM;
+	request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
+					 inode->i_generation,
+					 OCFS2_VOTE_REQ_DELETE, orphaned_slot);
+	if (request) {
+		status = ocfs2_request_vote(inode, request, &delete_cb);
+
+		kfree(request);
+	}
+
+	return status;
+}
+
+static void ocfs2_setup_unlink_vote(struct ocfs2_vote_msg *request,
+				    struct dentry *dentry)
+{
+	struct inode *parent = dentry->d_parent->d_inode;
+
+	/* We need some values which will uniquely identify a dentry
+	 * on the other nodes so that they can find it and run
+	 * d_delete against it. Parent directory block and full name
+	 * should suffice. */
+
+	mlog(0, "unlink/rename request: parent: %"MLFu64" name: %.*s\n",
+	     OCFS2_I(parent)->ip_blkno, dentry->d_name.len,
+	     dentry->d_name.name);
+
+	request->v_unlink_parent = cpu_to_be64(OCFS2_I(parent)->ip_blkno);
+	request->v_unlink_namelen = cpu_to_be32(dentry->d_name.len);
+	memcpy(request->v_unlink_dirent, dentry->d_name.name,
+	       dentry->d_name.len);
+}
+
+int ocfs2_request_unlink_vote(struct inode *inode,
+			      struct dentry *dentry,
+			      unsigned int nlink)
+{
+	int status;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_vote_msg *request;
+
+	if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN)
+		return -ENAMETOOLONG;
+
+	status = -ENOMEM;
+	request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
+					 inode->i_generation,
+					 OCFS2_VOTE_REQ_UNLINK, nlink);
+	if (request) {
+		ocfs2_setup_unlink_vote(request, dentry);
+
+		status = ocfs2_request_vote(inode, request, NULL);
+
+		kfree(request);
+	}
+	return status;
+}
+
+int ocfs2_request_rename_vote(struct inode *inode,
+			      struct dentry *dentry)
+{
+	int status;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_vote_msg *request;
+
+	if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN)
+		return -ENAMETOOLONG;
+
+	status = -ENOMEM;
+	request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
+					 inode->i_generation,
+					 OCFS2_VOTE_REQ_RENAME, 0);
+	if (request) {
+		ocfs2_setup_unlink_vote(request, dentry);
+
+		status = ocfs2_request_vote(inode, request, NULL);
+
+		kfree(request);
+	}
+	return status;
+}
+
+int ocfs2_request_mount_vote(struct ocfs2_super *osb)
+{
+	int status;
+	struct ocfs2_vote_msg *request = NULL;
+
+	request = ocfs2_new_vote_request(osb, 0ULL, 0,
+					 OCFS2_VOTE_REQ_MOUNT, 0);
+	if (!request) {
+		status = -ENOMEM;
+		goto bail;
+	}
+
+	status = -EAGAIN;
+	while (status == -EAGAIN) {
+		if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
+		    signal_pending(current)) {
+			status = -ERESTARTSYS;
+			goto bail;
+		}
+
+		if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
+					   osb->node_num)) {
+			status = 0;
+			goto bail;
+		}
+
+		status = ocfs2_do_request_vote(osb, request, NULL);
+	}
+
+bail:
+	if (request)
+		kfree(request);
+
+	return status;
+}
+
+int ocfs2_request_umount_vote(struct ocfs2_super *osb)
+{
+	int status;
+	struct ocfs2_vote_msg *request = NULL;
+
+	request = ocfs2_new_vote_request(osb, 0ULL, 0,
+					 OCFS2_VOTE_REQ_UMOUNT, 0);
+	if (!request) {
+		status = -ENOMEM;
+		goto bail;
+	}
+
+	status = -EAGAIN;
+	while (status == -EAGAIN) {
+		/* Do not check signals on this vote... We really want
+		 * this one to go all the way through. */
+
+		if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
+					   osb->node_num)) {
+			status = 0;
+			goto bail;
+		}
+
+		status = ocfs2_do_request_vote(osb, request, NULL);
+	}
+
+bail:
+	if (request)
+		kfree(request);
+
+	return status;
+}
+
+/* TODO: This should eventually be a hash table! */
+static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb,
+							       u32 response_id)
+{
+	struct list_head *p;
+	struct ocfs2_net_wait_ctxt *w = NULL;
+
+	list_for_each(p, &osb->net_response_list) {
+		w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
+		if (response_id == w->n_response_id)
+			break;
+		w = NULL;
+	}
+
+	return w;
+}
+
+/* Translate response codes into local node errno values */
+static inline int ocfs2_translate_response(int response)
+{
+	int ret;
+
+	switch (response) {
+	case OCFS2_RESPONSE_OK:
+		ret = 0;
+		break;
+
+	case OCFS2_RESPONSE_BUSY:
+		ret = -EBUSY;
+		break;
+
+	default:
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+static int ocfs2_handle_response_message(struct o2net_msg *msg,
+					 u32 len,
+					 void *data)
+{
+	unsigned int response_id, node_num;
+	int response_status;
+	struct ocfs2_super *osb = data;
+	struct ocfs2_response_msg *resp;
+	struct ocfs2_net_wait_ctxt * w;
+	struct ocfs2_net_response_cb *resp_cb;
+
+	resp = (struct ocfs2_response_msg *) msg->buf;
+
+	response_id = be32_to_cpu(resp->r_hdr.h_response_id);
+	node_num = be32_to_cpu(resp->r_hdr.h_node_num);
+	response_status = 
+		ocfs2_translate_response(be32_to_cpu(resp->r_response));
+
+	mlog(0, "received response message:\n");
+	mlog(0, "h_response_id = %u\n", response_id);
+	mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request));
+	mlog(0, "h_blkno = %"MLFu64"\n", be64_to_cpu(resp->r_hdr.h_blkno));
+	mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation));
+	mlog(0, "h_node_num = %u\n", node_num);
+	mlog(0, "r_response = %d\n", response_status);
+
+	spin_lock(&osb->net_response_lock);
+	w = __ocfs2_find_net_wait_ctxt(osb, response_id);
+	if (!w) {
+		mlog(0, "request not found!\n");
+		goto bail;
+	}
+	resp_cb = w->n_callback;
+
+	if (response_status && (!w->n_response)) {
+		/* we only really need one negative response so don't
+		 * set it twice. */
+		w->n_response = response_status;
+	}
+
+	if (resp_cb) {
+		spin_unlock(&osb->net_response_lock);
+
+		resp_cb->rc_cb(resp_cb->rc_priv, resp);
+
+		spin_lock(&osb->net_response_lock);
+	}
+
+	__ocfs2_mark_node_responded(osb, w, node_num);
+bail:
+	spin_unlock(&osb->net_response_lock);
+
+	return 0;
+}
+
+static int ocfs2_handle_vote_message(struct o2net_msg *msg,
+				     u32 len,
+				     void *data)
+{
+	int status;
+	struct ocfs2_super *osb = data;
+	struct ocfs2_vote_work *work;
+
+	work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_KERNEL);
+	if (!work) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	INIT_LIST_HEAD(&work->w_list);
+	memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg));
+
+	mlog(0, "scheduling vote request:\n");
+	mlog(0, "h_response_id = %u\n",
+	     be32_to_cpu(work->w_msg.v_hdr.h_response_id));
+	mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request));
+	mlog(0, "h_blkno = %"MLFu64"\n",
+	     be64_to_cpu(work->w_msg.v_hdr.h_blkno));
+	mlog(0, "h_generation = %u\n",
+	     be32_to_cpu(work->w_msg.v_hdr.h_generation));
+	mlog(0, "h_node_num = %u\n",
+	     be32_to_cpu(work->w_msg.v_hdr.h_node_num));
+	mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1));
+
+	spin_lock(&osb->vote_task_lock);
+	list_add_tail(&work->w_list, &osb->vote_list);
+	osb->vote_count++;
+	spin_unlock(&osb->vote_task_lock);
+
+	ocfs2_kick_vote_thread(osb);
+
+	status = 0;
+bail:
+	return status;
+}
+
+void ocfs2_unregister_net_handlers(struct ocfs2_super *osb)
+{
+	if (!osb->net_key)
+		return;
+
+	o2net_unregister_handler_list(&osb->osb_net_handlers);
+
+	if (!list_empty(&osb->net_response_list))
+		mlog(ML_ERROR, "net response list not empty!\n");
+
+	osb->net_key = 0;
+}
+
+int ocfs2_register_net_handlers(struct ocfs2_super *osb)
+{
+	int status = 0;
+
+	status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE,
+					osb->net_key,
+					sizeof(struct ocfs2_response_msg),
+					ocfs2_handle_response_message,
+					osb, &osb->osb_net_handlers);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE,
+					osb->net_key,
+					sizeof(struct ocfs2_vote_msg),
+					ocfs2_handle_vote_message,
+					osb, &osb->osb_net_handlers);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+bail:
+	if (status < 0)
+		ocfs2_unregister_net_handlers(osb);
+
+	return status;
+}
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h
new file mode 100644
index 0000000..9cce607
--- /dev/null
+++ b/fs/ocfs2/vote.h
@@ -0,0 +1,56 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * vote.h
+ *
+ * Interface to the OCFS2 voting code: vote thread, vote requests, net handlers.
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+
+#ifndef VOTE_H
+#define VOTE_H
+
+int ocfs2_vote_thread(void *arg);
+static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
+{
+	spin_lock(&osb->vote_task_lock);
+	/* make sure the voting thread gets a swipe at whatever changes
+	 * the caller may have made to the voting state */
+	osb->vote_wake_sequence++;
+	spin_unlock(&osb->vote_task_lock);
+	wake_up(&osb->vote_event);
+}
+
+int ocfs2_request_delete_vote(struct inode *inode);
+int ocfs2_request_unlink_vote(struct inode *inode,
+			      struct dentry *dentry,
+			      unsigned int nlink);
+int ocfs2_request_rename_vote(struct inode *inode,
+			      struct dentry *dentry);
+int ocfs2_request_mount_vote(struct ocfs2_super *osb);
+int ocfs2_request_umount_vote(struct ocfs2_super *osb);
+int ocfs2_register_net_handlers(struct ocfs2_super *osb);
+void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
+
+void ocfs2_mark_inode_remotely_deleted(struct inode *inode);
+
+void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
+					int node_num);
+#endif
-- 
cgit v1.1


From b4e40a51881931bfcbc78a585e875bb2784d6d10 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 15 Dec 2005 14:31:24 -0800
Subject: [PATCH] OCFS2: The Second Oracle Cluster Filesystem

Link the code into the kernel build system. OCFS2 is marked as
experimental.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
---
 fs/Kconfig  | 53 ++++++++++++++++++++++++++++++++++++++++++-----------
 fs/Makefile |  1 +
 2 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/fs/Kconfig b/fs/Kconfig
index ba1dbe2..59b1795 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -70,6 +70,7 @@ config FS_XIP
 
 config EXT3_FS
 	tristate "Ext3 journalling file system support"
+	select JBD
 	help
 	  This is the journaling version of the Second extended file system
 	  (often called ext3), the de facto standard Linux file system
@@ -138,23 +139,20 @@ config EXT3_FS_SECURITY
 	  extended attributes for file security labels, say N.
 
 config JBD
-# CONFIG_JBD could be its own option (even modular), but until there are
-# other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS
-# dep_tristate '  Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS
 	tristate
-	default EXT3_FS
 	help
 	  This is a generic journaling layer for block devices.  It is
-	  currently used by the ext3 file system, but it could also be used to
-	  add journal support to other file systems or block devices such as
-	  RAID or LVM.
+	  currently used by the ext3 and OCFS2 file systems, but it could
+	  also be used to add journal support to other file systems or block
+	  devices such as RAID or LVM.
 
-	  If you are using the ext3 file system, you need to say Y here. If
-	  you are not using ext3 then you will probably want to say N.
+	  If you are using the ext3 or OCFS2 file systems, you need to
+	  say Y here. If you are not using ext3 or OCFS2 then you will probably
+	  want to say N.
 
 	  To compile this device as a module, choose M here: the module will be
-	  called jbd.  If you are compiling ext3 into the kernel, you cannot
-	  compile this code as a module.
+	  called jbd.  If you are compiling ext3 or OCFS2 into the kernel,
+	  you cannot compile this code as a module.
 
 config JBD_DEBUG
 	bool "JBD (ext3) debugging support"
@@ -326,6 +324,39 @@ config FS_POSIX_ACL
 
 source "fs/xfs/Kconfig"
 
+config OCFS2_FS
+	tristate "OCFS2 file system support (EXPERIMENTAL)"
+	depends on NET && EXPERIMENTAL
+	select CONFIGFS_FS
+	select JBD
+	select CRC32
+	select INET
+	help
+	  OCFS2 is a general purpose extent based shared disk cluster file
+	  system with many similarities to ext3. It supports 64 bit inode
+	  numbers, and has automatically extending metadata groups which may
+	  also make it attractive for non-clustered use.
+
+	  You'll want to install the ocfs2-tools package in order to at least
+	  get "mount.ocfs2".
+
+	  Project web page:    http://oss.oracle.com/projects/ocfs2
+	  Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+	  OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+	  Note: Features which OCFS2 does not support yet:
+	          - extended attributes
+	          - readonly mount
+		  - shared writeable mmap
+	          - loopback is supported, but data written will not
+	            be cluster coherent.
+	          - quotas
+	          - cluster aware flock
+	          - Directory change notification (F_NOTIFY)
+	          - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
+	          - POSIX ACLs
+	          - readpages / writepages (not user visible)
+
 config MINIX_FS
 	tristate "Minix fs support"
 	help
diff --git a/fs/Makefile b/fs/Makefile
index ff3d48a..7367611 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -102,3 +102,4 @@ obj-$(CONFIG_HOSTFS)		+= hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
+obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
-- 
cgit v1.1


From 82353b594c784deabb8d9764b477e65c2b3726f9 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 19 Dec 2005 11:16:07 -0800
Subject: [PATCH] This patch contains the following cleanups:

- cluster/sys.c: make needlessly global code static
- dlm/: "extern" declarations for variables belong into header files
        (and in this case, they are already in dlmdomain.h)

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/cluster/sys.c   | 4 ++--
 fs/ocfs2/dlm/dlmmaster.c | 4 +---
 fs/ocfs2/dlm/dlmthread.c | 3 ---
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index f1e9946..1d9f6ac 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -50,7 +50,7 @@ static ssize_t o2cb_interface_revision_show(char *buf)
 	return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
 }
 
-O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL);
+static O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL);
 
 static struct attribute *o2cb_attrs[] = {
 	&o2cb_attr_interface_revision.attr,
@@ -73,7 +73,7 @@ static struct kobj_type o2cb_subsys_type = {
 };
 
 /* gives us o2cb_subsys */
-decl_subsys(o2cb, NULL, NULL);
+static decl_subsys(o2cb, NULL, NULL);
 
 static ssize_t
 o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 0472795..27e984f 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -48,6 +48,7 @@
 #include "dlmapi.h"
 #include "dlmcommon.h"
 #include "dlmdebug.h"
+#include "dlmdomain.h"
 
 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
 #include "cluster/masklog.h"
@@ -178,9 +179,6 @@ static void dlm_dump_mles(struct dlm_ctxt *dlm)
 	spin_unlock(&dlm->master_lock);
 }
 
-extern spinlock_t dlm_domain_lock;
-extern struct list_head dlm_domains;
-
 int dlm_dump_all_mles(const char __user *data, unsigned int len)
 {
 	struct list_head *iter;
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 92cd5cd..5be9d14 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -52,9 +52,6 @@
 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD)
 #include "cluster/masklog.h"
 
-extern spinlock_t dlm_domain_lock;
-extern struct list_head dlm_domains;
-
 static int dlm_thread(void *data);
 
 static void dlm_flush_asts(struct dlm_ctxt *dlm);
-- 
cgit v1.1


From 51e7a5987058c6b4d0c1337587f2ec0c34ffa708 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 19 Dec 2005 11:21:42 -0800
Subject: [PATCH] o Update Kconfig documentation to reflect support for
 readonly mounts.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/Kconfig b/fs/Kconfig
index 59b1795..382e3b2 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -346,7 +346,6 @@ config OCFS2_FS
 
 	  Note: Features which OCFS2 does not support yet:
 	          - extended attributes
-	          - readonly mount
 		  - shared writeable mmap
 	          - loopback is supported, but data written will not
 	            be cluster coherent.
-- 
cgit v1.1


From 6c59f9d9fb95934bf3d7d64249b338ce79953b5b Mon Sep 17 00:00:00 2001
From: Jody McIntyre <scjody@modernduck.com>
Date: Thu, 5 Jan 2006 23:04:08 -0500
Subject: Update MAINTAINERS - Jody is no longer at Steamballoon.

---
 MAINTAINERS | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 6246b7f..5daae53 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1230,7 +1230,7 @@ IEEE 1394 SUBSYSTEM
 P:	Ben Collins
 M:	bcollins@debian.org
 P:	Jody McIntyre
-M:	scjody@steamballoon.com
+M:	scjody@modernduck.com
 L:	linux1394-devel@lists.sourceforge.net
 W:	http://www.linux1394.org/
 T:	git kernel.org:/pub/scm/linux/kernel/git/scjody/ieee1394.git
@@ -1240,14 +1240,14 @@ IEEE 1394 OHCI DRIVER
 P:	Ben Collins
 M:	bcollins@debian.org
 P:	Jody McIntyre
-M:	scjody@steamballoon.com
+M:	scjody@modernduck.com
 L:	linux1394-devel@lists.sourceforge.net
 W:	http://www.linux1394.org/
 S:	Maintained
 
 IEEE 1394 PCILYNX DRIVER
 P:	Jody McIntyre
-M:	scjody@steamballoon.com
+M:	scjody@modernduck.com
 L:	linux1394-devel@lists.sourceforge.net
 W:	http://www.linux1394.org/
 S:	Maintained
-- 
cgit v1.1


From 9f155b9802bb7049cd0f216c3fe903b58620df11 Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <76306.1226@compuserve.com>
Date: Thu, 5 Jan 2006 23:11:29 -0500
Subject: [PATCH] i386: PTRACE_POKEUSR: allow changing RF bit in EFLAGS
 register.

Setting RF (resume flag) allows a debugger to resume execution after a
code breakpoint without tripping the breakpoint again.  It is reset by
the CPU after execution of one instruction.

Requested by Stephane Eranian:
  "I am trying to the user HW debug registers on i386 and I am running
   into a problem with ptrace() not allowing access to EFLAGS_RF for
   POKEUSER (see FLAG_MASK).  [ ...  ] It avoids the need to remove the
   breakpoint, single step, and reinstall.  The equivalent functionality
   exists on IA-64 and is allowed by ptrace()"

Cc: Stephane Eranian <eranian@hpl.hp.com>
Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/ptrace.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index 5ffbb4b..5c1fb6a 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -32,9 +32,12 @@
  * in exit.c or in signal.c.
  */
 
-/* determines which flags the user has access to. */
-/* 1 = access 0 = no access */
-#define FLAG_MASK 0x00044dd5
+/*
+ * Determines which flags the user has access to [1 = access, 0 = no access].
+ * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
+ * Also masks reserved bits (31-22, 15, 5, 3, 1).
+ */
+#define FLAG_MASK 0x00054dd5
 
 /* set's the trap flag. */
 #define TRAP_FLAG 0x100
-- 
cgit v1.1


From ef9be1d336378de279d4e37779f1b83cebadbcc0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 11 Nov 2005 14:27:09 +0100
Subject: [BLOCK] as-iosched: update alias handling

Unlike other ioscheds, as-iosched handles aliases by chaining them using
rq->queuelist.  As aliased requests are very rare in the first place,
this complicates merge/dispatch handling without meaningful
performance improvement.  This patch updates as-iosched to dump
aliased requests into dispatch queue as other ioscheds do.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/as-iosched.c | 144 ++++++++++-------------------------------------------
 1 file changed, 25 insertions(+), 119 deletions(-)

diff --git a/block/as-iosched.c b/block/as-iosched.c
index 43fa204..8da3cf6 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -182,6 +182,9 @@ struct as_rq {
 
 static kmem_cache_t *arq_pool;
 
+static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq);
+static void as_antic_stop(struct as_data *ad);
+
 /*
  * IO Context helper functions
  */
@@ -370,7 +373,7 @@ static struct as_rq *as_find_first_arq(struct as_data *ad, int data_dir)
  * existing request against the same sector), which can happen when using
  * direct IO, then return the alias.
  */
-static struct as_rq *as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
+static struct as_rq *__as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
 {
 	struct rb_node **p = &ARQ_RB_ROOT(ad, arq)->rb_node;
 	struct rb_node *parent = NULL;
@@ -397,6 +400,16 @@ static struct as_rq *as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
 	return NULL;
 }
 
+static void as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
+{
+	struct as_rq *alias;
+
+	while ((unlikely(alias = __as_add_arq_rb(ad, arq)))) {
+		as_move_to_dispatch(ad, alias);
+		as_antic_stop(ad);
+	}
+}
+
 static inline void as_del_arq_rb(struct as_data *ad, struct as_rq *arq)
 {
 	if (!ON_RB(&arq->rb_node)) {
@@ -1133,23 +1146,6 @@ static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq)
 	/*
 	 * take it off the sort and fifo list, add to dispatch queue
 	 */
-	while (!list_empty(&rq->queuelist)) {
-		struct request *__rq = list_entry_rq(rq->queuelist.next);
-		struct as_rq *__arq = RQ_DATA(__rq);
-
-		list_del(&__rq->queuelist);
-
-		elv_dispatch_add_tail(ad->q, __rq);
-
-		if (__arq->io_context && __arq->io_context->aic)
-			atomic_inc(&__arq->io_context->aic->nr_dispatched);
-
-		WARN_ON(__arq->state != AS_RQ_QUEUED);
-		__arq->state = AS_RQ_DISPATCHED;
-
-		ad->nr_dispatched++;
-	}
-
 	as_remove_queued_request(ad->q, rq);
 	WARN_ON(arq->state != AS_RQ_QUEUED);
 
@@ -1326,49 +1322,12 @@ fifo_expired:
 }
 
 /*
- * Add arq to a list behind alias
- */
-static inline void
-as_add_aliased_request(struct as_data *ad, struct as_rq *arq,
-				struct as_rq *alias)
-{
-	struct request  *req = arq->request;
-	struct list_head *insert = alias->request->queuelist.prev;
-
-	/*
-	 * Transfer list of aliases
-	 */
-	while (!list_empty(&req->queuelist)) {
-		struct request *__rq = list_entry_rq(req->queuelist.next);
-		struct as_rq *__arq = RQ_DATA(__rq);
-
-		list_move_tail(&__rq->queuelist, &alias->request->queuelist);
-
-		WARN_ON(__arq->state != AS_RQ_QUEUED);
-	}
-
-	/*
-	 * Another request with the same start sector on the rbtree.
-	 * Link this request to that sector. They are untangled in
-	 * as_move_to_dispatch
-	 */
-	list_add(&arq->request->queuelist, insert);
-
-	/*
-	 * Don't want to have to handle merges.
-	 */
-	as_del_arq_hash(arq);
-	arq->request->flags |= REQ_NOMERGE;
-}
-
-/*
  * add arq to rbtree and fifo
  */
 static void as_add_request(request_queue_t *q, struct request *rq)
 {
 	struct as_data *ad = q->elevator->elevator_data;
 	struct as_rq *arq = RQ_DATA(rq);
-	struct as_rq *alias;
 	int data_dir;
 
 	arq->state = AS_RQ_NEW;
@@ -1387,33 +1346,17 @@ static void as_add_request(request_queue_t *q, struct request *rq)
 		atomic_inc(&arq->io_context->aic->nr_queued);
 	}
 
-	alias = as_add_arq_rb(ad, arq);
-	if (!alias) {
-		/*
-		 * set expire time (only used for reads) and add to fifo list
-		 */
-		arq->expires = jiffies + ad->fifo_expire[data_dir];
-		list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]);
+	as_add_arq_rb(ad, arq);
+	if (rq_mergeable(arq->request))
+		as_add_arq_hash(ad, arq);
 
-		if (rq_mergeable(arq->request))
-			as_add_arq_hash(ad, arq);
-		as_update_arq(ad, arq); /* keep state machine up to date */
-
-	} else {
-		as_add_aliased_request(ad, arq, alias);
-
-		/*
-		 * have we been anticipating this request?
-		 * or does it come from the same process as the one we are
-		 * anticipating for?
-		 */
-		if (ad->antic_status == ANTIC_WAIT_REQ
-				|| ad->antic_status == ANTIC_WAIT_NEXT) {
-			if (as_can_break_anticipation(ad, arq))
-				as_antic_stop(ad);
-		}
-	}
+	/*
+	 * set expire time (only used for reads) and add to fifo list
+	 */
+	arq->expires = jiffies + ad->fifo_expire[data_dir];
+	list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]);
 
+	as_update_arq(ad, arq); /* keep state machine up to date */
 	arq->state = AS_RQ_QUEUED;
 }
 
@@ -1536,23 +1479,8 @@ static void as_merged_request(request_queue_t *q, struct request *req)
 	 * if the merge was a front merge, we need to reposition request
 	 */
 	if (rq_rb_key(req) != arq->rb_key) {
-		struct as_rq *alias, *next_arq = NULL;
-
-		if (ad->next_arq[arq->is_sync] == arq)
-			next_arq = as_find_next_arq(ad, arq);
-
-		/*
-		 * Note! We should really be moving any old aliased requests
-		 * off this request and try to insert them into the rbtree. We
-		 * currently don't bother. Ditto the next function.
-		 */
 		as_del_arq_rb(ad, arq);
-		if ((alias = as_add_arq_rb(ad, arq))) {
-			list_del_init(&arq->fifo);
-			as_add_aliased_request(ad, arq, alias);
-			if (next_arq)
-				ad->next_arq[arq->is_sync] = next_arq;
-		}
+		as_add_arq_rb(ad, arq);
 		/*
 		 * Note! At this stage of this and the next function, our next
 		 * request may not be optimal - eg the request may have "grown"
@@ -1579,18 +1507,8 @@ static void as_merged_requests(request_queue_t *q, struct request *req,
 	as_add_arq_hash(ad, arq);
 
 	if (rq_rb_key(req) != arq->rb_key) {
-		struct as_rq *alias, *next_arq = NULL;
-
-		if (ad->next_arq[arq->is_sync] == arq)
-			next_arq = as_find_next_arq(ad, arq);
-
 		as_del_arq_rb(ad, arq);
-		if ((alias = as_add_arq_rb(ad, arq))) {
-			list_del_init(&arq->fifo);
-			as_add_aliased_request(ad, arq, alias);
-			if (next_arq)
-				ad->next_arq[arq->is_sync] = next_arq;
-		}
+		as_add_arq_rb(ad, arq);
 	}
 
 	/*
@@ -1610,18 +1528,6 @@ static void as_merged_requests(request_queue_t *q, struct request *req,
 	}
 
 	/*
-	 * Transfer list of aliases
-	 */
-	while (!list_empty(&next->queuelist)) {
-		struct request *__rq = list_entry_rq(next->queuelist.next);
-		struct as_rq *__arq = RQ_DATA(__rq);
-
-		list_move_tail(&__rq->queuelist, &req->queuelist);
-
-		WARN_ON(__arq->state != AS_RQ_QUEUED);
-	}
-
-	/*
 	 * kill knowledge of next, this one is a goner
 	 */
 	as_remove_queued_request(q, next);
-- 
cgit v1.1


From 88ee5ef157202624de2b43b3512fdcb54fda1ab5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Sat, 12 Nov 2005 11:09:12 +0100
Subject: [BLOCK] ll_rw_blk: fastpath get_request()

Originally from: Nick Piggin <nickpiggin@yahoo.com.au>

Move current_io_context out of the get_request fastpath.  Also try to
streamline a few other things in this area.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/ll_rw_blk.c | 70 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 37 insertions(+), 33 deletions(-)

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index d4beb9a..97f4e7e 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -1908,40 +1908,40 @@ static struct request *get_request(request_queue_t *q, int rw, struct bio *bio,
 {
 	struct request *rq = NULL;
 	struct request_list *rl = &q->rq;
-	struct io_context *ioc = current_io_context(GFP_ATOMIC);
-	int priv;
+	struct io_context *ioc = NULL;
+	int may_queue, priv;
 
-	if (rl->count[rw]+1 >= q->nr_requests) {
-		/*
-		 * The queue will fill after this allocation, so set it as
-		 * full, and mark this process as "batching". This process
-		 * will be allowed to complete a batch of requests, others
-		 * will be blocked.
-		 */
-		if (!blk_queue_full(q, rw)) {
-			ioc_set_batching(q, ioc);
-			blk_set_queue_full(q, rw);
-		}
-	}
+	may_queue = elv_may_queue(q, rw, bio);
+	if (may_queue == ELV_MQUEUE_NO)
+		goto rq_starved;
 
-	switch (elv_may_queue(q, rw, bio)) {
-		case ELV_MQUEUE_NO:
-			goto rq_starved;
-		case ELV_MQUEUE_MAY:
-			break;
-		case ELV_MQUEUE_MUST:
-			goto get_rq;
-	}
-
-	if (blk_queue_full(q, rw) && !ioc_batching(q, ioc)) {
-		/*
-		 * The queue is full and the allocating process is not a
-		 * "batcher", and not exempted by the IO scheduler
-		 */
-		goto out;
+	if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) {
+		if (rl->count[rw]+1 >= q->nr_requests) {
+			ioc = current_io_context(GFP_ATOMIC);
+			/*
+			 * The queue will fill after this allocation, so set
+			 * it as full, and mark this process as "batching".
+			 * This process will be allowed to complete a batch of
+			 * requests, others will be blocked.
+			 */
+			if (!blk_queue_full(q, rw)) {
+				ioc_set_batching(q, ioc);
+				blk_set_queue_full(q, rw);
+			} else {
+				if (may_queue != ELV_MQUEUE_MUST
+						&& !ioc_batching(q, ioc)) {
+					/*
+					 * The queue is full and the allocating
+					 * process is not a "batcher", and not
+					 * exempted by the IO scheduler
+					 */
+					goto out;
+				}
+			}
+		}
+		set_queue_congested(q, rw);
 	}
 
-get_rq:
 	/*
 	 * Only allow batching queuers to allocate up to 50% over the defined
 	 * limit of requests, otherwise we could have thousands of requests
@@ -1952,8 +1952,6 @@ get_rq:
 
 	rl->count[rw]++;
 	rl->starved[rw] = 0;
-	if (rl->count[rw] >= queue_congestion_on_threshold(q))
-		set_queue_congested(q, rw);
 
 	priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
 	if (priv)
@@ -1962,7 +1960,7 @@ get_rq:
 	spin_unlock_irq(q->queue_lock);
 
 	rq = blk_alloc_request(q, rw, bio, priv, gfp_mask);
-	if (!rq) {
+	if (unlikely(!rq)) {
 		/*
 		 * Allocation failed presumably due to memory. Undo anything
 		 * we might have messed up.
@@ -1987,6 +1985,12 @@ rq_starved:
 		goto out;
 	}
 
+	/*
+	 * ioc may be NULL here, and ioc_batching will be false. That's
+	 * OK, if the queue is under the request limit then requests need
+	 * not count toward the nr_batch_requests limit. There will always
+	 * be some limit enforced by BLK_BATCH_TIME.
+	 */
 	if (ioc_batching(q, ioc))
 		ioc->nr_batch_requests--;
 	
-- 
cgit v1.1


From 80cfd548eed68cf90c5ae9cfcd6b02230cece756 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Fri, 6 Jan 2006 09:43:28 +0100
Subject: [BLOCK] bio: check for same page merge possibilities in
 __bio_add_page()

For filesystems with a blocksize < page size, we can merge same page
calls into the bio_vec at the end of the bio. This saves segments
on systems with a page size > the "normal" 4kb fs block size.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/bio.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/fs/bio.c b/fs/bio.c
index 38d3e80..dfe242a 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -325,10 +325,31 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
 	if (unlikely(bio_flagged(bio, BIO_CLONED)))
 		return 0;
 
-	if (bio->bi_vcnt >= bio->bi_max_vecs)
+	if (((bio->bi_size + len) >> 9) > max_sectors)
 		return 0;
 
-	if (((bio->bi_size + len) >> 9) > max_sectors)
+	/*
+	 * For filesystems with a blocksize smaller than the pagesize
+	 * we will often be called with the same page as last time and
+	 * a consecutive offset.  Optimize this special case.
+	 */
+	if (bio->bi_vcnt > 0) {
+		struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+		if (page == prev->bv_page &&
+		    offset == prev->bv_offset + prev->bv_len) {
+			prev->bv_len += len;
+			if (q->merge_bvec_fn &&
+			    q->merge_bvec_fn(q, bio, prev) < len) {
+				prev->bv_len -= len;
+				return 0;
+			}
+
+			goto done;
+		}
+	}
+
+	if (bio->bi_vcnt >= bio->bi_max_vecs)
 		return 0;
 
 	/*
@@ -382,6 +403,7 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
 	bio->bi_vcnt++;
 	bio->bi_phys_segments++;
 	bio->bi_hw_segments++;
+ done:
 	bio->bi_size += len;
 	return len;
 }
-- 
cgit v1.1


From 64100099ed22f71cce656c5c2caecf5c9cf255dc Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Fri, 6 Jan 2006 09:46:02 +0100
Subject: [BLOCK] mark some block/ variables const

the patch below marks various read-only variables in block/* as const,
so that gcc can optimize the use of them; eg gcc will replace the use by
the value directly now and will even remove the memory usage of these.

Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/cfq-iosched.c      | 16 ++++++++--------
 block/deadline-iosched.c |  8 ++++----
 block/ll_rw_blk.c        |  2 +-
 block/scsi_ioctl.c       |  2 +-
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ee0bb41..74fae2d 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -25,15 +25,15 @@
 /*
  * tunables
  */
-static int cfq_quantum = 4;		/* max queue in one round of service */
-static int cfq_queued = 8;		/* minimum rq allocate limit per-queue*/
-static int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
-static int cfq_back_max = 16 * 1024;	/* maximum backwards seek, in KiB */
-static int cfq_back_penalty = 2;	/* penalty of a backwards seek */
+static const int cfq_quantum = 4;		/* max queue in one round of service */
+static const int cfq_queued = 8;		/* minimum rq allocate limit per-queue*/
+static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
+static const int cfq_back_max = 16 * 1024;	/* maximum backwards seek, in KiB */
+static const int cfq_back_penalty = 2;		/* penalty of a backwards seek */
 
-static int cfq_slice_sync = HZ / 10;
+static const int cfq_slice_sync = HZ / 10;
 static int cfq_slice_async = HZ / 25;
-static int cfq_slice_async_rq = 2;
+static const int cfq_slice_async_rq = 2;
 static int cfq_slice_idle = HZ / 100;
 
 #define CFQ_IDLE_GRACE		(HZ / 10)
@@ -45,7 +45,7 @@ static int cfq_slice_idle = HZ / 100;
 /*
  * disable queueing at the driver/hardware level
  */
-static int cfq_max_depth = 2;
+static const int cfq_max_depth = 2;
 
 /*
  * for the hash of cfqq inside the cfqd
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 9cbec09..27e494b 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -19,10 +19,10 @@
 /*
  * See Documentation/block/deadline-iosched.txt
  */
-static int read_expire = HZ / 2;  /* max time before a read is submitted. */
-static int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
-static int writes_starved = 2;    /* max times reads can starve a write */
-static int fifo_batch = 16;       /* # of sequential requests treated as one
+static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
+static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
+static const int writes_starved = 2;    /* max times reads can starve a write */
+static const int fifo_batch = 16;       /* # of sequential requests treated as one
 				     by the above parameters. For throughput. */
 
 static const int deadline_hash_shift = 5;
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 97f4e7e..e02c88c 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -1039,7 +1039,7 @@ void blk_queue_invalidate_tags(request_queue_t *q)
 
 EXPORT_SYMBOL(blk_queue_invalidate_tags);
 
-static char *rq_flags[] = {
+static const char * const rq_flags[] = {
 	"REQ_RW",
 	"REQ_FAILFAST",
 	"REQ_SORTED",
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 1d8852f..c2ac36d 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -46,7 +46,7 @@ EXPORT_SYMBOL(scsi_command_size);
 
 static int sg_get_version(int __user *p)
 {
-	static int sg_version_num = 30527;
+	static const int sg_version_num = 30527;
 	return put_user(sg_version_num, p);
 }
 
-- 
cgit v1.1


From 8ffdc6550c47f75ca4e6c9f30a2a89063e035cf2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:49:03 +0100
Subject: [BLOCK] add @uptodate to end_that_request_last() and @error to
 rq_end_io_fn()

add @uptodate argument to end_that_request_last() and @error
to rq_end_io_fn().  there's no generic way to pass error code
to request completion function, making generic error handling
of non-fs request difficult (rq->errors is driver-specific and
each driver uses it differently).  this patch adds @uptodate
to end_that_request_last() and @error to rq_end_io_fn().

for fs requests, this doesn't really matter, so just using the
same uptodate argument used in the last call to
end_that_request_first() should suffice.  imho, this can also
help the generic command-carrying request jens is working on.

Signed-off-by: tejun heo <htejun@gmail.com>
Signed-Off-By: Jens Axboe <axboe@suse.de>
---
 block/elevator.c                |  2 +-
 block/ll_rw_blk.c               | 22 +++++++++++++++-------
 drivers/block/DAC960.c          |  2 +-
 drivers/block/cciss.c           |  2 +-
 drivers/block/cpqarray.c        |  2 +-
 drivers/block/floppy.c          |  2 +-
 drivers/block/nbd.c             |  2 +-
 drivers/block/sx8.c             |  2 +-
 drivers/block/ub.c              |  2 +-
 drivers/block/viodasd.c         |  2 +-
 drivers/cdrom/cdu31a.c          |  2 +-
 drivers/ide/ide-cd.c            |  4 ++--
 drivers/ide/ide-io.c            |  6 +++---
 drivers/message/i2o/i2o_block.c |  2 +-
 drivers/mmc/mmc_block.c         |  4 ++--
 drivers/s390/block/dasd.c       |  2 +-
 drivers/s390/char/tape_block.c  |  2 +-
 drivers/scsi/ide-scsi.c         |  4 ++--
 drivers/scsi/scsi_lib.c         |  2 +-
 drivers/scsi/sd.c               |  2 +-
 include/linux/blkdev.h          |  6 +++---
 21 files changed, 42 insertions(+), 34 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index 6c3fc8a..85a11ce 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -498,7 +498,7 @@ struct request *elv_next_request(request_queue_t *q)
 			blkdev_dequeue_request(rq);
 			rq->flags |= REQ_QUIET;
 			end_that_request_chunk(rq, 0, nr_bytes);
-			end_that_request_last(rq);
+			end_that_request_last(rq, 0);
 		} else {
 			printk(KERN_ERR "%s: bad return=%d\n", __FUNCTION__,
 								ret);
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index e02c88c..8b1ae69 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -344,7 +344,7 @@ EXPORT_SYMBOL(blk_queue_issue_flush_fn);
 /*
  * Cache flushing for ordered writes handling
  */
-static void blk_pre_flush_end_io(struct request *flush_rq)
+static void blk_pre_flush_end_io(struct request *flush_rq, int error)
 {
 	struct request *rq = flush_rq->end_io_data;
 	request_queue_t *q = rq->q;
@@ -362,7 +362,7 @@ static void blk_pre_flush_end_io(struct request *flush_rq)
 	}
 }
 
-static void blk_post_flush_end_io(struct request *flush_rq)
+static void blk_post_flush_end_io(struct request *flush_rq, int error)
 {
 	struct request *rq = flush_rq->end_io_data;
 	request_queue_t *q = rq->q;
@@ -2317,7 +2317,7 @@ EXPORT_SYMBOL(blk_rq_map_kern);
  */
 void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk,
 			   struct request *rq, int at_head,
-			   void (*done)(struct request *))
+			   rq_end_io_fn *done)
 {
 	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
 
@@ -2521,7 +2521,7 @@ EXPORT_SYMBOL(blk_put_request);
  * blk_end_sync_rq - executes a completion event on a request
  * @rq: request to complete
  */
-void blk_end_sync_rq(struct request *rq)
+void blk_end_sync_rq(struct request *rq, int error)
 {
 	struct completion *waiting = rq->waiting;
 
@@ -3183,9 +3183,17 @@ EXPORT_SYMBOL(end_that_request_chunk);
 /*
  * queue lock must be held
  */
-void end_that_request_last(struct request *req)
+void end_that_request_last(struct request *req, int uptodate)
 {
 	struct gendisk *disk = req->rq_disk;
+	int error;
+
+	/*
+	 * extend uptodate bool to allow < 0 value to be direct io error
+	 */
+	error = 0;
+	if (end_io_error(uptodate))
+		error = !uptodate ? -EIO : uptodate;
 
 	if (unlikely(laptop_mode) && blk_fs_request(req))
 		laptop_io_completion();
@@ -3200,7 +3208,7 @@ void end_that_request_last(struct request *req)
 		disk->in_flight--;
 	}
 	if (req->end_io)
-		req->end_io(req);
+		req->end_io(req, error);
 	else
 		__blk_put_request(req->q, req);
 }
@@ -3212,7 +3220,7 @@ void end_request(struct request *req, int uptodate)
 	if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) {
 		add_disk_randomness(req->rq_disk);
 		blkdev_dequeue_request(req);
-		end_that_request_last(req);
+		end_that_request_last(req, uptodate);
 	}
 }
 
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 70eaa5c..21097a3 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -3471,7 +3471,7 @@ static inline boolean DAC960_ProcessCompletedRequest(DAC960_Command_T *Command,
 
 	 if (!end_that_request_first(Request, UpToDate, Command->BlockCount)) {
 
- 	 	end_that_request_last(Request);
+ 	 	end_that_request_last(Request, UpToDate);
 
 		if (Command->Completion) {
 			complete(Command->Completion);
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index c3441b3..d2815b7 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2310,7 +2310,7 @@ static inline void complete_command( ctlr_info_t *h, CommandList_struct *cmd,
 	printk("Done with %p\n", cmd->rq);
 #endif /* CCISS_DEBUG */ 
 
-	end_that_request_last(cmd->rq);
+	end_that_request_last(cmd->rq, status ? 1 : -EIO);
 	cmd_free(h,cmd,1);
 }
 
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index cf1822a..9bddb68 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -1036,7 +1036,7 @@ static inline void complete_command(cmdlist_t *cmd, int timeout)
 	complete_buffers(cmd->rq->bio, ok);
 
         DBGPX(printk("Done with %p\n", cmd->rq););
-	end_that_request_last(cmd->rq);
+	end_that_request_last(cmd->rq, ok ? 1 : -EIO);
 }
 
 /*
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index f7e765a..a5b857c 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2301,7 +2301,7 @@ static void floppy_end_request(struct request *req, int uptodate)
 	add_disk_randomness(req->rq_disk);
 	floppy_off((long)req->rq_disk->private_data);
 	blkdev_dequeue_request(req);
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 
 	/* We're done with the request */
 	current_req = NULL;
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 9e268dd..485345c 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -136,7 +136,7 @@ static void nbd_end_request(struct request *req)
 
 	spin_lock_irqsave(q->queue_lock, flags);
 	if (!end_that_request_first(req, uptodate, req->nr_sectors)) {
-		end_that_request_last(req);
+		end_that_request_last(req, uptodate);
 	}
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 1ded3b4..9251f41 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -770,7 +770,7 @@ static inline void carm_end_request_queued(struct carm_host *host,
 	rc = end_that_request_first(req, uptodate, req->hard_nr_sectors);
 	assert(rc == 0);
 
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 
 	rc = carm_put_request(host, crq);
 	assert(rc == 0);
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 10740a0..a05fe58 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -951,7 +951,7 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 static void ub_end_rq(struct request *rq, int uptodate)
 {
 	end_that_request_first(rq, uptodate, rq->hard_nr_sectors);
-	end_that_request_last(rq);
+	end_that_request_last(rq, uptodate);
 }
 
 static int ub_rw_cmd_retry(struct ub_dev *sc, struct ub_lun *lun,
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index 2d518aa..063f030 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -305,7 +305,7 @@ static void viodasd_end_request(struct request *req, int uptodate,
 	if (end_that_request_first(req, uptodate, num_sectors))
 		return;
 	add_disk_randomness(req->rq_disk);
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 }
 
 /*
diff --git a/drivers/cdrom/cdu31a.c b/drivers/cdrom/cdu31a.c
index ac96de1..378e88d 100644
--- a/drivers/cdrom/cdu31a.c
+++ b/drivers/cdrom/cdu31a.c
@@ -1402,7 +1402,7 @@ static void do_cdu31a_request(request_queue_t * q)
 			if (!end_that_request_first(req, 1, nblock)) {
 				spin_lock_irq(q->queue_lock);
 				blkdev_dequeue_request(req);
-				end_that_request_last(req);
+				end_that_request_last(req, 1);
 				spin_unlock_irq(q->queue_lock);
 			}
 			continue;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 70aeb3a..d31117e 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -614,7 +614,7 @@ static void cdrom_end_request (ide_drive_t *drive, int uptodate)
 			 */
 			spin_lock_irqsave(&ide_lock, flags);
 			end_that_request_chunk(failed, 0, failed->data_len);
-			end_that_request_last(failed);
+			end_that_request_last(failed, 0);
 			spin_unlock_irqrestore(&ide_lock, flags);
 		}
 
@@ -1735,7 +1735,7 @@ end_request:
 
 	spin_lock_irqsave(&ide_lock, flags);
 	blkdev_dequeue_request(rq);
-	end_that_request_last(rq);
+	end_that_request_last(rq, 1);
 	HWGROUP(drive)->rq = NULL;
 	spin_unlock_irqrestore(&ide_lock, flags);
 	return ide_stopped;
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index ecfafcd..8435b44 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -89,7 +89,7 @@ int __ide_end_request(ide_drive_t *drive, struct request *rq, int uptodate,
 
 		blkdev_dequeue_request(rq);
 		HWGROUP(drive)->rq = NULL;
-		end_that_request_last(rq);
+		end_that_request_last(rq, uptodate);
 		ret = 0;
 	}
 	return ret;
@@ -247,7 +247,7 @@ static void ide_complete_pm_request (ide_drive_t *drive, struct request *rq)
 	}
 	blkdev_dequeue_request(rq);
 	HWGROUP(drive)->rq = NULL;
-	end_that_request_last(rq);
+	end_that_request_last(rq, 1);
 	spin_unlock_irqrestore(&ide_lock, flags);
 }
 
@@ -379,7 +379,7 @@ void ide_end_drive_cmd (ide_drive_t *drive, u8 stat, u8 err)
 	blkdev_dequeue_request(rq);
 	HWGROUP(drive)->rq = NULL;
 	rq->errors = err;
-	end_that_request_last(rq);
+	end_that_request_last(rq, !rq->errors);
 	spin_unlock_irqrestore(&ide_lock, flags);
 }
 
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index f283b5b..4f52252 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -466,7 +466,7 @@ static void i2o_block_end_request(struct request *req, int uptodate,
 
 	spin_lock_irqsave(q->queue_lock, flags);
 
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 
 	if (likely(dev)) {
 		dev->open_queue_depth--;
diff --git a/drivers/mmc/mmc_block.c b/drivers/mmc/mmc_block.c
index abcf191..8e380c1 100644
--- a/drivers/mmc/mmc_block.c
+++ b/drivers/mmc/mmc_block.c
@@ -263,7 +263,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 			 */
 			add_disk_randomness(req->rq_disk);
 			blkdev_dequeue_request(req);
-			end_that_request_last(req);
+			end_that_request_last(req, 1);
 		}
 		spin_unlock_irq(&md->lock);
 	} while (ret);
@@ -289,7 +289,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 
 	add_disk_randomness(req->rq_disk);
 	blkdev_dequeue_request(req);
-	end_that_request_last(req);
+	end_that_request_last(req, 0);
 	spin_unlock_irq(&md->lock);
 
 	return 0;
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 7008d32..fdb6138 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -1035,7 +1035,7 @@ dasd_end_request(struct request *req, int uptodate)
 	if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
 		BUG();
 	add_disk_randomness(req->rq_disk);
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 }
 
 /*
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index 1efc9f2..559d514 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -78,7 +78,7 @@ tapeblock_end_request(struct request *req, int uptodate)
 {
 	if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
 		BUG();
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 }
 
 static void
diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c
index 4cb1f3e..3c688ef 100644
--- a/drivers/scsi/ide-scsi.c
+++ b/drivers/scsi/ide-scsi.c
@@ -1046,7 +1046,7 @@ static int idescsi_eh_reset (struct scsi_cmnd *cmd)
 
 	/* kill current request */
 	blkdev_dequeue_request(req);
-	end_that_request_last(req);
+	end_that_request_last(req, 0);
 	if (req->flags & REQ_SENSE)
 		kfree(scsi->pc->buffer);
 	kfree(scsi->pc);
@@ -1056,7 +1056,7 @@ static int idescsi_eh_reset (struct scsi_cmnd *cmd)
 	/* now nuke the drive queue */
 	while ((req = elv_next_request(drive->queue))) {
 		blkdev_dequeue_request(req);
-		end_that_request_last(req);
+		end_that_request_last(req, 0);
 	}
 
 	HWGROUP(drive)->rq = NULL;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index a7f3f0c..53551f1 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -791,7 +791,7 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int uptodate,
 	spin_lock_irqsave(q->queue_lock, flags);
 	if (blk_rq_tagged(req))
 		blk_queue_end_tag(q, req);
-	end_that_request_last(req);
+	end_that_request_last(req, uptodate);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
 	/*
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 3d3ad7d..d651150 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -748,7 +748,7 @@ static void sd_end_flush(request_queue_t *q, struct request *flush_rq)
 		 * force journal abort of barriers
 		 */
 		end_that_request_first(rq, -EOPNOTSUPP, rq->hard_nr_sectors);
-		end_that_request_last(rq);
+		end_that_request_last(rq, -EOPNOTSUPP);
 	}
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a18500d..a0ce8c5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -102,7 +102,7 @@ void copy_io_context(struct io_context **pdst, struct io_context **psrc);
 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2);
 
 struct request;
-typedef void (rq_end_io_fn)(struct request *);
+typedef void (rq_end_io_fn)(struct request *, int);
 
 struct request_list {
 	int count[2];
@@ -560,7 +560,7 @@ extern void register_disk(struct gendisk *dev);
 extern void generic_make_request(struct bio *bio);
 extern void blk_put_request(struct request *);
 extern void __blk_put_request(request_queue_t *, struct request *);
-extern void blk_end_sync_rq(struct request *rq);
+extern void blk_end_sync_rq(struct request *rq, int error);
 extern void blk_attempt_remerge(request_queue_t *, struct request *);
 extern struct request *blk_get_request(request_queue_t *, int, gfp_t);
 extern void blk_insert_request(request_queue_t *, struct request *, int, void *);
@@ -614,7 +614,7 @@ static inline void blk_run_address_space(struct address_space *mapping)
  */
 extern int end_that_request_first(struct request *, int, int);
 extern int end_that_request_chunk(struct request *, int, int);
-extern void end_that_request_last(struct request *);
+extern void end_that_request_last(struct request *, int);
 extern void end_request(struct request *req, int uptodate);
 
 /*
-- 
cgit v1.1


From 52d9e675361261a1eb1716b02222ec6177ec342b Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:49:58 +0100
Subject: [BLOCK] ll_rw_blk: separate out bio init part from __make_request

Separate out bio initialization part from __make_request.  It
will be used by the following blk_ordered_reimpl.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/ll_rw_blk.c | 62 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 33 insertions(+), 29 deletions(-)

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 8b1ae69..65c4efc 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -36,6 +36,8 @@
 static void blk_unplug_work(void *data);
 static void blk_unplug_timeout(unsigned long data);
 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io);
+static void init_request_from_bio(struct request *req, struct bio *bio);
+static int __make_request(request_queue_t *q, struct bio *bio);
 
 /*
  * For the allocated request tables
@@ -1667,8 +1669,6 @@ static int blk_init_free_list(request_queue_t *q)
 	return 0;
 }
 
-static int __make_request(request_queue_t *, struct bio *);
-
 request_queue_t *blk_alloc_queue(gfp_t gfp_mask)
 {
 	return blk_alloc_queue_node(gfp_mask, -1);
@@ -2659,6 +2659,36 @@ void blk_attempt_remerge(request_queue_t *q, struct request *rq)
 
 EXPORT_SYMBOL(blk_attempt_remerge);
 
+static void init_request_from_bio(struct request *req, struct bio *bio)
+{
+	req->flags |= REQ_CMD;
+
+	/*
+	 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
+	 */
+	if (bio_rw_ahead(bio) || bio_failfast(bio))
+		req->flags |= REQ_FAILFAST;
+
+	/*
+	 * REQ_BARRIER implies no merging, but lets make it explicit
+	 */
+	if (unlikely(bio_barrier(bio)))
+		req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
+
+	req->errors = 0;
+	req->hard_sector = req->sector = bio->bi_sector;
+	req->hard_nr_sectors = req->nr_sectors = bio_sectors(bio);
+	req->current_nr_sectors = req->hard_cur_sectors = bio_cur_sectors(bio);
+	req->nr_phys_segments = bio_phys_segments(req->q, bio);
+	req->nr_hw_segments = bio_hw_segments(req->q, bio);
+	req->buffer = bio_data(bio);	/* see ->buffer comment above */
+	req->waiting = NULL;
+	req->bio = req->biotail = bio;
+	req->ioprio = bio_prio(bio);
+	req->rq_disk = bio->bi_bdev->bd_disk;
+	req->start_time = jiffies;
+}
+
 static int __make_request(request_queue_t *q, struct bio *bio)
 {
 	struct request *req;
@@ -2754,33 +2784,7 @@ get_rq:
 	 * We don't worry about that case for efficiency. It won't happen
 	 * often, and the elevators are able to handle it.
 	 */
-
-	req->flags |= REQ_CMD;
-
-	/*
-	 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
-	 */
-	if (bio_rw_ahead(bio) || bio_failfast(bio))
-		req->flags |= REQ_FAILFAST;
-
-	/*
-	 * REQ_BARRIER implies no merging, but lets make it explicit
-	 */
-	if (unlikely(barrier))
-		req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
-
-	req->errors = 0;
-	req->hard_sector = req->sector = sector;
-	req->hard_nr_sectors = req->nr_sectors = nr_sectors;
-	req->current_nr_sectors = req->hard_cur_sectors = cur_nr_sectors;
-	req->nr_phys_segments = bio_phys_segments(q, bio);
-	req->nr_hw_segments = bio_hw_segments(q, bio);
-	req->buffer = bio_data(bio);	/* see ->buffer comment above */
-	req->waiting = NULL;
-	req->bio = req->biotail = bio;
-	req->ioprio = prio;
-	req->rq_disk = bio->bi_bdev->bd_disk;
-	req->start_time = jiffies;
+	init_request_from_bio(req, bio);
 
 	spin_lock_irq(q->queue_lock);
 	if (elv_queue_empty(q))
-- 
cgit v1.1


From 797e7dbbee0a91fa1349192f18ad5c454997d876 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:51:03 +0100
Subject: [BLOCK] reimplement handling of barrier request

Reimplement handling of barrier requests.

* Flexible handling to deal with various capabilities of
  target devices.
* Retry support for falling back.
* Tagged queues which don't support ordered tag can do ordered.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/elevator.c         |  84 +++++++----
 block/ll_rw_blk.c        | 384 ++++++++++++++++++++++++++++++-----------------
 include/linux/blkdev.h   |  82 +++++++---
 include/linux/elevator.h |   1 +
 4 files changed, 359 insertions(+), 192 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index 85a11ce..39dcccc 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -304,15 +304,7 @@ void elv_requeue_request(request_queue_t *q, struct request *rq)
 
 	rq->flags &= ~REQ_STARTED;
 
-	/*
-	 * if this is the flush, requeue the original instead and drop the flush
-	 */
-	if (rq->flags & REQ_BAR_FLUSH) {
-		clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
-		rq = rq->end_io_data;
-	}
-
-	__elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0);
+	__elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE, 0);
 }
 
 static void elv_drain_elevator(request_queue_t *q)
@@ -332,8 +324,19 @@ static void elv_drain_elevator(request_queue_t *q)
 void __elv_add_request(request_queue_t *q, struct request *rq, int where,
 		       int plug)
 {
+	struct list_head *pos;
+	unsigned ordseq;
+
+	if (q->ordcolor)
+		rq->flags |= REQ_ORDERED_COLOR;
+
 	if (rq->flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
 		/*
+		 * toggle ordered color
+		 */
+		q->ordcolor ^= 1;
+
+		/*
 		 * barriers implicitly indicate back insertion
 		 */
 		if (where == ELEVATOR_INSERT_SORT)
@@ -393,6 +396,30 @@ void __elv_add_request(request_queue_t *q, struct request *rq, int where,
 		q->elevator->ops->elevator_add_req_fn(q, rq);
 		break;
 
+	case ELEVATOR_INSERT_REQUEUE:
+		/*
+		 * If ordered flush isn't in progress, we do front
+		 * insertion; otherwise, requests should be requeued
+		 * in ordseq order.
+		 */
+		rq->flags |= REQ_SOFTBARRIER;
+
+		if (q->ordseq == 0) {
+			list_add(&rq->queuelist, &q->queue_head);
+			break;
+		}
+
+		ordseq = blk_ordered_req_seq(rq);
+
+		list_for_each(pos, &q->queue_head) {
+			struct request *pos_rq = list_entry_rq(pos);
+			if (ordseq <= blk_ordered_req_seq(pos_rq))
+				break;
+		}
+
+		list_add_tail(&rq->queuelist, pos);
+		break;
+
 	default:
 		printk(KERN_ERR "%s: bad insertion point %d\n",
 		       __FUNCTION__, where);
@@ -422,25 +449,16 @@ static inline struct request *__elv_next_request(request_queue_t *q)
 {
 	struct request *rq;
 
-	if (unlikely(list_empty(&q->queue_head) &&
-		     !q->elevator->ops->elevator_dispatch_fn(q, 0)))
-		return NULL;
-
-	rq = list_entry_rq(q->queue_head.next);
-
-	/*
-	 * if this is a barrier write and the device has to issue a
-	 * flush sequence to support it, check how far we are
-	 */
-	if (blk_fs_request(rq) && blk_barrier_rq(rq)) {
-		BUG_ON(q->ordered == QUEUE_ORDERED_NONE);
+	while (1) {
+		while (!list_empty(&q->queue_head)) {
+			rq = list_entry_rq(q->queue_head.next);
+			if (blk_do_ordered(q, &rq))
+				return rq;
+		}
 
-		if (q->ordered == QUEUE_ORDERED_FLUSH &&
-		    !blk_barrier_preflush(rq))
-			rq = blk_start_pre_flush(q, rq);
+		if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
+			return NULL;
 	}
-
-	return rq;
 }
 
 struct request *elv_next_request(request_queue_t *q)
@@ -593,7 +611,21 @@ void elv_completed_request(request_queue_t *q, struct request *rq)
 	 * request is released from the driver, io must be done
 	 */
 	if (blk_account_rq(rq)) {
+		struct request *first_rq = list_entry_rq(q->queue_head.next);
+
 		q->in_flight--;
+
+		/*
+		 * Check if the queue is waiting for fs requests to be
+		 * drained for flush sequence.
+		 */
+		if (q->ordseq && q->in_flight == 0 &&
+		    blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
+		    blk_ordered_req_seq(first_rq) > QUEUE_ORDSEQ_DRAIN) {
+			blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
+			q->request_fn(q);
+		}
+
 		if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
 			e->ops->elevator_completed_req_fn(q, rq);
 	}
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 65c4efc..91d3b48 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -290,8 +290,8 @@ static inline void rq_init(request_queue_t *q, struct request *rq)
 
 /**
  * blk_queue_ordered - does this queue support ordered writes
- * @q:     the request queue
- * @flag:  see below
+ * @q:        the request queue
+ * @ordered:  one of QUEUE_ORDERED_*
  *
  * Description:
  *   For journalled file systems, doing ordered writes on a commit
@@ -300,28 +300,30 @@ static inline void rq_init(request_queue_t *q, struct request *rq)
  *   feature should call this function and indicate so.
  *
  **/
-void blk_queue_ordered(request_queue_t *q, int flag)
-{
-	switch (flag) {
-		case QUEUE_ORDERED_NONE:
-			if (q->flush_rq)
-				kmem_cache_free(request_cachep, q->flush_rq);
-			q->flush_rq = NULL;
-			q->ordered = flag;
-			break;
-		case QUEUE_ORDERED_TAG:
-			q->ordered = flag;
-			break;
-		case QUEUE_ORDERED_FLUSH:
-			q->ordered = flag;
-			if (!q->flush_rq)
-				q->flush_rq = kmem_cache_alloc(request_cachep,
-								GFP_KERNEL);
-			break;
-		default:
-			printk("blk_queue_ordered: bad value %d\n", flag);
-			break;
+int blk_queue_ordered(request_queue_t *q, unsigned ordered,
+		      prepare_flush_fn *prepare_flush_fn)
+{
+	if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) &&
+	    prepare_flush_fn == NULL) {
+		printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n");
+		return -EINVAL;
+	}
+
+	if (ordered != QUEUE_ORDERED_NONE &&
+	    ordered != QUEUE_ORDERED_DRAIN &&
+	    ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
+	    ordered != QUEUE_ORDERED_DRAIN_FUA &&
+	    ordered != QUEUE_ORDERED_TAG &&
+	    ordered != QUEUE_ORDERED_TAG_FLUSH &&
+	    ordered != QUEUE_ORDERED_TAG_FUA) {
+		printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
+		return -EINVAL;
 	}
+
+	q->next_ordered = ordered;
+	q->prepare_flush_fn = prepare_flush_fn;
+
+	return 0;
 }
 
 EXPORT_SYMBOL(blk_queue_ordered);
@@ -346,167 +348,265 @@ EXPORT_SYMBOL(blk_queue_issue_flush_fn);
 /*
  * Cache flushing for ordered writes handling
  */
-static void blk_pre_flush_end_io(struct request *flush_rq, int error)
+inline unsigned blk_ordered_cur_seq(request_queue_t *q)
 {
-	struct request *rq = flush_rq->end_io_data;
-	request_queue_t *q = rq->q;
-
-	elv_completed_request(q, flush_rq);
-
-	rq->flags |= REQ_BAR_PREFLUSH;
-
-	if (!flush_rq->errors)
-		elv_requeue_request(q, rq);
-	else {
-		q->end_flush_fn(q, flush_rq);
-		clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
-		q->request_fn(q);
-	}
+	if (!q->ordseq)
+		return 0;
+	return 1 << ffz(q->ordseq);
 }
 
-static void blk_post_flush_end_io(struct request *flush_rq, int error)
+unsigned blk_ordered_req_seq(struct request *rq)
 {
-	struct request *rq = flush_rq->end_io_data;
 	request_queue_t *q = rq->q;
 
-	elv_completed_request(q, flush_rq);
+	BUG_ON(q->ordseq == 0);
 
-	rq->flags |= REQ_BAR_POSTFLUSH;
+	if (rq == &q->pre_flush_rq)
+		return QUEUE_ORDSEQ_PREFLUSH;
+	if (rq == &q->bar_rq)
+		return QUEUE_ORDSEQ_BAR;
+	if (rq == &q->post_flush_rq)
+		return QUEUE_ORDSEQ_POSTFLUSH;
 
-	q->end_flush_fn(q, flush_rq);
-	clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
-	q->request_fn(q);
+	if ((rq->flags & REQ_ORDERED_COLOR) ==
+	    (q->orig_bar_rq->flags & REQ_ORDERED_COLOR))
+		return QUEUE_ORDSEQ_DRAIN;
+	else
+		return QUEUE_ORDSEQ_DONE;
 }
 
-struct request *blk_start_pre_flush(request_queue_t *q, struct request *rq)
+void blk_ordered_complete_seq(request_queue_t *q, unsigned seq, int error)
 {
-	struct request *flush_rq = q->flush_rq;
-
-	BUG_ON(!blk_barrier_rq(rq));
+	struct request *rq;
+	int uptodate;
 
-	if (test_and_set_bit(QUEUE_FLAG_FLUSH, &q->queue_flags))
-		return NULL;
+	if (error && !q->orderr)
+		q->orderr = error;
 
-	rq_init(q, flush_rq);
-	flush_rq->elevator_private = NULL;
-	flush_rq->flags = REQ_BAR_FLUSH;
-	flush_rq->rq_disk = rq->rq_disk;
-	flush_rq->rl = NULL;
+	BUG_ON(q->ordseq & seq);
+	q->ordseq |= seq;
 
-	/*
-	 * prepare_flush returns 0 if no flush is needed, just mark both
-	 * pre and post flush as done in that case
-	 */
-	if (!q->prepare_flush_fn(q, flush_rq)) {
-		rq->flags |= REQ_BAR_PREFLUSH | REQ_BAR_POSTFLUSH;
-		clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
-		return rq;
-	}
+	if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
+		return;
 
 	/*
-	 * some drivers dequeue requests right away, some only after io
-	 * completion. make sure the request is dequeued.
+	 * Okay, sequence complete.
 	 */
-	if (!list_empty(&rq->queuelist))
-		blkdev_dequeue_request(rq);
+	rq = q->orig_bar_rq;
+	uptodate = q->orderr ? q->orderr : 1;
 
-	flush_rq->end_io_data = rq;
-	flush_rq->end_io = blk_pre_flush_end_io;
+	q->ordseq = 0;
 
-	__elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0);
-	return flush_rq;
+	end_that_request_first(rq, uptodate, rq->hard_nr_sectors);
+	end_that_request_last(rq, uptodate);
 }
 
-static void blk_start_post_flush(request_queue_t *q, struct request *rq)
+static void pre_flush_end_io(struct request *rq, int error)
 {
-	struct request *flush_rq = q->flush_rq;
+	elv_completed_request(rq->q, rq);
+	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
+}
 
-	BUG_ON(!blk_barrier_rq(rq));
+static void bar_end_io(struct request *rq, int error)
+{
+	elv_completed_request(rq->q, rq);
+	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
+}
 
-	rq_init(q, flush_rq);
-	flush_rq->elevator_private = NULL;
-	flush_rq->flags = REQ_BAR_FLUSH;
-	flush_rq->rq_disk = rq->rq_disk;
-	flush_rq->rl = NULL;
+static void post_flush_end_io(struct request *rq, int error)
+{
+	elv_completed_request(rq->q, rq);
+	blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
+}
 
-	if (q->prepare_flush_fn(q, flush_rq)) {
-		flush_rq->end_io_data = rq;
-		flush_rq->end_io = blk_post_flush_end_io;
+static void queue_flush(request_queue_t *q, unsigned which)
+{
+	struct request *rq;
+	rq_end_io_fn *end_io;
 
-		__elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0);
-		q->request_fn(q);
+	if (which == QUEUE_ORDERED_PREFLUSH) {
+		rq = &q->pre_flush_rq;
+		end_io = pre_flush_end_io;
+	} else {
+		rq = &q->post_flush_rq;
+		end_io = post_flush_end_io;
 	}
+
+	rq_init(q, rq);
+	rq->flags = REQ_HARDBARRIER;
+	rq->elevator_private = NULL;
+	rq->rq_disk = q->bar_rq.rq_disk;
+	rq->rl = NULL;
+	rq->end_io = end_io;
+	q->prepare_flush_fn(q, rq);
+
+	__elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0);
 }
 
-static inline int blk_check_end_barrier(request_queue_t *q, struct request *rq,
-					int sectors)
+static inline struct request *start_ordered(request_queue_t *q,
+					    struct request *rq)
 {
-	if (sectors > rq->nr_sectors)
-		sectors = rq->nr_sectors;
+	q->bi_size = 0;
+	q->orderr = 0;
+	q->ordered = q->next_ordered;
+	q->ordseq |= QUEUE_ORDSEQ_STARTED;
+
+	/*
+	 * Prep proxy barrier request.
+	 */
+	blkdev_dequeue_request(rq);
+	q->orig_bar_rq = rq;
+	rq = &q->bar_rq;
+	rq_init(q, rq);
+	rq->flags = bio_data_dir(q->orig_bar_rq->bio);
+	rq->flags |= q->ordered & QUEUE_ORDERED_FUA ? REQ_FUA : 0;
+	rq->elevator_private = NULL;
+	rq->rl = NULL;
+	init_request_from_bio(rq, q->orig_bar_rq->bio);
+	rq->end_io = bar_end_io;
+
+	/*
+	 * Queue ordered sequence.  As we stack them at the head, we
+	 * need to queue in reverse order.  Note that we rely on that
+	 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
+	 * request gets inbetween ordered sequence.
+	 */
+	if (q->ordered & QUEUE_ORDERED_POSTFLUSH)
+		queue_flush(q, QUEUE_ORDERED_POSTFLUSH);
+	else
+		q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;
+
+	__elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0);
+
+	if (q->ordered & QUEUE_ORDERED_PREFLUSH) {
+		queue_flush(q, QUEUE_ORDERED_PREFLUSH);
+		rq = &q->pre_flush_rq;
+	} else
+		q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;
 
-	rq->nr_sectors -= sectors;
-	return rq->nr_sectors;
+	if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0)
+		q->ordseq |= QUEUE_ORDSEQ_DRAIN;
+	else
+		rq = NULL;
+
+	return rq;
 }
 
-static int __blk_complete_barrier_rq(request_queue_t *q, struct request *rq,
-				     int sectors, int queue_locked)
+int blk_do_ordered(request_queue_t *q, struct request **rqp)
 {
-	if (q->ordered != QUEUE_ORDERED_FLUSH)
-		return 0;
-	if (!blk_fs_request(rq) || !blk_barrier_rq(rq))
-		return 0;
-	if (blk_barrier_postflush(rq))
-		return 0;
+	struct request *rq = *rqp, *allowed_rq;
+	int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
 
-	if (!blk_check_end_barrier(q, rq, sectors)) {
-		unsigned long flags = 0;
+	if (!q->ordseq) {
+		if (!is_barrier)
+			return 1;
 
-		if (!queue_locked)
-			spin_lock_irqsave(q->queue_lock, flags);
+		if (q->next_ordered != QUEUE_ORDERED_NONE) {
+			*rqp = start_ordered(q, rq);
+			return 1;
+		} else {
+			/*
+			 * This can happen when the queue switches to
+			 * ORDERED_NONE while this request is on it.
+			 */
+			blkdev_dequeue_request(rq);
+			end_that_request_first(rq, -EOPNOTSUPP,
+					       rq->hard_nr_sectors);
+			end_that_request_last(rq, -EOPNOTSUPP);
+			*rqp = NULL;
+			return 0;
+		}
+	}
 
-		blk_start_post_flush(q, rq);
+	if (q->ordered & QUEUE_ORDERED_TAG) {
+		if (is_barrier && rq != &q->bar_rq)
+			*rqp = NULL;
+		return 1;
+	}
 
-		if (!queue_locked)
-			spin_unlock_irqrestore(q->queue_lock, flags);
+	switch (blk_ordered_cur_seq(q)) {
+	case QUEUE_ORDSEQ_PREFLUSH:
+		allowed_rq = &q->pre_flush_rq;
+		break;
+	case QUEUE_ORDSEQ_BAR:
+		allowed_rq = &q->bar_rq;
+		break;
+	case QUEUE_ORDSEQ_POSTFLUSH:
+		allowed_rq = &q->post_flush_rq;
+		break;
+	default:
+		allowed_rq = NULL;
+		break;
 	}
 
+	if (rq != allowed_rq &&
+	    (blk_fs_request(rq) || rq == &q->pre_flush_rq ||
+	     rq == &q->post_flush_rq))
+		*rqp = NULL;
+
 	return 1;
 }
 
-/**
- * blk_complete_barrier_rq - complete possible barrier request
- * @q:  the request queue for the device
- * @rq:  the request
- * @sectors:  number of sectors to complete
- *
- * Description:
- *   Used in driver end_io handling to determine whether to postpone
- *   completion of a barrier request until a post flush has been done. This
- *   is the unlocked variant, used if the caller doesn't already hold the
- *   queue lock.
- **/
-int blk_complete_barrier_rq(request_queue_t *q, struct request *rq, int sectors)
+static int flush_dry_bio_endio(struct bio *bio, unsigned int bytes, int error)
 {
-	return __blk_complete_barrier_rq(q, rq, sectors, 0);
+	request_queue_t *q = bio->bi_private;
+	struct bio_vec *bvec;
+	int i;
+
+	/*
+	 * This is dry run, restore bio_sector and size.  We'll finish
+	 * this request again with the original bi_end_io after an
+	 * error occurs or post flush is complete.
+	 */
+	q->bi_size += bytes;
+
+	if (bio->bi_size)
+		return 1;
+
+	/* Rewind bvec's */
+	bio->bi_idx = 0;
+	bio_for_each_segment(bvec, bio, i) {
+		bvec->bv_len += bvec->bv_offset;
+		bvec->bv_offset = 0;
+	}
+
+	/* Reset bio */
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio->bi_size = q->bi_size;
+	bio->bi_sector -= (q->bi_size >> 9);
+	q->bi_size = 0;
+
+	return 0;
 }
-EXPORT_SYMBOL(blk_complete_barrier_rq);
 
-/**
- * blk_complete_barrier_rq_locked - complete possible barrier request
- * @q:  the request queue for the device
- * @rq:  the request
- * @sectors:  number of sectors to complete
- *
- * Description:
- *   See blk_complete_barrier_rq(). This variant must be used if the caller
- *   holds the queue lock.
- **/
-int blk_complete_barrier_rq_locked(request_queue_t *q, struct request *rq,
-				   int sectors)
+static inline int ordered_bio_endio(struct request *rq, struct bio *bio,
+				    unsigned int nbytes, int error)
 {
-	return __blk_complete_barrier_rq(q, rq, sectors, 1);
+	request_queue_t *q = rq->q;
+	bio_end_io_t *endio;
+	void *private;
+
+	if (&q->bar_rq != rq)
+		return 0;
+
+	/*
+	 * Okay, this is the barrier request in progress, dry finish it.
+	 */
+	if (error && !q->orderr)
+		q->orderr = error;
+
+	endio = bio->bi_end_io;
+	private = bio->bi_private;
+	bio->bi_end_io = flush_dry_bio_endio;
+	bio->bi_private = q;
+
+	bio_endio(bio, nbytes, error);
+
+	bio->bi_end_io = endio;
+	bio->bi_private = private;
+
+	return 1;
 }
-EXPORT_SYMBOL(blk_complete_barrier_rq_locked);
 
 /**
  * blk_queue_bounce_limit - set bounce buffer limit for queue
@@ -1047,6 +1147,7 @@ static const char * const rq_flags[] = {
 	"REQ_SORTED",
 	"REQ_SOFTBARRIER",
 	"REQ_HARDBARRIER",
+	"REQ_FUA",
 	"REQ_CMD",
 	"REQ_NOMERGE",
 	"REQ_STARTED",
@@ -1066,6 +1167,7 @@ static const char * const rq_flags[] = {
 	"REQ_PM_SUSPEND",
 	"REQ_PM_RESUME",
 	"REQ_PM_SHUTDOWN",
+	"REQ_ORDERED_COLOR",
 };
 
 void blk_dump_rq_flags(struct request *rq, char *msg)
@@ -1643,8 +1745,6 @@ void blk_cleanup_queue(request_queue_t * q)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
-	blk_queue_ordered(q, QUEUE_ORDERED_NONE);
-
 	kmem_cache_free(requestq_cachep, q);
 }
 
@@ -2714,7 +2814,7 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 	spin_lock_prefetch(q->queue_lock);
 
 	barrier = bio_barrier(bio);
-	if (unlikely(barrier) && (q->ordered == QUEUE_ORDERED_NONE)) {
+	if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
 		err = -EOPNOTSUPP;
 		goto end_io;
 	}
@@ -3075,7 +3175,8 @@ static int __end_that_request_first(struct request *req, int uptodate,
 		if (nr_bytes >= bio->bi_size) {
 			req->bio = bio->bi_next;
 			nbytes = bio->bi_size;
-			bio_endio(bio, nbytes, error);
+			if (!ordered_bio_endio(req, bio, nbytes, error))
+				bio_endio(bio, nbytes, error);
 			next_idx = 0;
 			bio_nbytes = 0;
 		} else {
@@ -3130,7 +3231,8 @@ static int __end_that_request_first(struct request *req, int uptodate,
 	 * if the request wasn't completed, update state
 	 */
 	if (bio_nbytes) {
-		bio_endio(bio, bio_nbytes, error);
+		if (!ordered_bio_endio(req, bio, bio_nbytes, error))
+			bio_endio(bio, bio_nbytes, error);
 		bio->bi_idx += next_idx;
 		bio_iovec(bio)->bv_offset += nr_bytes;
 		bio_iovec(bio)->bv_len -= nr_bytes;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a0ce8c5..15db0f1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -207,6 +207,7 @@ enum rq_flag_bits {
 	__REQ_SORTED,		/* elevator knows about this request */
 	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
 	__REQ_HARDBARRIER,	/* may not be passed by drive either */
+	__REQ_FUA,		/* forced unit access */
 	__REQ_CMD,		/* is a regular fs rw request */
 	__REQ_NOMERGE,		/* don't touch this for merging */
 	__REQ_STARTED,		/* drive already may have started this one */
@@ -230,9 +231,7 @@ enum rq_flag_bits {
 	__REQ_PM_SUSPEND,	/* suspend request */
 	__REQ_PM_RESUME,	/* resume request */
 	__REQ_PM_SHUTDOWN,	/* shutdown request */
-	__REQ_BAR_PREFLUSH,	/* barrier pre-flush done */
-	__REQ_BAR_POSTFLUSH,	/* barrier post-flush */
-	__REQ_BAR_FLUSH,	/* rq is the flush request */
+	__REQ_ORDERED_COLOR,	/* is before or after barrier */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -241,6 +240,7 @@ enum rq_flag_bits {
 #define REQ_SORTED	(1 << __REQ_SORTED)
 #define REQ_SOFTBARRIER	(1 << __REQ_SOFTBARRIER)
 #define REQ_HARDBARRIER	(1 << __REQ_HARDBARRIER)
+#define REQ_FUA		(1 << __REQ_FUA)
 #define REQ_CMD		(1 << __REQ_CMD)
 #define REQ_NOMERGE	(1 << __REQ_NOMERGE)
 #define REQ_STARTED	(1 << __REQ_STARTED)
@@ -260,9 +260,7 @@ enum rq_flag_bits {
 #define REQ_PM_SUSPEND	(1 << __REQ_PM_SUSPEND)
 #define REQ_PM_RESUME	(1 << __REQ_PM_RESUME)
 #define REQ_PM_SHUTDOWN	(1 << __REQ_PM_SHUTDOWN)
-#define REQ_BAR_PREFLUSH	(1 << __REQ_BAR_PREFLUSH)
-#define REQ_BAR_POSTFLUSH	(1 << __REQ_BAR_POSTFLUSH)
-#define REQ_BAR_FLUSH	(1 << __REQ_BAR_FLUSH)
+#define REQ_ORDERED_COLOR	(1 << __REQ_ORDERED_COLOR)
 
 /*
  * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME
@@ -292,8 +290,7 @@ struct bio_vec;
 typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *);
 typedef void (activity_fn) (void *data, int rw);
 typedef int (issue_flush_fn) (request_queue_t *, struct gendisk *, sector_t *);
-typedef int (prepare_flush_fn) (request_queue_t *, struct request *);
-typedef void (end_flush_fn) (request_queue_t *, struct request *);
+typedef void (prepare_flush_fn) (request_queue_t *, struct request *);
 
 enum blk_queue_state {
 	Queue_down,
@@ -335,7 +332,6 @@ struct request_queue
 	activity_fn		*activity_fn;
 	issue_flush_fn		*issue_flush_fn;
 	prepare_flush_fn	*prepare_flush_fn;
-	end_flush_fn		*end_flush_fn;
 
 	/*
 	 * Dispatch queue sorting
@@ -420,14 +416,11 @@ struct request_queue
 	/*
 	 * reserved for flush operations
 	 */
-	struct request		*flush_rq;
-	unsigned char		ordered;
-};
-
-enum {
-	QUEUE_ORDERED_NONE,
-	QUEUE_ORDERED_TAG,
-	QUEUE_ORDERED_FLUSH,
+	unsigned int		ordered, next_ordered, ordseq;
+	int			orderr, ordcolor;
+	struct request		pre_flush_rq, bar_rq, post_flush_rq;
+	struct request		*orig_bar_rq;
+	unsigned int		bi_size;
 };
 
 #define RQ_INACTIVE		(-1)
@@ -445,12 +438,51 @@ enum {
 #define QUEUE_FLAG_REENTER	6	/* Re-entrancy avoidance */
 #define QUEUE_FLAG_PLUGGED	7	/* queue is plugged */
 #define QUEUE_FLAG_ELVSWITCH	8	/* don't use elevator, just do FIFO */
-#define QUEUE_FLAG_FLUSH	9	/* doing barrier flush sequence */
+
+enum {
+	/*
+	 * Hardbarrier is supported with one of the following methods.
+	 *
+	 * NONE		: hardbarrier unsupported
+	 * DRAIN	: ordering by draining is enough
+	 * DRAIN_FLUSH	: ordering by draining w/ pre and post flushes
+	 * DRAIN_FUA	: ordering by draining w/ pre flush and FUA write
+	 * TAG		: ordering by tag is enough
+	 * TAG_FLUSH	: ordering by tag w/ pre and post flushes
+	 * TAG_FUA	: ordering by tag w/ pre flush and FUA write
+	 */
+	QUEUE_ORDERED_NONE	= 0x00,
+	QUEUE_ORDERED_DRAIN	= 0x01,
+	QUEUE_ORDERED_TAG	= 0x02,
+
+	QUEUE_ORDERED_PREFLUSH	= 0x10,
+	QUEUE_ORDERED_POSTFLUSH	= 0x20,
+	QUEUE_ORDERED_FUA	= 0x40,
+
+	QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN |
+			QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH,
+	QUEUE_ORDERED_DRAIN_FUA	= QUEUE_ORDERED_DRAIN |
+			QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA,
+	QUEUE_ORDERED_TAG_FLUSH	= QUEUE_ORDERED_TAG |
+			QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH,
+	QUEUE_ORDERED_TAG_FUA	= QUEUE_ORDERED_TAG |
+			QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA,
+
+	/*
+	 * Ordered operation sequence
+	 */
+	QUEUE_ORDSEQ_STARTED	= 0x01,	/* flushing in progress */
+	QUEUE_ORDSEQ_DRAIN	= 0x02,	/* waiting for the queue to be drained */
+	QUEUE_ORDSEQ_PREFLUSH	= 0x04,	/* pre-flushing in progress */
+	QUEUE_ORDSEQ_BAR	= 0x08,	/* original barrier req in progress */
+	QUEUE_ORDSEQ_POSTFLUSH	= 0x10,	/* post-flushing in progress */
+	QUEUE_ORDSEQ_DONE	= 0x20,
+};
 
 #define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
-#define blk_queue_flushing(q)	test_bit(QUEUE_FLAG_FLUSH, &(q)->queue_flags)
+#define blk_queue_flushing(q)	((q)->ordseq)
 
 #define blk_fs_request(rq)	((rq)->flags & REQ_CMD)
 #define blk_pc_request(rq)	((rq)->flags & REQ_BLOCK_PC)
@@ -466,8 +498,7 @@ enum {
 
 #define blk_sorted_rq(rq)	((rq)->flags & REQ_SORTED)
 #define blk_barrier_rq(rq)	((rq)->flags & REQ_HARDBARRIER)
-#define blk_barrier_preflush(rq)	((rq)->flags & REQ_BAR_PREFLUSH)
-#define blk_barrier_postflush(rq)	((rq)->flags & REQ_BAR_POSTFLUSH)
+#define blk_fua_rq(rq)		((rq)->flags & REQ_FUA)
 
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
 
@@ -665,11 +696,12 @@ extern void blk_queue_prep_rq(request_queue_t *, prep_rq_fn *pfn);
 extern void blk_queue_merge_bvec(request_queue_t *, merge_bvec_fn *);
 extern void blk_queue_dma_alignment(request_queue_t *, int);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
-extern void blk_queue_ordered(request_queue_t *, int);
+extern int blk_queue_ordered(request_queue_t *, unsigned, prepare_flush_fn *);
 extern void blk_queue_issue_flush_fn(request_queue_t *, issue_flush_fn *);
-extern struct request *blk_start_pre_flush(request_queue_t *,struct request *);
-extern int blk_complete_barrier_rq(request_queue_t *, struct request *, int);
-extern int blk_complete_barrier_rq_locked(request_queue_t *, struct request *, int);
+extern int blk_do_ordered(request_queue_t *, struct request **);
+extern unsigned blk_ordered_cur_seq(request_queue_t *);
+extern unsigned blk_ordered_req_seq(struct request *);
+extern void blk_ordered_complete_seq(request_queue_t *, unsigned, int);
 
 extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index a74c27e..fb80fa4 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -130,6 +130,7 @@ extern int elv_try_last_merge(request_queue_t *, struct bio *);
 #define ELEVATOR_INSERT_FRONT	1
 #define ELEVATOR_INSERT_BACK	2
 #define ELEVATOR_INSERT_SORT	3
+#define ELEVATOR_INSERT_REQUEUE	4
 
 /*
  * return values from elevator_may_queue_fn
-- 
cgit v1.1


From 461d4e90c8cd049718884cd17c955e231140d3be Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:52:55 +0100
Subject: [BLOCK] update SCSI to use new blk_ordered for barriers

All ordered request related stuff delegated to HLD.  Midlayer
now doesn't deal with ordered setting or prepare_flush
callback.  sd.c updated to deal with blk_queue_ordered
setting.  Currently, ordered tag isn't used as SCSI midlayer
cannot guarantee request ordering.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 drivers/scsi/hosts.c       |  9 -------
 drivers/scsi/scsi_lib.c    | 46 ------------------------------------
 drivers/scsi/sd.c          | 58 ++++++++++++++++------------------------------
 include/scsi/scsi_driver.h |  1 -
 include/scsi/scsi_host.h   |  1 -
 5 files changed, 20 insertions(+), 95 deletions(-)

diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 5b9c2c5..66783c8 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -347,17 +347,8 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
 	shost->cmd_per_lun = sht->cmd_per_lun;
 	shost->unchecked_isa_dma = sht->unchecked_isa_dma;
 	shost->use_clustering = sht->use_clustering;
-	shost->ordered_flush = sht->ordered_flush;
 	shost->ordered_tag = sht->ordered_tag;
 
-	/*
-	 * hosts/devices that do queueing must support ordered tags
-	 */
-	if (shost->can_queue > 1 && shost->ordered_flush) {
-		printk(KERN_ERR "scsi: ordered flushes don't support queueing\n");
-		shost->ordered_flush = 0;
-	}
-
 	if (sht->max_host_blocked)
 		shost->max_host_blocked = sht->max_host_blocked;
 	else
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 53551f1..7a38b10 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -932,9 +932,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes,
 	int sense_valid = 0;
 	int sense_deferred = 0;
 
-	if (blk_complete_barrier_rq(q, req, good_bytes >> 9))
-		return;
-
 	/*
 	 * Free up any indirection buffers we allocated for DMA purposes. 
 	 * For the case of a READ, we need to copy the data out of the
@@ -1199,38 +1196,6 @@ static int scsi_init_io(struct scsi_cmnd *cmd)
 	return BLKPREP_KILL;
 }
 
-static int scsi_prepare_flush_fn(request_queue_t *q, struct request *rq)
-{
-	struct scsi_device *sdev = q->queuedata;
-	struct scsi_driver *drv;
-
-	if (sdev->sdev_state == SDEV_RUNNING) {
-		drv = *(struct scsi_driver **) rq->rq_disk->private_data;
-
-		if (drv->prepare_flush)
-			return drv->prepare_flush(q, rq);
-	}
-
-	return 0;
-}
-
-static void scsi_end_flush_fn(request_queue_t *q, struct request *rq)
-{
-	struct scsi_device *sdev = q->queuedata;
-	struct request *flush_rq = rq->end_io_data;
-	struct scsi_driver *drv;
-
-	if (flush_rq->errors) {
-		printk("scsi: barrier error, disabling flush support\n");
-		blk_queue_ordered(q, QUEUE_ORDERED_NONE);
-	}
-
-	if (sdev->sdev_state == SDEV_RUNNING) {
-		drv = *(struct scsi_driver **) rq->rq_disk->private_data;
-		drv->end_flush(q, rq);
-	}
-}
-
 static int scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk,
 			       sector_t *error_sector)
 {
@@ -1703,17 +1668,6 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
 	blk_queue_segment_boundary(q, shost->dma_boundary);
 	blk_queue_issue_flush_fn(q, scsi_issue_flush_fn);
 
-	/*
-	 * ordered tags are superior to flush ordering
-	 */
-	if (shost->ordered_tag)
-		blk_queue_ordered(q, QUEUE_ORDERED_TAG);
-	else if (shost->ordered_flush) {
-		blk_queue_ordered(q, QUEUE_ORDERED_FLUSH);
-		q->prepare_flush_fn = scsi_prepare_flush_fn;
-		q->end_flush_fn = scsi_end_flush_fn;
-	}
-
 	if (!shost->use_clustering)
 		clear_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
 	return q;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index d651150..2eefc9e 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -121,8 +121,7 @@ static void sd_shutdown(struct device *dev);
 static void sd_rescan(struct device *);
 static int sd_init_command(struct scsi_cmnd *);
 static int sd_issue_flush(struct device *, sector_t *);
-static void sd_end_flush(request_queue_t *, struct request *);
-static int sd_prepare_flush(request_queue_t *, struct request *);
+static void sd_prepare_flush(request_queue_t *, struct request *);
 static void sd_read_capacity(struct scsi_disk *sdkp, char *diskname,
 			     unsigned char *buffer);
 
@@ -137,8 +136,6 @@ static struct scsi_driver sd_template = {
 	.rescan			= sd_rescan,
 	.init_command		= sd_init_command,
 	.issue_flush		= sd_issue_flush,
-	.prepare_flush		= sd_prepare_flush,
-	.end_flush		= sd_end_flush,
 };
 
 /*
@@ -729,42 +726,13 @@ static int sd_issue_flush(struct device *dev, sector_t *error_sector)
 	return ret;
 }
 
-static void sd_end_flush(request_queue_t *q, struct request *flush_rq)
+static void sd_prepare_flush(request_queue_t *q, struct request *rq)
 {
-	struct request *rq = flush_rq->end_io_data;
-	struct scsi_cmnd *cmd = rq->special;
-	unsigned int bytes = rq->hard_nr_sectors << 9;
-
-	if (!flush_rq->errors) {
-		spin_unlock(q->queue_lock);
-		scsi_io_completion(cmd, bytes, 0);
-		spin_lock(q->queue_lock);
-	} else if (blk_barrier_postflush(rq)) {
-		spin_unlock(q->queue_lock);
-		scsi_io_completion(cmd, 0, bytes);
-		spin_lock(q->queue_lock);
-	} else {
-		/*
-		 * force journal abort of barriers
-		 */
-		end_that_request_first(rq, -EOPNOTSUPP, rq->hard_nr_sectors);
-		end_that_request_last(rq, -EOPNOTSUPP);
-	}
-}
-
-static int sd_prepare_flush(request_queue_t *q, struct request *rq)
-{
-	struct scsi_device *sdev = q->queuedata;
-	struct scsi_disk *sdkp = dev_get_drvdata(&sdev->sdev_gendev);
-
-	if (!sdkp || !sdkp->WCE)
-		return 0;
-
 	memset(rq->cmd, 0, sizeof(rq->cmd));
-	rq->flags |= REQ_BLOCK_PC | REQ_SOFTBARRIER;
+	rq->flags |= REQ_BLOCK_PC;
 	rq->timeout = SD_TIMEOUT;
 	rq->cmd[0] = SYNCHRONIZE_CACHE;
-	return 1;
+	rq->cmd_len = 10;
 }
 
 static void sd_rescan(struct device *dev)
@@ -1462,6 +1430,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
 	struct scsi_disk *sdkp = scsi_disk(disk);
 	struct scsi_device *sdp = sdkp->device;
 	unsigned char *buffer;
+	unsigned ordered;
 
 	SCSI_LOG_HLQUEUE(3, printk("sd_revalidate_disk: disk=%s\n", disk->disk_name));
 
@@ -1498,7 +1467,20 @@ static int sd_revalidate_disk(struct gendisk *disk)
 		sd_read_write_protect_flag(sdkp, disk->disk_name, buffer);
 		sd_read_cache_type(sdkp, disk->disk_name, buffer);
 	}
-		
+
+	/*
+	 * We now have all cache related info, determine how we deal
+	 * with ordered requests.  Note that as the current SCSI
+	 * dispatch function can alter request order, we cannot use
+	 * QUEUE_ORDERED_TAG_* even when ordered tag is supported.
+	 */
+	if (sdkp->WCE)
+		ordered = QUEUE_ORDERED_DRAIN_FLUSH;
+	else
+		ordered = QUEUE_ORDERED_DRAIN;
+
+	blk_queue_ordered(sdkp->disk->queue, ordered, sd_prepare_flush);
+
 	set_capacity(disk, sdkp->capacity);
 	kfree(buffer);
 
@@ -1598,6 +1580,7 @@ static int sd_probe(struct device *dev)
 	strcpy(gd->devfs_name, sdp->devfs_name);
 
 	gd->private_data = &sdkp->driver;
+	gd->queue = sdkp->device->request_queue;
 
 	sd_revalidate_disk(gd);
 
@@ -1605,7 +1588,6 @@ static int sd_probe(struct device *dev)
 	gd->flags = GENHD_FL_DRIVERFS;
 	if (sdp->removable)
 		gd->flags |= GENHD_FL_REMOVABLE;
-	gd->queue = sdkp->device->request_queue;
 
 	dev_set_drvdata(dev, sdkp);
 	add_disk(gd);
diff --git a/include/scsi/scsi_driver.h b/include/scsi/scsi_driver.h
index 850dfa8..02e26c1 100644
--- a/include/scsi/scsi_driver.h
+++ b/include/scsi/scsi_driver.h
@@ -15,7 +15,6 @@ struct scsi_driver {
 	void (*rescan)(struct device *);
 	int (*issue_flush)(struct device *, sector_t *);
 	int (*prepare_flush)(struct request_queue *, struct request *);
-	void (*end_flush)(struct request_queue *, struct request *);
 };
 #define to_scsi_driver(drv) \
 	container_of((drv), struct scsi_driver, gendrv)
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 6cbb198..25f637b 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -392,7 +392,6 @@ struct scsi_host_template {
 	/*
 	 * ordered write support
 	 */
-	unsigned ordered_flush:1;
 	unsigned ordered_tag:1;
 
 	/*
-- 
cgit v1.1


From 007365ad60387df30f02f01fdc2b6e6432f6c265 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:53:52 +0100
Subject: [BLOCK] scsi: add FUA support to sd

Add FUA support for barriers to SCSI disk.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 drivers/scsi/sd.c | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 2eefc9e..32d4d8d 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -102,6 +102,7 @@ struct scsi_disk {
 	u8		write_prot;
 	unsigned	WCE : 1;	/* state of disk WCE bit */
 	unsigned	RCD : 1;	/* state of disk RCD bit, unused */
+	unsigned	DPOFUA : 1;	/* state of disk DPOFUA bit */
 };
 
 static DEFINE_IDR(sd_index_idr);
@@ -343,6 +344,7 @@ static int sd_init_command(struct scsi_cmnd * SCpnt)
 	
 	if (block > 0xffffffff) {
 		SCpnt->cmnd[0] += READ_16 - READ_6;
+		SCpnt->cmnd[1] |= blk_fua_rq(rq) ? 0x8 : 0;
 		SCpnt->cmnd[2] = sizeof(block) > 4 ? (unsigned char) (block >> 56) & 0xff : 0;
 		SCpnt->cmnd[3] = sizeof(block) > 4 ? (unsigned char) (block >> 48) & 0xff : 0;
 		SCpnt->cmnd[4] = sizeof(block) > 4 ? (unsigned char) (block >> 40) & 0xff : 0;
@@ -362,6 +364,7 @@ static int sd_init_command(struct scsi_cmnd * SCpnt)
 			this_count = 0xffff;
 
 		SCpnt->cmnd[0] += READ_10 - READ_6;
+		SCpnt->cmnd[1] |= blk_fua_rq(rq) ? 0x8 : 0;
 		SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff;
 		SCpnt->cmnd[3] = (unsigned char) (block >> 16) & 0xff;
 		SCpnt->cmnd[4] = (unsigned char) (block >> 8) & 0xff;
@@ -370,6 +373,17 @@ static int sd_init_command(struct scsi_cmnd * SCpnt)
 		SCpnt->cmnd[7] = (unsigned char) (this_count >> 8) & 0xff;
 		SCpnt->cmnd[8] = (unsigned char) this_count & 0xff;
 	} else {
+		if (unlikely(blk_fua_rq(rq))) {
+			/*
+			 * This happens only if this drive failed
+			 * 10byte rw command with ILLEGAL_REQUEST
+			 * during operation and thus turned off
+			 * use_10_for_rw.
+			 */
+			printk(KERN_ERR "sd: FUA write on READ/WRITE(6) drive\n");
+			return 0;
+		}
+
 		SCpnt->cmnd[1] |= (unsigned char) ((block >> 16) & 0x1f);
 		SCpnt->cmnd[2] = (unsigned char) ((block >> 8) & 0xff);
 		SCpnt->cmnd[3] = (unsigned char) block & 0xff;
@@ -1395,10 +1409,18 @@ sd_read_cache_type(struct scsi_disk *sdkp, char *diskname,
 			sdkp->RCD = 0;
 		}
 
+		sdkp->DPOFUA = (data.device_specific & 0x10) != 0;
+		if (sdkp->DPOFUA && !sdkp->device->use_10_for_rw) {
+			printk(KERN_NOTICE "SCSI device %s: uses "
+			       "READ/WRITE(6), disabling FUA\n", diskname);
+			sdkp->DPOFUA = 0;
+		}
+
 		ct =  sdkp->RCD + 2*sdkp->WCE;
 
-		printk(KERN_NOTICE "SCSI device %s: drive cache: %s\n",
-		       diskname, types[ct]);
+		printk(KERN_NOTICE "SCSI device %s: drive cache: %s%s\n",
+		       diskname, types[ct],
+		       sdkp->DPOFUA ? " w/ FUA" : "");
 
 		return;
 	}
@@ -1475,7 +1497,8 @@ static int sd_revalidate_disk(struct gendisk *disk)
 	 * QUEUE_ORDERED_TAG_* even when ordered tag is supported.
 	 */
 	if (sdkp->WCE)
-		ordered = QUEUE_ORDERED_DRAIN_FLUSH;
+		ordered = sdkp->DPOFUA
+			? QUEUE_ORDERED_DRAIN_FUA : QUEUE_ORDERED_DRAIN_FLUSH;
 	else
 		ordered = QUEUE_ORDERED_DRAIN;
 
-- 
cgit v1.1


From 93c9338713d4e11102cd09b4670ad42a336b06a3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:55:00 +0100
Subject: [BLOCK] update libata to use new blk_ordered for barriers

Reflect changes in the SCSI midlayer and update libata to use the
new ordered request implementation.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 drivers/scsi/ahci.c         | 1 -
 drivers/scsi/ata_piix.c     | 1 -
 drivers/scsi/sata_mv.c      | 1 -
 drivers/scsi/sata_nv.c      | 1 -
 drivers/scsi/sata_promise.c | 1 -
 drivers/scsi/sata_sil.c     | 1 -
 drivers/scsi/sata_sil24.c   | 1 -
 drivers/scsi/sata_sis.c     | 1 -
 drivers/scsi/sata_svw.c     | 1 -
 drivers/scsi/sata_sx4.c     | 1 -
 drivers/scsi/sata_uli.c     | 1 -
 drivers/scsi/sata_via.c     | 1 -
 drivers/scsi/sata_vsc.c     | 1 -
 13 files changed, 13 deletions(-)

diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c
index 887eaa2..d113290 100644
--- a/drivers/scsi/ahci.c
+++ b/drivers/scsi/ahci.c
@@ -214,7 +214,6 @@ static struct scsi_host_template ahci_sht = {
 	.dma_boundary		= AHCI_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 static const struct ata_port_operations ahci_ops = {
diff --git a/drivers/scsi/ata_piix.c b/drivers/scsi/ata_piix.c
index 0ea2787..4b647ee 100644
--- a/drivers/scsi/ata_piix.c
+++ b/drivers/scsi/ata_piix.c
@@ -185,7 +185,6 @@ static struct scsi_host_template piix_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 static const struct ata_port_operations piix_pata_ops = {
diff --git a/drivers/scsi/sata_mv.c b/drivers/scsi/sata_mv.c
index b2bf16a..cd54244 100644
--- a/drivers/scsi/sata_mv.c
+++ b/drivers/scsi/sata_mv.c
@@ -374,7 +374,6 @@ static struct scsi_host_template mv_sht = {
 	.dma_boundary		= MV_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 static const struct ata_port_operations mv5_ops = {
diff --git a/drivers/scsi/sata_nv.c b/drivers/scsi/sata_nv.c
index 4954896..c0cf52c 100644
--- a/drivers/scsi/sata_nv.c
+++ b/drivers/scsi/sata_nv.c
@@ -235,7 +235,6 @@ static struct scsi_host_template nv_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 static const struct ata_port_operations nv_ops = {
diff --git a/drivers/scsi/sata_promise.c b/drivers/scsi/sata_promise.c
index da7fa04..3d1ea09 100644
--- a/drivers/scsi/sata_promise.c
+++ b/drivers/scsi/sata_promise.c
@@ -114,7 +114,6 @@ static struct scsi_host_template pdc_ata_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 static const struct ata_port_operations pdc_sata_ops = {
diff --git a/drivers/scsi/sata_sil.c b/drivers/scsi/sata_sil.c
index d205348..b017f85 100644
--- a/drivers/scsi/sata_sil.c
+++ b/drivers/scsi/sata_sil.c
@@ -147,7 +147,6 @@ static struct scsi_host_template sil_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 static const struct ata_port_operations sil_ops = {
diff --git a/drivers/scsi/sata_sil24.c b/drivers/scsi/sata_sil24.c
index a0ad3ed..9231301 100644
--- a/drivers/scsi/sata_sil24.c
+++ b/drivers/scsi/sata_sil24.c
@@ -292,7 +292,6 @@ static struct scsi_host_template sil24_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1, /* NCQ not supported yet */
 };
 
 static const struct ata_port_operations sil24_ops = {
diff --git a/drivers/scsi/sata_sis.c b/drivers/scsi/sata_sis.c
index 32e1262..2df8c56 100644
--- a/drivers/scsi/sata_sis.c
+++ b/drivers/scsi/sata_sis.c
@@ -99,7 +99,6 @@ static struct scsi_host_template sis_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 static const struct ata_port_operations sis_ops = {
diff --git a/drivers/scsi/sata_svw.c b/drivers/scsi/sata_svw.c
index 6e7f7c8..6683735 100644
--- a/drivers/scsi/sata_svw.c
+++ b/drivers/scsi/sata_svw.c
@@ -303,7 +303,6 @@ static struct scsi_host_template k2_sata_sht = {
 	.proc_info		= k2_sata_proc_info,
 #endif
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 
diff --git a/drivers/scsi/sata_sx4.c b/drivers/scsi/sata_sx4.c
index 94b253b..bc87c16 100644
--- a/drivers/scsi/sata_sx4.c
+++ b/drivers/scsi/sata_sx4.c
@@ -194,7 +194,6 @@ static struct scsi_host_template pdc_sata_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 static const struct ata_port_operations pdc_20621_ops = {
diff --git a/drivers/scsi/sata_uli.c b/drivers/scsi/sata_uli.c
index b2422a0..9635ca7 100644
--- a/drivers/scsi/sata_uli.c
+++ b/drivers/scsi/sata_uli.c
@@ -87,7 +87,6 @@ static struct scsi_host_template uli_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 static const struct ata_port_operations uli_ops = {
diff --git a/drivers/scsi/sata_via.c b/drivers/scsi/sata_via.c
index c762156..6d5b0a7 100644
--- a/drivers/scsi/sata_via.c
+++ b/drivers/scsi/sata_via.c
@@ -106,7 +106,6 @@ static struct scsi_host_template svia_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 static const struct ata_port_operations svia_sata_ops = {
diff --git a/drivers/scsi/sata_vsc.c b/drivers/scsi/sata_vsc.c
index fcfa486..2e2c3b7 100644
--- a/drivers/scsi/sata_vsc.c
+++ b/drivers/scsi/sata_vsc.c
@@ -235,7 +235,6 @@ static struct scsi_host_template vsc_sata_sht = {
 	.dma_boundary		= ATA_DMA_BOUNDARY,
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
-	.ordered_flush		= 1,
 };
 
 
-- 
cgit v1.1


From 9a3dccc42556537a48f39ee9a9e7ab90a933f766 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:56:18 +0100
Subject: [BLOCK] add FUA support to libata

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 drivers/scsi/libata-core.c | 31 +++++++++++++++++++++++++------
 drivers/scsi/libata-scsi.c | 32 ++++++++++++++++++++++++++------
 drivers/scsi/libata.h      |  4 +++-
 include/linux/ata.h        |  6 +++++-
 include/linux/libata.h     |  3 ++-
 5 files changed, 61 insertions(+), 15 deletions(-)

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 9ea1025..bdfb0a8 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -562,16 +562,28 @@ static const u8 ata_rw_cmds[] = {
 	ATA_CMD_WRITE_MULTI,
 	ATA_CMD_READ_MULTI_EXT,
 	ATA_CMD_WRITE_MULTI_EXT,
+	0,
+	0,
+	0,
+	ATA_CMD_WRITE_MULTI_FUA_EXT,
 	/* pio */
 	ATA_CMD_PIO_READ,
 	ATA_CMD_PIO_WRITE,
 	ATA_CMD_PIO_READ_EXT,
 	ATA_CMD_PIO_WRITE_EXT,
+	0,
+	0,
+	0,
+	0,
 	/* dma */
 	ATA_CMD_READ,
 	ATA_CMD_WRITE,
 	ATA_CMD_READ_EXT,
-	ATA_CMD_WRITE_EXT
+	ATA_CMD_WRITE_EXT,
+	0,
+	0,
+	0,
+	ATA_CMD_WRITE_FUA_EXT
 };
 
 /**
@@ -584,25 +596,32 @@ static const u8 ata_rw_cmds[] = {
  *	LOCKING:
  *	caller.
  */
-void ata_rwcmd_protocol(struct ata_queued_cmd *qc)
+int ata_rwcmd_protocol(struct ata_queued_cmd *qc)
 {
 	struct ata_taskfile *tf = &qc->tf;
 	struct ata_device *dev = qc->dev;
+	u8 cmd;
 
-	int index, lba48, write;
+	int index, fua, lba48, write;
  
+	fua = (tf->flags & ATA_TFLAG_FUA) ? 4 : 0;
 	lba48 = (tf->flags & ATA_TFLAG_LBA48) ? 2 : 0;
 	write = (tf->flags & ATA_TFLAG_WRITE) ? 1 : 0;
 
 	if (dev->flags & ATA_DFLAG_PIO) {
 		tf->protocol = ATA_PROT_PIO;
-		index = dev->multi_count ? 0 : 4;
+		index = dev->multi_count ? 0 : 8;
 	} else {
 		tf->protocol = ATA_PROT_DMA;
-		index = 8;
+		index = 16;
 	}
 
-	tf->command = ata_rw_cmds[index + lba48 + write];
+	cmd = ata_rw_cmds[index + fua + lba48 + write];
+	if (cmd) {
+		tf->command = cmd;
+		return 0;
+	}
+	return -1;
 }
 
 static const char * const xfer_mode_str[] = {
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index e0439be..2c644cb 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -1080,11 +1080,13 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc, const u8 *scsicm
 	    scsicmd[0] == WRITE_16)
 		tf->flags |= ATA_TFLAG_WRITE;
 
-	/* Calculate the SCSI LBA and transfer length. */
+	/* Calculate the SCSI LBA, transfer length and FUA. */
 	switch (scsicmd[0]) {
 	case READ_10:
 	case WRITE_10:
 		scsi_10_lba_len(scsicmd, &block, &n_block);
+		if (unlikely(scsicmd[1] & (1 << 3)))
+			tf->flags |= ATA_TFLAG_FUA;
 		break;
 	case READ_6:
 	case WRITE_6:
@@ -1099,6 +1101,8 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc, const u8 *scsicm
 	case READ_16:
 	case WRITE_16:
 		scsi_16_lba_len(scsicmd, &block, &n_block);
+		if (unlikely(scsicmd[1] & (1 << 3)))
+			tf->flags |= ATA_TFLAG_FUA;
 		break;
 	default:
 		DPRINTK("no-byte command\n");
@@ -1142,7 +1146,8 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc, const u8 *scsicm
 			tf->device |= (block >> 24) & 0xf;
 		}
 
-		ata_rwcmd_protocol(qc);
+		if (unlikely(ata_rwcmd_protocol(qc) < 0))
+			goto invalid_fld;
 
 		qc->nsect = n_block;
 		tf->nsect = n_block & 0xff;
@@ -1160,7 +1165,8 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc, const u8 *scsicm
 		if ((block >> 28) || (n_block > 256))
 			goto out_of_range;
 
-		ata_rwcmd_protocol(qc);
+		if (unlikely(ata_rwcmd_protocol(qc) < 0))
+			goto invalid_fld;
 
 		/* Convert LBA to CHS */
 		track = (u32)block / dev->sectors;
@@ -1695,6 +1701,7 @@ static unsigned int ata_msense_rw_recovery(u8 **ptr_io, const u8 *last)
 unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf,
 				  unsigned int buflen)
 {
+	struct ata_device *dev = args->dev;
 	u8 *scsicmd = args->cmd->cmnd, *p, *last;
 	const u8 sat_blk_desc[] = {
 		0, 0, 0, 0,	/* number of blocks: sat unspecified */
@@ -1703,6 +1710,7 @@ unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf,
 	};
 	u8 pg, spg;
 	unsigned int ebd, page_control, six_byte, output_len, alloc_len, minlen;
+	u8 dpofua;
 
 	VPRINTK("ENTER\n");
 
@@ -1771,9 +1779,17 @@ unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf,
 
 	if (minlen < 1)
 		return 0;
+
+	dpofua = 0;
+	if (ata_id_has_fua(args->id) && dev->flags & ATA_DFLAG_LBA48 &&
+	    (!(dev->flags & ATA_DFLAG_PIO) || dev->multi_count))
+		dpofua = 1 << 4;
+
 	if (six_byte) {
 		output_len--;
 		rbuf[0] = output_len;
+		if (minlen > 2)
+			rbuf[2] |= dpofua;
 		if (ebd) {
 			if (minlen > 3)
 				rbuf[3] = sizeof(sat_blk_desc);
@@ -1786,6 +1802,8 @@ unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf,
 		rbuf[0] = output_len >> 8;
 		if (minlen > 1)
 			rbuf[1] = output_len;
+		if (minlen > 3)
+			rbuf[3] |= dpofua;
 		if (ebd) {
 			if (minlen > 7)
 				rbuf[7] = sizeof(sat_blk_desc);
@@ -2446,7 +2464,7 @@ int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *))
 		if (xlat_func)
 			ata_scsi_translate(ap, dev, cmd, done, xlat_func);
 		else
-			ata_scsi_simulate(dev->id, cmd, done);
+			ata_scsi_simulate(ap, dev, cmd, done);
 	} else
 		ata_scsi_translate(ap, dev, cmd, done, atapi_xlat);
 
@@ -2469,14 +2487,16 @@ out_unlock:
  *	spin_lock_irqsave(host_set lock)
  */
 
-void ata_scsi_simulate(u16 *id,
+void ata_scsi_simulate(struct ata_port *ap, struct ata_device *dev,
 		      struct scsi_cmnd *cmd,
 		      void (*done)(struct scsi_cmnd *))
 {
 	struct ata_scsi_args args;
 	const u8 *scsicmd = cmd->cmnd;
 
-	args.id = id;
+	args.ap = ap;
+	args.dev = dev;
+	args.id = dev->id;
 	args.cmd = cmd;
 	args.done = done;
 
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index 251e53b..e03ce48 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -32,6 +32,8 @@
 #define DRV_VERSION	"1.20"	/* must be exactly four chars */
 
 struct ata_scsi_args {
+	struct ata_port		*ap;
+	struct ata_device	*dev;
 	u16			*id;
 	struct scsi_cmnd	*cmd;
 	void			(*done)(struct scsi_cmnd *);
@@ -41,7 +43,7 @@ struct ata_scsi_args {
 extern int atapi_enabled;
 extern struct ata_queued_cmd *ata_qc_new_init(struct ata_port *ap,
 				      struct ata_device *dev);
-extern void ata_rwcmd_protocol(struct ata_queued_cmd *qc);
+extern int ata_rwcmd_protocol(struct ata_queued_cmd *qc);
 extern void ata_qc_free(struct ata_queued_cmd *qc);
 extern int ata_qc_issue(struct ata_queued_cmd *qc);
 extern int ata_check_atapi_dma(struct ata_queued_cmd *qc);
diff --git a/include/linux/ata.h b/include/linux/ata.h
index d2873b7..f63dad4 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -129,6 +129,7 @@ enum {
 	ATA_CMD_READ_EXT	= 0x25,
 	ATA_CMD_WRITE		= 0xCA,
 	ATA_CMD_WRITE_EXT	= 0x35,
+	ATA_CMD_WRITE_FUA_EXT	= 0x3D,
 	ATA_CMD_PIO_READ	= 0x20,
 	ATA_CMD_PIO_READ_EXT	= 0x24,
 	ATA_CMD_PIO_WRITE	= 0x30,
@@ -137,6 +138,7 @@ enum {
 	ATA_CMD_READ_MULTI_EXT	= 0x29,
 	ATA_CMD_WRITE_MULTI	= 0xC5,
 	ATA_CMD_WRITE_MULTI_EXT	= 0x39,
+	ATA_CMD_WRITE_MULTI_FUA_EXT = 0xCE,
 	ATA_CMD_SET_FEATURES	= 0xEF,
 	ATA_CMD_PACKET		= 0xA0,
 	ATA_CMD_VERIFY		= 0x40,
@@ -192,6 +194,7 @@ enum {
 	ATA_TFLAG_DEVICE	= (1 << 2), /* enable r/w to device reg */
 	ATA_TFLAG_WRITE		= (1 << 3), /* data dir: host->dev==1 (write) */
 	ATA_TFLAG_LBA		= (1 << 4), /* enable LBA */
+	ATA_TFLAG_FUA		= (1 << 5), /* enable FUA */
 };
 
 enum ata_tf_protocols {
@@ -245,7 +248,8 @@ struct ata_taskfile {
 #define ata_id_is_sata(id)	((id)[93] == 0)
 #define ata_id_rahead_enabled(id) ((id)[85] & (1 << 6))
 #define ata_id_wcache_enabled(id) ((id)[85] & (1 << 5))
-#define ata_id_has_flush(id) ((id)[83] & (1 << 12))
+#define ata_id_has_fua(id)	((id)[84] & (1 << 6))
+#define ata_id_has_flush(id)	((id)[83] & (1 << 12))
 #define ata_id_has_flush_ext(id) ((id)[83] & (1 << 13))
 #define ata_id_has_lba48(id)	((id)[83] & (1 << 10))
 #define ata_id_has_wcache(id)	((id)[82] & (1 << 5))
diff --git a/include/linux/libata.h b/include/linux/libata.h
index e828e17..6db2c08 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -480,7 +480,8 @@ extern u8   ata_bmdma_status(struct ata_port *ap);
 extern void ata_bmdma_irq_clear(struct ata_port *ap);
 extern void ata_qc_complete(struct ata_queued_cmd *qc);
 extern void ata_eng_timeout(struct ata_port *ap);
-extern void ata_scsi_simulate(u16 *id, struct scsi_cmnd *cmd,
+extern void ata_scsi_simulate(struct ata_port *ap, struct ata_device *dev,
+			      struct scsi_cmnd *cmd,
 			      void (*done)(struct scsi_cmnd *));
 extern int ata_std_bios_param(struct scsi_device *sdev,
 			      struct block_device *bdev,
-- 
cgit v1.1


From 3e087b575496b8aa445192f58e7d996b1cdfa121 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:57:31 +0100
Subject: [BLOCK] update IDE to use new blk_ordered for barriers

Update IDE to use new blk_ordered.  This change makes the
following behavior changes.

* Partial completion of the barrier request is handled as
  failure of the whole ordered sequence.  No more partial
  completion for barrier requests.

* Any failure of pre or post flush request results in failure
  of the whole ordered sequence.

So, a successfully completed ordered sequence guarantees that
all requests prior to the barrier made it to the physical medium
and, then, the whole barrier request made it to the physical medium.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 drivers/ide/ide-disk.c | 137 +++++++++++++++++++------------------------------
 drivers/ide/ide-io.c   |   5 +-
 2 files changed, 54 insertions(+), 88 deletions(-)

diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 4e57679..4b44172 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -681,50 +681,9 @@ static ide_proc_entry_t idedisk_proc[] = {
 
 #endif	/* CONFIG_PROC_FS */
 
-static void idedisk_end_flush(request_queue_t *q, struct request *flush_rq)
+static void idedisk_prepare_flush(request_queue_t *q, struct request *rq)
 {
 	ide_drive_t *drive = q->queuedata;
-	struct request *rq = flush_rq->end_io_data;
-	int good_sectors = rq->hard_nr_sectors;
-	int bad_sectors;
-	sector_t sector;
-
-	if (flush_rq->errors & ABRT_ERR) {
-		printk(KERN_ERR "%s: barrier support doesn't work\n", drive->name);
-		blk_queue_ordered(drive->queue, QUEUE_ORDERED_NONE);
-		blk_queue_issue_flush_fn(drive->queue, NULL);
-		good_sectors = 0;
-	} else if (flush_rq->errors) {
-		good_sectors = 0;
-		if (blk_barrier_preflush(rq)) {
-			sector = ide_get_error_location(drive,flush_rq->buffer);
-			if ((sector >= rq->hard_sector) &&
-			    (sector < rq->hard_sector + rq->hard_nr_sectors))
-				good_sectors = sector - rq->hard_sector;
-		}
-	}
-
-	if (flush_rq->errors)
-		printk(KERN_ERR "%s: failed barrier write: "
-				"sector=%Lx(good=%d/bad=%d)\n",
-				drive->name, (unsigned long long)rq->sector,
-				good_sectors,
-				(int) (rq->hard_nr_sectors-good_sectors));
-
-	bad_sectors = rq->hard_nr_sectors - good_sectors;
-
-	if (good_sectors)
-		__ide_end_request(drive, rq, 1, good_sectors);
-	if (bad_sectors)
-		__ide_end_request(drive, rq, 0, bad_sectors);
-}
-
-static int idedisk_prepare_flush(request_queue_t *q, struct request *rq)
-{
-	ide_drive_t *drive = q->queuedata;
-
-	if (!drive->wcache)
-		return 0;
 
 	memset(rq->cmd, 0, sizeof(rq->cmd));
 
@@ -735,9 +694,8 @@ static int idedisk_prepare_flush(request_queue_t *q, struct request *rq)
 		rq->cmd[0] = WIN_FLUSH_CACHE;
 
 
-	rq->flags |= REQ_DRIVE_TASK | REQ_SOFTBARRIER;
+	rq->flags |= REQ_DRIVE_TASK;
 	rq->buffer = rq->cmd;
-	return 1;
 }
 
 static int idedisk_issue_flush(request_queue_t *q, struct gendisk *disk,
@@ -794,27 +752,64 @@ static int set_nowerr(ide_drive_t *drive, int arg)
 	return 0;
 }
 
+static void update_ordered(ide_drive_t *drive)
+{
+	struct hd_driveid *id = drive->id;
+	unsigned ordered = QUEUE_ORDERED_NONE;
+	prepare_flush_fn *prep_fn = NULL;
+	issue_flush_fn *issue_fn = NULL;
+
+	if (drive->wcache) {
+		unsigned long long capacity;
+		int barrier;
+		/*
+		 * We must avoid issuing commands a drive does not
+		 * understand or we may crash it. We check flush cache
+		 * is supported. We also check we have the LBA48 flush
+		 * cache if the drive capacity is too large. By this
+		 * time we have trimmed the drive capacity if LBA48 is
+		 * not available so we don't need to recheck that.
+		 */
+		capacity = idedisk_capacity(drive);
+		barrier = ide_id_has_flush_cache(id) &&
+			(drive->addressing == 0 || capacity <= (1ULL << 28) ||
+			 ide_id_has_flush_cache_ext(id));
+
+		printk(KERN_INFO "%s: cache flushes %ssupported\n",
+		       drive->name, barrier ? "" : "not");
+
+		if (barrier) {
+			ordered = QUEUE_ORDERED_DRAIN_FLUSH;
+			prep_fn = idedisk_prepare_flush;
+			issue_fn = idedisk_issue_flush;
+		}
+	} else
+		ordered = QUEUE_ORDERED_DRAIN;
+
+	blk_queue_ordered(drive->queue, ordered, prep_fn);
+	blk_queue_issue_flush_fn(drive->queue, issue_fn);
+}
+
 static int write_cache(ide_drive_t *drive, int arg)
 {
 	ide_task_t args;
-	int err;
-
-	if (!ide_id_has_flush_cache(drive->id))
-		return 1;
+	int err = 1;
 
-	memset(&args, 0, sizeof(ide_task_t));
-	args.tfRegister[IDE_FEATURE_OFFSET]	= (arg) ?
+	if (ide_id_has_flush_cache(drive->id)) {
+		memset(&args, 0, sizeof(ide_task_t));
+		args.tfRegister[IDE_FEATURE_OFFSET]	= (arg) ?
 			SETFEATURES_EN_WCACHE : SETFEATURES_DIS_WCACHE;
-	args.tfRegister[IDE_COMMAND_OFFSET]	= WIN_SETFEATURES;
-	args.command_type			= IDE_DRIVE_TASK_NO_DATA;
-	args.handler				= &task_no_data_intr;
+		args.tfRegister[IDE_COMMAND_OFFSET]	= WIN_SETFEATURES;
+		args.command_type		= IDE_DRIVE_TASK_NO_DATA;
+		args.handler			= &task_no_data_intr;
+		err = ide_raw_taskfile(drive, &args, NULL);
+		if (err == 0)
+			drive->wcache = arg;
+	}
 
-	err = ide_raw_taskfile(drive, &args, NULL);
-	if (err)
-		return err;
+	update_ordered(drive);
 
-	drive->wcache = arg;
-	return 0;
+	return err;
 }
 
 static int do_idedisk_flushcache (ide_drive_t *drive)
@@ -888,7 +883,6 @@ static void idedisk_setup (ide_drive_t *drive)
 {
 	struct hd_driveid *id = drive->id;
 	unsigned long long capacity;
-	int barrier;
 
 	idedisk_add_settings(drive);
 
@@ -992,31 +986,6 @@ static void idedisk_setup (ide_drive_t *drive)
 		drive->wcache = 1;
 
 	write_cache(drive, 1);
-
-	/*
-	 * We must avoid issuing commands a drive does not understand
-	 * or we may crash it. We check flush cache is supported. We also
-	 * check we have the LBA48 flush cache if the drive capacity is
-	 * too large. By this time we have trimmed the drive capacity if
-	 * LBA48 is not available so we don't need to recheck that.
-	 */
-	barrier = 0;
-	if (ide_id_has_flush_cache(id))
-		barrier = 1;
-	if (drive->addressing == 1) {
-		/* Can't issue the correct flush ? */
-		if (capacity > (1ULL << 28) && !ide_id_has_flush_cache_ext(id))
-			barrier = 0;
-	}
-
-	printk(KERN_INFO "%s: cache flushes %ssupported\n",
-		drive->name, barrier ? "" : "not ");
-	if (barrier) {
-		blk_queue_ordered(drive->queue, QUEUE_ORDERED_FLUSH);
-		drive->queue->prepare_flush_fn = idedisk_prepare_flush;
-		drive->queue->end_flush_fn = idedisk_end_flush;
-		blk_queue_issue_flush_fn(drive->queue, idedisk_issue_flush);
-	}
 }
 
 static void ide_cacheflush_p(ide_drive_t *drive)
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 8435b44..b5dc6df 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -119,10 +119,7 @@ int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors)
 	if (!nr_sectors)
 		nr_sectors = rq->hard_cur_sectors;
 
-	if (blk_complete_barrier_rq_locked(drive->queue, rq, nr_sectors))
-		ret = rq->nr_sectors != 0;
-	else
-		ret = __ide_end_request(drive, rq, uptodate, nr_sectors);
+	ret = __ide_end_request(drive, rq, uptodate, nr_sectors);
 
 	spin_unlock_irqrestore(&ide_lock, flags);
 	return ret;
-- 
cgit v1.1


From ff5b8cf1491330836d75eede4e5632caa32b776a Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 6 Jan 2006 09:58:37 +0100
Subject: [BLOCK] I/O barrier documentation update

Update documentation to match new barrier implementation.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 Documentation/block/biodoc.txt | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 303c57a..8e63831 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -263,14 +263,8 @@ A flag in the bio structure, BIO_BARRIER is used to identify a barrier i/o.
 The generic i/o scheduler would make sure that it places the barrier request and
 all other requests coming after it after all the previous requests in the
 queue. Barriers may be implemented in different ways depending on the
-driver. A SCSI driver for example could make use of ordered tags to
-preserve the necessary ordering with a lower impact on throughput. For IDE
-this might be two sync cache flush: a pre and post flush when encountering
-a barrier write.
-
-There is a provision for queues to indicate what kind of barriers they
-can provide. This is as of yet unmerged, details will be added here once it
-is in the kernel.
+driver. For more details regarding I/O barriers, please read barrier.txt
+in this directory.
 
 1.2.2 Request Priority/Latency
 
-- 
cgit v1.1


From 15fc858a0067c800f410a24551a7b461978abf0b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Fri, 6 Jan 2006 10:00:50 +0100
Subject: [BLOCK] Correct blk_execute_rq_nowait() prototype

---
 include/linux/blkdev.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 15db0f1..fb09853 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -613,8 +613,7 @@ extern int blk_rq_map_user_iov(request_queue_t *, struct request *, struct sg_io
 extern int blk_execute_rq(request_queue_t *, struct gendisk *,
 			  struct request *, int);
 extern void blk_execute_rq_nowait(request_queue_t *, struct gendisk *,
-				  struct request *, int,
-				  void (*done)(struct request *));
+				  struct request *, int, rq_end_io_fn *);
 
 static inline request_queue_t *bdev_get_queue(struct block_device *bdev)
 {
-- 
cgit v1.1


From e650c305ec3178818b317dad37a6d9c7fa8ba28d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Fri, 6 Jan 2006 12:38:30 +0100
Subject: [SCSI] scsi_end_async() needs to take an uptodate parameter

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 drivers/scsi/scsi_lib.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 7a38b10..ba93d6e 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -308,7 +308,7 @@ struct scsi_io_context {
 
 static kmem_cache_t *scsi_io_context_cache;
 
-static void scsi_end_async(struct request *req)
+static void scsi_end_async(struct request *req, int uptodate)
 {
 	struct scsi_io_context *sioc = req->end_io_data;
 
-- 
cgit v1.1


From bd6a59b22fd3bd044bb14978b885bcd042a10e8e Mon Sep 17 00:00:00 2001
From: Joshua Kwan <joshk@triplehelix.org>
Date: Fri, 6 Jan 2006 00:09:45 -0800
Subject: [PATCH] hfsplus oops fix

nls_utf8 is available, and the check in hfsplus_fill_super checks the wrong
pointer for NULLness (it checks the saved nls, not the new one that it
needs to use.)

Signed-off-by: Joshua Kwan <joshk@triplehelix.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/hfsplus/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 8093351..6daaf7c 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -320,7 +320,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	/* temporarily use utf8 to correctly find the hidden dir below */
 	nls = sbi->nls;
 	sbi->nls = load_nls("utf8");
-	if (!nls) {
+	if (!sbi->nls) {
 		printk("HFS+: unable to load nls for utf8\n");
 		err = -EINVAL;
 		goto cleanup;
-- 
cgit v1.1


From 4b2f0260c74324abca76ccaa42d426af163125e7 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 6 Jan 2006 00:09:47 -0800
Subject: [PATCH] nbd: fix TX/RX race condition

Janos Haar of First NetCenter Bt.  reported numerous crashes involving the
NBD driver.  With his help, this was tracked down to bogus bio vectors
which in turn was the result of a race condition between the
receive/transmit routines in the NBD driver.

The bug manifests itself like this:

CPU0				CPU1
do_nbd_request
	add req to queuelist
	nbd_send_request
		send req head
		for each bio
			kmap
			send
				nbd_read_stat
					nbd_find_request
					nbd_end_request
			kunmap

When CPU1 finishes nbd_end_request, the request and all its associated
bio's are freed.  So when CPU0 calls kunmap whose argument is derived from
the last bio, it may crash.

Under normal circumstances, the race occurs only on the last bio.  However,
if an error is encountered on the remote NBD server (such as an incorrect
magic number in the request), or if there were a bug in the server, it is
possible for the nbd_end_request to occur any time after the request's
addition to the queuelist.

The following patch fixes this problem by making sure that requests are not
added to the queuelist until after they have completed transmission.

In order for the receiving side to be ready for responses involving
requests still being transmitted, the patch introduces the concept of the
active request.

When a response matches the current active request, its processing is
delayed until after the transmission has come to a stop.

This has been tested by Janos and it has been successful in curing this
race condition.

From: Herbert Xu <herbert@gondor.apana.org.au>

  Here is an updated patch which removes the active_req wait in
  nbd_clear_queue and the associated memory barrier.

  I've also clarified this in the comment.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Cc: <djani22@dynamicweb.hu>
Cc: Paul Clements <Paul.Clements@SteelEye.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/nbd.c | 122 ++++++++++++++++++++++++++--------------------------
 include/linux/nbd.h |   8 ++++
 2 files changed, 68 insertions(+), 62 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 9e268dd..d5c8ee7 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -54,11 +54,15 @@
 #include <linux/errno.h>
 #include <linux/file.h>
 #include <linux/ioctl.h>
+#include <linux/compiler.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
 #include <net/sock.h>
 
 #include <linux/devfs_fs_kernel.h>
 
 #include <asm/uaccess.h>
+#include <asm/system.h>
 #include <asm/types.h>
 
 #include <linux/nbd.h>
@@ -230,14 +234,6 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
 	request.len = htonl(size);
 	memcpy(request.handle, &req, sizeof(req));
 
-	down(&lo->tx_lock);
-
-	if (!sock || !lo->sock) {
-		printk(KERN_ERR "%s: Attempted send on closed socket\n",
-				lo->disk->disk_name);
-		goto error_out;
-	}
-
 	dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%luB)\n",
 			lo->disk->disk_name, req,
 			nbdcmd_to_ascii(nbd_cmd(req)),
@@ -276,11 +272,9 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
 			}
 		}
 	}
-	up(&lo->tx_lock);
 	return 0;
 
 error_out:
-	up(&lo->tx_lock);
 	return 1;
 }
 
@@ -289,9 +283,14 @@ static struct request *nbd_find_request(struct nbd_device *lo, char *handle)
 	struct request *req;
 	struct list_head *tmp;
 	struct request *xreq;
+	int err;
 
 	memcpy(&xreq, handle, sizeof(xreq));
 
+	err = wait_event_interruptible(lo->active_wq, lo->active_req != xreq);
+	if (unlikely(err))
+		goto out;
+
 	spin_lock(&lo->queue_lock);
 	list_for_each(tmp, &lo->queue_head) {
 		req = list_entry(tmp, struct request, queuelist);
@@ -302,7 +301,11 @@ static struct request *nbd_find_request(struct nbd_device *lo, char *handle)
 		return req;
 	}
 	spin_unlock(&lo->queue_lock);
-	return NULL;
+
+	err = -ENOENT;
+
+out:
+	return ERR_PTR(err);
 }
 
 static inline int sock_recv_bvec(struct socket *sock, struct bio_vec *bvec)
@@ -331,7 +334,11 @@ static struct request *nbd_read_stat(struct nbd_device *lo)
 		goto harderror;
 	}
 	req = nbd_find_request(lo, reply.handle);
-	if (req == NULL) {
+	if (unlikely(IS_ERR(req))) {
+		result = PTR_ERR(req);
+		if (result != -ENOENT)
+			goto harderror;
+
 		printk(KERN_ERR "%s: Unexpected reply (%p)\n",
 				lo->disk->disk_name, reply.handle);
 		result = -EBADR;
@@ -395,19 +402,24 @@ static void nbd_clear_que(struct nbd_device *lo)
 
 	BUG_ON(lo->magic != LO_MAGIC);
 
-	do {
-		req = NULL;
-		spin_lock(&lo->queue_lock);
-		if (!list_empty(&lo->queue_head)) {
-			req = list_entry(lo->queue_head.next, struct request, queuelist);
-			list_del_init(&req->queuelist);
-		}
-		spin_unlock(&lo->queue_lock);
-		if (req) {
-			req->errors++;
-			nbd_end_request(req);
-		}
-	} while (req);
+	/*
+	 * Because we have set lo->sock to NULL under the tx_lock, all
+	 * modifications to the list must have completed by now.  For
+	 * the same reason, the active_req must be NULL.
+	 *
+	 * As a consequence, we don't need to take the spin lock while
+	 * purging the list here.
+	 */
+	BUG_ON(lo->sock);
+	BUG_ON(lo->active_req);
+
+	while (!list_empty(&lo->queue_head)) {
+		req = list_entry(lo->queue_head.next, struct request,
+				 queuelist);
+		list_del_init(&req->queuelist);
+		req->errors++;
+		nbd_end_request(req);
+	}
 }
 
 /*
@@ -435,11 +447,6 @@ static void do_nbd_request(request_queue_t * q)
 
 		BUG_ON(lo->magic != LO_MAGIC);
 
-		if (!lo->file) {
-			printk(KERN_ERR "%s: Request when not-ready\n",
-					lo->disk->disk_name);
-			goto error_out;
-		}
 		nbd_cmd(req) = NBD_CMD_READ;
 		if (rq_data_dir(req) == WRITE) {
 			nbd_cmd(req) = NBD_CMD_WRITE;
@@ -453,32 +460,34 @@ static void do_nbd_request(request_queue_t * q)
 		req->errors = 0;
 		spin_unlock_irq(q->queue_lock);
 
-		spin_lock(&lo->queue_lock);
-
-		if (!lo->file) {
-			spin_unlock(&lo->queue_lock);
-			printk(KERN_ERR "%s: failed between accept and semaphore, file lost\n",
-					lo->disk->disk_name);
+		down(&lo->tx_lock);
+		if (unlikely(!lo->sock)) {
+			up(&lo->tx_lock);
+			printk(KERN_ERR "%s: Attempted send on closed socket\n",
+			       lo->disk->disk_name);
 			req->errors++;
 			nbd_end_request(req);
 			spin_lock_irq(q->queue_lock);
 			continue;
 		}
 
-		list_add(&req->queuelist, &lo->queue_head);
-		spin_unlock(&lo->queue_lock);
+		lo->active_req = req;
 
 		if (nbd_send_req(lo, req) != 0) {
 			printk(KERN_ERR "%s: Request send failed\n",
 					lo->disk->disk_name);
-			if (nbd_find_request(lo, (char *)&req) != NULL) {
-				/* we still own req */
-				req->errors++;
-				nbd_end_request(req);
-			} else /* we're racing with nbd_clear_que */
-				printk(KERN_DEBUG "nbd: can't find req\n");
+			req->errors++;
+			nbd_end_request(req);
+		} else {
+			spin_lock(&lo->queue_lock);
+			list_add(&req->queuelist, &lo->queue_head);
+			spin_unlock(&lo->queue_lock);
 		}
 
+		lo->active_req = NULL;
+		up(&lo->tx_lock);
+		wake_up_all(&lo->active_wq);
+
 		spin_lock_irq(q->queue_lock);
 		continue;
 
@@ -529,17 +538,10 @@ static int nbd_ioctl(struct inode *inode, struct file *file,
 		down(&lo->tx_lock);
 		lo->sock = NULL;
 		up(&lo->tx_lock);
-		spin_lock(&lo->queue_lock);
 		file = lo->file;
 		lo->file = NULL;
-		spin_unlock(&lo->queue_lock);
 		nbd_clear_que(lo);
-		spin_lock(&lo->queue_lock);
-		if (!list_empty(&lo->queue_head)) {
-			printk(KERN_ERR "nbd: disconnect: some requests are in progress -> please try again.\n");
-			error = -EBUSY;
-		}
-		spin_unlock(&lo->queue_lock);
+		BUG_ON(!list_empty(&lo->queue_head));
 		if (file)
 			fput(file);
 		return error;
@@ -598,24 +600,19 @@ static int nbd_ioctl(struct inode *inode, struct file *file,
 			lo->sock = NULL;
 		}
 		up(&lo->tx_lock);
-		spin_lock(&lo->queue_lock);
 		file = lo->file;
 		lo->file = NULL;
-		spin_unlock(&lo->queue_lock);
 		nbd_clear_que(lo);
 		printk(KERN_WARNING "%s: queue cleared\n", lo->disk->disk_name);
 		if (file)
 			fput(file);
 		return lo->harderror;
 	case NBD_CLEAR_QUE:
-		down(&lo->tx_lock);
-		if (lo->sock) {
-			up(&lo->tx_lock);
-			return 0; /* probably should be error, but that would
-				   * break "nbd-client -d", so just return 0 */
-		}
-		up(&lo->tx_lock);
-		nbd_clear_que(lo);
+		/*
+		 * This is for compatibility only.  The queue is always cleared
+		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
+		 */
+		BUG_ON(!lo->sock && !list_empty(&lo->queue_head));
 		return 0;
 	case NBD_PRINT_DEBUG:
 		printk(KERN_INFO "%s: next = %p, prev = %p, head = %p\n",
@@ -688,6 +685,7 @@ static int __init nbd_init(void)
 		spin_lock_init(&nbd_dev[i].queue_lock);
 		INIT_LIST_HEAD(&nbd_dev[i].queue_head);
 		init_MUTEX(&nbd_dev[i].tx_lock);
+		init_waitqueue_head(&nbd_dev[i].active_wq);
 		nbd_dev[i].blksize = 1024;
 		nbd_dev[i].bytesize = 0x7ffffc00ULL << 10; /* 2TB */
 		disk->major = NBD_MAJOR;
diff --git a/include/linux/nbd.h b/include/linux/nbd.h
index 090e210..f95d51f 100644
--- a/include/linux/nbd.h
+++ b/include/linux/nbd.h
@@ -37,18 +37,26 @@ enum {
 /* userspace doesn't need the nbd_device structure */
 #ifdef __KERNEL__
 
+#include <linux/wait.h>
+
 /* values for flags field */
 #define NBD_READ_ONLY 0x0001
 #define NBD_WRITE_NOCHK 0x0002
 
+struct request;
+
 struct nbd_device {
 	int flags;
 	int harderror;		/* Code of hard error			*/
 	struct socket * sock;
 	struct file * file; 	/* If == NULL, device is not ready, yet	*/
 	int magic;
+
 	spinlock_t queue_lock;
 	struct list_head queue_head;/* Requests are added here...	*/
+	struct request *active_req;
+	wait_queue_head_t active_wq;
+
 	struct semaphore tx_lock;
 	struct gendisk *disk;
 	int blksize;
-- 
cgit v1.1


From 1f1e030bf75774b6a283518e1534d598e14147d4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Fri, 6 Jan 2006 00:09:49 -0800
Subject: [PATCH] knfsd: fix hash function for IP addresses on 64bit
 little-endian machines.

The hash.h hash_long function, when used on a 64 bit machine, ignores many
of the middle-order bits.  (The prime chosen is too bit-sparse).

IP addresses for clients of an NFS server are very likely to differ only in
the low-order bits.  As addresses are stored in network-byte-order, these
bits become middle-order bits in a little-endian 64bit 'long', and so do
not contribute to the hash.  Thus you can have the situation where all
clients appear on one hash chain.

So, until hash_long is fixed (or maybe forever), use a hash function that
works well on IP addresses - xor the bytes together.

Thanks to "Iozone" <capps@iozone.org> for identifying this problem.

Cc: "Iozone" <capps@iozone.org>

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 net/sunrpc/svcauth_unix.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index cac2e77..3e6c694 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -101,10 +101,22 @@ static void ip_map_put(struct cache_head *item, struct cache_detail *cd)
 	}
 }
 
+#if IP_HASHBITS == 8
+/* hash_long on a 64 bit machine is currently REALLY BAD for
+ * IP addresses in reverse-endian (i.e. on a little-endian machine).
+ * So use a trivial but reliable hash instead
+ */
+static inline int hash_ip(unsigned long ip)
+{
+	int hash = ip ^ (ip>>16);
+	return (hash ^ (hash>>8)) & 0xff;
+}
+#endif
+
 static inline int ip_map_hash(struct ip_map *item)
 {
 	return hash_str(item->m_class, IP_HASHBITS) ^ 
-		hash_long((unsigned long)item->m_addr.s_addr, IP_HASHBITS);
+		hash_ip((unsigned long)item->m_addr.s_addr);
 }
 static inline int ip_map_match(struct ip_map *item, struct ip_map *tmp)
 {
-- 
cgit v1.1


From 817c41d76e9eaf72044268b0e545a547abadc0bb Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 6 Jan 2006 00:09:50 -0800
Subject: [PATCH] alpha: dma_map_page() fix

Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Richard Henderson <rth@twiddle.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-alpha/dma-mapping.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/asm-alpha/dma-mapping.h b/include/asm-alpha/dma-mapping.h
index 680f7ec..9dc7256 100644
--- a/include/asm-alpha/dma-mapping.h
+++ b/include/asm-alpha/dma-mapping.h
@@ -16,7 +16,7 @@
 #define dma_free_coherent(dev, size, va, addr)		\
 		pci_free_consistent(alpha_gendev_to_pci(dev), size, va, addr)
 #define dma_map_page(dev, page, off, size, dir)		\
-		pci_map_single(alpha_gendev_to_pci(dev), page, off, size, dir)
+		pci_map_page(alpha_gendev_to_pci(dev), page, off, size, dir)
 #define dma_unmap_page(dev, addr, size, dir)		\
 		pci_unmap_page(alpha_gendev_to_pci(dev), addr, size, dir)
 #define dma_map_sg(dev, sg, nents, dir)			\
-- 
cgit v1.1


From a576219aca70e6700705a9836e098dbecd25fb56 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 6 Jan 2006 00:09:50 -0800
Subject: [PATCH] swsusp: resume_store() retval fix

- This function returns -EINVAL all the time.  Fix.

- Decruftify it a bit too.

- Writing to it doesn't seem to do what it's supposed to do.

Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 027322a..4d944b2 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -363,30 +363,28 @@ static ssize_t resume_show(struct subsystem * subsys, char *buf)
 		       MINOR(swsusp_resume_device));
 }
 
-static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t n)
+static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
 {
-	int len;
-	char *p;
 	unsigned int maj, min;
-	int error = -EINVAL;
 	dev_t res;
+	int ret = -EINVAL;
 
-	p = memchr(buf, '\n', n);
-	len = p ? p - buf : n;
+	if (sscanf(buf, "%u:%u", &maj, &min) != 2)
+		goto out;
 
-	if (sscanf(buf, "%u:%u", &maj, &min) == 2) {
-		res = MKDEV(maj,min);
-		if (maj == MAJOR(res) && min == MINOR(res)) {
-			down(&pm_sem);
-			swsusp_resume_device = res;
-			up(&pm_sem);
-			printk("Attempting manual resume\n");
-			noresume = 0;
-			software_resume();
-		}
-	}
+	res = MKDEV(maj,min);
+	if (maj != MAJOR(res) || min != MINOR(res))
+		goto out;
 
-	return error >= 0 ? n : error;
+	down(&pm_sem);
+	swsusp_resume_device = res;
+	up(&pm_sem);
+	printk("Attempting manual resume\n");
+	noresume = 0;
+	software_resume();
+	ret = n;
+out:
+	return ret;
 }
 
 power_attr(resume);
-- 
cgit v1.1


From 47f3a867f6310d6abfa185ab12baaba7ed1d69af Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 6 Jan 2006 00:10:32 -0800
Subject: [PATCH] mm: fix __alloc_pages cpuset ALLOC_* flags

Two changes to the setting of the ALLOC_CPUSET flag in
mm/page_alloc.c:__alloc_pages()

- A bug fix - the "ignoring mins" case should not be honoring ALLOC_CPUSET.
  This case of all cases, since it is handling a request that will free up
  more memory than is asked for (exiting tasks, e.g.) should be allowed to
  escape cpuset constraints when memory is tight.

- A logic change to make it simpler.  Honor cpusets even on GFP_ATOMIC
  (!wait) requests.  With this, cpuset confinement applies to all requests
  except ALLOC_NO_WATERMARKS, so that in a subsequent cleanup patch, I can
  remove the ALLOC_CPUSET flag entirely.  Since I don't know any real reason
  this logic has to be either way, I am choosing the path of the simplest
  code.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fe14a8c..1e49dc7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -903,8 +903,7 @@ restart:
 		alloc_flags |= ALLOC_HARDER;
 	if (gfp_mask & __GFP_HIGH)
 		alloc_flags |= ALLOC_HIGH;
-	if (wait)
-		alloc_flags |= ALLOC_CPUSET;
+	alloc_flags |= ALLOC_CPUSET;
 
 	/*
 	 * Go through the zonelist again. Let __GFP_HIGH and allocations
@@ -926,7 +925,7 @@ restart:
 nofail_alloc:
 			/* go through the zonelist yet again, ignoring mins */
 			page = get_page_from_freelist(gfp_mask, order,
-				zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET);
+				zonelist, ALLOC_NO_WATERMARKS);
 			if (page)
 				goto got_pg;
 			if (gfp_mask & __GFP_NOFAIL) {
-- 
cgit v1.1


From 5ac24eefd1d89bc6aa2817741c3bd5d4205b2efd Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Fri, 6 Jan 2006 00:10:33 -0800
Subject: [PATCH] memhotplug: __add_section remove unused pgdat definition

__add_section defines an unused pointer to the zones pgdat.  Remove this
definition.  This fixes a compile warning.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory_hotplug.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f6d4af8..a918f77 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -42,7 +42,6 @@ extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
 				  int nr_pages);
 static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
 {
-	struct pglist_data *pgdat = zone->zone_pgdat;
 	int nr_pages = PAGES_PER_SECTION;
 	int ret;
 
-- 
cgit v1.1


From 98a38ebdda69f1498be4f618d8d919695c8d6352 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Fri, 6 Jan 2006 00:10:35 -0800
Subject: [PATCH] memhotplug: register_ and unregister_memory_notifier should
 be global

Both register_memory_notifier and unregister_memory_notifier are global and
declared so in linux/memory.h.  Update the HOTPLUG specific definitions to
match.  This fixes a compile warning when HOTPLUG is enabled.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/base/memory.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 7e1d077..19fe931 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -49,12 +49,12 @@ static struct kset_uevent_ops memory_uevent_ops = {
 
 static struct notifier_block *memory_chain;
 
-static int register_memory_notifier(struct notifier_block *nb)
+int register_memory_notifier(struct notifier_block *nb)
 {
         return notifier_chain_register(&memory_chain, nb);
 }
 
-static void unregister_memory_notifier(struct notifier_block *nb)
+void unregister_memory_notifier(struct notifier_block *nb)
 {
         notifier_chain_unregister(&memory_chain, nb);
 }
-- 
cgit v1.1


From 900b2b463dc6e65ec474d6880412c63c25b3aea9 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Fri, 6 Jan 2006 00:10:35 -0800
Subject: [PATCH] memhotplug: register_memory should be global

register_memory is global and declared so in linux/memory.h.  Update the
HOTPLUG specific definition to match.  This fixes a compile warning when
HOTPLUG is enabled.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/base/memory.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 19fe931..58801d7 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -62,8 +62,7 @@ void unregister_memory_notifier(struct notifier_block *nb)
 /*
  * register_memory - Setup a sysfs device for a memory block
  */
-static int
-register_memory(struct memory_block *memory, struct mem_section *section,
+int register_memory(struct memory_block *memory, struct mem_section *section,
 		struct node *root)
 {
 	int error;
-- 
cgit v1.1


From d7339071f6a8b50101d7ba327926b770f22d5d8b Mon Sep 17 00:00:00 2001
From: Hans Reiser <reiser@namesys.com>
Date: Fri, 6 Jan 2006 00:10:36 -0800
Subject: [PATCH] reiser4: vfs: add truncate_inode_pages_range()

This patch makes truncate_inode_pages_range from truncate_inode_pages.
truncate_inode_pages became a one-liner call to truncate_inode_pages_range.

Reiser4 needs truncate_inode_pages_range because it tries to keep
correspondence between the existence of metadata pointing to data pages and the
pages to which that metadata points.  So, when metadata of certain part of file
is removed from filesystem tree, only pages of corresponding range are to be
truncated.

(Needed by the madvise(MADV_REMOVE) patch)

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h |  2 ++
 mm/truncate.c      | 44 +++++++++++++++++++++++++++++++++++++-------
 2 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a06a84d..92acae9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -896,6 +896,8 @@ extern unsigned long do_brk(unsigned long, unsigned long);
 /* filemap.c */
 extern unsigned long page_unuse(struct page *);
 extern void truncate_inode_pages(struct address_space *, loff_t);
+extern void truncate_inode_pages_range(struct address_space *,
+				       loff_t lstart, loff_t lend);
 
 /* generic vm_area_ops exported for stackable file systems */
 extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *);
diff --git a/mm/truncate.c b/mm/truncate.c
index 9173ab5..7dee327 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -82,12 +82,15 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
 }
 
 /**
- * truncate_inode_pages - truncate *all* the pages from an offset
+ * truncate_inode_pages - truncate range of pages specified by start and
+ * end byte offsets
  * @mapping: mapping to truncate
  * @lstart: offset from which to truncate
+ * @lend: offset to which to truncate
  *
- * Truncate the page cache at a set offset, removing the pages that are beyond
- * that offset (and zeroing out partial pages).
+ * Truncate the page cache, removing the pages that are between
+ * specified offsets (and zeroing out partial page
+ * (if lstart is not page aligned)).
  *
  * Truncate takes two passes - the first pass is nonblocking.  It will not
  * block on page locks and it will not block on writeback.  The second pass
@@ -101,12 +104,12 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
  * We pass down the cache-hot hint to the page freeing code.  Even if the
  * mapping is large, it is probably the case that the final pages are the most
  * recently touched, and freeing happens in ascending file offset order.
- *
- * Called under (and serialised by) inode->i_sem.
  */
-void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+void truncate_inode_pages_range(struct address_space *mapping,
+				loff_t lstart, loff_t lend)
 {
 	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+	pgoff_t end;
 	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
 	struct pagevec pvec;
 	pgoff_t next;
@@ -115,13 +118,22 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
 	if (mapping->nrpages == 0)
 		return;
 
+	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
+	end = (lend >> PAGE_CACHE_SHIFT);
+
 	pagevec_init(&pvec, 0);
 	next = start;
-	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+	while (next <= end &&
+	       pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t page_index = page->index;
 
+			if (page_index > end) {
+				next = page_index;
+				break;
+			}
+
 			if (page_index > next)
 				next = page_index;
 			next++;
@@ -157,9 +169,15 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
 			next = start;
 			continue;
 		}
+		if (pvec.pages[0]->index > end) {
+			pagevec_release(&pvec);
+			break;
+		}
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
+			if (page->index > end)
+				break;
 			lock_page(page);
 			wait_on_page_writeback(page);
 			if (page->index > next)
@@ -171,7 +189,19 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
 		pagevec_release(&pvec);
 	}
 }
+EXPORT_SYMBOL(truncate_inode_pages_range);
 
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ *
+ * Called under (and serialised by) inode->i_sem.
+ */
+void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+{
+	truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
+}
 EXPORT_SYMBOL(truncate_inode_pages);
 
 /**
-- 
cgit v1.1


From f6b3ec238d12c8cc6cc71490c6e3127988460349 Mon Sep 17 00:00:00 2001
From: Badari Pulavarty <pbadari@us.ibm.com>
Date: Fri, 6 Jan 2006 00:10:38 -0800
Subject: [PATCH] madvise(MADV_REMOVE): remove pages from tmpfs shm backing
 store

Here is the patch to implement madvise(MADV_REMOVE) - which frees up a
given range of pages & its associated backing store.  Current
implementation supports only shmfs/tmpfs and other filesystems return
-ENOSYS.

"Some app allocates large tmpfs files, then when some task quits and some
client disconnect, some memory can be released.  However the only way to
release tmpfs-swap is to MADV_REMOVE". - Andrea Arcangeli

Databases want to use this feature to drop a section of their bufferpool
(shared memory segments) - without writing back to disk/swap space.

This feature is also useful for supporting hot-plug memory on UML.

Concerns raised by Andrew Morton:

- "We have no plan for holepunching!  If we _do_ have such a plan (or
  might in the future) then what would the API look like?  I think
  sys_holepunch(fd, start, len), so we should start out with that."

- Using madvise is very weird, because people will ask "why do I need to
  mmap my file before I can stick a hole in it?"

- None of the other madvise operations call into the filesystem in this
  manner.  A broad question is: is this capability an MM operation or a
  filesystem operation?  truncate, for example, is a filesystem operation
  which sometimes has MM side-effects.  madvise is an mm operation and with
  this patch, it gains FS side-effects, only they're really, really
  significant ones."

Comments:

- Andrea suggested the fs operation too but then it's more efficient to
  have it as a mm operation with fs side effects, because they don't
  immediately know fd and physical offset of the range.  It's possible to
  fixup in userland and to use the fs operation but it's more expensive,
  the vmas are already in the kernel and we can use them.

Short term plan &  Future Direction:

- We seem to need this interface only for shmfs/tmpfs files in the short
  term.  We have to add hooks into the filesystem for correctness and
  completeness.  This is what this patch does.

- In the future, plan is to support both fs and mmap apis also.  This
  also involves (other) filesystem specific functions to be implemented.

- Current patch doesn't support VM_NONLINEAR - which can be addressed in
  the future.

Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Andrea Arcangeli <andrea@suse.de>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-alpha/mman.h   |  1 +
 include/asm-arm/mman.h     |  1 +
 include/asm-arm26/mman.h   |  1 +
 include/asm-cris/mman.h    |  1 +
 include/asm-frv/mman.h     |  1 +
 include/asm-h8300/mman.h   |  1 +
 include/asm-i386/mman.h    |  1 +
 include/asm-ia64/mman.h    |  1 +
 include/asm-m32r/mman.h    |  1 +
 include/asm-m68k/mman.h    |  1 +
 include/asm-mips/mman.h    |  1 +
 include/asm-parisc/mman.h  |  1 +
 include/asm-powerpc/mman.h |  1 +
 include/asm-s390/mman.h    |  1 +
 include/asm-sh/mman.h      |  1 +
 include/asm-sparc/mman.h   |  1 +
 include/asm-sparc64/mman.h |  1 +
 include/asm-v850/mman.h    |  1 +
 include/asm-x86_64/mman.h  |  1 +
 include/asm-xtensa/mman.h  |  1 +
 include/linux/fs.h         |  1 +
 include/linux/mm.h         |  1 +
 mm/madvise.c               | 35 +++++++++++++++++++++++++++++++++++
 mm/memory.c                | 25 ++++++++++++++++++++++++-
 mm/shmem.c                 | 32 ++++++++++++++++++++++++--------
 25 files changed, 105 insertions(+), 9 deletions(-)

diff --git a/include/asm-alpha/mman.h b/include/asm-alpha/mman.h
index eb9c279..f643953 100644
--- a/include/asm-alpha/mman.h
+++ b/include/asm-alpha/mman.h
@@ -42,6 +42,7 @@
 #define MADV_WILLNEED	3		/* will need these pages */
 #define	MADV_SPACEAVAIL	5		/* ensure resources are available */
 #define MADV_DONTNEED	6		/* don't need these pages */
+#define MADV_REMOVE	7		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-arm/mman.h b/include/asm-arm/mman.h
index 8e4f69c..f0bebca 100644
--- a/include/asm-arm/mman.h
+++ b/include/asm-arm/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-arm26/mman.h b/include/asm-arm26/mman.h
index cc27b82..0ed7780 100644
--- a/include/asm-arm26/mman.h
+++ b/include/asm-arm26/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-cris/mman.h b/include/asm-cris/mman.h
index 8570e72..5a382b8 100644
--- a/include/asm-cris/mman.h
+++ b/include/asm-cris/mman.h
@@ -37,6 +37,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-frv/mman.h b/include/asm-frv/mman.h
index c684720..8af4a41 100644
--- a/include/asm-frv/mman.h
+++ b/include/asm-frv/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-h8300/mman.h b/include/asm-h8300/mman.h
index 63f727a..744a8fb 100644
--- a/include/asm-h8300/mman.h
+++ b/include/asm-h8300/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-i386/mman.h b/include/asm-i386/mman.h
index 196619a..ba4941e 100644
--- a/include/asm-i386/mman.h
+++ b/include/asm-i386/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-ia64/mman.h b/include/asm-ia64/mman.h
index 1c0a73a..828beb2 100644
--- a/include/asm-ia64/mman.h
+++ b/include/asm-ia64/mman.h
@@ -43,6 +43,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-m32r/mman.h b/include/asm-m32r/mman.h
index 011f6d9..12e2974 100644
--- a/include/asm-m32r/mman.h
+++ b/include/asm-m32r/mman.h
@@ -37,6 +37,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-m68k/mman.h b/include/asm-m68k/mman.h
index f831c4e..ea262ab 100644
--- a/include/asm-m68k/mman.h
+++ b/include/asm-m68k/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-mips/mman.h b/include/asm-mips/mman.h
index 6206095..dd17c8b 100644
--- a/include/asm-mips/mman.h
+++ b/include/asm-mips/mman.h
@@ -65,6 +65,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON       MAP_ANONYMOUS
diff --git a/include/asm-parisc/mman.h b/include/asm-parisc/mman.h
index e829607..736b0ab 100644
--- a/include/asm-parisc/mman.h
+++ b/include/asm-parisc/mman.h
@@ -38,6 +38,7 @@
 #define MADV_SPACEAVAIL 5               /* insure that resources are reserved */
 #define MADV_VPS_PURGE  6               /* Purge pages from VM page cache */
 #define MADV_VPS_INHERIT 7              /* Inherit parents page size */
+#define MADV_REMOVE     8		/* remove these pages & resources */
 
 /* The range 12-64 is reserved for page size specification. */
 #define MADV_4K_PAGES   12              /* Use 4K pages  */
diff --git a/include/asm-powerpc/mman.h b/include/asm-powerpc/mman.h
index f5e5342..a2e34c2 100644
--- a/include/asm-powerpc/mman.h
+++ b/include/asm-powerpc/mman.h
@@ -44,6 +44,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-s390/mman.h b/include/asm-s390/mman.h
index ea86bd1..c8d5409 100644
--- a/include/asm-s390/mman.h
+++ b/include/asm-s390/mman.h
@@ -43,6 +43,7 @@
 #define MADV_SEQUENTIAL        0x2             /* read-ahead aggressively */
 #define MADV_WILLNEED  0x3              /* pre-fault pages */
 #define MADV_DONTNEED  0x4              /* discard these pages */
+#define MADV_REMOVE    0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-sh/mman.h b/include/asm-sh/mman.h
index 3ebab5f..693bd55 100644
--- a/include/asm-sh/mman.h
+++ b/include/asm-sh/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-sparc/mman.h b/include/asm-sparc/mman.h
index 138eb81..98435ad 100644
--- a/include/asm-sparc/mman.h
+++ b/include/asm-sparc/mman.h
@@ -54,6 +54,7 @@
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
 #define MADV_FREE	0x5		/* (Solaris) contents can be freed */
+#define MADV_REMOVE	0x6		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-sparc64/mman.h b/include/asm-sparc64/mman.h
index 01cecf5..cb4b615 100644
--- a/include/asm-sparc64/mman.h
+++ b/include/asm-sparc64/mman.h
@@ -54,6 +54,7 @@
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
 #define MADV_FREE	0x5		/* (Solaris) contents can be freed */
+#define MADV_REMOVE	0x6		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-v850/mman.h b/include/asm-v850/mman.h
index e2b9008..edc7996 100644
--- a/include/asm-v850/mman.h
+++ b/include/asm-v850/mman.h
@@ -32,6 +32,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-x86_64/mman.h b/include/asm-x86_64/mman.h
index 78e60a4..d0e97b7 100644
--- a/include/asm-x86_64/mman.h
+++ b/include/asm-x86_64/mman.h
@@ -36,6 +36,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-xtensa/mman.h b/include/asm-xtensa/mman.h
index 9a95a45..082a750 100644
--- a/include/asm-xtensa/mman.h
+++ b/include/asm-xtensa/mman.h
@@ -72,6 +72,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON       MAP_ANONYMOUS
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ed9a41a..115e72b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1050,6 +1050,7 @@ struct inode_operations {
 	ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*removexattr) (struct dentry *, const char *);
+	void (*truncate_range)(struct inode *, loff_t, loff_t);
 };
 
 struct seq_file;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 92acae9..6c9be99 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -690,6 +690,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
 }
 
 extern int vmtruncate(struct inode * inode, loff_t offset);
+extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
 extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
 extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
 extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
diff --git a/mm/madvise.c b/mm/madvise.c
index 2b7cf04..ae0ae3e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -140,6 +140,36 @@ static long madvise_dontneed(struct vm_area_struct * vma,
 	return 0;
 }
 
+/*
+ * Application wants to free up the pages and associated backing store.
+ * This is effectively punching a hole into the middle of a file.
+ *
+ * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
+ * Other filesystems return -ENOSYS.
+ */
+static long madvise_remove(struct vm_area_struct *vma,
+				unsigned long start, unsigned long end)
+{
+	struct address_space *mapping;
+        loff_t offset, endoff;
+
+	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
+		return -EINVAL;
+
+	if (!vma->vm_file || !vma->vm_file->f_mapping
+		|| !vma->vm_file->f_mapping->host) {
+			return -EINVAL;
+	}
+
+	mapping = vma->vm_file->f_mapping;
+
+	offset = (loff_t)(start - vma->vm_start)
+			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+	endoff = (loff_t)(end - vma->vm_start - 1)
+			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+	return  vmtruncate_range(mapping->host, offset, endoff);
+}
+
 static long
 madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		unsigned long start, unsigned long end, int behavior)
@@ -152,6 +182,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	case MADV_RANDOM:
 		error = madvise_behavior(vma, prev, start, end, behavior);
 		break;
+	case MADV_REMOVE:
+		error = madvise_remove(vma, start, end);
+		break;
 
 	case MADV_WILLNEED:
 		error = madvise_willneed(vma, prev, start, end);
@@ -190,6 +223,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
  *		some pages ahead.
  *  MADV_DONTNEED - the application is finished with the given range,
  *		so the kernel can free resources associated with it.
+ *  MADV_REMOVE - the application wants to free up the given range of
+ *		pages and associated backing store.
  *
  * return values:
  *  zero    - success
diff --git a/mm/memory.c b/mm/memory.c
index d8dde07..e249088 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1770,9 +1770,32 @@ out_big:
 out_busy:
 	return -ETXTBSY;
 }
-
 EXPORT_SYMBOL(vmtruncate);
 
+int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
+{
+	struct address_space *mapping = inode->i_mapping;
+
+	/*
+	 * If the underlying filesystem is not going to provide
+	 * a way to truncate a range of blocks (punch a hole) -
+	 * we should return failure right now.
+	 */
+	if (!inode->i_op || !inode->i_op->truncate_range)
+		return -ENOSYS;
+
+	down(&inode->i_sem);
+	down_write(&inode->i_alloc_sem);
+	unmap_mapping_range(mapping, offset, (end - offset), 1);
+	truncate_inode_pages_range(mapping, offset, end);
+	inode->i_op->truncate_range(inode, offset, end);
+	up_write(&inode->i_alloc_sem);
+	up(&inode->i_sem);
+
+	return 0;
+}
+EXPORT_SYMBOL(vmtruncate_range);
+
 /* 
  * Primitive swap readahead code. We simply read an aligned block of
  * (1 << page_cluster) entries in the swap area. This method is chosen
diff --git a/mm/shmem.c b/mm/shmem.c
index d9fc277..65c148e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -457,7 +457,7 @@ static void shmem_free_pages(struct list_head *next)
 	} while (next);
 }
 
-static void shmem_truncate(struct inode *inode)
+static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	unsigned long idx;
@@ -475,18 +475,27 @@ static void shmem_truncate(struct inode *inode)
 	long nr_swaps_freed = 0;
 	int offset;
 	int freed;
+	int punch_hole = 0;
 
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
-	idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (idx >= info->next_index)
 		return;
 
 	spin_lock(&info->lock);
 	info->flags |= SHMEM_TRUNCATE;
-	limit = info->next_index;
-	info->next_index = idx;
+	if (likely(end == (loff_t) -1)) {
+		limit = info->next_index;
+		info->next_index = idx;
+	} else {
+		limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+		if (limit > info->next_index)
+			limit = info->next_index;
+		punch_hole = 1;
+	}
+
 	topdir = info->i_indirect;
-	if (topdir && idx <= SHMEM_NR_DIRECT) {
+	if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
 		info->i_indirect = NULL;
 		nr_pages_to_free++;
 		list_add(&topdir->lru, &pages_to_free);
@@ -573,11 +582,12 @@ static void shmem_truncate(struct inode *inode)
 			set_page_private(subdir, page_private(subdir) - freed);
 			if (offset)
 				spin_unlock(&info->lock);
-			BUG_ON(page_private(subdir) > offset);
+			if (!punch_hole)
+				BUG_ON(page_private(subdir) > offset);
 		}
 		if (offset)
 			offset = 0;
-		else if (subdir) {
+		else if (subdir && !page_private(subdir)) {
 			dir[diroff] = NULL;
 			nr_pages_to_free++;
 			list_add(&subdir->lru, &pages_to_free);
@@ -594,7 +604,7 @@ done2:
 		 * Also, though shmem_getpage checks i_size before adding to
 		 * cache, no recheck after: so fix the narrow window there too.
 		 */
-		truncate_inode_pages(inode->i_mapping, inode->i_size);
+		truncate_inode_pages_range(inode->i_mapping, start, end);
 	}
 
 	spin_lock(&info->lock);
@@ -614,6 +624,11 @@ done2:
 	}
 }
 
+static void shmem_truncate(struct inode *inode)
+{
+	shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
+}
+
 static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
@@ -2083,6 +2098,7 @@ static struct file_operations shmem_file_operations = {
 static struct inode_operations shmem_inode_operations = {
 	.truncate	= shmem_truncate,
 	.setattr	= shmem_notify_change,
+	.truncate_range	= shmem_truncate_range,
 };
 
 static struct inode_operations shmem_dir_inode_operations = {
-- 
cgit v1.1


From f0916794f00be44154102dedaeafe68b743078a2 Mon Sep 17 00:00:00 2001
From: Adam Litke <agl@us.ibm.com>
Date: Fri, 6 Jan 2006 00:10:40 -0800
Subject: [PATCH] Hugetlb: Remove duplicate i_size check

cleanup

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Adam Litke <agl@us.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "Seth, Rohit" <rohit.seth@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/hugetlb.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3e52df7..acb8641 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -350,19 +350,12 @@ static struct page *find_lock_huge_page(struct address_space *mapping,
 {
 	struct page *page;
 	int err;
-	struct inode *inode = mapping->host;
-	unsigned long size;
 
 retry:
 	page = find_lock_page(mapping, idx);
 	if (page)
 		goto out;
 
-	/* Check to make sure the mapping hasn't been truncated */
-	size = i_size_read(inode) >> HPAGE_SHIFT;
-	if (idx >= size)
-		goto out;
-
 	if (hugetlb_get_quota(mapping))
 		goto out;
 	page = alloc_huge_page();
-- 
cgit v1.1


From 85ef47f74afe96c8c23eaa605f28cc01443c905f Mon Sep 17 00:00:00 2001
From: Adam Litke <agl@us.ibm.com>
Date: Fri, 6 Jan 2006 00:10:42 -0800
Subject: [PATCH] Hugetlb: Rename find_lock_page to find_or_alloc_huge_page

find_lock_huge_page() isn't a great name, since it does extra things not
analogous to find_lock_page().  Rename it find_or_alloc_huge_page() which is
closer to the mark.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Adam Litke <agl@us.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "Seth, Rohit" <rohit.seth@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/hugetlb.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index acb8641..fdbbbb9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -345,8 +345,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	flush_tlb_range(vma, start, end);
 }
 
-static struct page *find_lock_huge_page(struct address_space *mapping,
-			unsigned long idx)
+static struct page *find_or_alloc_huge_page(struct address_space *mapping,
+						unsigned long idx)
 {
 	struct page *page;
 	int err;
@@ -398,7 +398,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Use page lock to guard against racing truncation
 	 * before we get page_table_lock.
 	 */
-	page = find_lock_huge_page(mapping, idx);
+	page = find_or_alloc_huge_page(mapping, idx);
 	if (!page)
 		goto out;
 
-- 
cgit v1.1


From 86e5216f8d8aa258ba836caffe2613d79cc9aead Mon Sep 17 00:00:00 2001
From: Adam Litke <agl@us.ibm.com>
Date: Fri, 6 Jan 2006 00:10:43 -0800
Subject: [PATCH] Hugetlb: Reorganize hugetlb_fault to prepare for COW

This patch splits the "no_page()" type activity into its own function,
hugetlb_no_page().  hugetlb_fault() becomes the entry point for hugetlb faults
and delegates to the appropriate handler depending on the type of fault.
Right now we still have only hugetlb_no_page() but a later patch introduces a
COW fault.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Adam Litke <agl@us.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "Seth, Rohit" <rohit.seth@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/hugetlb.c | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fdbbbb9..cf82251 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -376,20 +376,15 @@ out:
 	return page;
 }
 
-int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long address, int write_access)
+int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pte_t *ptep)
 {
 	int ret = VM_FAULT_SIGBUS;
 	unsigned long idx;
 	unsigned long size;
-	pte_t *pte;
 	struct page *page;
 	struct address_space *mapping;
 
-	pte = huge_pte_alloc(mm, address);
-	if (!pte)
-		goto out;
-
 	mapping = vma->vm_file->f_mapping;
 	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
 		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
@@ -408,11 +403,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto backout;
 
 	ret = VM_FAULT_MINOR;
-	if (!pte_none(*pte))
+	if (!pte_none(*ptep))
 		goto backout;
 
 	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
-	set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
+	set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, page));
 	spin_unlock(&mm->page_table_lock);
 	unlock_page(page);
 out:
@@ -426,6 +421,27 @@ backout:
 	goto out;
 }
 
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, int write_access)
+{
+	pte_t *ptep;
+	pte_t entry;
+
+	ptep = huge_pte_alloc(mm, address);
+	if (!ptep)
+		return VM_FAULT_OOM;
+
+	entry = *ptep;
+	if (pte_none(entry))
+		return hugetlb_no_page(mm, vma, address, ptep);
+
+	/*
+	 * We could get here if another thread instantiated the pte
+	 * before the test above.
+	 */
+	return VM_FAULT_MINOR;
+}
+
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			struct page **pages, struct vm_area_struct **vmas,
 			unsigned long *position, int *length, int i)
-- 
cgit v1.1


From 1e8f889b10d8d2223105719e36ce45688fedbd59 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 6 Jan 2006 00:10:44 -0800
Subject: [PATCH] Hugetlb: Copy on Write support

Implement copy-on-write support for hugetlb mappings so MAP_PRIVATE can be
supported.  This helps us to safely use hugetlb pages in many more
applications.  The patch makes the following changes.  If needed, I also have
it broken out according to the following paragraphs.

1. Add a pair of functions to set/clear write access on huge ptes.  The
   writable check in make_huge_pte is moved out to the caller for use by COW
   later.

2. Hugetlb copy-on-write requires special case handling in the following
   situations:

   - copy_hugetlb_page_range() - Copied pages must be write protected so
     a COW fault will be triggered (if necessary) if those pages are written
     to.

   - find_or_alloc_huge_page() - Only MAP_SHARED pages are added to the
     page cache.  MAP_PRIVATE pages still need to be locked however.

3. Provide hugetlb_cow() and calls from hugetlb_fault() and
   hugetlb_no_page() which handles the COW fault by making the actual copy.

4. Remove the check in hugetlbfs_file_map() so that MAP_PRIVATE mmaps
   will be allowed.  Make MAP_HUGETLB exempt from the deprecated VM_RESERVED
   mapping check.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Adam Litke <agl@us.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "Seth, Rohit" <rohit.seth@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/hugetlbfs/inode.c |   3 --
 mm/hugetlb.c         | 127 +++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 108 insertions(+), 22 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8c1cef3..8c41315 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -100,9 +100,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	loff_t len, vma_len;
 	int ret;
 
-	if ((vma->vm_flags & (VM_MAYSHARE | VM_WRITE)) == VM_WRITE)
-		return -EINVAL;
-
 	if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1))
 		return -EINVAL;
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cf82251..da8a211 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -261,11 +261,12 @@ struct vm_operations_struct hugetlb_vm_ops = {
 	.nopage = hugetlb_nopage,
 };
 
-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+				int writable)
 {
 	pte_t entry;
 
-	if (vma->vm_flags & VM_WRITE) {
+	if (writable) {
 		entry =
 		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 	} else {
@@ -277,12 +278,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
 	return entry;
 }
 
+static void set_huge_ptep_writable(struct vm_area_struct *vma,
+				   unsigned long address, pte_t *ptep)
+{
+	pte_t entry;
+
+	entry = pte_mkwrite(pte_mkdirty(*ptep));
+	ptep_set_access_flags(vma, address, ptep, entry, 1);
+	update_mmu_cache(vma, address, entry);
+	lazy_mmu_prot_update(entry);
+}
+
+
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			    struct vm_area_struct *vma)
 {
 	pte_t *src_pte, *dst_pte, entry;
 	struct page *ptepage;
 	unsigned long addr;
+	int cow;
+
+	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
 		src_pte = huge_pte_offset(src, addr);
@@ -294,6 +310,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		spin_lock(&dst->page_table_lock);
 		spin_lock(&src->page_table_lock);
 		if (!pte_none(*src_pte)) {
+			if (cow)
+				ptep_set_wrprotect(src, addr, src_pte);
 			entry = *src_pte;
 			ptepage = pte_page(entry);
 			get_page(ptepage);
@@ -346,7 +364,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 }
 
 static struct page *find_or_alloc_huge_page(struct address_space *mapping,
-						unsigned long idx)
+				unsigned long idx, int shared)
 {
 	struct page *page;
 	int err;
@@ -364,26 +382,80 @@ retry:
 		goto out;
 	}
 
-	err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
-	if (err) {
-		put_page(page);
-		hugetlb_put_quota(mapping);
-		if (err == -EEXIST)
-			goto retry;
-		page = NULL;
+	if (shared) {
+		err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+		if (err) {
+			put_page(page);
+			hugetlb_put_quota(mapping);
+			if (err == -EEXIST)
+				goto retry;
+			page = NULL;
+		}
+	} else {
+		/* Caller expects a locked page */
+		lock_page(page);
 	}
 out:
 	return page;
 }
 
+static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pte_t *ptep, pte_t pte)
+{
+	struct page *old_page, *new_page;
+	int i, avoidcopy;
+
+	old_page = pte_page(pte);
+
+	/* If no-one else is actually using this page, avoid the copy
+	 * and just make the page writable */
+	avoidcopy = (page_count(old_page) == 1);
+	if (avoidcopy) {
+		set_huge_ptep_writable(vma, address, ptep);
+		return VM_FAULT_MINOR;
+	}
+
+	page_cache_get(old_page);
+	new_page = alloc_huge_page();
+
+	if (!new_page) {
+		page_cache_release(old_page);
+
+		/* Logically this is OOM, not a SIGBUS, but an OOM
+		 * could cause the kernel to go killing other
+		 * processes which won't help the hugepage situation
+		 * at all (?) */
+		return VM_FAULT_SIGBUS;
+	}
+
+	spin_unlock(&mm->page_table_lock);
+	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
+		copy_user_highpage(new_page + i, old_page + i,
+				   address + i*PAGE_SIZE);
+	spin_lock(&mm->page_table_lock);
+
+	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
+	if (likely(pte_same(*ptep, pte))) {
+		/* Break COW */
+		set_huge_pte_at(mm, address, ptep,
+				make_huge_pte(vma, new_page, 1));
+		/* Make the old page be freed below */
+		new_page = old_page;
+	}
+	page_cache_release(new_page);
+	page_cache_release(old_page);
+	return VM_FAULT_MINOR;
+}
+
 int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long address, pte_t *ptep)
+			unsigned long address, pte_t *ptep, int write_access)
 {
 	int ret = VM_FAULT_SIGBUS;
 	unsigned long idx;
 	unsigned long size;
 	struct page *page;
 	struct address_space *mapping;
+	pte_t new_pte;
 
 	mapping = vma->vm_file->f_mapping;
 	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -393,10 +465,13 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Use page lock to guard against racing truncation
 	 * before we get page_table_lock.
 	 */
-	page = find_or_alloc_huge_page(mapping, idx);
+	page = find_or_alloc_huge_page(mapping, idx,
+			vma->vm_flags & VM_SHARED);
 	if (!page)
 		goto out;
 
+	BUG_ON(!PageLocked(page));
+
 	spin_lock(&mm->page_table_lock);
 	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
 	if (idx >= size)
@@ -407,7 +482,15 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto backout;
 
 	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
-	set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, page));
+	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+				&& (vma->vm_flags & VM_SHARED)));
+	set_huge_pte_at(mm, address, ptep, new_pte);
+
+	if (write_access && !(vma->vm_flags & VM_SHARED)) {
+		/* Optimization, do the COW without a second fault */
+		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+	}
+
 	spin_unlock(&mm->page_table_lock);
 	unlock_page(page);
 out:
@@ -426,6 +509,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	pte_t *ptep;
 	pte_t entry;
+	int ret;
 
 	ptep = huge_pte_alloc(mm, address);
 	if (!ptep)
@@ -433,13 +517,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	entry = *ptep;
 	if (pte_none(entry))
-		return hugetlb_no_page(mm, vma, address, ptep);
+		return hugetlb_no_page(mm, vma, address, ptep, write_access);
 
-	/*
-	 * We could get here if another thread instantiated the pte
-	 * before the test above.
-	 */
-	return VM_FAULT_MINOR;
+	ret = VM_FAULT_MINOR;
+
+	spin_lock(&mm->page_table_lock);
+	/* Check for a racing update before calling hugetlb_cow */
+	if (likely(pte_same(entry, *ptep)))
+		if (write_access && !pte_write(entry))
+			ret = hugetlb_cow(mm, vma, address, ptep, entry);
+	spin_unlock(&mm->page_table_lock);
+
+	return ret;
 }
 
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
-- 
cgit v1.1


From 96df9333c94d7d5aeceb21f6c5e7ae8ff34753cf Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Fri, 6 Jan 2006 00:10:45 -0800
Subject: [PATCH] mm: dequeue a huge page near to this node

This was discussed at
http://marc.theaimsgroup.com/?l=linux-kernel&m=113166526217117&w=2

This patch changes the dequeueing to select a huge page near the node
executing instead of always beginning to check for free nodes from node 0.
This will result in a placement of the huge pages near the executing
processor improving performance.

The existing implementation can place the huge pages far away from the
executing processor causing significant degradation of performance.  The
search starting from zero also means that the lower zones quickly run out
of memory.  Selecting a huge page near the process distributes the huge
pages better.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/hugetlb.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index da8a211..e93bd63 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -40,14 +40,16 @@ static struct page *dequeue_huge_page(void)
 {
 	int nid = numa_node_id();
 	struct page *page = NULL;
+	struct zonelist *zonelist = NODE_DATA(nid)->node_zonelists;
+	struct zone **z;
 
-	if (list_empty(&hugepage_freelists[nid])) {
-		for (nid = 0; nid < MAX_NUMNODES; ++nid)
-			if (!list_empty(&hugepage_freelists[nid]))
-				break;
+	for (z = zonelist->zones; *z; z++) {
+		nid = (*z)->zone_pgdat->node_id;
+		if (!list_empty(&hugepage_freelists[nid]))
+			break;
 	}
-	if (nid >= 0 && nid < MAX_NUMNODES &&
-	    !list_empty(&hugepage_freelists[nid])) {
+
+	if (*z) {
 		page = list_entry(hugepage_freelists[nid].next,
 				  struct page, lru);
 		list_del(&page->lru);
-- 
cgit v1.1


From 5da7ca86078964cbfe6c83efc1205904587706fe Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Fri, 6 Jan 2006 00:10:46 -0800
Subject: [PATCH] Add NUMA policy support for huge pages.

The huge_zonelist() function in the memory policy layer provides a list of
zones ordered by NUMA distance.  The hugetlb layer will walk that list looking
for a zone that has available huge pages but is also in the nodeset of the
current cpuset.

This patch does not contain the folding of find_or_alloc_huge_page() that was
controversial in the earlier discussion.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Acked-by: William Lee Irwin III <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hugetlb.h   |  4 ++--
 include/linux/mempolicy.h |  8 ++++++++
 mm/hugetlb.c              | 24 ++++++++++++++----------
 mm/mempolicy.c            | 39 ++++++++++++++++++++++++++++++---------
 4 files changed, 54 insertions(+), 21 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 1056717..68d82ad 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -22,7 +22,7 @@ int hugetlb_report_meminfo(char *);
 int hugetlb_report_node_meminfo(int, char *);
 int is_hugepage_mem_enough(size_t);
 unsigned long hugetlb_total_pages(void);
-struct page *alloc_huge_page(void);
+struct page *alloc_huge_page(struct vm_area_struct *, unsigned long);
 void free_huge_page(struct page *);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access);
@@ -97,7 +97,7 @@ static inline unsigned long hugetlb_total_pages(void)
 #define is_hugepage_only_range(mm, addr, len)	0
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
 						do { } while (0)
-#define alloc_huge_page()			({ NULL; })
+#define alloc_huge_page(vma, addr)		({ NULL; })
 #define free_huge_page(p)			({ (void)(p); BUG(); })
 #define hugetlb_fault(mm, vma, addr, write)	({ BUG(); 0; })
 
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 8b67cf8..817db64 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -156,6 +156,8 @@ extern void numa_default_policy(void);
 extern void numa_policy_init(void);
 extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new);
 extern struct mempolicy default_policy;
+extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
+		unsigned long addr);
 
 #else
 
@@ -232,6 +234,12 @@ static inline void numa_policy_rebind(const nodemask_t *old,
 {
 }
 
+static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
+		unsigned long addr)
+{
+	return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+}
+
 #endif /* CONFIG_NUMA */
 #endif /* __KERNEL__ */
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e93bd63..eb40556 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -11,6 +11,8 @@
 #include <linux/highmem.h>
 #include <linux/nodemask.h>
 #include <linux/pagemap.h>
+#include <linux/mempolicy.h>
+
 #include <asm/page.h>
 #include <asm/pgtable.h>
 
@@ -36,11 +38,12 @@ static void enqueue_huge_page(struct page *page)
 	free_huge_pages_node[nid]++;
 }
 
-static struct page *dequeue_huge_page(void)
+static struct page *dequeue_huge_page(struct vm_area_struct *vma,
+				unsigned long address)
 {
 	int nid = numa_node_id();
 	struct page *page = NULL;
-	struct zonelist *zonelist = NODE_DATA(nid)->node_zonelists;
+	struct zonelist *zonelist = huge_zonelist(vma, address);
 	struct zone **z;
 
 	for (z = zonelist->zones; *z; z++) {
@@ -87,13 +90,13 @@ void free_huge_page(struct page *page)
 	spin_unlock(&hugetlb_lock);
 }
 
-struct page *alloc_huge_page(void)
+struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 {
 	struct page *page;
 	int i;
 
 	spin_lock(&hugetlb_lock);
-	page = dequeue_huge_page();
+	page = dequeue_huge_page(vma, addr);
 	if (!page) {
 		spin_unlock(&hugetlb_lock);
 		return NULL;
@@ -196,7 +199,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
 	spin_lock(&hugetlb_lock);
 	try_to_free_low(count);
 	while (count < nr_huge_pages) {
-		struct page *page = dequeue_huge_page();
+		struct page *page = dequeue_huge_page(NULL, 0);
 		if (!page)
 			break;
 		update_and_free_page(page);
@@ -365,8 +368,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	flush_tlb_range(vma, start, end);
 }
 
-static struct page *find_or_alloc_huge_page(struct address_space *mapping,
-				unsigned long idx, int shared)
+static struct page *find_or_alloc_huge_page(struct vm_area_struct *vma,
+			unsigned long addr, struct address_space *mapping,
+			unsigned long idx, int shared)
 {
 	struct page *page;
 	int err;
@@ -378,7 +382,7 @@ retry:
 
 	if (hugetlb_get_quota(mapping))
 		goto out;
-	page = alloc_huge_page();
+	page = alloc_huge_page(vma, addr);
 	if (!page) {
 		hugetlb_put_quota(mapping);
 		goto out;
@@ -418,7 +422,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	page_cache_get(old_page);
-	new_page = alloc_huge_page();
+	new_page = alloc_huge_page(vma, address);
 
 	if (!new_page) {
 		page_cache_release(old_page);
@@ -467,7 +471,7 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Use page lock to guard against racing truncation
 	 * before we get page_table_lock.
 	 */
-	page = find_or_alloc_huge_page(mapping, idx,
+	page = find_or_alloc_huge_page(vma, address, mapping, idx,
 			vma->vm_flags & VM_SHARED);
 	if (!page)
 		goto out;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 72f402c..45c51ac 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -785,6 +785,34 @@ static unsigned offset_il_node(struct mempolicy *pol,
 	return nid;
 }
 
+/* Determine a node number for interleave */
+static inline unsigned interleave_nid(struct mempolicy *pol,
+		 struct vm_area_struct *vma, unsigned long addr, int shift)
+{
+	if (vma) {
+		unsigned long off;
+
+		off = vma->vm_pgoff;
+		off += (addr - vma->vm_start) >> shift;
+		return offset_il_node(pol, vma, off);
+	} else
+		return interleave_nodes(pol);
+}
+
+/* Return a zonelist suitable for a huge page allocation. */
+struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
+{
+	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+
+	if (pol->policy == MPOL_INTERLEAVE) {
+		unsigned nid;
+
+		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
+		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+	}
+	return zonelist_policy(GFP_HIGHUSER, pol);
+}
+
 /* Allocate a page in interleaved policy.
    Own path because it needs to do special accounting. */
 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
@@ -833,15 +861,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 
 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
 		unsigned nid;
-		if (vma) {
-			unsigned long off;
-			off = vma->vm_pgoff;
-			off += (addr - vma->vm_start) >> PAGE_SHIFT;
-			nid = offset_il_node(pol, vma, off);
-		} else {
-			/* fall back to process interleaving */
-			nid = interleave_nodes(pol);
-		}
+
+		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
 		return alloc_page_interleave(gfp, 0, nid);
 	}
 	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
-- 
cgit v1.1


From 21abb1478a87e26f5fa71dbcb7cf4264272c2248 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Fri, 6 Jan 2006 00:10:47 -0800
Subject: [PATCH] Remove old node based policy interface from mempolicy.c

mempolicy.c contains a provisional interface for huge page allocation based on
node numbers.  This is in use in SLES9 but was never used (AFAIK) in upstream
versions of Linux.

Huge page allocations now use zonelists to figure out where to allocate pages.
 The use of zonelists allows us to find the closest hugepage which was the
consideration of the NUMA distance for huge page allocations.

Remove the obsolete functions.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Acked-by: William Lee Irwin III <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h | 19 -------------------
 mm/mempolicy.c            | 48 -----------------------------------------------
 2 files changed, 67 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 817db64..b972f98 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -110,14 +110,6 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
 #define mpol_set_vma_default(vma) ((vma)->vm_policy = NULL)
 
 /*
- * Hugetlb policy. i386 hugetlb so far works with node numbers
- * instead of zone lists, so give it special interfaces for now.
- */
-extern int mpol_first_node(struct vm_area_struct *vma, unsigned long addr);
-extern int mpol_node_valid(int nid, struct vm_area_struct *vma,
-			unsigned long addr);
-
-/*
  * Tree of shared policies for a shared memory region.
  * Maintain the policies in a pseudo mm that contains vmas. The vmas
  * carry the policy. As a special twist the pseudo mm is indexed in pages, not
@@ -184,17 +176,6 @@ static inline struct mempolicy *mpol_copy(struct mempolicy *old)
 	return NULL;
 }
 
-static inline int mpol_first_node(struct vm_area_struct *vma, unsigned long a)
-{
-	return numa_node_id();
-}
-
-static inline int
-mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long a)
-{
-	return 1;
-}
-
 struct shared_policy {};
 
 static inline int mpol_set_shared_policy(struct shared_policy *info,
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 45c51ac..96714e2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -961,54 +961,6 @@ void __mpol_free(struct mempolicy *p)
 }
 
 /*
- * Hugetlb policy. Same as above, just works with node numbers instead of
- * zonelists.
- */
-
-/* Find first node suitable for an allocation */
-int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
-{
-	struct mempolicy *pol = get_vma_policy(current, vma, addr);
-
-	switch (pol->policy) {
-	case MPOL_DEFAULT:
-		return numa_node_id();
-	case MPOL_BIND:
-		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
-	case MPOL_INTERLEAVE:
-		return interleave_nodes(pol);
-	case MPOL_PREFERRED:
-		return pol->v.preferred_node >= 0 ?
-				pol->v.preferred_node : numa_node_id();
-	}
-	BUG();
-	return 0;
-}
-
-/* Find secondary valid nodes for an allocation */
-int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
-{
-	struct mempolicy *pol = get_vma_policy(current, vma, addr);
-
-	switch (pol->policy) {
-	case MPOL_PREFERRED:
-	case MPOL_DEFAULT:
-	case MPOL_INTERLEAVE:
-		return 1;
-	case MPOL_BIND: {
-		struct zone **z;
-		for (z = pol->v.zonelist->zones; *z; z++)
-			if ((*z)->zone_pgdat->node_id == nid)
-				return 1;
-		return 0;
-	}
-	default:
-		BUG();
-		return 0;
-	}
-}
-
-/*
  * Shared memory backing store policy support.
  *
  * Remember policies even when nobody has shared memory mapped.
-- 
cgit v1.1


From 6bda666a03f063968833760c5bb5c13062ab9291 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Fri, 6 Jan 2006 00:10:49 -0800
Subject: [PATCH] hugepages: fold find_or_alloc_pages into huge_no_page()

The number of parameters for find_or_alloc_huge_page() increases significantly
after policy support is added to huge pages.  Simplify the code by folding
find_or_alloc_huge_page() into hugetlb_no_page().

Adam Litke objected to this piece in an earlier patch but I think this is a
good simplification.  Diffstat shows that we can get rid of almost half of the
lines of find_or_alloc_huge_page().  If we can find no consensus then let's
simply drop this patch.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Acked-by: William Lee Irwin III <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/hugetlb.c | 66 ++++++++++++++++++++++--------------------------------------
 1 file changed, 24 insertions(+), 42 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eb40556..f4c43d7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -368,43 +368,6 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	flush_tlb_range(vma, start, end);
 }
 
-static struct page *find_or_alloc_huge_page(struct vm_area_struct *vma,
-			unsigned long addr, struct address_space *mapping,
-			unsigned long idx, int shared)
-{
-	struct page *page;
-	int err;
-
-retry:
-	page = find_lock_page(mapping, idx);
-	if (page)
-		goto out;
-
-	if (hugetlb_get_quota(mapping))
-		goto out;
-	page = alloc_huge_page(vma, addr);
-	if (!page) {
-		hugetlb_put_quota(mapping);
-		goto out;
-	}
-
-	if (shared) {
-		err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
-		if (err) {
-			put_page(page);
-			hugetlb_put_quota(mapping);
-			if (err == -EEXIST)
-				goto retry;
-			page = NULL;
-		}
-	} else {
-		/* Caller expects a locked page */
-		lock_page(page);
-	}
-out:
-	return page;
-}
-
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, pte_t pte)
 {
@@ -471,12 +434,31 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Use page lock to guard against racing truncation
 	 * before we get page_table_lock.
 	 */
-	page = find_or_alloc_huge_page(vma, address, mapping, idx,
-			vma->vm_flags & VM_SHARED);
-	if (!page)
-		goto out;
+retry:
+	page = find_lock_page(mapping, idx);
+	if (!page) {
+		if (hugetlb_get_quota(mapping))
+			goto out;
+		page = alloc_huge_page(vma, address);
+		if (!page) {
+			hugetlb_put_quota(mapping);
+			goto out;
+		}
 
-	BUG_ON(!PageLocked(page));
+		if (vma->vm_flags & VM_SHARED) {
+			int err;
+
+			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+			if (err) {
+				put_page(page);
+				hugetlb_put_quota(mapping);
+				if (err == -EEXIST)
+					goto retry;
+				goto out;
+			}
+		} else
+			lock_page(page);
+	}
 
 	spin_lock(&mm->page_table_lock);
 	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
-- 
cgit v1.1


From 9f3fd602aef96c2a490e3bfd669d06475aeba8d8 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Fri, 6 Jan 2006 00:10:50 -0800
Subject: [PATCH] mm: kvaddr_to_nid not used in common code

kvaddr_to_nid() isn't used in common code nor in i386 code.  Remove these
definitions.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/mmzone.h | 5 -----
 include/linux/mmzone.h    | 5 -----
 2 files changed, 10 deletions(-)

diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h
index 620a906..74f595d 100644
--- a/include/asm-i386/mmzone.h
+++ b/include/asm-i386/mmzone.h
@@ -76,11 +76,6 @@ static inline int pfn_to_nid(unsigned long pfn)
  * Following are macros that each numa implmentation must define.
  */
 
-/*
- * Given a kernel address, find the home node of the underlying memory.
- */
-#define kvaddr_to_nid(kaddr)	pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
-
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid)						\
 ({									\
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9f22090d..3c49f78 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -564,11 +564,6 @@ static inline int valid_section_nr(unsigned long nr)
 	return valid_section(__nr_to_section(nr));
 }
 
-/*
- * Given a kernel address, find the home node of the underlying memory.
- */
-#define kvaddr_to_nid(kaddr)	pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
-
 static inline struct mem_section *__pfn_to_section(unsigned long pfn)
 {
 	return __nr_to_section(pfn_to_section_nr(pfn));
-- 
cgit v1.1


From d5afa6dcf74c0efb60ce07c63d0a727be93c67c5 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Fri, 6 Jan 2006 00:10:50 -0800
Subject: [PATCH] mm: pfn_to_pgdat not used in common code

pfn_to_pgdat() isn't used in common code.  Remove definition.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3c49f78..28f8496 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -596,11 +596,6 @@ static inline int pfn_valid(unsigned long pfn)
 #define pfn_to_nid		early_pfn_to_nid
 #endif
 
-#define pfn_to_pgdat(pfn)						\
-({									\
-	NODE_DATA(pfn_to_nid(pfn));					\
-})
-
 #define early_pfn_valid(pfn)	pfn_valid(pfn)
 void sparse_init(void);
 #else
-- 
cgit v1.1


From a94b3ab7eab4edcc9b2cb474b188f774c331adf7 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <kravetz@us.ibm.com>
Date: Fri, 6 Jan 2006 00:10:51 -0800
Subject: [PATCH] mm: remove arch independent NODES_SPAN_OTHER_NODES

The NODES_SPAN_OTHER_NODES config option was created so that DISCONTIGMEM
could handle pSeries numa layouts.  However, support for DISCONTIGMEM has
been replaced by SPARSEMEM on powerpc.  As a result, this config option and
supporting code is no longer needed.

I have already sent a patch to Paul that removes the option from powerpc
specific code.  This removes the arch independent piece.  Doesn't really
matter which is applied first.

Signed-off-by: Mike Kravetz <kravetz@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 6 ------
 mm/page_alloc.c        | 2 --
 2 files changed, 8 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 28f8496..d294b57 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -603,12 +603,6 @@ void sparse_init(void);
 #define sparse_index_init(_sec, _nid)  do {} while (0)
 #endif /* CONFIG_SPARSEMEM */
 
-#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-#define early_pfn_in_nid(pfn, nid)	(early_pfn_to_nid(pfn) == (nid))
-#else
-#define early_pfn_in_nid(pfn, nid)	(1)
-#endif
-
 #ifndef early_pfn_valid
 #define early_pfn_valid(pfn)	(1)
 #endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1e49dc7..07825c6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1708,8 +1708,6 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 	for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
 		if (!early_pfn_valid(pfn))
 			continue;
-		if (!early_pfn_in_nid(pfn, nid))
-			continue;
 		page = pfn_to_page(pfn);
 		set_page_links(page, zone, nid, pfn);
 		set_page_count(page, 1);
-- 
cgit v1.1


From 03b00ebcc804180829d513df9e92e5fe8f72aacf Mon Sep 17 00:00:00 2001
From: Russell King <rmk@arm.linux.org.uk>
Date: Fri, 6 Jan 2006 00:10:52 -0800
Subject: [PATCH] Shut up warnings in ipc/shm.c

Fix two warnings in ipc/shm.c

ipc/shm.c:122: warning: statement with no effect
ipc/shm.c:560: warning: statement with no effect

by converting the macros to empty inline functions.  For safety, let's do
all three.  This also has the advantage that typechecking gets performed
even without CONFIG_SHMEM enabled.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6c9be99..75ec04e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -634,9 +634,24 @@ struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
 int shmem_lock(struct file *file, int lock, struct user_struct *user);
 #else
 #define shmem_nopage filemap_nopage
-#define shmem_lock(a, b, c) 	({0;})	/* always in memory, no need to lock */
-#define shmem_set_policy(a, b)	(0)
-#define shmem_get_policy(a, b)	(NULL)
+
+static inline int shmem_lock(struct file *file, int lock,
+			     struct user_struct *user)
+{
+	return 0;
+}
+
+static inline int shmem_set_policy(struct vm_area_struct *vma,
+				   struct mempolicy *new)
+{
+	return 0;
+}
+
+static inline struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
+						 unsigned long addr)
+{
+	return NULL;
+}
 #endif
 struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags);
 
-- 
cgit v1.1


From 2bdaf115b1c364d89484b59d5b937973f1c5a5c3 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Fri, 6 Jan 2006 00:10:53 -0800
Subject: [PATCH] flatmem split out memory model

There are three places we define pfn_to_nid().  Two in linux/mmzone.h and one
in asm/mmzone.h.  These in essence represent the three memory models.  The
definition in linux/mmzone.h under !NEED_MULTIPLE_NODES is both the FLATMEM
definition and the optimisation for single NUMA nodes; the one under SPARSEMEM
is the NUMA sparsemem one; the one in asm/mmzone.h under DISCONTIGMEM is the
discontigmem one.  This is not in the least bit obvious, particularly the
connection between the non-NUMA optimisations and the memory models.

Two patches:

flatmem-split-out-memory-model: simplifies the selection of pfn_to_nid()
implementations.  The selection is based primarily off the memory model
selected.  Optimisations for non-NUMA are applied where needed.

sparse-provide-pfn_to_nid: implement pfn_to_nid() for SPARSEMEM

This patch:

pfn_to_nid is memory model specific

The pfn_to_nid() call is memory model specific.  It represents the locality
identifier for the memory passed.  Classically this would be a NUMA node,
but not a chunk of memory under DISCONTIGMEM.

The SPARSEMEM and FLATMEM memory model non-NUMA versions of pfn_to_nid()
are folded together under NEED_MULTIPLE_NODES, while DISCONTIGMEM has its
own optimisation.  This is all very confusing.

This patch splits out each implementation of pfn_to_nid() so that we can
see them and the optimisations to each.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d294b57..ee9f7b7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -435,7 +435,6 @@ extern struct pglist_data contig_page_data;
 #define NODE_DATA(nid)		(&contig_page_data)
 #define NODE_MEM_MAP(nid)	mem_map
 #define MAX_NODES_SHIFT		1
-#define pfn_to_nid(pfn)		(0)
 
 #else /* CONFIG_NEED_MULTIPLE_NODES */
 
@@ -470,6 +469,10 @@ extern struct pglist_data contig_page_data;
 #define early_pfn_to_nid(nid)  (0UL)
 #endif
 
+#ifdef CONFIG_FLATMEM
+#define pfn_to_nid(pfn)		(0)
+#endif
+
 #define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT)
 #define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT)
 
@@ -594,6 +597,8 @@ static inline int pfn_valid(unsigned long pfn)
  */
 #ifdef CONFIG_NUMA
 #define pfn_to_nid		early_pfn_to_nid
+#else
+#define pfn_to_nid(pfn)		(0)
 #endif
 
 #define early_pfn_valid(pfn)	pfn_valid(pfn)
-- 
cgit v1.1


From 161599ff39a3c3cdea0a1be05ac53accd2c45cdd Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Fri, 6 Jan 2006 00:10:54 -0800
Subject: [PATCH] sparsemem: provide pfn_to_nid

Before SPARSEMEM is initialised we cannot provide an efficient pfn_to_nid()
implementation; before initialisation is complete we use early_pfn_to_nid()
to provide location information.  Until recently there was no non-init user
of this functionality.  Provide a post init pfn_to_nid() implementation.

Note that this implementation assumes that the pfn passed has been validated
with pfn_valid().  The current single user of this function already has
this check.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ee9f7b7..8cba76c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -596,7 +596,11 @@ static inline int pfn_valid(unsigned long pfn)
  * this restriction.
  */
 #ifdef CONFIG_NUMA
-#define pfn_to_nid		early_pfn_to_nid
+#define pfn_to_nid(pfn)							\
+({									\
+	unsigned long __pfn_to_nid_pfn = (pfn);				\
+	page_to_nid(pfn_to_page(__pfn_to_nid_pfn));			\
+})
 #else
 #define pfn_to_nid(pfn)		(0)
 #endif
-- 
cgit v1.1


From c484d41042e6ccb88089ca41e3b3eed1bafdae21 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Fri, 6 Jan 2006 00:10:55 -0800
Subject: [PATCH] mm: free_pages_and_swap_cache opt

Minor optimization (though it doesn't help in the PREEMPT case, severely
constrained by small ZAP_BLOCK_SIZE).  free_pages_and_swap_cache works in
chunks of 16, calling release_pages which works in chunks of PAGEVEC_SIZE.
But PAGEVEC_SIZE was dropped from 16 to 14 in 2.6.10, so we're now doing more
spin_lock_irq'ing than necessary: use PAGEVEC_SIZE throughout.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/swap_state.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0df9a57..fc2aecb 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,6 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/backing-dev.h>
+#include <linux/pagevec.h>
 
 #include <asm/pgtable.h>
 
@@ -272,12 +273,11 @@ void free_page_and_swap_cache(struct page *page)
  */
 void free_pages_and_swap_cache(struct page **pages, int nr)
 {
-	int chunk = 16;
 	struct page **pagep = pages;
 
 	lru_add_drain();
 	while (nr) {
-		int todo = min(chunk, nr);
+		int todo = min(nr, PAGEVEC_SIZE);
 		int i;
 
 		for (i = 0; i < todo; i++)
-- 
cgit v1.1


From c54ad30c784b84d0275152d0ca80985b21471811 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Fri, 6 Jan 2006 00:10:56 -0800
Subject: [PATCH] mm: pagealloc opt

Slightly optimise some page allocation and freeing functions by taking
advantage of knowing whether or not interrupts are disabled.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 07825c6..680cbe5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -375,11 +375,10 @@ static int
 free_pages_bulk(struct zone *zone, int count,
 		struct list_head *list, unsigned int order)
 {
-	unsigned long flags;
 	struct page *page = NULL;
 	int ret = 0;
 
-	spin_lock_irqsave(&zone->lock, flags);
+	spin_lock(&zone->lock);
 	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
 	while (!list_empty(list) && count--) {
@@ -389,12 +388,13 @@ free_pages_bulk(struct zone *zone, int count,
 		__free_pages_bulk(page, zone, order);
 		ret++;
 	}
-	spin_unlock_irqrestore(&zone->lock, flags);
+	spin_unlock(&zone->lock);
 	return ret;
 }
 
 void __free_pages_ok(struct page *page, unsigned int order)
 {
+	unsigned long flags;
 	LIST_HEAD(list);
 	int i;
 	int reserved = 0;
@@ -415,7 +415,9 @@ void __free_pages_ok(struct page *page, unsigned int order)
 	list_add(&page->lru, &list);
 	mod_page_state(pgfree, 1 << order);
 	kernel_map_pages(page, 1<<order, 0);
+	local_irq_save(flags);
 	free_pages_bulk(page_zone(page), 1, &list, order);
+	local_irq_restore(flags);
 }
 
 
@@ -539,12 +541,11 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
 static int rmqueue_bulk(struct zone *zone, unsigned int order, 
 			unsigned long count, struct list_head *list)
 {
-	unsigned long flags;
 	int i;
 	int allocated = 0;
 	struct page *page;
 	
-	spin_lock_irqsave(&zone->lock, flags);
+	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
 		page = __rmqueue(zone, order);
 		if (page == NULL)
@@ -552,7 +553,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		allocated++;
 		list_add_tail(&page->lru, list);
 	}
-	spin_unlock_irqrestore(&zone->lock, flags);
+	spin_unlock(&zone->lock);
 	return allocated;
 }
 
@@ -589,6 +590,7 @@ void drain_remote_pages(void)
 #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
 static void __drain_pages(unsigned int cpu)
 {
+	unsigned long flags;
 	struct zone *zone;
 	int i;
 
@@ -600,8 +602,10 @@ static void __drain_pages(unsigned int cpu)
 			struct per_cpu_pages *pcp;
 
 			pcp = &pset->pcp[i];
+			local_irq_save(flags);
 			pcp->count -= free_pages_bulk(zone, pcp->count,
 						&pcp->list, 0);
+			local_irq_restore(flags);
 		}
 	}
 }
@@ -744,7 +748,7 @@ again:
 		if (pcp->count <= pcp->low)
 			pcp->count += rmqueue_bulk(zone, 0,
 						pcp->batch, &pcp->list);
-		if (pcp->count) {
+		if (likely(pcp->count)) {
 			page = list_entry(pcp->list.next, struct page, lru);
 			list_del(&page->lru);
 			pcp->count--;
-- 
cgit v1.1


From 77a8a78834561398fb4cb1480afa7b0e80b1dd53 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Fri, 6 Jan 2006 00:10:57 -0800
Subject: [PATCH] mm: set_page_refs opt

Inline set_page_refs.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/internal.h   | 19 +++++++++++++++++--
 mm/page_alloc.c | 17 -----------------
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 6bf134e..85004f5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -9,5 +9,20 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-/* page_alloc.c */
-extern void set_page_refs(struct page *page, int order);
+static inline void set_page_refs(struct page *page, int order)
+{
+#ifdef CONFIG_MMU
+	set_page_count(page, 1);
+#else
+	int i;
+
+	/*
+	 * We need to reference all the pages for this order, otherwise if
+	 * anyone accesses one of the pages with (get/put) it will be freed.
+	 * - eg: access_process_vm()
+	 */
+	for (i = 0; i < (1 << order); i++)
+		set_page_count(page + i, 1);
+#endif /* CONFIG_MMU */
+}
+
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 680cbe5..6d513fa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -453,23 +453,6 @@ expand(struct zone *zone, struct page *page,
 	return page;
 }
 
-void set_page_refs(struct page *page, int order)
-{
-#ifdef CONFIG_MMU
-	set_page_count(page, 1);
-#else
-	int i;
-
-	/*
-	 * We need to reference all the pages for this order, otherwise if
-	 * anyone accesses one of the pages with (get/put) it will be freed.
-	 * - eg: access_process_vm()
-	 */
-	for (i = 0; i < (1 << order); i++)
-		set_page_count(page + i, 1);
-#endif /* CONFIG_MMU */
-}
-
 /*
  * This page is about to be returned from the page allocator
  */
-- 
cgit v1.1


From 92be2e33b155ee76399f51f41fb061f850d02f08 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Fri, 6 Jan 2006 00:10:57 -0800
Subject: [PATCH] mm: microopt conditions

Micro optimise some conditionals where we don't need lazy evaluation.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d513fa..b0647b5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -336,9 +336,9 @@ static inline void __free_pages_bulk (struct page *page,
 
 static inline int free_pages_check(const char *function, struct page *page)
 {
-	if (	page_mapcount(page) ||
-		page->mapping != NULL ||
-		page_count(page) != 0 ||
+	if (unlikely(page_mapcount(page) |
+		(page->mapping != NULL)  |
+		(page_count(page) != 0)  |
 		(page->flags & (
 			1 << PG_lru	|
 			1 << PG_private |
@@ -348,7 +348,7 @@ static inline int free_pages_check(const char *function, struct page *page)
 			1 << PG_slab	|
 			1 << PG_swapcache |
 			1 << PG_writeback |
-			1 << PG_reserved )))
+			1 << PG_reserved ))))
 		bad_page(function, page);
 	if (PageDirty(page))
 		__ClearPageDirty(page);
@@ -458,9 +458,9 @@ expand(struct zone *zone, struct page *page,
  */
 static int prep_new_page(struct page *page, int order)
 {
-	if (	page_mapcount(page) ||
-		page->mapping != NULL ||
-		page_count(page) != 0 ||
+	if (unlikely(page_mapcount(page) |
+		(page->mapping != NULL)  |
+		(page_count(page) != 0)  |
 		(page->flags & (
 			1 << PG_lru	|
 			1 << PG_private	|
@@ -471,7 +471,7 @@ static int prep_new_page(struct page *page, int order)
 			1 << PG_slab    |
 			1 << PG_swapcache |
 			1 << PG_writeback |
-			1 << PG_reserved )))
+			1 << PG_reserved ))))
 		bad_page(__FUNCTION__, page);
 
 	/*
-- 
cgit v1.1


From 13e7444b0ec59f96d81a4e8c379d5f38fc5f2cc1 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Fri, 6 Jan 2006 00:10:58 -0800
Subject: [PATCH] mm: remove bad_range

bad_range is supposed to be a temporary check.  It would be a pity to throw it
out.  Make it depend on CONFIG_DEBUG_VM instead.

CONFIG_HOLES_IN_ZONE systems were relying on this to check pfn_valid in the
page allocator.  Add that to page_is_buddy instead.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 lib/Kconfig.debug |  3 ++-
 mm/page_alloc.c   | 26 +++++++++++++++++++-------
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 156822e..1cedc23 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -172,7 +172,8 @@ config DEBUG_VM
 	bool "Debug VM"
 	depends on DEBUG_KERNEL
 	help
-	  Enable this to debug the virtual-memory system.
+	  Enable this to turn on extended checks in the virtual-memory system
+          that may impact performance.
 
 	  If unsure, say N.
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b0647b5..088712f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -81,6 +81,7 @@ int min_free_kbytes = 1024;
 unsigned long __initdata nr_kernel_pages;
 unsigned long __initdata nr_all_pages;
 
+#ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 {
 	int ret = 0;
@@ -122,6 +123,13 @@ static int bad_range(struct zone *zone, struct page *page)
 	return 0;
 }
 
+#else
+static inline int bad_range(struct zone *zone, struct page *page)
+{
+	return 0;
+}
+#endif
+
 static void bad_page(const char *function, struct page *page)
 {
 	printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
@@ -255,14 +263,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
 /*
  * This function checks whether a page is free && is the buddy
  * we can do coalesce a page and its buddy if
- * (a) the buddy is free &&
- * (b) the buddy is on the buddy system &&
- * (c) a page and its buddy have the same order.
+ * (a) the buddy is not in a hole &&
+ * (b) the buddy is free &&
+ * (c) the buddy is on the buddy system &&
+ * (d) a page and its buddy have the same order.
  * for recording page's order, we use page_private(page) and PG_private.
  *
  */
 static inline int page_is_buddy(struct page *page, int order)
 {
+#ifdef CONFIG_HOLES_IN_ZONE
+	if (!pfn_valid(page_to_pfn(page)))
+		return 0;
+#endif
+
        if (PagePrivate(page)           &&
            (page_order(page) == order) &&
             page_count(page) == 0)
@@ -314,17 +328,15 @@ static inline void __free_pages_bulk (struct page *page,
 		struct free_area *area;
 		struct page *buddy;
 
-		combined_idx = __find_combined_index(page_idx, order);
 		buddy = __page_find_buddy(page, page_idx, order);
-
-		if (bad_range(zone, buddy))
-			break;
 		if (!page_is_buddy(buddy, order))
 			break;		/* Move the buddy up one level. */
+
 		list_del(&buddy->lru);
 		area = zone->free_area + order;
 		area->nr_free--;
 		rmv_page_order(buddy);
+		combined_idx = __find_combined_index(page_idx, order);
 		page = page + (combined_idx - page_idx);
 		page_idx = combined_idx;
 		order++;
-- 
cgit v1.1


From 2d92c5c9150a2a9ca3dc25da58d5042e17a96b6a Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Fri, 6 Jan 2006 00:10:59 -0800
Subject: [PATCH] mm: remove pcp low

struct per_cpu_pages.low is useless.  Remove it.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 1 -
 mm/page_alloc.c        | 9 ++-------
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8cba76c..0d1a598 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -46,7 +46,6 @@ struct zone_padding {
 
 struct per_cpu_pages {
 	int count;		/* number of pages in the list */
-	int low;		/* low watermark, refill needed */
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
 	struct list_head list;	/* the list of pages */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 088712f..7cff958 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -740,7 +740,7 @@ again:
 		page = NULL;
 		pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 		local_irq_save(flags);
-		if (pcp->count <= pcp->low)
+		if (!pcp->count)
 			pcp->count += rmqueue_bulk(zone, 0,
 						pcp->batch, &pcp->list);
 		if (likely(pcp->count)) {
@@ -1345,10 +1345,9 @@ void show_free_areas(void)
 			pageset = zone_pcp(zone, cpu);
 
 			for (temperature = 0; temperature < 2; temperature++)
-				printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
+				printk("cpu %d %s: high %d, batch %d used:%d\n",
 					cpu,
 					temperature ? "cold" : "hot",
-					pageset->pcp[temperature].low,
 					pageset->pcp[temperature].high,
 					pageset->pcp[temperature].batch,
 					pageset->pcp[temperature].count);
@@ -1790,14 +1789,12 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
 
 	pcp = &p->pcp[0];		/* hot */
 	pcp->count = 0;
-	pcp->low = 0;
 	pcp->high = 6 * batch;
 	pcp->batch = max(1UL, 1 * batch);
 	INIT_LIST_HEAD(&pcp->list);
 
 	pcp = &p->pcp[1];		/* cold*/
 	pcp->count = 0;
-	pcp->low = 0;
 	pcp->high = 2 * batch;
 	pcp->batch = max(1UL, batch/2);
 	INIT_LIST_HEAD(&pcp->list);
@@ -2193,12 +2190,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 				seq_printf(m,
 					   "\n    cpu: %i pcp: %i"
 					   "\n              count: %i"
-					   "\n              low:   %i"
 					   "\n              high:  %i"
 					   "\n              batch: %i",
 					   i, j,
 					   pageset->pcp[j].count,
-					   pageset->pcp[j].low,
 					   pageset->pcp[j].high,
 					   pageset->pcp[j].batch);
 			}
-- 
cgit v1.1


From a86b1f53166a260ced8f3c8c526945bf496f2e78 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Fri, 6 Jan 2006 00:11:00 -0800
Subject: [PATCH] mm: page_state fixes

read_page_state and __get_page_state only traverse online CPUs, which will
cause results to fluctuate when CPUs are plugged in or out.  Stop restricting
the traversal to online CPUs.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7cff958..3796187 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1169,12 +1169,11 @@ EXPORT_SYMBOL(nr_pagecache);
 DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
 #endif
 
-void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
+static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
 {
 	int cpu = 0;
 
 	memset(ret, 0, sizeof(*ret));
-	cpus_and(*cpumask, *cpumask, cpu_online_map);
 
 	cpu = first_cpu(*cpumask);
 	while (cpu < NR_CPUS) {
@@ -1227,7 +1226,7 @@ unsigned long __read_page_state(unsigned long offset)
 	unsigned long ret = 0;
 	int cpu;
 
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu) {
 		unsigned long in;
 
 		in = (unsigned long)&per_cpu(page_states, cpu) + offset;
-- 
cgit v1.1


From 085cc7d5de3cc662da7ea78296464a0d52f3f01f Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Fri, 6 Jan 2006 00:11:01 -0800
Subject: [PATCH] mm: page_alloc cleanups

Small cleanups that do not change the generated code with the gcc versions
I've tested with.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3796187..925b0b9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -447,8 +447,7 @@ void __free_pages_ok(struct page *page, unsigned int order)
  *
  * -- wli
  */
-static inline struct page *
-expand(struct zone *zone, struct page *page,
+static inline void expand(struct zone *zone, struct page *page,
  	int low, int high, struct free_area *area)
 {
 	unsigned long size = 1 << high;
@@ -462,7 +461,6 @@ expand(struct zone *zone, struct page *page,
 		area->nr_free++;
 		set_page_order(&page[size], high);
 	}
-	return page;
 }
 
 /*
@@ -522,7 +520,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
 		rmv_page_order(page);
 		area->nr_free--;
 		zone->free_pages -= 1UL << order;
-		return expand(zone, page, order, current_order, area);
+		expand(zone, page, order, current_order, area);
+		return page;
 	}
 
 	return NULL;
@@ -537,19 +536,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			unsigned long count, struct list_head *list)
 {
 	int i;
-	int allocated = 0;
-	struct page *page;
 	
 	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
-		page = __rmqueue(zone, order);
-		if (page == NULL)
+		struct page *page = __rmqueue(zone, order);
+		if (unlikely(page == NULL))
 			break;
-		allocated++;
 		list_add_tail(&page->lru, list);
 	}
 	spin_unlock(&zone->lock);
-	return allocated;
+	return i;
 }
 
 #ifdef CONFIG_NUMA
-- 
cgit v1.1


From 008857c1a49ccffc31a54c3ea7e182833bd61304 Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Fri, 6 Jan 2006 00:11:01 -0800
Subject: [PATCH] Cleanup bootmem allocator and fix alloc_bootmem_low

Patch cleans up the alloc_bootmem fix for swiotlb.  Patch removes the
alloc_bootmem_*_limit API and fixes the alloc_bootmem_low* API to do the
right thing -- allocate from low32 memory.

Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/bootmem.h | 46 ++++++++++++----------------------------------
 lib/swiotlb.c           |  3 +--
 mm/bootmem.c            | 38 +++++++++++++++++++++++++++++++-------
 3 files changed, 44 insertions(+), 43 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 3b03b0b..993da8c 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -43,50 +43,38 @@ typedef struct bootmem_data {
 extern unsigned long __init bootmem_bootmap_pages (unsigned long);
 extern unsigned long __init init_bootmem (unsigned long addr, unsigned long memend);
 extern void __init free_bootmem (unsigned long addr, unsigned long size);
-extern void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal, unsigned long limit);
+extern void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal);
+extern void * __init __alloc_bootmem_low(unsigned long size,
+					 unsigned long align,
+					 unsigned long goal);
+extern void * __init __alloc_bootmem_low_node(pg_data_t *pgdat,
+					      unsigned long size,
+					      unsigned long align,
+					      unsigned long goal);
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void __init reserve_bootmem (unsigned long addr, unsigned long size);
 #define alloc_bootmem(x) \
 	__alloc_bootmem((x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low(x) \
-	__alloc_bootmem((x), SMP_CACHE_BYTES, 0)
+	__alloc_bootmem_low((x), SMP_CACHE_BYTES, 0)
 #define alloc_bootmem_pages(x) \
 	__alloc_bootmem((x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low_pages(x) \
-	__alloc_bootmem((x), PAGE_SIZE, 0)
-
-#define alloc_bootmem_limit(x, limit)						\
-	__alloc_bootmem_limit((x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS), (limit))
-#define alloc_bootmem_low_limit(x, limit)			\
-	__alloc_bootmem_limit((x), SMP_CACHE_BYTES, 0, (limit))
-#define alloc_bootmem_pages_limit(x, limit)					\
-	__alloc_bootmem_limit((x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS), (limit))
-#define alloc_bootmem_low_pages_limit(x, limit)		\
-	__alloc_bootmem_limit((x), PAGE_SIZE, 0, (limit))
-
+	__alloc_bootmem_low((x), PAGE_SIZE, 0)
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 extern unsigned long __init free_all_bootmem (void);
-
+extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal);
 extern unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn);
 extern void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size);
 extern void __init free_bootmem_node (pg_data_t *pgdat, unsigned long addr, unsigned long size);
 extern unsigned long __init free_all_bootmem_node (pg_data_t *pgdat);
-extern void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit);
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 #define alloc_bootmem_node(pgdat, x) \
 	__alloc_bootmem_node((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_pages_node(pgdat, x) \
 	__alloc_bootmem_node((pgdat), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low_pages_node(pgdat, x) \
-	__alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0)
-
-#define alloc_bootmem_node_limit(pgdat, x, limit)				\
-	__alloc_bootmem_node_limit((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS), (limit))
-#define alloc_bootmem_pages_node_limit(pgdat, x, limit)				\
-	__alloc_bootmem_node_limit((pgdat), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS), (limit))
-#define alloc_bootmem_low_pages_node_limit(pgdat, x, limit)		\
-	__alloc_bootmem_node_limit((pgdat), (x), PAGE_SIZE, 0, (limit))
-
+	__alloc_bootmem_low_node((pgdat), (x), PAGE_SIZE, 0)
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
 #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
@@ -123,15 +111,5 @@ extern void *__init alloc_large_system_hash(const char *tablename,
 #endif
 extern int __initdata hashdist;		/* Distribute hashes across NUMA nodes? */
 
-static inline void *__alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal)
-{
-	return __alloc_bootmem_limit(size, align, goal, 0);
-}
-
-static inline void *__alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align,
-				     unsigned long goal)
-{
-	return __alloc_bootmem_node_limit(pgdat, size, align, goal, 0);
-}
 
 #endif /* _LINUX_BOOTMEM_H */
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 1ff8dce..3b48205 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -142,8 +142,7 @@ swiotlb_init_with_default_size (size_t default_size)
 	/*
 	 * Get IO TLB memory from the low pages
 	 */
-	io_tlb_start = alloc_bootmem_low_pages_limit(io_tlb_nslabs *
-					     (1 << IO_TLB_SHIFT), 0x100000000);
+	io_tlb_start = alloc_bootmem_low_pages(io_tlb_nslabs * (1 << IO_TLB_SHIFT));
 	if (!io_tlb_start)
 		panic("Cannot allocate SWIOTLB buffer");
 	io_tlb_end = io_tlb_start + io_tlb_nslabs * (1 << IO_TLB_SHIFT);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 16b9465..cbb82ee 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -393,15 +393,14 @@ unsigned long __init free_all_bootmem (void)
 	return(free_all_bootmem_core(NODE_DATA(0)));
 }
 
-void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal,
-				unsigned long limit)
+void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
 {
 	pg_data_t *pgdat = pgdat_list;
 	void *ptr;
 
 	for_each_pgdat(pgdat)
 		if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
-						 align, goal, limit)))
+						 align, goal, 0)))
 			return(ptr);
 
 	/*
@@ -413,15 +412,40 @@ void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, un
 }
 
 
-void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align,
-				     unsigned long goal, unsigned long limit)
+void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align,
+				   unsigned long goal)
 {
 	void *ptr;
 
-	ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, limit);
+	ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
 	if (ptr)
 		return (ptr);
 
-	return __alloc_bootmem_limit(size, align, goal, limit);
+	return __alloc_bootmem(size, align, goal);
 }
 
+#define LOW32LIMIT 0xffffffff
+
+void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
+{
+	pg_data_t *pgdat = pgdat_list;
+	void *ptr;
+
+	for_each_pgdat(pgdat)
+		if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
+						 align, goal, LOW32LIMIT)))
+			return(ptr);
+
+	/*
+	 * Whoops, we cannot satisfy the allocation request.
+	 */
+	printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
+	panic("Out of low memory");
+	return NULL;
+}
+
+void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
+				       unsigned long align, unsigned long goal)
+{
+	return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT);
+}
-- 
cgit v1.1


From a226f6c899799fe2c4919daa0767ac579c88f7bd Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 6 Jan 2006 00:11:08 -0800
Subject: [PATCH] FRV: Clean up bootmem allocator's page freeing algorithm

The attached patch cleans up the way the bootmem allocator frees pages.

A new function, __free_pages_bootmem(), is provided in mm/page_alloc.c that is
called from mm/bootmem.c to turn pages over to the main allocator.  All the
bits of code to initialise pages (clearing PG_reserved and setting the page
count) are moved to here.  The checks on page validity are removed, on the
assumption that the struct page arrays will have been prepared correctly.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/bootmem.c    | 20 ++++----------------
 mm/internal.h   |  2 ++
 mm/page_alloc.c | 36 +++++++++++++++++++++++++++++++++++-
 3 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/mm/bootmem.c b/mm/bootmem.c
index cbb82ee..35c3229 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -296,20 +296,12 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 		unsigned long v = ~map[i / BITS_PER_LONG];
 
 		if (gofast && v == ~0UL) {
-			int j, order;
+			int order;
 
 			page = pfn_to_page(pfn);
 			count += BITS_PER_LONG;
-			__ClearPageReserved(page);
 			order = ffs(BITS_PER_LONG) - 1;
-			set_page_refs(page, order);
-			for (j = 1; j < BITS_PER_LONG; j++) {
-				if (j + 16 < BITS_PER_LONG)
-					prefetchw(page + j + 16);
-				__ClearPageReserved(page + j);
-				set_page_count(page + j, 0);
-			}
-			__free_pages(page, order);
+			__free_pages_bootmem(page, order);
 			i += BITS_PER_LONG;
 			page += BITS_PER_LONG;
 		} else if (v) {
@@ -319,9 +311,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 			for (m = 1; m && i < idx; m<<=1, page++, i++) {
 				if (v & m) {
 					count++;
-					__ClearPageReserved(page);
-					set_page_refs(page, 0);
-					__free_page(page);
+					__free_pages_bootmem(page, 0);
 				}
 			}
 		} else {
@@ -339,9 +329,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 	count = 0;
 	for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) {
 		count++;
-		__ClearPageReserved(page);
-		set_page_count(page, 1);
-		__free_page(page);
+		__free_pages_bootmem(page, 0);
 	}
 	total += count;
 	bdata->node_bootmem_map = NULL;
diff --git a/mm/internal.h b/mm/internal.h
index 85004f5..17256bb 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -26,3 +26,5 @@ static inline void set_page_refs(struct page *page, int order)
 #endif /* CONFIG_MMU */
 }
 
+extern void fastcall __init __free_pages_bootmem(struct page *page,
+						unsigned int order);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 925b0b9..cdad324 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -53,6 +53,8 @@ unsigned long totalram_pages __read_mostly;
 unsigned long totalhigh_pages __read_mostly;
 long nr_swap_pages;
 
+static void fastcall free_hot_cold_page(struct page *page, int cold);
+
 /*
  * results with 256, 32 in the lowmem_reserve sysctl:
  *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
@@ -432,6 +434,39 @@ void __free_pages_ok(struct page *page, unsigned int order)
 	local_irq_restore(flags);
 }
 
+/*
+ * permit the bootmem allocator to evade page validation on high-order frees
+ */
+void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
+{
+	if (order == 0) {
+		__ClearPageReserved(page);
+		set_page_count(page, 0);
+
+		free_hot_cold_page(page, 0);
+	} else {
+		LIST_HEAD(list);
+		int loop;
+
+		for (loop = 0; loop < BITS_PER_LONG; loop++) {
+			struct page *p = &page[loop];
+
+			if (loop + 16 < BITS_PER_LONG)
+				prefetchw(p + 16);
+			__ClearPageReserved(p);
+			set_page_count(p, 0);
+		}
+
+		arch_free_page(page, order);
+
+		mod_page_state(pgfree, 1 << order);
+
+		list_add(&page->lru, &list);
+		kernel_map_pages(page, 1 << order, 0);
+		free_pages_bulk(page_zone(page), 1, &list, order);
+	}
+}
+
 
 /*
  * The order of subdivision here is critical for the IO subsystem.
@@ -671,7 +706,6 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
 /*
  * Free a 0-order page
  */
-static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
 static void fastcall free_hot_cold_page(struct page *page, int cold)
 {
 	struct zone *zone = page_zone(page);
-- 
cgit v1.1


From bbfbb7cec9dd7266534b2b4b9c8be2fa425bbfc9 Mon Sep 17 00:00:00 2001
From: Nikita Danilov <nikita@clusterfs.com>
Date: Fri, 6 Jan 2006 00:11:08 -0800
Subject: [PATCH] find_lock_page(): call __lock_page() directly.

As find_lock_page() already checks with TestSetPageLocked() that the page is
locked, there is no need to call lock_page(), which would try-lock the page
again (chances of the page being unlocked in between are small).  Call
__lock_page() directly; this saves one atomic operation.

Also, mark truncate-while-slept path as unlikely while we are here.

(akpm: ug.  But this is actually a common path for normal old read()s against
a page which is under readahead I/O so ho-hum.)

Signed-off-by: Nikita Danilov <danilov@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/filemap.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 6e1d08a..4ef24a3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -555,11 +555,12 @@ repeat:
 		page_cache_get(page);
 		if (TestSetPageLocked(page)) {
 			read_unlock_irq(&mapping->tree_lock);
-			lock_page(page);
+			__lock_page(page);
 			read_lock_irq(&mapping->tree_lock);
 
 			/* Has the page been truncated while we slept? */
-			if (page->mapping != mapping || page->index != offset) {
+			if (unlikely(page->mapping != mapping ||
+				     page->index != offset)) {
 				unlock_page(page);
 				page_cache_release(page);
 				goto repeat;
-- 
cgit v1.1


From 7756b9e4e321c3c83c7aa5b9532d3e7fd7ddeb4a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 6 Jan 2006 00:11:09 -0800
Subject: [PATCH] kill last zone_reclaim() bits

Remove the last bits of Martin's ill-fated sys_set_zone_reclaim().

Cc: Martin Hicks <mort@wildopensource.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/unistd.h |  2 +-
 include/asm-ia64/unistd.h |  2 +-
 include/linux/swap.h      |  1 -
 mm/vmscan.c               | 80 -----------------------------------------------
 4 files changed, 2 insertions(+), 83 deletions(-)

diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 0f92e78..fe38b9a 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -256,7 +256,7 @@
 #define __NR_io_submit		248
 #define __NR_io_cancel		249
 #define __NR_fadvise64		250
-#define __NR_set_zone_reclaim	251
+/* 251 is available for reuse (was briefly sys_set_zone_reclaim) */
 #define __NR_exit_group		252
 #define __NR_lookup_dcookie	253
 #define __NR_epoll_create	254
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index 6d96a67..2bf5434 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -265,7 +265,7 @@
 #define __NR_keyctl			1273
 #define __NR_ioprio_set			1274
 #define __NR_ioprio_get			1275
-#define __NR_set_zone_reclaim		1276
+/* 1276 is available for reuse (was briefly sys_set_zone_reclaim) */
 #define __NR_inotify_init		1277
 #define __NR_inotify_add_watch		1278
 #define __NR_inotify_rm_watch		1279
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 508668f..bd66417 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -172,7 +172,6 @@ extern void swap_setup(void);
 
 /* linux/mm/vmscan.c */
 extern int try_to_free_pages(struct zone **, gfp_t);
-extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
 extern int shrink_all_memory(int);
 extern int vm_swappiness;
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 795a050..b2baca7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -74,9 +74,6 @@ struct scan_control {
 
 	int may_writepage;
 
-	/* Can pages be swapped as part of reclaim? */
-	int may_swap;
-
 	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
 	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
 	 * In this context, it doesn't matter that we scan the
@@ -430,8 +427,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 		 * Try to allocate it some swap space here.
 		 */
 		if (PageAnon(page) && !PageSwapCache(page)) {
-			if (!sc->may_swap)
-				goto keep_locked;
 			if (!add_to_swap(page))
 				goto activate_locked;
 		}
@@ -952,7 +947,6 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 
 	sc.gfp_mask = gfp_mask;
 	sc.may_writepage = 0;
-	sc.may_swap = 1;
 
 	inc_page_state(allocstall);
 
@@ -1055,7 +1049,6 @@ loop_again:
 	total_reclaimed = 0;
 	sc.gfp_mask = GFP_KERNEL;
 	sc.may_writepage = 0;
-	sc.may_swap = 1;
 	sc.nr_mapped = read_page_state(nr_mapped);
 
 	inc_page_state(pageoutrun);
@@ -1353,76 +1346,3 @@ static int __init kswapd_init(void)
 }
 
 module_init(kswapd_init)
-
-
-/*
- * Try to free up some pages from this zone through reclaim.
- */
-int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
-{
-	struct scan_control sc;
-	int nr_pages = 1 << order;
-	int total_reclaimed = 0;
-
-	/* The reclaim may sleep, so don't do it if sleep isn't allowed */
-	if (!(gfp_mask & __GFP_WAIT))
-		return 0;
-	if (zone->all_unreclaimable)
-		return 0;
-
-	sc.gfp_mask = gfp_mask;
-	sc.may_writepage = 0;
-	sc.may_swap = 0;
-	sc.nr_mapped = read_page_state(nr_mapped);
-	sc.nr_scanned = 0;
-	sc.nr_reclaimed = 0;
-	/* scan at the highest priority */
-	sc.priority = 0;
-	disable_swap_token();
-
-	if (nr_pages > SWAP_CLUSTER_MAX)
-		sc.swap_cluster_max = nr_pages;
-	else
-		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
-
-	/* Don't reclaim the zone if there are other reclaimers active */
-	if (atomic_read(&zone->reclaim_in_progress) > 0)
-		goto out;
-
-	shrink_zone(zone, &sc);
-	total_reclaimed = sc.nr_reclaimed;
-
- out:
-	return total_reclaimed;
-}
-
-asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
-				     unsigned int state)
-{
-	struct zone *z;
-	int i;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
-
-	if (node >= MAX_NUMNODES || !node_online(node))
-		return -EINVAL;
-
-	/* This will break if we ever add more zones */
-	if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
-		return -EINVAL;
-
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		if (!(zone & 1<<i))
-			continue;
-
-		z = &NODE_DATA(node)->node_zones[i];
-
-		if (state)
-			z->reclaim_pages = 1;
-		else
-			z->reclaim_pages = 0;
-	}
-
-	return 0;
-}
-- 
cgit v1.1


From 9328b8faae922e52073785ed6c1eaa8565648a0e Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Fri, 6 Jan 2006 00:11:10 -0800
Subject: [PATCH] mm: dma32 zone statistics

Add dma32 to zone statistics.  Also attempt to arrange struct page_state a
bit better (visually).

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h     | 11 +++++++++++
 include/linux/page-flags.h | 38 ++++++++++++++++++++++++--------------
 mm/page_alloc.c            | 14 +++++++++++---
 3 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0d1a598..8d6caa4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -397,6 +397,7 @@ static inline int is_normal_idx(int idx)
 {
 	return (idx == ZONE_NORMAL);
 }
+
 /**
  * is_highmem - helper function to quickly check if a struct zone is a 
  *              highmem zone or not.  This is an attempt to keep references
@@ -413,6 +414,16 @@ static inline int is_normal(struct zone *zone)
 	return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
 }
 
+static inline int is_dma32(struct zone *zone)
+{
+	return zone == zone->zone_pgdat->node_zones + ZONE_DMA32;
+}
+
+static inline int is_dma(struct zone *zone)
+{
+	return zone == zone->zone_pgdat->node_zones + ZONE_DMA;
+}
+
 /* These two functions are used to setup the per zone pages min values */
 struct ctl_table;
 struct file;
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 343083f..32d09c8 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -97,32 +97,40 @@ struct page_state {
 	unsigned long pgpgout;		/* Disk writes */
 	unsigned long pswpin;		/* swap reads */
 	unsigned long pswpout;		/* swap writes */
-	unsigned long pgalloc_high;	/* page allocations */
 
+	unsigned long pgalloc_high;	/* page allocations */
 	unsigned long pgalloc_normal;
+	unsigned long pgalloc_dma32;
 	unsigned long pgalloc_dma;
+
 	unsigned long pgfree;		/* page freeings */
 	unsigned long pgactivate;	/* pages moved inactive->active */
 	unsigned long pgdeactivate;	/* pages moved active->inactive */
 
 	unsigned long pgfault;		/* faults (major+minor) */
 	unsigned long pgmajfault;	/* faults (major only) */
+
 	unsigned long pgrefill_high;	/* inspected in refill_inactive_zone */
 	unsigned long pgrefill_normal;
+	unsigned long pgrefill_dma32;
 	unsigned long pgrefill_dma;
 
 	unsigned long pgsteal_high;	/* total highmem pages reclaimed */
 	unsigned long pgsteal_normal;
+	unsigned long pgsteal_dma32;
 	unsigned long pgsteal_dma;
+
 	unsigned long pgscan_kswapd_high;/* total highmem pages scanned */
 	unsigned long pgscan_kswapd_normal;
-
+	unsigned long pgscan_kswapd_dma32;
 	unsigned long pgscan_kswapd_dma;
+
 	unsigned long pgscan_direct_high;/* total highmem pages scanned */
 	unsigned long pgscan_direct_normal;
+	unsigned long pgscan_direct_dma32;
 	unsigned long pgscan_direct_dma;
-	unsigned long pginodesteal;	/* pages reclaimed via inode freeing */
 
+	unsigned long pginodesteal;	/* pages reclaimed via inode freeing */
 	unsigned long slabs_scanned;	/* slab objects scanned */
 	unsigned long kswapd_steal;	/* pages reclaimed by kswapd */
 	unsigned long kswapd_inodesteal;/* reclaimed via kswapd inode freeing */
@@ -150,17 +158,19 @@ extern void __mod_page_state(unsigned long offset, unsigned long delta);
 #define add_page_state(member,delta) mod_page_state(member, (delta))
 #define sub_page_state(member,delta) mod_page_state(member, 0UL - (delta))
 
-#define mod_page_state_zone(zone, member, delta)				\
-	do {									\
-		unsigned offset;						\
-		if (is_highmem(zone))						\
-			offset = offsetof(struct page_state, member##_high);	\
-		else if (is_normal(zone))					\
-			offset = offsetof(struct page_state, member##_normal);	\
-		else								\
-			offset = offsetof(struct page_state, member##_dma);	\
-		__mod_page_state(offset, (delta));				\
-	} while (0)
+#define mod_page_state_zone(zone, member, delta)			\
+ do {									\
+	unsigned offset;						\
+	if (is_highmem(zone))						\
+		offset = offsetof(struct page_state, member##_high);	\
+	else if (is_normal(zone))					\
+		offset = offsetof(struct page_state, member##_normal);	\
+	else if (is_dma32(zone))					\
+		offset = offsetof(struct page_state, member##_dma32);	\
+	else								\
+		offset = offsetof(struct page_state, member##_dma);	\
+	__mod_page_state(offset, (delta));				\
+ } while (0)
 
 /*
  * Manipulation of page state flags
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cdad324..e12154d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2277,32 +2277,40 @@ static char *vmstat_text[] = {
 	"pgpgout",
 	"pswpin",
 	"pswpout",
-	"pgalloc_high",
 
+	"pgalloc_high",
 	"pgalloc_normal",
+	"pgalloc_dma32",
 	"pgalloc_dma",
+
 	"pgfree",
 	"pgactivate",
 	"pgdeactivate",
 
 	"pgfault",
 	"pgmajfault",
+
 	"pgrefill_high",
 	"pgrefill_normal",
+	"pgrefill_dma32",
 	"pgrefill_dma",
 
 	"pgsteal_high",
 	"pgsteal_normal",
+	"pgsteal_dma32",
 	"pgsteal_dma",
+
 	"pgscan_kswapd_high",
 	"pgscan_kswapd_normal",
-
+	"pgscan_kswapd_dma32",
 	"pgscan_kswapd_dma",
+
 	"pgscan_direct_high",
 	"pgscan_direct_normal",
+	"pgscan_direct_dma32",
 	"pgscan_direct_dma",
-	"pginodesteal",
 
+	"pginodesteal",
 	"slabs_scanned",
 	"kswapd_steal",
 	"kswapd_inodesteal",
-- 
cgit v1.1


From 224abf92b2f439a9030f21d2926ec8047d1ffcdb Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Fri, 6 Jan 2006 00:11:11 -0800
Subject: [PATCH] mm: bad_page optimisation

Cut down size slightly by not passing bad_page the function name (it should be
able to be determined by dump_stack()).  And cut down the number of printks in
bad_page.

Also, cut down some branching in the destroy_compound_page path.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 44 ++++++++++++++++++++------------------------
 1 file changed, 20 insertions(+), 24 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e12154d..b9fd2c2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -132,16 +132,16 @@ static inline int bad_range(struct zone *zone, struct page *page)
 }
 #endif
 
-static void bad_page(const char *function, struct page *page)
-{
-	printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
-		function, current->comm, page);
-	printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
-		(int)(2*sizeof(unsigned long)), (unsigned long)page->flags,
-		page->mapping, page_mapcount(page), page_count(page));
-	printk(KERN_EMERG "Backtrace:\n");
+static void bad_page(struct page *page)
+{
+	printk(KERN_EMERG "Bad page state in process '%s'\n"
+		"page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
+		"Trying to fix it up, but a reboot is needed\n"
+		"Backtrace:\n",
+		current->comm, page, (int)(2*sizeof(unsigned long)),
+		(unsigned long)page->flags, page->mapping,
+		page_mapcount(page), page_count(page));
 	dump_stack();
-	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
 	page->flags &= ~(1 << PG_lru	|
 			1 << PG_private |
 			1 << PG_locked	|
@@ -194,19 +194,15 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 	int i;
 	int nr_pages = 1 << order;
 
-	if (!PageCompound(page))
-		return;
-
-	if (page[1].index != order)
-		bad_page(__FUNCTION__, page);
+	if (unlikely(page[1].index != order))
+		bad_page(page);
 
 	for (i = 0; i < nr_pages; i++) {
 		struct page *p = page + i;
 
-		if (!PageCompound(p))
-			bad_page(__FUNCTION__, page);
-		if (page_private(p) != (unsigned long)page)
-			bad_page(__FUNCTION__, page);
+		if (unlikely(!PageCompound(p) |
+				(page_private(p) != (unsigned long)page)))
+			bad_page(page);
 		ClearPageCompound(p);
 	}
 }
@@ -316,7 +312,7 @@ static inline void __free_pages_bulk (struct page *page,
 	unsigned long page_idx;
 	int order_size = 1 << order;
 
-	if (unlikely(order))
+	if (unlikely(PageCompound(page)))
 		destroy_compound_page(page, order);
 
 	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
@@ -348,7 +344,7 @@ static inline void __free_pages_bulk (struct page *page,
 	zone->free_area[order].nr_free++;
 }
 
-static inline int free_pages_check(const char *function, struct page *page)
+static inline int free_pages_check(struct page *page)
 {
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL)  |
@@ -363,7 +359,7 @@ static inline int free_pages_check(const char *function, struct page *page)
 			1 << PG_swapcache |
 			1 << PG_writeback |
 			1 << PG_reserved ))))
-		bad_page(function, page);
+		bad_page(page);
 	if (PageDirty(page))
 		__ClearPageDirty(page);
 	/*
@@ -422,7 +418,7 @@ void __free_pages_ok(struct page *page, unsigned int order)
 #endif
 
 	for (i = 0 ; i < (1 << order) ; ++i)
-		reserved += free_pages_check(__FUNCTION__, page + i);
+		reserved += free_pages_check(page + i);
 	if (reserved)
 		return;
 
@@ -517,7 +513,7 @@ static int prep_new_page(struct page *page, int order)
 			1 << PG_swapcache |
 			1 << PG_writeback |
 			1 << PG_reserved ))))
-		bad_page(__FUNCTION__, page);
+		bad_page(page);
 
 	/*
 	 * For now, we report if PG_reserved was found set, but do not
@@ -716,7 +712,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
 
 	if (PageAnon(page))
 		page->mapping = NULL;
-	if (free_pages_check(__FUNCTION__, page))
+	if (free_pages_check(page))
 		return;
 
 	inc_page_state(pgfree);
-- 
cgit v1.1


From 9617d95e6e9ffd883cf90a89724fe60d7ab22f9a Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Fri, 6 Jan 2006 00:11:12 -0800
Subject: [PATCH] mm: rmap optimisation

Optimise rmap functions by minimising atomic operations when we know there
will be no concurrent modifications.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c            |  2 +-
 include/linux/rmap.h |  1 +
 mm/memory.c          |  6 +++---
 mm/rmap.c            | 49 ++++++++++++++++++++++++++++++++++++++-----------
 4 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 22533cc..e75a954 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -324,7 +324,7 @@ void install_arg_page(struct vm_area_struct *vma,
 	lru_cache_add_active(page);
 	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
 					page, vma->vm_page_prot))));
-	page_add_anon_rmap(page, vma, address);
+	page_add_new_anon_rmap(page, vma, address);
 	pte_unmap_unlock(pte, ptl);
 
 	/* no need for flush_tlb */
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 33261f1..9d6fbee 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -71,6 +71,7 @@ void __anon_vma_link(struct vm_area_struct *);
  * rmap interfaces called when adding or removing pte of page
  */
 void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
 void page_add_file_rmap(struct page *);
 void page_remove_rmap(struct page *);
 
diff --git a/mm/memory.c b/mm/memory.c
index e249088..d7ca7de 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1498,7 +1498,7 @@ gotten:
 		update_mmu_cache(vma, address, entry);
 		lazy_mmu_prot_update(entry);
 		lru_cache_add_active(new_page);
-		page_add_anon_rmap(new_page, vma, address);
+		page_add_new_anon_rmap(new_page, vma, address);
 
 		/* Free the old page.. */
 		new_page = old_page;
@@ -1978,7 +1978,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		inc_mm_counter(mm, anon_rss);
 		lru_cache_add_active(page);
 		SetPageReferenced(page);
-		page_add_anon_rmap(page, vma, address);
+		page_add_new_anon_rmap(page, vma, address);
 	} else {
 		/* Map the ZERO_PAGE - vm_page_prot is readonly */
 		page = ZERO_PAGE(address);
@@ -2109,7 +2109,7 @@ retry:
 		if (anon) {
 			inc_mm_counter(mm, anon_rss);
 			lru_cache_add_active(new_page);
-			page_add_anon_rmap(new_page, vma, address);
+			page_add_new_anon_rmap(new_page, vma, address);
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
diff --git a/mm/rmap.c b/mm/rmap.c
index f853c6d..4107f64 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -435,6 +435,26 @@ int page_referenced(struct page *page, int is_locked)
 }
 
 /**
+ * page_set_anon_rmap - setup new anonymous rmap
+ * @page:	the page to add the mapping to
+ * @vma:	the vm area in which the mapping is added
+ * @address:	the user virtual address mapped
+ */
+static void __page_set_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+
+	BUG_ON(!anon_vma);
+	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+	page->mapping = (struct address_space *) anon_vma;
+
+	page->index = linear_page_index(vma, address);
+
+	inc_page_state(nr_mapped);
+}
+
+/**
  * page_add_anon_rmap - add pte mapping to an anonymous page
  * @page:	the page to add the mapping to
  * @vma:	the vm area in which the mapping is added
@@ -445,20 +465,27 @@ int page_referenced(struct page *page, int is_locked)
 void page_add_anon_rmap(struct page *page,
 	struct vm_area_struct *vma, unsigned long address)
 {
-	if (atomic_inc_and_test(&page->_mapcount)) {
-		struct anon_vma *anon_vma = vma->anon_vma;
-
-		BUG_ON(!anon_vma);
-		anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
-		page->mapping = (struct address_space *) anon_vma;
-
-		page->index = linear_page_index(vma, address);
-
-		inc_page_state(nr_mapped);
-	}
+	if (atomic_inc_and_test(&page->_mapcount))
+		__page_set_anon_rmap(page, vma, address);
 	/* else checking page index and mapping is racy */
 }
 
+/*
+ * page_add_new_anon_rmap - add pte mapping to a new anonymous page
+ * @page:	the page to add the mapping to
+ * @vma:	the vm area in which the mapping is added
+ * @address:	the user virtual address mapped
+ *
+ * Same as page_add_anon_rmap but must only be called on *new* pages.
+ * This means the inc-and-test can be bypassed.
+ */
+void page_add_new_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address)
+{
+	atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
+	__page_set_anon_rmap(page, vma, address);
+}
+
 /**
  * page_add_file_rmap - add pte mapping to a file page
  * @page: the page to add the mapping to
-- 
cgit v1.1


From 41e9b63b35b52cf918a4ffdb8d77862ab824aa8b Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Fri, 6 Jan 2006 00:11:13 -0800
Subject: [PATCH] mm: pfault optimisation

This atomic operation is superfluous: the pte will be added with the
referenced bit set, and the page will be referenced through this mapping after
the page fault handler returns anyway.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/memory.c b/mm/memory.c
index d7ca7de..7197f9b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1977,7 +1977,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			goto release;
 		inc_mm_counter(mm, anon_rss);
 		lru_cache_add_active(page);
-		SetPageReferenced(page);
 		page_add_new_anon_rmap(page, vma, address);
 	} else {
 		/* Map the ZERO_PAGE - vm_page_prot is readonly */
-- 
cgit v1.1


From 210fe530305ee50cd889fe9250168228b2994f32 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 6 Jan 2006 00:11:14 -0800
Subject: [PATCH] vmscan: balancing fix

Revert a patch which went into 2.6.8-rc1.  The changelog for that patch was:

  The shrink_zone() logic can, under some circumstances, cause far too many
  pages to be reclaimed.  Say, we're scanning at high priority and suddenly
  hit a large number of reclaimable pages on the LRU.

  Change things so we bale out when SWAP_CLUSTER_MAX pages have been
  reclaimed.

Problem is, this change caused significant imbalance in inter-zone scan
balancing by truncating scans of larger zones.

Suppose, for example, ZONE_HIGHMEM is 10x the size of ZONE_NORMAL.  The zone
balancing algorithm would require that if we're scanning 100 pages of
ZONE_HIGHMEM, we should scan 10 pages of ZONE_NORMAL.  But this logic will
cause the scanning of ZONE_HIGHMEM to bale out after only 32 pages are
reclaimed.  Thus effectively causing smaller zones to be scanned relatively
harder than large ones.

Now I need to remember what the workload was which caused me to write this
patch originally, then fix it up in a different way...

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index b2baca7..5c8a412 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -63,9 +63,6 @@ struct scan_control {
 
 	unsigned long nr_mapped;	/* From page_state */
 
-	/* How many pages shrink_cache() should reclaim */
-	int nr_to_reclaim;
-
 	/* Ask shrink_caches, or shrink_zone to scan at this priority */
 	unsigned int priority;
 
@@ -656,7 +653,6 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
 		if (current_is_kswapd())
 			mod_page_state(kswapd_steal, nr_freed);
 		mod_page_state_zone(zone, pgsteal, nr_freed);
-		sc->nr_to_reclaim -= nr_freed;
 
 		spin_lock_irq(&zone->lru_lock);
 		/*
@@ -856,8 +852,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
 	else
 		nr_inactive = 0;
 
-	sc->nr_to_reclaim = sc->swap_cluster_max;
-
 	while (nr_active || nr_inactive) {
 		if (nr_active) {
 			sc->nr_to_scan = min(nr_active,
@@ -871,8 +865,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
 					(unsigned long)sc->swap_cluster_max);
 			nr_inactive -= sc->nr_to_scan;
 			shrink_cache(zone, sc);
-			if (sc->nr_to_reclaim <= 0)
-				break;
 		}
 	}
 
-- 
cgit v1.1


From 80bfed904c690642db9d4178950735299160950b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 6 Jan 2006 00:11:14 -0800
Subject: [PATCH] consolidate lru_add_drain() and lru_drain_cache()

Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: Rajesh Shah <rajesh.shah@intel.com>
Cc: Li Shaohua <shaohua.li@intel.com>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/swap.c | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/mm/swap.c b/mm/swap.c
index 73d3514..ee6d71c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -156,16 +156,22 @@ void fastcall lru_cache_add_active(struct page *page)
 	put_cpu_var(lru_add_active_pvecs);
 }
 
-void lru_add_drain(void)
+static void __lru_add_drain(int cpu)
 {
-	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
+	struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
 
+	/* CPU is dead, so no locking needed. */
 	if (pagevec_count(pvec))
 		__pagevec_lru_add(pvec);
-	pvec = &__get_cpu_var(lru_add_active_pvecs);
+	pvec = &per_cpu(lru_add_active_pvecs, cpu);
 	if (pagevec_count(pvec))
 		__pagevec_lru_add_active(pvec);
-	put_cpu_var(lru_add_pvecs);
+}
+
+void lru_add_drain(void)
+{
+	__lru_add_drain(get_cpu());
+	put_cpu();
 }
 
 /*
@@ -412,17 +418,6 @@ void vm_acct_memory(long pages)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void lru_drain_cache(unsigned int cpu)
-{
-	struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
-
-	/* CPU is dead, so no locking needed. */
-	if (pagevec_count(pvec))
-		__pagevec_lru_add(pvec);
-	pvec = &per_cpu(lru_add_active_pvecs, cpu);
-	if (pagevec_count(pvec))
-		__pagevec_lru_add_active(pvec);
-}
 
 /* Drop the CPU's cached committed space back into the central pool. */
 static int cpu_swap_callback(struct notifier_block *nfb,
@@ -435,7 +430,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
 	if (action == CPU_DEAD) {
 		atomic_add(*committed, &vm_committed_space);
 		*committed = 0;
-		lru_drain_cache((long)hcpu);
+		__lru_add_drain((long)hcpu);
 	}
 	return NOTIFY_OK;
 }
-- 
cgit v1.1


From f3fe65122da05e1cd4c9140340d96ea2f95d0c49 Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Fri, 6 Jan 2006 00:11:15 -0800
Subject: [PATCH] mm: add populated_zone() helper

There are numerous places we check whether a zone is populated or not.

Provide a helper function to check for populated zones and convert all
checks for zone->present_pages.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 5 +++++
 mm/page_alloc.c        | 8 ++++----
 mm/vmscan.c            | 8 ++++----
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8d6caa4..c34f4a2 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -388,6 +388,11 @@ static inline struct zone *next_zone(struct zone *zone)
 #define for_each_zone(zone) \
 	for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
 
+static inline int populated_zone(struct zone *zone)
+{
+	return (!!zone->present_pages);
+}
+
 static inline int is_highmem_idx(int idx)
 {
 	return (idx == ZONE_HIGHMEM);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b9fd2c2..8f3de5a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1358,7 +1358,7 @@ void show_free_areas(void)
 		show_node(zone);
 		printk("%s per-cpu:", zone->name);
 
-		if (!zone->present_pages) {
+		if (!populated_zone(zone)) {
 			printk(" empty\n");
 			continue;
 		} else
@@ -1435,7 +1435,7 @@ void show_free_areas(void)
 
 		show_node(zone);
 		printk("%s: ", zone->name);
-		if (!zone->present_pages) {
+		if (!populated_zone(zone)) {
 			printk("empty\n");
 			continue;
 		}
@@ -2134,7 +2134,7 @@ static int frag_show(struct seq_file *m, void *arg)
 	int order;
 
 	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
-		if (!zone->present_pages)
+		if (!populated_zone(zone))
 			continue;
 
 		spin_lock_irqsave(&zone->lock, flags);
@@ -2167,7 +2167,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
 		int i;
 
-		if (!zone->present_pages)
+		if (!populated_zone(zone))
 			continue;
 
 		spin_lock_irqsave(&zone->lock, flags);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5c8a412..7681d8e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -897,7 +897,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
 
-		if (zone->present_pages == 0)
+		if (!populated_zone(zone))
 			continue;
 
 		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
@@ -1069,7 +1069,7 @@ loop_again:
 			for (i = pgdat->nr_zones - 1; i >= 0; i--) {
 				struct zone *zone = pgdat->node_zones + i;
 
-				if (zone->present_pages == 0)
+				if (!populated_zone(zone))
 					continue;
 
 				if (zone->all_unreclaimable &&
@@ -1106,7 +1106,7 @@ scan:
 			struct zone *zone = pgdat->node_zones + i;
 			int nr_slab;
 
-			if (zone->present_pages == 0)
+			if (!populated_zone(zone))
 				continue;
 
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
@@ -1258,7 +1258,7 @@ void wakeup_kswapd(struct zone *zone, int order)
 {
 	pg_data_t *pgdat;
 
-	if (zone->present_pages == 0)
+	if (!populated_zone(zone))
 		return;
 
 	pgdat = zone->zone_pgdat;
-- 
cgit v1.1


From 1a93205bdffd9d7278d4a66081cdb48452522a58 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Fri, 6 Jan 2006 00:11:16 -0800
Subject: [PATCH] mm: simplify build_zonelists_node by removing the case
 statement.

Simplify build_zonelists_node by removing the case statement.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 34 +++++++++++-----------------------
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8f3de5a..7adc952 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1455,35 +1455,23 @@ void show_free_areas(void)
 
 /*
  * Builds allocation fallback zone lists.
+ *
+ * Add all populated zones of a node to the zonelist.
  */
-static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
-{
-	switch (k) {
-		struct zone *zone;
-	default:
-		BUG();
-	case ZONE_HIGHMEM:
-		zone = pgdat->node_zones + ZONE_HIGHMEM;
-		if (zone->present_pages) {
+static int __init build_zonelists_node(pg_data_t *pgdat,
+			struct zonelist *zonelist, int j, int k)
+{
+	struct zone *zone;
+
+	BUG_ON(k > ZONE_HIGHMEM);
+	for (zone = pgdat->node_zones + k; zone >= pgdat->node_zones; zone--) {
+		if (populated_zone(zone)) {
 #ifndef CONFIG_HIGHMEM
-			BUG();
+			BUG_ON(zone - pgdat->node_zones > ZONE_NORMAL);
 #endif
 			zonelist->zones[j++] = zone;
 		}
-	case ZONE_NORMAL:
-		zone = pgdat->node_zones + ZONE_NORMAL;
-		if (zone->present_pages)
-			zonelist->zones[j++] = zone;
-	case ZONE_DMA32:
-		zone = pgdat->node_zones + ZONE_DMA32;
-		if (zone->present_pages)
-			zonelist->zones[j++] = zone;
-	case ZONE_DMA:
-		zone = pgdat->node_zones + ZONE_DMA;
-		if (zone->present_pages)
-			zonelist->zones[j++] = zone;
 	}
-
 	return j;
 }
 
-- 
cgit v1.1


From 4be38e351c5f455f6f490f5aff29053e33ab4f99 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Fri, 6 Jan 2006 00:11:17 -0800
Subject: [PATCH] mm: move determination of policy_zone into page allocator

Currently the function to build a zonelist for a BIND policy has the side
effect of setting policy_zone.  This seems a bit strange.  policy_zone does
not seem to be initialized elsewhere and is therefore 0.  Do we police
ZONE_DMA if no bind policy has been used yet?

This patch moves the determination of the zone to apply policies to into
the page allocator.  We determine the zone while building the zonelist for
nodes.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h | 11 +++++++++++
 mm/mempolicy.c            | 15 +++------------
 mm/page_alloc.c           |  2 ++
 3 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index b972f98..ed00b27 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -151,6 +151,14 @@ extern struct mempolicy default_policy;
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 		unsigned long addr);
 
+extern int policy_zone;
+
+static inline void check_highest_zone(int k)
+{
+	if (k > policy_zone)
+		policy_zone = k;
+}
+
 #else
 
 struct mempolicy {};
@@ -221,6 +229,9 @@ static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 	return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER);
 }
 
+static inline void check_highest_zone(int k)
+{
+}
 #endif /* CONFIG_NUMA */
 #endif /* __KERNEL__ */
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 96714e2..0f1d2b8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -93,7 +93,7 @@ static kmem_cache_t *sn_cache;
 
 /* Highest zone. An specific allocation for a zone below that is not
    policied. */
-static int policy_zone;
+int policy_zone = ZONE_DMA;
 
 struct mempolicy default_policy = {
 	.refcnt = ATOMIC_INIT(1), /* never free it */
@@ -131,17 +131,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 	if (!zl)
 		return NULL;
 	num = 0;
-	for_each_node_mask(nd, *nodes) {
-		int k;
-		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
-			struct zone *z = &NODE_DATA(nd)->node_zones[k];
-			if (!z->present_pages)
-				continue;
-			zl->zones[num++] = z;
-			if (k > policy_zone)
-				policy_zone = k;
-		}
-	}
+	for_each_node_mask(nd, *nodes)
+		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
 	zl->zones[num] = NULL;
 	return zl;
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7adc952..512e3f4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -36,6 +36,7 @@
 #include <linux/memory_hotplug.h>
 #include <linux/nodemask.h>
 #include <linux/vmalloc.h>
+#include <linux/mempolicy.h>
 
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -1470,6 +1471,7 @@ static int __init build_zonelists_node(pg_data_t *pgdat,
 			BUG_ON(zone - pgdat->node_zones > ZONE_NORMAL);
 #endif
 			zonelist->zones[j++] = zone;
+			check_highest_zone(k);
 		}
 	}
 	return j;
-- 
cgit v1.1


From 02a68a5ebc7dd823da7496116f42290103e1e4a9 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Fri, 6 Jan 2006 00:11:18 -0800
Subject: [PATCH] Fix zone policy determination

The use of k in the inner loop means that the highest zone nr is always used
if any zone of a node is populated.  This means that the policy zone is not
correctly determined on arches that do not use HIGHMEM, like ia64.

Change the loop to decrement k which also simplifies the BUG_ON.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 512e3f4..ca97899 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1465,15 +1465,19 @@ static int __init build_zonelists_node(pg_data_t *pgdat,
 	struct zone *zone;
 
 	BUG_ON(k > ZONE_HIGHMEM);
-	for (zone = pgdat->node_zones + k; zone >= pgdat->node_zones; zone--) {
+
+	do {
+		zone = pgdat->node_zones + k;
 		if (populated_zone(zone)) {
 #ifndef CONFIG_HIGHMEM
-			BUG_ON(zone - pgdat->node_zones > ZONE_NORMAL);
+			BUG_ON(k > ZONE_NORMAL);
 #endif
 			zonelist->zones[j++] = zone;
 			check_highest_zone(k);
 		}
-	}
+		k--;
+
+	} while (k >= 0);
 	return j;
 }
 
-- 
cgit v1.1


From 070f80326a215d8e6c4fd6f175e28eb446c492bc Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Fri, 6 Jan 2006 00:11:19 -0800
Subject: [PATCH] build_zonelists_node(): rename args

Give j and k meaningful names.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ca97899..7f58077 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1460,25 +1460,25 @@ void show_free_areas(void)
  * Add all populated zones of a node to the zonelist.
  */
 static int __init build_zonelists_node(pg_data_t *pgdat,
-			struct zonelist *zonelist, int j, int k)
+			struct zonelist *zonelist, int nr_zones, int zone_type)
 {
 	struct zone *zone;
 
-	BUG_ON(k > ZONE_HIGHMEM);
+	BUG_ON(zone_type > ZONE_HIGHMEM);
 
 	do {
-		zone = pgdat->node_zones + k;
+		zone = pgdat->node_zones + zone_type;
 		if (populated_zone(zone)) {
 #ifndef CONFIG_HIGHMEM
-			BUG_ON(k > ZONE_NORMAL);
+			BUG_ON(zone_type > ZONE_NORMAL);
 #endif
-			zonelist->zones[j++] = zone;
-			check_highest_zone(k);
+			zonelist->zones[nr_zones++] = zone;
+			check_highest_zone(zone_type);
 		}
-		k--;
+		zone_type--;
 
-	} while (k >= 0);
-	return j;
+	} while (zone_type >= 0);
+	return nr_zones;
 }
 
 static inline int highest_zone(int zone_bits)
-- 
cgit v1.1


From d3cb487149bd706aa6aeb02042332a450978dc1c Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Fri, 6 Jan 2006 00:11:20 -0800
Subject: [PATCH] atomic_long_t & include/asm-generic/atomic.h V2

Several counters already need to use 64 bit atomic variables on 64 bit
platforms (see mm_counter_t in sched.h).  We have to do ugly ifdefs to fall
back to 32 bit atomics on 32 bit platforms.

The VM statistics patch that I am working on will also make more extensive
use of atomic64.

This patch introduces a new type atomic_long_t by providing definitions in
asm-generic/atomic.h that works similarly to the C "long" type.  It is 32 bits
on 32 bit platforms and 64 bits on 64 bit platforms.

Also cleans up the determination of the mm_counter_t in sched.h.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-alpha/atomic.h     |   1 +
 include/asm-arm/atomic.h       |   1 +
 include/asm-arm26/atomic.h     |   1 +
 include/asm-cris/atomic.h      |   1 +
 include/asm-frv/atomic.h       |   1 +
 include/asm-generic/atomic.h   | 116 +++++++++++++++++++++++++++++++++++++++++
 include/asm-h8300/atomic.h     |   1 +
 include/asm-i386/atomic.h      |   1 +
 include/asm-ia64/atomic.h      |   1 +
 include/asm-m32r/atomic.h      |   1 +
 include/asm-m68k/atomic.h      |   1 +
 include/asm-m68knommu/atomic.h |   1 +
 include/asm-mips/atomic.h      |   1 +
 include/asm-parisc/atomic.h    |   1 +
 include/asm-powerpc/atomic.h   |   1 +
 include/asm-s390/atomic.h      |   1 +
 include/asm-sh/atomic.h        |   1 +
 include/asm-sh64/atomic.h      |   1 +
 include/asm-sparc/atomic.h     |   1 +
 include/asm-sparc64/atomic.h   |   1 +
 include/asm-v850/atomic.h      |   1 +
 include/asm-x86_64/atomic.h    |   1 +
 include/asm-xtensa/atomic.h    |   1 +
 include/linux/sched.h          |  25 +++------
 24 files changed, 144 insertions(+), 19 deletions(-)
 create mode 100644 include/asm-generic/atomic.h

diff --git a/include/asm-alpha/atomic.h b/include/asm-alpha/atomic.h
index 6183eab..cb03bbe 100644
--- a/include/asm-alpha/atomic.h
+++ b/include/asm-alpha/atomic.h
@@ -216,4 +216,5 @@ static __inline__ long atomic64_sub_return(long i, atomic64_t * v)
 #define smp_mb__before_atomic_inc()	smp_mb()
 #define smp_mb__after_atomic_inc()	smp_mb()
 
+#include <asm-generic/atomic.h>
 #endif /* _ALPHA_ATOMIC_H */
diff --git a/include/asm-arm/atomic.h b/include/asm-arm/atomic.h
index d586f65..f72b633 100644
--- a/include/asm-arm/atomic.h
+++ b/include/asm-arm/atomic.h
@@ -205,5 +205,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif
 #endif
diff --git a/include/asm-arm26/atomic.h b/include/asm-arm26/atomic.h
index a47cadc..3074b0e 100644
--- a/include/asm-arm26/atomic.h
+++ b/include/asm-arm26/atomic.h
@@ -118,5 +118,6 @@ static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif
 #endif
diff --git a/include/asm-cris/atomic.h b/include/asm-cris/atomic.h
index 683b05a..2df2c7a 100644
--- a/include/asm-cris/atomic.h
+++ b/include/asm-cris/atomic.h
@@ -156,4 +156,5 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 #define smp_mb__before_atomic_inc()    barrier()
 #define smp_mb__after_atomic_inc()     barrier()
 
+#include <asm-generic/atomic.h>
 #endif
diff --git a/include/asm-frv/atomic.h b/include/asm-frv/atomic.h
index f6539ff..3f54fea 100644
--- a/include/asm-frv/atomic.h
+++ b/include/asm-frv/atomic.h
@@ -426,4 +426,5 @@ extern uint32_t __cmpxchg_32(uint32_t *v, uint32_t test, uint32_t new);
 })
 #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
 
+#include <asm-generic/atomic.h>
 #endif /* _ASM_ATOMIC_H */
diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
new file mode 100644
index 0000000..e0a28b9
--- /dev/null
+++ b/include/asm-generic/atomic.h
@@ -0,0 +1,116 @@
+#ifndef _ASM_GENERIC_ATOMIC_H
+#define _ASM_GENERIC_ATOMIC_H
+/*
+ * Copyright (C) 2005 Silicon Graphics, Inc.
+ *	Christoph Lameter <clameter@sgi.com>
+ *
+ * Allows arch independent atomic definitions to be provided without the
+ * need to edit all arch specific atomic.h files.
+ */
+
+
+/*
+ * Support for atomic_long_t
+ *
+ * Casts for parameters are avoided for existing atomic functions in order to
+ * avoid issues with cast-as-lval under gcc 4.x and other limitations that the
+ * macros of a platform may have.
+ */
+
+#if BITS_PER_LONG == 64
+
+typedef atomic64_t atomic_long_t;
+
+#define ATOMIC_LONG_INIT(i)	ATOMIC64_INIT(i)
+
+static inline long atomic_long_read(atomic_long_t *l)
+{
+	atomic64_t *v = (atomic64_t *)l;
+
+	return (long)atomic64_read(v);
+}
+
+static inline void atomic_long_set(atomic_long_t *l, long i)
+{
+	atomic64_t *v = (atomic64_t *)l;
+
+	atomic64_set(v, i);	/* 64-bit store; atomic_set() is the atomic_t (int) op */
+}
+
+static inline void atomic_long_inc(atomic_long_t *l)
+{
+	atomic64_t *v = (atomic64_t *)l;
+
+	atomic64_inc(v);
+}
+
+static inline void atomic_long_dec(atomic_long_t *l)
+{
+	atomic64_t *v = (atomic64_t *)l;
+
+	atomic64_dec(v);
+}
+
+static inline void atomic_long_add(long i, atomic_long_t *l)
+{
+	atomic64_t *v = (atomic64_t *)l;
+
+	atomic64_add(i, v);
+}
+
+static inline void atomic_long_sub(long i, atomic_long_t *l)
+{
+	atomic64_t *v = (atomic64_t *)l;
+
+	atomic64_sub(i, v);
+}
+
+#else
+
+typedef atomic_t atomic_long_t;
+
+#define ATOMIC_LONG_INIT(i)	ATOMIC_INIT(i)
+static inline long atomic_long_read(atomic_long_t *l)
+{
+	atomic_t *v = (atomic_t *)l;
+
+	return (long)atomic_read(v);
+}
+
+static inline void atomic_long_set(atomic_long_t *l, long i)
+{
+	atomic_t *v = (atomic_t *)l;
+
+	atomic_set(v, i);
+}
+
+static inline void atomic_long_inc(atomic_long_t *l)
+{
+	atomic_t *v = (atomic_t *)l;
+
+	atomic_inc(v);
+}
+
+static inline void atomic_long_dec(atomic_long_t *l)
+{
+	atomic_t *v = (atomic_t *)l;
+
+	atomic_dec(v);
+}
+
+static inline void atomic_long_add(long i, atomic_long_t *l)
+{
+	atomic_t *v = (atomic_t *)l;
+
+	atomic_add(i, v);
+}
+
+static inline void atomic_long_sub(long i, atomic_long_t *l)
+{
+	atomic_t *v = (atomic_t *)l;
+
+	atomic_sub(i, v);
+}
+
+#endif
+#endif
diff --git a/include/asm-h8300/atomic.h b/include/asm-h8300/atomic.h
index f23d868..d891541 100644
--- a/include/asm-h8300/atomic.h
+++ b/include/asm-h8300/atomic.h
@@ -137,4 +137,5 @@ static __inline__ void atomic_set_mask(unsigned long mask, unsigned long *v)
 #define smp_mb__before_atomic_inc()    barrier()
 #define smp_mb__after_atomic_inc() barrier()
 
+#include <asm-generic/atomic.h>
 #endif /* __ARCH_H8300_ATOMIC __ */
diff --git a/include/asm-i386/atomic.h b/include/asm-i386/atomic.h
index c68557a..7a5472d 100644
--- a/include/asm-i386/atomic.h
+++ b/include/asm-i386/atomic.h
@@ -254,4 +254,5 @@ __asm__ __volatile__(LOCK "orl %0,%1" \
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif
diff --git a/include/asm-ia64/atomic.h b/include/asm-ia64/atomic.h
index 2fbebf8..15cf798 100644
--- a/include/asm-ia64/atomic.h
+++ b/include/asm-ia64/atomic.h
@@ -192,4 +192,5 @@ atomic64_add_negative (__s64 i, atomic64_t *v)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif /* _ASM_IA64_ATOMIC_H */
diff --git a/include/asm-m32r/atomic.h b/include/asm-m32r/atomic.h
index ef1fb8e..7076127 100644
--- a/include/asm-m32r/atomic.h
+++ b/include/asm-m32r/atomic.h
@@ -313,4 +313,5 @@ static __inline__ void atomic_set_mask(unsigned long  mask, atomic_t *addr)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif	/* _ASM_M32R_ATOMIC_H */
diff --git a/include/asm-m68k/atomic.h b/include/asm-m68k/atomic.h
index e3c962e..b8a4e75 100644
--- a/include/asm-m68k/atomic.h
+++ b/include/asm-m68k/atomic.h
@@ -157,4 +157,5 @@ static inline void atomic_set_mask(unsigned long mask, unsigned long *v)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif /* __ARCH_M68K_ATOMIC __ */
diff --git a/include/asm-m68knommu/atomic.h b/include/asm-m68knommu/atomic.h
index 3c1cc15..1702dbe 100644
--- a/include/asm-m68knommu/atomic.h
+++ b/include/asm-m68knommu/atomic.h
@@ -143,4 +143,5 @@ static inline int atomic_sub_return(int i, atomic_t * v)
 #define atomic_dec_return(v) atomic_sub_return(1,(v))
 #define atomic_inc_return(v) atomic_add_return(1,(v))
 
+#include <asm-generic/atomic.h>
 #endif /* __ARCH_M68KNOMMU_ATOMIC __ */
diff --git a/include/asm-mips/atomic.h b/include/asm-mips/atomic.h
index 55c37c1..92256e4 100644
--- a/include/asm-mips/atomic.h
+++ b/include/asm-mips/atomic.h
@@ -713,4 +713,5 @@ static __inline__ long atomic64_sub_if_positive(long i, atomic64_t * v)
 #define smp_mb__before_atomic_inc()	smp_mb()
 #define smp_mb__after_atomic_inc()	smp_mb()
 
+#include <asm-generic/atomic.h>
 #endif /* _ASM_ATOMIC_H */
diff --git a/include/asm-parisc/atomic.h b/include/asm-parisc/atomic.h
index 983e9a2..64ebd08 100644
--- a/include/asm-parisc/atomic.h
+++ b/include/asm-parisc/atomic.h
@@ -216,4 +216,5 @@ static __inline__ int atomic_read(const atomic_t *v)
 #define smp_mb__before_atomic_inc()	smp_mb()
 #define smp_mb__after_atomic_inc()	smp_mb()
 
+#include <asm-generic/atomic.h>
 #endif
diff --git a/include/asm-powerpc/atomic.h b/include/asm-powerpc/atomic.h
index ec4b144..ae395a0 100644
--- a/include/asm-powerpc/atomic.h
+++ b/include/asm-powerpc/atomic.h
@@ -402,5 +402,6 @@ static __inline__ long atomic64_dec_if_positive(atomic64_t *v)
 
 #endif /* __powerpc64__ */
 
+#include <asm-generic/atomic.h>
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_ATOMIC_H_ */
diff --git a/include/asm-s390/atomic.h b/include/asm-s390/atomic.h
index b3bd4f6..6d07c7d 100644
--- a/include/asm-s390/atomic.h
+++ b/include/asm-s390/atomic.h
@@ -215,5 +215,6 @@ atomic_compare_and_swap(int expected_oldval,int new_val,atomic_t *v)
 #define smp_mb__before_atomic_inc()	smp_mb()
 #define smp_mb__after_atomic_inc()	smp_mb()
 
+#include <asm-generic/atomic.h>
 #endif /* __KERNEL__ */
 #endif /* __ARCH_S390_ATOMIC__  */
diff --git a/include/asm-sh/atomic.h b/include/asm-sh/atomic.h
index aabfd33..618d8e0 100644
--- a/include/asm-sh/atomic.h
+++ b/include/asm-sh/atomic.h
@@ -140,4 +140,5 @@ static __inline__ void atomic_set_mask(unsigned int mask, atomic_t *v)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif /* __ASM_SH_ATOMIC_H */
diff --git a/include/asm-sh64/atomic.h b/include/asm-sh64/atomic.h
index 927a2bc..f3ce5c0 100644
--- a/include/asm-sh64/atomic.h
+++ b/include/asm-sh64/atomic.h
@@ -152,4 +152,5 @@ static __inline__ void atomic_set_mask(unsigned int mask, atomic_t *v)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif /* __ASM_SH64_ATOMIC_H */
diff --git a/include/asm-sparc/atomic.h b/include/asm-sparc/atomic.h
index 62bec7a..accb496 100644
--- a/include/asm-sparc/atomic.h
+++ b/include/asm-sparc/atomic.h
@@ -159,4 +159,5 @@ static inline int __atomic24_sub(int i, atomic24_t *v)
 
 #endif /* !(__KERNEL__) */
 
+#include <asm-generic/atomic.h>
 #endif /* !(__ARCH_SPARC_ATOMIC__) */
diff --git a/include/asm-sparc64/atomic.h b/include/asm-sparc64/atomic.h
index 3789fe3..11f5aa5 100644
--- a/include/asm-sparc64/atomic.h
+++ b/include/asm-sparc64/atomic.h
@@ -96,4 +96,5 @@ extern int atomic64_sub_ret(int, atomic64_t *);
 #define smp_mb__after_atomic_inc()	barrier()
 #endif
 
+#include <asm-generic/atomic.h>
 #endif /* !(__ARCH_SPARC64_ATOMIC__) */
diff --git a/include/asm-v850/atomic.h b/include/asm-v850/atomic.h
index bede317..f5b9ab6 100644
--- a/include/asm-v850/atomic.h
+++ b/include/asm-v850/atomic.h
@@ -126,4 +126,5 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif /* __V850_ATOMIC_H__ */
diff --git a/include/asm-x86_64/atomic.h b/include/asm-x86_64/atomic.h
index 50db9f3..72eb071 100644
--- a/include/asm-x86_64/atomic.h
+++ b/include/asm-x86_64/atomic.h
@@ -424,4 +424,5 @@ __asm__ __volatile__(LOCK "orl %0,%1" \
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif
diff --git a/include/asm-xtensa/atomic.h b/include/asm-xtensa/atomic.h
index 3670cc7..e2ce06b 100644
--- a/include/asm-xtensa/atomic.h
+++ b/include/asm-xtensa/atomic.h
@@ -286,6 +286,7 @@ static inline void atomic_set_mask(unsigned int mask, atomic_t *v)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif /* __KERNEL__ */
 
 #endif /* _XTENSA_ATOMIC_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b0ad6f3..7da3361 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -254,25 +254,12 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
  * The mm counters are not protected by its page_table_lock,
  * so must be incremented atomically.
  */
-#ifdef ATOMIC64_INIT
-#define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value)
-#define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member))
-#define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member)
-#define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member)
-#define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member)
-typedef atomic64_t mm_counter_t;
-#else /* !ATOMIC64_INIT */
-/*
- * The counters wrap back to 0 at 2^32 * PAGE_SIZE,
- * that is, at 16TB if using 4kB page size.
- */
-#define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value)
-#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member))
-#define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member)
-#define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member)
-#define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member)
-typedef atomic_t mm_counter_t;
-#endif /* !ATOMIC64_INIT */
+#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
+#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
+#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
+#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
+#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
+typedef atomic_long_t mm_counter_t;
 
 #else  /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
 /*
-- 
cgit v1.1


From a74609fafa2e5cc31d558012abaaa55ec9ad9da4 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Fri, 6 Jan 2006 00:11:20 -0800
Subject: [PATCH] mm: page_state opt

Optimise page_state manipulations by introducing interrupt unsafe accessors
to page_state fields.  Callers must provide their own locking (either
disable interrupts or not update from interrupt context).

Switch over the hot callsites that can easily be moved under interrupts off
sections.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/page-flags.h | 43 ++++++++++++++++------
 mm/page_alloc.c            | 89 ++++++++++++++++++++++++++--------------------
 mm/rmap.c                  | 10 ++++--
 mm/vmscan.c                | 27 +++++++-------
 4 files changed, 104 insertions(+), 65 deletions(-)

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 32d09c8..dede8d4 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -144,22 +144,33 @@ struct page_state {
 extern void get_page_state(struct page_state *ret);
 extern void get_page_state_node(struct page_state *ret, int node);
 extern void get_full_page_state(struct page_state *ret);
-extern unsigned long __read_page_state(unsigned long offset);
-extern void __mod_page_state(unsigned long offset, unsigned long delta);
+extern unsigned long read_page_state_offset(unsigned long offset);
+extern void mod_page_state_offset(unsigned long offset, unsigned long delta);
+extern void __mod_page_state_offset(unsigned long offset, unsigned long delta);
 
 #define read_page_state(member) \
-	__read_page_state(offsetof(struct page_state, member))
+	read_page_state_offset(offsetof(struct page_state, member))
 
 #define mod_page_state(member, delta)	\
-	__mod_page_state(offsetof(struct page_state, member), (delta))
+	mod_page_state_offset(offsetof(struct page_state, member), (delta))
 
-#define inc_page_state(member)	mod_page_state(member, 1UL)
-#define dec_page_state(member)	mod_page_state(member, 0UL - 1)
-#define add_page_state(member,delta) mod_page_state(member, (delta))
-#define sub_page_state(member,delta) mod_page_state(member, 0UL - (delta))
+#define __mod_page_state(member, delta)	\
+	__mod_page_state_offset(offsetof(struct page_state, member), (delta))
 
-#define mod_page_state_zone(zone, member, delta)			\
- do {									\
+#define inc_page_state(member)		mod_page_state(member, 1UL)
+#define dec_page_state(member)		mod_page_state(member, 0UL - 1)
+#define add_page_state(member,delta)	mod_page_state(member, (delta))
+#define sub_page_state(member,delta)	mod_page_state(member, 0UL - (delta))
+
+#define __inc_page_state(member)	__mod_page_state(member, 1UL)
+#define __dec_page_state(member)	__mod_page_state(member, 0UL - 1)
+#define __add_page_state(member,delta)	__mod_page_state(member, (delta))
+#define __sub_page_state(member,delta)	__mod_page_state(member, 0UL - (delta))
+
+#define page_state(member) (*__page_state(offsetof(struct page_state, member)))
+
+#define state_zone_offset(zone, member)					\
+({									\
 	unsigned offset;						\
 	if (is_highmem(zone))						\
 		offset = offsetof(struct page_state, member##_high);	\
@@ -169,7 +180,17 @@ extern void __mod_page_state(unsigned long offset, unsigned long delta);
 		offset = offsetof(struct page_state, member##_dma32);	\
 	else								\
 		offset = offsetof(struct page_state, member##_dma);	\
-	__mod_page_state(offset, (delta));				\
+	offset;								\
+})
+
+#define __mod_page_state_zone(zone, member, delta)			\
+ do {									\
+	__mod_page_state_offset(state_zone_offset(zone, member), (delta)); \
+ } while (0)
+
+#define mod_page_state_zone(zone, member, delta)			\
+ do {									\
+	mod_page_state_offset(state_zone_offset(zone, member), (delta)); \
  } while (0)
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7f58077..fd47494 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -424,9 +424,9 @@ void __free_pages_ok(struct page *page, unsigned int order)
 		return;
 
 	list_add(&page->lru, &list);
-	mod_page_state(pgfree, 1 << order);
 	kernel_map_pages(page, 1<<order, 0);
 	local_irq_save(flags);
+	__mod_page_state(pgfree, 1 << order);
 	free_pages_bulk(page_zone(page), 1, &list, order);
 	local_irq_restore(flags);
 }
@@ -674,18 +674,14 @@ void drain_local_pages(void)
 }
 #endif /* CONFIG_PM */
 
-static void zone_statistics(struct zonelist *zonelist, struct zone *z)
+static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
 {
 #ifdef CONFIG_NUMA
-	unsigned long flags;
-	int cpu;
 	pg_data_t *pg = z->zone_pgdat;
 	pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
 	struct per_cpu_pageset *p;
 
-	local_irq_save(flags);
-	cpu = smp_processor_id();
-	p = zone_pcp(z,cpu);
+	p = zone_pcp(z, cpu);
 	if (pg == orig) {
 		p->numa_hit++;
 	} else {
@@ -696,7 +692,6 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
 		p->local_node++;
 	else
 		p->other_node++;
-	local_irq_restore(flags);
 #endif
 }
 
@@ -716,11 +711,11 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
 	if (free_pages_check(page))
 		return;
 
-	inc_page_state(pgfree);
 	kernel_map_pages(page, 1, 0);
 
 	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 	local_irq_save(flags);
+	__inc_page_state(pgfree);
 	list_add(&page->lru, &pcp->list);
 	pcp->count++;
 	if (pcp->count >= pcp->high)
@@ -753,49 +748,58 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
  * or two.
  */
-static struct page *
-buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
+static struct page *buffered_rmqueue(struct zonelist *zonelist,
+			struct zone *zone, int order, gfp_t gfp_flags)
 {
 	unsigned long flags;
 	struct page *page;
 	int cold = !!(gfp_flags & __GFP_COLD);
+	int cpu;
 
 again:
+	cpu  = get_cpu();
 	if (order == 0) {
 		struct per_cpu_pages *pcp;
 
-		page = NULL;
-		pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
+		pcp = &zone_pcp(zone, cpu)->pcp[cold];
 		local_irq_save(flags);
-		if (!pcp->count)
+		if (!pcp->count) {
 			pcp->count += rmqueue_bulk(zone, 0,
 						pcp->batch, &pcp->list);
-		if (likely(pcp->count)) {
-			page = list_entry(pcp->list.next, struct page, lru);
-			list_del(&page->lru);
-			pcp->count--;
+			if (unlikely(!pcp->count))
+				goto failed;
 		}
-		local_irq_restore(flags);
-		put_cpu();
+		page = list_entry(pcp->list.next, struct page, lru);
+		list_del(&page->lru);
+		pcp->count--;
 	} else {
 		spin_lock_irqsave(&zone->lock, flags);
 		page = __rmqueue(zone, order);
-		spin_unlock_irqrestore(&zone->lock, flags);
+		spin_unlock(&zone->lock);
+		if (!page)
+			goto failed;
 	}
 
-	if (page != NULL) {
-		BUG_ON(bad_range(zone, page));
-		mod_page_state_zone(zone, pgalloc, 1 << order);
-		if (prep_new_page(page, order))
-			goto again;
+	__mod_page_state_zone(zone, pgalloc, 1 << order);
+	zone_statistics(zonelist, zone, cpu);
+	local_irq_restore(flags);
+	put_cpu();
 
-		if (gfp_flags & __GFP_ZERO)
-			prep_zero_page(page, order, gfp_flags);
+	BUG_ON(bad_range(zone, page));
+	if (prep_new_page(page, order))
+		goto again;
 
-		if (order && (gfp_flags & __GFP_COMP))
-			prep_compound_page(page, order);
-	}
+	if (gfp_flags & __GFP_ZERO)
+		prep_zero_page(page, order, gfp_flags);
+
+	if (order && (gfp_flags & __GFP_COMP))
+		prep_compound_page(page, order);
 	return page;
+
+failed:
+	local_irq_restore(flags);
+	put_cpu();
+	return NULL;
 }
 
 #define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
@@ -871,9 +875,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 				continue;
 		}
 
-		page = buffered_rmqueue(*z, order, gfp_mask);
+		page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
 		if (page) {
-			zone_statistics(zonelist, *z);
 			break;
 		}
 	} while (*(++z) != NULL);
@@ -1248,7 +1251,7 @@ void get_full_page_state(struct page_state *ret)
 	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
 }
 
-unsigned long __read_page_state(unsigned long offset)
+unsigned long read_page_state_offset(unsigned long offset)
 {
 	unsigned long ret = 0;
 	int cpu;
@@ -1262,18 +1265,26 @@ unsigned long __read_page_state(unsigned long offset)
 	return ret;
 }
 
-void __mod_page_state(unsigned long offset, unsigned long delta)
+void __mod_page_state_offset(unsigned long offset, unsigned long delta)
+{
+	void *ptr;
+
+	ptr = &__get_cpu_var(page_states);
+	*(unsigned long *)(ptr + offset) += delta;
+}
+EXPORT_SYMBOL(__mod_page_state_offset);
+
+void mod_page_state_offset(unsigned long offset, unsigned long delta)
 {
 	unsigned long flags;
-	void* ptr;
+	void *ptr;
 
 	local_irq_save(flags);
 	ptr = &__get_cpu_var(page_states);
-	*(unsigned long*)(ptr + offset) += delta;
+	*(unsigned long *)(ptr + offset) += delta;
 	local_irq_restore(flags);
 }
-
-EXPORT_SYMBOL(__mod_page_state);
+EXPORT_SYMBOL(mod_page_state_offset);
 
 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
 			unsigned long *free, struct pglist_data *pgdat)
diff --git a/mm/rmap.c b/mm/rmap.c
index 4107f64..6f3f7db 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -451,7 +451,11 @@ static void __page_set_anon_rmap(struct page *page,
 
 	page->index = linear_page_index(vma, address);
 
-	inc_page_state(nr_mapped);
+	/*
+	 * nr_mapped state can be updated without turning off
+	 * interrupts because it is not modified via interrupt.
+	 */
+	__inc_page_state(nr_mapped);
 }
 
 /**
@@ -498,7 +502,7 @@ void page_add_file_rmap(struct page *page)
 	BUG_ON(!pfn_valid(page_to_pfn(page)));
 
 	if (atomic_inc_and_test(&page->_mapcount))
-		inc_page_state(nr_mapped);
+		__inc_page_state(nr_mapped);
 }
 
 /**
@@ -522,7 +526,7 @@ void page_remove_rmap(struct page *page)
 		 */
 		if (page_test_and_clear_dirty(page))
 			set_page_dirty(page);
-		dec_page_state(nr_mapped);
+		__dec_page_state(nr_mapped);
 	}
 }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7681d8e..be8235f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -645,16 +645,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
 			goto done;
 
 		max_scan -= nr_scan;
-		if (current_is_kswapd())
-			mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
-		else
-			mod_page_state_zone(zone, pgscan_direct, nr_scan);
 		nr_freed = shrink_list(&page_list, sc);
-		if (current_is_kswapd())
-			mod_page_state(kswapd_steal, nr_freed);
-		mod_page_state_zone(zone, pgsteal, nr_freed);
 
-		spin_lock_irq(&zone->lru_lock);
+		local_irq_disable();
+		if (current_is_kswapd()) {
+			__mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
+			__mod_page_state(kswapd_steal, nr_freed);
+		} else
+			__mod_page_state_zone(zone, pgscan_direct, nr_scan);
+		__mod_page_state_zone(zone, pgsteal, nr_freed);
+
+		spin_lock(&zone->lru_lock);
 		/*
 		 * Put back any unfreeable pages.
 		 */
@@ -816,11 +817,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 		}
 	}
 	zone->nr_active += pgmoved;
-	spin_unlock_irq(&zone->lru_lock);
-	pagevec_release(&pvec);
+	spin_unlock(&zone->lru_lock);
+
+	__mod_page_state_zone(zone, pgrefill, pgscanned);
+	__mod_page_state(pgdeactivate, pgdeactivate);
+	local_irq_enable();
 
-	mod_page_state_zone(zone, pgrefill, pgscanned);
-	mod_page_state(pgdeactivate, pgdeactivate);
+	pagevec_release(&pvec);
 }
 
 /*
-- 
cgit v1.1


From b09eb1c06a14641209e6b86e9a5b28ea8287f193 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Fri, 6 Jan 2006 00:11:21 -0800
Subject: [PATCH] mm: page_state opt docs

Comment the new locking rules for page_state statistics.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/page-flags.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index dede8d4..d52999c 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -79,13 +79,23 @@
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
  * allowed.
+ *
+ * - Fields can be modified with xxx_page_state and xxx_page_state_zone at
+ * any time safely (which protects the instance from modification by
+ * interrupt).
+ * - The __xxx_page_state variants can be used safely when interrupts are
+ * disabled.
+ * - The __xxx_page_state variants can be used if the field is only
+ * modified from process context, or only modified from interrupt context.
+ * In this case, the field should be commented here.
  */
 struct page_state {
 	unsigned long nr_dirty;		/* Dirty writeable pages */
 	unsigned long nr_writeback;	/* Pages under writeback */
 	unsigned long nr_unstable;	/* NFS unstable pages */
 	unsigned long nr_page_table_pages;/* Pages used for pagetables */
-	unsigned long nr_mapped;	/* mapped into pagetables */
+	unsigned long nr_mapped;	/* mapped into pagetables.
+					 * only modified from process context */
 	unsigned long nr_slab;		/* In slab */
 #define GET_PAGE_STATE_LAST nr_slab
 
-- 
cgit v1.1


From 6e20a64a3913819133fefeca466211c7eb8adda1 Mon Sep 17 00:00:00 2001
From: Nicolas Kaiser <nikai@nikai.net>
Date: Fri, 6 Jan 2006 00:11:22 -0800
Subject: [PATCH] selinux: ARRAY_SIZE cleanups

Use ARRAY_SIZE macro instead of sizeof(x)/sizeof(x[0]).

Signed-off-by: Nicolas Kaiser <nikai@nikai.net>
Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 security/selinux/selinuxfs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 0e1352a..e59da63 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -376,7 +376,7 @@ static ssize_t selinux_transaction_write(struct file *file, const char __user *b
 	char *data;
 	ssize_t rv;
 
-	if (ino >= sizeof(write_op)/sizeof(write_op[0]) || !write_op[ino])
+	if (ino >= ARRAY_SIZE(write_op) || !write_op[ino])
 		return -EINVAL;
 
 	data = simple_transaction_get(file, buf, size);
@@ -1161,7 +1161,7 @@ static int sel_make_avc_files(struct dentry *dir)
 #endif
 	};
 
-	for (i = 0; i < sizeof (files) / sizeof (files[0]); i++) {
+	for (i = 0; i < ARRAY_SIZE(files); i++) {
 		struct inode *inode;
 		struct dentry *dentry;
 
-- 
cgit v1.1


From 32725ad8430b58e42c5d54757ce7871e680d05cb Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@nuerscht.ch>
Date: Fri, 6 Jan 2006 00:11:23 -0800
Subject: [PATCH] selinux: more ARRAY_SIZE cleanups

Further ARRAY_SIZE cleanups under security/selinux.

Signed-off-by: Tobias Klauser <tklauser@nuerscht.ch>
Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 security/selinux/ss/avtab.c    | 2 +-
 security/selinux/ss/policydb.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c
index dde094f..d049c7a 100644
--- a/security/selinux/ss/avtab.c
+++ b/security/selinux/ss/avtab.c
@@ -359,7 +359,7 @@ int avtab_read_item(void *fp, u32 vers, struct avtab *a,
 			return -1;
 		}
 
-		for (i = 0; i < sizeof(spec_order)/sizeof(u16); i++) {
+		for (i = 0; i < ARRAY_SIZE(spec_order); i++) {
 			if (val & spec_order[i]) {
 				key.specified = spec_order[i] | enabled;
 				datum.data = le32_to_cpu(buf32[items++]);
diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c
index 0ac311d..0111990 100644
--- a/security/selinux/ss/policydb.c
+++ b/security/selinux/ss/policydb.c
@@ -103,7 +103,7 @@ static struct policydb_compat_info *policydb_lookup_compat(int version)
 	int i;
 	struct policydb_compat_info *info = NULL;
 
-	for (i = 0; i < sizeof(policydb_compat)/sizeof(*info); i++) {
+	for (i = 0; i < ARRAY_SIZE(policydb_compat); i++) {
 		if (policydb_compat[i].version == version) {
 			info = &policydb_compat[i];
 			break;
-- 
cgit v1.1


From 8d9067bda99c68e1a17d93e78cf3a5a3f67e0c35 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 6 Jan 2006 00:11:24 -0800
Subject: [PATCH] Keys: Remove key duplication

Remove the key duplication stuff since there's nothing that uses it, no way
to get at it and it's awkward to deal with for LSM purposes.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/keys.txt       | 18 -------------
 include/keys/user-type.h     |  1 -
 include/linux/key.h          |  8 ------
 security/keys/key.c          | 56 +++-----------------------------------
 security/keys/keyring.c      | 64 --------------------------------------------
 security/keys/user_defined.c | 33 -----------------------
 6 files changed, 3 insertions(+), 177 deletions(-)

diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index 3115488..6304db5 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -860,24 +860,6 @@ The structure has a number of fields, some of which are mandatory:
      It is safe to sleep in this method.
 
 
- (*) int (*duplicate)(struct key *key, const struct key *source);
-
-     If this type of key can be duplicated, then this method should be
-     provided. It is called to copy the payload attached to the source into the
-     new key. The data length on the new key will have been updated and the
-     quota adjusted already.
-
-     This method will be called with the source key's semaphore read-locked to
-     prevent its payload from being changed, thus RCU constraints need not be
-     applied to the source key.
-
-     This method does not have to lock the destination key in order to attach a
-     payload. The fact that KEY_FLAG_INSTANTIATED is not set in key->flags
-     prevents anything else from gaining access to the key.
-
-     It is safe to sleep in this method.
-
-
  (*) int (*update)(struct key *key, const void *data, size_t datalen);
 
      If this type of key can be updated, then this method should be provided.
diff --git a/include/keys/user-type.h b/include/keys/user-type.h
index 26f6ec3..a3dae18 100644
--- a/include/keys/user-type.h
+++ b/include/keys/user-type.h
@@ -35,7 +35,6 @@ struct user_key_payload {
 extern struct key_type key_type_user;
 
 extern int user_instantiate(struct key *key, const void *data, size_t datalen);
-extern int user_duplicate(struct key *key, const struct key *source);
 extern int user_update(struct key *key, const void *data, size_t datalen);
 extern int user_match(const struct key *key, const void *criterion);
 extern void user_destroy(struct key *key);
diff --git a/include/linux/key.h b/include/linux/key.h
index 53513a3..4d189e5 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -193,14 +193,6 @@ struct key_type {
 	 */
 	int (*instantiate)(struct key *key, const void *data, size_t datalen);
 
-	/* duplicate a key of this type (optional)
-	 * - the source key will be locked against change
-	 * - the new description will be attached
-	 * - the quota will have been adjusted automatically from
-	 *   source->quotalen
-	 */
-	int (*duplicate)(struct key *key, const struct key *source);
-
 	/* update a key of this type (optional)
 	 * - this method should call key_payload_reserve() to recalculate the
 	 *   quota consumption
diff --git a/security/keys/key.c b/security/keys/key.c
index 01bcfec..bb03662 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -240,9 +240,9 @@ static inline void key_alloc_serial(struct key *key)
 /*
  * allocate a key of the specified type
  * - update the user's quota to reflect the existence of the key
- * - called from a key-type operation with key_types_sem read-locked by either
- *   key_create_or_update() or by key_duplicate(); this prevents unregistration
- *   of the key type
+ * - called from a key-type operation with key_types_sem read-locked by
+ *   key_create_or_update()
+ *   - this prevents unregistration of the key type
  * - upon return the key is as yet uninstantiated; the caller needs to either
  *   instantiate the key or discard it before returning
  */
@@ -889,56 +889,6 @@ EXPORT_SYMBOL(key_update);
 
 /*****************************************************************************/
 /*
- * duplicate a key, potentially with a revised description
- * - must be supported by the keytype (keyrings for instance can be duplicated)
- */
-struct key *key_duplicate(struct key *source, const char *desc)
-{
-	struct key *key;
-	int ret;
-
-	key_check(source);
-
-	if (!desc)
-		desc = source->description;
-
-	down_read(&key_types_sem);
-
-	ret = -EINVAL;
-	if (!source->type->duplicate)
-		goto error;
-
-	/* allocate and instantiate a key */
-	key = key_alloc(source->type, desc, current->fsuid, current->fsgid,
-			source->perm, 0);
-	if (IS_ERR(key))
-		goto error_k;
-
-	down_read(&source->sem);
-	ret = key->type->duplicate(key, source);
-	up_read(&source->sem);
-	if (ret < 0)
-		goto error2;
-
-	atomic_inc(&key->user->nikeys);
-	set_bit(KEY_FLAG_INSTANTIATED, &key->flags);
-
- error_k:
-	up_read(&key_types_sem);
- out:
-	return key;
-
- error2:
-	key_put(key);
- error:
-	up_read(&key_types_sem);
-	key = ERR_PTR(ret);
-	goto out;
-
-} /* end key_duplicate() */
-
-/*****************************************************************************/
-/*
  * revoke a key
  */
 void key_revoke(struct key *key)
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index 4e9fa8b..0acecbd 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -48,7 +48,6 @@ static inline unsigned keyring_hash(const char *desc)
  */
 static int keyring_instantiate(struct key *keyring,
 			       const void *data, size_t datalen);
-static int keyring_duplicate(struct key *keyring, const struct key *source);
 static int keyring_match(const struct key *keyring, const void *criterion);
 static void keyring_destroy(struct key *keyring);
 static void keyring_describe(const struct key *keyring, struct seq_file *m);
@@ -59,7 +58,6 @@ struct key_type key_type_keyring = {
 	.name		= "keyring",
 	.def_datalen	= sizeof(struct keyring_list),
 	.instantiate	= keyring_instantiate,
-	.duplicate	= keyring_duplicate,
 	.match		= keyring_match,
 	.destroy	= keyring_destroy,
 	.describe	= keyring_describe,
@@ -120,68 +118,6 @@ static int keyring_instantiate(struct key *keyring,
 
 /*****************************************************************************/
 /*
- * duplicate the list of subscribed keys from a source keyring into this one
- */
-static int keyring_duplicate(struct key *keyring, const struct key *source)
-{
-	struct keyring_list *sklist, *klist;
-	unsigned max;
-	size_t size;
-	int loop, ret;
-
-	const unsigned limit =
-		(PAGE_SIZE - sizeof(*klist)) / sizeof(struct key *);
-
-	ret = 0;
-
-	/* find out how many keys are currently linked */
-	rcu_read_lock();
-	sklist = rcu_dereference(source->payload.subscriptions);
-	max = 0;
-	if (sklist)
-		max = sklist->nkeys;
-	rcu_read_unlock();
-
-	/* allocate a new payload and stuff load with key links */
-	if (max > 0) {
-		BUG_ON(max > limit);
-
-		max = (max + 3) & ~3;
-		if (max > limit)
-			max = limit;
-
-		ret = -ENOMEM;
-		size = sizeof(*klist) + sizeof(struct key *) * max;
-		klist = kmalloc(size, GFP_KERNEL);
-		if (!klist)
-			goto error;
-
-		/* set links */
-		rcu_read_lock();
-		sklist = rcu_dereference(source->payload.subscriptions);
-
-		klist->maxkeys = max;
-		klist->nkeys = sklist->nkeys;
-		memcpy(klist->keys,
-		       sklist->keys,
-		       sklist->nkeys * sizeof(struct key *));
-
-		for (loop = klist->nkeys - 1; loop >= 0; loop--)
-			atomic_inc(&klist->keys[loop]->usage);
-
-		rcu_read_unlock();
-
-		rcu_assign_pointer(keyring->payload.subscriptions, klist);
-		ret = 0;
-	}
-
- error:
-	return ret;
-
-} /* end keyring_duplicate() */
-
-/*****************************************************************************/
-/*
  * match keyrings on their name
  */
 static int keyring_match(const struct key *keyring, const void *description)
diff --git a/security/keys/user_defined.c b/security/keys/user_defined.c
index cbda3b2..8e71895 100644
--- a/security/keys/user_defined.c
+++ b/security/keys/user_defined.c
@@ -26,7 +26,6 @@
 struct key_type key_type_user = {
 	.name		= "user",
 	.instantiate	= user_instantiate,
-	.duplicate	= user_duplicate,
 	.update		= user_update,
 	.match		= user_match,
 	.destroy	= user_destroy,
@@ -68,42 +67,10 @@ error:
 	return ret;
 
 } /* end user_instantiate() */
-
 EXPORT_SYMBOL_GPL(user_instantiate);
 
 /*****************************************************************************/
 /*
- * duplicate a user defined key
- * - both keys' semaphores are locked against further modification
- * - the new key cannot yet be accessed
- */
-int user_duplicate(struct key *key, const struct key *source)
-{
-	struct user_key_payload *upayload, *spayload;
-	int ret;
-
-	/* just copy the payload */
-	ret = -ENOMEM;
-	upayload = kmalloc(sizeof(*upayload) + source->datalen, GFP_KERNEL);
-	if (upayload) {
-		spayload = rcu_dereference(source->payload.data);
-		BUG_ON(source->datalen != spayload->datalen);
-
-		upayload->datalen = key->datalen = spayload->datalen;
-		memcpy(upayload->data, spayload->data, key->datalen);
-
-		key->payload.data = upayload;
-		ret = 0;
-	}
-
-	return ret;
-
-} /* end user_duplicate() */
-
-EXPORT_SYMBOL_GPL(user_duplicate);
-
-/*****************************************************************************/
-/*
  * dispose of the old data from an updated user defined key
  */
 static void user_update_rcu_disposal(struct rcu_head *rcu)
-- 
cgit v1.1


From 1ae8f40767a3afc6244719a2c8fbcf546767d5b0 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Fri, 6 Jan 2006 00:11:25 -0800
Subject: [PATCH] security/: possible cleanups

make needlessly global code static

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 security/keys/internal.h | 1 -
 security/keys/key.c      | 2 +-
 security/keys/keyring.c  | 2 +-
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/security/keys/internal.h b/security/keys/internal.h
index db99ed4..39cba97 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -25,7 +25,6 @@
 #define kdebug(FMT, a...)	do {} while(0)
 #endif
 
-extern struct key_type key_type_dead;
 extern struct key_type key_type_user;
 
 /*****************************************************************************/
diff --git a/security/keys/key.c b/security/keys/key.c
index bb03662..99781b7 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -36,7 +36,7 @@ static DECLARE_WORK(key_cleanup_task, key_cleanup, NULL);
 DECLARE_RWSEM(key_construction_sem);
 
 /* any key who's type gets unegistered will be re-typed to this */
-struct key_type key_type_dead = {
+static struct key_type key_type_dead = {
 	.name		= "dead",
 };
 
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index 0acecbd..5d22c03 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -68,7 +68,7 @@ struct key_type key_type_keyring = {
  * semaphore to serialise link/link calls to prevent two link calls in parallel
  * introducing a cycle
  */
-DECLARE_RWSEM(keyring_serialise_link_sem);
+static DECLARE_RWSEM(keyring_serialise_link_sem);
 
 /*****************************************************************************/
 /*
-- 
cgit v1.1


From fa57f9c2b841872ffad9d8f7b3de23d6ba33c30d Mon Sep 17 00:00:00 2001
From: Eugene Surovegin <ebs@ebshome.net>
Date: Fri, 6 Jan 2006 00:11:26 -0800
Subject: [PATCH] ppc32: remove "jumbo" member from ocp_func_emac_data

Remove the no-longer-needed "jumbo" member from ocp_func_emac_data.
Jumbo frame support is handled by PPC4xx EMAC driver internally now.

Signed-off-by: Eugene Surovegin <ebs@ebshome.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc/platforms/4xx/ibm440gx.c | 2 --
 arch/ppc/platforms/4xx/ibm440sp.c | 1 -
 include/asm-ppc/ibm_ocp.h         | 1 -
 3 files changed, 4 deletions(-)

diff --git a/arch/ppc/platforms/4xx/ibm440gx.c b/arch/ppc/platforms/4xx/ibm440gx.c
index 956f45e..d24c09e 100644
--- a/arch/ppc/platforms/4xx/ibm440gx.c
+++ b/arch/ppc/platforms/4xx/ibm440gx.c
@@ -58,7 +58,6 @@ static struct ocp_func_emac_data ibm440gx_emac2_def = {
 	.wol_irq        = 65,  		/* WOL interrupt number */
 	.mdio_idx       = -1,           /* No shared MDIO */
 	.tah_idx	= 0,		/* TAH device index */
-	.jumbo		= 1,		/* Jumbo frames supported */
 };
 
 static struct ocp_func_emac_data ibm440gx_emac3_def = {
@@ -72,7 +71,6 @@ static struct ocp_func_emac_data ibm440gx_emac3_def = {
 	.wol_irq        = 67,  		/* WOL interrupt number */
 	.mdio_idx       = -1,           /* No shared MDIO */
 	.tah_idx	= 1,		/* TAH device index */
-	.jumbo		= 1,		/* Jumbo frames supported */
 };
 OCP_SYSFS_EMAC_DATA()
 
diff --git a/arch/ppc/platforms/4xx/ibm440sp.c b/arch/ppc/platforms/4xx/ibm440sp.c
index feb17e4..71a0117 100644
--- a/arch/ppc/platforms/4xx/ibm440sp.c
+++ b/arch/ppc/platforms/4xx/ibm440sp.c
@@ -31,7 +31,6 @@ static struct ocp_func_emac_data ibm440sp_emac0_def = {
 	.wol_irq        = 61,  		/* WOL interrupt number */
 	.mdio_idx       = -1,           /* No shared MDIO */
 	.tah_idx	= -1,		/* No TAH */
-	.jumbo		= 1,		/* Jumbo frames supported */
 };
 OCP_SYSFS_EMAC_DATA()
 
diff --git a/include/asm-ppc/ibm_ocp.h b/include/asm-ppc/ibm_ocp.h
index 9c21de1..ddce616 100644
--- a/include/asm-ppc/ibm_ocp.h
+++ b/include/asm-ppc/ibm_ocp.h
@@ -63,7 +63,6 @@ struct ocp_func_emac_data {
 	int	wol_irq;	/* WOL interrupt */
 	int	mdio_idx;	/* EMAC idx of MDIO master or -1 */
 	int	tah_idx;	/* TAH device index or -1 */
-	int	jumbo;		/* Jumbo frames capable flag */
 	int	phy_mode;	/* PHY type or configurable mode */
 	u8	mac_addr[6];	/* EMAC mac address */
 	u32	phy_map;	/* EMAC phy map */
-- 
cgit v1.1


From e13ac219816c58579f40b48220b2fa5d94c30e84 Mon Sep 17 00:00:00 2001
From: Otavio Salvador <otavio@debian.org>
Date: Fri, 6 Jan 2006 00:11:26 -0800
Subject: [PATCH] arch/ppc/kernel/idle.c: don't declare cpu variable in non-SMP
 kernels

Disable declaration of cpu variable in default_idle function when
building non-SMP kernels.

Signed-off-by: Otavio Salvador <otavio@debian.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc/kernel/idle.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/ppc/kernel/idle.c b/arch/ppc/kernel/idle.c
index 821a75e..1be3ca5 100644
--- a/arch/ppc/kernel/idle.c
+++ b/arch/ppc/kernel/idle.c
@@ -37,7 +37,6 @@
 void default_idle(void)
 {
 	void (*powersave)(void);
-	int cpu = smp_processor_id();
 
 	powersave = ppc_md.power_save;
 
@@ -47,7 +46,8 @@ void default_idle(void)
 #ifdef CONFIG_SMP
 		else {
 			set_thread_flag(TIF_POLLING_NRFLAG);
-			while (!need_resched() && !cpu_is_offline(cpu))
+			while (!need_resched() &&
+					!cpu_is_offline(smp_processor_id()))
 				barrier();
 			clear_thread_flag(TIF_POLLING_NRFLAG);
 		}
-- 
cgit v1.1


From c9662b4b37f8f00a212eb4131d1d177b6ed8ddbd Mon Sep 17 00:00:00 2001
From: Arthur Othieno <a.othieno@bluewin.ch>
Date: Fri, 6 Jan 2006 00:11:29 -0800
Subject: [PATCH] macintosh: don't store i2c_add_driver() return if no further
 processing done

therm_pm72.c and windfarm_lm75_sensor.c both store the return from
i2c_add_driver() but do no further processing on the result.  Simply return
what i2c_add_driver() did, instead.

Signed-off-by: Arthur Othieno <a.othieno@bluewin.ch>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/macintosh/therm_pm72.c           | 7 +------
 drivers/macintosh/windfarm_lm75_sensor.c | 7 +------
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/drivers/macintosh/therm_pm72.c b/drivers/macintosh/therm_pm72.c
index 190878e..435427d 100644
--- a/drivers/macintosh/therm_pm72.c
+++ b/drivers/macintosh/therm_pm72.c
@@ -1988,18 +1988,13 @@ static void fcu_lookup_fans(struct device_node *fcu_node)
 
 static int fcu_of_probe(struct of_device* dev, const struct of_device_id *match)
 {
-	int rc;
-
 	state = state_detached;
 
 	/* Lookup the fans in the device tree */
 	fcu_lookup_fans(dev->node);
 
 	/* Add the driver */
-	rc = i2c_add_driver(&therm_pm72_driver);
-	if (rc < 0)
-		return rc;
-	return 0;
+	return i2c_add_driver(&therm_pm72_driver);
 }
 
 static int fcu_of_remove(struct of_device* dev)
diff --git a/drivers/macintosh/windfarm_lm75_sensor.c b/drivers/macintosh/windfarm_lm75_sensor.c
index a0a41ad..c62ed68 100644
--- a/drivers/macintosh/windfarm_lm75_sensor.c
+++ b/drivers/macintosh/windfarm_lm75_sensor.c
@@ -240,12 +240,7 @@ static int wf_lm75_detach(struct i2c_client *client)
 
 static int __init wf_lm75_sensor_init(void)
 {
-	int rc;
-
-	rc = i2c_add_driver(&wf_lm75_driver);
-	if (rc < 0)
-		return rc;
-	return 0;
+	return i2c_add_driver(&wf_lm75_driver);
 }
 
 static void __exit wf_lm75_sensor_exit(void)
-- 
cgit v1.1


From 7558824a8d16e244072bfebc9e5e3e3b1b9af261 Mon Sep 17 00:00:00 2001
From: Sylvain Munaut <tnt@246tNt.com>
Date: Fri, 6 Jan 2006 00:11:30 -0800
Subject: [PATCH] ppc32: Remove useless file arch/ppc/platforms/mpc5200.c

That file is a left-over of the 'old' OCP model that should have been erased
during the change to platform model but I forgot it ...

Signed-off-by: Sylvain Munaut <tnt@246tNt.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc/platforms/mpc5200.c | 53 --------------------------------------------
 1 file changed, 53 deletions(-)
 delete mode 100644 arch/ppc/platforms/mpc5200.c

diff --git a/arch/ppc/platforms/mpc5200.c b/arch/ppc/platforms/mpc5200.c
deleted file mode 100644
index a58db43..0000000
--- a/arch/ppc/platforms/mpc5200.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * arch/ppc/platforms/mpc5200.c
- *
- * OCP Definitions for the boards based on MPC5200 processor. Contains
- * definitions for every common peripherals. (Mostly all but PSCs)
- * 
- * Maintainer : Sylvain Munaut <tnt@246tNt.com>
- *
- * Copyright 2004 Sylvain Munaut <tnt@246tNt.com>
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2. This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
- */
-
-#include <asm/ocp.h>
-#include <asm/mpc52xx.h>
-
-
-static struct ocp_fs_i2c_data mpc5200_i2c_def = {
-        .flags  = FS_I2C_CLOCK_5200,
-};
-
-
-/* Here is the core_ocp struct.
- * With all the devices common to all board. Even if port multiplexing is
- * not setup for them (if the user don't want them, just don't select the
- * config option). The potentially conflicting devices (like PSCs) goes in
- * board specific file.
- */
-struct ocp_def core_ocp[] = {
-	{
-		.vendor         = OCP_VENDOR_FREESCALE,
-		.function       = OCP_FUNC_IIC,
-		.index          = 0,
-		.paddr          = MPC52xx_I2C1,
-		.irq            = OCP_IRQ_NA,   /* MPC52xx_IRQ_I2C1 - Buggy */
-		.pm             = OCP_CPM_NA,
-		.additions      = &mpc5200_i2c_def,
-	},
-	{
-		.vendor         = OCP_VENDOR_FREESCALE,
-		.function       = OCP_FUNC_IIC,
-		.index          = 1,
-		.paddr          = MPC52xx_I2C2,
-		.irq            = OCP_IRQ_NA,   /* MPC52xx_IRQ_I2C2 - Buggy */
-		.pm             = OCP_CPM_NA,
-		.additions      = &mpc5200_i2c_def,
-	},
-	{	/* Terminating entry */
-		.vendor		= OCP_VENDOR_INVALID
-	}
-};
-- 
cgit v1.1


From 2d8179c0b77b54e27321944e16f65defeda81e27 Mon Sep 17 00:00:00 2001
From: Sylvain Munaut <tnt@246tNt.com>
Date: Fri, 6 Jan 2006 00:11:31 -0800
Subject: [PATCH] ppc32/serial: Fix compiler errors with GCC 4.x in
 mpc52xx_uart.c

Signed-off-by: Wolfgang Denk <wd@denx.de>
Signed-off-by: Sylvain Munaut <tnt@246tNt.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/serial/mpc52xx_uart.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/serial/mpc52xx_uart.c b/drivers/serial/mpc52xx_uart.c
index b8727d9..4dcf031 100644
--- a/drivers/serial/mpc52xx_uart.c
+++ b/drivers/serial/mpc52xx_uart.c
@@ -668,7 +668,7 @@ mpc52xx_console_setup(struct console *co, char *options)
 }
 
 
-extern struct uart_driver mpc52xx_uart_driver;
+static struct uart_driver mpc52xx_uart_driver;
 
 static struct console mpc52xx_console = {
 	.name	= "ttyS",
-- 
cgit v1.1


From d62de3aa8ac762c09845aa38634a845da55f31dc Mon Sep 17 00:00:00 2001
From: Sylvain Munaut <tnt@246tNt.com>
Date: Fri, 6 Jan 2006 00:11:32 -0800
Subject: [PATCH] ppc32/serial: Change mpc52xx_uart.c to use the Low Density
 Serial port major

Before this patch we were just using the "classic" /dev/ttySx devices.
However, when another driver that uses those is loaded on the system (like
drivers for serial PCMCIA), that creates a conflict for the minors.  Therefore,
we now use /dev/ttyPSC[0:5] (note the 0-based numbering !) with some minors
we've been assigned in the "Low Density Serial port major".

Signed-off-by: Sylvain Munaut <tnt@246tNt.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/serial/mpc52xx_uart.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/drivers/serial/mpc52xx_uart.c b/drivers/serial/mpc52xx_uart.c
index 4dcf031..1288d62 100644
--- a/drivers/serial/mpc52xx_uart.c
+++ b/drivers/serial/mpc52xx_uart.c
@@ -37,11 +37,11 @@
  * by the bootloader or in the platform init code.
  *
  * The idx field must be equal to the PSC index ( e.g. 0 for PSC1, 1 for PSC2,
- * and so on). So the PSC1 is mapped to /dev/ttyS0, PSC2 to /dev/ttyS1 and so
- * on. But be warned, it's an ABSOLUTE REQUIREMENT ! This is needed mainly for
- * the console code : without this 1:1 mapping, at early boot time, when we are
- * parsing the kernel args console=ttyS?, we wouldn't know wich PSC it will be
- * mapped to.
+ * and so on). So the PSC1 is mapped to /dev/ttyPSC0, PSC2 to /dev/ttyPSC1 and
+ * so on. But be warned, it's an ABSOLUTE REQUIREMENT ! This is needed mainly
+ * fpr the console code : without this 1:1 mapping, at early boot time, when we
+ * are parsing the kernel args console=ttyPSC?, we wouldn't know wich PSC it
+ * will be mapped to.
  */
 
 #include <linux/config.h>
@@ -65,6 +65,10 @@
 #include <linux/serial_core.h>
 
 
+/* We've been assigned a range on the "Low-density serial ports" major */
+#define SERIAL_PSC_MAJOR	204
+#define SERIAL_PSC_MINOR	148
+
 
 #define ISR_PASS_LIMIT 256	/* Max number of iteration in the interrupt */
 
@@ -671,12 +675,12 @@ mpc52xx_console_setup(struct console *co, char *options)
 static struct uart_driver mpc52xx_uart_driver;
 
 static struct console mpc52xx_console = {
-	.name	= "ttyS",
+	.name	= "ttyPSC",
 	.write	= mpc52xx_console_write,
 	.device	= uart_console_device,
 	.setup	= mpc52xx_console_setup,
 	.flags	= CON_PRINTBUFFER,
-	.index	= -1,	/* Specified on the cmdline (e.g. console=ttyS0 ) */
+	.index	= -1,	/* Specified on the cmdline (e.g. console=ttyPSC0 ) */
 	.data	= &mpc52xx_uart_driver,
 };
 
@@ -703,10 +707,10 @@ console_initcall(mpc52xx_console_init);
 static struct uart_driver mpc52xx_uart_driver = {
 	.owner		= THIS_MODULE,
 	.driver_name	= "mpc52xx_psc_uart",
-	.dev_name	= "ttyS",
-	.devfs_name	= "ttyS",
-	.major		= TTY_MAJOR,
-	.minor		= 64,
+	.dev_name	= "ttyPSC",
+	.devfs_name	= "ttyPSC",
+	.major		= SERIAL_PSC_MAJOR,
+	.minor		= SERIAL_PSC_MINOR,
 	.nr		= MPC52xx_PSC_MAXNUM,
 	.cons		= MPC52xx_PSC_CONSOLE,
 };
-- 
cgit v1.1


From 4aa7c80193c561e52c06351e0f521e697954a859 Mon Sep 17 00:00:00 2001
From: Sylvain Munaut <tnt@246tNt.com>
Date: Fri, 6 Jan 2006 00:11:34 -0800
Subject: [PATCH] ppc32: Fix static IO mapping for Freescale MPC52xx

The current iomapping used MBAR_SIZE for the size argument of
io_block_mapping, resulting in a call to setbat with a size argument of 64k
which is invalid.

This patch corrects this and maps the whole 0xf0000000->0xffffffff range so
that devices on the local bus are also included in the BAT mapping.

Thanks to Bernhard Kuhn from Metrowerks for pointing this out.

Signed-off-by: Sylvain Munaut <tnt@246tNt.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc/syslib/mpc52xx_setup.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/ppc/syslib/mpc52xx_setup.c b/arch/ppc/syslib/mpc52xx_setup.c
index bb23745..a4a4b02 100644
--- a/arch/ppc/syslib/mpc52xx_setup.c
+++ b/arch/ppc/syslib/mpc52xx_setup.c
@@ -84,9 +84,11 @@ mpc52xx_set_bat(void)
 void __init
 mpc52xx_map_io(void)
 {
-	/* Here we only map the MBAR */
+	/* Here we map the MBAR and the whole upper zone. MBAR is only
+	   64k but we can't map only 64k with BATs. Map the whole
+	   0xf0000000 range is ok and helps eventual lpb devices placed there */
 	io_block_mapping(
-		MPC52xx_MBAR_VIRT, MPC52xx_MBAR, MPC52xx_MBAR_SIZE, _PAGE_IO);
+		MPC52xx_MBAR_VIRT, MPC52xx_MBAR, 0x10000000, _PAGE_IO);
 }
 
 
-- 
cgit v1.1


From e21b9f2e9a580ce7375ec58953c1bb19aabe0db4 Mon Sep 17 00:00:00 2001
From: Sylvain Munaut <tnt@246tNt.com>
Date: Fri, 6 Jan 2006 00:11:35 -0800
Subject: [PATCH] ppc32: Modify Freescale MPC52xx IRQ mapping to _not_ use irq
 0

AFAIK IRQ number 0 is a perfectly valid IRQ number.  But it seems there are
numerous places where it's considered to be invalid or "no irq" value.  Since
that value is problematic, the IRQ mapping is changed to not use it.

Signed-off-by: Sylvain Munaut <tnt@246tNt.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-ppc/mpc52xx.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/asm-ppc/mpc52xx.h b/include/asm-ppc/mpc52xx.h
index e5f80c2..04d5630 100644
--- a/include/asm-ppc/mpc52xx.h
+++ b/include/asm-ppc/mpc52xx.h
@@ -107,7 +107,7 @@ enum ppc_sys_devices {
 #define MPC52xx_SDMA_IRQ_NUM	17
 #define MPC52xx_PERP_IRQ_NUM	23
 
-#define MPC52xx_CRIT_IRQ_BASE	0
+#define MPC52xx_CRIT_IRQ_BASE	1
 #define MPC52xx_MAIN_IRQ_BASE	(MPC52xx_CRIT_IRQ_BASE + MPC52xx_CRIT_IRQ_NUM)
 #define MPC52xx_SDMA_IRQ_BASE	(MPC52xx_MAIN_IRQ_BASE + MPC52xx_MAIN_IRQ_NUM)
 #define MPC52xx_PERP_IRQ_BASE	(MPC52xx_SDMA_IRQ_BASE + MPC52xx_SDMA_IRQ_NUM)
-- 
cgit v1.1


From dbeb198d9366eb3d3ad64444ceecb5b1d5b5d7ef Mon Sep 17 00:00:00 2001
From: Sylvain Munaut <tnt@246tNt.com>
Date: Fri, 6 Jan 2006 00:11:35 -0800
Subject: [PATCH] ppc32: Remove __init qualifier from mpc52xx pci resources
 fixups

The mpc52xx_pci_fixup_resources is not only called at init but also when there
is a pci hotplug like when a cardbus card is plugged in.  So that function is
needed after init too.

Thanks to Asier Llano Palacios for reporting this.

Signed-off-by: Sylvain Munaut <tnt@246tNt.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc/syslib/mpc52xx_pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/ppc/syslib/mpc52xx_pci.c b/arch/ppc/syslib/mpc52xx_pci.c
index 4ac1908..e6cb3e2 100644
--- a/arch/ppc/syslib/mpc52xx_pci.c
+++ b/arch/ppc/syslib/mpc52xx_pci.c
@@ -151,7 +151,7 @@ mpc52xx_pci_setup(struct mpc52xx_pci __iomem *pci_regs)
 #endif
 }
 
-static void __init
+static void
 mpc52xx_pci_fixup_resources(struct pci_dev *dev)
 {
 	int i;
-- 
cgit v1.1


From db674ed450f113518285f410c93abecd93e71a2f Mon Sep 17 00:00:00 2001
From: Sylvain Munaut <tnt@246tNt.com>
Date: Fri, 6 Jan 2006 00:11:36 -0800
Subject: [PATCH] ppc32: Fix MPC52xx configuration space access

This patch takes care of an erratum of the MPC5200 by avoiding 32-bit accesses
in type 1 configuration accesses.  All other accesses are still 32 bits wide.
It also adds some mb() calls, since the simple out_be(...) calls are not
sufficient in this case.

Signed-off-by: Sylvain Munaut <tnt@246tNt.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc/syslib/mpc52xx_pci.c | 83 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 69 insertions(+), 14 deletions(-)

diff --git a/arch/ppc/syslib/mpc52xx_pci.c b/arch/ppc/syslib/mpc52xx_pci.c
index e6cb3e2..2c5e6dd 100644
--- a/arch/ppc/syslib/mpc52xx_pci.c
+++ b/arch/ppc/syslib/mpc52xx_pci.c
@@ -24,6 +24,12 @@
 #include <asm/machdep.h>
 
 
+/* This macro is defined to activate the workaround for the bug
+   435 of the MPC5200 (L25R). With it activated, we don't do any
+   32 bits configuration access during type-1 cycles */
+#define MPC5200_BUG_435_WORKAROUND
+
+
 static int
 mpc52xx_pci_read_config(struct pci_bus *bus, unsigned int devfn,
 				int offset, int len, u32 *val)
@@ -40,17 +46,39 @@ mpc52xx_pci_read_config(struct pci_bus *bus, unsigned int devfn,
 		((bus->number - hose->bus_offset) << 16) |
 		(devfn << 8) |
 		(offset & 0xfc));
+	mb();
+
+#ifdef MPC5200_BUG_435_WORKAROUND
+	if (bus->number != hose->bus_offset) {
+		switch (len) {
+			case 1:
+				value = in_8(((u8 __iomem *)hose->cfg_data) + (offset & 3));
+				break;
+			case 2:
+				value = in_le16(((u16 __iomem *)hose->cfg_data) + ((offset>>1) & 1));
+				break;
+
+			default:
+				value = in_le16((u16 __iomem *)hose->cfg_data) |
+					(in_le16(((u16 __iomem *)hose->cfg_data) + 1) << 16);
+				break;
+		}
+	}
+	else
+#endif
+	{
+		value = in_le32(hose->cfg_data);
 
-	value = in_le32(hose->cfg_data);
-
-	if (len != 4) {
-		value >>= ((offset & 0x3) << 3);
-		value &= 0xffffffff >> (32 - (len << 3));
+		if (len != 4) {
+			value >>= ((offset & 0x3) << 3);
+			value &= 0xffffffff >> (32 - (len << 3));
+		}
 	}
 
 	*val = value;
 
 	out_be32(hose->cfg_addr, 0);
+	mb();
 
 	return PCIBIOS_SUCCESSFUL;
 }
@@ -71,21 +99,48 @@ mpc52xx_pci_write_config(struct pci_bus *bus, unsigned int devfn,
 		((bus->number - hose->bus_offset) << 16) |
 		(devfn << 8) |
 		(offset & 0xfc));
+	mb();
+
+#ifdef MPC5200_BUG_435_WORKAROUND
+	if (bus->number != hose->bus_offset) {
+		switch (len) {
+			case 1:
+				out_8(((u8 __iomem *)hose->cfg_data) +
+					(offset & 3), val);
+				break;
+			case 2:
+				out_le16(((u16 __iomem *)hose->cfg_data) +
+					((offset>>1) & 1), val);
+				break;
+
+			default:
+				out_le16((u16 __iomem *)hose->cfg_data,
+					(u16)val);
+				out_le16(((u16 __iomem *)hose->cfg_data) + 1,
+					(u16)(val>>16));
+				break;
+		}
+	}
+	else
+#endif
+	{
+		if (len != 4) {
+			value = in_le32(hose->cfg_data);
 
-	if (len != 4) {
-		value = in_le32(hose->cfg_data);
+			offset = (offset & 0x3) << 3;
+			mask = (0xffffffff >> (32 - (len << 3)));
+			mask <<= offset;
 
-		offset = (offset & 0x3) << 3;
-		mask = (0xffffffff >> (32 - (len << 3)));
-		mask <<= offset;
+			value &= ~mask;
+			val = value | ((val << offset) & mask);
+		}
 
-		value &= ~mask;
-		val = value | ((val << offset) & mask);
+		out_le32(hose->cfg_data, val);
 	}
-
-	out_le32(hose->cfg_data, val);
+	mb();
 
 	out_be32(hose->cfg_addr, 0);
+	mb();
 
 	return PCIBIOS_SUCCESSFUL;
 }
-- 
cgit v1.1


From 041cb6241fa97c4881dd19d79f783b2e077acd28 Mon Sep 17 00:00:00 2001
From: Sylvain Munaut <tnt@246tNt.com>
Date: Fri, 6 Jan 2006 00:11:37 -0800
Subject: [PATCH] ppc32: Fix MPC52xx PCI init in case the bootloader didn't do
 it

We were counting on the bootloader to init some stuff, like get the bus out of
reset and enable accesses.

Signed-off-by: Sylvain Munaut <tnt@246tNt.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc/syslib/mpc52xx_pci.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/ppc/syslib/mpc52xx_pci.c b/arch/ppc/syslib/mpc52xx_pci.c
index 2c5e6dd..313c96e 100644
--- a/arch/ppc/syslib/mpc52xx_pci.c
+++ b/arch/ppc/syslib/mpc52xx_pci.c
@@ -154,9 +154,12 @@ static struct pci_ops mpc52xx_pci_ops = {
 static void __init
 mpc52xx_pci_setup(struct mpc52xx_pci __iomem *pci_regs)
 {
+	u32 tmp;
 
 	/* Setup control regs */
-		/* Nothing to do afaik */
+	tmp = in_be32(&pci_regs->scr);
+	tmp |= PCI_COMMAND_MASTER | PCI_COMMAND_MEMORY;
+	out_be32(&pci_regs->scr, tmp);
 
 	/* Setup windows */
 	out_be32(&pci_regs->iw0btar, MPC52xx_PCI_IWBTAR_TRANSLATION(
@@ -197,13 +200,12 @@ mpc52xx_pci_setup(struct mpc52xx_pci __iomem *pci_regs)
 	/* Not necessary and can be a bad thing if for example the bootloader
 	   is displaying a splash screen or ... Just left here for
 	   documentation purpose if anyone need it */
-#if 0
-	u32 tmp;
 	tmp = in_be32(&pci_regs->gscr);
+#if 0
 	out_be32(&pci_regs->gscr, tmp | MPC52xx_PCI_GSCR_PR);
 	udelay(50);
-	out_be32(&pci_regs->gscr, tmp);
 #endif
+	out_be32(&pci_regs->gscr, tmp & ~MPC52xx_PCI_GSCR_PR);
 }
 
 static void
-- 
cgit v1.1


From f80257a25d9f73a0e6e377c7d6bf29b8938c042d Mon Sep 17 00:00:00 2001
From: Sylvain Munaut <tnt@246tNt.com>
Date: Fri, 6 Jan 2006 00:11:38 -0800
Subject: [PATCH] ppc32: Allows compilation of a MPC52xx kernel without PCI

Some custom cards might not need PCI; without this patch, compilation fails.

Signed-off-by: Roger Blofeld <blofeldus@yahoo.com>
Signed-off-by: Sylvain Munaut <tnt@246tNt.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc/platforms/lite5200.c |  2 ++
 include/asm-ppc/io.h          |  2 ++
 include/asm-ppc/mpc52xx.h     | 11 +++++++++++
 3 files changed, 15 insertions(+)

diff --git a/arch/ppc/platforms/lite5200.c b/arch/ppc/platforms/lite5200.c
index d44cc99..7ed52dc 100644
--- a/arch/ppc/platforms/lite5200.c
+++ b/arch/ppc/platforms/lite5200.c
@@ -196,8 +196,10 @@ platform_init(unsigned long r3, unsigned long r4, unsigned long r5,
 	mpc52xx_set_bat();
 
 	/* No ISA bus by default */
+#ifdef CONFIG_PCI
 	isa_io_base		= 0;
 	isa_mem_base		= 0;
+#endif
 
 	/* Powersave */
 	/* This is provided as an example on how to do it. But you
diff --git a/include/asm-ppc/io.h b/include/asm-ppc/io.h
index 84ac6e2..df9cf6e 100644
--- a/include/asm-ppc/io.h
+++ b/include/asm-ppc/io.h
@@ -27,6 +27,8 @@
 
 #if defined(CONFIG_4xx)
 #include <asm/ibm4xx.h>
+#elif defined(CONFIG_PPC_MPC52xx)
+#include <asm/mpc52xx.h>
 #elif defined(CONFIG_8xx)
 #include <asm/mpc8xx.h>
 #elif defined(CONFIG_8260)
diff --git a/include/asm-ppc/mpc52xx.h b/include/asm-ppc/mpc52xx.h
index 04d5630..a055e07 100644
--- a/include/asm-ppc/mpc52xx.h
+++ b/include/asm-ppc/mpc52xx.h
@@ -29,6 +29,17 @@ struct pt_regs;
 #endif /* __ASSEMBLY__ */
 
 
+#ifdef CONFIG_PCI
+#define _IO_BASE	isa_io_base
+#define _ISA_MEM_BASE	isa_mem_base
+#define PCI_DRAM_OFFSET	pci_dram_offset
+#else
+#define _IO_BASE	0
+#define _ISA_MEM_BASE	0
+#define PCI_DRAM_OFFSET	0
+#endif
+
+
 /* ======================================================================== */
 /* PPC Sys devices definition                                               */
 /* ======================================================================== */
-- 
cgit v1.1


From 683e2cc6dc5aa9bb4ba2f2e0662df9d7f0a1d6c2 Mon Sep 17 00:00:00 2001
From: Peter Korsgaard <jacmet@sunsite.dk>
Date: Fri, 6 Jan 2006 00:11:39 -0800
Subject: [PATCH] ppc32: Re-add embed_config.c to ml300/ep405

Commit 3e9e7c1d0b7a36fb8affb973a054c5098e27baa8 (ppc32: cleanup AMCC PPC40x
eval boards to support U-Boot) broke the kernel for ML300 / EP405.

It still compiles as there's a weak definition of the function in
misc-embedded.c, but the kernel crashes as the bd_t fixup isn't performed.

Signed-off-by: Peter Korsgaard <jacmet@sunsite.dk>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc/boot/simple/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/ppc/boot/simple/Makefile b/arch/ppc/boot/simple/Makefile
index f3e9c53..9533f8d 100644
--- a/arch/ppc/boot/simple/Makefile
+++ b/arch/ppc/boot/simple/Makefile
@@ -190,6 +190,8 @@ boot-$(CONFIG_REDWOOD_5)	+= embed_config.o
 boot-$(CONFIG_REDWOOD_6)	+= embed_config.o
 boot-$(CONFIG_8xx)		+= embed_config.o
 boot-$(CONFIG_8260)		+= embed_config.o
+boot-$(CONFIG_EP405)		+= embed_config.o
+boot-$(CONFIG_XILINX_ML300)	+= embed_config.o
 boot-$(CONFIG_BSEIP)		+= iic.o
 boot-$(CONFIG_MBX)		+= iic.o pci.o qspan_pci.o
 boot-$(CONFIG_MV64X60)		+= misc-mv64x60.o
-- 
cgit v1.1


From 9f6d4b0c21a6894dad7665d3dda4174c7c120784 Mon Sep 17 00:00:00 2001
From: Ben Collins <bcollins@ubuntu.com>
Date: Fri, 6 Jan 2006 00:11:40 -0800
Subject: [PATCH] therm_adt746x: Quiet fan speed change messages

Only output the messages about fan speed changes with a verbose=1 module
param.

Signed-off-by: Fabio M. Di Nitto <fabbione@ubuntu.com>
Signed-off-by: Ben Collins <bcollins@ubuntu.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/macintosh/therm_adt746x.c | 39 ++++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/drivers/macintosh/therm_adt746x.c b/drivers/macintosh/therm_adt746x.c
index f386966..5e1f5e9 100644
--- a/drivers/macintosh/therm_adt746x.c
+++ b/drivers/macintosh/therm_adt746x.c
@@ -52,6 +52,7 @@ static char *sensor_location[3] = {NULL, NULL, NULL};
 
 static int limit_adjust = 0;
 static int fan_speed = -1;
+static int verbose = 0;
 
 MODULE_AUTHOR("Colin Leroy <colin@colino.net>");
 MODULE_DESCRIPTION("Driver for ADT746x thermostat in iBook G4 and "
@@ -66,6 +67,10 @@ module_param(fan_speed, int, 0644);
 MODULE_PARM_DESC(fan_speed,"Specify starting fan speed (0-255) "
 		 "(default 64)");
 
+module_param(verbose, bool, 0);
+MODULE_PARM_DESC(verbose,"Verbose log operations "
+		 "(default 0)");
+
 struct thermostat {
 	struct i2c_client	clt;
 	u8			temps[3];
@@ -149,13 +154,13 @@ detach_thermostat(struct i2c_adapter *adapter)
 	if (thread_therm != NULL) {
 		kthread_stop(thread_therm);
 	}
-		
+
 	printk(KERN_INFO "adt746x: Putting max temperatures back from "
 			 "%d, %d, %d to %d, %d, %d\n",
 		th->limits[0], th->limits[1], th->limits[2],
 		th->initial_limits[0], th->initial_limits[1],
 		th->initial_limits[2]);
-	
+
 	for (i = 0; i < 3; i++)
 		write_reg(th, LIMIT_REG[i], th->initial_limits[i]);
 
@@ -212,12 +217,14 @@ static void write_fan_speed(struct thermostat *th, int speed, int fan)
 		return;
 	
 	if (th->last_speed[fan] != speed) {
-		if (speed == -1)
-			printk(KERN_DEBUG "adt746x: Setting speed to automatic "
-				"for %s fan.\n", sensor_location[fan+1]);
-		else
-			printk(KERN_DEBUG "adt746x: Setting speed to %d "
-				"for %s fan.\n", speed, sensor_location[fan+1]);
+		if (verbose) {
+			if (speed == -1)
+				printk(KERN_DEBUG "adt746x: Setting speed to automatic "
+					"for %s fan.\n", sensor_location[fan+1]);
+			else
+				printk(KERN_DEBUG "adt746x: Setting speed to %d "
+					"for %s fan.\n", speed, sensor_location[fan+1]);
+		}
 	} else
 		return;
 	
@@ -298,10 +305,11 @@ static void update_fans_speed (struct thermostat *th)
 			if (new_speed > 255)
 				new_speed = 255;
 
-			printk(KERN_DEBUG "adt746x: setting fans speed to %d "
-					 "(limit exceeded by %d on %s) \n",
-					new_speed, var,
-					sensor_location[fan_number+1]);
+			if (verbose)
+				printk(KERN_DEBUG "adt746x: Setting fans speed to %d "
+						 "(limit exceeded by %d on %s) \n",
+						new_speed, var,
+						sensor_location[fan_number+1]);
 			write_both_fan_speed(th, new_speed);
 			th->last_var[fan_number] = var;
 		} else if (var < -2) {
@@ -309,8 +317,9 @@ static void update_fans_speed (struct thermostat *th)
 			 * so cold (lastvar >= -1) */
 			if (i == 2 && lastvar < -1) {
 				if (th->last_speed[fan_number] != 0)
-					printk(KERN_DEBUG "adt746x: Stopping "
-						"fans.\n");
+					if (verbose)
+						printk(KERN_DEBUG "adt746x: Stopping "
+							"fans.\n");
 				write_both_fan_speed(th, 0);
 			}
 		}
@@ -406,7 +415,7 @@ static int attach_one_thermostat(struct i2c_adapter *adapter, int addr,
 		th->initial_limits[i] = read_reg(th, LIMIT_REG[i]);
 		set_limit(th, i);
 	}
-	
+
 	printk(KERN_INFO "adt746x: Lowering max temperatures from %d, %d, %d"
 			 " to %d, %d, %d\n",
 			 th->initial_limits[0], th->initial_limits[1],
-- 
cgit v1.1


From 642fb4d1f1dd2417aa69189fe5ceb81e4fb72900 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 6 Jan 2006 00:11:41 -0800
Subject: [PATCH] NOMMU: Provide shared-writable mmap support on ramfs

The attached patch makes ramfs support shared-writable mmaps by:

 (1) Attempting to perform a contiguous block allocation to the requested size
     when truncate attempts to increase the file from zero size, such as
     happens when:

	fd = shm_open("/file/on/ramfs", ...);
	ftruncate(fd, size_requested);
	addr = mmap(NULL, subsize, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_SHARED,
		    fd, offset);

 (2) Permitting any shared-writable mapping over any contiguous set of extant
     pages. get_unmapped_area() will return the address into the actual ramfs
     pages. The mapping may start anywhere and be of any size, but may not go
     over the end of file. Multiple mappings may overlap in any way.

 (3) Not permitting a file to be shrunk if it would truncate any shared
     mappings (private mappings are copied).

Thus this patch provides support for POSIX shared memory on NOMMU kernels,
with certain limitations such as there being a large enough block of pages
available to support the allocation and it only working on directly mappable
filesystems.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ramfs/Makefile     |   4 +-
 fs/ramfs/file-mmu.c   |  57 ++++++++++
 fs/ramfs/file-nommu.c | 292 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ramfs/inode.c      |  22 +---
 fs/ramfs/internal.h   |  15 +++
 include/linux/ramfs.h |  10 ++
 6 files changed, 378 insertions(+), 22 deletions(-)
 create mode 100644 fs/ramfs/file-mmu.c
 create mode 100644 fs/ramfs/file-nommu.c
 create mode 100644 fs/ramfs/internal.h

diff --git a/fs/ramfs/Makefile b/fs/ramfs/Makefile
index f096f30..5a0236e 100644
--- a/fs/ramfs/Makefile
+++ b/fs/ramfs/Makefile
@@ -4,4 +4,6 @@
 
 obj-$(CONFIG_RAMFS) += ramfs.o
 
-ramfs-objs := inode.o
+file-mmu-y := file-nommu.o
+file-mmu-$(CONFIG_MMU) := file-mmu.o
+ramfs-objs += inode.o $(file-mmu-y)
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
new file mode 100644
index 0000000..2115383
--- /dev/null
+++ b/fs/ramfs/file-mmu.c
@@ -0,0 +1,57 @@
+/* file-mmu.c: ramfs MMU-based file operations
+ *
+ * Resizable simple ram filesystem for Linux.
+ *
+ * Copyright (C) 2000 Linus Torvalds.
+ *               2000 Transmeta Corp.
+ *
+ * Usage limits added by David Gibson, Linuxcare Australia.
+ * This file is released under the GPL.
+ */
+
+/*
+ * NOTE! This filesystem is probably most useful
+ * not as a real filesystem, but as an example of
+ * how virtual filesystems can be written.
+ *
+ * It doesn't get much simpler than this. Consider
+ * that this file implements the full semantics of
+ * a POSIX-compliant read-write filesystem.
+ *
+ * Note in particular how the filesystem does not
+ * need to implement any data structures of its own
+ * to keep track of the virtual data: using the VFS
+ * caches is sufficient.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/ramfs.h>
+
+#include <asm/uaccess.h>
+#include "internal.h"
+
+struct address_space_operations ramfs_aops = {
+	.readpage	= simple_readpage,
+	.prepare_write	= simple_prepare_write,
+	.commit_write	= simple_commit_write
+};
+
+struct file_operations ramfs_file_operations = {
+	.read		= generic_file_read,
+	.write		= generic_file_write,
+	.mmap		= generic_file_mmap,
+	.fsync		= simple_sync_file,
+	.sendfile	= generic_file_sendfile,
+	.llseek		= generic_file_llseek,
+};
+
+struct inode_operations ramfs_file_inode_operations = {
+	.getattr	= simple_getattr,
+};
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
new file mode 100644
index 0000000..3f810ac
--- /dev/null
+++ b/fs/ramfs/file-nommu.c
@@ -0,0 +1,292 @@
+/* file-nommu.c: no-MMU version of ramfs
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/ramfs.h>
+#include <linux/quotaops.h>
+#include <linux/pagevec.h>
+#include <linux/mman.h>
+
+#include <asm/uaccess.h>
+#include "internal.h"
+
+static int ramfs_nommu_setattr(struct dentry *, struct iattr *);
+
+struct address_space_operations ramfs_aops = {
+	.readpage		= simple_readpage,
+	.prepare_write		= simple_prepare_write,
+	.commit_write		= simple_commit_write
+};
+
+struct file_operations ramfs_file_operations = {
+	.mmap			= ramfs_nommu_mmap,
+	.get_unmapped_area	= ramfs_nommu_get_unmapped_area,
+	.read			= generic_file_read,
+	.write			= generic_file_write,
+	.fsync			= simple_sync_file,
+	.sendfile		= generic_file_sendfile,
+	.llseek			= generic_file_llseek,
+};
+
+struct inode_operations ramfs_file_inode_operations = {
+	.setattr		= ramfs_nommu_setattr,
+	.getattr		= simple_getattr,
+};
+
+/*****************************************************************************/
+/*
+ * add a contiguous set of pages into a ramfs inode when it's truncated from
+ * size 0 on the assumption that it's going to be used for an mmap of shared
+ * memory
+ */
+static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
+{
+	struct pagevec lru_pvec;
+	unsigned long npages, xpages, loop, limit;
+	struct page *pages;
+	unsigned order;
+	void *data;
+	int ret;
+
+	/* make various checks */
+	order = get_order(newsize);
+	if (unlikely(order >= MAX_ORDER))
+		goto too_big;
+
+	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+	if (limit != RLIM_INFINITY && newsize > limit)
+		goto fsize_exceeded;
+
+	if (newsize > inode->i_sb->s_maxbytes)
+		goto too_big;
+
+	i_size_write(inode, newsize);
+
+	/* allocate enough contiguous pages to be able to satisfy the
+	 * request */
+	pages = alloc_pages(mapping_gfp_mask(inode->i_mapping), order);
+	if (!pages)
+		return -ENOMEM;
+
+	/* split the high-order page into an array of single pages */
+	xpages = 1UL << order;
+	npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	for (loop = 0; loop < npages; loop++)
+		set_page_count(pages + loop, 1);
+
+	/* trim off any pages we don't actually require */
+	for (loop = npages; loop < xpages; loop++)
+		__free_page(pages + loop);
+
+	/* clear the memory we allocated */
+	newsize = PAGE_SIZE * npages;
+	data = page_address(pages);
+	memset(data, 0, newsize);
+
+	/* attach all the pages to the inode's address space */
+	pagevec_init(&lru_pvec, 0);
+	for (loop = 0; loop < npages; loop++) {
+		struct page *page = pages + loop;
+
+		ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL);
+		if (ret < 0)
+			goto add_error;
+
+		if (!pagevec_add(&lru_pvec, page))
+			__pagevec_lru_add(&lru_pvec);
+
+		unlock_page(page);
+	}
+
+	pagevec_lru_add(&lru_pvec);
+	return 0;
+
+ fsize_exceeded:
+	send_sig(SIGXFSZ, current, 0);
+ too_big:
+	return -EFBIG;
+
+ add_error:
+	page_cache_release(pages + loop);
+	for (loop++; loop < npages; loop++)
+		__free_page(pages + loop);
+	return ret;
+}
+
+/*****************************************************************************/
+/*
+ * check that file shrinkage doesn't leave any VMAs dangling in midair
+ */
+static int ramfs_nommu_check_mappings(struct inode *inode,
+				      size_t newsize, size_t size)
+{
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+
+	/* search for VMAs that fall within the dead zone */
+	vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
+			      newsize >> PAGE_SHIFT,
+			      (size + PAGE_SIZE - 1) >> PAGE_SHIFT
+			      ) {
+		/* found one - only interested if it's shared out of the page
+		 * cache */
+		if (vma->vm_flags & VM_SHARED)
+			return -ETXTBSY; /* not quite true, but near enough */
+	}
+
+	return 0;
+}
+
+/*****************************************************************************/
+/*
+ *
+ */
+static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
+{
+	int ret;
+
+	/* assume a truncate from zero size is going to be for the purposes of
+	 * shared mmap */
+	if (size == 0) {
+		if (unlikely(newsize >> 32))
+			return -EFBIG;
+
+		return ramfs_nommu_expand_for_mapping(inode, newsize);
+	}
+
+	/* check that a decrease in size doesn't cut off any shared mappings */
+	if (newsize < size) {
+		ret = ramfs_nommu_check_mappings(inode, newsize, size);
+		if (ret < 0)
+			return ret;
+	}
+
+	ret = vmtruncate(inode, size);
+
+	return ret;
+}
+
+/*****************************************************************************/
+/*
+ * handle a change of attributes
+ * - we're specifically interested in a change of size
+ */
+static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
+{
+	struct inode *inode = dentry->d_inode;
+	unsigned int old_ia_valid = ia->ia_valid;
+	int ret = 0;
+
+	/* by providing our own setattr() method, we skip this quotaism */
+	if ((old_ia_valid & ATTR_UID && ia->ia_uid != inode->i_uid) ||
+	    (old_ia_valid & ATTR_GID && ia->ia_gid != inode->i_gid))
+		ret = DQUOT_TRANSFER(inode, ia) ? -EDQUOT : 0;
+
+	/* pick out size-changing events */
+	if (ia->ia_valid & ATTR_SIZE) {
+		loff_t size = i_size_read(inode);
+		if (ia->ia_size != size) {
+			ret = ramfs_nommu_resize(inode, ia->ia_size, size);
+			if (ret < 0 || ia->ia_valid == ATTR_SIZE)
+				goto out;
+		} else {
+			/* we skipped the truncate but must still update
+			 * timestamps
+			 */
+			ia->ia_valid |= ATTR_MTIME|ATTR_CTIME;
+		}
+	}
+
+	ret = inode_setattr(inode, ia);
+ out:
+	ia->ia_valid = old_ia_valid;
+	return ret;
+}
+
+/*****************************************************************************/
+/*
+ * try to determine where a shared mapping can be made
+ * - we require that:
+ *   - the pages to be mapped must exist
+ *   - the pages be physically contiguous in sequence
+ */
+unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
+					    unsigned long addr, unsigned long len,
+					    unsigned long pgoff, unsigned long flags)
+{
+	unsigned long maxpages, lpages, nr, loop, ret;
+	struct inode *inode = file->f_dentry->d_inode;
+	struct page **pages = NULL, **ptr, *page;
+	loff_t isize;
+
+	if (!(flags & MAP_SHARED))
+		return addr;
+
+	/* the mapping mustn't extend beyond the EOF */
+	lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	isize = i_size_read(inode);
+
+	ret = -EINVAL;
+	maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (pgoff >= maxpages)
+		goto out;
+
+	if (maxpages - pgoff < lpages)
+		goto out;
+
+	/* gang-find the pages */
+	ret = -ENOMEM;
+	pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		goto out;
+
+	nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages);
+	if (nr != lpages)
+		goto out; /* leave if some pages were missing */
+
+	/* check the pages for physical adjacency */
+	ptr = pages;
+	page = *ptr++;
+	page++;
+	for (loop = lpages; loop > 1; loop--)
+		if (*ptr++ != page++)
+			goto out;
+
+	/* okay - all conditions fulfilled */
+	ret = (unsigned long) page_address(pages[0]);
+
+ out:
+	if (pages) {
+		ptr = pages;
+		for (loop = lpages; loop > 0; loop--)
+			put_page(*ptr++);
+		kfree(pages);
+	}
+
+	return ret;
+}
+
+/*****************************************************************************/
+/*
+ * set up a mapping
+ */
+int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	return 0;
+}
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 0a88917..c66bd5e 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -34,13 +34,12 @@
 #include <linux/ramfs.h>
 
 #include <asm/uaccess.h>
+#include "internal.h"
 
 /* some random number */
 #define RAMFS_MAGIC	0x858458f6
 
 static struct super_operations ramfs_ops;
-static struct address_space_operations ramfs_aops;
-static struct inode_operations ramfs_file_inode_operations;
 static struct inode_operations ramfs_dir_inode_operations;
 
 static struct backing_dev_info ramfs_backing_dev_info = {
@@ -142,25 +141,6 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
 	return error;
 }
 
-static struct address_space_operations ramfs_aops = {
-	.readpage	= simple_readpage,
-	.prepare_write	= simple_prepare_write,
-	.commit_write	= simple_commit_write
-};
-
-struct file_operations ramfs_file_operations = {
-	.read		= generic_file_read,
-	.write		= generic_file_write,
-	.mmap		= generic_file_mmap,
-	.fsync		= simple_sync_file,
-	.sendfile	= generic_file_sendfile,
-	.llseek		= generic_file_llseek,
-};
-
-static struct inode_operations ramfs_file_inode_operations = {
-	.getattr	= simple_getattr,
-};
-
 static struct inode_operations ramfs_dir_inode_operations = {
 	.create		= ramfs_create,
 	.lookup		= simple_lookup,
diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h
new file mode 100644
index 0000000..272c8a7
--- /dev/null
+++ b/fs/ramfs/internal.h
@@ -0,0 +1,15 @@
+/* internal.h: ramfs internal definitions
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+extern struct address_space_operations ramfs_aops;
+extern struct file_operations ramfs_file_operations;
+extern struct inode_operations ramfs_file_inode_operations;
diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h
index e0a4faa..953b6df 100644
--- a/include/linux/ramfs.h
+++ b/include/linux/ramfs.h
@@ -5,6 +5,16 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev);
 struct super_block *ramfs_get_sb(struct file_system_type *fs_type,
 	 int flags, const char *dev_name, void *data);
 
+#ifndef CONFIG_MMU
+extern unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
+						   unsigned long addr,
+						   unsigned long len,
+						   unsigned long pgoff,
+						   unsigned long flags);
+
+extern int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
+#endif
+
 extern struct file_operations ramfs_file_operations;
 extern struct vm_operations_struct generic_file_vm_ops;
 
-- 
cgit v1.1


From b0e15190ead07056ab0c3844a499ff35e66d27cc Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 6 Jan 2006 00:11:42 -0800
Subject: [PATCH] NOMMU: Make SYSV IPC SHM use ramfs facilities on NOMMU

The attached patch makes the SYSV IPC shared memory facilities use the new
ramfs facilities on a no-MMU kernel.

The following changes are made:

 (1) There are now shmem_mmap() and shmem_get_unmapped_area() functions to
     allow the IPC SHM facilities to commune with the tiny-shmem and shmem
     code.

 (2) ramfs files now need resizing using do_truncate() rather than by modifying
     the inode size directly (see shmem_file_setup()). This causes ramfs to
     attempt to bind a block of pages of sufficient size to the inode.

 (3) CONFIG_SYSVIPC is no longer contingent on CONFIG_MMU.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h |  9 +++++++++
 init/Kconfig       |  1 -
 ipc/shm.c          | 18 +++++++++++++-----
 mm/nommu.c         |  7 +++++++
 mm/shmem.c         |  2 +-
 mm/tiny-shmem.c    | 29 ++++++++++++++++++++++++++++-
 6 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 75ec04e..26f3094 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -654,9 +654,18 @@ static inline struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
 }
 #endif
 struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags);
+extern int shmem_mmap(struct file *file, struct vm_area_struct *vma);
 
 int shmem_zero_setup(struct vm_area_struct *);
 
+#ifndef CONFIG_MMU
+extern unsigned long shmem_get_unmapped_area(struct file *file,
+					     unsigned long addr,
+					     unsigned long len,
+					     unsigned long pgoff,
+					     unsigned long flags);
+#endif
+
 static inline int can_do_mlock(void)
 {
 	if (capable(CAP_IPC_LOCK))
diff --git a/init/Kconfig b/init/Kconfig
index ce737e0..24e0f7c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -105,7 +105,6 @@ config SWAP
 
 config SYSVIPC
 	bool "System V IPC"
-	depends on MMU
 	---help---
 	  Inter Process Communication is a suite of library functions and
 	  system calls which let processes (running programs) synchronize and
diff --git a/ipc/shm.c b/ipc/shm.c
index 587d836..0ef4a1c 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -157,14 +157,22 @@ static void shm_close (struct vm_area_struct *shmd)
 
 static int shm_mmap(struct file * file, struct vm_area_struct * vma)
 {
-	file_accessed(file);
-	vma->vm_ops = &shm_vm_ops;
-	shm_inc(file->f_dentry->d_inode->i_ino);
-	return 0;
+	int ret;
+
+	ret = shmem_mmap(file, vma);
+	if (ret == 0) {
+		vma->vm_ops = &shm_vm_ops;
+		shm_inc(file->f_dentry->d_inode->i_ino);
+	}
+
+	return ret;
 }
 
 static struct file_operations shm_file_operations = {
-	.mmap	= shm_mmap
+	.mmap	= shm_mmap,
+#ifndef CONFIG_MMU
+	.get_unmapped_area = shmem_get_unmapped_area,
+#endif
 };
 
 static struct vm_operations_struct shm_vm_ops = {
diff --git a/mm/nommu.c b/mm/nommu.c
index c119681..c10262d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1177,3 +1177,10 @@ int in_gate_area_no_task(unsigned long addr)
 {
 	return 0;
 }
+
+struct page *filemap_nopage(struct vm_area_struct *area,
+			unsigned long address, int *type)
+{
+	BUG();
+	return NULL;
+}
diff --git a/mm/shmem.c b/mm/shmem.c
index 65c148e..a1f2f02 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1270,7 +1270,7 @@ out_nomem:
 	return retval;
 }
 
-static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	file_accessed(file);
 	vma->vm_ops = &shmem_vm_ops;
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index b58abcf..cdc6d43 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -81,13 +81,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
 		goto close_file;
 
 	d_instantiate(dentry, inode);
-	inode->i_size = size;
 	inode->i_nlink = 0;	/* It is unlinked */
+
 	file->f_vfsmnt = mntget(shm_mnt);
 	file->f_dentry = dentry;
 	file->f_mapping = inode->i_mapping;
 	file->f_op = &ramfs_file_operations;
 	file->f_mode = FMODE_WRITE | FMODE_READ;
+
+	/* notify everyone as to the change of file size */
+	error = do_truncate(dentry, size, file);
+	if (error < 0)
+		goto close_file;
+
 	return file;
 
 close_file:
@@ -123,3 +129,24 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
 {
 	return 0;
 }
+
+int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	file_accessed(file);
+#ifndef CONFIG_MMU
+	return ramfs_nommu_mmap(file, vma);
+#else
+	return 0;
+#endif
+}
+
+#ifndef CONFIG_MMU
+unsigned long shmem_get_unmapped_area(struct file *file,
+				      unsigned long addr,
+				      unsigned long len,
+				      unsigned long pgoff,
+				      unsigned long flags)
+{
+	return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
+}
+#endif
-- 
cgit v1.1


From 5c40f7f373889930d176a515ec375b60a70b5b49 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 6 Jan 2006 00:11:43 -0800
Subject: [PATCH] FRV: Implement futex operations for FRV

The attached patch implements futex operations for the FRV architecture. The
operations are applicable to both MMU and no-MMU modes; though the EFAULT
handling will be a little bit of wasted space on the latter.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/frv/kernel/Makefile |   1 +
 arch/frv/kernel/futex.c  | 242 +++++++++++++++++++++++++++++++++++++++++++++++
 include/asm-frv/futex.h  |  42 +-------
 3 files changed, 244 insertions(+), 41 deletions(-)
 create mode 100644 arch/frv/kernel/futex.c

diff --git a/arch/frv/kernel/Makefile b/arch/frv/kernel/Makefile
index 981c2c7..422f30e 100644
--- a/arch/frv/kernel/Makefile
+++ b/arch/frv/kernel/Makefile
@@ -20,3 +20,4 @@ obj-$(CONFIG_FUJITSU_MB93493)	+= irq-mb93493.o
 obj-$(CONFIG_PM)		+= pm.o cmode.o
 obj-$(CONFIG_MB93093_PDK)	+= pm-mb93093.o
 obj-$(CONFIG_SYSCTL)		+= sysctl.o
+obj-$(CONFIG_FUTEX)		+= futex.o
diff --git a/arch/frv/kernel/futex.c b/arch/frv/kernel/futex.c
new file mode 100644
index 0000000..eae874a
--- /dev/null
+++ b/arch/frv/kernel/futex.c
@@ -0,0 +1,242 @@
+/* futex.c: futex operations
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/futex.h>
+#include <asm/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+/*
+ * the various futex operations; MMU fault checking is ignored under no-MMU
+ * conditions
+ */
+static inline int atomic_futex_op_xchg_set(int oparg, int __user *uaddr, int *_oldval)
+{
+	int oldval, ret;
+
+	asm("0:						\n"
+	    "	orcc		gr0,gr0,gr0,icc3	\n"	/* set ICC3.Z */
+	    "	ckeq		icc3,cc7		\n"
+	    "1:	ld.p		%M0,%1			\n"	/* LD.P/ORCR must be atomic */
+	    "	orcr		cc7,cc7,cc3		\n"	/* set CC3 to true */
+	    "2:	cst.p		%3,%M0		,cc3,#1	\n"
+	    "	corcc		gr29,gr29,gr0	,cc3,#1	\n"	/* clear ICC3.Z if store happens */
+	    "	beq		icc3,#0,0b		\n"
+	    "	setlos		0,%2			\n"
+	    "3:						\n"
+	    ".subsection 2				\n"
+	    "4:	setlos		%5,%2			\n"
+	    "	bra		3b			\n"
+	    ".previous					\n"
+	    ".section __ex_table,\"a\"			\n"
+	    "	.balign		8			\n"
+	    "	.long		1b,4b			\n"
+	    "	.long		2b,4b			\n"
+	    ".previous"
+	    : "+U"(*uaddr), "=&r"(oldval), "=&r"(ret), "=r"(oparg)
+	    : "3"(oparg), "i"(-EFAULT)
+	    : "memory", "cc7", "cc3", "icc3"
+	    );
+
+	*_oldval = oldval;
+	return ret;
+}
+
+static inline int atomic_futex_op_xchg_add(int oparg, int __user *uaddr, int *_oldval)
+{
+	int oldval, ret;
+
+	asm("0:						\n"
+	    "	orcc		gr0,gr0,gr0,icc3	\n"	/* set ICC3.Z */
+	    "	ckeq		icc3,cc7		\n"
+	    "1:	ld.p		%M0,%1			\n"	/* LD.P/ORCR must be atomic */
+	    "	orcr		cc7,cc7,cc3		\n"	/* set CC3 to true */
+	    "	add		%1,%3,%3		\n"
+	    "2:	cst.p		%3,%M0		,cc3,#1	\n"
+	    "	corcc		gr29,gr29,gr0	,cc3,#1	\n"	/* clear ICC3.Z if store happens */
+	    "	beq		icc3,#0,0b		\n"
+	    "	setlos		0,%2			\n"
+	    "3:						\n"
+	    ".subsection 2				\n"
+	    "4:	setlos		%5,%2			\n"
+	    "	bra		3b			\n"
+	    ".previous					\n"
+	    ".section __ex_table,\"a\"			\n"
+	    "	.balign		8			\n"
+	    "	.long		1b,4b			\n"
+	    "	.long		2b,4b			\n"
+	    ".previous"
+	    : "+U"(*uaddr), "=&r"(oldval), "=&r"(ret), "=r"(oparg)
+	    : "3"(oparg), "i"(-EFAULT)
+	    : "memory", "cc7", "cc3", "icc3"
+	    );
+
+	*_oldval = oldval;
+	return ret;
+}
+
+static inline int atomic_futex_op_xchg_or(int oparg, int __user *uaddr, int *_oldval)
+{
+	int oldval, ret;
+
+	asm("0:						\n"
+	    "	orcc		gr0,gr0,gr0,icc3	\n"	/* set ICC3.Z */
+	    "	ckeq		icc3,cc7		\n"
+	    "1:	ld.p		%M0,%1			\n"	/* LD.P/ORCR must be atomic */
+	    "	orcr		cc7,cc7,cc3		\n"	/* set CC3 to true */
+	    "	or		%1,%3,%3		\n"
+	    "2:	cst.p		%3,%M0		,cc3,#1	\n"
+	    "	corcc		gr29,gr29,gr0	,cc3,#1	\n"	/* clear ICC3.Z if store happens */
+	    "	beq		icc3,#0,0b		\n"
+	    "	setlos		0,%2			\n"
+	    "3:						\n"
+	    ".subsection 2				\n"
+	    "4:	setlos		%5,%2			\n"
+	    "	bra		3b			\n"
+	    ".previous					\n"
+	    ".section __ex_table,\"a\"			\n"
+	    "	.balign		8			\n"
+	    "	.long		1b,4b			\n"
+	    "	.long		2b,4b			\n"
+	    ".previous"
+	    : "+U"(*uaddr), "=&r"(oldval), "=&r"(ret), "=r"(oparg)
+	    : "3"(oparg), "i"(-EFAULT)
+	    : "memory", "cc7", "cc3", "icc3"
+	    );
+
+	*_oldval = oldval;
+	return ret;
+}
+
+static inline int atomic_futex_op_xchg_and(int oparg, int __user *uaddr, int *_oldval)
+{
+	int oldval, ret;
+
+	asm("0:						\n"
+	    "	orcc		gr0,gr0,gr0,icc3	\n"	/* set ICC3.Z */
+	    "	ckeq		icc3,cc7		\n"
+	    "1:	ld.p		%M0,%1			\n"	/* LD.P/ORCR must be atomic */
+	    "	orcr		cc7,cc7,cc3		\n"	/* set CC3 to true */
+	    "	and		%1,%3,%3		\n"
+	    "2:	cst.p		%3,%M0		,cc3,#1	\n"
+	    "	corcc		gr29,gr29,gr0	,cc3,#1	\n"	/* clear ICC3.Z if store happens */
+	    "	beq		icc3,#0,0b		\n"
+	    "	setlos		0,%2			\n"
+	    "3:						\n"
+	    ".subsection 2				\n"
+	    "4:	setlos		%5,%2			\n"
+	    "	bra		3b			\n"
+	    ".previous					\n"
+	    ".section __ex_table,\"a\"			\n"
+	    "	.balign		8			\n"
+	    "	.long		1b,4b			\n"
+	    "	.long		2b,4b			\n"
+	    ".previous"
+	    : "+U"(*uaddr), "=&r"(oldval), "=&r"(ret), "=r"(oparg)
+	    : "3"(oparg), "i"(-EFAULT)
+	    : "memory", "cc7", "cc3", "icc3"
+	    );
+
+	*_oldval = oldval;
+	return ret;
+}
+
+static inline int atomic_futex_op_xchg_xor(int oparg, int __user *uaddr, int *_oldval)
+{
+	int oldval, ret;
+
+	asm("0:						\n"
+	    "	orcc		gr0,gr0,gr0,icc3	\n"	/* set ICC3.Z */
+	    "	ckeq		icc3,cc7		\n"
+	    "1:	ld.p		%M0,%1			\n"	/* LD.P/ORCR must be atomic */
+	    "	orcr		cc7,cc7,cc3		\n"	/* set CC3 to true */
+	    "	xor		%1,%3,%3		\n"
+	    "2:	cst.p		%3,%M0		,cc3,#1	\n"
+	    "	corcc		gr29,gr29,gr0	,cc3,#1	\n"	/* clear ICC3.Z if store happens */
+	    "	beq		icc3,#0,0b		\n"
+	    "	setlos		0,%2			\n"
+	    "3:						\n"
+	    ".subsection 2				\n"
+	    "4:	setlos		%5,%2			\n"
+	    "	bra		3b			\n"
+	    ".previous					\n"
+	    ".section __ex_table,\"a\"			\n"
+	    "	.balign		8			\n"
+	    "	.long		1b,4b			\n"
+	    "	.long		2b,4b			\n"
+	    ".previous"
+	    : "+U"(*uaddr), "=&r"(oldval), "=&r"(ret), "=r"(oparg)
+	    : "3"(oparg), "i"(-EFAULT)
+	    : "memory", "cc7", "cc3", "icc3"
+	    );
+
+	*_oldval = oldval;
+	return ret;
+}
+
+/*****************************************************************************/
+/*
+ * do the futex operations
+ */
+int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret;
+
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+		ret = atomic_futex_op_xchg_set(oparg, uaddr, &oldval);
+		break;
+	case FUTEX_OP_ADD:
+		ret = atomic_futex_op_xchg_add(oparg, uaddr, &oldval);
+		break;
+	case FUTEX_OP_OR:
+		ret = atomic_futex_op_xchg_or(oparg, uaddr, &oldval);
+		break;
+	case FUTEX_OP_ANDN:
+		ret = atomic_futex_op_xchg_and(~oparg, uaddr, &oldval);
+		break;
+	case FUTEX_OP_XOR:
+		ret = atomic_futex_op_xchg_xor(oparg, uaddr, &oldval);
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS; break;
+		}
+	}
+
+	return ret;
+
+} /* end futex_atomic_op_inuser() */
diff --git a/include/asm-frv/futex.h b/include/asm-frv/futex.h
index 9feff4c..fca9d90 100644
--- a/include/asm-frv/futex.h
+++ b/include/asm-frv/futex.h
@@ -7,47 +7,7 @@
 #include <asm/errno.h>
 #include <asm/uaccess.h>
 
-static inline int
-futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
-{
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
-	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
-		return -EFAULT;
-
-	inc_preempt_count();
-
-	switch (op) {
-	case FUTEX_OP_SET:
-	case FUTEX_OP_ADD:
-	case FUTEX_OP_OR:
-	case FUTEX_OP_ANDN:
-	case FUTEX_OP_XOR:
-	default:
-		ret = -ENOSYS;
-	}
-
-	dec_preempt_count();
-
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
-	return ret;
-}
+extern int futex_atomic_op_inuser(int encoded_op, int __user *uaddr);
 
 #endif
 #endif
-- 
cgit v1.1


From 7ee1dd3fee22f15728f545d266403fc977e1eb99 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 6 Jan 2006 00:11:44 -0800
Subject: [PATCH] FRV: Make futex code compilable on nommu [try #2]

Make the futex code compilable and usable on NOMMU by making the attempt to
handle page faults conditional on CONFIG_MMU.  If this is not enabled, then
we can assume that EFAULT returned from futex_atomic_op_inuser() is not
recoverable, and that the address lies outside of valid memory.

handle_mm_fault() is made to BUG if called on NOMMU without attempting to
invoke the actual handler (__handle_mm_fault).

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 22 +++++++++++++++++++---
 kernel/futex.c     |  7 +++++++
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 26f3094..bc01fff 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -717,12 +717,28 @@ extern int vmtruncate(struct inode * inode, loff_t offset);
 extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
 extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
 extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
-extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
 
-static inline int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access)
+#ifdef CONFIG_MMU
+extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma,
+			unsigned long address, int write_access);
+
+static inline int handle_mm_fault(struct mm_struct *mm,
+			struct vm_area_struct *vma, unsigned long address,
+			int write_access)
 {
-	return __handle_mm_fault(mm, vma, address, write_access) & (~VM_FAULT_WRITE);
+	return __handle_mm_fault(mm, vma, address, write_access) &
+				(~VM_FAULT_WRITE);
 }
+#else
+static inline int handle_mm_fault(struct mm_struct *mm,
+			struct vm_area_struct *vma, unsigned long address,
+			int write_access)
+{
+	/* should never happen if there's no MMU */
+	BUG();
+	return VM_FAULT_SIGBUS;
+}
+#endif
 
 extern int make_pages_present(unsigned long addr, unsigned long end);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
diff --git a/kernel/futex.c b/kernel/futex.c
index 5e71a6b..5efa2f9 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -356,6 +356,13 @@ retry:
 		if (bh1 != bh2)
 			spin_unlock(&bh2->lock);
 
+#ifndef CONFIG_MMU
+		/* we don't get EFAULT from MMU faults if we don't have an MMU,
+		 * but we might get them from range checking */
+		ret = op_ret;
+		goto out;
+#endif
+
 		if (unlikely(op_ret != -EFAULT)) {
 			ret = op_ret;
 			goto out;
-- 
cgit v1.1


From 8efc0ab50edbac5c65191b8a58dfdab3741b7901 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 6 Jan 2006 00:11:44 -0800
Subject: [PATCH] frv: fix signal handling

The attached patch makes FRV signal handling work properly:

 (1) After do_notify_resume() has been called, the work flags must be checked
     again (there may be another signal to deliver or the process might require
     rescheduling for instance).

 (2) After the signal frame is set up on the userspace stack, ptrace() should
     be given an opportunity to single-step into the signal handler.

 (3) The error state from setting up a signal frame should be passed back up
     the call chain.

 (4) The segfault handler shouldn't be preemptively reset in the arch if we
     fail to deliver a SEGV signal: force_sig() will take care of that.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/frv/kernel/entry.S  |  2 +-
 arch/frv/kernel/signal.c | 73 +++++++++++++++++++++++++++++-------------------
 2 files changed, 46 insertions(+), 29 deletions(-)

diff --git a/arch/frv/kernel/entry.S b/arch/frv/kernel/entry.S
index ad10ea5..5f65483 100644
--- a/arch/frv/kernel/entry.S
+++ b/arch/frv/kernel/entry.S
@@ -1076,7 +1076,7 @@ __entry_work_notifysig:
 	LEDS		0x6410
 	ori.p		gr4,#0,gr8
 	call		do_notify_resume
-	bra		__entry_return_direct
+	bra		__entry_resume_userspace
 
 	# perform syscall entry tracing
 __syscall_trace_entry:
diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index d4ccc07..89a1cf5 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -297,7 +297,8 @@ static inline void __user *get_sigframe(struct k_sigaction *ka,
 /*
  *
  */
-static void setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, struct pt_regs * regs)
+static int setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
+		       struct pt_regs *regs)
 {
 	struct sigframe __user *frame;
 	int rsig;
@@ -362,26 +363,30 @@ static void setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, struct p
 
 	set_fs(USER_DS);
 
+	/* the tracer may want to single-step inside the handler */
+	if (test_thread_flag(TIF_SINGLESTEP))
+		ptrace_notify(SIGTRAP);
+
 #if DEBUG_SIG
 	printk("SIG deliver %d (%s:%d): sp=%p pc=%lx ra=%p\n",
-		sig, current->comm, current->pid, frame, regs->pc, frame->pretcode);
+	       sig, current->comm, current->pid, frame, regs->pc,
+	       frame->pretcode);
 #endif
 
-	return;
+	return 1;
 
 give_sigsegv:
-	if (sig == SIGSEGV)
-		ka->sa.sa_handler = SIG_DFL;
-
 	force_sig(SIGSEGV, current);
+	return 0;
+
 } /* end setup_frame() */
 
 /*****************************************************************************/
 /*
  *
  */
-static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
-			   sigset_t *set, struct pt_regs * regs)
+static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+			  sigset_t *set, struct pt_regs * regs)
 {
 	struct rt_sigframe __user *frame;
 	int rsig;
@@ -457,17 +462,21 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 
 	set_fs(USER_DS);
 
+	/* the tracer may want to single-step inside the handler */
+	if (test_thread_flag(TIF_SINGLESTEP))
+		ptrace_notify(SIGTRAP);
+
 #if DEBUG_SIG
 	printk("SIG deliver %d (%s:%d): sp=%p pc=%lx ra=%p\n",
-		sig, current->comm, current->pid, frame, regs->pc, frame->pretcode);
+		sig, current->comm, current->pid, frame, regs->pc,
+	       frame->pretcode);
 #endif
 
-	return;
+	return 1;
 
 give_sigsegv:
-	if (sig == SIGSEGV)
-		ka->sa.sa_handler = SIG_DFL;
 	force_sig(SIGSEGV, current);
+	return 0;
 
 } /* end setup_rt_frame() */
 
@@ -475,10 +484,12 @@ give_sigsegv:
 /*
  * OK, we're invoking a handler
  */
-static void handle_signal(unsigned long sig, siginfo_t *info,
-			  struct k_sigaction *ka, sigset_t *oldset,
-			  struct pt_regs *regs)
+static int handle_signal(unsigned long sig, siginfo_t *info,
+			 struct k_sigaction *ka, sigset_t *oldset,
+			 struct pt_regs *regs)
 {
+	int ret;
+
 	/* Are we from a system call? */
 	if (in_syscall(regs)) {
 		/* If so, check system call restarting.. */
@@ -493,6 +504,7 @@ static void handle_signal(unsigned long sig, siginfo_t *info,
 				regs->gr8 = -EINTR;
 				break;
 			}
+
 			/* fallthrough */
 		case -ERESTARTNOINTR:
 			regs->gr8 = regs->orig_gr8;
@@ -502,16 +514,22 @@ static void handle_signal(unsigned long sig, siginfo_t *info,
 
 	/* Set up the stack frame */
 	if (ka->sa.sa_flags & SA_SIGINFO)
-		setup_rt_frame(sig, ka, info, oldset, regs);
+		ret = setup_rt_frame(sig, ka, info, oldset, regs);
 	else
-		setup_frame(sig, ka, oldset, regs);
+		ret = setup_frame(sig, ka, oldset, regs);
+
+	if (ret) {
+		spin_lock_irq(&current->sighand->siglock);
+		sigorsets(&current->blocked, &current->blocked,
+			  &ka->sa.sa_mask);
+		if (!(ka->sa.sa_flags & SA_NODEFER))
+			sigaddset(&current->blocked, sig);
+		recalc_sigpending();
+		spin_unlock_irq(&current->sighand->siglock);
+	}
+
+	return ret;
 
-	spin_lock_irq(&current->sighand->siglock);
-	sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
-	if (!(ka->sa.sa_flags & SA_NODEFER))
-		sigaddset(&current->blocked, sig);
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
 } /* end handle_signal() */
 
 /*****************************************************************************/
@@ -542,12 +560,10 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
 		oldset = &current->blocked;
 
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
-	if (signr > 0) {
-		handle_signal(signr, &info, &ka, oldset, regs);
-		return 1;
-	}
+	if (signr > 0)
+		return handle_signal(signr, &info, &ka, oldset, regs);
 
- no_signal:
+no_signal:
 	/* Did we come from a system call? */
 	if (regs->syscallno >= 0) {
 		/* Restart the system call - no handlers present */
@@ -565,6 +581,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
 	}
 
 	return 0;
+
 } /* end do_signal() */
 
 /*****************************************************************************/
-- 
cgit v1.1


From fef2b580eb50281ae1d2413ab340f677f6722281 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 6 Jan 2006 00:11:45 -0800
Subject: [PATCH] frv: improve signal handling

The attached patch improves the signal handling:

 (1) It makes do_signal() static as it isn't called from anywhere outside of
     the arch code.

 (2) It removes the regs argument to all the static functions within that file,
     using __frame instead (which is the same thing held in a global register).

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/frv/kernel/signal.c | 102 +++++++++++++++++++++++------------------------
 include/asm-frv/signal.h |   1 -
 2 files changed, 50 insertions(+), 53 deletions(-)

diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index 89a1cf5..5b7146f 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -35,7 +35,7 @@ struct fdpic_func_descriptor {
 	unsigned long	GOT;
 };
 
-asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
+static int do_signal(sigset_t *oldset);
 
 /*
  * Atomically swap in the new signal mask, and wait for a signal.
@@ -55,7 +55,7 @@ asmlinkage int sys_sigsuspend(int history0, int history1, old_sigset_t mask)
 	while (1) {
 		current->state = TASK_INTERRUPTIBLE;
 		schedule();
-		if (do_signal(__frame, &saveset))
+		if (do_signal(&saveset))
 			/* return the signal number as the return value of this function
 			 * - this is an utterly evil hack. syscalls should not invoke do_signal()
 			 *   as entry.S sets regs->gr8 to the return value of the system call
@@ -91,7 +91,7 @@ asmlinkage int sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
 	while (1) {
 		current->state = TASK_INTERRUPTIBLE;
 		schedule();
-		if (do_signal(__frame, &saveset))
+		if (do_signal(&saveset))
 			/* return the signal number as the return value of this function
 			 * - this is an utterly evil hack. syscalls should not invoke do_signal()
 			 *   as entry.S sets regs->gr8 to the return value of the system call
@@ -276,13 +276,12 @@ static int setup_sigcontext(struct sigcontext __user *sc, unsigned long mask)
  * Determine which stack to use..
  */
 static inline void __user *get_sigframe(struct k_sigaction *ka,
-					struct pt_regs *regs,
 					size_t frame_size)
 {
 	unsigned long sp;
 
 	/* Default to using normal stack */
-	sp = regs->sp;
+	sp = __frame->sp;
 
 	/* This is the X/Open sanctioned signal stack switching.  */
 	if (ka->sa.sa_flags & SA_ONSTACK) {
@@ -291,19 +290,19 @@ static inline void __user *get_sigframe(struct k_sigaction *ka,
 	}
 
 	return (void __user *) ((sp - frame_size) & ~7UL);
+
 } /* end get_sigframe() */
 
 /*****************************************************************************/
 /*
  *
  */
-static int setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
-		       struct pt_regs *regs)
+static int setup_frame(int sig, struct k_sigaction *ka, sigset_t *set)
 {
 	struct sigframe __user *frame;
 	int rsig;
 
-	frame = get_sigframe(ka, regs, sizeof(*frame));
+	frame = get_sigframe(ka, sizeof(*frame));
 
 	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
 		goto give_sigsegv;
@@ -347,18 +346,18 @@ static int setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
 	}
 
 	/* set up registers for signal handler */
-	regs->sp   = (unsigned long) frame;
-	regs->lr   = (unsigned long) &frame->retcode;
-	regs->gr8  = sig;
+	__frame->sp   = (unsigned long) frame;
+	__frame->lr   = (unsigned long) &frame->retcode;
+	__frame->gr8  = sig;
 
 	if (get_personality & FDPIC_FUNCPTRS) {
 		struct fdpic_func_descriptor __user *funcptr =
 			(struct fdpic_func_descriptor *) ka->sa.sa_handler;
-		__get_user(regs->pc, &funcptr->text);
-		__get_user(regs->gr15, &funcptr->GOT);
+		__get_user(__frame->pc, &funcptr->text);
+		__get_user(__frame->gr15, &funcptr->GOT);
 	} else {
-		regs->pc   = (unsigned long) ka->sa.sa_handler;
-		regs->gr15 = 0;
+		__frame->pc   = (unsigned long) ka->sa.sa_handler;
+		__frame->gr15 = 0;
 	}
 
 	set_fs(USER_DS);
@@ -369,7 +368,7 @@ static int setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
 
 #if DEBUG_SIG
 	printk("SIG deliver %d (%s:%d): sp=%p pc=%lx ra=%p\n",
-	       sig, current->comm, current->pid, frame, regs->pc,
+	       sig, current->comm, current->pid, frame, __frame->pc,
 	       frame->pretcode);
 #endif
 
@@ -386,12 +385,12 @@ give_sigsegv:
  *
  */
 static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
-			  sigset_t *set, struct pt_regs * regs)
+			  sigset_t *set)
 {
 	struct rt_sigframe __user *frame;
 	int rsig;
 
-	frame = get_sigframe(ka, regs, sizeof(*frame));
+	frame = get_sigframe(ka, sizeof(*frame));
 
 	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
 		goto give_sigsegv;
@@ -414,7 +413,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	if (__put_user(0, &frame->uc.uc_flags) ||
 	    __put_user(0, &frame->uc.uc_link) ||
 	    __put_user((void*)current->sas_ss_sp, &frame->uc.uc_stack.ss_sp) ||
-	    __put_user(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags) ||
+	    __put_user(sas_ss_flags(__frame->sp), &frame->uc.uc_stack.ss_flags) ||
 	    __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size))
 		goto give_sigsegv;
 
@@ -445,19 +444,19 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	}
 
 	/* Set up registers for signal handler */
-	regs->sp  = (unsigned long) frame;
-	regs->lr   = (unsigned long) &frame->retcode;
-	regs->gr8 = sig;
-	regs->gr9 = (unsigned long) &frame->info;
+	__frame->sp  = (unsigned long) frame;
+	__frame->lr   = (unsigned long) &frame->retcode;
+	__frame->gr8 = sig;
+	__frame->gr9 = (unsigned long) &frame->info;
 
 	if (get_personality & FDPIC_FUNCPTRS) {
 		struct fdpic_func_descriptor *funcptr =
 			(struct fdpic_func_descriptor __user *) ka->sa.sa_handler;
-		__get_user(regs->pc, &funcptr->text);
-		__get_user(regs->gr15, &funcptr->GOT);
+		__get_user(__frame->pc, &funcptr->text);
+		__get_user(__frame->gr15, &funcptr->GOT);
 	} else {
-		regs->pc   = (unsigned long) ka->sa.sa_handler;
-		regs->gr15 = 0;
+		__frame->pc   = (unsigned long) ka->sa.sa_handler;
+		__frame->gr15 = 0;
 	}
 
 	set_fs(USER_DS);
@@ -468,7 +467,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 
 #if DEBUG_SIG
 	printk("SIG deliver %d (%s:%d): sp=%p pc=%lx ra=%p\n",
-		sig, current->comm, current->pid, frame, regs->pc,
+	       sig, current->comm, current->pid, frame, __frame->pc,
 	       frame->pretcode);
 #endif
 
@@ -485,38 +484,37 @@ give_sigsegv:
  * OK, we're invoking a handler
  */
 static int handle_signal(unsigned long sig, siginfo_t *info,
-			 struct k_sigaction *ka, sigset_t *oldset,
-			 struct pt_regs *regs)
+			 struct k_sigaction *ka, sigset_t *oldset)
 {
 	int ret;
 
 	/* Are we from a system call? */
-	if (in_syscall(regs)) {
+	if (in_syscall(__frame)) {
 		/* If so, check system call restarting.. */
-		switch (regs->gr8) {
+		switch (__frame->gr8) {
 		case -ERESTART_RESTARTBLOCK:
 		case -ERESTARTNOHAND:
-			regs->gr8 = -EINTR;
+			__frame->gr8 = -EINTR;
 			break;
 
 		case -ERESTARTSYS:
 			if (!(ka->sa.sa_flags & SA_RESTART)) {
-				regs->gr8 = -EINTR;
+				__frame->gr8 = -EINTR;
 				break;
 			}
 
 			/* fallthrough */
 		case -ERESTARTNOINTR:
-			regs->gr8 = regs->orig_gr8;
-			regs->pc -= 4;
+			__frame->gr8 = __frame->orig_gr8;
+			__frame->pc -= 4;
 		}
 	}
 
 	/* Set up the stack frame */
 	if (ka->sa.sa_flags & SA_SIGINFO)
-		ret = setup_rt_frame(sig, ka, info, oldset, regs);
+		ret = setup_rt_frame(sig, ka, info, oldset);
 	else
-		ret = setup_frame(sig, ka, oldset, regs);
+		ret = setup_frame(sig, ka, oldset);
 
 	if (ret) {
 		spin_lock_irq(&current->sighand->siglock);
@@ -538,7 +536,7 @@ static int handle_signal(unsigned long sig, siginfo_t *info,
  * want to handle. Thus you cannot kill init even with a SIGKILL even by
  * mistake.
  */
-int do_signal(struct pt_regs *regs, sigset_t *oldset)
+static int do_signal(sigset_t *oldset)
 {
 	struct k_sigaction ka;
 	siginfo_t info;
@@ -550,7 +548,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
 	 * kernel mode. Just return without doing anything
 	 * if so.
 	 */
-	if (!user_mode(regs))
+	if (!user_mode(__frame))
 		return 1;
 
 	if (try_to_freeze())
@@ -559,24 +557,24 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
 	if (!oldset)
 		oldset = &current->blocked;
 
-	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
+	signr = get_signal_to_deliver(&info, &ka, __frame, NULL);
 	if (signr > 0)
-		return handle_signal(signr, &info, &ka, oldset, regs);
+		return handle_signal(signr, &info, &ka, oldset);
 
 no_signal:
 	/* Did we come from a system call? */
-	if (regs->syscallno >= 0) {
+	if (__frame->syscallno >= 0) {
 		/* Restart the system call - no handlers present */
-		if (regs->gr8 == -ERESTARTNOHAND ||
-		    regs->gr8 == -ERESTARTSYS ||
-		    regs->gr8 == -ERESTARTNOINTR) {
-			regs->gr8 = regs->orig_gr8;
-			regs->pc -= 4;
+		if (__frame->gr8 == -ERESTARTNOHAND ||
+		    __frame->gr8 == -ERESTARTSYS ||
+		    __frame->gr8 == -ERESTARTNOINTR) {
+			__frame->gr8 = __frame->orig_gr8;
+			__frame->pc -= 4;
 		}
 
-		if (regs->gr8 == -ERESTART_RESTARTBLOCK){
-			regs->gr8 = __NR_restart_syscall;
-			regs->pc -= 4;
+		if (__frame->gr8 == -ERESTART_RESTARTBLOCK){
+			__frame->gr8 = __NR_restart_syscall;
+			__frame->pc -= 4;
 		}
 	}
 
@@ -597,6 +595,6 @@ asmlinkage void do_notify_resume(__u32 thread_info_flags)
 
 	/* deal with pending signal delivery */
 	if (thread_info_flags & _TIF_SIGPENDING)
-		do_signal(__frame, NULL);
+		do_signal(NULL);
 
 } /* end do_notify_resume() */
diff --git a/include/asm-frv/signal.h b/include/asm-frv/signal.h
index d407bde..6736689 100644
--- a/include/asm-frv/signal.h
+++ b/include/asm-frv/signal.h
@@ -151,7 +151,6 @@ typedef struct sigaltstack {
 	size_t ss_size;
 } stack_t;
 
-extern int do_signal(struct pt_regs *regs, sigset_t *oldset);
 #define ptrace_signal_deliver(regs, cookie) do { } while (0)
 
 #ifdef __KERNEL__
-- 
cgit v1.1


From 599a6e8ca4ff7f453f847217ecc2718d68e3b0f6 Mon Sep 17 00:00:00 2001
From: Domen Puncer <domen@coderock.org>
Date: Fri, 6 Jan 2006 00:11:46 -0800
Subject: [PATCH] mips: remove include/asm-mips/riscos-syscall.h

Remove a file that is not referenced anywhere ("grep riscos -r ." didn't find anything).

Signed-off-by: Domen Puncer <domen@coderock.org>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-mips/riscos-syscall.h | 979 --------------------------------------
 1 file changed, 979 deletions(-)
 delete mode 100644 include/asm-mips/riscos-syscall.h

diff --git a/include/asm-mips/riscos-syscall.h b/include/asm-mips/riscos-syscall.h
deleted file mode 100644
index 4d8eb15..0000000
--- a/include/asm-mips/riscos-syscall.h
+++ /dev/null
@@ -1,979 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 1995, 96, 97, 98, 99, 2000 by Ralf Baechle
- */
-#ifndef _ASM_RISCOS_SYSCALL_H
-#define _ASM_RISCOS_SYSCALL_H
-
-/*
- * The syscalls 0 - 3999 are reserved for a down to the root syscall
- * compatibility with RISC/os and IRIX.  We'll see how to deal with the
- * various "real" BSD variants like Ultrix, NetBSD ...
- */
-
-/*
- * SVR4 syscalls are in the range from 1 to 999
- */
-#define __NR_SVR4			0
-#define __NR_SVR4_syscall		(__NR_SVR4 +   0)
-#define __NR_SVR4_exit			(__NR_SVR4 +   1)
-#define __NR_SVR4_fork			(__NR_SVR4 +   2)
-#define __NR_SVR4_read			(__NR_SVR4 +   3)
-#define __NR_SVR4_write			(__NR_SVR4 +   4)
-#define __NR_SVR4_open			(__NR_SVR4 +   5)
-#define __NR_SVR4_close			(__NR_SVR4 +   6)
-#define __NR_SVR4_wait			(__NR_SVR4 +   7)
-#define __NR_SVR4_creat			(__NR_SVR4 +   8)
-#define __NR_SVR4_link			(__NR_SVR4 +   9)
-#define __NR_SVR4_unlink		(__NR_SVR4 +  10)
-#define __NR_SVR4_exec			(__NR_SVR4 +  11)
-#define __NR_SVR4_chdir			(__NR_SVR4 +  12)
-#define __NR_SVR4_gtime			(__NR_SVR4 +  13)
-#define __NR_SVR4_mknod			(__NR_SVR4 +  14)
-#define __NR_SVR4_chmod			(__NR_SVR4 +  15)
-#define __NR_SVR4_chown			(__NR_SVR4 +  16)
-#define __NR_SVR4_sbreak		(__NR_SVR4 +  17)
-#define __NR_SVR4_stat			(__NR_SVR4 +  18)
-#define __NR_SVR4_lseek			(__NR_SVR4 +  19)
-#define __NR_SVR4_getpid		(__NR_SVR4 +  20)
-#define __NR_SVR4_mount			(__NR_SVR4 +  21)
-#define __NR_SVR4_umount		(__NR_SVR4 +  22)
-#define __NR_SVR4_setuid		(__NR_SVR4 +  23)
-#define __NR_SVR4_getuid		(__NR_SVR4 +  24)
-#define __NR_SVR4_stime			(__NR_SVR4 +  25)
-#define __NR_SVR4_ptrace		(__NR_SVR4 +  26)
-#define __NR_SVR4_alarm			(__NR_SVR4 +  27)
-#define __NR_SVR4_fstat			(__NR_SVR4 +  28)
-#define __NR_SVR4_pause			(__NR_SVR4 +  29)
-#define __NR_SVR4_utime			(__NR_SVR4 +  30)
-#define __NR_SVR4_stty			(__NR_SVR4 +  31)
-#define __NR_SVR4_gtty			(__NR_SVR4 +  32)
-#define __NR_SVR4_access		(__NR_SVR4 +  33)
-#define __NR_SVR4_nice			(__NR_SVR4 +  34)
-#define __NR_SVR4_statfs		(__NR_SVR4 +  35)
-#define __NR_SVR4_sync			(__NR_SVR4 +  36)
-#define __NR_SVR4_kill			(__NR_SVR4 +  37)
-#define __NR_SVR4_fstatfs		(__NR_SVR4 +  38)
-#define __NR_SVR4_setpgrp		(__NR_SVR4 +  39)
-#define __NR_SVR4_cxenix		(__NR_SVR4 +  40)
-#define __NR_SVR4_dup			(__NR_SVR4 +  41)
-#define __NR_SVR4_pipe			(__NR_SVR4 +  42)
-#define __NR_SVR4_times			(__NR_SVR4 +  43)
-#define __NR_SVR4_profil		(__NR_SVR4 +  44)
-#define __NR_SVR4_plock			(__NR_SVR4 +  45)
-#define __NR_SVR4_setgid		(__NR_SVR4 +  46)
-#define __NR_SVR4_getgid		(__NR_SVR4 +  47)
-#define __NR_SVR4_sig			(__NR_SVR4 +  48)
-#define __NR_SVR4_msgsys		(__NR_SVR4 +  49)
-#define __NR_SVR4_sysmips		(__NR_SVR4 +  50)
-#define __NR_SVR4_sysacct		(__NR_SVR4 +  51)
-#define __NR_SVR4_shmsys		(__NR_SVR4 +  52)
-#define __NR_SVR4_semsys		(__NR_SVR4 +  53)
-#define __NR_SVR4_ioctl			(__NR_SVR4 +  54)
-#define __NR_SVR4_uadmin		(__NR_SVR4 +  55)
-#define __NR_SVR4_exch 			(__NR_SVR4 +  56)
-#define __NR_SVR4_utssys		(__NR_SVR4 +  57)
-#define __NR_SVR4_fsync			(__NR_SVR4 +  58)
-#define __NR_SVR4_exece			(__NR_SVR4 +  59)
-#define __NR_SVR4_umask			(__NR_SVR4 +  60)
-#define __NR_SVR4_chroot		(__NR_SVR4 +  61)
-#define __NR_SVR4_fcntl			(__NR_SVR4 +  62)
-#define __NR_SVR4_ulimit		(__NR_SVR4 +  63)
-#define __NR_SVR4_reserved1		(__NR_SVR4 +  64)
-#define __NR_SVR4_reserved2		(__NR_SVR4 +  65)
-#define __NR_SVR4_reserved3		(__NR_SVR4 +  66)
-#define __NR_SVR4_reserved4		(__NR_SVR4 +  67)
-#define __NR_SVR4_reserved5		(__NR_SVR4 +  68)
-#define __NR_SVR4_reserved6		(__NR_SVR4 +  69)
-#define __NR_SVR4_advfs			(__NR_SVR4 +  70)
-#define __NR_SVR4_unadvfs		(__NR_SVR4 +  71)
-#define __NR_SVR4_unused1		(__NR_SVR4 +  72)
-#define __NR_SVR4_unused2		(__NR_SVR4 +  73)
-#define __NR_SVR4_rfstart		(__NR_SVR4 +  74)
-#define __NR_SVR4_unused3		(__NR_SVR4 +  75)
-#define __NR_SVR4_rdebug		(__NR_SVR4 +  76)
-#define __NR_SVR4_rfstop		(__NR_SVR4 +  77)
-#define __NR_SVR4_rfsys			(__NR_SVR4 +  78)
-#define __NR_SVR4_rmdir			(__NR_SVR4 +  79)
-#define __NR_SVR4_mkdir			(__NR_SVR4 +  80)
-#define __NR_SVR4_getdents		(__NR_SVR4 +  81)
-#define __NR_SVR4_libattach		(__NR_SVR4 +  82)
-#define __NR_SVR4_libdetach		(__NR_SVR4 +  83)
-#define __NR_SVR4_sysfs			(__NR_SVR4 +  84)
-#define __NR_SVR4_getmsg		(__NR_SVR4 +  85)
-#define __NR_SVR4_putmsg		(__NR_SVR4 +  86)
-#define __NR_SVR4_poll			(__NR_SVR4 +  87)
-#define __NR_SVR4_lstat			(__NR_SVR4 +  88)
-#define __NR_SVR4_symlink		(__NR_SVR4 +  89)
-#define __NR_SVR4_readlink		(__NR_SVR4 +  90)
-#define __NR_SVR4_setgroups		(__NR_SVR4 +  91)
-#define __NR_SVR4_getgroups		(__NR_SVR4 +  92)
-#define __NR_SVR4_fchmod		(__NR_SVR4 +  93)
-#define __NR_SVR4_fchown		(__NR_SVR4 +  94)
-#define __NR_SVR4_sigprocmask		(__NR_SVR4 +  95)
-#define __NR_SVR4_sigsuspend		(__NR_SVR4 +  96)
-#define __NR_SVR4_sigaltstack		(__NR_SVR4 +  97)
-#define __NR_SVR4_sigaction		(__NR_SVR4 +  98)
-#define __NR_SVR4_sigpending		(__NR_SVR4 +  99)
-#define __NR_SVR4_setcontext		(__NR_SVR4 + 100)
-#define __NR_SVR4_evsys			(__NR_SVR4 + 101)
-#define __NR_SVR4_evtrapret		(__NR_SVR4 + 102)
-#define __NR_SVR4_statvfs		(__NR_SVR4 + 103)
-#define __NR_SVR4_fstatvfs		(__NR_SVR4 + 104)
-#define __NR_SVR4_reserved7		(__NR_SVR4 + 105)
-#define __NR_SVR4_nfssys		(__NR_SVR4 + 106)
-#define __NR_SVR4_waitid		(__NR_SVR4 + 107)
-#define __NR_SVR4_sigsendset		(__NR_SVR4 + 108)
-#define __NR_SVR4_hrtsys		(__NR_SVR4 + 109)
-#define __NR_SVR4_acancel		(__NR_SVR4 + 110)
-#define __NR_SVR4_async			(__NR_SVR4 + 111)
-#define __NR_SVR4_priocntlset		(__NR_SVR4 + 112)
-#define __NR_SVR4_pathconf		(__NR_SVR4 + 113)
-#define __NR_SVR4_mincore		(__NR_SVR4 + 114)
-#define __NR_SVR4_mmap			(__NR_SVR4 + 115)
-#define __NR_SVR4_mprotect		(__NR_SVR4 + 116)
-#define __NR_SVR4_munmap		(__NR_SVR4 + 117)
-#define __NR_SVR4_fpathconf		(__NR_SVR4 + 118)
-#define __NR_SVR4_vfork			(__NR_SVR4 + 119)
-#define __NR_SVR4_fchdir		(__NR_SVR4 + 120)
-#define __NR_SVR4_readv			(__NR_SVR4 + 121)
-#define __NR_SVR4_writev		(__NR_SVR4 + 122)
-#define __NR_SVR4_xstat			(__NR_SVR4 + 123)
-#define __NR_SVR4_lxstat		(__NR_SVR4 + 124)
-#define __NR_SVR4_fxstat		(__NR_SVR4 + 125)
-#define __NR_SVR4_xmknod		(__NR_SVR4 + 126)
-#define __NR_SVR4_clocal		(__NR_SVR4 + 127)
-#define __NR_SVR4_setrlimit		(__NR_SVR4 + 128)
-#define __NR_SVR4_getrlimit		(__NR_SVR4 + 129)
-#define __NR_SVR4_lchown		(__NR_SVR4 + 130)
-#define __NR_SVR4_memcntl		(__NR_SVR4 + 131)
-#define __NR_SVR4_getpmsg		(__NR_SVR4 + 132)
-#define __NR_SVR4_putpmsg		(__NR_SVR4 + 133)
-#define __NR_SVR4_rename		(__NR_SVR4 + 134)
-#define __NR_SVR4_nuname		(__NR_SVR4 + 135)
-#define __NR_SVR4_setegid		(__NR_SVR4 + 136)
-#define __NR_SVR4_sysconf		(__NR_SVR4 + 137)
-#define __NR_SVR4_adjtime		(__NR_SVR4 + 138)
-#define __NR_SVR4_sysinfo		(__NR_SVR4 + 139)
-#define __NR_SVR4_reserved8		(__NR_SVR4 + 140)
-#define __NR_SVR4_seteuid		(__NR_SVR4 + 141)
-#define __NR_SVR4_PYRAMID_statis	(__NR_SVR4 + 142)
-#define __NR_SVR4_PYRAMID_tuning	(__NR_SVR4 + 143)
-#define __NR_SVR4_PYRAMID_forcerr	(__NR_SVR4 + 144)
-#define __NR_SVR4_PYRAMID_mpcntl	(__NR_SVR4 + 145)
-#define __NR_SVR4_reserved9		(__NR_SVR4 + 146)
-#define __NR_SVR4_reserved10		(__NR_SVR4 + 147)
-#define __NR_SVR4_reserved11		(__NR_SVR4 + 148)
-#define __NR_SVR4_reserved12		(__NR_SVR4 + 149)
-#define __NR_SVR4_reserved13		(__NR_SVR4 + 150)
-#define __NR_SVR4_reserved14		(__NR_SVR4 + 151)
-#define __NR_SVR4_reserved15		(__NR_SVR4 + 152)
-#define __NR_SVR4_reserved16		(__NR_SVR4 + 153)
-#define __NR_SVR4_reserved17		(__NR_SVR4 + 154)
-#define __NR_SVR4_reserved18		(__NR_SVR4 + 155)
-#define __NR_SVR4_reserved19		(__NR_SVR4 + 156)
-#define __NR_SVR4_reserved20		(__NR_SVR4 + 157)
-#define __NR_SVR4_reserved21		(__NR_SVR4 + 158)
-#define __NR_SVR4_reserved22		(__NR_SVR4 + 159)
-#define __NR_SVR4_reserved23		(__NR_SVR4 + 160)
-#define __NR_SVR4_reserved24		(__NR_SVR4 + 161)
-#define __NR_SVR4_reserved25		(__NR_SVR4 + 162)
-#define __NR_SVR4_reserved26		(__NR_SVR4 + 163)
-#define __NR_SVR4_reserved27		(__NR_SVR4 + 164)
-#define __NR_SVR4_reserved28		(__NR_SVR4 + 165)
-#define __NR_SVR4_reserved29		(__NR_SVR4 + 166)
-#define __NR_SVR4_reserved30		(__NR_SVR4 + 167)
-#define __NR_SVR4_reserved31		(__NR_SVR4 + 168)
-#define __NR_SVR4_reserved32		(__NR_SVR4 + 169)
-#define __NR_SVR4_reserved33		(__NR_SVR4 + 170)
-#define __NR_SVR4_reserved34		(__NR_SVR4 + 171)
-#define __NR_SVR4_reserved35		(__NR_SVR4 + 172)
-#define __NR_SVR4_reserved36		(__NR_SVR4 + 173)
-#define __NR_SVR4_reserved37		(__NR_SVR4 + 174)
-#define __NR_SVR4_reserved38		(__NR_SVR4 + 175)
-#define __NR_SVR4_reserved39		(__NR_SVR4 + 176)
-#define __NR_SVR4_reserved40		(__NR_SVR4 + 177)
-#define __NR_SVR4_reserved41		(__NR_SVR4 + 178)
-#define __NR_SVR4_reserved42		(__NR_SVR4 + 179)
-#define __NR_SVR4_reserved43		(__NR_SVR4 + 180)
-#define __NR_SVR4_reserved44		(__NR_SVR4 + 181)
-#define __NR_SVR4_reserved45		(__NR_SVR4 + 182)
-#define __NR_SVR4_reserved46		(__NR_SVR4 + 183)
-#define __NR_SVR4_reserved47		(__NR_SVR4 + 184)
-#define __NR_SVR4_reserved48		(__NR_SVR4 + 185)
-#define __NR_SVR4_reserved49		(__NR_SVR4 + 186)
-#define __NR_SVR4_reserved50		(__NR_SVR4 + 187)
-#define __NR_SVR4_reserved51		(__NR_SVR4 + 188)
-#define __NR_SVR4_reserved52		(__NR_SVR4 + 189)
-#define __NR_SVR4_reserved53		(__NR_SVR4 + 190)
-#define __NR_SVR4_reserved54		(__NR_SVR4 + 191)
-#define __NR_SVR4_reserved55		(__NR_SVR4 + 192)
-#define __NR_SVR4_reserved56		(__NR_SVR4 + 193)
-#define __NR_SVR4_reserved57		(__NR_SVR4 + 194)
-#define __NR_SVR4_reserved58		(__NR_SVR4 + 195)
-#define __NR_SVR4_reserved59		(__NR_SVR4 + 196)
-#define __NR_SVR4_reserved60		(__NR_SVR4 + 197)
-#define __NR_SVR4_reserved61		(__NR_SVR4 + 198)
-#define __NR_SVR4_reserved62		(__NR_SVR4 + 199)
-#define __NR_SVR4_reserved63		(__NR_SVR4 + 200)
-#define __NR_SVR4_aread			(__NR_SVR4 + 201)
-#define __NR_SVR4_awrite		(__NR_SVR4 + 202)
-#define __NR_SVR4_listio		(__NR_SVR4 + 203)
-#define __NR_SVR4_mips_acancel		(__NR_SVR4 + 204)
-#define __NR_SVR4_astatus		(__NR_SVR4 + 205)
-#define __NR_SVR4_await			(__NR_SVR4 + 206)
-#define __NR_SVR4_areadv		(__NR_SVR4 + 207)
-#define __NR_SVR4_awritev		(__NR_SVR4 + 208)
-#define __NR_SVR4_MIPS_reserved1	(__NR_SVR4 + 209)
-#define __NR_SVR4_MIPS_reserved2	(__NR_SVR4 + 210)
-#define __NR_SVR4_MIPS_reserved3	(__NR_SVR4 + 211)
-#define __NR_SVR4_MIPS_reserved4	(__NR_SVR4 + 212)
-#define __NR_SVR4_MIPS_reserved5	(__NR_SVR4 + 213)
-#define __NR_SVR4_MIPS_reserved6	(__NR_SVR4 + 214)
-#define __NR_SVR4_MIPS_reserved7	(__NR_SVR4 + 215)
-#define __NR_SVR4_MIPS_reserved8	(__NR_SVR4 + 216)
-#define __NR_SVR4_MIPS_reserved9	(__NR_SVR4 + 217)
-#define __NR_SVR4_MIPS_reserved10	(__NR_SVR4 + 218)
-#define __NR_SVR4_MIPS_reserved11	(__NR_SVR4 + 219)
-#define __NR_SVR4_MIPS_reserved12	(__NR_SVR4 + 220)
-#define __NR_SVR4_CDC_reserved1		(__NR_SVR4 + 221)
-#define __NR_SVR4_CDC_reserved2		(__NR_SVR4 + 222)
-#define __NR_SVR4_CDC_reserved3		(__NR_SVR4 + 223)
-#define __NR_SVR4_CDC_reserved4		(__NR_SVR4 + 224)
-#define __NR_SVR4_CDC_reserved5		(__NR_SVR4 + 225)
-#define __NR_SVR4_CDC_reserved6		(__NR_SVR4 + 226)
-#define __NR_SVR4_CDC_reserved7		(__NR_SVR4 + 227)
-#define __NR_SVR4_CDC_reserved8		(__NR_SVR4 + 228)
-#define __NR_SVR4_CDC_reserved9		(__NR_SVR4 + 229)
-#define __NR_SVR4_CDC_reserved10	(__NR_SVR4 + 230)
-#define __NR_SVR4_CDC_reserved11	(__NR_SVR4 + 231)
-#define __NR_SVR4_CDC_reserved12	(__NR_SVR4 + 232)
-#define __NR_SVR4_CDC_reserved13	(__NR_SVR4 + 233)
-#define __NR_SVR4_CDC_reserved14	(__NR_SVR4 + 234)
-#define __NR_SVR4_CDC_reserved15	(__NR_SVR4 + 235)
-#define __NR_SVR4_CDC_reserved16	(__NR_SVR4 + 236)
-#define __NR_SVR4_CDC_reserved17	(__NR_SVR4 + 237)
-#define __NR_SVR4_CDC_reserved18	(__NR_SVR4 + 238)
-#define __NR_SVR4_CDC_reserved19	(__NR_SVR4 + 239)
-#define __NR_SVR4_CDC_reserved20	(__NR_SVR4 + 240)
-
-/*
- * SYS V syscalls are in the range from 1000 to 1999
- */
-#define __NR_SYSV			1000
-#define __NR_SYSV_syscall		(__NR_SYSV +   0)
-#define __NR_SYSV_exit			(__NR_SYSV +   1)
-#define __NR_SYSV_fork			(__NR_SYSV +   2)
-#define __NR_SYSV_read			(__NR_SYSV +   3)
-#define __NR_SYSV_write			(__NR_SYSV +   4)
-#define __NR_SYSV_open			(__NR_SYSV +   5)
-#define __NR_SYSV_close			(__NR_SYSV +   6)
-#define __NR_SYSV_wait			(__NR_SYSV +   7)
-#define __NR_SYSV_creat			(__NR_SYSV +   8)
-#define __NR_SYSV_link			(__NR_SYSV +   9)
-#define __NR_SYSV_unlink		(__NR_SYSV +  10)
-#define __NR_SYSV_execv			(__NR_SYSV +  11)
-#define __NR_SYSV_chdir			(__NR_SYSV +  12)
-#define __NR_SYSV_time			(__NR_SYSV +  13)
-#define __NR_SYSV_mknod			(__NR_SYSV +  14)
-#define __NR_SYSV_chmod			(__NR_SYSV +  15)
-#define __NR_SYSV_chown			(__NR_SYSV +  16)
-#define __NR_SYSV_brk			(__NR_SYSV +  17)
-#define __NR_SYSV_stat			(__NR_SYSV +  18)
-#define __NR_SYSV_lseek			(__NR_SYSV +  19)
-#define __NR_SYSV_getpid		(__NR_SYSV +  20)
-#define __NR_SYSV_mount			(__NR_SYSV +  21)
-#define __NR_SYSV_umount		(__NR_SYSV +  22)
-#define __NR_SYSV_setuid		(__NR_SYSV +  23)
-#define __NR_SYSV_getuid		(__NR_SYSV +  24)
-#define __NR_SYSV_stime			(__NR_SYSV +  25)
-#define __NR_SYSV_ptrace		(__NR_SYSV +  26)
-#define __NR_SYSV_alarm			(__NR_SYSV +  27)
-#define __NR_SYSV_fstat			(__NR_SYSV +  28)
-#define __NR_SYSV_pause			(__NR_SYSV +  29)
-#define __NR_SYSV_utime			(__NR_SYSV +  30)
-#define __NR_SYSV_stty			(__NR_SYSV +  31)
-#define __NR_SYSV_gtty			(__NR_SYSV +  32)
-#define __NR_SYSV_access		(__NR_SYSV +  33)
-#define __NR_SYSV_nice			(__NR_SYSV +  34)
-#define __NR_SYSV_statfs		(__NR_SYSV +  35)
-#define __NR_SYSV_sync			(__NR_SYSV +  36)
-#define __NR_SYSV_kill			(__NR_SYSV +  37)
-#define __NR_SYSV_fstatfs		(__NR_SYSV +  38)
-#define __NR_SYSV_setpgrp		(__NR_SYSV +  39)
-#define __NR_SYSV_syssgi		(__NR_SYSV +  40)
-#define __NR_SYSV_dup			(__NR_SYSV +  41)
-#define __NR_SYSV_pipe			(__NR_SYSV +  42)
-#define __NR_SYSV_times			(__NR_SYSV +  43)
-#define __NR_SYSV_profil		(__NR_SYSV +  44)
-#define __NR_SYSV_plock			(__NR_SYSV +  45)
-#define __NR_SYSV_setgid		(__NR_SYSV +  46)
-#define __NR_SYSV_getgid		(__NR_SYSV +  47)
-#define __NR_SYSV_sig			(__NR_SYSV +  48)
-#define __NR_SYSV_msgsys		(__NR_SYSV +  49)
-#define __NR_SYSV_sysmips		(__NR_SYSV +  50)
-#define __NR_SYSV_acct			(__NR_SYSV +  51)
-#define __NR_SYSV_shmsys		(__NR_SYSV +  52)
-#define __NR_SYSV_semsys		(__NR_SYSV +  53)
-#define __NR_SYSV_ioctl			(__NR_SYSV +  54)
-#define __NR_SYSV_uadmin		(__NR_SYSV +  55)
-#define __NR_SYSV_sysmp			(__NR_SYSV +  56)
-#define __NR_SYSV_utssys		(__NR_SYSV +  57)
-#define __NR_SYSV_USG_reserved1		(__NR_SYSV +  58)
-#define __NR_SYSV_execve		(__NR_SYSV +  59)
-#define __NR_SYSV_umask			(__NR_SYSV +  60)
-#define __NR_SYSV_chroot		(__NR_SYSV +  61)
-#define __NR_SYSV_fcntl			(__NR_SYSV +  62)
-#define __NR_SYSV_ulimit		(__NR_SYSV +  63)
-#define __NR_SYSV_SAFARI4_reserved1	(__NR_SYSV +  64)
-#define __NR_SYSV_SAFARI4_reserved2	(__NR_SYSV +  65)
-#define __NR_SYSV_SAFARI4_reserved3	(__NR_SYSV +  66)
-#define __NR_SYSV_SAFARI4_reserved4	(__NR_SYSV +  67)
-#define __NR_SYSV_SAFARI4_reserved5	(__NR_SYSV +  68)
-#define __NR_SYSV_SAFARI4_reserved6	(__NR_SYSV +  69)
-#define __NR_SYSV_advfs			(__NR_SYSV +  70)
-#define __NR_SYSV_unadvfs		(__NR_SYSV +  71)
-#define __NR_SYSV_rmount		(__NR_SYSV +  72)
-#define __NR_SYSV_rumount		(__NR_SYSV +  73)
-#define __NR_SYSV_rfstart		(__NR_SYSV +  74)
-#define __NR_SYSV_getrlimit64		(__NR_SYSV +  75)
-#define __NR_SYSV_setrlimit64		(__NR_SYSV +  76)
-#define __NR_SYSV_nanosleep		(__NR_SYSV +  77)
-#define __NR_SYSV_lseek64		(__NR_SYSV +  78)
-#define __NR_SYSV_rmdir			(__NR_SYSV +  79)
-#define __NR_SYSV_mkdir			(__NR_SYSV +  80)
-#define __NR_SYSV_getdents		(__NR_SYSV +  81)
-#define __NR_SYSV_sginap		(__NR_SYSV +  82)
-#define __NR_SYSV_sgikopt		(__NR_SYSV +  83)
-#define __NR_SYSV_sysfs			(__NR_SYSV +  84)
-#define __NR_SYSV_getmsg		(__NR_SYSV +  85)
-#define __NR_SYSV_putmsg		(__NR_SYSV +  86)
-#define __NR_SYSV_poll			(__NR_SYSV +  87)
-#define __NR_SYSV_sigreturn		(__NR_SYSV +  88)
-#define __NR_SYSV_accept		(__NR_SYSV +  89)
-#define __NR_SYSV_bind			(__NR_SYSV +  90)
-#define __NR_SYSV_connect		(__NR_SYSV +  91)
-#define __NR_SYSV_gethostid		(__NR_SYSV +  92)
-#define __NR_SYSV_getpeername		(__NR_SYSV +  93)
-#define __NR_SYSV_getsockname		(__NR_SYSV +  94)
-#define __NR_SYSV_getsockopt		(__NR_SYSV +  95)
-#define __NR_SYSV_listen		(__NR_SYSV +  96)
-#define __NR_SYSV_recv			(__NR_SYSV +  97)
-#define __NR_SYSV_recvfrom		(__NR_SYSV +  98)
-#define __NR_SYSV_recvmsg		(__NR_SYSV +  99)
-#define __NR_SYSV_select		(__NR_SYSV + 100)
-#define __NR_SYSV_send			(__NR_SYSV + 101)
-#define __NR_SYSV_sendmsg		(__NR_SYSV + 102)
-#define __NR_SYSV_sendto		(__NR_SYSV + 103)
-#define __NR_SYSV_sethostid		(__NR_SYSV + 104)
-#define __NR_SYSV_setsockopt		(__NR_SYSV + 105)
-#define __NR_SYSV_shutdown		(__NR_SYSV + 106)
-#define __NR_SYSV_socket		(__NR_SYSV + 107)
-#define __NR_SYSV_gethostname		(__NR_SYSV + 108)
-#define __NR_SYSV_sethostname		(__NR_SYSV + 109)
-#define __NR_SYSV_getdomainname		(__NR_SYSV + 110)
-#define __NR_SYSV_setdomainname		(__NR_SYSV + 111)
-#define __NR_SYSV_truncate		(__NR_SYSV + 112)
-#define __NR_SYSV_ftruncate		(__NR_SYSV + 113)
-#define __NR_SYSV_rename		(__NR_SYSV + 114)
-#define __NR_SYSV_symlink		(__NR_SYSV + 115)
-#define __NR_SYSV_readlink		(__NR_SYSV + 116)
-#define __NR_SYSV_lstat			(__NR_SYSV + 117)
-#define __NR_SYSV_nfsmount		(__NR_SYSV + 118)
-#define __NR_SYSV_nfssvc		(__NR_SYSV + 119)
-#define __NR_SYSV_getfh			(__NR_SYSV + 120)
-#define __NR_SYSV_async_daemon		(__NR_SYSV + 121)
-#define __NR_SYSV_exportfs		(__NR_SYSV + 122)
-#define __NR_SYSV_setregid		(__NR_SYSV + 123)
-#define __NR_SYSV_setreuid		(__NR_SYSV + 124)
-#define __NR_SYSV_getitimer		(__NR_SYSV + 125)
-#define __NR_SYSV_setitimer		(__NR_SYSV + 126)
-#define __NR_SYSV_adjtime		(__NR_SYSV + 127)
-#define __NR_SYSV_BSD_getime		(__NR_SYSV + 128)
-#define __NR_SYSV_sproc			(__NR_SYSV + 129)
-#define __NR_SYSV_prctl			(__NR_SYSV + 130)
-#define __NR_SYSV_procblk		(__NR_SYSV + 131)
-#define __NR_SYSV_sprocsp		(__NR_SYSV + 132)
-#define __NR_SYSV_sgigsc		(__NR_SYSV + 133)
-#define __NR_SYSV_mmap			(__NR_SYSV + 134)
-#define __NR_SYSV_munmap		(__NR_SYSV + 135)
-#define __NR_SYSV_mprotect		(__NR_SYSV + 136)
-#define __NR_SYSV_msync			(__NR_SYSV + 137)
-#define __NR_SYSV_madvise		(__NR_SYSV + 138)
-#define __NR_SYSV_pagelock		(__NR_SYSV + 139)
-#define __NR_SYSV_getpagesize		(__NR_SYSV + 140)
-#define __NR_SYSV_quotactl		(__NR_SYSV + 141)
-#define __NR_SYSV_libdetach		(__NR_SYSV + 142)
-#define __NR_SYSV_BSDgetpgrp		(__NR_SYSV + 143)
-#define __NR_SYSV_BSDsetpgrp		(__NR_SYSV + 144)
-#define __NR_SYSV_vhangup		(__NR_SYSV + 145)
-#define __NR_SYSV_fsync			(__NR_SYSV + 146)
-#define __NR_SYSV_fchdir		(__NR_SYSV + 147)
-#define __NR_SYSV_getrlimit		(__NR_SYSV + 148)
-#define __NR_SYSV_setrlimit		(__NR_SYSV + 149)
-#define __NR_SYSV_cacheflush		(__NR_SYSV + 150)
-#define __NR_SYSV_cachectl		(__NR_SYSV + 151)
-#define __NR_SYSV_fchown		(__NR_SYSV + 152)
-#define __NR_SYSV_fchmod		(__NR_SYSV + 153)
-#define __NR_SYSV_wait3			(__NR_SYSV + 154)
-#define __NR_SYSV_socketpair		(__NR_SYSV + 155)
-#define __NR_SYSV_sysinfo		(__NR_SYSV + 156)
-#define __NR_SYSV_nuname		(__NR_SYSV + 157)
-#define __NR_SYSV_xstat			(__NR_SYSV + 158)
-#define __NR_SYSV_lxstat		(__NR_SYSV + 159)
-#define __NR_SYSV_fxstat		(__NR_SYSV + 160)
-#define __NR_SYSV_xmknod		(__NR_SYSV + 161)
-#define __NR_SYSV_ksigaction		(__NR_SYSV + 162)
-#define __NR_SYSV_sigpending		(__NR_SYSV + 163)
-#define __NR_SYSV_sigprocmask		(__NR_SYSV + 164)
-#define __NR_SYSV_sigsuspend		(__NR_SYSV + 165)
-#define __NR_SYSV_sigpoll		(__NR_SYSV + 166)
-#define __NR_SYSV_swapctl		(__NR_SYSV + 167)
-#define __NR_SYSV_getcontext		(__NR_SYSV + 168)
-#define __NR_SYSV_setcontext		(__NR_SYSV + 169)
-#define __NR_SYSV_waitsys		(__NR_SYSV + 170)
-#define __NR_SYSV_sigstack		(__NR_SYSV + 171)
-#define __NR_SYSV_sigaltstack		(__NR_SYSV + 172)
-#define __NR_SYSV_sigsendset		(__NR_SYSV + 173)
-#define __NR_SYSV_statvfs		(__NR_SYSV + 174)
-#define __NR_SYSV_fstatvfs		(__NR_SYSV + 175)
-#define __NR_SYSV_getpmsg		(__NR_SYSV + 176)
-#define __NR_SYSV_putpmsg		(__NR_SYSV + 177)
-#define __NR_SYSV_lchown		(__NR_SYSV + 178)
-#define __NR_SYSV_priocntl		(__NR_SYSV + 179)
-#define __NR_SYSV_ksigqueue		(__NR_SYSV + 180)
-#define __NR_SYSV_readv			(__NR_SYSV + 181)
-#define __NR_SYSV_writev		(__NR_SYSV + 182)
-#define __NR_SYSV_truncate64		(__NR_SYSV + 183)
-#define __NR_SYSV_ftruncate64		(__NR_SYSV + 184)
-#define __NR_SYSV_mmap64		(__NR_SYSV + 185)
-#define __NR_SYSV_dmi			(__NR_SYSV + 186)
-#define __NR_SYSV_pread			(__NR_SYSV + 187)
-#define __NR_SYSV_pwrite		(__NR_SYSV + 188)
-
-/*
- * BSD 4.3 syscalls are in the range from 2000 to 2999
- */
-#define __NR_BSD43			2000
-#define __NR_BSD43_syscall		(__NR_BSD43 +   0)
-#define __NR_BSD43_exit			(__NR_BSD43 +   1)
-#define __NR_BSD43_fork			(__NR_BSD43 +   2)
-#define __NR_BSD43_read			(__NR_BSD43 +   3)
-#define __NR_BSD43_write		(__NR_BSD43 +   4)
-#define __NR_BSD43_open			(__NR_BSD43 +   5)
-#define __NR_BSD43_close		(__NR_BSD43 +   6)
-#define __NR_BSD43_wait			(__NR_BSD43 +   7)
-#define __NR_BSD43_creat		(__NR_BSD43 +   8)
-#define __NR_BSD43_link			(__NR_BSD43 +   9)
-#define __NR_BSD43_unlink		(__NR_BSD43 +  10)
-#define __NR_BSD43_exec			(__NR_BSD43 +  11)
-#define __NR_BSD43_chdir		(__NR_BSD43 +  12)
-#define __NR_BSD43_time			(__NR_BSD43 +  13)
-#define __NR_BSD43_mknod		(__NR_BSD43 +  14)
-#define __NR_BSD43_chmod		(__NR_BSD43 +  15)
-#define __NR_BSD43_chown		(__NR_BSD43 +  16)
-#define __NR_BSD43_sbreak		(__NR_BSD43 +  17)
-#define __NR_BSD43_oldstat		(__NR_BSD43 +  18)
-#define __NR_BSD43_lseek		(__NR_BSD43 +  19)
-#define __NR_BSD43_getpid		(__NR_BSD43 +  20)
-#define __NR_BSD43_oldmount		(__NR_BSD43 +  21)
-#define __NR_BSD43_umount		(__NR_BSD43 +  22)
-#define __NR_BSD43_setuid		(__NR_BSD43 +  23)
-#define __NR_BSD43_getuid		(__NR_BSD43 +  24)
-#define __NR_BSD43_stime		(__NR_BSD43 +  25)
-#define __NR_BSD43_ptrace		(__NR_BSD43 +  26)
-#define __NR_BSD43_alarm		(__NR_BSD43 +  27)
-#define __NR_BSD43_oldfstat		(__NR_BSD43 +  28)
-#define __NR_BSD43_pause		(__NR_BSD43 +  29)
-#define __NR_BSD43_utime		(__NR_BSD43 +  30)
-#define __NR_BSD43_stty			(__NR_BSD43 +  31)
-#define __NR_BSD43_gtty			(__NR_BSD43 +  32)
-#define __NR_BSD43_access		(__NR_BSD43 +  33)
-#define __NR_BSD43_nice			(__NR_BSD43 +  34)
-#define __NR_BSD43_ftime		(__NR_BSD43 +  35)
-#define __NR_BSD43_sync			(__NR_BSD43 +  36)
-#define __NR_BSD43_kill			(__NR_BSD43 +  37)
-#define __NR_BSD43_stat			(__NR_BSD43 +  38)
-#define __NR_BSD43_oldsetpgrp		(__NR_BSD43 +  39)
-#define __NR_BSD43_lstat		(__NR_BSD43 +  40)
-#define __NR_BSD43_dup			(__NR_BSD43 +  41)
-#define __NR_BSD43_pipe			(__NR_BSD43 +  42)
-#define __NR_BSD43_times		(__NR_BSD43 +  43)
-#define __NR_BSD43_profil		(__NR_BSD43 +  44)
-#define __NR_BSD43_msgsys		(__NR_BSD43 +  45)
-#define __NR_BSD43_setgid		(__NR_BSD43 +  46)
-#define __NR_BSD43_getgid		(__NR_BSD43 +  47)
-#define __NR_BSD43_ssig			(__NR_BSD43 +  48)
-#define __NR_BSD43_reserved1		(__NR_BSD43 +  49)
-#define __NR_BSD43_reserved2		(__NR_BSD43 +  50)
-#define __NR_BSD43_sysacct		(__NR_BSD43 +  51)
-#define __NR_BSD43_phys			(__NR_BSD43 +  52)
-#define __NR_BSD43_lock			(__NR_BSD43 +  53)
-#define __NR_BSD43_ioctl		(__NR_BSD43 +  54)
-#define __NR_BSD43_reboot		(__NR_BSD43 +  55)
-#define __NR_BSD43_mpxchan		(__NR_BSD43 +  56)
-#define __NR_BSD43_symlink		(__NR_BSD43 +  57)
-#define __NR_BSD43_readlink		(__NR_BSD43 +  58)
-#define __NR_BSD43_execve		(__NR_BSD43 +  59)
-#define __NR_BSD43_umask		(__NR_BSD43 +  60)
-#define __NR_BSD43_chroot		(__NR_BSD43 +  61)
-#define __NR_BSD43_fstat		(__NR_BSD43 +  62)
-#define __NR_BSD43_reserved3		(__NR_BSD43 +  63)
-#define __NR_BSD43_getpagesize		(__NR_BSD43 +  64)
-#define __NR_BSD43_mremap		(__NR_BSD43 +  65)
-#define __NR_BSD43_vfork		(__NR_BSD43 +  66)
-#define __NR_BSD43_vread		(__NR_BSD43 +  67)
-#define __NR_BSD43_vwrite		(__NR_BSD43 +  68)
-#define __NR_BSD43_sbrk			(__NR_BSD43 +  69)
-#define __NR_BSD43_sstk			(__NR_BSD43 +  70)
-#define __NR_BSD43_mmap			(__NR_BSD43 +  71)
-#define __NR_BSD43_vadvise		(__NR_BSD43 +  72)
-#define __NR_BSD43_munmap		(__NR_BSD43 +  73)
-#define __NR_BSD43_mprotect		(__NR_BSD43 +  74)
-#define __NR_BSD43_madvise		(__NR_BSD43 +  75)
-#define __NR_BSD43_vhangup		(__NR_BSD43 +  76)
-#define __NR_BSD43_vlimit		(__NR_BSD43 +  77)
-#define __NR_BSD43_mincore		(__NR_BSD43 +  78)
-#define __NR_BSD43_getgroups		(__NR_BSD43 +  79)
-#define __NR_BSD43_setgroups		(__NR_BSD43 +  80)
-#define __NR_BSD43_getpgrp		(__NR_BSD43 +  81)
-#define __NR_BSD43_setpgrp		(__NR_BSD43 +  82)
-#define __NR_BSD43_setitimer		(__NR_BSD43 +  83)
-#define __NR_BSD43_wait3		(__NR_BSD43 +  84)
-#define __NR_BSD43_swapon		(__NR_BSD43 +  85)
-#define __NR_BSD43_getitimer		(__NR_BSD43 +  86)
-#define __NR_BSD43_gethostname		(__NR_BSD43 +  87)
-#define __NR_BSD43_sethostname		(__NR_BSD43 +  88)
-#define __NR_BSD43_getdtablesize	(__NR_BSD43 +  89)
-#define __NR_BSD43_dup2			(__NR_BSD43 +  90)
-#define __NR_BSD43_getdopt		(__NR_BSD43 +  91)
-#define __NR_BSD43_fcntl		(__NR_BSD43 +  92)
-#define __NR_BSD43_select		(__NR_BSD43 +  93)
-#define __NR_BSD43_setdopt		(__NR_BSD43 +  94)
-#define __NR_BSD43_fsync		(__NR_BSD43 +  95)
-#define __NR_BSD43_setpriority		(__NR_BSD43 +  96)
-#define __NR_BSD43_socket		(__NR_BSD43 +  97)
-#define __NR_BSD43_connect		(__NR_BSD43 +  98)
-#define __NR_BSD43_oldaccept		(__NR_BSD43 +  99)
-#define __NR_BSD43_getpriority		(__NR_BSD43 + 100)
-#define __NR_BSD43_send			(__NR_BSD43 + 101)
-#define __NR_BSD43_recv			(__NR_BSD43 + 102)
-#define __NR_BSD43_sigreturn		(__NR_BSD43 + 103)
-#define __NR_BSD43_bind			(__NR_BSD43 + 104)
-#define __NR_BSD43_setsockopt		(__NR_BSD43 + 105)
-#define __NR_BSD43_listen		(__NR_BSD43 + 106)
-#define __NR_BSD43_vtimes		(__NR_BSD43 + 107)
-#define __NR_BSD43_sigvec		(__NR_BSD43 + 108)
-#define __NR_BSD43_sigblock		(__NR_BSD43 + 109)
-#define __NR_BSD43_sigsetmask		(__NR_BSD43 + 110)
-#define __NR_BSD43_sigpause		(__NR_BSD43 + 111)
-#define __NR_BSD43_sigstack		(__NR_BSD43 + 112)
-#define __NR_BSD43_oldrecvmsg		(__NR_BSD43 + 113)
-#define __NR_BSD43_oldsendmsg		(__NR_BSD43 + 114)
-#define __NR_BSD43_vtrace		(__NR_BSD43 + 115)
-#define __NR_BSD43_gettimeofday		(__NR_BSD43 + 116)
-#define __NR_BSD43_getrusage		(__NR_BSD43 + 117)
-#define __NR_BSD43_getsockopt		(__NR_BSD43 + 118)
-#define __NR_BSD43_reserved4		(__NR_BSD43 + 119)
-#define __NR_BSD43_readv		(__NR_BSD43 + 120)
-#define __NR_BSD43_writev		(__NR_BSD43 + 121)
-#define __NR_BSD43_settimeofday		(__NR_BSD43 + 122)
-#define __NR_BSD43_fchown		(__NR_BSD43 + 123)
-#define __NR_BSD43_fchmod		(__NR_BSD43 + 124)
-#define __NR_BSD43_oldrecvfrom		(__NR_BSD43 + 125)
-#define __NR_BSD43_setreuid		(__NR_BSD43 + 126)
-#define __NR_BSD43_setregid		(__NR_BSD43 + 127)
-#define __NR_BSD43_rename		(__NR_BSD43 + 128)
-#define __NR_BSD43_truncate		(__NR_BSD43 + 129)
-#define __NR_BSD43_ftruncate		(__NR_BSD43 + 130)
-#define __NR_BSD43_flock		(__NR_BSD43 + 131)
-#define __NR_BSD43_semsys		(__NR_BSD43 + 132)
-#define __NR_BSD43_sendto		(__NR_BSD43 + 133)
-#define __NR_BSD43_shutdown		(__NR_BSD43 + 134)
-#define __NR_BSD43_socketpair		(__NR_BSD43 + 135)
-#define __NR_BSD43_mkdir		(__NR_BSD43 + 136)
-#define __NR_BSD43_rmdir		(__NR_BSD43 + 137)
-#define __NR_BSD43_utimes		(__NR_BSD43 + 138)
-#define __NR_BSD43_sigcleanup		(__NR_BSD43 + 139)
-#define __NR_BSD43_adjtime		(__NR_BSD43 + 140)
-#define __NR_BSD43_oldgetpeername	(__NR_BSD43 + 141)
-#define __NR_BSD43_gethostid		(__NR_BSD43 + 142)
-#define __NR_BSD43_sethostid		(__NR_BSD43 + 143)
-#define __NR_BSD43_getrlimit		(__NR_BSD43 + 144)
-#define __NR_BSD43_setrlimit		(__NR_BSD43 + 145)
-#define __NR_BSD43_killpg		(__NR_BSD43 + 146)
-#define __NR_BSD43_shmsys		(__NR_BSD43 + 147)
-#define __NR_BSD43_quota		(__NR_BSD43 + 148)
-#define __NR_BSD43_qquota		(__NR_BSD43 + 149)
-#define __NR_BSD43_oldgetsockname	(__NR_BSD43 + 150)
-#define __NR_BSD43_sysmips		(__NR_BSD43 + 151)
-#define __NR_BSD43_cacheflush		(__NR_BSD43 + 152)
-#define __NR_BSD43_cachectl		(__NR_BSD43 + 153)
-#define __NR_BSD43_debug		(__NR_BSD43 + 154)
-#define __NR_BSD43_reserved5		(__NR_BSD43 + 155)
-#define __NR_BSD43_reserved6		(__NR_BSD43 + 156)
-#define __NR_BSD43_nfs_mount		(__NR_BSD43 + 157)
-#define __NR_BSD43_nfs_svc		(__NR_BSD43 + 158)
-#define __NR_BSD43_getdirentries	(__NR_BSD43 + 159)
-#define __NR_BSD43_statfs		(__NR_BSD43 + 160)
-#define __NR_BSD43_fstatfs		(__NR_BSD43 + 161)
-#define __NR_BSD43_unmount		(__NR_BSD43 + 162)
-#define __NR_BSD43_async_daemon		(__NR_BSD43 + 163)
-#define __NR_BSD43_nfs_getfh		(__NR_BSD43 + 164)
-#define __NR_BSD43_getdomainname	(__NR_BSD43 + 165)
-#define __NR_BSD43_setdomainname	(__NR_BSD43 + 166)
-#define __NR_BSD43_pcfs_mount		(__NR_BSD43 + 167)
-#define __NR_BSD43_quotactl		(__NR_BSD43 + 168)
-#define __NR_BSD43_oldexportfs		(__NR_BSD43 + 169)
-#define __NR_BSD43_smount		(__NR_BSD43 + 170)
-#define __NR_BSD43_mipshwconf		(__NR_BSD43 + 171)
-#define __NR_BSD43_exportfs		(__NR_BSD43 + 172)
-#define __NR_BSD43_nfsfh_open		(__NR_BSD43 + 173)
-#define __NR_BSD43_libattach		(__NR_BSD43 + 174)
-#define __NR_BSD43_libdetach		(__NR_BSD43 + 175)
-#define __NR_BSD43_accept		(__NR_BSD43 + 176)
-#define __NR_BSD43_reserved7		(__NR_BSD43 + 177)
-#define __NR_BSD43_reserved8		(__NR_BSD43 + 178)
-#define __NR_BSD43_recvmsg		(__NR_BSD43 + 179)
-#define __NR_BSD43_recvfrom		(__NR_BSD43 + 180)
-#define __NR_BSD43_sendmsg		(__NR_BSD43 + 181)
-#define __NR_BSD43_getpeername		(__NR_BSD43 + 182)
-#define __NR_BSD43_getsockname		(__NR_BSD43 + 183)
-#define __NR_BSD43_aread		(__NR_BSD43 + 184)
-#define __NR_BSD43_awrite		(__NR_BSD43 + 185)
-#define __NR_BSD43_listio		(__NR_BSD43 + 186)
-#define __NR_BSD43_acancel		(__NR_BSD43 + 187)
-#define __NR_BSD43_astatus		(__NR_BSD43 + 188)
-#define __NR_BSD43_await		(__NR_BSD43 + 189)
-#define __NR_BSD43_areadv		(__NR_BSD43 + 190)
-#define __NR_BSD43_awritev		(__NR_BSD43 + 191)
-
-/*
- * POSIX syscalls are in the range from 3000 to 3999
- */
-#define __NR_POSIX			3000
-#define __NR_POSIX_syscall		(__NR_POSIX +   0)
-#define __NR_POSIX_exit			(__NR_POSIX +   1)
-#define __NR_POSIX_fork			(__NR_POSIX +   2)
-#define __NR_POSIX_read			(__NR_POSIX +   3)
-#define __NR_POSIX_write		(__NR_POSIX +   4)
-#define __NR_POSIX_open			(__NR_POSIX +   5)
-#define __NR_POSIX_close		(__NR_POSIX +   6)
-#define __NR_POSIX_wait			(__NR_POSIX +   7)
-#define __NR_POSIX_creat		(__NR_POSIX +   8)
-#define __NR_POSIX_link			(__NR_POSIX +   9)
-#define __NR_POSIX_unlink		(__NR_POSIX +  10)
-#define __NR_POSIX_exec			(__NR_POSIX +  11)
-#define __NR_POSIX_chdir		(__NR_POSIX +  12)
-#define __NR_POSIX_gtime		(__NR_POSIX +  13)
-#define __NR_POSIX_mknod		(__NR_POSIX +  14)
-#define __NR_POSIX_chmod		(__NR_POSIX +  15)
-#define __NR_POSIX_chown		(__NR_POSIX +  16)
-#define __NR_POSIX_sbreak		(__NR_POSIX +  17)
-#define __NR_POSIX_stat			(__NR_POSIX +  18)
-#define __NR_POSIX_lseek		(__NR_POSIX +  19)
-#define __NR_POSIX_getpid		(__NR_POSIX +  20)
-#define __NR_POSIX_mount		(__NR_POSIX +  21)
-#define __NR_POSIX_umount		(__NR_POSIX +  22)
-#define __NR_POSIX_setuid		(__NR_POSIX +  23)
-#define __NR_POSIX_getuid		(__NR_POSIX +  24)
-#define __NR_POSIX_stime		(__NR_POSIX +  25)
-#define __NR_POSIX_ptrace		(__NR_POSIX +  26)
-#define __NR_POSIX_alarm		(__NR_POSIX +  27)
-#define __NR_POSIX_fstat		(__NR_POSIX +  28)
-#define __NR_POSIX_pause		(__NR_POSIX +  29)
-#define __NR_POSIX_utime		(__NR_POSIX +  30)
-#define __NR_POSIX_stty			(__NR_POSIX +  31)
-#define __NR_POSIX_gtty			(__NR_POSIX +  32)
-#define __NR_POSIX_access		(__NR_POSIX +  33)
-#define __NR_POSIX_nice			(__NR_POSIX +  34)
-#define __NR_POSIX_statfs		(__NR_POSIX +  35)
-#define __NR_POSIX_sync			(__NR_POSIX +  36)
-#define __NR_POSIX_kill			(__NR_POSIX +  37)
-#define __NR_POSIX_fstatfs		(__NR_POSIX +  38)
-#define __NR_POSIX_getpgrp		(__NR_POSIX +  39)
-#define __NR_POSIX_syssgi		(__NR_POSIX +  40)
-#define __NR_POSIX_dup			(__NR_POSIX +  41)
-#define __NR_POSIX_pipe			(__NR_POSIX +  42)
-#define __NR_POSIX_times		(__NR_POSIX +  43)
-#define __NR_POSIX_profil		(__NR_POSIX +  44)
-#define __NR_POSIX_lock			(__NR_POSIX +  45)
-#define __NR_POSIX_setgid		(__NR_POSIX +  46)
-#define __NR_POSIX_getgid		(__NR_POSIX +  47)
-#define __NR_POSIX_sig			(__NR_POSIX +  48)
-#define __NR_POSIX_msgsys		(__NR_POSIX +  49)
-#define __NR_POSIX_sysmips		(__NR_POSIX +  50)
-#define __NR_POSIX_sysacct		(__NR_POSIX +  51)
-#define __NR_POSIX_shmsys		(__NR_POSIX +  52)
-#define __NR_POSIX_semsys		(__NR_POSIX +  53)
-#define __NR_POSIX_ioctl		(__NR_POSIX +  54)
-#define __NR_POSIX_uadmin		(__NR_POSIX +  55)
-#define __NR_POSIX_exch			(__NR_POSIX +  56)
-#define __NR_POSIX_utssys		(__NR_POSIX +  57)
-#define __NR_POSIX_USG_reserved1	(__NR_POSIX +  58)
-#define __NR_POSIX_exece		(__NR_POSIX +  59)
-#define __NR_POSIX_umask		(__NR_POSIX +  60)
-#define __NR_POSIX_chroot		(__NR_POSIX +  61)
-#define __NR_POSIX_fcntl		(__NR_POSIX +  62)
-#define __NR_POSIX_ulimit		(__NR_POSIX +  63)
-#define __NR_POSIX_SAFARI4_reserved1	(__NR_POSIX +  64)
-#define __NR_POSIX_SAFARI4_reserved2	(__NR_POSIX +  65)
-#define __NR_POSIX_SAFARI4_reserved3	(__NR_POSIX +  66)
-#define __NR_POSIX_SAFARI4_reserved4	(__NR_POSIX +  67)
-#define __NR_POSIX_SAFARI4_reserved5	(__NR_POSIX +  68)
-#define __NR_POSIX_SAFARI4_reserved6	(__NR_POSIX +  69)
-#define __NR_POSIX_advfs		(__NR_POSIX +  70)
-#define __NR_POSIX_unadvfs		(__NR_POSIX +  71)
-#define __NR_POSIX_rmount		(__NR_POSIX +  72)
-#define __NR_POSIX_rumount		(__NR_POSIX +  73)
-#define __NR_POSIX_rfstart		(__NR_POSIX +  74)
-#define __NR_POSIX_reserved1		(__NR_POSIX +  75)
-#define __NR_POSIX_rdebug		(__NR_POSIX +  76)
-#define __NR_POSIX_rfstop		(__NR_POSIX +  77)
-#define __NR_POSIX_rfsys		(__NR_POSIX +  78)
-#define __NR_POSIX_rmdir		(__NR_POSIX +  79)
-#define __NR_POSIX_mkdir		(__NR_POSIX +  80)
-#define __NR_POSIX_getdents		(__NR_POSIX +  81)
-#define __NR_POSIX_sginap		(__NR_POSIX +  82)
-#define __NR_POSIX_sgikopt		(__NR_POSIX +  83)
-#define __NR_POSIX_sysfs		(__NR_POSIX +  84)
-#define __NR_POSIX_getmsg		(__NR_POSIX +  85)
-#define __NR_POSIX_putmsg		(__NR_POSIX +  86)
-#define __NR_POSIX_poll			(__NR_POSIX +  87)
-#define __NR_POSIX_sigreturn		(__NR_POSIX +  88)
-#define __NR_POSIX_accept		(__NR_POSIX +  89)
-#define __NR_POSIX_bind			(__NR_POSIX +  90)
-#define __NR_POSIX_connect		(__NR_POSIX +  91)
-#define __NR_POSIX_gethostid		(__NR_POSIX +  92)
-#define __NR_POSIX_getpeername		(__NR_POSIX +  93)
-#define __NR_POSIX_getsockname		(__NR_POSIX +  94)
-#define __NR_POSIX_getsockopt		(__NR_POSIX +  95)
-#define __NR_POSIX_listen		(__NR_POSIX +  96)
-#define __NR_POSIX_recv			(__NR_POSIX +  97)
-#define __NR_POSIX_recvfrom		(__NR_POSIX +  98)
-#define __NR_POSIX_recvmsg		(__NR_POSIX +  99)
-#define __NR_POSIX_select		(__NR_POSIX + 100)
-#define __NR_POSIX_send			(__NR_POSIX + 101)
-#define __NR_POSIX_sendmsg		(__NR_POSIX + 102)
-#define __NR_POSIX_sendto		(__NR_POSIX + 103)
-#define __NR_POSIX_sethostid		(__NR_POSIX + 104)
-#define __NR_POSIX_setsockopt		(__NR_POSIX + 105)
-#define __NR_POSIX_shutdown		(__NR_POSIX + 106)
-#define __NR_POSIX_socket		(__NR_POSIX + 107)
-#define __NR_POSIX_gethostname		(__NR_POSIX + 108)
-#define __NR_POSIX_sethostname		(__NR_POSIX + 109)
-#define __NR_POSIX_getdomainname	(__NR_POSIX + 110)
-#define __NR_POSIX_setdomainname	(__NR_POSIX + 111)
-#define __NR_POSIX_truncate		(__NR_POSIX + 112)
-#define __NR_POSIX_ftruncate		(__NR_POSIX + 113)
-#define __NR_POSIX_rename		(__NR_POSIX + 114)
-#define __NR_POSIX_symlink		(__NR_POSIX + 115)
-#define __NR_POSIX_readlink		(__NR_POSIX + 116)
-#define __NR_POSIX_lstat		(__NR_POSIX + 117)
-#define __NR_POSIX_nfs_mount		(__NR_POSIX + 118)
-#define __NR_POSIX_nfs_svc		(__NR_POSIX + 119)
-#define __NR_POSIX_nfs_getfh		(__NR_POSIX + 120)
-#define __NR_POSIX_async_daemon		(__NR_POSIX + 121)
-#define __NR_POSIX_exportfs		(__NR_POSIX + 122)
-#define __NR_POSIX_SGI_setregid		(__NR_POSIX + 123)
-#define __NR_POSIX_SGI_setreuid		(__NR_POSIX + 124)
-#define __NR_POSIX_getitimer		(__NR_POSIX + 125)
-#define __NR_POSIX_setitimer		(__NR_POSIX + 126)
-#define __NR_POSIX_adjtime		(__NR_POSIX + 127)
-#define __NR_POSIX_SGI_bsdgettime	(__NR_POSIX + 128)
-#define __NR_POSIX_SGI_sproc		(__NR_POSIX + 129)
-#define __NR_POSIX_SGI_prctl		(__NR_POSIX + 130)
-#define __NR_POSIX_SGI_blkproc		(__NR_POSIX + 131)
-#define __NR_POSIX_SGI_reserved1	(__NR_POSIX + 132)
-#define __NR_POSIX_SGI_sgigsc		(__NR_POSIX + 133)
-#define __NR_POSIX_SGI_mmap		(__NR_POSIX + 134)
-#define __NR_POSIX_SGI_munmap		(__NR_POSIX + 135)
-#define __NR_POSIX_SGI_mprotect		(__NR_POSIX + 136)
-#define __NR_POSIX_SGI_msync		(__NR_POSIX + 137)
-#define __NR_POSIX_SGI_madvise		(__NR_POSIX + 138)
-#define __NR_POSIX_SGI_mpin		(__NR_POSIX + 139)
-#define __NR_POSIX_SGI_getpagesize	(__NR_POSIX + 140)
-#define __NR_POSIX_SGI_libattach	(__NR_POSIX + 141)
-#define __NR_POSIX_SGI_libdetach	(__NR_POSIX + 142)
-#define __NR_POSIX_SGI_getpgrp		(__NR_POSIX + 143)
-#define __NR_POSIX_SGI_setpgrp		(__NR_POSIX + 144)
-#define __NR_POSIX_SGI_reserved2	(__NR_POSIX + 145)
-#define __NR_POSIX_SGI_reserved3	(__NR_POSIX + 146)
-#define __NR_POSIX_SGI_reserved4	(__NR_POSIX + 147)
-#define __NR_POSIX_SGI_reserved5	(__NR_POSIX + 148)
-#define __NR_POSIX_SGI_reserved6	(__NR_POSIX + 149)
-#define __NR_POSIX_cacheflush		(__NR_POSIX + 150)
-#define __NR_POSIX_cachectl		(__NR_POSIX + 151)
-#define __NR_POSIX_fchown		(__NR_POSIX + 152)
-#define __NR_POSIX_fchmod		(__NR_POSIX + 153)
-#define __NR_POSIX_wait3		(__NR_POSIX + 154)
-#define __NR_POSIX_mmap			(__NR_POSIX + 155)
-#define __NR_POSIX_munmap		(__NR_POSIX + 156)
-#define __NR_POSIX_madvise		(__NR_POSIX + 157)
-#define __NR_POSIX_BSD_getpagesize	(__NR_POSIX + 158)
-#define __NR_POSIX_setreuid		(__NR_POSIX + 159)
-#define __NR_POSIX_setregid		(__NR_POSIX + 160)
-#define __NR_POSIX_setpgid		(__NR_POSIX + 161)
-#define __NR_POSIX_getgroups		(__NR_POSIX + 162)
-#define __NR_POSIX_setgroups		(__NR_POSIX + 163)
-#define __NR_POSIX_gettimeofday		(__NR_POSIX + 164)
-#define __NR_POSIX_getrusage		(__NR_POSIX + 165)
-#define __NR_POSIX_getrlimit		(__NR_POSIX + 166)
-#define __NR_POSIX_setrlimit		(__NR_POSIX + 167)
-#define __NR_POSIX_waitpid		(__NR_POSIX + 168)
-#define __NR_POSIX_dup2			(__NR_POSIX + 169)
-#define __NR_POSIX_reserved2		(__NR_POSIX + 170)
-#define __NR_POSIX_reserved3		(__NR_POSIX + 171)
-#define __NR_POSIX_reserved4		(__NR_POSIX + 172)
-#define __NR_POSIX_reserved5		(__NR_POSIX + 173)
-#define __NR_POSIX_reserved6		(__NR_POSIX + 174)
-#define __NR_POSIX_reserved7		(__NR_POSIX + 175)
-#define __NR_POSIX_reserved8		(__NR_POSIX + 176)
-#define __NR_POSIX_reserved9		(__NR_POSIX + 177)
-#define __NR_POSIX_reserved10		(__NR_POSIX + 178)
-#define __NR_POSIX_reserved11		(__NR_POSIX + 179)
-#define __NR_POSIX_reserved12		(__NR_POSIX + 180)
-#define __NR_POSIX_reserved13		(__NR_POSIX + 181)
-#define __NR_POSIX_reserved14		(__NR_POSIX + 182)
-#define __NR_POSIX_reserved15		(__NR_POSIX + 183)
-#define __NR_POSIX_reserved16		(__NR_POSIX + 184)
-#define __NR_POSIX_reserved17		(__NR_POSIX + 185)
-#define __NR_POSIX_reserved18		(__NR_POSIX + 186)
-#define __NR_POSIX_reserved19		(__NR_POSIX + 187)
-#define __NR_POSIX_reserved20		(__NR_POSIX + 188)
-#define __NR_POSIX_reserved21		(__NR_POSIX + 189)
-#define __NR_POSIX_reserved22		(__NR_POSIX + 190)
-#define __NR_POSIX_reserved23		(__NR_POSIX + 191)
-#define __NR_POSIX_reserved24		(__NR_POSIX + 192)
-#define __NR_POSIX_reserved25		(__NR_POSIX + 193)
-#define __NR_POSIX_reserved26		(__NR_POSIX + 194)
-#define __NR_POSIX_reserved27		(__NR_POSIX + 195)
-#define __NR_POSIX_reserved28		(__NR_POSIX + 196)
-#define __NR_POSIX_reserved29		(__NR_POSIX + 197)
-#define __NR_POSIX_reserved30		(__NR_POSIX + 198)
-#define __NR_POSIX_reserved31		(__NR_POSIX + 199)
-#define __NR_POSIX_reserved32		(__NR_POSIX + 200)
-#define __NR_POSIX_reserved33		(__NR_POSIX + 201)
-#define __NR_POSIX_reserved34		(__NR_POSIX + 202)
-#define __NR_POSIX_reserved35		(__NR_POSIX + 203)
-#define __NR_POSIX_reserved36		(__NR_POSIX + 204)
-#define __NR_POSIX_reserved37		(__NR_POSIX + 205)
-#define __NR_POSIX_reserved38		(__NR_POSIX + 206)
-#define __NR_POSIX_reserved39		(__NR_POSIX + 207)
-#define __NR_POSIX_reserved40		(__NR_POSIX + 208)
-#define __NR_POSIX_reserved41		(__NR_POSIX + 209)
-#define __NR_POSIX_reserved42		(__NR_POSIX + 210)
-#define __NR_POSIX_reserved43		(__NR_POSIX + 211)
-#define __NR_POSIX_reserved44		(__NR_POSIX + 212)
-#define __NR_POSIX_reserved45		(__NR_POSIX + 213)
-#define __NR_POSIX_reserved46		(__NR_POSIX + 214)
-#define __NR_POSIX_reserved47		(__NR_POSIX + 215)
-#define __NR_POSIX_reserved48		(__NR_POSIX + 216)
-#define __NR_POSIX_reserved49		(__NR_POSIX + 217)
-#define __NR_POSIX_reserved50		(__NR_POSIX + 218)
-#define __NR_POSIX_reserved51		(__NR_POSIX + 219)
-#define __NR_POSIX_reserved52		(__NR_POSIX + 220)
-#define __NR_POSIX_reserved53		(__NR_POSIX + 221)
-#define __NR_POSIX_reserved54		(__NR_POSIX + 222)
-#define __NR_POSIX_reserved55		(__NR_POSIX + 223)
-#define __NR_POSIX_reserved56		(__NR_POSIX + 224)
-#define __NR_POSIX_reserved57		(__NR_POSIX + 225)
-#define __NR_POSIX_reserved58		(__NR_POSIX + 226)
-#define __NR_POSIX_reserved59		(__NR_POSIX + 227)
-#define __NR_POSIX_reserved60		(__NR_POSIX + 228)
-#define __NR_POSIX_reserved61		(__NR_POSIX + 229)
-#define __NR_POSIX_reserved62		(__NR_POSIX + 230)
-#define __NR_POSIX_reserved63		(__NR_POSIX + 231)
-#define __NR_POSIX_reserved64		(__NR_POSIX + 232)
-#define __NR_POSIX_reserved65		(__NR_POSIX + 233)
-#define __NR_POSIX_reserved66		(__NR_POSIX + 234)
-#define __NR_POSIX_reserved67		(__NR_POSIX + 235)
-#define __NR_POSIX_reserved68		(__NR_POSIX + 236)
-#define __NR_POSIX_reserved69		(__NR_POSIX + 237)
-#define __NR_POSIX_reserved70		(__NR_POSIX + 238)
-#define __NR_POSIX_reserved71		(__NR_POSIX + 239)
-#define __NR_POSIX_reserved72		(__NR_POSIX + 240)
-#define __NR_POSIX_reserved73		(__NR_POSIX + 241)
-#define __NR_POSIX_reserved74		(__NR_POSIX + 242)
-#define __NR_POSIX_reserved75		(__NR_POSIX + 243)
-#define __NR_POSIX_reserved76		(__NR_POSIX + 244)
-#define __NR_POSIX_reserved77		(__NR_POSIX + 245)
-#define __NR_POSIX_reserved78		(__NR_POSIX + 246)
-#define __NR_POSIX_reserved79		(__NR_POSIX + 247)
-#define __NR_POSIX_reserved80		(__NR_POSIX + 248)
-#define __NR_POSIX_reserved81		(__NR_POSIX + 249)
-#define __NR_POSIX_reserved82		(__NR_POSIX + 250)
-#define __NR_POSIX_reserved83		(__NR_POSIX + 251)
-#define __NR_POSIX_reserved84		(__NR_POSIX + 252)
-#define __NR_POSIX_reserved85		(__NR_POSIX + 253)
-#define __NR_POSIX_reserved86		(__NR_POSIX + 254)
-#define __NR_POSIX_reserved87		(__NR_POSIX + 255)
-#define __NR_POSIX_reserved88		(__NR_POSIX + 256)
-#define __NR_POSIX_reserved89		(__NR_POSIX + 257)
-#define __NR_POSIX_reserved90		(__NR_POSIX + 258)
-#define __NR_POSIX_reserved91		(__NR_POSIX + 259)
-#define __NR_POSIX_netboot		(__NR_POSIX + 260)
-#define __NR_POSIX_netunboot		(__NR_POSIX + 261)
-#define __NR_POSIX_rdump		(__NR_POSIX + 262)
-#define __NR_POSIX_setsid		(__NR_POSIX + 263)
-#define __NR_POSIX_getmaxsig		(__NR_POSIX + 264)
-#define __NR_POSIX_sigpending		(__NR_POSIX + 265)
-#define __NR_POSIX_sigprocmask		(__NR_POSIX + 266)
-#define __NR_POSIX_sigsuspend		(__NR_POSIX + 267)
-#define __NR_POSIX_sigaction		(__NR_POSIX + 268)
-#define __NR_POSIX_MIPS_reserved1	(__NR_POSIX + 269)
-#define __NR_POSIX_MIPS_reserved2	(__NR_POSIX + 270)
-#define __NR_POSIX_MIPS_reserved3	(__NR_POSIX + 271)
-#define __NR_POSIX_MIPS_reserved4	(__NR_POSIX + 272)
-#define __NR_POSIX_MIPS_reserved5	(__NR_POSIX + 273)
-#define __NR_POSIX_MIPS_reserved6	(__NR_POSIX + 274)
-#define __NR_POSIX_MIPS_reserved7	(__NR_POSIX + 275)
-#define __NR_POSIX_MIPS_reserved8	(__NR_POSIX + 276)
-#define __NR_POSIX_MIPS_reserved9	(__NR_POSIX + 277)
-#define __NR_POSIX_MIPS_reserved10	(__NR_POSIX + 278)
-#define __NR_POSIX_MIPS_reserved11	(__NR_POSIX + 279)
-#define __NR_POSIX_TANDEM_reserved1	(__NR_POSIX + 280)
-#define __NR_POSIX_TANDEM_reserved2	(__NR_POSIX + 281)
-#define __NR_POSIX_TANDEM_reserved3	(__NR_POSIX + 282)
-#define __NR_POSIX_TANDEM_reserved4	(__NR_POSIX + 283)
-#define __NR_POSIX_TANDEM_reserved5	(__NR_POSIX + 284)
-#define __NR_POSIX_TANDEM_reserved6	(__NR_POSIX + 285)
-#define __NR_POSIX_TANDEM_reserved7	(__NR_POSIX + 286)
-#define __NR_POSIX_TANDEM_reserved8	(__NR_POSIX + 287)
-#define __NR_POSIX_TANDEM_reserved9	(__NR_POSIX + 288)
-#define __NR_POSIX_TANDEM_reserved10	(__NR_POSIX + 289)
-#define __NR_POSIX_TANDEM_reserved11	(__NR_POSIX + 290)
-#define __NR_POSIX_TANDEM_reserved12	(__NR_POSIX + 291)
-#define __NR_POSIX_TANDEM_reserved13	(__NR_POSIX + 292)
-#define __NR_POSIX_TANDEM_reserved14	(__NR_POSIX + 293)
-#define __NR_POSIX_TANDEM_reserved15	(__NR_POSIX + 294)
-#define __NR_POSIX_TANDEM_reserved16	(__NR_POSIX + 295)
-#define __NR_POSIX_TANDEM_reserved17	(__NR_POSIX + 296)
-#define __NR_POSIX_TANDEM_reserved18	(__NR_POSIX + 297)
-#define __NR_POSIX_TANDEM_reserved19	(__NR_POSIX + 298)
-#define __NR_POSIX_TANDEM_reserved20	(__NR_POSIX + 299)
-#define __NR_POSIX_SGI_reserved7	(__NR_POSIX + 300)
-#define __NR_POSIX_SGI_reserved8	(__NR_POSIX + 301)
-#define __NR_POSIX_SGI_reserved9	(__NR_POSIX + 302)
-#define __NR_POSIX_SGI_reserved10	(__NR_POSIX + 303)
-#define __NR_POSIX_SGI_reserved11	(__NR_POSIX + 304)
-#define __NR_POSIX_SGI_reserved12	(__NR_POSIX + 305)
-#define __NR_POSIX_SGI_reserved13	(__NR_POSIX + 306)
-#define __NR_POSIX_SGI_reserved14	(__NR_POSIX + 307)
-#define __NR_POSIX_SGI_reserved15	(__NR_POSIX + 308)
-#define __NR_POSIX_SGI_reserved16	(__NR_POSIX + 309)
-#define __NR_POSIX_SGI_reserved17	(__NR_POSIX + 310)
-#define __NR_POSIX_SGI_reserved18	(__NR_POSIX + 311)
-#define __NR_POSIX_SGI_reserved19	(__NR_POSIX + 312)
-#define __NR_POSIX_SGI_reserved20	(__NR_POSIX + 313)
-#define __NR_POSIX_SGI_reserved21	(__NR_POSIX + 314)
-#define __NR_POSIX_SGI_reserved22	(__NR_POSIX + 315)
-#define __NR_POSIX_SGI_reserved23	(__NR_POSIX + 316)
-#define __NR_POSIX_SGI_reserved24	(__NR_POSIX + 317)
-#define __NR_POSIX_SGI_reserved25	(__NR_POSIX + 318)
-#define __NR_POSIX_SGI_reserved26	(__NR_POSIX + 319)
-
-#endif /* _ASM_RISCOS_SYSCALL_H */
-- 
cgit v1.1


From 7c4cb60e5b97677424e95baee9c29df54b26e6ba Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Fri, 6 Jan 2006 00:11:47 -0800
Subject: [PATCH] x86: GDT alignment fix

Make GDT page aligned and page padded to support running inside of a
hypervisor.  This prevents false sharing of the GDT page with other hot
data, which is not allowed in Xen, and causes performance problems in
VMware.

Rather than go back to the old method of statically allocating the GDT
(which wastes unneeded space for non-present CPUs), the GDT for APs is
allocated dynamically.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Cc: "Seth, Rohit" <rohit.seth@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/apm.c          |  2 ++
 arch/i386/kernel/cpu/common.c   |  3 ---
 arch/i386/kernel/head.S         |  2 ++
 arch/i386/kernel/i386_ksyms.c   |  3 +--
 arch/i386/kernel/smpboot.c      |  6 ++++++
 drivers/pnp/pnpbios/bioscalls.c | 22 +++++++++++++---------
 include/asm-i386/desc.h         |  8 +++++---
 7 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 1e60acb..6c8e483 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -2317,6 +2317,8 @@ static int __init apm_init(void)
 
 	for (i = 0; i < NR_CPUS; i++) {
 		struct desc_struct *gdt = get_cpu_gdt_table(i);
+  		if (!gdt)
+  			continue;
 		set_base(gdt[APM_CS >> 3],
 			 __va((unsigned long)apm_info.bios.cseg << 4));
 		set_base(gdt[APM_CS_16 >> 3],
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 31e344b..cbc3206 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -18,9 +18,6 @@
 
 #include "cpu.h"
 
-DEFINE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]);
-EXPORT_PER_CPU_SYMBOL(cpu_gdt_table);
-
 DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
 EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
 
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index e437fb3..870f20b 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -525,3 +525,5 @@ ENTRY(cpu_gdt_table)
 	.quad 0x0000000000000000	/* 0xf0 - unused */
 	.quad 0x0000000000000000	/* 0xf8 - GDT entry 31: double-fault TSS */
 
+	/* Be sure this is zeroed to avoid false validations in Xen */
+	.fill PAGE_SIZE_asm / 8 - GDT_ENTRIES,8,0
diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c
index 180f070..3999bec 100644
--- a/arch/i386/kernel/i386_ksyms.c
+++ b/arch/i386/kernel/i386_ksyms.c
@@ -3,8 +3,7 @@
 #include <asm/checksum.h>
 #include <asm/desc.h>
 
-/* This is definitely a GPL-only symbol */
-EXPORT_SYMBOL_GPL(cpu_gdt_table);
+EXPORT_SYMBOL_GPL(cpu_gdt_descr);
 
 EXPORT_SYMBOL(__down_failed);
 EXPORT_SYMBOL(__down_failed_interruptible);
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 9ed449a..b3c2e2c 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -903,6 +903,12 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
 	unsigned long start_eip;
 	unsigned short nmi_high = 0, nmi_low = 0;
 
+	if (!cpu_gdt_descr[cpu].address &&
+	    !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
+		printk("Failed to allocate GDT for CPU %d\n", cpu);
+		return 1;
+	}
+
 	++cpucount;
 
 	/*
diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c
index 6b7583f..7cb476e 100644
--- a/drivers/pnp/pnpbios/bioscalls.c
+++ b/drivers/pnp/pnpbios/bioscalls.c
@@ -69,14 +69,16 @@ __asm__(
 
 #define Q_SET_SEL(cpu, selname, address, size) \
 do { \
-set_base(per_cpu(cpu_gdt_table,cpu)[(selname) >> 3], __va((u32)(address))); \
-set_limit(per_cpu(cpu_gdt_table,cpu)[(selname) >> 3], size); \
+struct desc_struct *gdt = get_cpu_gdt_table((cpu)); \
+set_base(gdt[(selname) >> 3], __va((u32)(address))); \
+set_limit(gdt[(selname) >> 3], size); \
 } while(0)
 
 #define Q2_SET_SEL(cpu, selname, address, size) \
 do { \
-set_base(per_cpu(cpu_gdt_table,cpu)[(selname) >> 3], (u32)(address)); \
-set_limit(per_cpu(cpu_gdt_table,cpu)[(selname) >> 3], size); \
+struct desc_struct *gdt = get_cpu_gdt_table((cpu)); \
+set_base(gdt[(selname) >> 3], (u32)(address)); \
+set_limit(gdt[(selname) >> 3], size); \
 } while(0)
 
 static struct desc_struct bad_bios_desc = { 0, 0x00409200 };
@@ -115,8 +117,8 @@ static inline u16 call_pnp_bios(u16 func, u16 arg1, u16 arg2, u16 arg3,
 		return PNP_FUNCTION_NOT_SUPPORTED;
 
 	cpu = get_cpu();
-	save_desc_40 = per_cpu(cpu_gdt_table,cpu)[0x40 / 8];
-	per_cpu(cpu_gdt_table,cpu)[0x40 / 8] = bad_bios_desc;
+	save_desc_40 = get_cpu_gdt_table(cpu)[0x40 / 8];
+	get_cpu_gdt_table(cpu)[0x40 / 8] = bad_bios_desc;
 
 	/* On some boxes IRQ's during PnP BIOS calls are deadly.  */
 	spin_lock_irqsave(&pnp_bios_lock, flags);
@@ -158,7 +160,7 @@ static inline u16 call_pnp_bios(u16 func, u16 arg1, u16 arg2, u16 arg3,
 	);
 	spin_unlock_irqrestore(&pnp_bios_lock, flags);
 
-	per_cpu(cpu_gdt_table,cpu)[0x40 / 8] = save_desc_40;
+	get_cpu_gdt_table(cpu)[0x40 / 8] = save_desc_40;
 	put_cpu();
 
 	/* If we get here and this is set then the PnP BIOS faulted on us. */
@@ -535,8 +537,10 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header)
 
 	set_base(bad_bios_desc, __va((unsigned long)0x40 << 4));
 	_set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4));
-	for(i=0; i < NR_CPUS; i++)
-	{
+ 	for (i = 0; i < NR_CPUS; i++) {
+  		struct desc_struct *gdt = get_cpu_gdt_table(i);
+  		if (!gdt)
+  			continue;
 		Q2_SET_SEL(i, PNP_CS32, &pnp_bios_callfunc, 64 * 1024);
 		Q_SET_SEL(i, PNP_CS16, header->fields.pm16cseg, 64 * 1024);
 		Q_SET_SEL(i, PNP_DS, header->fields.pm16dseg, 64 * 1024);
diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h
index 29b851a..494e73b 100644
--- a/include/asm-i386/desc.h
+++ b/include/asm-i386/desc.h
@@ -15,9 +15,6 @@
 #include <asm/mmu.h>
 
 extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
-DECLARE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]);
-
-#define get_cpu_gdt_table(_cpu) (per_cpu(cpu_gdt_table,_cpu))
 
 DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
 
@@ -29,6 +26,11 @@ struct Xgt_desc_struct {
 
 extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS];
 
+static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+{
+	return ((struct desc_struct *)cpu_gdt_descr[cpu].address);
+}
+
 #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
 #define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
 
-- 
cgit v1.1


From e43d674f44dc885a2476cab3537e639d9eaa31a9 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Fri, 6 Jan 2006 00:11:48 -0800
Subject: [PATCH] i386: don't blindly enable interrupts in die()

Rather than blindly re-enabling interrupts in die(), save their state
upon entry and then restore that state.

If the kernel is in really bad condition and faults with interrupts disabled,
re-enabling them in die() may cause even more trouble, implying more chances
of data corruption.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/traps.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index ab0e943..bb36a98 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -306,14 +306,17 @@ void die(const char * str, struct pt_regs * regs, long err)
 		.lock_owner_depth =	0
 	};
 	static int die_counter;
+	unsigned long flags;
 
 	if (die.lock_owner != raw_smp_processor_id()) {
 		console_verbose();
-		spin_lock_irq(&die.lock);
+		spin_lock_irqsave(&die.lock, flags);
 		die.lock_owner = smp_processor_id();
 		die.lock_owner_depth = 0;
 		bust_spinlocks(1);
 	}
+	else
+		local_save_flags(flags);
 
 	if (++die.lock_owner_depth < 3) {
 		int nl = 0;
@@ -340,7 +343,7 @@ void die(const char * str, struct pt_regs * regs, long err)
 
 	bust_spinlocks(0);
 	die.lock_owner = -1;
-	spin_unlock_irq(&die.lock);
+	spin_unlock_irqrestore(&die.lock, flags);
 
 	if (kexec_should_crash(current))
 		crash_kexec(regs);
-- 
cgit v1.1


From d43c6e8083ac8baeb1a167510aea34fcef396e33 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Fri, 6 Jan 2006 00:11:49 -0800
Subject: [PATCH] i386: move SIMD initialization

Move some code unrelated to any dealing with hardware bugs from i386's
bugs.h to a more logical place.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/traps.c | 22 ++++++++++++++++++++++
 include/asm-i386/bugs.h  | 23 +----------------------
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index bb36a98..f0c4060 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -1098,6 +1098,28 @@ void __init trap_init(void)
 #endif
 	set_trap_gate(19,&simd_coprocessor_error);
 
+	if (cpu_has_fxsr) {
+		/*
+		 * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
+		 * Generates a compile-time "error: zero width for bit-field" if
+		 * the alignment is wrong.
+		 */
+		struct fxsrAlignAssert {
+			int _:!(offsetof(struct task_struct,
+					thread.i387.fxsave) & 15);
+		};
+
+		printk(KERN_INFO "Enabling fast FPU save and restore... ");
+		set_in_cr4(X86_CR4_OSFXSR);
+		printk("done.\n");
+	}
+	if (cpu_has_xmm) {
+		printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
+				"support... ");
+		set_in_cr4(X86_CR4_OSXMMEXCPT);
+		printk("done.\n");
+	}
+
 	set_system_gate(SYSCALL_VECTOR,&system_call);
 
 	/*
diff --git a/include/asm-i386/bugs.h b/include/asm-i386/bugs.h
index ea54540..50233e0 100644
--- a/include/asm-i386/bugs.h
+++ b/include/asm-i386/bugs.h
@@ -8,9 +8,6 @@
  *        <rreilova@ececs.uc.edu>
  *	- Channing Corn (tests & fixes),
  *	- Andrew D. Balsa (code cleanup).
- *
- *  Pentium III FXSR, SSE support
- *	Gareth Hughes <gareth@valinux.com>, May 2000
  */
 
 /*
@@ -76,25 +73,7 @@ static void __init check_fpu(void)
 		return;
 	}
 
-/* Enable FXSR and company _before_ testing for FP problems. */
-	/*
-	 * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
-	 */
-	if (offsetof(struct task_struct, thread.i387.fxsave) & 15) {
-		extern void __buggy_fxsr_alignment(void);
-		__buggy_fxsr_alignment();
-	}
-	if (cpu_has_fxsr) {
-		printk(KERN_INFO "Enabling fast FPU save and restore... ");
-		set_in_cr4(X86_CR4_OSFXSR);
-		printk("done.\n");
-	}
-	if (cpu_has_xmm) {
-		printk(KERN_INFO "Enabling unmasked SIMD FPU exception support... ");
-		set_in_cr4(X86_CR4_OSXMMEXCPT);
-		printk("done.\n");
-	}
-
+/* trap_init() enabled FXSR and company _before_ testing for FP problems here. */
 	/* Test for the divl bug.. */
 	__asm__("fninit\n\t"
 		"fldl %1\n\t"
-- 
cgit v1.1


From eb05c3249a8e8a675e79d221f4a0874dc10ec903 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Fri, 6 Jan 2006 00:11:49 -0800
Subject: [PATCH] i386: fix bound check IDT gate

Contrary to what is apparently commonly assumed, the bound instruction does not
require the corresponding IDT entry to have DPL 3.

Acked-by: "Seth, Rohit" <rohit.seth@intel.com>
Acked-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/traps.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index f0c4060..53ad954 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -1078,9 +1078,9 @@ void __init trap_init(void)
 	set_trap_gate(0,&divide_error);
 	set_intr_gate(1,&debug);
 	set_intr_gate(2,&nmi);
-	set_system_intr_gate(3, &int3); /* int3-5 can be called from all */
+	set_system_intr_gate(3, &int3); /* int3/4 can be called from all */
 	set_system_gate(4,&overflow);
-	set_system_gate(5,&bounds);
+	set_trap_gate(5,&bounds);
 	set_trap_gate(6,&invalid_op);
 	set_trap_gate(7,&device_not_available);
 	set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS);
-- 
cgit v1.1


From ff6e8c0d5e47f0ceeebde86ec2f5919dbd5beb67 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Fri, 6 Jan 2006 00:11:50 -0800
Subject: [PATCH] x86: Cr4 is valid on some 486s

So some 486 processors do have CR4 register.  Allow them to present it in
register dumps by using the old fault technique rather than testing processor
family.

Thanks to Maciej for noticing this.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Cc: "Seth, Rohit" <rohit.seth@intel.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/process.c |  4 +---
 include/asm-i386/system.h  | 13 +++++++++++++
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 2333aea..6081a10 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -308,9 +308,7 @@ void show_regs(struct pt_regs * regs)
 	cr0 = read_cr0();
 	cr2 = read_cr2();
 	cr3 = read_cr3();
-	if (current_cpu_data.x86 > 4) {
-		cr4 = read_cr4();
-	}
+	cr4 = read_cr4_safe();
 	printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
 	show_trace(NULL, &regs->esp);
 }
diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h
index 772f85d..88b4d5c 100644
--- a/include/asm-i386/system.h
+++ b/include/asm-i386/system.h
@@ -140,6 +140,19 @@ static inline unsigned long _get_base(char * addr)
 		:"=r" (__dummy)); \
 	__dummy; \
 })
+
+#define read_cr4_safe() ({			      \
+	unsigned int __dummy;			      \
+	/* This could fault if %cr4 does not exist */ \
+	__asm__("1: movl %%cr4, %0		\n"   \
+		"2:				\n"   \
+		".section __ex_table,\"a\"	\n"   \
+		".long 1b,2b			\n"   \
+		".previous			\n"   \
+		: "=r" (__dummy): "0" (0));	      \
+	__dummy;				      \
+})
+
 #define write_cr4(x) \
 	__asm__ __volatile__("movl %0,%%cr4": :"r" (x));
 #define stts() write_cr0(8 | read_cr0())
-- 
cgit v1.1


From 5702d0f742b2f462267bca147334f77a255bcc74 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Fri, 6 Jan 2006 00:11:51 -0800
Subject: [PATCH] x86: Pnp segments in segment h

Move PnP BIOS segment definitions into segment.h; the segments are reserved
here, so they might as well be defined here as well.

Note I didn't do this for APM BIOS, as Macintosh and other systems use those
values to emulate APM in some scary way I don't want to understand.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Acked-by: "Seth, Rohit" <rohit.seth@intel.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/pnp/pnpbios/bioscalls.c |  9 ---------
 include/asm-i386/segment.h      | 14 ++++++++++++++
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c
index 7cb476e..37bacfc 100644
--- a/drivers/pnp/pnpbios/bioscalls.c
+++ b/drivers/pnp/pnpbios/bioscalls.c
@@ -31,15 +31,6 @@ static struct {
 } pnp_bios_callpoint;
 
 
-/* The PnP BIOS entries in the GDT */
-#define PNP_GDT    (GDT_ENTRY_PNPBIOS_BASE * 8)
-
-#define PNP_CS32   (PNP_GDT+0x00)	/* segment for calling fn */
-#define PNP_CS16   (PNP_GDT+0x08)	/* code segment for BIOS */
-#define PNP_DS     (PNP_GDT+0x10)	/* data segment for BIOS */
-#define PNP_TS1    (PNP_GDT+0x18)	/* transfer data segment */
-#define PNP_TS2    (PNP_GDT+0x20)	/* another data segment */
-
 /*
  * These are some opcodes for a "static asmlinkage"
  * As this code is *not* executed inside the linux kernel segment, but in a
diff --git a/include/asm-i386/segment.h b/include/asm-i386/segment.h
index bb5ff5b..faf9953 100644
--- a/include/asm-i386/segment.h
+++ b/include/asm-i386/segment.h
@@ -91,6 +91,20 @@
 #define GDT_ENTRY_BOOT_DS		(GDT_ENTRY_BOOT_CS + 1)
 #define __BOOT_DS	(GDT_ENTRY_BOOT_DS * 8)
 
+/* The PnP BIOS entries in the GDT */
+#define GDT_ENTRY_PNPBIOS_CS32		(GDT_ENTRY_PNPBIOS_BASE + 0)
+#define GDT_ENTRY_PNPBIOS_CS16		(GDT_ENTRY_PNPBIOS_BASE + 1)
+#define GDT_ENTRY_PNPBIOS_DS		(GDT_ENTRY_PNPBIOS_BASE + 2)
+#define GDT_ENTRY_PNPBIOS_TS1		(GDT_ENTRY_PNPBIOS_BASE + 3)
+#define GDT_ENTRY_PNPBIOS_TS2		(GDT_ENTRY_PNPBIOS_BASE + 4)
+
+/* The PnP BIOS selectors */
+#define PNP_CS32   (GDT_ENTRY_PNPBIOS_CS32 * 8)	/* segment for calling fn */
+#define PNP_CS16   (GDT_ENTRY_PNPBIOS_CS16 * 8)	/* code segment for BIOS */
+#define PNP_DS     (GDT_ENTRY_PNPBIOS_DS * 8)	/* data segment for BIOS */
+#define PNP_TS1    (GDT_ENTRY_PNPBIOS_TS1 * 8)	/* transfer data segment */
+#define PNP_TS2    (GDT_ENTRY_PNPBIOS_TS2 * 8)	/* another data segment */
+
 /*
  * The interrupt descriptor table has room for 256 idt's,
  * the global descriptor table is dependent on the number
-- 
cgit v1.1


From 3012d2d209580c78b5927d55c60a10891be8befd Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Fri, 6 Jan 2006 00:11:53 -0800
Subject: [PATCH] x86: Always relax segments

APM BIOSes have many bugs regarding proper representation of the appropriate
segment limits for calling the BIOS.  By default, APM_RELAX_SEGMENTS is always
turned on to support running the APM BIOS on these buggy machines.  Keeping
64k limits poses very little danger to the kernel, because the pages where the
APM BIOS is located will always be in low physical memory BIOS areas, which
should already be marked reserved, and only buggy BIOSes would possibly
overstep the segment bounds with writes to data anyway.

Since forcing stricter limits breaks many machines and is not default
behavior, it seems reasonable to deprecate the older code which may cause APM
BIOS to fault.

If you really have a badly enough broken APM BIOS that you have to turn off
APM_RELAX_SEGMENTS, seems like the best recourse here would be to disable the
APM BIOS and / or not compile it into your kernel to begin with, and / or add
your system to the known bad list.

The reason I want to deprecate this code is there is underlying brokenness
with the set_limit macros, and getting rid of many of the call sites rather
than rewriting them seems to be the simplest and most correct course of
action.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Acked-by: "Seth, Rohit" <rohit.seth@intel.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/apm.c | 55 ++++++++++++++++----------------------------------
 1 file changed, 17 insertions(+), 38 deletions(-)

diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 6c8e483..0d29811 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -303,17 +303,6 @@ extern int (*console_blank_hook)(int);
 #include "apm.h"
 
 /*
- * Define to make all _set_limit calls use 64k limits.  The APM 1.1 BIOS is
- * supposed to provide limit information that it recognizes.  Many machines
- * do this correctly, but many others do not restrict themselves to their
- * claimed limit.  When this happens, they will cause a segmentation
- * violation in the kernel at boot time.  Most BIOS's, however, will
- * respect a 64k limit, so we use that.  If you want to be pedantic and
- * hold your BIOS to its claims, then undefine this.
- */
-#define APM_RELAX_SEGMENTS
-
-/*
  * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend.
  * This patched by Chad Miller <cmiller@surfsouth.com>, original code by
  * David Chen <chen@ctpa04.mit.edu>
@@ -2312,9 +2301,20 @@ static int __init apm_init(void)
 	set_base(bad_bios_desc, __va((unsigned long)0x40 << 4));
 	_set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4));
 
+	/*
+	 * Set up the long jump entry point to the APM BIOS, which is called
+	 * from inline assembly.
+	 */
 	apm_bios_entry.offset = apm_info.bios.offset;
 	apm_bios_entry.segment = APM_CS;
 
+	/*
+	 * The APM 1.1 BIOS is supposed to provide limit information that it
+	 * recognizes.  Many machines do this correctly, but many others do
+	 * not restrict themselves to their claimed limit.  When this happens,
+	 * they will cause a segmentation violation in the kernel at boot time.
+	 * Most BIOS's, however, will respect a 64k limit, so we use that.
+	 */
 	for (i = 0; i < NR_CPUS; i++) {
 		struct desc_struct *gdt = get_cpu_gdt_table(i);
   		if (!gdt)
@@ -2325,33 +2325,12 @@ static int __init apm_init(void)
 			 __va((unsigned long)apm_info.bios.cseg_16 << 4));
 		set_base(gdt[APM_DS >> 3],
 			 __va((unsigned long)apm_info.bios.dseg << 4));
-#ifndef APM_RELAX_SEGMENTS
-		if (apm_info.bios.version == 0x100) {
-#endif
-			/* For ASUS motherboard, Award BIOS rev 110 (and others?) */
-			_set_limit((char *)&gdt[APM_CS >> 3], 64 * 1024 - 1);
-			/* For some unknown machine. */
-			_set_limit((char *)&gdt[APM_CS_16 >> 3], 64 * 1024 - 1);
-			/* For the DEC Hinote Ultra CT475 (and others?) */
-			_set_limit((char *)&gdt[APM_DS >> 3], 64 * 1024 - 1);
-#ifndef APM_RELAX_SEGMENTS
-		} else {
-			_set_limit((char *)&gdt[APM_CS >> 3],
-				(apm_info.bios.cseg_len - 1) & 0xffff);
-			_set_limit((char *)&gdt[APM_CS_16 >> 3],
-				(apm_info.bios.cseg_16_len - 1) & 0xffff);
-			_set_limit((char *)&gdt[APM_DS >> 3],
-				(apm_info.bios.dseg_len - 1) & 0xffff);
-		      /* workaround for broken BIOSes */
-	                if (apm_info.bios.cseg_len <= apm_info.bios.offset)
-        	                _set_limit((char *)&gdt[APM_CS >> 3], 64 * 1024 -1);
-                       if (apm_info.bios.dseg_len <= 0x40) { /* 0x40 * 4kB == 64kB */
-                        	/* for the BIOS that assumes granularity = 1 */
-                        	gdt[APM_DS >> 3].b |= 0x800000;
-                        	printk(KERN_NOTICE "apm: we set the granularity of dseg.\n");
-        	        }
-		}
-#endif
+		/* For ASUS motherboard, Award BIOS rev 110 (and others?) */
+		_set_limit((char *)&gdt[APM_CS >> 3], 64 * 1024 - 1);
+		/* For some unknown machine. */
+		_set_limit((char *)&gdt[APM_CS_16 >> 3], 64 * 1024 - 1);
+		/* For the DEC Hinote Ultra CT475 (and others?) */
+		_set_limit((char *)&gdt[APM_DS >> 3], 64 * 1024 - 1);
 	}
 
 	apm_proc = create_proc_info_entry("apm", 0, NULL, apm_get_info);
-- 
cgit v1.1


From 99022c4695d3f45fcf7f3827aa46dd2d9e53e365 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Fri, 6 Jan 2006 00:11:53 -0800
Subject: [PATCH] x86: Apm seg in gdt

Since APM BIOS segment limits are now fixed, set them in head.S GDT and don't
use the complicated _set_limit() macro expansion.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Acked-by: "Seth, Rohit" <rohit.seth@intel.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/apm.c  | 6 ------
 arch/i386/kernel/head.S | 9 +++++----
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 0d29811..45199bb 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -2325,12 +2325,6 @@ static int __init apm_init(void)
 			 __va((unsigned long)apm_info.bios.cseg_16 << 4));
 		set_base(gdt[APM_DS >> 3],
 			 __va((unsigned long)apm_info.bios.dseg << 4));
-		/* For ASUS motherboard, Award BIOS rev 110 (and others?) */
-		_set_limit((char *)&gdt[APM_CS >> 3], 64 * 1024 - 1);
-		/* For some unknown machine. */
-		_set_limit((char *)&gdt[APM_CS_16 >> 3], 64 * 1024 - 1);
-		/* For the DEC Hinote Ultra CT475 (and others?) */
-		_set_limit((char *)&gdt[APM_DS >> 3], 64 * 1024 - 1);
 	}
 
 	apm_proc = create_proc_info_entry("apm", 0, NULL, apm_get_info);
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index 870f20b..37b599f 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -510,13 +510,14 @@ ENTRY(cpu_gdt_table)
 	.quad 0x0080920000000000	/* 0xa0 16-bit data */
 	.quad 0x0080920000000000	/* 0xa8 16-bit data */
 	.quad 0x0080920000000000	/* 0xb0 16-bit data */
+
 	/*
 	 * The APM segments have byte granularity and their bases
-	 * and limits are set at run time.
+	 * are set at run time.  All have 64k limits.
 	 */
-	.quad 0x00409a0000000000	/* 0xb8 APM CS    code */
-	.quad 0x00009a0000000000	/* 0xc0 APM CS 16 code (16 bit) */
-	.quad 0x0040920000000000	/* 0xc8 APM DS    data */
+	.quad 0x00409a000000ffff	/* 0xb8 APM CS    code */
+	.quad 0x00009a000000ffff	/* 0xc0 APM CS 16 code (16 bit) */
+	.quad 0x004092000000ffff	/* 0xc8 APM DS    data */
 
 	.quad 0x0000920000000000	/* 0xd0 - ESPFIX 16-bit SS */
 	.quad 0x0000000000000000	/* 0xd8 - unused */
-- 
cgit v1.1


From 3fae1c37eea98097de34ba665796fea93b29f4aa Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Fri, 6 Jan 2006 00:11:54 -0800
Subject: [PATCH] x86: Deprecate obsolete ldt accessors

Old accessors to fetch LDT descriptors are unused and outdated and in the
wrong header file.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Cc: "Seth, Rohit" <rohit.seth@intel.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/system.h | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h
index 88b4d5c..24cc0c8 100644
--- a/include/asm-i386/system.h
+++ b/include/asm-i386/system.h
@@ -56,22 +56,6 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t" \
 #define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
 #define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1)>>12 )
 
-static inline unsigned long _get_base(char * addr)
-{
-	unsigned long __base;
-	__asm__("movb %3,%%dh\n\t"
-		"movb %2,%%dl\n\t"
-		"shll $16,%%edx\n\t"
-		"movw %1,%%dx"
-		:"=&d" (__base)
-		:"m" (*((addr)+2)),
-		 "m" (*((addr)+4)),
-		 "m" (*((addr)+7)));
-	return __base;
-}
-
-#define get_base(ldt) _get_base( ((char *)&(ldt)) )
-
 /*
  * Load a segment. Fall back on loading the zero
  * segment if something goes wrong..
-- 
cgit v1.1


From 5fe9fe3c6f9a1ae7aa224bb7a66eb9aad9e4abef Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Fri, 6 Jan 2006 00:11:55 -0800
Subject: [PATCH] x86: Pnp byte granularity

The one remaining caller of set_limit, the PnP BIOS code, calls into the PnP
BIOS, passing kernel parameters in and out.  These parameters may be passed
from arbitrary kernel virtual memory, so they deserve strict protection to
stop a bad BIOS from smashing beyond the object size.

Unfortunately, the use of set_limit was badly botching this by setting the
limit in terms of pages, when it really should have byte granularity.

When doing this, I discovered my BIOS had the buggy code during the "get
system device node" call:

 mov ax, es:[bx]

Which is harmless, but has a trivial workaround.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Cc: "Seth, Rohit" <rohit.seth@intel.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/head.S         | 12 ++++++------
 drivers/pnp/pnpbios/bioscalls.c |  5 ++++-
 include/asm-i386/system.h       |  2 +-
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index 37b599f..58d2746 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -504,12 +504,12 @@ ENTRY(cpu_gdt_table)
 	.quad 0x0000000000000000	/* 0x80 TSS descriptor */
 	.quad 0x0000000000000000	/* 0x88 LDT descriptor */
 
-	/* Segments used for calling PnP BIOS */
-	.quad 0x00c09a0000000000	/* 0x90 32-bit code */
-	.quad 0x00809a0000000000	/* 0x98 16-bit code */
-	.quad 0x0080920000000000	/* 0xa0 16-bit data */
-	.quad 0x0080920000000000	/* 0xa8 16-bit data */
-	.quad 0x0080920000000000	/* 0xb0 16-bit data */
+	/* Segments used for calling PnP BIOS have byte granularity */
+	.quad 0x00409a0000000000	/* 0x90 32-bit code */
+	.quad 0x00009a0000000000	/* 0x98 16-bit code */
+	.quad 0x0000920000000000	/* 0xa0 16-bit data */
+	.quad 0x0000920000000000	/* 0xa8 16-bit data */
+	.quad 0x0000920000000000	/* 0xb0 16-bit data */
 
 	/*
 	 * The APM segments have byte granularity and their bases
diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c
index 37bacfc..a721261 100644
--- a/drivers/pnp/pnpbios/bioscalls.c
+++ b/drivers/pnp/pnpbios/bioscalls.c
@@ -283,12 +283,15 @@ int pnp_bios_dev_node_info(struct pnp_dev_node_info *data)
 static int __pnp_bios_get_dev_node(u8 *nodenum, char boot, struct pnp_bios_node *data)
 {
 	u16 status;
+	u16 tmp_nodenum;
 	if (!pnp_bios_present())
 		return PNP_FUNCTION_NOT_SUPPORTED;
 	if ( !boot && pnpbios_dont_use_current_config )
 		return PNP_FUNCTION_NOT_SUPPORTED;
+	tmp_nodenum = *nodenum;
 	status = call_pnp_bios(PNP_GET_SYS_DEV_NODE, 0, PNP_TS1, 0, PNP_TS2, boot ? 2 : 1, PNP_DS, 0,
-			       nodenum, sizeof(char), data, 65536);
+			       &tmp_nodenum, sizeof(tmp_nodenum), data, 65536);
+	*nodenum = tmp_nodenum;
 	return status;
 }
 
diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h
index 24cc0c8..9c0593b 100644
--- a/include/asm-i386/system.h
+++ b/include/asm-i386/system.h
@@ -54,7 +54,7 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t" \
         ); } while(0)
 
 #define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
-#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1)>>12 )
+#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
 
 /*
  * Load a segment. Fall back on loading the zero
-- 
cgit v1.1


From e6a9918c9617ed21f71f2f20b45efe06822c8f00 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Fri, 6 Jan 2006 00:11:56 -0800
Subject: [PATCH] x86: Fixed pnp bios limits

PnP BIOS data, code, and 32-bit entry segments all have fixed limits as well;
set them in the GDT rather than adding more code.  It would be nice to add
these fixups to the boot GDT rather than setting the GDT for each CPU; perhaps
I can wiggle this in later, but getting it in before the subsys init looks
tricky.

Also, make some progress on deprecating the ugly Q_SET_SEL macros.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Cc: "Seth, Rohit" <rohit.seth@intel.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/head.S         | 12 ++++++++----
 drivers/pnp/pnpbios/bioscalls.c | 15 ++++-----------
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index 58d2746..5884469 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -504,10 +504,14 @@ ENTRY(cpu_gdt_table)
 	.quad 0x0000000000000000	/* 0x80 TSS descriptor */
 	.quad 0x0000000000000000	/* 0x88 LDT descriptor */
 
-	/* Segments used for calling PnP BIOS have byte granularity */
-	.quad 0x00409a0000000000	/* 0x90 32-bit code */
-	.quad 0x00009a0000000000	/* 0x98 16-bit code */
-	.quad 0x0000920000000000	/* 0xa0 16-bit data */
+	/*
+	 * Segments used for calling PnP BIOS have byte granularity.
+	 * They code segments and data segments have fixed 64k limits,
+	 * the transfer segment sizes are set at run time.
+	 */
+	.quad 0x00409a000000ffff	/* 0x90 32-bit code */
+	.quad 0x00009a000000ffff	/* 0x98 16-bit code */
+	.quad 0x000092000000ffff	/* 0xa0 16-bit data */
 	.quad 0x0000920000000000	/* 0xa8 16-bit data */
 	.quad 0x0000920000000000	/* 0xb0 16-bit data */
 
diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c
index a721261..a1f0b0b 100644
--- a/drivers/pnp/pnpbios/bioscalls.c
+++ b/drivers/pnp/pnpbios/bioscalls.c
@@ -58,13 +58,6 @@ __asm__(
 	".previous		\n"
 );
 
-#define Q_SET_SEL(cpu, selname, address, size) \
-do { \
-struct desc_struct *gdt = get_cpu_gdt_table((cpu)); \
-set_base(gdt[(selname) >> 3], __va((u32)(address))); \
-set_limit(gdt[(selname) >> 3], size); \
-} while(0)
-
 #define Q2_SET_SEL(cpu, selname, address, size) \
 do { \
 struct desc_struct *gdt = get_cpu_gdt_table((cpu)); \
@@ -535,8 +528,8 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header)
   		struct desc_struct *gdt = get_cpu_gdt_table(i);
   		if (!gdt)
   			continue;
-		Q2_SET_SEL(i, PNP_CS32, &pnp_bios_callfunc, 64 * 1024);
-		Q_SET_SEL(i, PNP_CS16, header->fields.pm16cseg, 64 * 1024);
-		Q_SET_SEL(i, PNP_DS, header->fields.pm16dseg, 64 * 1024);
-	}
+ 		set_base(gdt[GDT_ENTRY_PNPBIOS_CS32], &pnp_bios_callfunc);
+ 		set_base(gdt[GDT_ENTRY_PNPBIOS_CS16], __va(header->fields.pm16cseg));
+ 		set_base(gdt[GDT_ENTRY_PNPBIOS_DS], __va(header->fields.pm16dseg));
+  	}
 }
-- 
cgit v1.1


From 2891dcdc4538e8f4ce50c9d1eea457cf2d81fb5b Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Fri, 6 Jan 2006 00:11:57 -0800
Subject: [PATCH] x86: Stop deleting nt

Stop deleting NT bit from EFLAGS.  See arch/i386/kernel/head.S line 223, which
does something even better.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Cc: "Seth, Rohit" <rohit.seth@intel.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/cpu/common.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index cbc3206..cca6556 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -596,11 +596,6 @@ void __devinit cpu_init(void)
 	load_idt(&idt_descr);
 
 	/*
-	 * Delete NT
-	 */
-	__asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl");
-
-	/*
 	 * Set up and load the per-CPU TSS and LDT
 	 */
 	atomic_inc(&init_mm.mm_count);
-- 
cgit v1.1


From 92f17f0171e864a2cbe448c5b7b473e72a7d27b8 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Fri, 6 Jan 2006 00:11:58 -0800
Subject: [PATCH] x86: Apm is on cpu zero only

APM BIOS code has a protective wrapper that runs it only on CPU zero.  Thus,
no need to set APM BIOS segments in the GDT for other CPUs.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Acked-by: "Seth, Rohit" <rohit.seth@intel.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/apm.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 45199bb..d0b4880 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -2222,8 +2222,8 @@ static struct dmi_system_id __initdata apm_dmi_table[] = {
 static int __init apm_init(void)
 {
 	struct proc_dir_entry *apm_proc;
+	struct desc_struct *gdt;
 	int ret;
-	int i;
 
 	dmi_check_system(apm_dmi_table);
 
@@ -2314,18 +2314,17 @@ static int __init apm_init(void)
 	 * not restrict themselves to their claimed limit.  When this happens,
 	 * they will cause a segmentation violation in the kernel at boot time.
 	 * Most BIOS's, however, will respect a 64k limit, so we use that.
+	 *
+	 * Note we only set APM segments on CPU zero, since we pin the APM
+	 * code to that CPU.
 	 */
-	for (i = 0; i < NR_CPUS; i++) {
-		struct desc_struct *gdt = get_cpu_gdt_table(i);
-  		if (!gdt)
-  			continue;
-		set_base(gdt[APM_CS >> 3],
-			 __va((unsigned long)apm_info.bios.cseg << 4));
-		set_base(gdt[APM_CS_16 >> 3],
-			 __va((unsigned long)apm_info.bios.cseg_16 << 4));
-		set_base(gdt[APM_DS >> 3],
-			 __va((unsigned long)apm_info.bios.dseg << 4));
-	}
+	gdt = get_cpu_gdt_table(0);
+	set_base(gdt[APM_CS >> 3],
+		 __va((unsigned long)apm_info.bios.cseg << 4));
+	set_base(gdt[APM_CS_16 >> 3],
+		 __va((unsigned long)apm_info.bios.cseg_16 << 4));
+	set_base(gdt[APM_DS >> 3],
+		 __va((unsigned long)apm_info.bios.dseg << 4));
 
 	apm_proc = create_proc_info_entry("apm", 0, NULL, apm_get_info);
 	if (apm_proc)
-- 
cgit v1.1


From 2684927c6b938ec7a679891e0ec1fa0709c521bd Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Fri, 6 Jan 2006 00:11:59 -0800
Subject: [PATCH] x86: Deprecate useless bug

Remove the "temporary debugging check" which has managed to live for quite
some time, and is clearly unneeded.  The mm can never be live at this point,
so clearly checking the LDT in the mm->context is redundant as well.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Cc: "Seth, Rohit" <rohit.seth@intel.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/process.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 6081a10..45e7f0a 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -402,17 +402,7 @@ void flush_thread(void)
 
 void release_thread(struct task_struct *dead_task)
 {
-	if (dead_task->mm) {
-		// temporary debugging check
-		if (dead_task->mm->context.size) {
-			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
-					dead_task->comm,
-					dead_task->mm->context.ldt,
-					dead_task->mm->context.size);
-			BUG();
-		}
-	}
-
+	BUG_ON(dead_task->mm);
 	release_vm86_irqs(dead_task);
 }
 
-- 
cgit v1.1


From d89c145c0344fe2180336af6a309a59a8bc8c1c0 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 6 Jan 2006 00:11:59 -0800
Subject: [PATCH] x86: handle -Wsign-compare in bitops

Make i386's find_first_bit() use an unsigned integer as a counter to avoid
getting warnings when -Wsign-compare is given.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/bitops.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/asm-i386/bitops.h b/include/asm-i386/bitops.h
index 4807aa1..26eb981 100644
--- a/include/asm-i386/bitops.h
+++ b/include/asm-i386/bitops.h
@@ -332,9 +332,9 @@ static inline unsigned long __ffs(unsigned long word)
  * Returns the bit-number of the first set bit, not the number of the byte
  * containing a bit.
  */
-static inline int find_first_bit(const unsigned long *addr, unsigned size)
+static inline unsigned find_first_bit(const unsigned long *addr, unsigned size)
 {
-	int x = 0;
+	unsigned x = 0;
 
 	while (x < size) {
 		unsigned long val = *addr++;
-- 
cgit v1.1


From 37b73c828185731f6236a6387c02d7b08c150810 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Fri, 6 Jan 2006 00:12:01 -0800
Subject: [PATCH] x86/x86_64: mark rodata section read only: generic
 infrastructure

Generic prep-work for marking the .rodata section readonly:
* Align the rodata section at 4Kb boundary
* call the mark_rodata_ro() function when available

Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Jesper Juhl <jesper.juhl@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-generic/vmlinux.lds.h | 4 ++++
 init/main.c                       | 5 +++++
 2 files changed, 9 insertions(+)

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 094d491..35de20c 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -10,6 +10,8 @@
 #define ALIGN_FUNCTION()  . = ALIGN(8)
 
 #define RODATA								\
+	. = ALIGN(4096);						\
+	__start_rodata = .;						\
 	.rodata           : AT(ADDR(.rodata) - LOAD_OFFSET) {		\
 		*(.rodata) *(.rodata.*)					\
 		*(__vermagic)		/* Kernel version magic */	\
@@ -74,6 +76,8 @@
         __ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) {	\
 		*(__ksymtab_strings)					\
 	}								\
+	__end_rodata = .;						\
+	. = ALIGN(4096);						\
 									\
 	/* Built-in module parameters. */				\
 	__param : AT(ADDR(__param) - LOAD_OFFSET) {			\
diff --git a/init/main.c b/init/main.c
index 54aaf56..2ed3638 100644
--- a/init/main.c
+++ b/init/main.c
@@ -52,6 +52,7 @@
 #include <asm/bugs.h>
 #include <asm/setup.h>
 #include <asm/sections.h>
+#include <asm/cacheflush.h>
 
 /*
  * This is one of the first .c files built. Error out early
@@ -99,6 +100,9 @@ extern void acpi_early_init(void);
 #else
 static inline void acpi_early_init(void) { }
 #endif
+#ifndef CONFIG_DEBUG_RODATA
+static inline void mark_rodata_ro(void) { }
+#endif
 
 #ifdef CONFIG_TC
 extern void tc_init(void);
@@ -708,6 +712,7 @@ static int init(void * unused)
 	 */
 	free_initmem();
 	unlock_kernel();
+	mark_rodata_ro();
 	system_state = SYSTEM_RUNNING;
 	numa_default_policy();
 
-- 
cgit v1.1


From 63aaf3086baea7b94c218053af8237f9dbac5d05 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Fri, 6 Jan 2006 00:12:02 -0800
Subject: [PATCH] x86/x86_64: mark rodata section read only: x86 parts

x86 specific parts to make the .rodata section read only

Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig.debug       | 10 ++++++++++
 arch/i386/mm/init.c           | 24 ++++++++++++++++++++++++
 include/asm-i386/cacheflush.h |  4 ++++
 3 files changed, 38 insertions(+)

diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug
index c48b424..bf32ecc 100644
--- a/arch/i386/Kconfig.debug
+++ b/arch/i386/Kconfig.debug
@@ -42,6 +42,16 @@ config DEBUG_PAGEALLOC
 	  This results in a large slowdown, but helps to find certain types
 	  of memory corruptions.
 
+config DEBUG_RODATA
+	bool "Write protect kernel read-only data structures"
+	depends on DEBUG_KERNEL
+	help
+	  Mark the kernel read-only data as write-protected in the pagetables,
+	  in order to catch accidental (and incorrect) writes to such const
+	  data. This option may have a slight performance impact because a
+	  portion of the kernel code won't be covered by a 2MB TLB anymore.
+	  If in doubt, say "N".
+
 config 4KSTACKS
 	bool "Use 4Kb for kernel stacks instead of 8Kb"
 	depends on DEBUG_KERNEL
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 06e26f0..7df494b 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -735,6 +735,30 @@ void free_initmem(void)
 	printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", (__init_end - __init_begin) >> 10);
 }
 
+#ifdef CONFIG_DEBUG_RODATA
+
+extern char __start_rodata, __end_rodata;
+void mark_rodata_ro(void)
+{
+	unsigned long addr = (unsigned long)&__start_rodata;
+
+	for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
+		change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
+
+	printk ("Write protecting the kernel read-only data: %luk\n",
+			(unsigned long)(&__end_rodata - &__start_rodata) >> 10);
+
+	/*
+	 * change_page_attr() requires a global_flush_tlb() call after it.
+	 * We do this after the printk so that if something went wrong in the
+	 * change, the printk gets out at least to give a better debug hint
+	 * of who is the culprit.
+	 */
+	global_flush_tlb();
+}
+#endif
+
+
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
diff --git a/include/asm-i386/cacheflush.h b/include/asm-i386/cacheflush.h
index 2ea36de..7199f7b 100644
--- a/include/asm-i386/cacheflush.h
+++ b/include/asm-i386/cacheflush.h
@@ -31,4 +31,8 @@ int change_page_attr(struct page *page, int numpages, pgprot_t prot);
 void kernel_map_pages(struct page *page, int numpages, int enable);
 #endif
 
+#ifdef CONFIG_DEBUG_RODATA
+void mark_rodata_ro(void);
+#endif
+
 #endif /* _I386_CACHEFLUSH_H */
-- 
cgit v1.1


From c728252c7a072628bd3932ff87943d1e12423359 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Fri, 6 Jan 2006 00:12:03 -0800
Subject: [PATCH] x86/x86_64: mark rodata section read only: generic x86-64
 bugfix

Bug fix required for the .rodata work on x86-64:

when change_page_attr() and friends need to break up a 2Mb page into 4Kb
pages, it always set the NX bit on the PMD, which causes the cpu to consider
the entire 2Mb region to be NX regardless of the actual PTE perms.  This is
fine in general, with one big exception: the 2Mb page that covers the last
part of the kernel .text!  The fix is to not invent a new permission for the
new PMD entry, but to just inherit the existing one minus the PSE bit.

Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/mm/pageattr.c    | 9 +++++++--
 include/asm-x86_64/pgtable.h | 2 ++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index b90e8fe..35f1f1a 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -128,6 +128,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
 	pte_t *kpte; 
 	struct page *kpte_page;
 	unsigned kpte_flags;
+	pgprot_t ref_prot2;
 	kpte = lookup_address(address);
 	if (!kpte) return 0;
 	kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
@@ -140,10 +141,14 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
  			 * split_large_page will take the reference for this change_page_attr
  			 * on the split page.
  			 */
-			struct page *split = split_large_page(address, prot, ref_prot); 
+
+			struct page *split;
+			ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
+
+			split = split_large_page(address, prot, ref_prot2);
 			if (!split)
 				return -ENOMEM;
-			set_pte(kpte,mk_pte(split, ref_prot));
+			set_pte(kpte,mk_pte(split, ref_prot2));
 			kpte_page = split;
 		}	
 		get_page(kpte_page);
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index ecf58c7..02888d7 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -122,6 +122,8 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long
 
 #define pte_same(a, b)		((a).pte == (b).pte)
 
+#define pte_pgprot(a)	(__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
+
 #define PMD_SIZE	(1UL << PMD_SHIFT)
 #define PMD_MASK	(~(PMD_SIZE-1))
 #define PUD_SIZE	(1UL << PUD_SHIFT)
-- 
cgit v1.1


From 67df197b1a07944c2e0e40ded3d4fd07d108e110 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Fri, 6 Jan 2006 00:12:04 -0800
Subject: [PATCH] x86/x86_64: mark rodata section read-only: x86-64 support

x86-64 specific parts to make the .rodata section read only

Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/Kconfig.debug       | 10 ++++++++++
 arch/x86_64/mm/init.c           | 23 +++++++++++++++++++++++
 include/asm-x86_64/cacheflush.h |  4 ++++
 3 files changed, 37 insertions(+)

diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug
index e2c6e64a..fcb06a5 100644
--- a/arch/x86_64/Kconfig.debug
+++ b/arch/x86_64/Kconfig.debug
@@ -9,6 +9,16 @@ config INIT_DEBUG
 	  Fill __init and __initdata at the end of boot. This helps debugging
 	  illegal uses of __init and __initdata after initialization.
 
+config DEBUG_RODATA
+       bool "Write protect kernel read-only data structures"
+       depends on DEBUG_KERNEL
+       help
+	 Mark the kernel read-only data as write-protected in the pagetables,
+	 in order to catch accidental (and incorrect) writes to such const data.
+	 This option may have a slight performance impact because a portion
+	 of the kernel code won't be covered by a 2MB TLB anymore.
+	 If in doubt, say "N".
+
 config IOMMU_DEBUG
        depends on GART_IOMMU && DEBUG_KERNEL
        bool "Enable IOMMU debugging"
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index c016dfe..1faae5f 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -498,6 +498,29 @@ void free_initmem(void)
 	printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10);
 }
 
+#ifdef CONFIG_DEBUG_RODATA
+
+extern char __start_rodata, __end_rodata;
+void mark_rodata_ro(void)
+{
+	unsigned long addr = (unsigned long)&__start_rodata;
+
+	for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
+		change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
+
+	printk ("Write protecting the kernel read-only data: %luk\n",
+			(&__end_rodata - &__start_rodata) >> 10);
+
+	/*
+	 * change_page_attr_addr() requires a global_flush_tlb() call after it.
+	 * We do this after the printk so that if something went wrong in the
+	 * change, the printk gets out at least to give a better debug hint
+	 * of who is the culprit.
+	 */
+	global_flush_tlb();
+}
+#endif
+
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
diff --git a/include/asm-x86_64/cacheflush.h b/include/asm-x86_64/cacheflush.h
index b3189fb..d32f7f5 100644
--- a/include/asm-x86_64/cacheflush.h
+++ b/include/asm-x86_64/cacheflush.h
@@ -27,4 +27,8 @@ void global_flush_tlb(void);
 int change_page_attr(struct page *page, int numpages, pgprot_t prot);
 int change_page_attr_addr(unsigned long addr, int numpages, pgprot_t prot);
 
+#ifdef CONFIG_DEBUG_RODATA
+void mark_rodata_ro(void);
+#endif
+
 #endif /* _X8664_CACHEFLUSH_H */
-- 
cgit v1.1


From bb152f53120d66c98c1f16518407df6a84f23714 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Fri, 6 Jan 2006 00:12:05 -0800
Subject: [PATCH] x86/x86_64: mark rodata section read-only: make some
 datastructures const

Mark some key kernel datastructures readonly.  This patch was previously
posted on Jun 28th but was back then not merged because nothing was enforcing
rodata anyway..  well that changed now :)

Patch by Christoph Lameter <christoph@lameter.com> and Dave Jones
<davej@redhat.com>

Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/entry.S         | 1 +
 arch/i386/kernel/syscall_table.S | 1 -
 arch/x86_64/ia32/ia32entry.S     | 2 +-
 arch/x86_64/kernel/syscall.c     | 2 +-
 4 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index e50b9315..607c060 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -657,6 +657,7 @@ ENTRY(spurious_interrupt_bug)
 	pushl $do_spurious_interrupt_bug
 	jmp error_code
 
+.section .rodata,"a"
 #include "syscall_table.S"
 
 syscall_table_size=(.-sys_call_table)
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 9b21a31..f7ba4ac 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -1,4 +1,3 @@
-.data
 ENTRY(sys_call_table)
 	.long sys_restart_syscall	/* 0 - old "setup()" system call, used for restarting */
 	.long sys_exit
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index e0eb0c7..df0773c 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -341,7 +341,7 @@ ENTRY(ia32_ptregs_common)
 	jmp  ia32_sysret	/* misbalances the return cache */
 	CFI_ENDPROC
 
-	.data
+	.section .rodata,"a"
 	.align 8
 	.globl ia32_sys_call_table
 ia32_sys_call_table:
diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c
index e263685..7c176b3 100644
--- a/arch/x86_64/kernel/syscall.c
+++ b/arch/x86_64/kernel/syscall.c
@@ -19,7 +19,7 @@ typedef void (*sys_call_ptr_t)(void);
 
 extern void sys_ni_syscall(void);
 
-sys_call_ptr_t sys_call_table[__NR_syscall_max+1] __cacheline_aligned = { 
+const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
 	/* Smells like a like a compiler bug -- it doesn't work when the & below is removed. */ 
 	[0 ... __NR_syscall_max] = &sys_ni_syscall,
 #include <asm-x86_64/unistd.h>
-- 
cgit v1.1


From 215c3409eed16c89b6d11ea1126bd9d4f36b9afd Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Fri, 6 Jan 2006 00:12:06 -0800
Subject: [PATCH] i386 sparsemem for single node systems

Allow SPARSEMEM to be enabled on non-numa x86 systems.  This is made
dependent on EXPERIMENTAL also being set.  When an in-tree user (such as
simulated numa) exists it should be made dependent on that.

The plan is to have no options and no selector as normal when
!EXPERIMENTAL.  When EXPERIMENTAL we enable the FLATMEM and SPARSEMEM
options for X86_PC whilst maintaining DISCONTIGMEM and SPARSEMEM for NUMA.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig        | 8 ++++++--
 arch/i386/kernel/setup.c | 8 ++++++++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 6004bb0..968fabd 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -464,7 +464,6 @@ config NUMA
 	depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI))
 	default n if X86_PC
 	default y if (X86_NUMAQ || X86_SUMMIT)
-	select SPARSEMEM_STATIC
 
 # Need comments to help the hapless user trying to turn on NUMA support
 comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support"
@@ -493,6 +492,10 @@ config HAVE_ARCH_ALLOC_REMAP
 	depends on NUMA
 	default y
 
+config ARCH_FLATMEM_ENABLE
+	def_bool y
+	depends on (ARCH_SELECT_MEMORY_MODEL && X86_PC)
+
 config ARCH_DISCONTIGMEM_ENABLE
 	def_bool y
 	depends on NUMA
@@ -503,7 +506,8 @@ config ARCH_DISCONTIGMEM_DEFAULT
 
 config ARCH_SPARSEMEM_ENABLE
 	def_bool y
-	depends on NUMA
+	depends on (NUMA || (X86_PC && EXPERIMENTAL))
+	select SPARSEMEM_STATIC
 
 config ARCH_SELECT_MEMORY_MODEL
 	def_bool y
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index fdfcb0c..27c956d 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -954,6 +954,12 @@ efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
 	return 0;
 }
 
+static int __init
+efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
+{
+	memory_present(0, start, end);
+	return 0;
+}
 
 /*
  * Find the highest page frame number we have available
@@ -965,6 +971,7 @@ void __init find_max_pfn(void)
 	max_pfn = 0;
 	if (efi_enabled) {
 		efi_memmap_walk(efi_find_max_pfn, &max_pfn);
+		efi_memmap_walk(efi_memory_present_wrapper, NULL);
 		return;
 	}
 
@@ -979,6 +986,7 @@ void __init find_max_pfn(void)
 			continue;
 		if (end > max_pfn)
 			max_pfn = end;
+		memory_present(0, start, end);
 	}
 }
 
-- 
cgit v1.1


From c898ec16e83331abde39118e22e9e38335bbb950 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Fri, 6 Jan 2006 00:12:07 -0800
Subject: [PATCH] allow flatmem to be disabled when only sparsemem is
 implemented

On architectures that implement sparsemem but not discontigmem we want to
be able to hide the flatmem option in some cases.  On ppc64 for example,
when we select NUMA we must not select flatmem.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/Kconfig b/mm/Kconfig
index 21eb51d..b3db11f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -11,7 +11,7 @@ choice
 
 config FLATMEM_MANUAL
 	bool "Flat Memory"
-	depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE
+	depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
 	help
 	  This option allows you to change some of the ways that
 	  Linux manages its memory internally.  Most users will
-- 
cgit v1.1


From 1855a2c4ce708b823b8b824f8b12937b45f5462a Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Fri, 6 Jan 2006 00:12:08 -0800
Subject: [PATCH] x86: convert bigsmp to use flat physical mode

When we bring up a new CPU via INIT/startup IPI messages, the CPU that's
coming up sends a xTPR message to the chipset.  Intel chipsets (at least)
don't provide any architectural guarantee on what the chipset will do with
this message.  For example, the E850x chipsets use this xTPR message to
interpret the interrupt operating mode of the platform.  When the CPU
coming online sends this message, it always indicates that it is in logical
flat mode.  For the CPU hotplug case, the platform may already be
functioning in cluster APIC mode at this time, so the chipset can get confused
and mishandle I/O device and IPI interrupt routing.

The situation eventually gets corrected when the new CPU sends another xTPR
update when we switch it to cluster mode, but there's a window during which
the chipset may be in an inconsistent state.  This patch avoids this
problem by using the flat physical interrupt delivery mode instead of
cluster mode for bigsmp (>8 cpu) support.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/mach-bigsmp/mach_apic.h    | 79 +++++++++++++----------------
 include/asm-i386/mach-bigsmp/mach_apicdef.h |  4 +-
 2 files changed, 38 insertions(+), 45 deletions(-)

diff --git a/include/asm-i386/mach-bigsmp/mach_apic.h b/include/asm-i386/mach-bigsmp/mach_apic.h
index ba936d4..18b19a7 100644
--- a/include/asm-i386/mach-bigsmp/mach_apic.h
+++ b/include/asm-i386/mach-bigsmp/mach_apic.h
@@ -1,17 +1,10 @@
 #ifndef __ASM_MACH_APIC_H
 #define __ASM_MACH_APIC_H
-#include <asm/smp.h>
-
-#define SEQUENTIAL_APICID
-#ifdef SEQUENTIAL_APICID
-#define xapic_phys_to_log_apicid(phys_apic) ( (1ul << ((phys_apic) & 0x3)) |\
-		((phys_apic<<2) & (~0xf)) )
-#elif CLUSTERED_APICID
-#define xapic_phys_to_log_apicid(phys_apic) ( (1ul << ((phys_apic) & 0x3)) |\
-		((phys_apic) & (~0xf)) )
-#endif
-
-#define NO_BALANCE_IRQ (1)
+
+
+extern u8 bios_cpu_apicid[];
+
+#define xapic_phys_to_log_apicid(cpu) (bios_cpu_apicid[cpu])
 #define esr_disable (1)
 
 static inline int apic_id_registered(void)
@@ -19,7 +12,6 @@ static inline int apic_id_registered(void)
 	return (1);
 }
 
-#define APIC_DFR_VALUE	(APIC_DFR_CLUSTER)
 /* Round robin the irqs amoung the online cpus */
 static inline cpumask_t target_cpus(void)
 { 
@@ -32,29 +24,34 @@ static inline cpumask_t target_cpus(void)
 	} while (cpu >= NR_CPUS);
 	return cpumask_of_cpu(cpu);
 }
-#define TARGET_CPUS	(target_cpus())
 
-#define INT_DELIVERY_MODE dest_Fixed
-#define INT_DEST_MODE 1     /* logical delivery broadcast to all procs */
+#undef APIC_DEST_LOGICAL
+#define APIC_DEST_LOGICAL 	0
+#define TARGET_CPUS		(target_cpus())
+#define APIC_DFR_VALUE		(APIC_DFR_FLAT)
+#define INT_DELIVERY_MODE	(dest_Fixed)
+#define INT_DEST_MODE		(0)    /* phys delivery to target proc */
+#define NO_BALANCE_IRQ		(0)
+#define WAKE_SECONDARY_VIA_INIT
+
 
 static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
 {
-	return 0;
+	return (0);
 }
 
-/* we don't use the phys_cpu_present_map to indicate apicid presence */
-static inline unsigned long check_apicid_present(int bit) 
+static inline unsigned long check_apicid_present(int bit)
 {
-	return 1;
+	return (1);
 }
 
-#define apicid_cluster(apicid) (apicid & 0xF0)
-
-static inline unsigned long calculate_ldr(unsigned long old)
+static inline unsigned long calculate_ldr(int cpu)
 {
-	unsigned long id;
-	id = xapic_phys_to_log_apicid(hard_smp_processor_id());
-	return ((old & ~APIC_LDR_MASK) | SET_APIC_LOGICAL_ID(id));
+	unsigned long val, id;
+	val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+	id = xapic_phys_to_log_apicid(cpu);
+	val |= SET_APIC_LOGICAL_ID(id);
+	return val;
 }
 
 /*
@@ -67,37 +64,35 @@ static inline unsigned long calculate_ldr(unsigned long old)
 static inline void init_apic_ldr(void)
 {
 	unsigned long val;
+	int cpu = smp_processor_id();
 
 	apic_write_around(APIC_DFR, APIC_DFR_VALUE);
-	val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
-	val = calculate_ldr(val);
+	val = calculate_ldr(cpu);
 	apic_write_around(APIC_LDR, val);
 }
 
 static inline void clustered_apic_check(void)
 {
 	printk("Enabling APIC mode:  %s.  Using %d I/O APICs\n",
-		"Cluster", nr_ioapics);
+		"Physflat", nr_ioapics);
 }
 
 static inline int multi_timer_check(int apic, int irq)
 {
-	return 0;
+	return (0);
 }
 
 static inline int apicid_to_node(int logical_apicid)
 {
-	return 0;
+	return (0);
 }
 
-extern u8 bios_cpu_apicid[];
-
 static inline int cpu_present_to_apicid(int mps_cpu)
 {
 	if (mps_cpu < NR_CPUS)
-		return (int)bios_cpu_apicid[mps_cpu];
-	else
-		return BAD_APICID;
+		return (int) bios_cpu_apicid[mps_cpu];
+
+	return BAD_APICID;
 }
 
 static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
@@ -109,10 +104,10 @@ extern u8 cpu_2_logical_apicid[];
 /* Mapping from cpu number to logical apicid */
 static inline int cpu_to_logical_apicid(int cpu)
 {
-       if (cpu >= NR_CPUS)
-	       return BAD_APICID;
-       return (int)cpu_2_logical_apicid[cpu];
- }
+	if (cpu >= NR_CPUS)
+		return BAD_APICID;
+	return cpu_physical_id(cpu);
+}
 
 static inline int mpc_apic_id(struct mpc_config_processor *m,
 			struct mpc_config_translation *translation_record)
@@ -128,11 +123,9 @@ static inline int mpc_apic_id(struct mpc_config_processor *m,
 static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
 {
 	/* For clustered we don't have a good way to do this yet - hack */
-	return physids_promote(0xFUL);
+	return physids_promote(0xFFL);
 }
 
-#define WAKE_SECONDARY_VIA_INIT
-
 static inline void setup_portio_remap(void)
 {
 }
diff --git a/include/asm-i386/mach-bigsmp/mach_apicdef.h b/include/asm-i386/mach-bigsmp/mach_apicdef.h
index 23e58b3..a58ab5a 100644
--- a/include/asm-i386/mach-bigsmp/mach_apicdef.h
+++ b/include/asm-i386/mach-bigsmp/mach_apicdef.h
@@ -1,11 +1,11 @@
 #ifndef __ASM_MACH_APICDEF_H
 #define __ASM_MACH_APICDEF_H
 
-#define		APIC_ID_MASK		(0x0F<<24)
+#define		APIC_ID_MASK		(0xFF<<24)
 
 static inline unsigned get_apic_id(unsigned long x) 
 { 
-	return (((x)>>24)&0x0F);
+	return (((x)>>24)&0xFF);
 } 
 
 #define		GET_APIC_ID(x)	get_apic_id(x)
-- 
cgit v1.1


From e72c8585e09f127a69a1608bb5ccd1e3fc0dd41e Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Fri, 6 Jan 2006 00:12:09 -0800
Subject: [PATCH] make bigsmp the default mode if CONFIG_HOTPLUG_CPU

If we are using hotplug enabled kernel, then make bigsmp the default mode.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/mpparse.c | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index 1ca5269..91a6401 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -38,6 +38,12 @@
 int smp_found_config;
 unsigned int __initdata maxcpus = NR_CPUS;
 
+#ifdef CONFIG_HOTPLUG_CPU
+#define CPU_HOTPLUG_ENABLED	(1)
+#else
+#define CPU_HOTPLUG_ENABLED	(0)
+#endif
+
 /*
  * Various Linux-internal data structures created from the
  * MP-table.
@@ -219,14 +225,18 @@ static void __devinit MP_processor_info (struct mpc_config_processor *m)
 	cpu_set(num_processors, cpu_possible_map);
 	num_processors++;
 
-	if ((num_processors > 8) &&
-	    ((APIC_XAPIC(ver) &&
-	     (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)) ||
-	     (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)))
-		def_to_bigsmp = 1;
-	else
-		def_to_bigsmp = 0;
-
+	if (CPU_HOTPLUG_ENABLED || (num_processors > 8)) {
+		switch (boot_cpu_data.x86_vendor) {
+		case X86_VENDOR_INTEL:
+			if (!APIC_XAPIC(ver)) {
+				def_to_bigsmp = 0;
+				break;
+			}
+			/* If P4 and above fall through */
+		case X86_VENDOR_AMD:
+			def_to_bigsmp = 1;
+		}
+	}
 	bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
 }
 
-- 
cgit v1.1


From f8af095d3a4c8300b4e63ee2c4bb198b565d9431 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Fri, 6 Jan 2006 00:12:10 -0800
Subject: [PATCH] x86: change_page_attr() fix

The 'make rodata read-only' patch in -mm exposes a latent bug in the 32-bit
change_page_attr() function, which causes certain CPUs (Those with NX
basically) to reboot instantly after pages are marked read-only.

The same bug got fixed a while back on x86-64, but never got propagated to
i386.

Stuart Hayes from Dell also picked up on this last June, but it never got
fixed, as the only thing affected by it apparently was the nvidia driver.

Blatantly stealing description from his post..

"It doesn't appear to be fixed (in the i386 arch).  The
 change_page_attr()/split_large_page() code will still set all the
 4K PTEs to PAGE_KERNEL (setting the _PAGE_NX bit) when a large page
 needs to be split.

 This wouldn't be a problem for the bulk of the kernel memory, but there
 are pages in the lower 4MB of memory that's free, and are part of large
 executable pages that also contain kernel code.  If change_page_attr()
 is called on these, it will set the _PAGE_NX bit on the whole 2MB region
 that was covered by the large page, causing a large chunk of kernel code
 to be non-executable."

Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Dave Jones <davej@redhat.com>
Cc: <Stuart_Hayes@Dell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/pageattr.c | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index f600fc2..c30a16d 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -13,6 +13,7 @@
 #include <asm/processor.h>
 #include <asm/tlbflush.h>
 #include <asm/pgalloc.h>
+#include <asm/sections.h>
 
 static DEFINE_SPINLOCK(cpa_lock);
 static struct list_head df_list = LIST_HEAD_INIT(df_list);
@@ -36,7 +37,8 @@ pte_t *lookup_address(unsigned long address)
         return pte_offset_kernel(pmd, address);
 } 
 
-static struct page *split_large_page(unsigned long address, pgprot_t prot)
+static struct page *split_large_page(unsigned long address, pgprot_t prot,
+					pgprot_t ref_prot)
 { 
 	int i; 
 	unsigned long addr;
@@ -54,7 +56,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot)
 	pbase = (pte_t *)page_address(base);
 	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
                set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
-                                          addr == address ? prot : PAGE_KERNEL));
+                                          addr == address ? prot : ref_prot));
 	}
 	return base;
 } 
@@ -98,11 +100,18 @@ static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
  */
 static inline void revert_page(struct page *kpte_page, unsigned long address)
 {
-	pte_t *linear = (pte_t *) 
+	pgprot_t ref_prot;
+	pte_t *linear;
+
+	ref_prot =
+	((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
+		? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE;
+
+	linear = (pte_t *)
 		pmd_offset(pud_offset(pgd_offset_k(address), address), address);
 	set_pmd_pte(linear,  address,
 		    pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
-			    PAGE_KERNEL_LARGE));
+			    ref_prot));
 }
 
 static int
@@ -123,10 +132,16 @@ __change_page_attr(struct page *page, pgprot_t prot)
 		if ((pte_val(*kpte) & _PAGE_PSE) == 0) { 
 			set_pte_atomic(kpte, mk_pte(page, prot)); 
 		} else {
-			struct page *split = split_large_page(address, prot); 
+			pgprot_t ref_prot;
+			struct page *split;
+
+			ref_prot =
+			((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
+				? PAGE_KERNEL_EXEC : PAGE_KERNEL;
+			split = split_large_page(address, prot, ref_prot);
 			if (!split)
 				return -ENOMEM;
-			set_pmd_pte(kpte,address,mk_pte(split, PAGE_KERNEL));
+			set_pmd_pte(kpte,address,mk_pte(split, ref_prot));
 			kpte_page = split;
 		}	
 		get_page(kpte_page);
-- 
cgit v1.1


From e31b88ba49460653bab87423287bb68743f5de5c Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Fri, 6 Jan 2006 00:12:11 -0800
Subject: [PATCH] x86: missing printk newline in apic boot option parser

Missing newline in printk.

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/apic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 496a2c9..d8f94e7 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -721,7 +721,7 @@ static int __init apic_set_verbosity(char *str)
 		apic_verbosity = APIC_VERBOSE;
 	else
 		printk(KERN_WARNING "APIC Verbosity level %s not recognised"
-				" use apic=verbose or apic=debug", str);
+				" use apic=verbose or apic=debug\n", str);
 
 	return 0;
 }
-- 
cgit v1.1


From d832245d7cc16d50b29c1b708ccbe9c75ac376a3 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Fri, 6 Jan 2006 00:12:12 -0800
Subject: [PATCH] x86: fls() in asm

There is a single instruction on i386 to find the largest set bit, so it makes
sense to use it (like we use bsf for ffs()).

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/bitops.h | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/include/asm-i386/bitops.h b/include/asm-i386/bitops.h
index 26eb981..65679ac 100644
--- a/include/asm-i386/bitops.h
+++ b/include/asm-i386/bitops.h
@@ -367,11 +367,6 @@ static inline unsigned long ffz(unsigned long word)
 	return word;
 }
 
-/*
- * fls: find last bit set.
- */
-
-#define fls(x) generic_fls(x)
 #define fls64(x)   generic_fls64(x)
 
 #ifdef __KERNEL__
@@ -415,6 +410,23 @@ static inline int ffs(int x)
 }
 
 /**
+ * fls - find last bit set
+ * @x: the word to search
+ *
+ * This is defined the same way as ffs.
+ */
+static inline int fls(int x)
+{
+	int r;
+
+	__asm__("bsrl %1,%0\n\t"
+		"jnz 1f\n\t"
+		"movl $-1,%0\n"
+		"1:" : "=r" (r) : "rm" (x));
+	return r+1;
+}
+
+/**
  * hweightN - returns the hamming weight of a N-bit word
  * @x: the word to weigh
  *
-- 
cgit v1.1


From 6926d570b6159c6a7f65921ca119a675b12fef86 Mon Sep 17 00:00:00 2001
From: Daniel Marjamaki <daniel.marjamaki@comhem.se>
Date: Fri, 6 Jan 2006 00:12:12 -0800
Subject: [PATCH] arch/i386/kernel/msr.c: removed unused variable

Removed the unused variable "rv".

Signed-off-by: Daniel Marjamaki <daniel.marjamaki@comhem.se>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/msr.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index 44470fe..1d0a55e 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -172,7 +172,6 @@ static ssize_t msr_read(struct file *file, char __user * buf,
 {
 	u32 __user *tmp = (u32 __user *) buf;
 	u32 data[2];
-	size_t rv;
 	u32 reg = *ppos;
 	int cpu = iminor(file->f_dentry->d_inode);
 	int err;
@@ -180,7 +179,7 @@ static ssize_t msr_read(struct file *file, char __user * buf,
 	if (count % 8)
 		return -EINVAL;	/* Invalid chunk size */
 
-	for (rv = 0; count; count -= 8) {
+	for (; count; count -= 8) {
 		err = do_rdmsr(cpu, reg, &data[0], &data[1]);
 		if (err)
 			return err;
-- 
cgit v1.1


From 6b7f430ee0a269464aa29159eb464e647ca313d3 Mon Sep 17 00:00:00 2001
From: Daniel Marjamaki <daniel.marjamaki@comhem.se>
Date: Fri, 6 Jan 2006 00:12:13 -0800
Subject: [PATCH] arch/i386/kernel/cpuid.c: unused variable

Removed the unused variable "rv".

Signed-off-by: Daniel Marjamaki <daniel.marjamaki@comhem.se>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/cpuid.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index 13bae79..006141d 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -117,14 +117,13 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
 {
 	char __user *tmp = buf;
 	u32 data[4];
-	size_t rv;
 	u32 reg = *ppos;
 	int cpu = iminor(file->f_dentry->d_inode);
 
 	if (count % 16)
 		return -EINVAL;	/* Invalid chunk size */
 
-	for (rv = 0; count; count -= 16) {
+	for (; count; count -= 16) {
 		do_cpuid(cpu, reg, data);
 		if (copy_to_user(tmp, &data, 16))
 			return -EFAULT;
-- 
cgit v1.1


From f90b8116032f4216d260e31f966a3585319387ac Mon Sep 17 00:00:00 2001
From: Jordan Crouse <jordan.crouse@amd.com>
Date: Fri, 6 Jan 2006 00:12:14 -0800
Subject: [PATCH] Base support for AMD Geode GX/LX processors

Provide basic support for the AMD Geode GX and LX processors.

Signed-off-by: Jordan Crouse <jordan.crouse@amd.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 MAINTAINERS                  |  7 +++++++
 arch/i386/Kconfig.cpu        | 14 ++++++++++----
 arch/i386/kernel/cpu/amd.c   |  7 ++++++-
 arch/i386/kernel/cpu/cyrix.c | 27 ++++++++++++++++++++++++++-
 include/asm-i386/module.h    |  4 +++-
 include/linux/pci_ids.h      | 10 ++++++++++
 6 files changed, 62 insertions(+), 7 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index e9db0d6..cb536bb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -258,6 +258,13 @@ P:	Ivan Kokshaysky
 M:	ink@jurassic.park.msu.ru
 S:	Maintained for 2.4; PCI support for 2.6.
 
+AMD GEODE PROCESSOR/CHIPSET SUPPORT
+P:      Jordan Crouse
+M:      info-linux@geode.amd.com
+L:	info-linux@geode.amd.com
+W:	http://www.amd.com/us-en/ConnectivitySolutions/TechnicalResources/0,,50_2334_2452_11363,00.html
+S:	Supported
+
 APM DRIVER
 P:	Stephen Rothwell
 M:	sfr@canb.auug.org.au
diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu
index 53bbb3c..79603b3 100644
--- a/arch/i386/Kconfig.cpu
+++ b/arch/i386/Kconfig.cpu
@@ -39,6 +39,7 @@ config M386
 	  - "Winchip-2" for IDT Winchip 2.
 	  - "Winchip-2A" for IDT Winchips with 3dNow! capabilities.
 	  - "GeodeGX1" for Geode GX1 (Cyrix MediaGX).
+	  - "Geode GX/LX" For AMD Geode GX and LX processors.
 	  - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3.
 	  - "VIA C3-2 for VIA C3-2 "Nehemiah" (model 9 and above).
 
@@ -171,6 +172,11 @@ config MGEODEGX1
 	help
 	  Select this for a Geode GX1 (Cyrix MediaGX) chip.
 
+config MGEODE_LX
+       bool "Geode GX/LX"
+       help
+         Select this for AMD Geode GX and LX processors.
+
 config MCYRIXIII
 	bool "CyrixIII/VIA-C3"
 	help
@@ -220,8 +226,8 @@ config X86_XADD
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || X86_GENERIC
-	default "4" if X86_ELAN || M486 || M386
-	default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODEGX1
+	default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
+	default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
 	default "6" if MK7 || MK8 || MPENTIUMM
 
 config RWSEM_GENERIC_SPINLOCK
@@ -290,12 +296,12 @@ config X86_INTEL_USERCOPY
 
 config X86_USE_PPRO_CHECKSUM
 	bool
-	depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON
+	depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX
 	default y
 
 config X86_USE_3DNOW
 	bool
-	depends on MCYRIXIII || MK7
+	depends on MCYRIXIII || MK7 || MGEODE_LX
 	default y
 
 config X86_OOSTORE
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index e344ef8..e7697e0 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -161,8 +161,13 @@ static void __init init_amd(struct cpuinfo_x86 *c)
 					set_bit(X86_FEATURE_K6_MTRR, c->x86_capability);
 				break;
 			}
-			break;
 
+			if (c->x86_model == 10) {
+				/* AMD Geode LX is model 10 */
+				/* placeholder for any needed mods */
+				break;
+			}
+			break;
 		case 6: /* An Athlon/Duron */
  
 			/* Bit 15 of Athlon specific MSR 15, needs to be 0
diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index ff87cc2..7501597 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -343,6 +343,31 @@ static void __init init_cyrix(struct cpuinfo_x86 *c)
 }
 
 /*
+ * Handle National Semiconductor branded processors
+ */
+static void __devinit init_nsc(struct cpuinfo_x86 *c)
+{
+	/* There may be GX1 processors in the wild that are branded
+	 * NSC and not Cyrix.
+	 *
+	 * This function only handles the GX processor, and kicks every
+	 * thing else to the Cyrix init function above - that should
+	 * cover any processors that might have been branded differently
+	 * after NSC aquired Cyrix.
+	 *
+	 * If this breaks your GX1 horribly, please e-mail
+	 * info-linux@ldcmail.amd.com to tell us.
+	 */
+
+	/* Handle the GX (Formally known as the GX2) */
+
+	if (c->x86 == 5 && c->x86_model == 5)
+		display_cacheinfo(c);
+	else
+		init_cyrix(c);
+}
+
+/*
  * Cyrix CPUs without cpuid or with cpuid not yet enabled can be detected
  * by the fact that they preserve the flags across the division of 5/2.
  * PII and PPro exhibit this behavior too, but they have cpuid available.
@@ -422,7 +447,7 @@ int __init cyrix_init_cpu(void)
 static struct cpu_dev nsc_cpu_dev __initdata = {
 	.c_vendor	= "NSC",
 	.c_ident 	= { "Geode by NSC" },
-	.c_init		= init_cyrix,
+	.c_init		= init_nsc,
 	.c_identify	= generic_identify,
 };
 
diff --git a/include/asm-i386/module.h b/include/asm-i386/module.h
index eb7f2b4..424661d 100644
--- a/include/asm-i386/module.h
+++ b/include/asm-i386/module.h
@@ -52,8 +52,10 @@ struct mod_arch_specific
 #define MODULE_PROC_FAMILY "CYRIXIII "
 #elif defined CONFIG_MVIAC3_2
 #define MODULE_PROC_FAMILY "VIAC3-2 "
-#elif CONFIG_MGEODEGX1
+#elif defined CONFIG_MGEODEGX1
 #define MODULE_PROC_FAMILY "GEODEGX1 "
+#elif defined CONFIG_MGEODE_LX
+#define MODULE_PROC_FAMILY "GEODE "
 #else
 #error unknown processor family
 #endif
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 4f01710..24db724 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -394,6 +394,13 @@
 #define PCI_DEVICE_ID_NS_87410		0xd001
 #define PCI_DEVICE_ID_NS_CS5535_IDE	0x002d
 
+#define PCI_DEVICE_ID_NS_CS5535_HOST_BRIDGE  0x0028
+#define PCI_DEVICE_ID_NS_CS5535_ISA_BRIDGE   0x002b
+#define PCI_DEVICE_ID_NS_CS5535_IDE          0x002d
+#define PCI_DEVICE_ID_NS_CS5535_AUDIO        0x002e
+#define PCI_DEVICE_ID_NS_CS5535_USB          0x002f
+#define PCI_DEVICE_ID_NS_CS5535_VIDEO        0x0030
+
 #define PCI_VENDOR_ID_TSENG		0x100c
 #define PCI_DEVICE_ID_TSENG_W32P_2	0x3202
 #define PCI_DEVICE_ID_TSENG_W32P_b	0x3205
@@ -496,6 +503,9 @@
 
 #define PCI_DEVICE_ID_AMD_CS5536_IDE	0x209A
 
+#define PCI_DEVICE_ID_AMD_LX_VIDEO  0x2081
+#define PCI_DEVICE_ID_AMD_LX_AES    0x2082
+
 #define PCI_VENDOR_ID_TRIDENT		0x1023
 #define PCI_DEVICE_ID_TRIDENT_4DWAVE_DX	0x2000
 #define PCI_DEVICE_ID_TRIDENT_4DWAVE_NX	0x2001
-- 
cgit v1.1


From a7a4ad0998dcd682f4968e8ec5fc1259914a1c4a Mon Sep 17 00:00:00 2001
From: Jordan Crouse <jordan.crouse@amd.com>
Date: Fri, 6 Jan 2006 00:12:15 -0800
Subject: [PATCH] Geode LX HW RNG Support

Add support to hw_random for the Geode LX HRNG device.

Signed-off-by: Jordan Crouse <jordan.crouse@amd.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Jeff Garzik <jgarzik@pobox.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/hw_random.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/drivers/char/hw_random.c b/drivers/char/hw_random.c
index 6f673d2..49769f5 100644
--- a/drivers/char/hw_random.c
+++ b/drivers/char/hw_random.c
@@ -1,4 +1,9 @@
 /*
+        Added support for the AMD Geode LX RNG
+	(c) Copyright 2004-2005 Advanced Micro Devices, Inc.
+
+	derived from
+
  	Hardware driver for the Intel/AMD/VIA Random Number Generators (RNG)
 	(c) Copyright 2003 Red Hat Inc <jgarzik@redhat.com>
  
@@ -95,6 +100,11 @@ static unsigned int via_data_present (void);
 static u32 via_data_read (void);
 #endif
 
+static int __init geode_init(struct pci_dev *dev);
+static void geode_cleanup(void);
+static unsigned int geode_data_present (void);
+static u32 geode_data_read (void);
+
 struct rng_operations {
 	int (*init) (struct pci_dev *dev);
 	void (*cleanup) (void);
@@ -122,6 +132,7 @@ enum {
 	rng_hw_intel,
 	rng_hw_amd,
 	rng_hw_via,
+	rng_hw_geode,
 };
 
 static struct rng_operations rng_vendor_ops[] = {
@@ -139,6 +150,9 @@ static struct rng_operations rng_vendor_ops[] = {
 	/* rng_hw_via */
 	{ via_init, via_cleanup, via_data_present, via_data_read, 1 },
 #endif
+
+	/* rng_hw_geode */
+	{ geode_init, geode_cleanup, geode_data_present, geode_data_read, 4 }
 };
 
 /*
@@ -159,6 +173,9 @@ static struct pci_device_id rng_pci_tbl[] = {
 	{ 0x8086, 0x244e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_intel },
 	{ 0x8086, 0x245e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_intel },
 
+	{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_LX_AES,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_geode },
+
 	{ 0, },	/* terminate list */
 };
 MODULE_DEVICE_TABLE (pci, rng_pci_tbl);
@@ -460,6 +477,57 @@ static void via_cleanup(void)
 }
 #endif
 
+/***********************************************************************
+ *
+ * AMD Geode RNG operations
+ *
+ */
+
+static void __iomem *geode_rng_base = NULL;
+
+#define GEODE_RNG_DATA_REG   0x50
+#define GEODE_RNG_STATUS_REG 0x54
+
+static u32 geode_data_read(void)
+{
+	u32 val;
+
+	assert(geode_rng_base != NULL);
+	val = readl(geode_rng_base + GEODE_RNG_DATA_REG);
+	return val;
+}
+
+static unsigned int geode_data_present(void)
+{
+	u32 val;
+
+	assert(geode_rng_base != NULL);
+	val = readl(geode_rng_base + GEODE_RNG_STATUS_REG);
+	return val;
+}
+
+static void geode_cleanup(void)
+{
+	iounmap(geode_rng_base);
+  	geode_rng_base = NULL;
+}
+
+static int geode_init(struct pci_dev *dev)
+{
+	unsigned long rng_base = pci_resource_start(dev, 0);
+
+	if (rng_base == 0)
+		return 1;
+
+	geode_rng_base = ioremap(rng_base, 0x58);
+
+	if (geode_rng_base == NULL) {
+		printk(KERN_ERR PFX "Cannot ioremap RNG memory\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
 
 /***********************************************************************
  *
@@ -574,7 +642,7 @@ static int __init rng_init (void)
 
 	DPRINTK ("ENTER\n");
 
-	/* Probe for Intel, AMD RNGs */
+	/* Probe for Intel, AMD, Geode RNGs */
 	for_each_pci_dev(pdev) {
 		ent = pci_match_id(rng_pci_tbl, pdev);
 		if (ent) {
-- 
cgit v1.1


From 3841b0a173cb6fc52163e67c03280543f2412db3 Mon Sep 17 00:00:00 2001
From: Jordan Crouse <jordan.crouse@amd.com>
Date: Fri, 6 Jan 2006 00:12:16 -0800
Subject: [PATCH] APM Screen Blanking fix

- Fix screen blanking on BIOSes that return APM_NOT_ENGAGED when APM enabled
  screen blanking is not turned on.

  The original code only tried to set the state on device 0x100, and then
  0x1FF, and I added 0x101 to the mix too.

- Clean up logic in apm_console_blank().

- Prevent the error message from printing out twice.

Cc: Jordan Crouse <jordan.crouse@amd.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/apm.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index d0b4880..2d793d4 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -1064,22 +1064,23 @@ static int apm_engage_power_management(u_short device, int enable)
  
 static int apm_console_blank(int blank)
 {
-	int	error;
-	u_short	state;
+	int error, i;
+	u_short state;
+	static const u_short dev[3] = { 0x100, 0x1FF, 0x101 };
 
 	state = blank ? APM_STATE_STANDBY : APM_STATE_READY;
-	/* Blank the first display device */
-	error = set_power_state(0x100, state);
-	if ((error != APM_SUCCESS) && (error != APM_NO_ERROR)) {
-		/* try to blank them all instead */
-		error = set_power_state(0x1ff, state);
-		if ((error != APM_SUCCESS) && (error != APM_NO_ERROR))
-			/* try to blank device one instead */
-			error = set_power_state(0x101, state);
+
+	for (i = 0; i < ARRAY_SIZE(dev); i++) {
+		error = set_power_state(dev[i], state);
+
+		if ((error == APM_SUCCESS) || (error == APM_NO_ERROR))
+			return 1;
+
+		if (error == APM_NOT_ENGAGED)
+			break;
 	}
-	if ((error == APM_SUCCESS) || (error == APM_NO_ERROR))
-		return 1;
-	if (error == APM_NOT_ENGAGED) {
+
+	if (error == APM_NOT_ENGAGED && state != APM_STATE_READY) {
 		static int tried;
 		int eng_error;
 		if (tried++ == 0) {
-- 
cgit v1.1


From bcf0f0d233fc76e7c59c7f731caad555428d0e8d Mon Sep 17 00:00:00 2001
From: Larry Finger <Larry.Finger@lwfinger.net>
Date: Fri, 6 Jan 2006 00:12:17 -0800
Subject: [PATCH] fix cpu frequency detection in
 arch/i386/kernel/timers/timer_tsc.c::recalibrate_cpu_khz()

When we re-calibrate the frequency, it is likely that an interrupt (as for
example the main system clock) will be triggered by the system.  Therefore
the calibration may not be accurate.  This will also provide a fix to bug
#5266.

Many thanks to Larry Finger for helping resolve this issue.

Signed-off-by: Bruno Ducrot <ducrot@poupinou.org>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Dave Jones <davej@codemonkey.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/timers/timer_tsc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c
index d395e3b..47675bb 100644
--- a/arch/i386/kernel/timers/timer_tsc.c
+++ b/arch/i386/kernel/timers/timer_tsc.c
@@ -330,7 +330,9 @@ int recalibrate_cpu_khz(void)
 	unsigned int cpu_khz_old = cpu_khz;
 
 	if (cpu_has_tsc) {
+		local_irq_disable();
 		init_cpu_khz();
+		local_irq_enable();
 		cpu_data[0].loops_per_jiffy =
 		    cpufreq_scale(cpu_data[0].loops_per_jiffy,
 			          cpu_khz_old,
-- 
cgit v1.1


From 19d534842cc39df1b568722c18f96ae24fb0e136 Mon Sep 17 00:00:00 2001
From: Brian Gerst <bgerst@didntduck.org>
Date: Fri, 6 Jan 2006 00:12:18 -0800
Subject: [PATCH] mpspec: remove unneeded packed attribute

GCC 4.1 gives the following warning: include/asm/mpspec.h:79: warning:
`packed' attribute ignored for field of type `unsigned char'

The packed attribute isn't really necessary anyways so just remove it.

Signed-off-by: Brian Gerst <bgerst@didntduck.org>
Acked-by: Dave Jones <davej@codemonkey.org.uk>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/mpspec_def.h | 2 +-
 include/asm-x86_64/mpspec.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/asm-i386/mpspec_def.h b/include/asm-i386/mpspec_def.h
index a961093..76feedf 100644
--- a/include/asm-i386/mpspec_def.h
+++ b/include/asm-i386/mpspec_def.h
@@ -75,7 +75,7 @@ struct mpc_config_bus
 {
 	unsigned char mpc_type;
 	unsigned char mpc_busid;
-	unsigned char mpc_bustype[6] __attribute((packed));
+	unsigned char mpc_bustype[6];
 };
 
 /* List of Bus Type string values, Intel MP Spec. */
diff --git a/include/asm-x86_64/mpspec.h b/include/asm-x86_64/mpspec.h
index 6f8a17d..10248a9 100644
--- a/include/asm-x86_64/mpspec.h
+++ b/include/asm-x86_64/mpspec.h
@@ -76,7 +76,7 @@ struct mpc_config_bus
 {
 	unsigned char mpc_type;
 	unsigned char mpc_busid;
-	unsigned char mpc_bustype[6] __attribute((packed));
+	unsigned char mpc_bustype[6];
 };
 
 /* List of Bus Type string values, Intel MP Spec. */
-- 
cgit v1.1


From 76865c3f87e825dda0c458b02f30dd8ae64b7bdc Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Fri, 6 Jan 2006 00:12:19 -0800
Subject: [PATCH] i386: ioapic virtual wire mode fix

o Currently, during kexec reboot, IOAPIC is re-programmed back to virtual
  wire mode if there was an i8259 connected to it. This enables getting
  timer interrupts in second kernel in legacy mode.

o After putting into virtual wire mode, IOAPIC delivers the i8259 interrupts
  to CPU0. This works well for kexec but not for kdump as we might crash
  on a different CPU and second kernel will not see timer interrupts.

o This patch modifies the redirection table entry to deliver the timer
  interrupts to the cpu we are rebooting (instead of hardcoding to zero).
  This ensures that second kernel receives timer interrupts even on a
  non-boot cpu.

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Cc: Andi Kleen <ak@muc.de>
Cc: "Seth, Rohit" <rohit.seth@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/io_apic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 22c8675..7554f8f 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -1722,8 +1722,8 @@ void disable_IO_APIC(void)
 		entry.dest_mode       = 0; /* Physical */
 		entry.delivery_mode   = dest_ExtINT; /* ExtInt */
 		entry.vector          = 0;
-		entry.dest.physical.physical_dest = 0;
-
+		entry.dest.physical.physical_dest =
+					GET_APIC_ID(apic_read(APIC_ID));
 
 		/*
 		 * Add it to the IO-APIC irq-routing table:
-- 
cgit v1.1


From 766c3f94d4492ee4ec60b65693e71ee4b1d6fd68 Mon Sep 17 00:00:00 2001
From: Ben Collins <ben.collins@ubuntu.com>
Date: Fri, 6 Jan 2006 00:12:20 -0800
Subject: [PATCH] i386: Handle HP laptop rebooting properly.

Signed-off-by: Ben Collins <bcollins@ubuntu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/reboot.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c
index 2afe0f8..2fa5803 100644
--- a/arch/i386/kernel/reboot.c
+++ b/arch/i386/kernel/reboot.c
@@ -111,12 +111,12 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
 		},
 	},
-	{	/* Handle problems with rebooting on HP nc6120 */
+	{	/* Handle problems with rebooting on HP laptops */
 		.callback = set_bios_reboot,
-		.ident = "HP Compaq nc6120",
+		.ident = "HP Compaq Laptop",
 		.matches = {
 			DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
-			DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nc6120"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
 		},
 	},
 	{ }
-- 
cgit v1.1


From 1fa744e6e91a895750b9980d13fcfc5791a0cd91 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Fri, 6 Jan 2006 00:12:20 -0800
Subject: [PATCH] cpu hotplug/x86_64: disable interrupt in play_dead

With physical CPU hotplug, the CPU is hot removed and it should not receive
any interrupts.  Disabling interrupts is much safer.  This is basically what we
do in ia64 & x86.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/process.c | 5 +++--
 include/asm-x86_64/system.h  | 2 ++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 7519fc5..3060ed9 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -157,7 +157,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
 DECLARE_PER_CPU(int, cpu_state);
 
 #include <asm/nmi.h>
-/* We don't actually take CPU down, just spin without interrupts. */
+/* We halt the CPU with physical CPU hotplug */
 static inline void play_dead(void)
 {
 	idle_task_exit();
@@ -166,8 +166,9 @@ static inline void play_dead(void)
 	/* Ack it */
 	__get_cpu_var(cpu_state) = CPU_DEAD;
 
+	local_irq_disable();
 	while (1)
-		safe_halt();
+		halt();
 }
 #else
 static inline void play_dead(void)
diff --git a/include/asm-x86_64/system.h b/include/asm-x86_64/system.h
index 85348e0..b34cc2e 100644
--- a/include/asm-x86_64/system.h
+++ b/include/asm-x86_64/system.h
@@ -315,6 +315,8 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
 #define local_irq_enable()	__asm__ __volatile__("sti": : :"memory")
 /* used in the idle loop; sti takes one instruction cycle to complete */
 #define safe_halt()		__asm__ __volatile__("sti; hlt": : :"memory")
+/* used when interrupts are already enabled or to shutdown the processor */
+#define halt()			__asm__ __volatile__("hlt": : :"memory")
 
 #define irqs_disabled()			\
 ({					\
-- 
cgit v1.1


From eee45269b0f5979c70bc151c6c2f4e5f4f5ababe Mon Sep 17 00:00:00 2001
From: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Date: Fri, 6 Jan 2006 00:12:21 -0800
Subject: [PATCH] Alpha: convert to generic irq framework (generic part)

Thanks to Christoph for doing most of the work.

This allows automatic SMP IRQ affinity assignment other than the default "all
interrupts on all CPUs", which is rather expensive.  This might be useful if
the hardware can be programmed to distribute interrupts among different
CPUs, like Alpha does.

Signed-off-by: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Richard Henderson <rth@twiddle.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/irq.h | 11 +++++++++++
 kernel/irq/manage.c |  2 ++
 kernel/irq/proc.c   |  4 +++-
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/include/linux/irq.h b/include/linux/irq.h
index f04ba20..60f8bc7 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -221,6 +221,17 @@ extern void note_interrupt(unsigned int irq, irq_desc_t *desc,
 extern int can_request_irq(unsigned int irq, unsigned long irqflags);
 
 extern void init_irq_proc(void);
+
+#ifdef CONFIG_AUTO_IRQ_AFFINITY
+extern int select_smp_affinity(unsigned int irq);
+#else
+static inline int
+select_smp_affinity(unsigned int irq)
+{
+	return 1;
+}
+#endif
+
 #endif
 
 extern hw_irq_controller no_irq_type;  /* needed in every arch ? */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 81c49a4..97d5559 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -366,6 +366,8 @@ int request_irq(unsigned int irq,
 	action->next = NULL;
 	action->dev_id = dev_id;
 
+	select_smp_affinity(irq);
+
 	retval = setup_irq(irq, action);
 	if (retval)
 		kfree(action);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index f26e534..8a64a48 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -68,7 +68,9 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
 	 */
 	cpus_and(tmp, new_value, cpu_online_map);
 	if (cpus_empty(tmp))
-		return -EINVAL;
+		/* Special case for empty set - allow the architecture
+		   code to set default SMP affinity. */
+		return select_smp_affinity(irq) ? -EINVAL : full_count;
 
 	proc_set_irq_affinity(irq, new_value);
 
-- 
cgit v1.1


From 0595bf3bca9d9932a05b06dd438f40f01d27cd33 Mon Sep 17 00:00:00 2001
From: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Date: Fri, 6 Jan 2006 00:12:22 -0800
Subject: [PATCH] Alpha: convert to generic irq framework (alpha part)

Kconfig tweaks and tons of deletions.

Signed-off-by: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Richard Henderson <rth@twiddle.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/Kconfig              |  13 +
 arch/alpha/kernel/alpha_ksyms.c |   1 -
 arch/alpha/kernel/irq.c         | 630 +---------------------------------------
 include/asm-alpha/hardirq.h     |   2 +
 4 files changed, 24 insertions(+), 622 deletions(-)

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 786491f..153337f 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -40,6 +40,19 @@ config GENERIC_IOMAP
 	bool
 	default n
 
+config GENERIC_HARDIRQS
+	bool
+	default y
+
+config GENERIC_IRQ_PROBE
+	bool
+	default y
+
+config AUTO_IRQ_AFFINITY
+	bool
+	depends on SMP
+	default y
+
 source "init/Kconfig"
 
 
diff --git a/arch/alpha/kernel/alpha_ksyms.c b/arch/alpha/kernel/alpha_ksyms.c
index 24ae9a3..f3e98f8 100644
--- a/arch/alpha/kernel/alpha_ksyms.c
+++ b/arch/alpha/kernel/alpha_ksyms.c
@@ -175,7 +175,6 @@ EXPORT_SYMBOL(up);
  */
 
 #ifdef CONFIG_SMP
-EXPORT_SYMBOL(synchronize_irq);
 EXPORT_SYMBOL(flush_tlb_mm);
 EXPORT_SYMBOL(flush_tlb_range);
 EXPORT_SYMBOL(flush_tlb_page);
diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c
index b6114f5..76be5cf 100644
--- a/arch/alpha/kernel/irq.c
+++ b/arch/alpha/kernel/irq.c
@@ -32,214 +32,25 @@
 #include <asm/io.h>
 #include <asm/uaccess.h>
 
-/*
- * Controller mappings for all interrupt sources:
- */
-irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
-	[0 ... NR_IRQS-1] = {
-		.handler = &no_irq_type,
-		.lock = SPIN_LOCK_UNLOCKED
-	}
-};
-
-static void register_irq_proc(unsigned int irq);
-
 volatile unsigned long irq_err_count;
 
-/*
- * Special irq handlers.
- */
-
-irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
-{
-	return IRQ_NONE;
-}
-
-/*
- * Generic no controller code
- */
-
-static void no_irq_enable_disable(unsigned int irq) { }
-static unsigned int no_irq_startup(unsigned int irq) { return 0; }
-
-static void
-no_irq_ack(unsigned int irq)
+void ack_bad_irq(unsigned int irq)
 {
 	irq_err_count++;
 	printk(KERN_CRIT "Unexpected IRQ trap at vector %u\n", irq);
 }
 
-struct hw_interrupt_type no_irq_type = {
-	.typename	= "none",
-	.startup	= no_irq_startup,
-	.shutdown	= no_irq_enable_disable,
-	.enable		= no_irq_enable_disable,
-	.disable	= no_irq_enable_disable,
-	.ack		= no_irq_ack,
-	.end		= no_irq_enable_disable,
-};
-
-int
-handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
-		 struct irqaction *action)
-{
-	int status = 1;	/* Force the "do bottom halves" bit */
-	int ret;
-
-	do {
-		if (!(action->flags & SA_INTERRUPT))
-			local_irq_enable();
-		else
-			local_irq_disable();
-
-		ret = action->handler(irq, action->dev_id, regs);
-		if (ret == IRQ_HANDLED)
-			status |= action->flags;
-		action = action->next;
-	} while (action);
-	if (status & SA_SAMPLE_RANDOM)
-		add_interrupt_randomness(irq);
-	local_irq_disable();
-
-	return status;
-}
-
-/*
- * Generic enable/disable code: this just calls
- * down into the PIC-specific version for the actual
- * hardware disable after having gotten the irq
- * controller lock. 
- */
-void inline
-disable_irq_nosync(unsigned int irq)
-{
-	irq_desc_t *desc = irq_desc + irq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&desc->lock, flags);
-	if (!desc->depth++) {
-		desc->status |= IRQ_DISABLED;
-		desc->handler->disable(irq);
-	}
-	spin_unlock_irqrestore(&desc->lock, flags);
-}
-
-/*
- * Synchronous version of the above, making sure the IRQ is
- * no longer running on any other IRQ..
- */
-void
-disable_irq(unsigned int irq)
-{
-	disable_irq_nosync(irq);
-	synchronize_irq(irq);
-}
-
-void
-enable_irq(unsigned int irq)
-{
-	irq_desc_t *desc = irq_desc + irq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&desc->lock, flags);
-	switch (desc->depth) {
-	case 1: {
-		unsigned int status = desc->status & ~IRQ_DISABLED;
-		desc->status = status;
-		if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
-			desc->status = status | IRQ_REPLAY;
-			hw_resend_irq(desc->handler,irq);
-		}
-		desc->handler->enable(irq);
-		/* fall-through */
-	}
-	default:
-		desc->depth--;
-		break;
-	case 0:
-		printk(KERN_ERR "enable_irq() unbalanced from %p\n",
-		       __builtin_return_address(0));
-	}
-	spin_unlock_irqrestore(&desc->lock, flags);
-}
-
-int
-setup_irq(unsigned int irq, struct irqaction * new)
-{
-	int shared = 0;
-	struct irqaction *old, **p;
-	unsigned long flags;
-	irq_desc_t *desc = irq_desc + irq;
-
-        if (desc->handler == &no_irq_type)
-		return -ENOSYS;
-
-	/*
-	 * Some drivers like serial.c use request_irq() heavily,
-	 * so we have to be careful not to interfere with a
-	 * running system.
-	 */
-	if (new->flags & SA_SAMPLE_RANDOM) {
-		/*
-		 * This function might sleep, we want to call it first,
-		 * outside of the atomic block.
-		 * Yes, this might clear the entropy pool if the wrong
-		 * driver is attempted to be loaded, without actually
-		 * installing a new handler, but is this really a problem,
-		 * only the sysadmin is able to do this.
-		 */
-		rand_initialize_irq(irq);
-	}
-
-	/*
-	 * The following block of code has to be executed atomically
-	 */
-	spin_lock_irqsave(&desc->lock,flags);
-	p = &desc->action;
-	if ((old = *p) != NULL) {
-		/* Can't share interrupts unless both agree to */
-		if (!(old->flags & new->flags & SA_SHIRQ)) {
-			spin_unlock_irqrestore(&desc->lock,flags);
-			return -EBUSY;
-		}
-
-		/* add new interrupt at end of irq queue */
-		do {
-			p = &old->next;
-			old = *p;
-		} while (old);
-		shared = 1;
-	}
-
-	*p = new;
-
-	if (!shared) {
-		desc->depth = 0;
-		desc->status &=
-		    ~(IRQ_DISABLED|IRQ_AUTODETECT|IRQ_WAITING|IRQ_INPROGRESS);
-		desc->handler->startup(irq);
-	}
-	spin_unlock_irqrestore(&desc->lock,flags);
-
-	return 0;
-}
-
-static struct proc_dir_entry * root_irq_dir;
-static struct proc_dir_entry * irq_dir[NR_IRQS];
-
 #ifdef CONFIG_SMP 
-static struct proc_dir_entry * smp_affinity_entry[NR_IRQS];
 static char irq_user_affinity[NR_IRQS];
-static cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
 
-static void
-select_smp_affinity(int irq)
+int
+select_smp_affinity(unsigned int irq)
 {
 	static int last_cpu;
 	int cpu = last_cpu + 1;
 
-	if (! irq_desc[irq].handler->set_affinity || irq_user_affinity[irq])
-		return;
+	if (!irq_desc[irq].handler->set_affinity || irq_user_affinity[irq])
+		return 1;
 
 	while (!cpu_possible(cpu))
 		cpu = (cpu < (NR_CPUS-1) ? cpu + 1 : 0);
@@ -247,208 +58,10 @@ select_smp_affinity(int irq)
 
 	irq_affinity[irq] = cpumask_of_cpu(cpu);
 	irq_desc[irq].handler->set_affinity(irq, cpumask_of_cpu(cpu));
+	return 0;
 }
-
-static int
-irq_affinity_read_proc (char *page, char **start, off_t off,
-			int count, int *eof, void *data)
-{
-	int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]);
-	if (count - len < 2)
-		return -EINVAL;
-	len += sprintf(page + len, "\n");
-	return len;
-}
-
-static int
-irq_affinity_write_proc(struct file *file, const char __user *buffer,
-			unsigned long count, void *data)
-{
-	int irq = (long) data, full_count = count, err;
-	cpumask_t new_value;
-
-	if (!irq_desc[irq].handler->set_affinity)
-		return -EIO;
-
-	err = cpumask_parse(buffer, count, new_value);
-
-	/* The special value 0 means release control of the
-	   affinity to kernel.  */
-	cpus_and(new_value, new_value, cpu_online_map);
-	if (cpus_empty(new_value)) {
-		irq_user_affinity[irq] = 0;
-		select_smp_affinity(irq);
-	}
-	/* Do not allow disabling IRQs completely - it's a too easy
-	   way to make the system unusable accidentally :-) At least
-	   one online CPU still has to be targeted.  */
-	else {
-		irq_affinity[irq] = new_value;
-		irq_user_affinity[irq] = 1;
-		irq_desc[irq].handler->set_affinity(irq, new_value);
-	}
-
-	return full_count;
-}
-
 #endif /* CONFIG_SMP */
 
-#define MAX_NAMELEN 10
-
-static void
-register_irq_proc (unsigned int irq)
-{
-	char name [MAX_NAMELEN];
-
-	if (!root_irq_dir || (irq_desc[irq].handler == &no_irq_type) ||
-	    irq_dir[irq])
-		return;
-
-	memset(name, 0, MAX_NAMELEN);
-	sprintf(name, "%d", irq);
-
-	/* create /proc/irq/1234 */
-	irq_dir[irq] = proc_mkdir(name, root_irq_dir);
-
-#ifdef CONFIG_SMP 
-	if (irq_desc[irq].handler->set_affinity) {
-		struct proc_dir_entry *entry;
-		/* create /proc/irq/1234/smp_affinity */
-		entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]);
-
-		if (entry) {
-			entry->nlink = 1;
-			entry->data = (void *)(long)irq;
-			entry->read_proc = irq_affinity_read_proc;
-			entry->write_proc = irq_affinity_write_proc;
-		}
-
-		smp_affinity_entry[irq] = entry;
-	}
-#endif
-}
-
-void
-init_irq_proc (void)
-{
-	int i;
-
-	/* create /proc/irq */
-	root_irq_dir = proc_mkdir("irq", NULL);
-
-#ifdef CONFIG_SMP 
-	/* create /proc/irq/prof_cpu_mask */
-	create_prof_cpu_mask(root_irq_dir);
-#endif
-
-	/*
-	 * Create entries for all existing IRQs.
-	 */
-	for (i = 0; i < ACTUAL_NR_IRQS; i++) {
-		if (irq_desc[i].handler == &no_irq_type)
-			continue;
-		register_irq_proc(i);
-	}
-}
-
-int
-request_irq(unsigned int irq, irqreturn_t (*handler)(int, void *, struct pt_regs *),
-	    unsigned long irqflags, const char * devname, void *dev_id)
-{
-	int retval;
-	struct irqaction * action;
-
-	if (irq >= ACTUAL_NR_IRQS)
-		return -EINVAL;
-	if (!handler)
-		return -EINVAL;
-
-#if 1
-	/*
-	 * Sanity-check: shared interrupts should REALLY pass in
-	 * a real dev-ID, otherwise we'll have trouble later trying
-	 * to figure out which interrupt is which (messes up the
-	 * interrupt freeing logic etc).
-	 */
-	if ((irqflags & SA_SHIRQ) && !dev_id) {
-		printk(KERN_ERR
-		       "Bad boy: %s (at %p) called us without a dev_id!\n",
-		       devname, __builtin_return_address(0));
-	}
-#endif
-
-	action = (struct irqaction *)
-			kmalloc(sizeof(struct irqaction), GFP_KERNEL);
-	if (!action)
-		return -ENOMEM;
-
-	action->handler = handler;
-	action->flags = irqflags;
-	cpus_clear(action->mask);
-	action->name = devname;
-	action->next = NULL;
-	action->dev_id = dev_id;
-
-#ifdef CONFIG_SMP
-	select_smp_affinity(irq);
-#endif
-
-	retval = setup_irq(irq, action);
-	if (retval)
-		kfree(action);
-	return retval;
-}
-
-EXPORT_SYMBOL(request_irq);
-
-void
-free_irq(unsigned int irq, void *dev_id)
-{
-	irq_desc_t *desc;
-	struct irqaction **p;
-	unsigned long flags;
-
-	if (irq >= ACTUAL_NR_IRQS) {
-		printk(KERN_CRIT "Trying to free IRQ%d\n", irq);
-		return;
-	}
-
-	desc = irq_desc + irq;
-	spin_lock_irqsave(&desc->lock,flags);
-	p = &desc->action;
-	for (;;) {
-		struct irqaction * action = *p;
-		if (action) {
-			struct irqaction **pp = p;
-			p = &action->next;
-			if (action->dev_id != dev_id)
-				continue;
-
-			/* Found - now remove it from the list of entries.  */
-			*pp = action->next;
-			if (!desc->action) {
-				desc->status |= IRQ_DISABLED;
-				desc->handler->shutdown(irq);
-			}
-			spin_unlock_irqrestore(&desc->lock,flags);
-
-#ifdef CONFIG_SMP
-			/* Wait to make sure it's not being used on
-			   another CPU.  */
-			while (desc->status & IRQ_INPROGRESS)
-				barrier();
-#endif
-			kfree(action);
-			return;
-		}
-		printk(KERN_ERR "Trying to free free IRQ%d\n",irq);
-		spin_unlock_irqrestore(&desc->lock,flags);
-		return;
-	}
-}
-
-EXPORT_SYMBOL(free_irq);
-
 int
 show_interrupts(struct seq_file *p, void *v)
 {
@@ -531,10 +144,6 @@ handle_irq(int irq, struct pt_regs * regs)
 	 * 0 return value means that this irq is already being
 	 * handled by some other CPU. (or is disabled)
 	 */
-	int cpu = smp_processor_id();
-	irq_desc_t *desc = irq_desc + irq;
-	struct irqaction * action;
-	unsigned int status;
 	static unsigned int illegal_count=0;
 	
 	if ((unsigned) irq > ACTUAL_NR_IRQS && illegal_count < MAX_ILLEGAL_IRQS ) {
@@ -546,229 +155,8 @@ handle_irq(int irq, struct pt_regs * regs)
 	}
 
 	irq_enter();
-	kstat_cpu(cpu).irqs[irq]++;
-	spin_lock_irq(&desc->lock); /* mask also the higher prio events */
-	desc->handler->ack(irq);
-	/*
-	 * REPLAY is when Linux resends an IRQ that was dropped earlier.
-	 * WAITING is used by probe to mark irqs that are being tested.
-	 */
-	status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
-	status |= IRQ_PENDING; /* we _want_ to handle it */
-
-	/*
-	 * If the IRQ is disabled for whatever reason, we cannot
-	 * use the action we have.
-	 */
-	action = NULL;
-	if (!(status & (IRQ_DISABLED | IRQ_INPROGRESS))) {
-		action = desc->action;
-		status &= ~IRQ_PENDING; /* we commit to handling */
-		status |= IRQ_INPROGRESS; /* we are handling it */
-	}
-	desc->status = status;
-
-	/*
-	 * If there is no IRQ handler or it was disabled, exit early.
-	 * Since we set PENDING, if another processor is handling
-	 * a different instance of this same irq, the other processor
-	 * will take care of it.
-	 */
-	if (!action)
-		goto out;
-
-	/*
-	 * Edge triggered interrupts need to remember pending events.
-	 * This applies to any hw interrupts that allow a second
-	 * instance of the same irq to arrive while we are in handle_irq
-	 * or in the handler. But the code here only handles the _second_
-	 * instance of the irq, not the third or fourth. So it is mostly
-	 * useful for irq hardware that does not mask cleanly in an
-	 * SMP environment.
-	 */
-	for (;;) {
-		spin_unlock(&desc->lock);
-		handle_IRQ_event(irq, regs, action);
-		spin_lock(&desc->lock);
-		
-		if (!(desc->status & IRQ_PENDING)
-		    || (desc->status & IRQ_LEVEL))
-			break;
-		desc->status &= ~IRQ_PENDING;
-	}
-	desc->status &= ~IRQ_INPROGRESS;
-out:
-	/*
-	 * The ->end() handler has to deal with interrupts which got
-	 * disabled while the handler was running.
-	 */
-	desc->handler->end(irq);
-	spin_unlock(&desc->lock);
-
+	local_irq_disable();
+	__do_IRQ(irq, regs);
+	local_irq_enable();
 	irq_exit();
 }
-
-/*
- * IRQ autodetection code..
- *
- * This depends on the fact that any interrupt that
- * comes in on to an unassigned handler will get stuck
- * with "IRQ_WAITING" cleared and the interrupt
- * disabled.
- */
-unsigned long
-probe_irq_on(void)
-{
-	int i;
-	irq_desc_t *desc;
-	unsigned long delay;
-	unsigned long val;
-
-	/* Something may have generated an irq long ago and we want to
-	   flush such a longstanding irq before considering it as spurious. */
-	for (i = NR_IRQS-1; i >= 0; i--) {
-		desc = irq_desc + i;
-
-		spin_lock_irq(&desc->lock);
-		if (!irq_desc[i].action) 
-			irq_desc[i].handler->startup(i);
-		spin_unlock_irq(&desc->lock);
-	}
-
-	/* Wait for longstanding interrupts to trigger. */
-	for (delay = jiffies + HZ/50; time_after(delay, jiffies); )
-		/* about 20ms delay */ barrier();
-
-	/* enable any unassigned irqs (we must startup again here because
-	   if a longstanding irq happened in the previous stage, it may have
-	   masked itself) first, enable any unassigned irqs. */
-	for (i = NR_IRQS-1; i >= 0; i--) {
-		desc = irq_desc + i;
-
-		spin_lock_irq(&desc->lock);
-		if (!desc->action) {
-			desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
-			if (desc->handler->startup(i))
-				desc->status |= IRQ_PENDING;
-		}
-		spin_unlock_irq(&desc->lock);
-	}
-
-	/*
-	 * Wait for spurious interrupts to trigger
-	 */
-	for (delay = jiffies + HZ/10; time_after(delay, jiffies); )
-		/* about 100ms delay */ barrier();
-
-	/*
-	 * Now filter out any obviously spurious interrupts
-	 */
-	val = 0;
-	for (i=0; i<NR_IRQS; i++) {
-		irq_desc_t *desc = irq_desc + i;
-		unsigned int status;
-
-		spin_lock_irq(&desc->lock);
-		status = desc->status;
-
-		if (status & IRQ_AUTODETECT) {
-			/* It triggered already - consider it spurious. */
-			if (!(status & IRQ_WAITING)) {
-				desc->status = status & ~IRQ_AUTODETECT;
-				desc->handler->shutdown(i);
-			} else
-				if (i < 32)
-					val |= 1 << i;
-		}
-		spin_unlock_irq(&desc->lock);
-	}
-
-	return val;
-}
-
-EXPORT_SYMBOL(probe_irq_on);
-
-/*
- * Return a mask of triggered interrupts (this
- * can handle only legacy ISA interrupts).
- */
-unsigned int
-probe_irq_mask(unsigned long val)
-{
-	int i;
-	unsigned int mask;
-
-	mask = 0;
-	for (i = 0; i < NR_IRQS; i++) {
-		irq_desc_t *desc = irq_desc + i;
-		unsigned int status;
-
-		spin_lock_irq(&desc->lock);
-		status = desc->status;
-
-		if (status & IRQ_AUTODETECT) {
-			/* We only react to ISA interrupts */
-			if (!(status & IRQ_WAITING)) {
-				if (i < 16)
-					mask |= 1 << i;
-			}
-
-			desc->status = status & ~IRQ_AUTODETECT;
-			desc->handler->shutdown(i);
-		}
-		spin_unlock_irq(&desc->lock);
-	}
-
-	return mask & val;
-}
-
-/*
- * Get the result of the IRQ probe.. A negative result means that
- * we have several candidates (but we return the lowest-numbered
- * one).
- */
-
-int
-probe_irq_off(unsigned long val)
-{
-	int i, irq_found, nr_irqs;
-
-	nr_irqs = 0;
-	irq_found = 0;
-	for (i=0; i<NR_IRQS; i++) {
-		irq_desc_t *desc = irq_desc + i;
-		unsigned int status;
-
-		spin_lock_irq(&desc->lock);
-		status = desc->status;
-
-		if (status & IRQ_AUTODETECT) {
-			if (!(status & IRQ_WAITING)) {
-				if (!nr_irqs)
-					irq_found = i;
-				nr_irqs++;
-			}
-			desc->status = status & ~IRQ_AUTODETECT;
-			desc->handler->shutdown(i);
-		}
-		spin_unlock_irq(&desc->lock);
-	}
-
-	if (nr_irqs > 1)
-		irq_found = -irq_found;
-	return irq_found;
-}
-
-EXPORT_SYMBOL(probe_irq_off);
-
-#ifdef CONFIG_SMP
-void synchronize_irq(unsigned int irq)
-{
-        /* is there anything to synchronize with? */
-	if (!irq_desc[irq].action)
-		return;
-
-	while (irq_desc[irq].status & IRQ_INPROGRESS)
-		barrier();
-}
-#endif
diff --git a/include/asm-alpha/hardirq.h b/include/asm-alpha/hardirq.h
index c0593f9..7bb6a36 100644
--- a/include/asm-alpha/hardirq.h
+++ b/include/asm-alpha/hardirq.h
@@ -13,6 +13,8 @@ typedef struct {
 
 #include <linux/irq_cpustat.h>	/* Standard mappings for irq_cpustat_t above */
 
+void ack_bad_irq(unsigned int irq);
+
 #define HARDIRQ_BITS	12
 
 /*
-- 
cgit v1.1


From f2d97f02961e8b1f8a24befb88ab0e5c886586ff Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:12:24 -0800
Subject: [PATCH] swsusp: remove encryption

This patch removes the image encryption from swsusp.  The encryption is only
used as a substitute for zeroing the image after resume, in order to prevent
someone from reading confidential data from it in the future; it does not
protect the image from being read by an unauthorized person before resume.
The functionality it provides really belongs in user space and will possibly
be reimplemented after the swap-handling functionality of swsusp is moved to
user space.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swsusp.c | 163 ++------------------------------------------------
 1 file changed, 4 insertions(+), 159 deletions(-)

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index c05f46e..bd3097c 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -30,9 +30,6 @@
  * Alex Badea <vampire@go.ro>:
  * Fixed runaway init
  *
- * Andreas Steinmetz <ast@domdv.de>:
- * Added encrypted suspend option
- *
  * More state savers are welcome. Especially for the scsi layer...
  *
  * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
@@ -67,10 +64,6 @@
 #include <asm/tlbflush.h>
 #include <asm/io.h>
 
-#include <linux/random.h>
-#include <linux/crypto.h>
-#include <asm/scatterlist.h>
-
 #include "power.h"
 
 #ifdef CONFIG_HIGHMEM
@@ -81,10 +74,6 @@ static int save_highmem(void) { return 0; }
 static int restore_highmem(void) { return 0; }
 #endif
 
-#define CIPHER "aes"
-#define MAXKEY 32
-#define MAXIV  32
-
 extern char resume_file[];
 
 /* Local variables that should not be affected by save */
@@ -102,8 +91,7 @@ suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
 #define SWSUSP_SIG	"S1SUSPEND"
 
 static struct swsusp_header {
-	char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)];
-	u8 key_iv[MAXKEY+MAXIV];
+	char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
 	swp_entry_t swsusp_info;
 	char	orig_sig[10];
 	char	sig[10];
@@ -123,131 +111,6 @@ static struct swsusp_info swsusp_info;
 static unsigned short swapfile_used[MAX_SWAPFILES];
 static unsigned short root_swap;
 
-static int write_page(unsigned long addr, swp_entry_t *loc);
-static int bio_read_page(pgoff_t page_off, void *page);
-
-static u8 key_iv[MAXKEY+MAXIV];
-
-#ifdef CONFIG_SWSUSP_ENCRYPT
-
-static int crypto_init(int mode, void **mem)
-{
-	int error = 0;
-	int len;
-	char *modemsg;
-	struct crypto_tfm *tfm;
-
-	modemsg = mode ? "suspend not possible" : "resume not possible";
-
-	tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC);
-	if(!tfm) {
-		printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg);
-		error = -EINVAL;
-		goto out;
-	}
-
-	if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) {
-		printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg);
-		error = -ENOKEY;
-		goto fail;
-	}
-
-	if (mode)
-		get_random_bytes(key_iv, MAXKEY+MAXIV);
-
-	len = crypto_tfm_alg_max_keysize(tfm);
-	if (len > MAXKEY)
-		len = MAXKEY;
-
-	if (crypto_cipher_setkey(tfm, key_iv, len)) {
-		printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg);
-		error = -EKEYREJECTED;
-		goto fail;
-	}
-
-	len = crypto_tfm_alg_ivsize(tfm);
-
-	if (MAXIV < len) {
-		printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg);
-		error = -EOVERFLOW;
-		goto fail;
-	}
-
-	crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len);
-
-	*mem=(void *)tfm;
-
-	goto out;
-
-fail:	crypto_free_tfm(tfm);
-out:	return error;
-}
-
-static __inline__ void crypto_exit(void *mem)
-{
-	crypto_free_tfm((struct crypto_tfm *)mem);
-}
-
-static __inline__ int crypto_write(struct pbe *p, void *mem)
-{
-	int error = 0;
-	struct scatterlist src, dst;
-
-	src.page   = virt_to_page(p->address);
-	src.offset = 0;
-	src.length = PAGE_SIZE;
-	dst.page   = virt_to_page((void *)&swsusp_header);
-	dst.offset = 0;
-	dst.length = PAGE_SIZE;
-
-	error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src,
-					PAGE_SIZE);
-
-	if (!error)
-		error = write_page((unsigned long)&swsusp_header,
-				&(p->swap_address));
-	return error;
-}
-
-static __inline__ int crypto_read(struct pbe *p, void *mem)
-{
-	int error = 0;
-	struct scatterlist src, dst;
-
-	error = bio_read_page(swp_offset(p->swap_address), (void *)p->address);
-	if (!error) {
-		src.offset = 0;
-		src.length = PAGE_SIZE;
-		dst.offset = 0;
-		dst.length = PAGE_SIZE;
-		src.page = dst.page = virt_to_page((void *)p->address);
-
-		error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst,
-						&src, PAGE_SIZE);
-	}
-	return error;
-}
-#else
-static __inline__ int crypto_init(int mode, void *mem)
-{
-	return 0;
-}
-
-static __inline__ void crypto_exit(void *mem)
-{
-}
-
-static __inline__ int crypto_write(struct pbe *p, void *mem)
-{
-	return write_page(p->address, &(p->swap_address));
-}
-
-static __inline__ int crypto_read(struct pbe *p, void *mem)
-{
-	return bio_read_page(swp_offset(p->swap_address), (void *)p->address);
-}
-#endif
-
 static int mark_swapfiles(swp_entry_t prev)
 {
 	int error;
@@ -259,7 +122,6 @@ static int mark_swapfiles(swp_entry_t prev)
 	    !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
 		memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
 		memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
-		memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV);
 		swsusp_header.swsusp_info = prev;
 		error = rw_swap_page_sync(WRITE,
 					  swp_entry(root_swap, 0),
@@ -405,10 +267,6 @@ static int data_write(void)
 	int error = 0, i = 0;
 	unsigned int mod = nr_copy_pages / 100;
 	struct pbe *p;
-	void *tfm;
-
-	if ((error = crypto_init(1, &tfm)))
-		return error;
 
 	if (!mod)
 		mod = 1;
@@ -417,14 +275,11 @@ static int data_write(void)
 	for_each_pbe (p, pagedir_nosave) {
 		if (!(i%mod))
 			printk( "\b\b\b\b%3d%%", i / mod );
-		if ((error = crypto_write(p, tfm))) {
-			crypto_exit(tfm);
+		if ((error = write_page(p->address, &p->swap_address)))
 			return error;
-		}
 		i++;
 	}
 	printk("\b\b\b\bdone\n");
-	crypto_exit(tfm);
 	return error;
 }
 
@@ -550,7 +405,6 @@ static int write_suspend_image(void)
 	if ((error = close_swap()))
 		goto FreePagedir;
  Done:
-	memset(key_iv, 0, MAXKEY+MAXIV);
 	return error;
  FreePagedir:
 	free_pagedir_entries();
@@ -812,8 +666,6 @@ static int check_sig(void)
 		return error;
 	if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
 		memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
-		memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV);
-		memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV);
 
 		/*
 		 * Reset swap signature now.
@@ -840,10 +692,6 @@ static int data_read(struct pbe *pblist)
 	int error = 0;
 	int i = 0;
 	int mod = swsusp_info.image_pages / 100;
-	void *tfm;
-
-	if ((error = crypto_init(0, &tfm)))
-		return error;
 
 	if (!mod)
 		mod = 1;
@@ -855,15 +703,13 @@ static int data_read(struct pbe *pblist)
 		if (!(i % mod))
 			printk("\b\b\b\b%3d%%", i / mod);
 
-		if ((error = crypto_read(p, tfm))) {
-			crypto_exit(tfm);
+		if ((error = bio_read_page(swp_offset(p->swap_address),
+						(void *)p->address)))
 			return error;
-		}
 
 		i++;
 	}
 	printk("\b\b\b\bdone\n");
-	crypto_exit(tfm);
 	return error;
 }
 
@@ -986,7 +832,6 @@ int swsusp_read(void)
 
 	error = read_suspend_image();
 	blkdev_put(resume_bdev);
-	memset(key_iv, 0, MAXKEY+MAXIV);
 
 	if (!error)
 		pr_debug("swsusp: Reading resume file was successful\n");
-- 
cgit v1.1


From 7088a5c00103ef48782d6c359cd12b13a10666e6 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:13:05 -0800
Subject: [PATCH] swsusp: introduce the swap map structure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch introduces the swap map structure that can be used by swsusp for
keeping track of data pages written to the swap.  The structure itself is
described in a comment within the patch.

The overall idea is to reduce the amount of metadata written to the swap and
to write and read the image pages sequentially, in a file-like way.  This
makes the swap-handling part of swsusp fairly independent of its
snapshot-handling part and will hopefully allow us to completely separate
these two parts in the future.

This patch is needed to remove the suspend image size limit imposed by the
limited size of the swsusp_info structure, which is essential for x86-64
systems with more than 512 MB of RAM.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/suspend.h |   6 +-
 kernel/power/disk.c     |   8 +-
 kernel/power/power.h    |  13 +-
 kernel/power/snapshot.c |  14 +-
 kernel/power/swsusp.c   | 558 ++++++++++++++++++++++++++++++++++--------------
 5 files changed, 418 insertions(+), 181 deletions(-)

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index a61c04f..33bbaea 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -14,11 +14,7 @@
 typedef struct pbe {
 	unsigned long address;		/* address of the copy */
 	unsigned long orig_address;	/* original address of page */
-	swp_entry_t swap_address;	
-
-	struct pbe *next;	/* also used as scratch space at
-				 * end of page (see link, diskpage)
-				 */
+	struct pbe *next;
 } suspend_pagedir_t;
 
 #define for_each_pbe(pbe, pblist) \
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 4d944b2..76a5131 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -25,9 +25,9 @@
 extern suspend_disk_method_t pm_disk_mode;
 
 extern int swsusp_suspend(void);
-extern int swsusp_write(void);
+extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages);
 extern int swsusp_check(void);
-extern int swsusp_read(void);
+extern int swsusp_read(struct pbe **pblist_ptr);
 extern void swsusp_close(void);
 extern int swsusp_resume(void);
 
@@ -176,7 +176,7 @@ int pm_suspend_disk(void)
 	if (in_suspend) {
 		device_resume();
 		pr_debug("PM: writing image.\n");
-		error = swsusp_write();
+		error = swsusp_write(pagedir_nosave, nr_copy_pages);
 		if (!error)
 			power_down(pm_disk_mode);
 		else {
@@ -247,7 +247,7 @@ static int software_resume(void)
 
 	pr_debug("PM: Reading swsusp image.\n");
 
-	if ((error = swsusp_read())) {
+	if ((error = swsusp_read(&pagedir_nosave))) {
 		swsusp_free();
 		goto Thaw;
 	}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 6c042b5..977877c 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -9,19 +9,14 @@
 #define SUSPEND_CONSOLE	(MAX_NR_CONSOLES-1)
 #endif
 
-#define MAX_PBES	((PAGE_SIZE - sizeof(struct new_utsname) \
-			- 4 - 3*sizeof(unsigned long) - sizeof(int) \
-			- sizeof(void *)) / sizeof(swp_entry_t))
-
 struct swsusp_info {
 	struct new_utsname	uts;
 	u32			version_code;
 	unsigned long		num_physpages;
 	int			cpus;
 	unsigned long		image_pages;
-	unsigned long		pagedir_pages;
-	suspend_pagedir_t	* suspend_pagedir;
-	swp_entry_t		pagedir[MAX_PBES];
+	unsigned long		pages;
+	swp_entry_t		start;
 } __attribute__((aligned(PAGE_SIZE)));
 
 
@@ -67,6 +62,8 @@ extern asmlinkage int swsusp_arch_resume(void);
 
 extern void free_pagedir(struct pbe *pblist);
 extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed);
-extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages);
 extern void swsusp_free(void);
 extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed);
+extern unsigned int snapshot_nr_pages(void);
+extern struct pbe *snapshot_pblist(void);
+extern void snapshot_pblist_set(struct pbe *pblist);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 4a6dbce..152d56c 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -33,6 +33,9 @@
 
 #include "power.h"
 
+struct pbe *pagedir_nosave;
+unsigned int nr_copy_pages;
+
 #ifdef CONFIG_HIGHMEM
 struct highmem_page {
 	char *data;
@@ -244,7 +247,7 @@ static inline void fill_pb_page(struct pbe *pbpage)
  *	of memory pages allocated with alloc_pagedir()
  */
 
-void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
+static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
 {
 	struct pbe *pbpage, *p;
 	unsigned int num = PBES_PER_PAGE;
@@ -261,7 +264,6 @@ void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
 			p->next = p + 1;
 		p->next = NULL;
 	}
-	pr_debug("create_pbe_list(): initialized %d PBEs\n", num);
 }
 
 /**
@@ -332,7 +334,8 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
 	if (!pbe) { /* get_zeroed_page() failed */
 		free_pagedir(pblist);
 		pblist = NULL;
-        }
+        } else
+        	create_pbe_list(pblist, nr_pages);
 	return pblist;
 }
 
@@ -395,7 +398,6 @@ static struct pbe *swsusp_alloc(unsigned int nr_pages)
 		printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
 		return NULL;
 	}
-	create_pbe_list(pblist, nr_pages);
 
 	if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) {
 		printk(KERN_ERR "suspend: Allocating image pages failed.\n");
@@ -421,10 +423,6 @@ asmlinkage int swsusp_save(void)
 		 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
 		 PAGES_FOR_IO, nr_free_pages());
 
-	/* This is needed because of the fixed size of swsusp_info */
-	if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE)
-		return -ENOSPC;
-
 	if (!enough_free_mem(nr_pages)) {
 		printk(KERN_ERR "swsusp: Not enough free memory\n");
 		return -ENOMEM;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index bd3097c..b09bd7c 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -30,6 +30,9 @@
  * Alex Badea <vampire@go.ro>:
  * Fixed runaway init
  *
+ * Rafael J. Wysocki <rjw@sisk.pl>
+ * Added the swap map data structure and reworked the handling of swap
+ *
  * More state savers are welcome. Especially for the scsi layer...
  *
  * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
@@ -76,18 +79,6 @@ static int restore_highmem(void) { return 0; }
 
 extern char resume_file[];
 
-/* Local variables that should not be affected by save */
-unsigned int nr_copy_pages __nosavedata = 0;
-
-/* Suspend pagedir is allocated before final copy, therefore it
-   must be freed after resume
-
-   Warning: this is even more evil than it seems. Pagedirs this file
-   talks about are completely different from page directories used by
-   MMU hardware.
- */
-suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
-
 #define SWSUSP_SIG	"S1SUSPEND"
 
 static struct swsusp_header {
@@ -238,48 +229,205 @@ static int write_page(unsigned long addr, swp_entry_t *loc)
 }
 
 /**
- *	data_free - Free the swap entries used by the saved image.
+ *	Swap map-handling functions
+ *
+ *	The swap map is a data structure used for keeping track of each page
+ *	written to the swap.  It consists of many swap_map_page structures
+ *	that each contain an array of MAP_PAGE_SIZE swap entries.
+ *	These structures are linked together with the help of either the
+ *	.next (in memory) or the .next_swap (in swap) member.
  *
- *	Walk the list of used swap entries and free each one.
- *	This is only used for cleanup when suspend fails.
+ *	The swap map is created during suspend.  At that time we need to keep
+ *	it in memory, because we have to free all of the allocated swap
+ *	entries if an error occurs.  The memory needed is preallocated
+ *	so that we know in advance if there's enough of it.
+ *
+ *	The first swap_map_page structure is filled with the swap entries that
+ *	correspond to the first MAP_PAGE_SIZE data pages written to swap and
+ *	so on.  After all of the data pages have been written, the order
+ *	of the swap_map_page structures in the map is reversed so that they
+ *	can be read from swap in the original order.  This causes the data
+ *	pages to be loaded in exactly the same order in which they have been
+ *	saved.
+ *
+ *	During resume we only need to use one swap_map_page structure
+ *	at a time, which means that we only need to use two memory pages for
+ *	reading the image - one for reading the swap_map_page structures
+ *	and the second for reading the data pages from swap.
  */
-static void data_free(void)
+
+#define MAP_PAGE_SIZE	((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \
+			/ sizeof(swp_entry_t))
+
+struct swap_map_page {
+	swp_entry_t		entries[MAP_PAGE_SIZE];
+	swp_entry_t		next_swap;
+	struct swap_map_page	*next;
+};
+
+static inline void free_swap_map(struct swap_map_page *swap_map)
 {
-	swp_entry_t entry;
-	struct pbe *p;
+	struct swap_map_page *swp;
 
-	for_each_pbe (p, pagedir_nosave) {
-		entry = p->swap_address;
-		if (entry.val)
-			swap_free(entry);
-		else
-			break;
+	while (swap_map) {
+		swp = swap_map->next;
+		free_page((unsigned long)swap_map);
+		swap_map = swp;
+	}
+}
+
+static struct swap_map_page *alloc_swap_map(unsigned int nr_pages)
+{
+	struct swap_map_page *swap_map, *swp;
+	unsigned n = 0;
+
+	if (!nr_pages)
+		return NULL;
+
+	pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages);
+	swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
+	swp = swap_map;
+	for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) {
+		swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
+		swp = swp->next;
+		if (!swp) {
+			free_swap_map(swap_map);
+			return NULL;
+		}
 	}
+	return swap_map;
 }
 
 /**
- *	data_write - Write saved image to swap.
- *
- *	Walk the list of pages in the image and sync each one to swap.
+ *	reverse_swap_map - reverse the order of pages in the swap map
+ *	@swap_map
  */
-static int data_write(void)
+
+static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map)
 {
-	int error = 0, i = 0;
-	unsigned int mod = nr_copy_pages / 100;
-	struct pbe *p;
+	struct swap_map_page *prev, *next;
+
+	prev = NULL;
+	while (swap_map) {
+		next = swap_map->next;
+		swap_map->next = prev;
+		prev = swap_map;
+		swap_map = next;
+	}
+	return prev;
+}
 
-	if (!mod)
-		mod = 1;
+/**
+ *	free_swap_map_entries - free the swap entries allocated to store
+ *	the swap map @swap_map (this is only called in case of an error)
+ */
+static inline void free_swap_map_entries(struct swap_map_page *swap_map)
+{
+	while (swap_map) {
+		if (swap_map->next_swap.val)
+			swap_free(swap_map->next_swap);
+		swap_map = swap_map->next;
+	}
+}
 
-	printk( "Writing data to swap (%d pages)...     ", nr_copy_pages );
-	for_each_pbe (p, pagedir_nosave) {
-		if (!(i%mod))
-			printk( "\b\b\b\b%3d%%", i / mod );
-		if ((error = write_page(p->address, &p->swap_address)))
+/**
+ *	save_swap_map - save the swap map used for tracing the data pages
+ *	stored in the swap
+ */
+
+static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start)
+{
+	swp_entry_t entry = (swp_entry_t){0};
+	int error;
+
+	while (swap_map) {
+		swap_map->next_swap = entry;
+		if ((error = write_page((unsigned long)swap_map, &entry)))
 			return error;
-		i++;
+		swap_map = swap_map->next;
 	}
-	printk("\b\b\b\bdone\n");
+	*start = entry;
+	return 0;
+}
+
+/**
+ *	free_image_entries - free the swap entries allocated to store
+ *	the image data pages (this is only called in case of an error)
+ */
+
+static inline void free_image_entries(struct swap_map_page *swp)
+{
+	unsigned k;
+
+	while (swp) {
+		for (k = 0; k < MAP_PAGE_SIZE; k++)
+			if (swp->entries[k].val)
+				swap_free(swp->entries[k]);
+		swp = swp->next;
+	}
+}
+
+/**
+ *	The swap_map_handle structure is used for handling the swap map in
+ *	a file-like way
+ */
+
+struct swap_map_handle {
+	struct swap_map_page *cur;
+	unsigned int k;
+};
+
+static inline void init_swap_map_handle(struct swap_map_handle *handle,
+                                        struct swap_map_page *map)
+{
+	handle->cur = map;
+	handle->k = 0;
+}
+
+static inline int swap_map_write_page(struct swap_map_handle *handle,
+                                      unsigned long addr)
+{
+	int error;
+
+	error = write_page(addr, handle->cur->entries + handle->k);
+	if (error)
+		return error;
+	if (++handle->k >= MAP_PAGE_SIZE) {
+		handle->cur = handle->cur->next;
+		handle->k = 0;
+	}
+	return 0;
+}
+
+/**
+ *	save_image_data - save the data pages pointed to by the PBEs
+ *	from the list @pblist using the swap map handle @handle
+ *	(assume there are @nr_pages data pages to save)
+ */
+
+static int save_image_data(struct pbe *pblist,
+                           struct swap_map_handle *handle,
+                           unsigned int nr_pages)
+{
+	unsigned int m;
+	struct pbe *p;
+	int error = 0;
+
+	printk("Saving image data pages (%u pages) ...     ", nr_pages);
+	m = nr_pages / 100;
+	if (!m)
+		m = 1;
+	nr_pages = 0;
+	for_each_pbe (p, pblist) {
+		error = swap_map_write_page(handle, p->address);
+		if (error)
+			break;
+		if (!(nr_pages % m))
+			printk("\b\b\b\b%3d%%", nr_pages / m);
+		nr_pages++;
+	}
+	if (!error)
+		printk("\b\b\b\bdone\n");
 	return error;
 }
 
@@ -295,19 +443,20 @@ static void dump_info(void)
 	pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
 	pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
 	pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
-	pr_debug(" swsusp: Pagedir: %ld Pages\n",swsusp_info.pagedir_pages);
+	pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages);
 }
 
-static void init_header(void)
+static void init_header(unsigned int nr_pages)
 {
 	memset(&swsusp_info, 0, sizeof(swsusp_info));
 	swsusp_info.version_code = LINUX_VERSION_CODE;
 	swsusp_info.num_physpages = num_physpages;
 	memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
 
-	swsusp_info.suspend_pagedir = pagedir_nosave;
 	swsusp_info.cpus = num_online_cpus();
-	swsusp_info.image_pages = nr_copy_pages;
+	swsusp_info.image_pages = nr_pages;
+	swsusp_info.pages = nr_pages +
+		((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT);
 }
 
 static int close_swap(void)
@@ -326,39 +475,53 @@ static int close_swap(void)
 }
 
 /**
- *	free_pagedir_entries - Free pages used by the page directory.
- *
- *	This is used during suspend for error recovery.
+ *	pack_orig_addresses - the .orig_address fields of the PBEs from the
+ *	list starting at @pbe are stored in the array @buf[] (1 page)
  */
 
-static void free_pagedir_entries(void)
+static inline struct pbe *pack_orig_addresses(unsigned long *buf,
+                                              struct pbe *pbe)
 {
-	int i;
+	int j;
 
-	for (i = 0; i < swsusp_info.pagedir_pages; i++)
-		swap_free(swsusp_info.pagedir[i]);
+	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
+		buf[j] = pbe->orig_address;
+		pbe = pbe->next;
+	}
+	if (!pbe)
+		for (; j < PAGE_SIZE / sizeof(long); j++)
+			buf[j] = 0;
+	return pbe;
 }
 
-
 /**
- *	write_pagedir - Write the array of pages holding the page directory.
- *	@last:	Last swap entry we write (needed for header).
+ *	save_image_metadata - save the .orig_address fields of the PBEs
+ *	from the list @pblist using the swap map handle @handle
  */
 
-static int write_pagedir(void)
+static int save_image_metadata(struct pbe *pblist,
+                               struct swap_map_handle *handle)
 {
-	int error = 0;
+	unsigned long *buf;
 	unsigned int n = 0;
-	struct pbe *pbe;
+	struct pbe *p;
+	int error = 0;
 
-	printk( "Writing pagedir...");
-	for_each_pb_page (pbe, pagedir_nosave) {
-		if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++])))
-			return error;
+	printk("Saving image metadata ... ");
+	buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
+	if (!buf)
+		return -ENOMEM;
+	p = pblist;
+	while (p) {
+		p = pack_orig_addresses(buf, p);
+		error = swap_map_write_page(handle, (unsigned long)buf);
+		if (error)
+			break;
+		n++;
 	}
-
-	swsusp_info.pagedir_pages = n;
-	printk("done (%u pages)\n", n);
+	free_page((unsigned long)buf);
+	if (!error)
+		printk("done (%u pages saved)\n", n);
 	return error;
 }
 
@@ -384,33 +547,48 @@ static int enough_swap(unsigned int nr_pages)
 
 /**
  *	write_suspend_image - Write entire image and metadata.
- *
  */
-static int write_suspend_image(void)
+static int write_suspend_image(struct pbe *pblist, unsigned int nr_pages)
 {
+	struct swap_map_page *swap_map;
+	struct swap_map_handle handle;
 	int error;
 
-	if (!enough_swap(nr_copy_pages)) {
+	if (!enough_swap(nr_pages)) {
 		printk(KERN_ERR "swsusp: Not enough free swap\n");
 		return -ENOSPC;
 	}
 
-	init_header();
-	if ((error = data_write()))
-		goto FreeData;
+	init_header(nr_pages);
+	swap_map = alloc_swap_map(swsusp_info.pages);
+	if (!swap_map)
+		return -ENOMEM;
+	init_swap_map_handle(&handle, swap_map);
 
-	if ((error = write_pagedir()))
-		goto FreePagedir;
+	error = save_image_metadata(pblist, &handle);
+	if (!error)
+		error = save_image_data(pblist, &handle, nr_pages);
+	if (error)
+		goto Free_image_entries;
 
-	if ((error = close_swap()))
-		goto FreePagedir;
- Done:
+	swap_map = reverse_swap_map(swap_map);
+	error = save_swap_map(swap_map, &swsusp_info.start);
+	if (error)
+		goto Free_map_entries;
+
+	error = close_swap();
+	if (error)
+		goto Free_map_entries;
+
+Free_swap_map:
+	free_swap_map(swap_map);
 	return error;
- FreePagedir:
-	free_pagedir_entries();
- FreeData:
-	data_free();
-	goto Done;
+
+Free_map_entries:
+	free_swap_map_entries(swap_map);
+Free_image_entries:
+	free_image_entries(swap_map);
+	goto Free_swap_map;
 }
 
 /* It is important _NOT_ to umount filesystems at this point. We want
@@ -418,7 +596,7 @@ static int write_suspend_image(void)
  * filesystem clean: it is not. (And it does not matter, if we resume
  * correctly, we'll mark system clean, anyway.)
  */
-int swsusp_write(void)
+int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
 {
 	int error;
 
@@ -427,14 +605,12 @@ int swsusp_write(void)
 		return error;
 	}
 	lock_swapdevices();
-	error = write_suspend_image();
+	error = write_suspend_image(pblist, nr_pages);
 	/* This will unlock ignored swap devices since writing is finished */
 	lock_swapdevices();
 	return error;
 }
 
-
-
 int swsusp_suspend(void)
 {
 	int error;
@@ -531,7 +707,6 @@ static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
 	/* We assume both lists contain the same number of elements */
 	while (src) {
 		dst->orig_address = src->orig_address;
-		dst->swap_address = src->swap_address;
 		dst = dst->next;
 		src = src->next;
 	}
@@ -611,6 +786,61 @@ static int bio_write_page(pgoff_t page_off, void *page)
 	return submit(WRITE, page_off, page);
 }
 
+/**
+ *	The following functions allow us to read data using a swap map
+ *	in a file-like way
+ */
+
+static inline void release_swap_map_reader(struct swap_map_handle *handle)
+{
+	if (handle->cur)
+		free_page((unsigned long)handle->cur);
+	handle->cur = NULL;
+}
+
+static inline int get_swap_map_reader(struct swap_map_handle *handle,
+                                      swp_entry_t start)
+{
+	int error;
+
+	if (!swp_offset(start))
+		return -EINVAL;
+	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
+	if (!handle->cur)
+		return -ENOMEM;
+	error = bio_read_page(swp_offset(start), handle->cur);
+	if (error) {
+		release_swap_map_reader(handle);
+		return error;
+	}
+	handle->k = 0;
+	return 0;
+}
+
+static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf)
+{
+	unsigned long offset;
+	int error;
+
+	if (!handle->cur)
+		return -EINVAL;
+	offset = swp_offset(handle->cur->entries[handle->k]);
+	if (!offset)
+		return -EINVAL;
+	error = bio_read_page(offset, buf);
+	if (error)
+		return error;
+	if (++handle->k >= MAP_PAGE_SIZE) {
+		handle->k = 0;
+		offset = swp_offset(handle->cur->next_swap);
+		if (!offset)
+			release_swap_map_reader(handle);
+		else
+			error = bio_read_page(offset, handle->cur);
+	}
+	return error;
+}
+
 /*
  * Sanity check if this image makes sense with this kernel/swap context
  * I really don't think that it's foolproof but more than nothing..
@@ -639,7 +869,6 @@ static const char *sanity_check(void)
 	return NULL;
 }
 
-
 static int check_header(void)
 {
 	const char *reason = NULL;
@@ -653,7 +882,6 @@ static int check_header(void)
 		printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason);
 		return -EPERM;
 	}
-	nr_copy_pages = swsusp_info.image_pages;
 	return error;
 }
 
@@ -680,75 +908,88 @@ static int check_sig(void)
 }
 
 /**
- *	data_read - Read image pages from swap.
- *
- *	You do not need to check for overlaps, check_pagedir()
- *	already did that.
+ *	load_image_data - load the image data using the swap map handle
+ *	@handle and store them using the page backup list @pblist
+ *	(assume there are @nr_pages pages to load)
  */
 
-static int data_read(struct pbe *pblist)
+static int load_image_data(struct pbe *pblist,
+                           struct swap_map_handle *handle,
+                           unsigned int nr_pages)
 {
+	int error;
+	unsigned int m;
 	struct pbe *p;
-	int error = 0;
-	int i = 0;
-	int mod = swsusp_info.image_pages / 100;
-
-	if (!mod)
-		mod = 1;
-
-	printk("swsusp: Reading image data (%lu pages):     ",
-			swsusp_info.image_pages);
-
-	for_each_pbe (p, pblist) {
-		if (!(i % mod))
-			printk("\b\b\b\b%3d%%", i / mod);
 
-		if ((error = bio_read_page(swp_offset(p->swap_address),
-						(void *)p->address)))
-			return error;
-
-		i++;
+	if (!pblist)
+		return -EINVAL;
+	printk("Loading image data pages (%u pages) ...     ", nr_pages);
+	m = nr_pages / 100;
+	if (!m)
+		m = 1;
+	nr_pages = 0;
+	p = pblist;
+	while (p) {
+		error = swap_map_read_page(handle, (void *)p->address);
+		if (error)
+			break;
+		p = p->next;
+		if (!(nr_pages % m))
+			printk("\b\b\b\b%3d%%", nr_pages / m);
+		nr_pages++;
 	}
-	printk("\b\b\b\bdone\n");
+	if (!error)
+		printk("\b\b\b\bdone\n");
 	return error;
 }
 
 /**
- *	read_pagedir - Read page backup list pages from swap
+ *	unpack_orig_addresses - copy the elements of @buf[] (1 page) to
+ *	the PBEs in the list starting at @pbe
  */
 
-static int read_pagedir(struct pbe *pblist)
+static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
+                                                struct pbe *pbe)
 {
-	struct pbe *pbpage, *p;
-	unsigned int i = 0;
-	int error;
+	int j;
 
-	if (!pblist)
-		return -EFAULT;
+	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
+		pbe->orig_address = buf[j];
+		pbe = pbe->next;
+	}
+	return pbe;
+}
 
-	printk("swsusp: Reading pagedir (%lu pages)\n",
-			swsusp_info.pagedir_pages);
+/**
+ *	load_image_metadata - load the image metadata using the swap map
+ *	handle @handle and put them into the PBEs in the list @pblist
+ */
 
-	for_each_pb_page (pbpage, pblist) {
-		unsigned long offset = swp_offset(swsusp_info.pagedir[i++]);
+static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle)
+{
+	struct pbe *p;
+	unsigned long *buf;
+	unsigned int n = 0;
+	int error = 0;
 
-		error = -EFAULT;
-		if (offset) {
-			p = (pbpage + PB_PAGE_SKIP)->next;
-			error = bio_read_page(offset, (void *)pbpage);
-			(pbpage + PB_PAGE_SKIP)->next = p;
-		}
+	printk("Loading image metadata ... ");
+	buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
+	if (!buf)
+		return -ENOMEM;
+	p = pblist;
+	while (p) {
+		error = swap_map_read_page(handle, buf);
 		if (error)
 			break;
+		p = unpack_orig_addresses(buf, p);
+		n++;
 	}
-
+	free_page((unsigned long)buf);
 	if (!error)
-		BUG_ON(i != swsusp_info.pagedir_pages);
-
+		printk("done (%u pages loaded)\n", n);
 	return error;
 }
 
-
 static int check_suspend_image(void)
 {
 	int error = 0;
@@ -762,34 +1003,39 @@ static int check_suspend_image(void)
 	return 0;
 }
 
-static int read_suspend_image(void)
+static int read_suspend_image(struct pbe **pblist_ptr)
 {
 	int error = 0;
-	struct pbe *p;
+	struct pbe *p, *pblist;
+	struct swap_map_handle handle;
+	unsigned int nr_pages = swsusp_info.image_pages;
 
-	if (!(p = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 0)))
+	p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0);
+	if (!p)
 		return -ENOMEM;
-
-	if ((error = read_pagedir(p)))
+	error = get_swap_map_reader(&handle, swsusp_info.start);
+	if (error)
+		/* The PBE list at p will be released by swsusp_free() */
 		return error;
-	create_pbe_list(p, nr_copy_pages);
-	mark_unsafe_pages(p);
-	pagedir_nosave = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
-	if (pagedir_nosave) {
-		create_pbe_list(pagedir_nosave, nr_copy_pages);
-		copy_page_backup_list(pagedir_nosave, p);
+	error = load_image_metadata(p, &handle);
+	if (!error) {
+		mark_unsafe_pages(p);
+		pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
+		if (pblist)
+			copy_page_backup_list(pblist, p);
+		free_pagedir(p);
+		if (!pblist)
+			error = -ENOMEM;
+
+		/* Allocate memory for the image and read the data from swap */
+		if (!error)
+			error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
+		if (!error)
+			error = load_image_data(pblist, &handle, nr_pages);
+		if (!error)
+			*pblist_ptr = pblist;
 	}
-	free_pagedir(p);
-	if (!pagedir_nosave)
-		return -ENOMEM;
-
-	/* Allocate memory for the image and read the data from swap */
-
-	error = alloc_data_pages(pagedir_nosave, GFP_ATOMIC, 1);
-
-	if (!error)
-		error = data_read(pagedir_nosave);
-
+	release_swap_map_reader(&handle);
 	return error;
 }
 
@@ -821,7 +1067,7 @@ int swsusp_check(void)
  *	swsusp_read - Read saved image from swap.
  */
 
-int swsusp_read(void)
+int swsusp_read(struct pbe **pblist_ptr)
 {
 	int error;
 
@@ -830,7 +1076,7 @@ int swsusp_read(void)
 		return PTR_ERR(resume_bdev);
 	}
 
-	error = read_suspend_image();
+	error = read_suspend_image(pblist_ptr);
 	blkdev_put(resume_bdev);
 
 	if (!error)
-- 
cgit v1.1


From 72a97e08394a3b2e75481ff680ec2a0591e3cba4 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:13:46 -0800
Subject: [PATCH] swsusp: improve freeing of memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch makes swsusp free only as much memory as needed to complete the
suspend and not as much as possible.  In most cases this should speed
up the suspend and make the system much more responsive after resume,
especially if a GUI (e.g. X Windows) is used.

If needed, the old behavior (i.e. freeing as much memory as possible during
suspend) can be restored by undefining FAST_FREE in power.h.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/suspend.h |  2 +-
 kernel/power/disk.c     | 30 +++--------------------
 kernel/power/power.h    | 14 ++++++++---
 kernel/power/snapshot.c | 65 +++++++++++++++++++++++++++++++++++++++++++++----
 kernel/power/swsusp.c   | 52 ++++++++++++++++++++++++++++++++++++++-
 5 files changed, 126 insertions(+), 37 deletions(-)

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 33bbaea..5dc94e7 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -73,6 +73,6 @@ unsigned long get_safe_page(gfp_t gfp_mask);
  * XXX: We try to keep some more pages free so that I/O operations succeed
  * without paging. Might this be more?
  */
-#define PAGES_FOR_IO	512
+#define PAGES_FOR_IO	1024
 
 #endif /* _LINUX_SWSUSP_H */
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 76a5131..9e51cdf 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -24,6 +24,7 @@
 
 extern suspend_disk_method_t pm_disk_mode;
 
+extern int swsusp_shrink_memory(void);
 extern int swsusp_suspend(void);
 extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages);
 extern int swsusp_check(void);
@@ -73,31 +74,6 @@ static void power_down(suspend_disk_method_t mode)
 static int in_suspend __nosavedata = 0;
 
 
-/**
- *	free_some_memory -  Try to free as much memory as possible
- *
- *	... but do not OOM-kill anyone
- *
- *	Notice: all userland should be stopped at this point, or
- *	livelock is possible.
- */
-
-static void free_some_memory(void)
-{
-	unsigned int i = 0;
-	unsigned int tmp;
-	unsigned long pages = 0;
-	char *p = "-\\|/";
-
-	printk("Freeing memory...  ");
-	while ((tmp = shrink_all_memory(10000))) {
-		pages += tmp;
-		printk("\b%c", p[i++ % 4]);
-	}
-	printk("\bdone (%li pages freed)\n", pages);
-}
-
-
 static inline void platform_finish(void)
 {
 	if (pm_disk_mode == PM_DISK_PLATFORM) {
@@ -127,8 +103,8 @@ static int prepare_processes(void)
 	}
 
 	/* Free memory before shutting down devices. */
-	free_some_memory();
-	return 0;
+	if (!(error = swsusp_shrink_memory()))
+		return 0;
 thaw:
 	thaw_processes();
 	enable_nonboot_cpus();
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 977877c..acdc83b 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -49,18 +49,26 @@ extern void thaw_processes(void);
 extern int pm_prepare_console(void);
 extern void pm_restore_console(void);
 
-
 /* References to section boundaries */
 extern const void __nosave_begin, __nosave_end;
 
 extern unsigned int nr_copy_pages;
-extern suspend_pagedir_t *pagedir_nosave;
-extern suspend_pagedir_t *pagedir_save;
+extern struct pbe *pagedir_nosave;
+
+/*
+ * This compilation switch determines the way in which memory will be freed
+ * during suspend.  If defined, only as much memory will be freed as needed
+ * to complete the suspend, which will make it go faster.  Otherwise, the
+ * largest possible amount of memory will be freed.
+ */
+#define FAST_FREE	1
 
 extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
 
+extern unsigned int count_data_pages(void);
 extern void free_pagedir(struct pbe *pblist);
+extern void release_eaten_pages(void);
 extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed);
 extern void swsusp_free(void);
 extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 152d56c..e80d282 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -37,6 +37,31 @@ struct pbe *pagedir_nosave;
 unsigned int nr_copy_pages;
 
 #ifdef CONFIG_HIGHMEM
+unsigned int count_highmem_pages(void)
+{
+	struct zone *zone;
+	unsigned long zone_pfn;
+	unsigned int n = 0;
+
+	for_each_zone (zone)
+		if (is_highmem(zone)) {
+			mark_free_pages(zone);
+			for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) {
+				struct page *page;
+				unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+				if (!pfn_valid(pfn))
+					continue;
+				page = pfn_to_page(pfn);
+				if (PageReserved(page))
+					continue;
+				if (PageNosaveFree(page))
+					continue;
+				n++;
+			}
+		}
+	return n;
+}
+
 struct highmem_page {
 	char *data;
 	struct page *page;
@@ -152,17 +177,15 @@ static int saveable(struct zone *zone, unsigned long *zone_pfn)
 	BUG_ON(PageReserved(page) && PageNosave(page));
 	if (PageNosave(page))
 		return 0;
-	if (PageReserved(page) && pfn_is_nosave(pfn)) {
-		pr_debug("[nosave pfn 0x%lx]", pfn);
+	if (PageReserved(page) && pfn_is_nosave(pfn))
 		return 0;
-	}
 	if (PageNosaveFree(page))
 		return 0;
 
 	return 1;
 }
 
-static unsigned count_data_pages(void)
+unsigned int count_data_pages(void)
 {
 	struct zone *zone;
 	unsigned long zone_pfn;
@@ -267,6 +290,35 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
 }
 
 /**
+ *	On resume it is necessary to trace and eventually free the unsafe
+ *	pages that have been allocated, because they are needed for I/O
+ *	(on x86-64 we likely will "eat" these pages once again while
+ *	creating the temporary page translation tables)
+ */
+
+struct eaten_page {
+	struct eaten_page *next;
+	char padding[PAGE_SIZE - sizeof(void *)];
+};
+
+static struct eaten_page *eaten_pages = NULL;
+
+void release_eaten_pages(void)
+{
+	struct eaten_page *p, *q;
+
+	p = eaten_pages;
+	while (p) {
+		q = p->next;
+		/* We don't want swsusp_free() to free this page again */
+		ClearPageNosave(virt_to_page(p));
+		free_page((unsigned long)p);
+		p = q;
+	}
+	eaten_pages = NULL;
+}
+
+/**
  *	@safe_needed - on resume, for storing the PBE list and the image,
  *	we can only use memory pages that do not conflict with the pages
  *	which had been used before suspend.
@@ -284,9 +336,12 @@ static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
 	if (safe_needed)
 		do {
 			res = (void *)get_zeroed_page(gfp_mask);
-			if (res && PageNosaveFree(virt_to_page(res)))
+			if (res && PageNosaveFree(virt_to_page(res))) {
 				/* This is for swsusp_free() */
 				SetPageNosave(virt_to_page(res));
+				((struct eaten_page *)res)->next = eaten_pages;
+				eaten_pages = res;
+			}
 		} while (res && PageNosaveFree(virt_to_page(res)));
 	else
 		res = (void *)get_zeroed_page(gfp_mask);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index b09bd7c..f77f939 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -70,11 +70,13 @@
 #include "power.h"
 
 #ifdef CONFIG_HIGHMEM
+unsigned int count_highmem_pages(void);
 int save_highmem(void);
 int restore_highmem(void);
 #else
 static int save_highmem(void) { return 0; }
 static int restore_highmem(void) { return 0; }
+static unsigned int count_highmem_pages(void) { return 0; }
 #endif
 
 extern char resume_file[];
@@ -611,6 +613,52 @@ int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
 	return error;
 }
 
+/**
+ *	swsusp_shrink_memory -  Try to free as much memory as needed
+ *
+ *	... but do not OOM-kill anyone
+ *
+ *	Notice: all userland should be stopped before it is called, or
+ *	livelock is possible.
+ */
+
+#define SHRINK_BITE	10000
+
+int swsusp_shrink_memory(void)
+{
+	long tmp;
+	struct zone *zone;
+	unsigned long pages = 0;
+	unsigned int i = 0;
+	char *p = "-\\|/";
+
+	printk("Shrinking memory...  ");
+	do {
+#ifdef FAST_FREE
+		tmp = 2 * count_highmem_pages();
+		tmp += tmp / 50 + count_data_pages();
+		tmp += (tmp + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
+			PAGES_FOR_IO;
+		for_each_zone (zone)
+			if (!is_highmem(zone))
+				tmp -= zone->free_pages;
+		if (tmp > 0) {
+			tmp = shrink_all_memory(SHRINK_BITE);
+			if (!tmp)
+				return -ENOMEM;
+			pages += tmp;
+		}
+#else
+		tmp = shrink_all_memory(SHRINK_BITE);
+		pages += tmp;
+#endif
+		printk("\b%c", p[i++%4]);
+	} while (tmp > 0);
+	printk("\bdone (%lu pages freed)\n", pages);
+
+	return 0;
+}
+
 int swsusp_suspend(void)
 {
 	int error;
@@ -1030,8 +1078,10 @@ static int read_suspend_image(struct pbe **pblist_ptr)
 		/* Allocate memory for the image and read the data from swap */
 		if (!error)
 			error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
-		if (!error)
+		if (!error) {
+			release_eaten_pages();
 			error = load_image_data(pblist, &handle, nr_pages);
+		}
 		if (!error)
 			*pblist_ptr = pblist;
 	}
-- 
cgit v1.1


From e5e2fa7857f6bf46605c77d949fa6698b9b0bc28 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:14:20 -0800
Subject: [PATCH] swsusp: fix enough_free_mem

This patch fixes a problem with the function enough_free_mem() used by
swsusp to verify if there is a sufficient number of memory pages available
to it to create and save the suspend image.

Namely, enough_free_mem() uses nr_free_pages() to obtain the number of free
memory pages, which is incorrect, because this function returns the total
number of free pages, including free highmem pages, and the highmem pages
cannot be used by swsusp for storing the image data.

The patch makes enough_free_mem() avoid counting the free highmem
pages as available to swsusp.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index e80d282..41f6636 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -428,8 +428,14 @@ void swsusp_free(void)
 
 static int enough_free_mem(unsigned int nr_pages)
 {
-	pr_debug("swsusp: available memory: %u pages\n", nr_free_pages());
-	return nr_free_pages() > (nr_pages + PAGES_FOR_IO +
+	struct zone *zone;
+	unsigned int n = 0;
+
+	for_each_zone (zone)
+		if (!is_highmem(zone))
+			n += zone->free_pages;
+	pr_debug("swsusp: available memory: %u pages\n", n);
+	return n > (nr_pages + PAGES_FOR_IO +
 		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
 }
 
-- 
cgit v1.1


From e7c045c14bffcab2d329e86b80dc8ff7d528e05b Mon Sep 17 00:00:00 2001
From: Patrick Mochel <mochel@digitalimplant.org>
Date: Fri, 6 Jan 2006 00:15:07 -0800
Subject: [PATCH] oss: remove deprecated PM interface from ad1848 driver

This change removes the old, deprecated interface from the ad1848 driver,
including the pm_{,un}register() calls, the local storage of the pmdev object
and the reference to the old header files.  This change is done to assist in
eradicating the users of the legacy interface so as to help facilitate the
removal of the interface itself.

Signed-off-by: Patrick Mochel <mochel@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 sound/oss/ad1848.c | 92 ------------------------------------------------------
 1 file changed, 92 deletions(-)

diff --git a/sound/oss/ad1848.c b/sound/oss/ad1848.c
index 3f30c57..49796be 100644
--- a/sound/oss/ad1848.c
+++ b/sound/oss/ad1848.c
@@ -46,8 +46,6 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/stddef.h>
-#include <linux/pm.h>
-#include <linux/pm_legacy.h>
 #include <linux/isapnp.h>
 #include <linux/pnp.h>
 #include <linux/spinlock.h>
@@ -105,9 +103,6 @@ typedef struct
 	int             irq_ok;
 	mixer_ents     *mix_devices;
 	int             mixer_output_port;
-
-	/* Power management */
-	struct		pm_dev *pmdev;
 } ad1848_info;
 
 typedef struct ad1848_port_info
@@ -201,7 +196,6 @@ static void     ad1848_halt(int dev);
 static void     ad1848_halt_input(int dev);
 static void     ad1848_halt_output(int dev);
 static void     ad1848_trigger(int dev, int bits);
-static int	ad1848_pm_callback(struct pm_dev *dev, pm_request_t rqst, void *data);
 
 #ifndef EXCLUDE_TIMERS
 static int ad1848_tmr_install(int dev);
@@ -2027,10 +2021,6 @@ int ad1848_init (char *name, struct resource *ports, int irq, int dma_playback,
 
 	nr_ad1848_devs++;
 
-	devc->pmdev = pm_register(PM_ISA_DEV, my_dev, ad1848_pm_callback);
-	if (devc->pmdev)
-		devc->pmdev->data = devc;
-
 	ad1848_init_hw(devc);
 
 	if (irq > 0)
@@ -2197,9 +2187,6 @@ void ad1848_unload(int io_base, int irq, int dma_playback, int dma_capture, int
 		if(mixer>=0)
 			sound_unload_mixerdev(mixer);
 
-		if (devc->pmdev)
-			pm_unregister(devc->pmdev);
-
 		nr_ad1848_devs--;
 		for ( ; i < nr_ad1848_devs ; i++)
 			adev_info[i] = adev_info[i+1];
@@ -2811,85 +2798,6 @@ static int ad1848_tmr_install(int dev)
 }
 #endif /* EXCLUDE_TIMERS */
 
-static int ad1848_suspend(ad1848_info *devc)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&devc->lock,flags);
-
-	ad_mute(devc);
-	
-	spin_unlock_irqrestore(&devc->lock,flags);
-	return 0;
-}
-
-static int ad1848_resume(ad1848_info *devc)
-{
-	int mixer_levels[32], i;
-
-	/* Thinkpad is a bit more of PITA than normal. The BIOS tends to
-	   restore it in a different config to the one we use.  Need to
-	   fix this somehow */
-
-	/* store old mixer levels */
-	memcpy(mixer_levels, devc->levels, sizeof (mixer_levels));  
-	ad1848_init_hw(devc);
-
-	/* restore mixer levels */
-	for (i = 0; i < 32; i++)
-		ad1848_mixer_set(devc, devc->dev_no, mixer_levels[i]);
-
-	if (!devc->subtype) {
-		static signed char interrupt_bits[12] = { -1, -1, -1, -1, -1, 0x00, -1, 0x08, -1, 0x10, 0x18, 0x20 };
-		static char dma_bits[4] = { 1, 2, 0, 3 };
-		unsigned long flags;
-		signed char bits;
-		char dma2_bit = 0;
-
-		int config_port = devc->base + 0;
-
-		bits = interrupt_bits[devc->irq];
-		if (bits == -1) {
-			printk(KERN_ERR "MSS: Bad IRQ %d\n", devc->irq);
-			return -1;
-		}
-
-		spin_lock_irqsave(&devc->lock,flags);
-	
-		outb((bits | 0x40), config_port); 
-
-		if (devc->dma2 != -1 && devc->dma2 != devc->dma1)
-			if ( (devc->dma1 == 0 && devc->dma2 == 1) ||
-			     (devc->dma1 == 1 && devc->dma2 == 0) ||
-			     (devc->dma1 == 3 && devc->dma2 == 0))
-				dma2_bit = 0x04;
-
-		outb((bits | dma_bits[devc->dma1] | dma2_bit), config_port);
-		spin_unlock_irqrestore(&devc->lock,flags);
-	}
-
-	return 0;
-}
-
-static int ad1848_pm_callback(struct pm_dev *dev, pm_request_t rqst, void *data) 
-{
-	ad1848_info *devc = dev->data;
-	if (devc) {
-		DEB(printk("ad1848: pm event received: 0x%x\n", rqst));
-
-		switch (rqst) {
-		case PM_SUSPEND:
-			ad1848_suspend(devc);
-			break;
-		case PM_RESUME:
-			ad1848_resume(devc);
-			break;
-		}
-	}
-	return 0;
-}
-
-
 EXPORT_SYMBOL(ad1848_detect);
 EXPORT_SYMBOL(ad1848_init);
 EXPORT_SYMBOL(ad1848_unload);
-- 
cgit v1.1


From ee77e2754247d011a11f572788040cda2493c998 Mon Sep 17 00:00:00 2001
From: Patrick Mochel <mochel@digitalimplant.org>
Date: Fri, 6 Jan 2006 00:15:14 -0800
Subject: [PATCH] oss: remove deprecated PM interface from cs4281 driver

This change removes the old, deprecated interface from the cs4281 driver,
including the pm_{,un}register() calls, the local storage of the pmdev object
and the reference to the old header files.  This change is done to assist in
eradicating the users of the legacy interface so as to help facilitate the
removal of the interface itself.

Note that this driver has been obsoleted by an ALSA equivalent.

Note that this driver has hooks for PCI power management, but does not
implement the ->suspend()/->resume() methods.

Signed-off-by: Patrick Mochel <mochel@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 sound/oss/cs4281/cs4281m.c     | 21 +--------------------
 sound/oss/cs4281/cs4281pm-24.c | 39 ---------------------------------------
 2 files changed, 1 insertion(+), 59 deletions(-)

diff --git a/sound/oss/cs4281/cs4281m.c b/sound/oss/cs4281/cs4281m.c
index adc6896..46dd41d 100644
--- a/sound/oss/cs4281/cs4281m.c
+++ b/sound/oss/cs4281/cs4281m.c
@@ -298,7 +298,6 @@ struct cs4281_state {
 	struct cs4281_pipeline pl[CS4281_NUMBER_OF_PIPELINES];
 };
 
-#include <linux/pm_legacy.h>
 #include "cs4281pm-24.c"
 
 #if CSDEBUG
@@ -4256,9 +4255,6 @@ static void __devinit cs4281_InitPM(struct cs4281_state *s)
 static int __devinit cs4281_probe(struct pci_dev *pcidev,
 				  const struct pci_device_id *pciid)
 {
-#ifndef NOT_CS4281_PM
-	struct pm_dev *pmdev;
-#endif
 	struct cs4281_state *s;
 	dma_addr_t dma_mask;
 	mm_segment_t fs;
@@ -4374,19 +4370,7 @@ static int __devinit cs4281_probe(struct pci_dev *pcidev,
 	}
 #ifndef NOT_CS4281_PM
 	cs4281_InitPM(s);
-	pmdev = cs_pm_register(PM_PCI_DEV, PM_PCI_ID(pcidev), cs4281_pm_callback);
-	if (pmdev)
-	{
-		CS_DBGOUT(CS_INIT | CS_PM, 4, printk(KERN_INFO
-			 "cs4281: probe() pm_register() succeeded (%p).\n", pmdev));
-		pmdev->data = s;
-	}
-	else
-	{
-		CS_DBGOUT(CS_INIT | CS_PM | CS_ERROR, 0, printk(KERN_INFO
-			 "cs4281: probe() pm_register() failed (%p).\n", pmdev));
-		s->pm.flags |= CS4281_PM_NOT_REGISTERED;
-	}
+	s->pm.flags |= CS4281_PM_NOT_REGISTERED;
 #endif
 
 	pci_set_master(pcidev);	// enable bus mastering 
@@ -4487,9 +4471,6 @@ static int __init cs4281_init_module(void)
 static void __exit cs4281_cleanup_module(void)
 {
 	pci_unregister_driver(&cs4281_pci_driver);
-#ifndef NOT_CS4281_PM
-	cs_pm_unregister_all(cs4281_pm_callback);
-#endif
 	CS_DBGOUT(CS_INIT | CS_FUNCTION, 2,
 		  printk(KERN_INFO "cs4281: cleanup_cs4281() finished\n"));
 }
diff --git a/sound/oss/cs4281/cs4281pm-24.c b/sound/oss/cs4281/cs4281pm-24.c
index d2a453a..90cbd76 100644
--- a/sound/oss/cs4281/cs4281pm-24.c
+++ b/sound/oss/cs4281/cs4281pm-24.c
@@ -27,9 +27,6 @@
 #ifndef NOT_CS4281_PM
 #include <linux/pm.h>
 
-#define cs_pm_register(a, b, c) pm_register((a), (b), (c));
-#define cs_pm_unregister_all(a) pm_unregister_all((a));
-
 static int cs4281_suspend(struct cs4281_state *s);
 static int cs4281_resume(struct cs4281_state *s);
 /* 
@@ -41,42 +38,6 @@ static int cs4281_resume(struct cs4281_state *s);
 #define CS4281_SUSPEND_TBL cs4281_suspend_null
 #define CS4281_RESUME_TBL cs4281_resume_null
 
-static int cs4281_pm_callback(struct pm_dev *dev, pm_request_t rqst, void *data)
-{
-	struct cs4281_state *state;
-
-	CS_DBGOUT(CS_PM, 2, printk(KERN_INFO 
-		"cs4281: cs4281_pm_callback dev=%p rqst=0x%x state=%p\n",
-			dev,(unsigned)rqst,data));
-	state = (struct cs4281_state *) dev->data;
-	if (state) {
-		switch(rqst) {
-			case PM_SUSPEND:
-				CS_DBGOUT(CS_PM, 2, printk(KERN_INFO
-					"cs4281: PM suspend request\n"));
-				if(cs4281_suspend(state))
-				{
-				    CS_DBGOUT(CS_ERROR, 2, printk(KERN_INFO
-					"cs4281: PM suspend request refused\n"));
-					return 1; 
-				}
-				break;
-			case PM_RESUME:
-				CS_DBGOUT(CS_PM, 2, printk(KERN_INFO
-					"cs4281: PM resume request\n"));
-				if(cs4281_resume(state))
-				{
-				    CS_DBGOUT(CS_ERROR, 2, printk(KERN_INFO
-					"cs4281: PM resume request refused\n"));
-					return 1;
-				}
-				break;
-		}
-	}
-
-	return 0;
-}
-
 #else /* CS4281_PM */
 #define CS4281_SUSPEND_TBL cs4281_suspend_null
 #define CS4281_RESUME_TBL cs4281_resume_null
-- 
cgit v1.1


From 94661e7c33e6e3001be07d76d3a87eaa41dad3df Mon Sep 17 00:00:00 2001
From: Patrick Mochel <mochel@digitalimplant.org>
Date: Fri, 6 Jan 2006 00:15:17 -0800
Subject: [PATCH] oss: remove deprecated PM interface from cs46xx driver

This change removes the old, deprecated interface from the cs46xx driver,
including the pm_{,un}register() calls, the local storage of the pmdev object
and the reference to the old header files.  This change is done to assist in
eradicating the users of the legacy interface so as to help facilitate the
removal of the interface itself.

Note this driver has PCI PM hooks which are set properly.  It also has the
ability to trigger suspend/resume from an ioctl.  This functionality was not
touched, though it could use a serious review if this driver continues to
persist in the mainline tree.

Note that this driver has been obsoleted by an ALSA equivalent.

Signed-off-by: Patrick Mochel <mochel@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 sound/oss/cs46xx.c      | 60 -------------------------------------------------
 sound/oss/cs46xxpm-24.h |  4 ----
 2 files changed, 64 deletions(-)

diff --git a/sound/oss/cs46xx.c b/sound/oss/cs46xx.c
index cb998e8..0da4d93 100644
--- a/sound/oss/cs46xx.c
+++ b/sound/oss/cs46xx.c
@@ -391,10 +391,6 @@ static void cs461x_clear_serial_FIFOs(struct cs_card *card, int type);
 static int cs46xx_suspend_tbl(struct pci_dev *pcidev, pm_message_t state);
 static int cs46xx_resume_tbl(struct pci_dev *pcidev);
 
-#ifndef CS46XX_ACPI_SUPPORT
-static int cs46xx_pm_callback(struct pm_dev *dev, pm_request_t rqst, void *data);
-#endif
-
 #if CSDEBUG
 
 /* DEBUG ROUTINES */
@@ -5320,7 +5316,6 @@ static const char fndmsg[] = KERN_INFO "cs46xx: Found %d audio device(s).\n";
 static int __devinit cs46xx_probe(struct pci_dev *pci_dev,
 				  const struct pci_device_id *pciid)
 {
-	struct pm_dev *pmdev;
 	int i,j;
 	u16 ss_card, ss_vendor;
 	struct cs_card *card;
@@ -5530,22 +5525,6 @@ static int __devinit cs46xx_probe(struct pci_dev *pci_dev,
 	PCI_SET_DMA_MASK(pci_dev, dma_mask);
 	list_add(&card->list, &cs46xx_devs);
 
-	pmdev = cs_pm_register(PM_PCI_DEV, PM_PCI_ID(pci_dev), cs46xx_pm_callback);
-	if (pmdev)
-	{
-		CS_DBGOUT(CS_INIT | CS_PM, 4, printk(KERN_INFO
-			 "cs46xx: probe() pm_register() succeeded (%p).\n",
-				pmdev));
-		pmdev->data = card;
-	}
-	else
-	{
-		CS_DBGOUT(CS_INIT | CS_PM | CS_ERROR, 2, printk(KERN_INFO
-			 "cs46xx: probe() pm_register() failed (%p).\n",
-				pmdev));
-		card->pm.flags |= CS46XX_PM_NOT_REGISTERED;
-	}
-
 	CS_DBGOUT(CS_PM, 9, printk(KERN_INFO "cs46xx: pm.flags=0x%x card=%p\n",
 		(unsigned)card->pm.flags,card));
 
@@ -5727,7 +5706,6 @@ static int __init cs46xx_init_module(void)
 static void __exit cs46xx_cleanup_module(void)
 {
 	pci_unregister_driver(&cs46xx_pci_driver);
-	cs_pm_unregister_all(cs46xx_pm_callback);
 	CS_DBGOUT(CS_INIT | CS_FUNCTION, 2,
 		  printk(KERN_INFO "cs46xx: cleanup_cs46xx() finished\n"));
 }
@@ -5735,44 +5713,6 @@ static void __exit cs46xx_cleanup_module(void)
 module_init(cs46xx_init_module);
 module_exit(cs46xx_cleanup_module);
 
-#ifndef CS46XX_ACPI_SUPPORT
-static int cs46xx_pm_callback(struct pm_dev *dev, pm_request_t rqst, void *data)
-{
-	struct cs_card *card;
-
-	CS_DBGOUT(CS_PM, 2, printk(KERN_INFO 
-		"cs46xx: cs46xx_pm_callback dev=%p rqst=0x%x card=%p\n",
-			dev,(unsigned)rqst,data));
-	card = (struct cs_card *) dev->data;
-	if (card) {
-		switch(rqst) {
-			case PM_SUSPEND:
-				CS_DBGOUT(CS_PM, 2, printk(KERN_INFO
-					"cs46xx: PM suspend request\n"));
-				if(cs46xx_suspend(card, PMSG_SUSPEND))
-				{
-				    CS_DBGOUT(CS_ERROR, 2, printk(KERN_INFO
-					"cs46xx: PM suspend request refused\n"));
-					return 1; 
-				}
-				break;
-			case PM_RESUME:
-				CS_DBGOUT(CS_PM, 2, printk(KERN_INFO
-					"cs46xx: PM resume request\n"));
-				if(cs46xx_resume(card))
-				{
-				    CS_DBGOUT(CS_ERROR, 2, printk(KERN_INFO
-					"cs46xx: PM resume request refused\n"));
-					return 1;
-				}
-				break;
-		}
-	}
-
-	return 0;
-}
-#endif
-
 #if CS46XX_ACPI_SUPPORT
 static int cs46xx_suspend_tbl(struct pci_dev *pcidev, pm_message_t state)
 {
diff --git a/sound/oss/cs46xxpm-24.h b/sound/oss/cs46xxpm-24.h
index e220bd7..ad82db8 100644
--- a/sound/oss/cs46xxpm-24.h
+++ b/sound/oss/cs46xxpm-24.h
@@ -38,13 +38,9 @@
 */
 static int cs46xx_suspend_tbl(struct pci_dev *pcidev, pm_message_t state);
 static int cs46xx_resume_tbl(struct pci_dev *pcidev);
-#define cs_pm_register(a, b, c)  NULL
-#define cs_pm_unregister_all(a) 
 #define CS46XX_SUSPEND_TBL cs46xx_suspend_tbl
 #define CS46XX_RESUME_TBL cs46xx_resume_tbl
 #else
-#define cs_pm_register(a, b, c) pm_register((a), (b), (c));
-#define cs_pm_unregister_all(a) pm_unregister_all((a));
 #define CS46XX_SUSPEND_TBL cs46xx_null
 #define CS46XX_RESUME_TBL cs46xx_null
 #endif
-- 
cgit v1.1


From 53052539f3e2b29ccaf2064b0d3b8cee51d05621 Mon Sep 17 00:00:00 2001
From: Patrick Mochel <mochel@digitalimplant.org>
Date: Fri, 6 Jan 2006 00:15:18 -0800
Subject: [PATCH] oss: remove deprecated PM interface from maestro driver

This change removes the old, deprecated interface from the maestro driver,
including the pm_{,un}register() calls, the local storage of the pmdev object
and the reference to the old header files.  This change is done to assist in
eradicating the users of the legacy interface so as to help facilitate the
removal of the interface itself.

The check_suspend() function and associated logic were not removed, even
though they are now unnecessary.

Note that this driver has been obsoleted by an ALSA equivalent.

Acked-by: Zach Brown <zab@zabbo.net>
Signed-off-by: Patrick Mochel <mochel@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 sound/oss/maestro.c | 149 ----------------------------------------------------
 1 file changed, 149 deletions(-)

diff --git a/sound/oss/maestro.c b/sound/oss/maestro.c
index 3abd354..f9ac5b1 100644
--- a/sound/oss/maestro.c
+++ b/sound/oss/maestro.c
@@ -230,10 +230,6 @@
 #include <asm/page.h>
 #include <asm/uaccess.h>
 
-#include <linux/pm.h>
-#include <linux/pm_legacy.h>
-static int maestro_pm_callback(struct pm_dev *dev, pm_request_t rqst, void *d);
-
 #include "maestro.h"
 
 static struct pci_driver maestro_pci_driver;
@@ -3404,7 +3400,6 @@ maestro_probe(struct pci_dev *pcidev,const struct pci_device_id *pdid)
 	int i, ret;
 	struct ess_card *card;
 	struct ess_state *ess;
-	struct pm_dev *pmdev;
 	int num = 0;
 
 /* when built into the kernel, we only print version if device is found */
@@ -3450,11 +3445,6 @@ maestro_probe(struct pci_dev *pcidev,const struct pci_device_id *pdid)
 	memset(card, 0, sizeof(*card));
 	card->pcidev = pcidev;
 
-	pmdev = pm_register(PM_PCI_DEV, PM_PCI_ID(pcidev),
-			maestro_pm_callback);
-	if (pmdev)
-		pmdev->data = card;
-
 	card->iobase = iobase;
 	card->card_type = card_type;
 	card->irq = pcidev->irq;
@@ -3670,7 +3660,6 @@ static int maestro_notifier(struct notifier_block *nb, unsigned long event, void
 static void cleanup_maestro(void) {
 	M_printk("maestro: unloading\n");
 	pci_unregister_driver(&maestro_pci_driver);
-	pm_unregister_all(maestro_pm_callback);
 	unregister_reboot_notifier(&maestro_nb);
 }
 
@@ -3691,143 +3680,5 @@ check_suspend(struct ess_card *card)
 	current->state = TASK_RUNNING;
 }
 
-static int 
-maestro_suspend(struct ess_card *card)
-{
-	unsigned long flags;
-	int i,j;
-
-	spin_lock_irqsave(&card->lock,flags); /* over-kill */
-
-	M_printk("maestro: apm in dev %p\n",card);
-
-	/* we have to read from the apu regs, need
-		to power it up */
-	maestro_power(card,ACPI_D0);
-
-	for(i=0;i<NR_DSPS;i++) {
-		struct ess_state *s = &card->channels[i];
-
-		if(s->dev_audio == -1)
-			continue;
-
-		M_printk("maestro: stopping apus for device %d\n",i);
-		stop_dac(s);
-		stop_adc(s);
-		for(j=0;j<6;j++) 
-			card->apu_map[s->apu[j]][5]=apu_get_register(s,j,5);
-
-	}
-
-	/* get rid of interrupts? */
-	if( card->dsps_open > 0)
-		stop_bob(&card->channels[0]);
-
-	card->in_suspend++;
-
-	spin_unlock_irqrestore(&card->lock,flags);
-
-	/* we trust in the bios to power down the chip on suspend.
-	 * XXX I'm also not sure that in_suspend will protect
-	 * against all reg accesses from here on out. 
-	 */
-	return 0;
-}
-static int 
-maestro_resume(struct ess_card *card)
-{
-	unsigned long flags;
-	int i;
-
-	spin_lock_irqsave(&card->lock,flags); /* over-kill */
-
-	card->in_suspend = 0;
-
-	M_printk("maestro: resuming card at %p\n",card);
-
-	/* restore all our config */
-	maestro_config(card);
-	/* need to restore the base pointers.. */ 
-	if(card->dmapages) 
-		set_base_registers(&card->channels[0],card->dmapages);
-
-	mixer_push_state(card);
-
-	/* set each channels' apu control registers before
-	 * restoring audio 
-	 */
-	for(i=0;i<NR_DSPS;i++) {
-		struct ess_state *s = &card->channels[i];
-		int chan,reg;
-
-		if(s->dev_audio == -1)
-			continue;
-
-		for(chan = 0 ; chan < 6 ; chan++) {
-			wave_set_register(s,s->apu[chan]<<3,s->apu_base[chan]);
-			for(reg = 1 ; reg < NR_APU_REGS ; reg++)  
-				apu_set_register(s,chan,reg,s->card->apu_map[s->apu[chan]][reg]);
-		}
-		for(chan = 0 ; chan < 6 ; chan++)  
-			apu_set_register(s,chan,0,s->card->apu_map[s->apu[chan]][0] & 0xFF0F);
-	}
-
-	/* now we flip on the music */
-
-	if( card->dsps_open <= 0) {
-		/* this card's idle */
-		maestro_power(card,ACPI_D2);
-	} else {
-		/* ok, we're actually playing things on
-			this card */
-		maestro_power(card,ACPI_D0);
-		start_bob(&card->channels[0]);
-		for(i=0;i<NR_DSPS;i++) {
-			struct ess_state *s = &card->channels[i];
-
-			/* these use the apu_mode, and can handle
-				spurious calls */
-			start_dac(s);	
-			start_adc(s);	
-		}
-	}
-
-	spin_unlock_irqrestore(&card->lock,flags);
-
-	/* all right, we think things are ready, 
-		wake up people who were using the device
-		when we suspended */
-	wake_up(&(card->suspend_queue));
-
-	return 0;
-}
-
-int 
-maestro_pm_callback(struct pm_dev *dev, pm_request_t rqst, void *data) 
-{
-	struct ess_card *card = (struct ess_card*) dev->data;
-
-	if ( ! card ) goto out;
-
-	M_printk("maestro: pm event 0x%x received for card %p\n", rqst, card);
-	
-	switch (rqst) {
-		case PM_SUSPEND: 
-			maestro_suspend(card);
-		break;
-		case PM_RESUME: 
-			maestro_resume(card);
-		break;
-		/*
-		 * we'd also like to find out about
-		 * power level changes because some biosen
-		 * do mean things to the maestro when they
-		 * change their power state.
-		 */
-        }
-out:
-	return 0;
-}
-
 module_init(init_maestro);
 module_exit(cleanup_maestro);
-- 
cgit v1.1


From 76cd48a397f126ea883835f5889ee1837596f021 Mon Sep 17 00:00:00 2001
From: Patrick Mochel <mochel@digitalimplant.org>
Date: Fri, 6 Jan 2006 00:15:19 -0800
Subject: [PATCH] oss: remove deprecated PM interface from nm256 driver

This change removes the old, deprecated interface from the nm256 driver,
including the pm_{,un}register() calls, the local storage of the pmdev object
and the reference to the old header files.  This change is done to assist in
eradicating the users of the legacy interface so as to help facilitate the
removal of the interface itself.

Note that this driver has been obsoleted by an ALSA equivalent.

Signed-off-by: Patrick Mochel <mochel@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 sound/oss/nm256_audio.c | 47 -----------------------------------------------
 1 file changed, 47 deletions(-)

diff --git a/sound/oss/nm256_audio.c b/sound/oss/nm256_audio.c
index 0ce2c40..42d8f05 100644
--- a/sound/oss/nm256_audio.c
+++ b/sound/oss/nm256_audio.c
@@ -24,8 +24,6 @@
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/pm.h>
-#include <linux/pm_legacy.h>
 #include <linux/delay.h>
 #include <linux/spinlock.h>
 #include "sound_config.h"
@@ -49,7 +47,6 @@ static int nm256_grabInterrupt (struct nm256_info *card);
 static int nm256_releaseInterrupt (struct nm256_info *card);
 static irqreturn_t nm256_interrupt (int irq, void *dev_id, struct pt_regs *dummy);
 static irqreturn_t nm256_interrupt_zx (int irq, void *dev_id, struct pt_regs *dummy);
-static int handle_pm_event (struct pm_dev *dev, pm_request_t rqst, void *data);
 
 /* These belong in linux/pci.h. */
 #define PCI_DEVICE_ID_NEOMAGIC_NM256AV_AUDIO 0x8005
@@ -992,15 +989,6 @@ nm256_install_mixer (struct nm256_info *card)
     return 0;
 }
 
-/* Perform a full reset on the hardware; this is invoked when an APM
-   resume event occurs.  */
-static void
-nm256_full_reset (struct nm256_info *card)
-{
-    nm256_initHw (card);
-    ac97_reset (&(card->mdev));
-}
-
 /* 
  * See if the signature left by the NM256 BIOS is intact; if so, we use
  * the associated address as the end of our audio buffer in the video
@@ -1053,7 +1041,6 @@ static int __devinit
 nm256_install(struct pci_dev *pcidev, enum nm256rev rev, char *verstr)
 {
     struct nm256_info *card;
-    struct pm_dev *pmdev;
     int x;
 
     if (pci_enable_device(pcidev))
@@ -1234,43 +1221,10 @@ nm256_install(struct pci_dev *pcidev, enum nm256rev rev, char *verstr)
 
     nm256_install_mixer (card);
 
-    pmdev = pm_register(PM_PCI_DEV, PM_PCI_ID(pcidev), handle_pm_event);
-    if (pmdev)
-        pmdev->data = card;
-
     return 1;
 }
 
 
-/*
- * PM event handler, so the card is properly reinitialized after a power
- * event.
- */
-static int
-handle_pm_event (struct pm_dev *dev, pm_request_t rqst, void *data)
-{
-    struct nm256_info *crd = (struct nm256_info*) dev->data;
-    if (crd) {
-        switch (rqst) {
-	case PM_SUSPEND:
-	    break;
-	case PM_RESUME:
-            {
-                int playing = crd->playing;
-                nm256_full_reset (crd);
-                /*
-                 * A little ugly, but that's ok; pretend the
-                 * block we were playing is done. 
-                 */
-                if (playing)
-                    DMAbuf_outputintr (crd->dev_for_play, 1);
-            }
-	    break;
-	}
-    }
-    return 0;
-}
-
 static int __devinit
 nm256_probe(struct pci_dev *pcidev,const struct pci_device_id *pciid)
 {
@@ -1696,7 +1650,6 @@ static int __init do_init_nm256(void)
 static void __exit cleanup_nm256 (void)
 {
     pci_unregister_driver(&nm256_pci_driver);
-    pm_unregister_all (&handle_pm_event);
 }
 
 module_init(do_init_nm256);
-- 
cgit v1.1


From 45029c3207840edb9c9b795de0145ded1c675fce Mon Sep 17 00:00:00 2001
From: Patrick Mochel <mochel@digitalimplant.org>
Date: Fri, 6 Jan 2006 00:15:20 -0800
Subject: [PATCH] oss: remove deprecated PM interface from opl3sa2 driver

This change removes the old, deprecated interface from the opl3sa2 driver,
including the pm_{,un}register() calls, the local storage of the pmdev object
and the reference to the old header files.  This change is done to assist in
eradicating the users of the legacy interface so as to help facilitate the
removal of the interface itself.

Signed-off-by: Patrick Mochel <mochel@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 sound/oss/opl3sa2.c | 110 ----------------------------------------------------
 1 file changed, 110 deletions(-)

diff --git a/sound/oss/opl3sa2.c b/sound/oss/opl3sa2.c
index cd41d0e..5cecdbc 100644
--- a/sound/oss/opl3sa2.c
+++ b/sound/oss/opl3sa2.c
@@ -69,8 +69,6 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/delay.h>
-#include <linux/pm.h>
-#include <linux/pm_legacy.h>
 #include "sound_config.h"
 
 #include "ad1848.h"
@@ -139,10 +137,6 @@ typedef struct {
 	struct pnp_dev* pdev;
 	int activated;			/* Whether said devices have been activated */
 #endif
-#ifdef CONFIG_PM_LEGACY
-	unsigned int	in_suspend;
-	struct pm_dev	*pmdev;
-#endif
 	unsigned int	card;
 	int		chipset;	/* What's my version(s)? */
 	char		*chipset_name;
@@ -341,22 +335,6 @@ static void opl3sa2_mixer_reset(opl3sa2_state_t* devc)
 	}
 }
 
-/* Currently only used for power management */
-#ifdef CONFIG_PM_LEGACY
-static void opl3sa2_mixer_restore(opl3sa2_state_t* devc)
-{
-	if (devc) {
-		opl3sa2_set_volume(devc, devc->volume_l, devc->volume_r);
-		opl3sa2_set_mic(devc, devc->mic);
-
-		if (devc->chipset == CHIPSET_OPL3SA3) {
-			opl3sa3_set_bass(devc, devc->bass_l, devc->bass_r);
-			opl3sa3_set_treble(devc, devc->treble_l, devc->treble_r);
-		}
-	}
-}
-#endif /* CONFIG_PM_LEGACY */
-
 static inline void arg_to_vol_mono(unsigned int vol, int* value)
 {
 	int left;
@@ -832,84 +810,6 @@ static struct pnp_driver opl3sa2_driver = {
 
 /* End of component functions */
 
-#ifdef CONFIG_PM_LEGACY
-
-static DEFINE_SPINLOCK(opl3sa2_lock);
-
-/* Power Management support functions */
-static int opl3sa2_suspend(struct pm_dev *pdev, unsigned int pm_mode)
-{
-	unsigned long flags;
-	opl3sa2_state_t *p;
-
-	if (!pdev)
-		return -EINVAL;
-
-	spin_lock_irqsave(&opl3sa2_lock,flags);
-
-	p = (opl3sa2_state_t *) pdev->data;
-	switch (pm_mode) {
-	case 1:
-		pm_mode = OPL3SA2_PM_MODE1;
-		break;
-	case 2:
-		pm_mode = OPL3SA2_PM_MODE2;
-		break;
-	case 3:
-		pm_mode = OPL3SA2_PM_MODE3;
-		break;
-	default:
-		/* we don't know howto handle this... */
-		spin_unlock_irqrestore(&opl3sa2_lock, flags);
-		return -EBUSY;
-	}
-
-	p->in_suspend = 1;
-
-	/* its supposed to automute before suspending, so we won't bother */
-	opl3sa2_write(p->cfg_port, OPL3SA2_PM, pm_mode);
-	/* wait a while for the clock oscillator to stabilise */
-	mdelay(10);
-
-	spin_unlock_irqrestore(&opl3sa2_lock,flags);
-	return 0;
-}
-
-static int opl3sa2_resume(struct pm_dev *pdev)
-{
-	unsigned long flags;
-	opl3sa2_state_t *p;
-
- 	if (!pdev)
- 		return -EINVAL;
-
-	p = (opl3sa2_state_t *) pdev->data;
-	spin_lock_irqsave(&opl3sa2_lock,flags);
-
- 	/* I don't think this is necessary */
-	opl3sa2_write(p->cfg_port, OPL3SA2_PM, OPL3SA2_PM_MODE0);
-	opl3sa2_mixer_restore(p);
- 	p->in_suspend = 0;
-
-	spin_unlock_irqrestore(&opl3sa2_lock,flags);
-	return 0;
-}
-
-static int opl3sa2_pm_callback(struct pm_dev *pdev, pm_request_t rqst, void *data)
-{
-	unsigned long mode = (unsigned  long)data;
-
-	switch (rqst) {
-		case PM_SUSPEND:
-			return opl3sa2_suspend(pdev, mode);
-
-		case PM_RESUME:
-			return opl3sa2_resume(pdev);
-	}
-	return 0;
-}
-#endif /* CONFIG_PM_LEGACY */
-
 /*
  * Install OPL3-SA2 based card(s).
  *
@@ -1021,12 +921,6 @@ static int __init init_opl3sa2(void)
 
 		/* ewww =) */
 		opl3sa2_state[card].card = card;
-#ifdef CONFIG_PM_LEGACY
-		/* register our power management capabilities */
-		opl3sa2_state[card].pmdev = pm_register(PM_ISA_DEV, card, opl3sa2_pm_callback);
-		if (opl3sa2_state[card].pmdev)
-			opl3sa2_state[card].pmdev->data = &opl3sa2_state[card];
-#endif /* CONFIG_PM_LEGACY */
 
 		/*
 		 * Set the Yamaha 3D enhancement mode (aka Ymersion) if asked to and
@@ -1083,10 +977,6 @@ static void __exit cleanup_opl3sa2(void)
 	int card;
 
 	for(card = 0; card < opl3sa2_cards_num; card++) {
-#ifdef CONFIG_PM_LEGACY
-		if (opl3sa2_state[card].pmdev)
-			pm_unregister(opl3sa2_state[card].pmdev);
-#endif
 	        if (opl3sa2_state[card].cfg_mpu.slots[1] != -1) {
 			unload_opl3sa2_mpu(&opl3sa2_state[card].cfg_mpu);
  		}
-- 
cgit v1.1


From c050ca78705592d440c22055865bf4de40fe2a4c Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Fri, 6 Jan 2006 00:15:21 -0800
Subject: [PATCH] swsusp: Drop duplicate prototypes

These two prototypes are already present in sched.h, so remove the
duplicate versions.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/kernel/power/power.h b/kernel/power/power.h
index acdc83b..e521e61 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -43,9 +43,6 @@ static struct subsys_attribute _name##_attr = {	\
 
 extern struct subsystem power_subsys;
 
-extern int freeze_processes(void);
-extern void thaw_processes(void);
-
 extern int pm_prepare_console(void);
 extern void pm_restore_console(void);
 
-- 
cgit v1.1


From b3a93a255ec33a04776ec50efb30b7a99168dda2 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:15:22 -0800
Subject: [PATCH] swsusp: limit image size

Limit the size of the suspend image to approx.  500 MB, which should
improve the overall performance of swsusp on systems with more than 1 GB of
RAM.

It introduces the constant IMAGE_SIZE that can be set to the preferred size
of the image (in MB) and modifies the memory-shrinking part of swsusp to
take this constant into account (500 is the default value of IMAGE_SIZE).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h  |  8 +++-----
 kernel/power/swsusp.c | 17 ++++++++---------
 2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/kernel/power/power.h b/kernel/power/power.h
index e521e61..9b04599 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -53,12 +53,10 @@ extern unsigned int nr_copy_pages;
 extern struct pbe *pagedir_nosave;
 
 /*
- * This compilation switch determines the way in which memory will be freed
- * during suspend.  If defined, only as much memory will be freed as needed
- * to complete the suspend, which will make it go faster.  Otherwise, the
- * largest possible amount of memory will be freed.
+ * Preferred image size in MB (set it to zero to get the smallest
+ * image possible)
  */
-#define FAST_FREE	1
+#define IMAGE_SIZE	500
 
 extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index f77f939..6d5ceaf 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -626,7 +626,7 @@ int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
 
 int swsusp_shrink_memory(void)
 {
-	long tmp;
+	long size, tmp;
 	struct zone *zone;
 	unsigned long pages = 0;
 	unsigned int i = 0;
@@ -634,11 +634,11 @@ int swsusp_shrink_memory(void)
 
 	printk("Shrinking memory...  ");
 	do {
-#ifdef FAST_FREE
-		tmp = 2 * count_highmem_pages();
-		tmp += tmp / 50 + count_data_pages();
-		tmp += (tmp + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
+		size = 2 * count_highmem_pages();
+		size += size / 50 + count_data_pages();
+		size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
 			PAGES_FOR_IO;
+		tmp = size;
 		for_each_zone (zone)
 			if (!is_highmem(zone))
 				tmp -= zone->free_pages;
@@ -647,11 +647,10 @@ int swsusp_shrink_memory(void)
 			if (!tmp)
 				return -ENOMEM;
 			pages += tmp;
+		} else if (size > (IMAGE_SIZE * 1024 * 1024) / PAGE_SIZE) {
+			tmp = shrink_all_memory(SHRINK_BITE);
+			pages += tmp;
 		}
-#else
-		tmp = shrink_all_memory(SHRINK_BITE);
-		pages += tmp;
-#endif
 		printk("\b%c", p[i++%4]);
 	} while (tmp > 0);
 	printk("\bdone (%lu pages freed)\n", pages);
-- 
cgit v1.1


From ca0aec0f7a94bf9f07fefa8bfd23282d4e8ceb8a Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:15:56 -0800
Subject: [PATCH] swsusp: make image size limit tunable

Make the suspend image size limit tunable via /sys/power/image_size.

It is necessary for systems on which there is a limited amount of swap
available for suspend.  It can also be useful for optimizing performance of
swsusp on systems with 1 GB of RAM or more.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/power/interface.txt | 11 +++++++++++
 Documentation/power/swsusp.txt    |  5 +++++
 kernel/power/disk.c               | 20 ++++++++++++++++++++
 kernel/power/power.h              |  7 ++-----
 kernel/power/swsusp.c             | 10 +++++++++-
 5 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt
index f5ebda5..bd4ffb5 100644
--- a/Documentation/power/interface.txt
+++ b/Documentation/power/interface.txt
@@ -41,3 +41,14 @@ to. Writing to this file will accept one of
 It will only change to 'firmware' or 'platform' if the system supports
 it. 
 
+/sys/power/image_size controls the size of the image created by
+the suspend-to-disk mechanism.  It can be written a string
+representing a non-negative integer that will be used as an upper
+limit of the image size, in megabytes.  The suspend-to-disk mechanism will
+do its best to ensure the image size will not exceed that number.  However,
+if this turns out to be impossible, it will try to suspend anyway using the
+smallest image possible.  In particular, if "0" is written to this file, the
+suspend image will be as small as possible.
+
+Reading from this file will display the current image size limit, which
+is set to 500 MB by default.
diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
index b0d5084..cd0fcd8 100644
--- a/Documentation/power/swsusp.txt
+++ b/Documentation/power/swsusp.txt
@@ -27,6 +27,11 @@ echo shutdown > /sys/power/disk; echo disk > /sys/power/state
 
 echo platform > /sys/power/disk; echo disk > /sys/power/state
 
+If you want to limit the suspend image size to N megabytes, do
+
+echo N > /sys/power/image_size
+
+before suspend (it is limited to 500 MB by default).
 
 Encrypted suspend image:
 ------------------------
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 9e51cdf..e24446f 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -365,9 +365,29 @@ out:
 
 power_attr(resume);
 
+static ssize_t image_size_show(struct subsystem * subsys, char *buf)
+{
+	return sprintf(buf, "%u\n", image_size);
+}
+
+static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n)
+{
+	unsigned int size;
+
+	if (sscanf(buf, "%u", &size) == 1) {
+		image_size = size;
+		return n;
+	}
+
+	return -EINVAL;
+}
+
+power_attr(image_size);
+
 static struct attribute * g[] = {
 	&disk_attr.attr,
 	&resume_attr.attr,
+	&image_size_attr.attr,
 	NULL,
 };
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 9b04599..273a5b1 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -52,11 +52,8 @@ extern const void __nosave_begin, __nosave_end;
 extern unsigned int nr_copy_pages;
 extern struct pbe *pagedir_nosave;
 
-/*
- * Preferred image size in MB (set it to zero to get the smallest
- * image possible)
- */
-#define IMAGE_SIZE	500
+/* Preferred image size in MB (default 500) */
+extern unsigned int image_size;
 
 extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 6d5ceaf..d760a6a 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -69,6 +69,14 @@
 
 #include "power.h"
 
+/*
+ * Preferred image size in MB (tunable via /sys/power/image_size).
+ * When it is set to N, swsusp will do its best to ensure the image
+ * size will not exceed N MB, but if that is impossible, it will
+ * try to create the smallest image possible.
+ */
+unsigned int image_size = 500;
+
 #ifdef CONFIG_HIGHMEM
 unsigned int count_highmem_pages(void);
 int save_highmem(void);
@@ -647,7 +655,7 @@ int swsusp_shrink_memory(void)
 			if (!tmp)
 				return -ENOMEM;
 			pages += tmp;
-		} else if (size > (IMAGE_SIZE * 1024 * 1024) / PAGE_SIZE) {
+		} else if (size > (image_size * 1024 * 1024) / PAGE_SIZE) {
 			tmp = shrink_all_memory(SHRINK_BITE);
 			pages += tmp;
 		}
-- 
cgit v1.1


From 3a291a20bd6fcfafb2109031f0760a0d3e92ecd7 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:16:37 -0800
Subject: [PATCH] mm: add a new function (needed for swap suspend)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds the function get_swap_page_of_type() allowing us to specify an index
in swap_info[] and select a swap_info_struct structure to be used for
allocating a swap page.

This function (or another one of similar functionality) will be necessary for
implementing the image-writing part of swsusp in the user space.   It can also
be used for simplifying the current in-kernel implementation of the
image-writing part of swsusp.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h |  1 +
 mm/swapfile.c        | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index bd66417..556617b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -209,6 +209,7 @@ extern unsigned int nr_swapfiles;
 extern struct swap_info_struct swap_info[];
 extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(void);
+extern swp_entry_t get_swap_page_of_type(int type);
 extern int swap_duplicate(swp_entry_t);
 extern int valid_swaphandles(swp_entry_t, unsigned long *);
 extern void swap_free(swp_entry_t);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index edafeac..6da4b28 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -211,6 +211,26 @@ noswap:
 	return (swp_entry_t) {0};
 }
 
+swp_entry_t get_swap_page_of_type(int type)
+{
+	struct swap_info_struct *si;
+	pgoff_t offset;
+
+	spin_lock(&swap_lock);
+	si = swap_info + type;
+	if (si->flags & SWP_WRITEOK) {
+		nr_swap_pages--;
+		offset = scan_swap_map(si);
+		if (offset) {
+			spin_unlock(&swap_lock);
+			return swp_entry(type, offset);
+		}
+		nr_swap_pages++;
+	}
+	spin_unlock(&swap_lock);
+	return (swp_entry_t) {0};
+}
+
 static struct swap_info_struct * swap_info_get(swp_entry_t entry)
 {
 	struct swap_info_struct * p;
-- 
cgit v1.1


From 1adf6c8ea916bc4a2587a881ec7715fece63fb5e Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:17:16 -0800
Subject: [PATCH] swsusp: improve handling of swap partitions

This changes the handling of swap partitions by swsusp to avoid locking of the
swap devices that are not used for suspend and, consequently, simplifies the
code.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swsusp.c | 128 ++++++++++++++------------------------------------
 1 file changed, 36 insertions(+), 92 deletions(-)

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index d760a6a..0479c9b 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -104,13 +104,7 @@ static struct swsusp_info swsusp_info;
  * Saving part...
  */
 
-/* We memorize in swapfile_used what swap devices are used for suspension */
-#define SWAPFILE_UNUSED    0
-#define SWAPFILE_SUSPEND   1	/* This is the suspending device */
-#define SWAPFILE_IGNORED   2	/* Those are other swap devices ignored for suspension */
-
-static unsigned short swapfile_used[MAX_SWAPFILES];
-static unsigned short root_swap;
+static unsigned short root_swap = 0xffff;
 
 static int mark_swapfiles(swp_entry_t prev)
 {
@@ -146,7 +140,7 @@ static int mark_swapfiles(swp_entry_t prev)
  * devfs, since the resume code can only recognize the form /dev/hda4,
  * but the suspend code would see the long name.)
  */
-static int is_resume_device(const struct swap_info_struct *swap_info)
+static inline int is_resume_device(const struct swap_info_struct *swap_info)
 {
 	struct file *file = swap_info->swap_file;
 	struct inode *inode = file->f_dentry->d_inode;
@@ -157,54 +151,22 @@ static int is_resume_device(const struct swap_info_struct *swap_info)
 
 static int swsusp_swap_check(void) /* This is called before saving image */
 {
-	int i, len;
-
-	len=strlen(resume_file);
-	root_swap = 0xFFFF;
-
-	spin_lock(&swap_lock);
-	for (i=0; i<MAX_SWAPFILES; i++) {
-		if (!(swap_info[i].flags & SWP_WRITEOK)) {
-			swapfile_used[i]=SWAPFILE_UNUSED;
-		} else {
-			if (!len) {
-	    			printk(KERN_WARNING "resume= option should be used to set suspend device" );
-				if (root_swap == 0xFFFF) {
-					swapfile_used[i] = SWAPFILE_SUSPEND;
-					root_swap = i;
-				} else
-					swapfile_used[i] = SWAPFILE_IGNORED;
-			} else {
-	  			/* we ignore all swap devices that are not the resume_file */
-				if (is_resume_device(&swap_info[i])) {
-					swapfile_used[i] = SWAPFILE_SUSPEND;
-					root_swap = i;
-				} else {
-				  	swapfile_used[i] = SWAPFILE_IGNORED;
-				}
-			}
-		}
-	}
-	spin_unlock(&swap_lock);
-	return (root_swap != 0xffff) ? 0 : -ENODEV;
-}
-
-/**
- * This is called after saving image so modification
- * will be lost after resume... and that's what we want.
- * we make the device unusable. A new call to
- * lock_swapdevices can unlock the devices.
- */
-static void lock_swapdevices(void)
-{
 	int i;
 
+	if (!swsusp_resume_device)
+		return -ENODEV;
 	spin_lock(&swap_lock);
-	for (i = 0; i< MAX_SWAPFILES; i++)
-		if (swapfile_used[i] == SWAPFILE_IGNORED) {
-			swap_info[i].flags ^= SWP_WRITEOK;
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		if (!(swap_info[i].flags & SWP_WRITEOK))
+			continue;
+		if (is_resume_device(swap_info + i)) {
+			spin_unlock(&swap_lock);
+			root_swap = i;
+			return 0;
 		}
+	}
 	spin_unlock(&swap_lock);
+	return -ENODEV;
 }
 
 /**
@@ -222,19 +184,14 @@ static void lock_swapdevices(void)
 static int write_page(unsigned long addr, swp_entry_t *loc)
 {
 	swp_entry_t entry;
-	int error = 0;
+	int error = -ENOSPC;
 
-	entry = get_swap_page();
-	if (swp_offset(entry) &&
-	    swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) {
-		error = rw_swap_page_sync(WRITE, entry,
-					  virt_to_page(addr));
-		if (error == -EIO)
-			error = 0;
-		if (!error)
+	entry = get_swap_page_of_type(root_swap);
+	if (swp_offset(entry)) {
+		error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr));
+		if (!error || error == -EIO)
 			*loc = entry;
-	} else
-		error = -ENOSPC;
+	}
 	return error;
 }
 
@@ -539,31 +496,38 @@ static int save_image_metadata(struct pbe *pblist,
  *	enough_swap - Make sure we have enough swap to save the image.
  *
  *	Returns TRUE or FALSE after checking the total amount of swap
- *	space avaiable.
- *
- *	FIXME: si_swapinfo(&i) returns all swap devices information.
- *	We should only consider resume_device.
+ *	space avaiable from the resume partition.
  */
 
 static int enough_swap(unsigned int nr_pages)
 {
-	struct sysinfo i;
+	unsigned int free_swap = swap_info[root_swap].pages -
+		swap_info[root_swap].inuse_pages;
 
-	si_swapinfo(&i);
-	pr_debug("swsusp: available swap: %lu pages\n", i.freeswap);
-	return i.freeswap > (nr_pages + PAGES_FOR_IO +
+	pr_debug("swsusp: free swap pages: %u\n", free_swap);
+	return free_swap > (nr_pages + PAGES_FOR_IO +
 		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
 }
 
 /**
- *	write_suspend_image - Write entire image and metadata.
+ *	swsusp_write - Write entire image and metadata.
+ *
+ *	It is important _NOT_ to umount filesystems at this point. We want
+ *	them synced (in case something goes wrong) but we DO not want to mark
+ *	filesystem clean: it is not. (And it does not matter, if we resume
+ *	correctly, we'll mark system clean, anyway.)
  */
-static int write_suspend_image(struct pbe *pblist, unsigned int nr_pages)
+
+int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
 {
 	struct swap_map_page *swap_map;
 	struct swap_map_handle handle;
 	int error;
 
+	if ((error = swsusp_swap_check())) {
+		printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
+		return error;
+	}
 	if (!enough_swap(nr_pages)) {
 		printk(KERN_ERR "swsusp: Not enough free swap\n");
 		return -ENOSPC;
@@ -601,26 +565,6 @@ Free_image_entries:
 	goto Free_swap_map;
 }
 
-/* It is important _NOT_ to umount filesystems at this point. We want
- * them synced (in case something goes wrong) but we DO not want to mark
- * filesystem clean: it is not. (And it does not matter, if we resume
- * correctly, we'll mark system clean, anyway.)
- */
-int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
-{
-	int error;
-
-	if ((error = swsusp_swap_check())) {
-		printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n");
-		return error;
-	}
-	lock_swapdevices();
-	error = write_suspend_image(pblist, nr_pages);
-	/* This will unlock ignored swap devices since writing is finished */
-	lock_swapdevices();
-	return error;
-}
-
 /**
  *	swsusp_shrink_memory -  Try to free as much memory as needed
  *
-- 
cgit v1.1


From 277c6e2ad7369558dbd7ffbcc6dcbe16458bf723 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 6 Jan 2006 00:17:58 -0800
Subject: [PATCH] swsusp: save image header first
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This makes the swsusp_info structure become the header of the image in the
literal sense (i.e. it is saved to the swap and read before any other image
data with the help of swsusp's swap map structure, so generally it is
treated in the same way as the rest of the image).

The main thing it does is to make swsusp_header contain the offset of the swap
map used to track the image data pages rather than the offset of swsusp_info.
Simultaneously, swsusp_info becomes the first image page written to the swap.

The other changes are generally consequences of the above with a few
exceptions (there's some consolidation in the image reading part as a few
functions turn into trivial wrappers around something else).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h  |   1 -
 kernel/power/swsusp.c | 190 +++++++++++++++++---------------------------------
 2 files changed, 65 insertions(+), 126 deletions(-)

diff --git a/kernel/power/power.h b/kernel/power/power.h
index 273a5b1..7e8492f 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -16,7 +16,6 @@ struct swsusp_info {
 	int			cpus;
 	unsigned long		image_pages;
 	unsigned long		pages;
-	swp_entry_t		start;
 } __attribute__((aligned(PAGE_SIZE)));
 
 
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 0479c9b..55a18d2 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -93,7 +93,7 @@ extern char resume_file[];
 
 static struct swsusp_header {
 	char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
-	swp_entry_t swsusp_info;
+	swp_entry_t image;
 	char	orig_sig[10];
 	char	sig[10];
 } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
@@ -106,7 +106,7 @@ static struct swsusp_info swsusp_info;
 
 static unsigned short root_swap = 0xffff;
 
-static int mark_swapfiles(swp_entry_t prev)
+static int mark_swapfiles(swp_entry_t start)
 {
 	int error;
 
@@ -117,7 +117,7 @@ static int mark_swapfiles(swp_entry_t prev)
 	    !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
 		memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
 		memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
-		swsusp_header.swsusp_info = prev;
+		swsusp_header.image = start;
 		error = rw_swap_page_sync(WRITE,
 					  swp_entry(root_swap, 0),
 					  virt_to_page((unsigned long)
@@ -423,22 +423,7 @@ static void init_header(unsigned int nr_pages)
 	swsusp_info.cpus = num_online_cpus();
 	swsusp_info.image_pages = nr_pages;
 	swsusp_info.pages = nr_pages +
-		((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT);
-}
-
-static int close_swap(void)
-{
-	swp_entry_t entry;
-	int error;
-
-	dump_info();
-	error = write_page((unsigned long)&swsusp_info, &entry);
-	if (!error) {
-		printk( "S" );
-		error = mark_swapfiles(entry);
-		printk( "|\n" );
-	}
-	return error;
+		((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
 }
 
 /**
@@ -522,6 +507,7 @@ int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
 {
 	struct swap_map_page *swap_map;
 	struct swap_map_handle handle;
+	swp_entry_t start;
 	int error;
 
 	if ((error = swsusp_swap_check())) {
@@ -539,18 +525,23 @@ int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
 		return -ENOMEM;
 	init_swap_map_handle(&handle, swap_map);
 
-	error = save_image_metadata(pblist, &handle);
+	error = swap_map_write_page(&handle, (unsigned long)&swsusp_info);
+	if (!error)
+		error = save_image_metadata(pblist, &handle);
 	if (!error)
 		error = save_image_data(pblist, &handle, nr_pages);
 	if (error)
 		goto Free_image_entries;
 
 	swap_map = reverse_swap_map(swap_map);
-	error = save_swap_map(swap_map, &swsusp_info.start);
+	error = save_swap_map(swap_map, &start);
 	if (error)
 		goto Free_map_entries;
 
-	error = close_swap();
+	dump_info();
+	printk( "S" );
+	error = mark_swapfiles(start);
+	printk( "|\n" );
 	if (error)
 		goto Free_map_entries;
 
@@ -840,70 +831,28 @@ static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf)
 	return error;
 }
 
-/*
- * Sanity check if this image makes sense with this kernel/swap context
- * I really don't think that it's foolproof but more than nothing..
- */
-
-static const char *sanity_check(void)
+static int check_header(void)
 {
+	char *reason = NULL;
+
 	dump_info();
 	if (swsusp_info.version_code != LINUX_VERSION_CODE)
-		return "kernel version";
+		reason = "kernel version";
 	if (swsusp_info.num_physpages != num_physpages)
-		return "memory size";
+		reason = "memory size";
 	if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
-		return "system type";
+		reason = "system type";
 	if (strcmp(swsusp_info.uts.release,system_utsname.release))
-		return "kernel release";
+		reason = "kernel release";
 	if (strcmp(swsusp_info.uts.version,system_utsname.version))
-		return "version";
+		reason = "version";
 	if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
-		return "machine";
-#if 0
-	/* We can't use number of online CPUs when we use hotplug to remove them ;-))) */
-	if (swsusp_info.cpus != num_possible_cpus())
-		return "number of cpus";
-#endif
-	return NULL;
-}
-
-static int check_header(void)
-{
-	const char *reason = NULL;
-	int error;
-
-	if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info)))
-		return error;
-
- 	/* Is this same machine? */
-	if ((reason = sanity_check())) {
-		printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason);
+		reason = "machine";
+	if (reason) {
+		printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
 		return -EPERM;
 	}
-	return error;
-}
-
-static int check_sig(void)
-{
-	int error;
-
-	memset(&swsusp_header, 0, sizeof(swsusp_header));
-	if ((error = bio_read_page(0, &swsusp_header)))
-		return error;
-	if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
-		memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
-
-		/*
-		 * Reset swap signature now.
-		 */
-		error = bio_write_page(0, &swsusp_header);
-	} else {
-		return -EINVAL;
-	}
-	if (!error)
-		pr_debug("swsusp: Signature found, resuming\n");
-	return error;
+	return 0;
 }
 
 /**
@@ -989,33 +938,29 @@ static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handl
 	return error;
 }
 
-static int check_suspend_image(void)
-{
-	int error = 0;
-
-	if ((error = check_sig()))
-		return error;
-
-	if ((error = check_header()))
-		return error;
-
-	return 0;
-}
-
-static int read_suspend_image(struct pbe **pblist_ptr)
+int swsusp_read(struct pbe **pblist_ptr)
 {
-	int error = 0;
+	int error;
 	struct pbe *p, *pblist;
 	struct swap_map_handle handle;
-	unsigned int nr_pages = swsusp_info.image_pages;
+	unsigned int nr_pages;
 
+	if (IS_ERR(resume_bdev)) {
+		pr_debug("swsusp: block device not initialised\n");
+		return PTR_ERR(resume_bdev);
+	}
+
+	error = get_swap_map_reader(&handle, swsusp_header.image);
+	if (!error)
+		error = swap_map_read_page(&handle, &swsusp_info);
+	if (!error)
+		error = check_header();
+	if (error)
+		return error;
+	nr_pages = swsusp_info.image_pages;
 	p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0);
 	if (!p)
 		return -ENOMEM;
-	error = get_swap_map_reader(&handle, swsusp_info.start);
-	if (error)
-		/* The PBE list at p will be released by swsusp_free() */
-		return error;
 	error = load_image_metadata(p, &handle);
 	if (!error) {
 		mark_unsafe_pages(p);
@@ -1037,11 +982,18 @@ static int read_suspend_image(struct pbe **pblist_ptr)
 			*pblist_ptr = pblist;
 	}
 	release_swap_map_reader(&handle);
+
+	blkdev_put(resume_bdev);
+
+	if (!error)
+		pr_debug("swsusp: Reading resume file was successful\n");
+	else
+		pr_debug("swsusp: Error %d resuming\n", error);
 	return error;
 }
 
 /**
- *      swsusp_check - Check for saved image in swap
+ *      swsusp_check - Check for swsusp signature in the resume device
  */
 
 int swsusp_check(void)
@@ -1051,39 +1003,27 @@ int swsusp_check(void)
 	resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
 	if (!IS_ERR(resume_bdev)) {
 		set_blocksize(resume_bdev, PAGE_SIZE);
-		error = check_suspend_image();
+		memset(&swsusp_header, 0, sizeof(swsusp_header));
+		if ((error = bio_read_page(0, &swsusp_header)))
+			return error;
+		if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
+			memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
+			/* Reset swap signature now */
+			error = bio_write_page(0, &swsusp_header);
+		} else {
+			return -EINVAL;
+		}
 		if (error)
-		    blkdev_put(resume_bdev);
-	} else
+			blkdev_put(resume_bdev);
+		else
+			pr_debug("swsusp: Signature found, resuming\n");
+	} else {
 		error = PTR_ERR(resume_bdev);
-
-	if (!error)
-		pr_debug("swsusp: resume file found\n");
-	else
-		pr_debug("swsusp: Error %d check for resume file\n", error);
-	return error;
-}
-
-/**
- *	swsusp_read - Read saved image from swap.
- */
-
-int swsusp_read(struct pbe **pblist_ptr)
-{
-	int error;
-
-	if (IS_ERR(resume_bdev)) {
-		pr_debug("swsusp: block device not initialised\n");
-		return PTR_ERR(resume_bdev);
 	}
 
-	error = read_suspend_image(pblist_ptr);
-	blkdev_put(resume_bdev);
+	if (error)
+		pr_debug("swsusp: Error %d check for resume file\n", error);
 
-	if (!error)
-		pr_debug("swsusp: Reading resume file was successful\n");
-	else
-		pr_debug("swsusp: Error %d resuming\n", error);
 	return error;
 }
 
-- 
cgit v1.1


From 8c1d286e6aa5581e9d214cbaec2bee0394bb8de8 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Fri, 6 Jan 2006 00:18:38 -0800
Subject: [PATCH] don't freeze firewire on suspend.

We had a report from one loony user who tried out suspend to disk using a
swap partition on a firewire drive.  As the firewire thread was put to
sleep it didn't work out too well.

Signed-off-by: Dave Jones <davej@redhat.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Ben Collins <bcollins@debian.org>
Cc: Jody McIntyre <scjody@modernduck.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/ieee1394/ieee1394_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/ieee1394/ieee1394_core.c b/drivers/ieee1394/ieee1394_core.c
index 64fbbb0..25ef5a8 100644
--- a/drivers/ieee1394/ieee1394_core.c
+++ b/drivers/ieee1394/ieee1394_core.c
@@ -1027,10 +1027,10 @@ static int hpsbpkt_thread(void *__hi)
 
 	daemonize("khpsbpkt");
 
+	current->flags |= PF_NOFREEZE;
+
 	while (1) {
 		if (down_interruptible(&khpsbpkt_sig)) {
-			if (try_to_freeze())
-				continue;
 			printk("khpsbpkt: received unexpected signal?!\n" );
 			break;
 		}
-- 
cgit v1.1


From 60c83c77c4a6a399d55e4f9ad156bccdfe51c96b Mon Sep 17 00:00:00 2001
From: Hirokazu Takata <takata@linux-m32r.org>
Date: Fri, 6 Jan 2006 00:18:39 -0800
Subject: [PATCH] m32r: trivial fix to remove unused instructions

A trivial fix to remove unused instructions.

Signed-off-by: Naoto Sugai <Sugai.Naoto@ak.MitsubishiElectric.co.jp>
Signed-off-by: Hirokazu Takata <takata@linux-m32r.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/m32r/kernel/entry.S | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/m32r/kernel/entry.S b/arch/m32r/kernel/entry.S
index 396c942..f6d4a58 100644
--- a/arch/m32r/kernel/entry.S
+++ b/arch/m32r/kernel/entry.S
@@ -651,8 +651,6 @@ ENTRY(rie_handler)
 /* void rie_handler(int error_code) */
 	SWITCH_TO_KERNEL_STACK
 	SAVE_ALL
-	mvfc	r0, bpc
-	ld	r1, @r0
 	ldi	r1, #0x20			; error_code
 	mv	r0, sp				; pt_regs
 	bl	do_rie_handler
-- 
cgit v1.1


From 9287d95ea194abf32fab24c6909f8ea55ab0292f Mon Sep 17 00:00:00 2001
From: Hirokazu Takata <takata@linux-m32r.org>
Date: Fri, 6 Jan 2006 00:18:41 -0800
Subject: [PATCH] m32r: Support M32104UT target platform

This patch is for supporting a new target platform, Renesas M32104UT
evaluation board.

The M32104UT is an eval board based on a uT-Engine specification.  This board
has an MMU-less M32R family processor, M32104.
http://www-wa0.personal-media.co.jp/pmc/archive/te/te_m32104_e.pdf

This board is one of the most popular M32R platforms, so we have ported
Linux/M32R to it.

Signed-off-by: Naoto Sugai <Sugai.Naoto@ak.MitsubishiElectric.co.jp>
Signed-off-by: Hirokazu Takata <takata@linux-m32r.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/m32r/Kconfig                        |  26 +-
 arch/m32r/boot/compressed/head.S         |   5 +
 arch/m32r/boot/setup.S                   |   9 +
 arch/m32r/kernel/Makefile                |   1 +
 arch/m32r/kernel/entry.S                 |  17 +-
 arch/m32r/kernel/io_m32104ut.c           | 298 ++++++++++++++
 arch/m32r/kernel/setup.c                 |   7 +-
 arch/m32r/kernel/setup_m32104ut.c        | 162 ++++++++
 arch/m32r/kernel/time.c                  |   4 +-
 arch/m32r/m32104ut/defconfig.m32104ut    | 657 +++++++++++++++++++++++++++++++
 arch/m32r/mm/cache.c                     |  10 +
 include/asm-m32r/assembler.h             |  10 +-
 include/asm-m32r/cacheflush.h            |   2 +-
 include/asm-m32r/irq.h                   |  16 +
 include/asm-m32r/m32102.h                |  31 +-
 include/asm-m32r/m32104ut/m32104ut_pld.h | 163 ++++++++
 include/asm-m32r/m32r.h                  |   6 +-
 include/asm-m32r/system.h                |  12 +-
 18 files changed, 1407 insertions(+), 29 deletions(-)
 create mode 100644 arch/m32r/kernel/io_m32104ut.c
 create mode 100644 arch/m32r/kernel/setup_m32104ut.c
 create mode 100644 arch/m32r/m32104ut/defconfig.m32104ut
 create mode 100644 include/asm-m32r/m32104ut/m32104ut_pld.h

diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index 4d100f3..fae67bb 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -81,6 +81,12 @@ config PLAT_MAPPI2
 config PLAT_MAPPI3
        bool "Mappi-III(M3A-2170)"
 
+config PLAT_M32104UT
+	bool "M32104UT"
+	help
+	  The M3T-M32104UT is an reference board based on uT-Engine
+	  specification.  This board has a M32104 chip.
+
 endchoice
 
 choice
@@ -93,6 +99,10 @@ config CHIP_M32700
 config CHIP_M32102
 	bool "M32102"
 
+config CHIP_M32104
+	bool "M32104"
+	depends on PLAT_M32104UT
+
 config CHIP_VDEC2
        bool "VDEC2"
 
@@ -115,7 +125,7 @@ config TLB_ENTRIES
 
 config ISA_M32R
         bool
-	depends on CHIP_M32102
+	depends on CHIP_M32102 || CHIP_M32104
 	default y
 
 config ISA_M32R2
@@ -140,6 +150,7 @@ config BUS_CLOCK
 	default "50000000" if PLAT_MAPPI3
 	default "50000000" if PLAT_M32700UT
 	default "50000000" if PLAT_OPSPUT
+	default "54000000" if PLAT_M32104UT
 	default "33333333" if PLAT_OAKS32R
 	default "20000000" if PLAT_MAPPI2
 
@@ -157,6 +168,7 @@ config MEMORY_START
 	default "08000000" if PLAT_USRV
 	default "08000000" if PLAT_M32700UT
 	default "08000000" if PLAT_OPSPUT
+	default "04000000" if PLAT_M32104UT
 	default "01000000" if PLAT_OAKS32R
 
 config MEMORY_SIZE
@@ -166,6 +178,7 @@ config MEMORY_SIZE
 	default "02000000" if PLAT_USRV
 	default "01000000" if PLAT_M32700UT
 	default "01000000" if PLAT_OPSPUT
+	default "01000000" if PLAT_M32104UT
 	default "00800000" if PLAT_OAKS32R
 
 config NOHIGHMEM
@@ -174,21 +187,22 @@ config NOHIGHMEM
 
 config ARCH_DISCONTIGMEM_ENABLE
 	bool "Internal RAM Support"
-	depends on CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP
+	depends on CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP || CHIP_M32104
 	default y
 
 source "mm/Kconfig"
 
 config IRAM_START
 	hex "Internal memory start address (hex)"
-	default "00f00000"
-	depends on (CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP) && DISCONTIGMEM
+	default "00f00000" if !CHIP_M32104
+	default "00700000" if CHIP_M32104
+	depends on (CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP || CHIP_M32104) && DISCONTIGMEM
 
 config IRAM_SIZE
 	hex "Internal memory size (hex)"
-	depends on (CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP) && DISCONTIGMEM
+	depends on (CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP || CHIP_M32104) && DISCONTIGMEM
 	default "00080000" if CHIP_M32700
-	default "00010000" if CHIP_M32102 || CHIP_OPSP
+	default "00010000" if CHIP_M32102 || CHIP_OPSP || CHIP_M32104
 	default "00008000" if CHIP_VDEC2
 
 #
diff --git a/arch/m32r/boot/compressed/head.S b/arch/m32r/boot/compressed/head.S
index 07cfd6a..234d8b1 100644
--- a/arch/m32r/boot/compressed/head.S
+++ b/arch/m32r/boot/compressed/head.S
@@ -143,6 +143,11 @@ startup:
 	ldi	r0, -2
 	ldi	r1, 0x0100	; invalidate
 	stb	r1, @r0
+#elif defined(CONFIG_CHIP_M32104)
+	/* Cache flush */
+	ldi	r0, -2
+	ldi	r1, 0x0700	; invalidate i-cache, copy back d-cache
+	sth	r1, @r0
 #else
 #error "put your cache flush function, please"
 #endif
diff --git a/arch/m32r/boot/setup.S b/arch/m32r/boot/setup.S
index 5d25643..742669f 100644
--- a/arch/m32r/boot/setup.S
+++ b/arch/m32r/boot/setup.S
@@ -80,6 +80,10 @@ ENTRY(boot)
 	ldi	r1, #0x101		; cache on (with invalidation)
 ;	ldi	r1, #0x00		; cache off
 	st	r1, @r0
+#elif defined(CONFIG_CHIP_M32104)
+	ldi	r0, #-4              ;LDIMM	(r0, M32R_MCCR)
+	ldi	r1, #0x703		; cache on (with invalidation)
+	st	r1, @r0
 #else
 #error unknown chip configuration
 #endif
@@ -115,10 +119,15 @@ mmu_on:
 	st      r1, @(MATM_offset,r0)		; Set MATM (T bit ON)
 	ld      r0, @(MATM_offset,r0)		; Check
 #else
+#if defined(CONFIG_CHIP_M32700)
 	seth	r0,#high(M32R_MCDCAR)
 	or3	r0,r0,#low(M32R_MCDCAR)
 	ld24	r1,#0x8080
 	st	r1,@r0
+#elif defined(CONFIG_CHIP_M32104)
+	LDIMM	(r2, eit_vector)		; set EVB(cr5)
+	mvtc    r2, cr5
+#endif
 #endif	/* CONFIG_MMU */
 	jmp	r13
 	nop
diff --git a/arch/m32r/kernel/Makefile b/arch/m32r/kernel/Makefile
index 6c6b6c3..5a2fa88 100644
--- a/arch/m32r/kernel/Makefile
+++ b/arch/m32r/kernel/Makefile
@@ -16,5 +16,6 @@ obj-$(CONFIG_PLAT_M32700UT)	+= setup_m32700ut.o io_m32700ut.o
 obj-$(CONFIG_PLAT_OPSPUT)	+= setup_opsput.o io_opsput.o
 obj-$(CONFIG_MODULES)		+= module.o
 obj-$(CONFIG_PLAT_OAKS32R)	+= setup_oaks32r.o io_oaks32r.o
+obj-$(CONFIG_PLAT_M32104UT)	+= setup_m32104ut.o io_m32104ut.o
 
 EXTRA_AFLAGS	:= -traditional
diff --git a/arch/m32r/kernel/entry.S b/arch/m32r/kernel/entry.S
index f6d4a58..3871b65 100644
--- a/arch/m32r/kernel/entry.S
+++ b/arch/m32r/kernel/entry.S
@@ -315,7 +315,7 @@ ENTRY(ei_handler)
 	mv	r1, sp			; arg1(regs)
 #if defined(CONFIG_CHIP_VDEC2) || defined(CONFIG_CHIP_XNUX2) \
 	|| defined(CONFIG_CHIP_M32700) || defined(CONFIG_CHIP_M32102) \
-	|| defined(CONFIG_CHIP_OPSP)
+	|| defined(CONFIG_CHIP_OPSP) || defined(CONFIG_CHIP_M32104)
 
 ;    GET_ICU_STATUS;
 	seth	r0, #shigh(M32R_ICU_ISTS_ADDR)
@@ -541,7 +541,20 @@ check_int2:
 	bra	check_end
 	.fillinsn
 check_end:
-#endif  /* CONFIG_PLAT_OPSPUT */
+#elif defined(CONFIG_PLAT_M32104UT)
+	add3	r2, r0, #-(M32R_IRQ_INT1)       ; INT1# interrupt
+	bnez	r2, check_end
+	; read ICU status register of PLD
+	seth	r0, #high(PLD_ICUISTS)
+	or3	r0, r0, #low(PLD_ICUISTS)
+	lduh	r0, @r0
+	slli	r0, #21
+	srli	r0, #27                         ; ISN
+	addi	r0, #(M32104UT_PLD_IRQ_BASE)
+	bra	check_end
+	.fillinsn
+check_end:
+#endif  /* CONFIG_PLAT_M32104UT */
 	bl	do_IRQ
 #endif  /* CONFIG_SMP */
 	ld	r14, @sp+
diff --git a/arch/m32r/kernel/io_m32104ut.c b/arch/m32r/kernel/io_m32104ut.c
new file mode 100644
index 0000000..3df4215
--- /dev/null
+++ b/arch/m32r/kernel/io_m32104ut.c
@@ -0,0 +1,298 @@
+/*
+ *  linux/arch/m32r/kernel/io_m32104ut.c
+ *
+ *  Typical I/O routines for M32104UT board.
+ *
+ *  Copyright (c) 2001-2005  Hiroyuki Kondo, Hirokazu Takata,
+ *                           Hitoshi Yamamoto, Mamoru Sakugawa,
+ *                           Naoto Sugai, Hayato Fujiwara
+ */
+
+#include <linux/config.h>
+#include <asm/m32r.h>
+#include <asm/page.h>
+#include <asm/io.h>
+#include <asm/byteorder.h>
+
+#if defined(CONFIG_PCMCIA) && defined(CONFIG_M32R_CFC)
+#include <linux/types.h>
+
+#define M32R_PCC_IOMAP_SIZE 0x1000
+
+#define M32R_PCC_IOSTART0 0x1000
+#define M32R_PCC_IOEND0   (M32R_PCC_IOSTART0 + M32R_PCC_IOMAP_SIZE - 1)
+
+extern void pcc_ioread_byte(int, unsigned long, void *, size_t, size_t, int);
+extern void pcc_ioread_word(int, unsigned long, void *, size_t, size_t, int);
+extern void pcc_iowrite_byte(int, unsigned long, void *, size_t, size_t, int);
+extern void pcc_iowrite_word(int, unsigned long, void *, size_t, size_t, int);
+#endif /* CONFIG_PCMCIA && CONFIG_M32R_CFC */
+
+#define PORT2ADDR(port)  _port2addr(port)
+
+static inline void *_port2addr(unsigned long port)
+{
+	return (void *)(port + NONCACHE_OFFSET);
+}
+
+#if defined(CONFIG_IDE) && !defined(CONFIG_M32R_CFC)
+static inline void *__port2addr_ata(unsigned long port)
+{
+	static int	dummy_reg;
+
+	switch (port) {
+	case 0x1f0:	return (void *)0xac002000;
+	case 0x1f1:	return (void *)0xac012800;
+	case 0x1f2:	return (void *)0xac012002;
+	case 0x1f3:	return (void *)0xac012802;
+	case 0x1f4:	return (void *)0xac012004;
+	case 0x1f5:	return (void *)0xac012804;
+	case 0x1f6:	return (void *)0xac012006;
+	case 0x1f7:	return (void *)0xac012806;
+	case 0x3f6:	return (void *)0xac01200e;
+	default: 	return (void *)&dummy_reg;
+	}
+}
+#endif
+
+/*
+ * M32104T-LAN is located in the extended bus space
+ * from 0x01000000 to 0x01ffffff on physical address.
+ * The base address of LAN controller(LAN91C111) is 0x300.
+ */
+#define LAN_IOSTART	0x300
+#define LAN_IOEND	0x320
+static inline void *_port2addr_ne(unsigned long port)
+{
+	return (void *)(port + NONCACHE_OFFSET + 0x01000000);
+}
+
+static inline void delay(void)
+{
+	__asm__ __volatile__ ("push r0; \n\t pop r0;" : : :"memory");
+}
+
+/*
+ * NIC I/O function
+ */
+
+#define PORT2ADDR_NE(port)  _port2addr_ne(port)
+
+static inline unsigned char _ne_inb(void *portp)
+{
+	return *(volatile unsigned char *)portp;
+}
+
+static inline unsigned short _ne_inw(void *portp)
+{
+	return (unsigned short)le16_to_cpu(*(volatile unsigned short *)portp);
+}
+
+static inline void _ne_insb(void *portp, void *addr, unsigned long count)
+{
+	unsigned char *buf = (unsigned char *)addr;
+
+	while (count--)
+		*buf++ = _ne_inb(portp);
+}
+
+static inline void _ne_outb(unsigned char b, void *portp)
+{
+	*(volatile unsigned char *)portp = b;
+}
+
+static inline void _ne_outw(unsigned short w, void *portp)
+{
+	*(volatile unsigned short *)portp = cpu_to_le16(w);
+}
+
+unsigned char _inb(unsigned long port)
+{
+	if (port >= LAN_IOSTART && port < LAN_IOEND)
+		return _ne_inb(PORT2ADDR_NE(port));
+
+	return *(volatile unsigned char *)PORT2ADDR(port);
+}
+
+unsigned short _inw(unsigned long port)
+{
+	if (port >= LAN_IOSTART && port < LAN_IOEND)
+		return _ne_inw(PORT2ADDR_NE(port));
+
+	return *(volatile unsigned short *)PORT2ADDR(port);
+}
+
+unsigned long _inl(unsigned long port)
+{
+	return *(volatile unsigned long *)PORT2ADDR(port);
+}
+
+unsigned char _inb_p(unsigned long port)
+{
+	unsigned char val = _inb(port);	/* byte read via _inb() */
+	delay();			/* pause after the access */
+	return val;
+}
+
+unsigned short _inw_p(unsigned long port)
+{
+	unsigned short val = _inw(port);	/* 16-bit read via _inw() */
+	delay();				/* pause after the access */
+	return val;
+}
+
+unsigned long _inl_p(unsigned long port)
+{
+	unsigned long val = _inl(port);	/* 32-bit read via _inl() */
+	delay();			/* pause after the access */
+	return val;
+}
+
+void _outb(unsigned char b, unsigned long port)
+{
+	if (port >= LAN_IOSTART && port < LAN_IOEND)
+		_ne_outb(b, PORT2ADDR_NE(port));
+	else
+		*(volatile unsigned char *)PORT2ADDR(port) = b;
+}
+
+void _outw(unsigned short w, unsigned long port)
+{
+	if (port >= LAN_IOSTART && port < LAN_IOEND)
+		_ne_outw(w, PORT2ADDR_NE(port));
+	else
+		*(volatile unsigned short *)PORT2ADDR(port) = w;
+}
+
+void _outl(unsigned long l, unsigned long port)
+{
+	*(volatile unsigned long *)PORT2ADDR(port) = l;
+}
+
+void _outb_p(unsigned char b, unsigned long port)
+{
+	_outb(b, port);
+	delay();
+}
+
+void _outw_p(unsigned short w, unsigned long port)
+{
+	_outw(w, port);
+	delay();
+}
+
+void _outl_p(unsigned long l, unsigned long port)
+{
+	_outl(l, port);
+	delay();
+}
+
+void _insb(unsigned int port, void *addr, unsigned long count)
+{
+	if (port >= LAN_IOSTART && port < LAN_IOEND)
+		_ne_insb(PORT2ADDR_NE(port), addr, count);
+	else {
+		unsigned char *buf = addr;
+		unsigned char *portp = PORT2ADDR(port);
+		while (count--)
+			*buf++ = *(volatile unsigned char *)portp;
+	}
+}
+
+void _insw(unsigned int port, void *addr, unsigned long count)
+{
+	unsigned short *buf = addr;
+	unsigned short *portp;
+
+	if (port >= LAN_IOSTART && port < LAN_IOEND) {
+		/*
+		 * This portion is only used by smc91111.c to read data
+		 * from the DATA_REG. Do not swap the data.
+		 */
+		portp = PORT2ADDR_NE(port);
+		while (count--)
+			*buf++ = *(volatile unsigned short *)portp;
+#if defined(CONFIG_PCMCIA) && defined(CONFIG_M32R_CFC)
+	} else if (port >= M32R_PCC_IOSTART0 && port <= M32R_PCC_IOEND0) {
+		pcc_ioread_word(9, port, (void *)addr, sizeof(unsigned short),
+				count, 1);
+#endif
+#if defined(CONFIG_IDE) && !defined(CONFIG_M32R_CFC)
+	} else if ((port >= 0x1f0 && port <=0x1f7) || port == 0x3f6) {
+		portp = __port2addr_ata(port);
+		while (count--)
+			*buf++ = *(volatile unsigned short *)portp;
+#endif
+	} else {
+		portp = PORT2ADDR(port);
+		while (count--)
+			*buf++ = *(volatile unsigned short *)portp;
+	}
+}
+
+void _insl(unsigned int port, void *addr, unsigned long count)
+{
+	unsigned long *buf = addr;
+	unsigned long *portp;
+
+	portp = PORT2ADDR(port);
+	while (count--)
+		*buf++ = *(volatile unsigned long *)portp;
+}
+
+void _outsb(unsigned int port, const void *addr, unsigned long count)
+{
+	const unsigned char *buf = addr;
+	unsigned char *portp;
+
+	if (port >= LAN_IOSTART && port < LAN_IOEND) {
+		portp = PORT2ADDR_NE(port);
+		while (count--)
+			_ne_outb(*buf++, portp);
+	} else {
+		portp = PORT2ADDR(port);
+		while (count--)
+			*(volatile unsigned char *)portp = *buf++;
+	}
+}
+
+void _outsw(unsigned int port, const void *addr, unsigned long count)
+{
+	const unsigned short *buf = addr;
+	unsigned short *portp;
+
+	if (port >= LAN_IOSTART && port < LAN_IOEND) {
+		/*
+		 * This portion is only used by smc91111.c to write data
+		 * into the DATA_REG. Do not swap the data.
+		 */
+		portp = PORT2ADDR_NE(port);
+		while (count--)
+			*(volatile unsigned short *)portp = *buf++;
+#if defined(CONFIG_IDE) && !defined(CONFIG_M32R_CFC)
+	} else if ((port >= 0x1f0 && port <=0x1f7) || port == 0x3f6) {
+		portp = __port2addr_ata(port);
+		while (count--)
+			*(volatile unsigned short *)portp = *buf++;
+#endif
+#if defined(CONFIG_PCMCIA) && defined(CONFIG_M32R_CFC)
+	} else if (port >= M32R_PCC_IOSTART0 && port <= M32R_PCC_IOEND0) {
+		pcc_iowrite_word(9, port, (void *)addr, sizeof(unsigned short),
+				 count, 1);
+#endif
+	} else {
+		portp = PORT2ADDR(port);
+		while (count--)
+			*(volatile unsigned short *)portp = *buf++;
+	}
+}
+
+void _outsl(unsigned int port, const void *addr, unsigned long count)
+{
+	const unsigned long *buf = addr;	/* source words, consumed in order */
+	unsigned long *portp;	/* 32-bit port: same pointer type as _insl() */
+
+	portp = PORT2ADDR(port);	/* fixed address; never incremented */
+	while (count--)
+		*(volatile unsigned long *)portp = *buf++;
+}
diff --git a/arch/m32r/kernel/setup.c b/arch/m32r/kernel/setup.c
index f722ec8..c2e4dcc 100644
--- a/arch/m32r/kernel/setup.c
+++ b/arch/m32r/kernel/setup.c
@@ -320,6 +320,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 #elif defined(CONFIG_CHIP_MP)
 	seq_printf(m, "cpu family\t: M32R-MP\n"
 		"cache size\t: I-xxKB/D-xxKB\n");
+#elif  defined(CONFIG_CHIP_M32104)
+	seq_printf(m,"cpu family\t: M32104\n"
+		"cache size\t: I-8KB/D-8KB\n");
 #else
 	seq_printf(m, "cpu family\t: Unknown\n");
 #endif
@@ -340,6 +343,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 	seq_printf(m, "Machine\t\t: uServer\n");
 #elif defined(CONFIG_PLAT_OAKS32R)
 	seq_printf(m, "Machine\t\t: OAKS32R\n");
+#elif  defined(CONFIG_PLAT_M32104UT)
+	seq_printf(m, "Machine\t\t: M3T-M32104UT uT Engine board\n");
 #else
 	seq_printf(m, "Machine\t\t: Unknown\n");
 #endif
@@ -389,7 +394,7 @@ unsigned long cpu_initialized __initdata = 0;
  */
 #if defined(CONFIG_CHIP_VDEC2) || defined(CONFIG_CHIP_XNUX2)	\
 	|| defined(CONFIG_CHIP_M32700) || defined(CONFIG_CHIP_M32102) \
-	|| defined(CONFIG_CHIP_OPSP)
+	|| defined(CONFIG_CHIP_OPSP) || defined(CONFIG_CHIP_M32104)
 void __init cpu_init (void)
 {
 	int cpu_id = smp_processor_id();
diff --git a/arch/m32r/kernel/setup_m32104ut.c b/arch/m32r/kernel/setup_m32104ut.c
new file mode 100644
index 0000000..ab16c66
--- /dev/null
+++ b/arch/m32r/kernel/setup_m32104ut.c
@@ -0,0 +1,162 @@
+/*
+ *  linux/arch/m32r/kernel/setup_m32104ut.c
+ *
+ *  Setup routines for M32104UT Board
+ *
+ *  Copyright (c) 2002-2005  Hiroyuki Kondo, Hirokazu Takata,
+ *                           Hitoshi Yamamoto, Mamoru Sakugawa,
+ *                           Naoto Sugai, Hayato Fujiwara
+ */
+
+#include <linux/config.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/device.h>
+
+#include <asm/system.h>
+#include <asm/m32r.h>
+#include <asm/io.h>
+
+#define irq2port(x) (M32R_ICU_CR1_PORTL + (((x) - 1) * sizeof(unsigned long)))
+
+#ifndef CONFIG_SMP
+typedef struct {
+	unsigned long icucr;  /* ICU Control Register */
+} icu_data_t;
+#endif /* !CONFIG_SMP */
+
+icu_data_t icu_data[NR_IRQS];
+
+static void disable_m32104ut_irq(unsigned int irq)
+{
+	unsigned long port, data;
+
+	port = irq2port(irq);
+	data = icu_data[irq].icucr|M32R_ICUCR_ILEVEL7;
+	outl(data, port);
+}
+
+static void enable_m32104ut_irq(unsigned int irq)
+{
+	unsigned long port, data;
+
+	port = irq2port(irq);
+	data = icu_data[irq].icucr|M32R_ICUCR_IEN|M32R_ICUCR_ILEVEL6;
+	outl(data, port);
+}
+
+static void mask_and_ack_m32104ut(unsigned int irq)
+{
+	disable_m32104ut_irq(irq);
+}
+
+static void end_m32104ut_irq(unsigned int irq)
+{
+	enable_m32104ut_irq(irq);
+}
+
+static unsigned int startup_m32104ut_irq(unsigned int irq)
+{
+	enable_m32104ut_irq(irq);	/* startup is simply an enable */
+	return 0;			/* always returns 0 */
+}
+
+static void shutdown_m32104ut_irq(unsigned int irq)
+{
+	unsigned long port;
+
+	port = irq2port(irq);
+	outl(M32R_ICUCR_ILEVEL7, port);
+}
+
+static struct hw_interrupt_type m32104ut_irq_type =
+{
+	.typename = "M32104UT-IRQ",
+	.startup = startup_m32104ut_irq,
+	.shutdown = shutdown_m32104ut_irq,
+	.enable = enable_m32104ut_irq,
+	.disable = disable_m32104ut_irq,
+	.ack = mask_and_ack_m32104ut,
+	.end = end_m32104ut_irq
+};
+
+void __init init_IRQ(void)
+{
+	static int once = 0;
+
+	if (once)
+		return;
+	else
+		once++;
+
+#if defined(CONFIG_SMC91X)
+	/* INT#0: LAN controller on M32104UT-LAN (SMC91C111)*/
+	irq_desc[M32R_IRQ_INT0].status = IRQ_DISABLED;
+	irq_desc[M32R_IRQ_INT0].handler = &m32104ut_irq_type;
+	irq_desc[M32R_IRQ_INT0].action = 0;
+	irq_desc[M32R_IRQ_INT0].depth = 1;
+	icu_data[M32R_IRQ_INT0].icucr = M32R_ICUCR_IEN | M32R_ICUCR_ISMOD11; /* "H" level sense */
+	disable_m32104ut_irq(M32R_IRQ_INT0);
+#endif  /* CONFIG_SMC91X */
+
+	/* MFT2 : system timer */
+	irq_desc[M32R_IRQ_MFT2].status = IRQ_DISABLED;
+	irq_desc[M32R_IRQ_MFT2].handler = &m32104ut_irq_type;
+	irq_desc[M32R_IRQ_MFT2].action = 0;
+	irq_desc[M32R_IRQ_MFT2].depth = 1;
+	icu_data[M32R_IRQ_MFT2].icucr = M32R_ICUCR_IEN;
+	disable_m32104ut_irq(M32R_IRQ_MFT2);
+
+#ifdef CONFIG_SERIAL_M32R_SIO
+	/* SIO0_R : uart receive data */
+	irq_desc[M32R_IRQ_SIO0_R].status = IRQ_DISABLED;
+	irq_desc[M32R_IRQ_SIO0_R].handler = &m32104ut_irq_type;
+	irq_desc[M32R_IRQ_SIO0_R].action = 0;
+	irq_desc[M32R_IRQ_SIO0_R].depth = 1;
+	icu_data[M32R_IRQ_SIO0_R].icucr = M32R_ICUCR_IEN;
+	disable_m32104ut_irq(M32R_IRQ_SIO0_R);
+
+	/* SIO0_S : uart send data */
+	irq_desc[M32R_IRQ_SIO0_S].status = IRQ_DISABLED;
+	irq_desc[M32R_IRQ_SIO0_S].handler = &m32104ut_irq_type;
+	irq_desc[M32R_IRQ_SIO0_S].action = 0;
+	irq_desc[M32R_IRQ_SIO0_S].depth = 1;
+	icu_data[M32R_IRQ_SIO0_S].icucr = M32R_ICUCR_IEN;
+	disable_m32104ut_irq(M32R_IRQ_SIO0_S);
+#endif /* CONFIG_SERIAL_M32R_SIO */
+}
+
+#if defined(CONFIG_SMC91X)
+
+#define LAN_IOSTART     0x300
+#define LAN_IOEND       0x320
+static struct resource smc91x_resources[] = {
+	[0] = {	/* LAN controller register window */
+		.start  = (LAN_IOSTART),
+		.end    = (LAN_IOEND - 1),	/* .end is inclusive; LAN_IOEND is one past */
+		.flags  = IORESOURCE_MEM,
+	},
+	[1] = {	/* single interrupt line: INT0 */
+		.start  = M32R_IRQ_INT0,
+		.end    = M32R_IRQ_INT0,
+		.flags  = IORESOURCE_IRQ,
+	}
+};
+
+static struct platform_device smc91x_device = {
+	.name		= "smc91x",
+	.id		= 0,
+	.num_resources  = ARRAY_SIZE(smc91x_resources),
+	.resource       = smc91x_resources,
+};
+#endif
+
+static int __init platform_init(void)
+{
+#if defined(CONFIG_SMC91X)
+	platform_device_register(&smc91x_device);
+#endif
+	return 0;
+}
+arch_initcall(platform_init);
diff --git a/arch/m32r/kernel/time.c b/arch/m32r/kernel/time.c
index 2ebce20..b8e68b5 100644
--- a/arch/m32r/kernel/time.c
+++ b/arch/m32r/kernel/time.c
@@ -57,7 +57,7 @@ static unsigned long do_gettimeoffset(void)
 
 #if defined(CONFIG_CHIP_M32102) || defined(CONFIG_CHIP_XNUX2) \
 	|| defined(CONFIG_CHIP_VDEC2) || defined(CONFIG_CHIP_M32700) \
-	|| defined(CONFIG_CHIP_OPSP)
+	|| defined(CONFIG_CHIP_OPSP) || defined(CONFIG_CHIP_M32104)
 #ifndef CONFIG_SMP
 
 	unsigned long count;
@@ -268,7 +268,7 @@ void __init time_init(void)
 
 #if defined(CONFIG_CHIP_M32102) || defined(CONFIG_CHIP_XNUX2) \
 	|| defined(CONFIG_CHIP_VDEC2) || defined(CONFIG_CHIP_M32700) \
-	|| defined(CONFIG_CHIP_OPSP)
+	|| defined(CONFIG_CHIP_OPSP) || defined(CONFIG_CHIP_M32104)
 
 	/* M32102 MFT setup */
 	setup_irq(M32R_IRQ_MFT2, &irq0);
diff --git a/arch/m32r/m32104ut/defconfig.m32104ut b/arch/m32r/m32104ut/defconfig.m32104ut
new file mode 100644
index 0000000..454de33
--- /dev/null
+++ b/arch/m32r/m32104ut/defconfig.m32104ut
@@ -0,0 +1,657 @@
+#
+# Automatically generated make config: don't edit
+# Linux kernel version: 2.6.14
+# Wed Nov  9 16:04:51 2005
+#
+CONFIG_M32R=y
+# CONFIG_UID16 is not set
+CONFIG_GENERIC_ISA_DMA=y
+CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_IRQ_PROBE=y
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_CLEAN_COMPILE=y
+CONFIG_BROKEN_ON_SMP=y
+CONFIG_INIT_ENV_ARG_LIMIT=32
+
+#
+# General setup
+#
+CONFIG_LOCALVERSION=""
+CONFIG_LOCALVERSION_AUTO=y
+# CONFIG_POSIX_MQUEUE is not set
+# CONFIG_BSD_PROCESS_ACCT is not set
+CONFIG_SYSCTL=y
+# CONFIG_AUDIT is not set
+CONFIG_HOTPLUG=y
+# CONFIG_KOBJECT_UEVENT is not set
+# CONFIG_IKCONFIG is not set
+CONFIG_INITRAMFS_SOURCE=""
+CONFIG_EMBEDDED=y
+# CONFIG_KALLSYMS is not set
+CONFIG_PRINTK=y
+CONFIG_BUG=y
+CONFIG_BASE_FULL=y
+# CONFIG_FUTEX is not set
+# CONFIG_EPOLL is not set
+# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
+CONFIG_CC_ALIGN_FUNCTIONS=0
+CONFIG_CC_ALIGN_LABELS=0
+CONFIG_CC_ALIGN_LOOPS=0
+CONFIG_CC_ALIGN_JUMPS=0
+CONFIG_TINY_SHMEM=y
+CONFIG_BASE_SMALL=0
+
+#
+# Loadable module support
+#
+# CONFIG_MODULES is not set
+
+#
+# Processor type and features
+#
+# CONFIG_PLAT_MAPPI is not set
+# CONFIG_PLAT_USRV is not set
+# CONFIG_PLAT_M32700UT is not set
+# CONFIG_PLAT_OPSPUT is not set
+# CONFIG_PLAT_OAKS32R is not set
+# CONFIG_PLAT_MAPPI2 is not set
+# CONFIG_PLAT_MAPPI3 is not set
+CONFIG_PLAT_M32104UT=y
+# CONFIG_CHIP_M32700 is not set
+# CONFIG_CHIP_M32102 is not set
+CONFIG_CHIP_M32104=y
+# CONFIG_CHIP_VDEC2 is not set
+# CONFIG_CHIP_OPSP is not set
+CONFIG_ISA_M32R=y
+CONFIG_BUS_CLOCK=54000000
+CONFIG_TIMER_DIVIDE=128
+# CONFIG_CPU_LITTLE_ENDIAN is not set
+CONFIG_MEMORY_START=04000000
+CONFIG_MEMORY_SIZE=01000000
+CONFIG_NOHIGHMEM=y
+# CONFIG_ARCH_DISCONTIGMEM_ENABLE is not set
+CONFIG_SELECT_MEMORY_MODEL=y
+CONFIG_FLATMEM_MANUAL=y
+# CONFIG_DISCONTIGMEM_MANUAL is not set
+# CONFIG_SPARSEMEM_MANUAL is not set
+CONFIG_FLATMEM=y
+CONFIG_FLAT_NODE_MEM_MAP=y
+# CONFIG_SPARSEMEM_STATIC is not set
+CONFIG_RWSEM_GENERIC_SPINLOCK=y
+# CONFIG_RWSEM_XCHGADD_ALGORITHM is not set
+CONFIG_GENERIC_CALIBRATE_DELAY=y
+# CONFIG_PREEMPT is not set
+# CONFIG_SMP is not set
+
+#
+# Bus options (PCI, PCMCIA, EISA, MCA, ISA)
+#
+# CONFIG_ISA is not set
+
+#
+# PCCARD (PCMCIA/CardBus) support
+#
+CONFIG_PCCARD=y
+# CONFIG_PCMCIA_DEBUG is not set
+CONFIG_PCMCIA=y
+CONFIG_PCMCIA_LOAD_CIS=y
+CONFIG_PCMCIA_IOCTL=y
+
+#
+# PC-card bridges
+#
+
+#
+# PCI Hotplug Support
+#
+
+#
+# Executable file formats
+#
+CONFIG_BINFMT_FLAT=y
+# CONFIG_BINFMT_ZFLAT is not set
+# CONFIG_BINFMT_SHARED_FLAT is not set
+# CONFIG_BINFMT_MISC is not set
+
+#
+# Networking
+#
+CONFIG_NET=y
+
+#
+# Networking options
+#
+# CONFIG_PACKET is not set
+CONFIG_UNIX=y
+# CONFIG_NET_KEY is not set
+CONFIG_INET=y
+# CONFIG_IP_MULTICAST is not set
+# CONFIG_IP_ADVANCED_ROUTER is not set
+CONFIG_IP_FIB_HASH=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+# CONFIG_IP_PNP_BOOTP is not set
+# CONFIG_IP_PNP_RARP is not set
+# CONFIG_NET_IPIP is not set
+# CONFIG_NET_IPGRE is not set
+# CONFIG_ARPD is not set
+# CONFIG_SYN_COOKIES is not set
+# CONFIG_INET_AH is not set
+# CONFIG_INET_ESP is not set
+# CONFIG_INET_IPCOMP is not set
+# CONFIG_INET_TUNNEL is not set
+CONFIG_INET_DIAG=y
+CONFIG_INET_TCP_DIAG=y
+# CONFIG_TCP_CONG_ADVANCED is not set
+CONFIG_TCP_CONG_BIC=y
+# CONFIG_IPV6 is not set
+# CONFIG_NETFILTER is not set
+
+#
+# DCCP Configuration (EXPERIMENTAL)
+#
+# CONFIG_IP_DCCP is not set
+
+#
+# SCTP Configuration (EXPERIMENTAL)
+#
+# CONFIG_IP_SCTP is not set
+# CONFIG_ATM is not set
+# CONFIG_BRIDGE is not set
+# CONFIG_VLAN_8021Q is not set
+# CONFIG_DECNET is not set
+# CONFIG_LLC2 is not set
+# CONFIG_IPX is not set
+# CONFIG_ATALK is not set
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_NET_DIVERT is not set
+# CONFIG_ECONET is not set
+# CONFIG_WAN_ROUTER is not set
+# CONFIG_NET_SCHED is not set
+# CONFIG_NET_CLS_ROUTE is not set
+
+#
+# Network testing
+#
+# CONFIG_NET_PKTGEN is not set
+# CONFIG_HAMRADIO is not set
+# CONFIG_IRDA is not set
+# CONFIG_BT is not set
+# CONFIG_IEEE80211 is not set
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_STANDALONE=y
+CONFIG_PREVENT_FIRMWARE_BUILD=y
+CONFIG_FW_LOADER=y
+# CONFIG_DEBUG_DRIVER is not set
+
+#
+# Connector - unified userspace <-> kernelspace linker
+#
+# CONFIG_CONNECTOR is not set
+
+#
+# Memory Technology Devices (MTD)
+#
+# CONFIG_MTD is not set
+
+#
+# Parallel port support
+#
+# CONFIG_PARPORT is not set
+
+#
+# Plug and Play support
+#
+
+#
+# Block devices
+#
+# CONFIG_BLK_DEV_COW_COMMON is not set
+CONFIG_BLK_DEV_LOOP=y
+# CONFIG_BLK_DEV_CRYPTOLOOP is not set
+CONFIG_BLK_DEV_NBD=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_COUNT=16
+CONFIG_BLK_DEV_RAM_SIZE=4096
+CONFIG_BLK_DEV_INITRD=y
+# CONFIG_CDROM_PKTCDVD is not set
+
+#
+# IO Schedulers
+#
+CONFIG_IOSCHED_NOOP=y
+# CONFIG_IOSCHED_AS is not set
+# CONFIG_IOSCHED_DEADLINE is not set
+# CONFIG_IOSCHED_CFQ is not set
+# CONFIG_ATA_OVER_ETH is not set
+
+#
+# ATA/ATAPI/MFM/RLL support
+#
+# CONFIG_IDE is not set
+
+#
+# SCSI device support
+#
+# CONFIG_RAID_ATTRS is not set
+# CONFIG_SCSI is not set
+
+#
+# Multi-device support (RAID and LVM)
+#
+# CONFIG_MD is not set
+
+#
+# Fusion MPT device support
+#
+# CONFIG_FUSION is not set
+
+#
+# IEEE 1394 (FireWire) support
+#
+
+#
+# I2O device support
+#
+
+#
+# Network device support
+#
+CONFIG_NETDEVICES=y
+CONFIG_DUMMY=y
+# CONFIG_BONDING is not set
+# CONFIG_EQUALIZER is not set
+# CONFIG_TUN is not set
+
+#
+# PHY device support
+#
+# CONFIG_PHYLIB is not set
+
+#
+# Ethernet (10 or 100Mbit)
+#
+CONFIG_NET_ETHERNET=y
+CONFIG_MII=y
+CONFIG_SMC91X=y
+# CONFIG_NE2000 is not set
+
+#
+# Ethernet (1000 Mbit)
+#
+
+#
+# Ethernet (10000 Mbit)
+#
+
+#
+# Token Ring devices
+#
+
+#
+# Wireless LAN (non-hamradio)
+#
+# CONFIG_NET_RADIO is not set
+
+#
+# PCMCIA network device support
+#
+# CONFIG_NET_PCMCIA is not set
+
+#
+# Wan interfaces
+#
+# CONFIG_WAN is not set
+# CONFIG_PPP is not set
+# CONFIG_SLIP is not set
+# CONFIG_SHAPER is not set
+# CONFIG_NETCONSOLE is not set
+# CONFIG_NETPOLL is not set
+# CONFIG_NET_POLL_CONTROLLER is not set
+
+#
+# ISDN subsystem
+#
+# CONFIG_ISDN is not set
+
+#
+# Telephony Support
+#
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+# CONFIG_INPUT is not set
+
+#
+# Hardware I/O ports
+#
+# CONFIG_SERIO is not set
+# CONFIG_GAMEPORT is not set
+
+#
+# Character devices
+#
+# CONFIG_VT is not set
+# CONFIG_SERIAL_NONSTANDARD is not set
+
+#
+# Serial drivers
+#
+# CONFIG_SERIAL_8250 is not set
+
+#
+# Non-8250 serial port support
+#
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+CONFIG_SERIAL_M32R_SIO=y
+CONFIG_SERIAL_M32R_SIO_CONSOLE=y
+CONFIG_UNIX98_PTYS=y
+CONFIG_LEGACY_PTYS=y
+CONFIG_LEGACY_PTY_COUNT=256
+
+#
+# IPMI
+#
+# CONFIG_IPMI_HANDLER is not set
+
+#
+# Watchdog Cards
+#
+CONFIG_WATCHDOG=y
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+
+#
+# Watchdog Device Drivers
+#
+CONFIG_SOFT_WATCHDOG=y
+# CONFIG_RTC is not set
+# CONFIG_DTLK is not set
+# CONFIG_R3964 is not set
+
+#
+# Ftape, the floppy tape device driver
+#
+
+#
+# PCMCIA character devices
+#
+# CONFIG_SYNCLINK_CS is not set
+# CONFIG_RAW_DRIVER is not set
+
+#
+# TPM devices
+#
+
+#
+# I2C support
+#
+# CONFIG_I2C is not set
+
+#
+# Dallas's 1-wire bus
+#
+# CONFIG_W1 is not set
+
+#
+# Hardware Monitoring support
+#
+# CONFIG_HWMON is not set
+# CONFIG_HWMON_VID is not set
+
+#
+# Misc devices
+#
+
+#
+# Multimedia Capabilities Port drivers
+#
+
+#
+# Multimedia devices
+#
+# CONFIG_VIDEO_DEV is not set
+
+#
+# Digital Video Broadcasting Devices
+#
+# CONFIG_DVB is not set
+
+#
+# Graphics support
+#
+# CONFIG_FB is not set
+
+#
+# Sound
+#
+# CONFIG_SOUND is not set
+
+#
+# USB support
+#
+# CONFIG_USB_ARCH_HAS_HCD is not set
+# CONFIG_USB_ARCH_HAS_OHCI is not set
+
+#
+# USB Gadget Support
+#
+# CONFIG_USB_GADGET is not set
+
+#
+# MMC/SD Card support
+#
+# CONFIG_MMC is not set
+
+#
+# InfiniBand support
+#
+
+#
+# SN Devices
+#
+
+#
+# File systems
+#
+CONFIG_EXT2_FS=y
+# CONFIG_EXT2_FS_XATTR is not set
+# CONFIG_EXT2_FS_XIP is not set
+CONFIG_EXT3_FS=y
+CONFIG_EXT3_FS_XATTR=y
+CONFIG_EXT3_FS_POSIX_ACL=y
+# CONFIG_EXT3_FS_SECURITY is not set
+CONFIG_JBD=y
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FS_MBCACHE=y
+# CONFIG_REISERFS_FS is not set
+# CONFIG_JFS_FS is not set
+CONFIG_FS_POSIX_ACL=y
+# CONFIG_XFS_FS is not set
+# CONFIG_MINIX_FS is not set
+# CONFIG_ROMFS_FS is not set
+# CONFIG_INOTIFY is not set
+# CONFIG_QUOTA is not set
+CONFIG_DNOTIFY=y
+# CONFIG_AUTOFS_FS is not set
+# CONFIG_AUTOFS4_FS is not set
+# CONFIG_FUSE_FS is not set
+
+#
+# CD-ROM/DVD Filesystems
+#
+# CONFIG_ISO9660_FS is not set
+# CONFIG_UDF_FS is not set
+
+#
+# DOS/FAT/NT Filesystems
+#
+CONFIG_FAT_FS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_FAT_DEFAULT_CODEPAGE=932
+CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
+# CONFIG_NTFS_FS is not set
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+CONFIG_SYSFS=y
+CONFIG_TMPFS=y
+# CONFIG_HUGETLB_PAGE is not set
+CONFIG_RAMFS=y
+# CONFIG_RELAYFS_FS is not set
+
+#
+# Miscellaneous filesystems
+#
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFFS_FS is not set
+# CONFIG_HFS_FS is not set
+# CONFIG_HFSPLUS_FS is not set
+# CONFIG_BEFS_FS is not set
+# CONFIG_BFS_FS is not set
+# CONFIG_EFS_FS is not set
+CONFIG_CRAMFS=y
+# CONFIG_VXFS_FS is not set
+# CONFIG_HPFS_FS is not set
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_SYSV_FS is not set
+# CONFIG_UFS_FS is not set
+
+#
+# Network File Systems
+#
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3=y
+# CONFIG_NFS_V3_ACL is not set
+# CONFIG_NFS_V4 is not set
+# CONFIG_NFS_DIRECTIO is not set
+# CONFIG_NFSD is not set
+CONFIG_ROOT_NFS=y
+CONFIG_LOCKD=y
+CONFIG_LOCKD_V4=y
+CONFIG_NFS_COMMON=y
+CONFIG_SUNRPC=y
+# CONFIG_RPCSEC_GSS_KRB5 is not set
+# CONFIG_RPCSEC_GSS_SPKM3 is not set
+# CONFIG_SMB_FS is not set
+# CONFIG_CIFS is not set
+# CONFIG_NCP_FS is not set
+# CONFIG_CODA_FS is not set
+# CONFIG_AFS_FS is not set
+# CONFIG_9P_FS is not set
+
+#
+# Partition Types
+#
+# CONFIG_PARTITION_ADVANCED is not set
+CONFIG_MSDOS_PARTITION=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS=y
+CONFIG_NLS_DEFAULT="iso8859-1"
+CONFIG_NLS_CODEPAGE_437=y
+# CONFIG_NLS_CODEPAGE_737 is not set
+# CONFIG_NLS_CODEPAGE_775 is not set
+# CONFIG_NLS_CODEPAGE_850 is not set
+# CONFIG_NLS_CODEPAGE_852 is not set
+# CONFIG_NLS_CODEPAGE_855 is not set
+# CONFIG_NLS_CODEPAGE_857 is not set
+# CONFIG_NLS_CODEPAGE_860 is not set
+# CONFIG_NLS_CODEPAGE_861 is not set
+# CONFIG_NLS_CODEPAGE_862 is not set
+# CONFIG_NLS_CODEPAGE_863 is not set
+# CONFIG_NLS_CODEPAGE_864 is not set
+# CONFIG_NLS_CODEPAGE_865 is not set
+# CONFIG_NLS_CODEPAGE_866 is not set
+# CONFIG_NLS_CODEPAGE_869 is not set
+# CONFIG_NLS_CODEPAGE_936 is not set
+# CONFIG_NLS_CODEPAGE_950 is not set
+CONFIG_NLS_CODEPAGE_932=y
+# CONFIG_NLS_CODEPAGE_949 is not set
+# CONFIG_NLS_CODEPAGE_874 is not set
+# CONFIG_NLS_ISO8859_8 is not set
+# CONFIG_NLS_CODEPAGE_1250 is not set
+# CONFIG_NLS_CODEPAGE_1251 is not set
+# CONFIG_NLS_ASCII is not set
+# CONFIG_NLS_ISO8859_1 is not set
+# CONFIG_NLS_ISO8859_2 is not set
+# CONFIG_NLS_ISO8859_3 is not set
+# CONFIG_NLS_ISO8859_4 is not set
+# CONFIG_NLS_ISO8859_5 is not set
+# CONFIG_NLS_ISO8859_6 is not set
+# CONFIG_NLS_ISO8859_7 is not set
+# CONFIG_NLS_ISO8859_9 is not set
+# CONFIG_NLS_ISO8859_13 is not set
+# CONFIG_NLS_ISO8859_14 is not set
+# CONFIG_NLS_ISO8859_15 is not set
+# CONFIG_NLS_KOI8_R is not set
+# CONFIG_NLS_KOI8_U is not set
+CONFIG_NLS_UTF8=y
+
+#
+# Profiling support
+#
+# CONFIG_PROFILING is not set
+
+#
+# Kernel hacking
+#
+# CONFIG_PRINTK_TIME is not set
+CONFIG_DEBUG_KERNEL=y
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_LOG_BUF_SHIFT=14
+CONFIG_DETECT_SOFTLOCKUP=y
+# CONFIG_SCHEDSTATS is not set
+# CONFIG_DEBUG_SLAB is not set
+# CONFIG_DEBUG_SPINLOCK is not set
+# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
+# CONFIG_DEBUG_KOBJECT is not set
+# CONFIG_DEBUG_BUGVERBOSE is not set
+CONFIG_DEBUG_INFO=y
+# CONFIG_DEBUG_FS is not set
+# CONFIG_FRAME_POINTER is not set
+# CONFIG_DEBUG_STACKOVERFLOW is not set
+# CONFIG_DEBUG_STACK_USAGE is not set
+
+#
+# Security options
+#
+# CONFIG_KEYS is not set
+# CONFIG_SECURITY is not set
+
+#
+# Cryptographic options
+#
+# CONFIG_CRYPTO is not set
+
+#
+# Hardware crypto devices
+#
+
+#
+# Library routines
+#
+# CONFIG_CRC_CCITT is not set
+# CONFIG_CRC16 is not set
+CONFIG_CRC32=y
+CONFIG_LIBCRC32C=y
+CONFIG_ZLIB_INFLATE=y
diff --git a/arch/m32r/mm/cache.c b/arch/m32r/mm/cache.c
index 31b0789..c6f72a6 100644
--- a/arch/m32r/mm/cache.c
+++ b/arch/m32r/mm/cache.c
@@ -26,6 +26,16 @@
 #define MCCR		((volatile unsigned char*)0xfffffffe)
 #define MCCR_IIV	(1UL << 0)	/* I-cache invalidate */
 #define MCCR_ICACHE_INV		MCCR_IIV
+#elif defined(CONFIG_CHIP_M32104)
+#define MCCR		((volatile unsigned long*)0xfffffffc)
+#define MCCR_IIV	(1UL << 8)	/* I-cache invalidate */
+#define MCCR_DIV	(1UL << 9)	/* D-cache invalidate */
+#define MCCR_DCB	(1UL << 10)	/* D-cache copy back */
+#define MCCR_ICM	(1UL << 0)	/* I-cache mode [0:off,1:on] */
+#define MCCR_DCM	(1UL << 1)	/* D-cache mode [0:off,1:on] */
+#define MCCR_ICACHE_INV		MCCR_IIV
+#define MCCR_DCACHE_CB		MCCR_DCB
+#define MCCR_DCACHE_CBINV	(MCCR_DIV|MCCR_DCB)
 #endif /* CONFIG_CHIP_XNUX2 || CONFIG_CHIP_M32700 */
 
 #ifndef MCCR
diff --git a/include/asm-m32r/assembler.h b/include/asm-m32r/assembler.h
index e1dff9d..b7f4d8a 100644
--- a/include/asm-m32r/assembler.h
+++ b/include/asm-m32r/assembler.h
@@ -52,7 +52,7 @@
 	or3	\reg, \reg, #low(\x)
 	.endm
 
-#if !defined(CONFIG_CHIP_M32102)
+#if !(defined(CONFIG_CHIP_M32102) || defined(CONFIG_CHIP_M32104))
 #define STI(reg) STI_M reg
 	.macro STI_M reg
 	setpsw  #0x40	    ->	nop
@@ -64,7 +64,7 @@
 	clrpsw  #0x40	    ->	nop
 	; WORKAROUND: "-> nop" is a workaround for the M32700(TS1).
 	.endm
-#else	/* CONFIG_CHIP_M32102 */
+#else	/* CONFIG_CHIP_M32102 || CONFIG_CHIP_M32104 */
 #define STI(reg) STI_M reg
 	.macro STI_M reg
 	mvfc	\reg, psw
@@ -191,12 +191,12 @@
 	and  \reg, sp
 	.endm
 
-#if !defined(CONFIG_CHIP_M32102)
+#if !(defined(CONFIG_CHIP_M32102) || defined(CONFIG_CHIP_M32104))
 	.macro	SWITCH_TO_KERNEL_STACK
 	; switch to kernel stack (spi)
 	clrpsw	#0x80	    ->	nop
 	.endm
-#else	/* CONFIG_CHIP_M32102 */
+#else	/* CONFIG_CHIP_M32102 || CONFIG_CHIP_M32104 */
 	.macro	SWITCH_TO_KERNEL_STACK
 	push	r0		; save r0 for working
 	mvfc	r0, psw
@@ -218,7 +218,7 @@
 	.fillinsn
 2:
 	.endm
-#endif	/* CONFIG_CHIP_M32102 */
+#endif	/* CONFIG_CHIP_M32102 || CONFIG_CHIP_M32104 */
 
 #endif	/* __ASSEMBLY__ */
 
diff --git a/include/asm-m32r/cacheflush.h b/include/asm-m32r/cacheflush.h
index 46fc4c3..e57427b 100644
--- a/include/asm-m32r/cacheflush.h
+++ b/include/asm-m32r/cacheflush.h
@@ -7,7 +7,7 @@
 extern void _flush_cache_all(void);
 extern void _flush_cache_copyback_all(void);
 
-#if defined(CONFIG_CHIP_M32700) || defined(CONFIG_CHIP_OPSP)
+#if defined(CONFIG_CHIP_M32700) || defined(CONFIG_CHIP_OPSP) || defined(CONFIG_CHIP_M32104)
 #define flush_cache_all()			do { } while (0)
 #define flush_cache_mm(mm)			do { } while (0)
 #define flush_cache_range(vma, start, end)	do { } while (0)
diff --git a/include/asm-m32r/irq.h b/include/asm-m32r/irq.h
index 8ed7796..ca94395 100644
--- a/include/asm-m32r/irq.h
+++ b/include/asm-m32r/irq.h
@@ -65,6 +65,22 @@
 #define NR_IRQS \
 	(OPSPUT_NUM_CPU_IRQ + OPSPUT_NUM_PLD_IRQ \
 	+ OPSPUT_NUM_LCD_PLD_IRQ + OPSPUT_NUM_LAN_PLD_IRQ)
+
+#elif defined(CONFIG_PLAT_M32104UT)
+/*
+ * IRQ definitions for M32104UT
+ *  M32104 Chip: 64 interrupts
+ *  ICU of M32104UT-on-board PLD: 32 interrupts cascaded to INT1# chip pin
+ */
+#define	M32104UT_NUM_CPU_IRQ	(64)
+#define M32104UT_NUM_PLD_IRQ	(32)
+#define M32104UT_IRQ_BASE	0
+#define M32104UT_CPU_IRQ_BASE	M32104UT_IRQ_BASE
+#define M32104UT_PLD_IRQ_BASE	(M32104UT_CPU_IRQ_BASE + M32104UT_NUM_CPU_IRQ)
+
+#define NR_IRQS	\
+    (M32104UT_NUM_CPU_IRQ + M32104UT_NUM_PLD_IRQ)
+
 #else
 #define NR_IRQS	64
 #endif
diff --git a/include/asm-m32r/m32102.h b/include/asm-m32r/m32102.h
index cb98101..0bd0a3f 100644
--- a/include/asm-m32r/m32102.h
+++ b/include/asm-m32r/m32102.h
@@ -11,7 +11,11 @@
 /*======================================================================*
  * Special Function Register
  *======================================================================*/
+#if !defined(CONFIG_CHIP_M32104)
 #define M32R_SFR_OFFSET  (0x00E00000)  /* 0x00E00000-0x00EFFFFF 1[MB] */
+#else
+#define M32R_SFR_OFFSET  (0x00700000)  /* 0x00700000-0x007FFFFF 1[MB] */
+#endif
 
 /*
  * Clock and Power Management registers.
@@ -100,7 +104,7 @@
 #define M32R_MFT5RLD_PORTL     (0x0C+M32R_MFT5_OFFSET)  /* MFT4 reload */
 #define M32R_MFT5CMPRLD_PORTL  (0x10+M32R_MFT5_OFFSET)  /* MFT4 compare reload */
 
-#ifdef CONFIG_CHIP_M32700
+#if defined(CONFIG_CHIP_M32700) || defined(CONFIG_CHIP_M32104)
 #define M32R_MFTCR_MFT0MSK  (1UL<<31)  /* b0 */
 #define M32R_MFTCR_MFT1MSK  (1UL<<30)  /* b1 */
 #define M32R_MFTCR_MFT2MSK  (1UL<<29)  /* b2 */
@@ -113,7 +117,7 @@
 #define M32R_MFTCR_MFT3EN   (1UL<<20)  /* b11 */
 #define M32R_MFTCR_MFT4EN   (1UL<<19)  /* b12 */
 #define M32R_MFTCR_MFT5EN   (1UL<<18)  /* b13 */
-#else	/* not CONFIG_CHIP_M32700 */
+#else	/* not CONFIG_CHIP_M32700 && not CONFIG_CHIP_M32104 */
 #define M32R_MFTCR_MFT0MSK  (1UL<<15)  /* b16 */
 #define M32R_MFTCR_MFT1MSK  (1UL<<14)  /* b17 */
 #define M32R_MFTCR_MFT2MSK  (1UL<<13)  /* b18 */
@@ -126,7 +130,7 @@
 #define M32R_MFTCR_MFT3EN   (1UL<<4)   /* b27 */
 #define M32R_MFTCR_MFT4EN   (1UL<<3)   /* b28 */
 #define M32R_MFTCR_MFT5EN   (1UL<<2)   /* b29 */
-#endif	/* not CONFIG_CHIP_M32700 */
+#endif	/* not CONFIG_CHIP_M32700 && not CONFIG_CHIP_M32104 */
 
 #define M32R_MFTMOD_CC_MASK    (1UL<<15)  /* b16 */
 #define M32R_MFTMOD_TCCR       (1UL<<13)  /* b18 */
@@ -241,8 +245,24 @@
 #define M32R_IRQ_MFT1    (17)  /* MFT1 */
 #define M32R_IRQ_MFT2    (18)  /* MFT2 */
 #define M32R_IRQ_MFT3    (19)  /* MFT3 */
-#define M32R_IRQ_MFT4    (20)  /* MFT4 */
-#define M32R_IRQ_MFT5    (21)  /* MFT5 */
+#ifdef CONFIG_CHIP_M32104
+#define M32R_IRQ_MFTX0   (24)  /* MFTX0 */
+#define M32R_IRQ_MFTX1   (25)  /* MFTX1 */
+#define M32R_IRQ_DMA0    (32)  /* DMA0 */
+#define M32R_IRQ_DMA1    (33)  /* DMA1 */
+#define M32R_IRQ_DMA2    (34)  /* DMA2 */
+#define M32R_IRQ_DMA3    (35)  /* DMA3 */
+#define M32R_IRQ_SIO0_R  (40)  /* SIO0 receive */
+#define M32R_IRQ_SIO0_S  (41)  /* SIO0 send    */
+#define M32R_IRQ_SIO1_R  (42)  /* SIO1 receive */
+#define M32R_IRQ_SIO1_S  (43)  /* SIO1 send    */
+#define M32R_IRQ_SIO2_R  (44)  /* SIO2 receive */
+#define M32R_IRQ_SIO2_S  (45)  /* SIO2 send    */
+#define M32R_IRQ_SIO3_R  (46)  /* SIO3 receive */
+#define M32R_IRQ_SIO3_S  (47)  /* SIO3 send    */
+#define M32R_IRQ_ADC     (56)  /* ADC */
+#define M32R_IRQ_PC      (57)  /* PC */
+#else /* ! M32104 */
 #define M32R_IRQ_DMA0    (32)  /* DMA0 */
 #define M32R_IRQ_DMA1    (33)  /* DMA1 */
 #define M32R_IRQ_SIO0_R  (48)  /* SIO0 send    */
@@ -255,6 +275,7 @@
 #define M32R_IRQ_SIO3_S  (55)  /* SIO3 receive */
 #define M32R_IRQ_SIO4_R  (56)  /* SIO4 send    */
 #define M32R_IRQ_SIO4_S  (57)  /* SIO4 receive */
+#endif /* ! M32104 */
 
 #ifdef CONFIG_SMP
 #define M32R_IRQ_IPI0    (56)
diff --git a/include/asm-m32r/m32104ut/m32104ut_pld.h b/include/asm-m32r/m32104ut/m32104ut_pld.h
new file mode 100644
index 0000000..a4eac20
--- /dev/null
+++ b/include/asm-m32r/m32104ut/m32104ut_pld.h
@@ -0,0 +1,163 @@
+/*
+ * include/asm/m32104ut/m32104ut_pld.h
+ *
+ * Definitions for Programmable Logic Device(PLD) on M32104UT board.
+ * Based on m32700ut_pld.h
+ *
+ * Copyright (c) 2002	Takeo Takahashi
+ * Copyright (c) 2005	Naoto Sugai
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file "COPYING" in the main directory of
+ * this archive for more details.
+ */
+
+#ifndef _M32104UT_M32104UT_PLD_H
+#define _M32104UT_M32104UT_PLD_H
+
+#include <linux/config.h>
+
+#if defined(CONFIG_PLAT_M32104UT)
+#define PLD_PLAT_BASE		0x02c00000
+#else
+#error "no platform configuration"
+#endif
+
+#ifndef __ASSEMBLY__
+/*
+ * C functions use non-cache address.
+ */
+#define PLD_BASE		(PLD_PLAT_BASE /* + NONCACHE_OFFSET */)
+#define __reg8			(volatile unsigned char *)
+#define __reg16			(volatile unsigned short *)
+#define __reg32			(volatile unsigned int *)
+#else
+#define PLD_BASE		(PLD_PLAT_BASE + NONCACHE_OFFSET)
+#define __reg8
+#define __reg16
+#define __reg32
+#endif	/* __ASSEMBLY__ */
+
+/* CFC */
+#define	PLD_CFRSTCR		__reg16(PLD_BASE + 0x0000)
+#define PLD_CFSTS		__reg16(PLD_BASE + 0x0002)
+#define PLD_CFIMASK		__reg16(PLD_BASE + 0x0004)
+#define PLD_CFBUFCR		__reg16(PLD_BASE + 0x0006)
+
+/* MMC */
+#define PLD_MMCCR		__reg16(PLD_BASE + 0x4000)
+#define PLD_MMCMOD		__reg16(PLD_BASE + 0x4002)
+#define PLD_MMCSTS		__reg16(PLD_BASE + 0x4006)
+#define PLD_MMCBAUR		__reg16(PLD_BASE + 0x400a)
+#define PLD_MMCCMDBCUT		__reg16(PLD_BASE + 0x400c)
+#define PLD_MMCCDTBCUT		__reg16(PLD_BASE + 0x400e)
+#define PLD_MMCDET		__reg16(PLD_BASE + 0x4010)
+#define PLD_MMCWP		__reg16(PLD_BASE + 0x4012)
+#define PLD_MMCWDATA		__reg16(PLD_BASE + 0x5000)
+#define PLD_MMCRDATA		__reg16(PLD_BASE + 0x6000)
+#define PLD_MMCCMDDATA		__reg16(PLD_BASE + 0x7000)
+#define PLD_MMCRSPDATA		__reg16(PLD_BASE + 0x7006)
+
+/* ICU
+ *  ICUISTS:	status register
+ *  ICUIREQ0: 	request register
+ *  ICUIREQ1: 	request register
+ *  ICUCR3:	control register for CFIREQ# interrupt
+ *  ICUCR4:	control register for CFC Card insert interrupt
+ *  ICUCR5:	control register for CFC Card eject interrupt
+ *  ICUCR6:	control register for external interrupt
+ *  ICUCR11:	control register for MMC Card insert/eject interrupt
+ *  ICUCR13:	control register for SC error interrupt
+ *  ICUCR14:	control register for SC receive interrupt
+ *  ICUCR15:	control register for SC send interrupt
+ */
+
+#define PLD_IRQ_INT0		(M32104UT_PLD_IRQ_BASE + 0)	/* None */
+#define PLD_IRQ_CFIREQ		(M32104UT_PLD_IRQ_BASE + 3)	/* CF IREQ */
+#define PLD_IRQ_CFC_INSERT	(M32104UT_PLD_IRQ_BASE + 4)	/* CF Insert */
+#define PLD_IRQ_CFC_EJECT	(M32104UT_PLD_IRQ_BASE + 5)	/* CF Eject */
+#define PLD_IRQ_EXINT		(M32104UT_PLD_IRQ_BASE + 6)	/* EXINT */
+#define PLD_IRQ_MMCCARD		(M32104UT_PLD_IRQ_BASE + 11)	/* MMC Insert/Eject */
+#define PLD_IRQ_SC_ERROR	(M32104UT_PLD_IRQ_BASE + 13)	/* SC error */
+#define PLD_IRQ_SC_RCV		(M32104UT_PLD_IRQ_BASE + 14)	/* SC receive */
+#define PLD_IRQ_SC_SND		(M32104UT_PLD_IRQ_BASE + 15)	/* SC send */
+
+#define PLD_ICUISTS		__reg16(PLD_BASE + 0x8002)
+#define PLD_ICUISTS_VECB_MASK	(0xf000)
+#define PLD_ICUISTS_VECB(x)	((x) & PLD_ICUISTS_VECB_MASK)
+#define PLD_ICUISTS_ISN_MASK	(0x07c0)
+#define PLD_ICUISTS_ISN(x)	((x) & PLD_ICUISTS_ISN_MASK)
+#define PLD_ICUCR3		__reg16(PLD_BASE + 0x8104)
+#define PLD_ICUCR4		__reg16(PLD_BASE + 0x8106)
+#define PLD_ICUCR5		__reg16(PLD_BASE + 0x8108)
+#define PLD_ICUCR6		__reg16(PLD_BASE + 0x810a)
+#define PLD_ICUCR11		__reg16(PLD_BASE + 0x8114)
+#define PLD_ICUCR13		__reg16(PLD_BASE + 0x8118)
+#define PLD_ICUCR14		__reg16(PLD_BASE + 0x811a)
+#define PLD_ICUCR15		__reg16(PLD_BASE + 0x811c)
+#define PLD_ICUCR_IEN		(0x1000)
+#define PLD_ICUCR_IREQ		(0x0100)
+#define PLD_ICUCR_ISMOD00	(0x0000)	/* Low edge */
+#define PLD_ICUCR_ISMOD01	(0x0010)	/* Low level */
+#define PLD_ICUCR_ISMOD02	(0x0020)	/* High edge */
+#define PLD_ICUCR_ISMOD03	(0x0030)	/* High level */
+#define PLD_ICUCR_ILEVEL0	(0x0000)
+#define PLD_ICUCR_ILEVEL1	(0x0001)
+#define PLD_ICUCR_ILEVEL2	(0x0002)
+#define PLD_ICUCR_ILEVEL3	(0x0003)
+#define PLD_ICUCR_ILEVEL4	(0x0004)
+#define PLD_ICUCR_ILEVEL5	(0x0005)
+#define PLD_ICUCR_ILEVEL6	(0x0006)
+#define PLD_ICUCR_ILEVEL7	(0x0007)
+
+/* Power Control of MMC and CF */
+#define PLD_CPCR		__reg16(PLD_BASE + 0x14000)
+#define PLD_CPCR_CDP		0x0001
+
+/* LED Control
+ *
+ * 1: DIP switch side
+ * 2: Reset switch side
+ */
+#define PLD_IOLEDCR		__reg16(PLD_BASE + 0x14002)
+#define PLD_IOLED_1_ON		0x001
+#define PLD_IOLED_1_OFF		0x000
+#define PLD_IOLED_2_ON		0x002
+#define PLD_IOLED_2_OFF		0x000
+
+/* DIP Switch
+ *  0: Write-protect of Flash Memory (0:protected, 1:non-protected)
+ *  1: -
+ *  2: -
+ *  3: -
+ */
+#define PLD_IOSWSTS		__reg16(PLD_BASE + 0x14004)
+#define	PLD_IOSWSTS_IOSW2	0x0200
+#define	PLD_IOSWSTS_IOSW1	0x0100
+#define	PLD_IOSWSTS_IOWP0	0x0001
+
+/* CRC */
+#define PLD_CRC7DATA		__reg16(PLD_BASE + 0x18000)
+#define PLD_CRC7INDATA		__reg16(PLD_BASE + 0x18002)
+#define PLD_CRC16DATA		__reg16(PLD_BASE + 0x18004)
+#define PLD_CRC16INDATA		__reg16(PLD_BASE + 0x18006)
+#define PLD_CRC16ADATA		__reg16(PLD_BASE + 0x18008)
+#define PLD_CRC16AINDATA	__reg16(PLD_BASE + 0x1800a)
+
+/* RTC */
+#define PLD_RTCCR		__reg16(PLD_BASE + 0x1c000)
+#define PLD_RTCBAUR		__reg16(PLD_BASE + 0x1c002)
+#define PLD_RTCWRDATA		__reg16(PLD_BASE + 0x1c004)
+#define PLD_RTCRDDATA		__reg16(PLD_BASE + 0x1c006)
+#define PLD_RTCRSTODT		__reg16(PLD_BASE + 0x1c008)
+
+/* SIM Card */
+#define PLD_SCCR		__reg16(PLD_BASE + 0x38000)
+#define PLD_SCMOD		__reg16(PLD_BASE + 0x38004)
+#define PLD_SCSTS		__reg16(PLD_BASE + 0x38006)
+#define PLD_SCINTCR		__reg16(PLD_BASE + 0x38008)
+#define PLD_SCBAUR		__reg16(PLD_BASE + 0x3800a)
+#define PLD_SCTXB		__reg16(PLD_BASE + 0x3800c)
+#define PLD_SCRXB		__reg16(PLD_BASE + 0x3800e)
+
+#endif	/* _M32104UT_M32104UT_PLD_H */
diff --git a/include/asm-m32r/m32r.h b/include/asm-m32r/m32r.h
index ec142be..f9bb48a 100644
--- a/include/asm-m32r/m32r.h
+++ b/include/asm-m32r/m32r.h
@@ -14,7 +14,7 @@
 #include <asm/m32r_mp_fpga.h>
 #elif defined(CONFIG_CHIP_VDEC2) || defined(CONFIG_CHIP_XNUX2) \
 	|| defined(CONFIG_CHIP_M32700) || defined(CONFIG_CHIP_M32102) \
-        || defined(CONFIG_CHIP_OPSP)
+        || defined(CONFIG_CHIP_OPSP) || defined(CONFIG_CHIP_M32104)
 #include <asm/m32102.h>
 #endif
 
@@ -43,6 +43,10 @@
 #include <asm/m32700ut/m32700ut_pld.h>
 #endif
 
+#if defined(CONFIG_PLAT_M32104UT)
+#include <asm/m32104ut/m32104ut_pld.h>
+#endif  /* CONFIG_PLAT_M32104UT */
+
 /*
  * M32R Register
  */
diff --git a/include/asm-m32r/system.h b/include/asm-m32r/system.h
index 5eee832..dcf619a 100644
--- a/include/asm-m32r/system.h
+++ b/include/asm-m32r/system.h
@@ -69,12 +69,12 @@
 } while(0)
 
 /* Interrupt Control */
-#if !defined(CONFIG_CHIP_M32102)
+#if !defined(CONFIG_CHIP_M32102) && !defined(CONFIG_CHIP_M32104)
 #define local_irq_enable() \
 	__asm__ __volatile__ ("setpsw #0x40 -> nop": : :"memory")
 #define local_irq_disable() \
 	__asm__ __volatile__ ("clrpsw #0x40 -> nop": : :"memory")
-#else	/* CONFIG_CHIP_M32102 */
+#else	/* CONFIG_CHIP_M32102 || CONFIG_CHIP_M32104 */
 static inline void local_irq_enable(void)
 {
 	unsigned long tmpreg;
@@ -96,7 +96,7 @@ static inline void local_irq_disable(void)
 		"mvtc	%0, psw	\n\t"
 	: "=&r" (tmpreg0), "=&r" (tmpreg1) : : "cbit", "memory");
 }
-#endif	/* CONFIG_CHIP_M32102 */
+#endif	/* CONFIG_CHIP_M32102 || CONFIG_CHIP_M32104 */
 
 #define local_save_flags(x) \
 	__asm__ __volatile__("mvfc %0,psw" : "=r"(x) : /* no input */)
@@ -105,13 +105,13 @@ static inline void local_irq_disable(void)
 	__asm__ __volatile__("mvtc %0,psw" : /* no outputs */ \
 		: "r" (x) : "cbit", "memory")
 
-#if !defined(CONFIG_CHIP_M32102)
+#if !(defined(CONFIG_CHIP_M32102) || defined(CONFIG_CHIP_M32104))
 #define local_irq_save(x)				\
 	__asm__ __volatile__(				\
   		"mvfc	%0, psw;		\n\t"	\
 	  	"clrpsw	#0x40 -> nop;		\n\t"	\
   		: "=r" (x) : /* no input */ : "memory")
-#else	/* CONFIG_CHIP_M32102 */
+#else	/* CONFIG_CHIP_M32102 || CONFIG_CHIP_M32104 */
 #define local_irq_save(x) 				\
 	({						\
 		unsigned long tmpreg;			\
@@ -124,7 +124,7 @@ static inline void local_irq_disable(void)
 			: "=r" (x), "=&r" (tmpreg)	\
 			: : "cbit", "memory");		\
 	})
-#endif	/* CONFIG_CHIP_M32102 */
+#endif	/* CONFIG_CHIP_M32102 || CONFIG_CHIP_M32104 */
 
 #define irqs_disabled()					\
 	({						\
-- 
cgit v1.1


From 1b5b776aa5730cbda9cba84ba0f8ccd53a775797 Mon Sep 17 00:00:00 2001
From: Hirokazu Takata <takata@linux-m32r.org>
Date: Fri, 6 Jan 2006 00:18:42 -0800
Subject: [PATCH] m32r: Update syscall macros for MMU-less targets

This patch is for updating m32r's MMU-less support.

Some legacy MMU-less m32r chips cannot return from a trap handler to the
right-hand side 16-bit halfword code of a 32-bit instruction code pair, because
the "trap" instruction specification was expanded in the M32R-II ISA.

This modification forces "trap" instructions to be placed at a word-aligned
location, paired with a parallel "nop" instruction.

Signed-off-by: Kazuhiro Inaoka <inaoka@linux-m32r.org>
Signed-off-by: Hirokazu Takata <takata@linux-m32r.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-m32r/unistd.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/asm-m32r/unistd.h b/include/asm-m32r/unistd.h
index ac399e1..39be87c 100644
--- a/include/asm-m32r/unistd.h
+++ b/include/asm-m32r/unistd.h
@@ -319,7 +319,7 @@ type name(void) \
 register long __scno __asm__ ("r7") = __NR_##name; \
 register long __res __asm__("r0"); \
 __asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR \
+	"trap #" SYSCALL_VECTOR "|| nop"\
 	: "=r" (__res) \
 	: "r" (__scno) \
 	: "memory"); \
@@ -332,7 +332,7 @@ type name(type1 arg1) \
 register long __scno __asm__ ("r7") = __NR_##name; \
 register long __res __asm__ ("r0") = (long)(arg1); \
 __asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR \
+	"trap #" SYSCALL_VECTOR "|| nop"\
 	: "=r" (__res) \
 	: "r" (__scno), "0" (__res) \
 	: "memory"); \
@@ -346,7 +346,7 @@ register long __scno __asm__ ("r7") = __NR_##name; \
 register long __arg2 __asm__ ("r1") = (long)(arg2); \
 register long __res __asm__ ("r0") = (long)(arg1); \
 __asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR \
+	"trap #" SYSCALL_VECTOR "|| nop"\
 	: "=r" (__res) \
 	: "r" (__scno), "0" (__res), "r" (__arg2) \
 	: "memory"); \
@@ -361,7 +361,7 @@ register long __arg3 __asm__ ("r2") = (long)(arg3); \
 register long __arg2 __asm__ ("r1") = (long)(arg2); \
 register long __res __asm__ ("r0") = (long)(arg1); \
 __asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR \
+	"trap #" SYSCALL_VECTOR "|| nop"\
 	: "=r" (__res) \
 	: "r" (__scno), "0" (__res), "r" (__arg2), \
 		"r" (__arg3) \
@@ -378,7 +378,7 @@ register long __arg3 __asm__ ("r2") = (long)(arg3); \
 register long __arg2 __asm__ ("r1") = (long)(arg2); \
 register long __res __asm__ ("r0") = (long)(arg1); \
 __asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR \
+	"trap #" SYSCALL_VECTOR "|| nop"\
 	: "=r" (__res) \
 	: "r" (__scno), "0" (__res), "r" (__arg2), \
 		"r" (__arg3), "r" (__arg4) \
@@ -397,7 +397,7 @@ register long __arg3 __asm__ ("r2") = (long)(arg3); \
 register long __arg2 __asm__ ("r1") = (long)(arg2); \
 register long __res __asm__ ("r0") = (long)(arg1); \
 __asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR \
+	"trap #" SYSCALL_VECTOR "|| nop"\
 	: "=r" (__res) \
 	: "r" (__scno), "0" (__res), "r" (__arg2), \
 		"r" (__arg3), "r" (__arg4), "r" (__arg5) \
-- 
cgit v1.1


From 46ea178b7a5162405bf70954d769165cf2161309 Mon Sep 17 00:00:00 2001
From: Hirokazu Takata <takata@linux-m32r.org>
Date: Fri, 6 Jan 2006 00:18:43 -0800
Subject: [PATCH] m32r: Update _port2addr to use NONCACHE_OFFSET

Modify _port2addr*() routines in arch/m32r/kernel/io_*.c to use
NONCACHE_OFFSET instead of hard-coding of a constant address.

This modification is also required to support an M3A-ZA36 FPGA evaluation
board in case an MMU-less synthesizable m32r core is used.

Signed-off-by: Hirokazu Takata <takata@linux-m32r.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/m32r/kernel/io_m32104ut.c | 24 ++++++++++----------
 arch/m32r/kernel/io_m32700ut.c | 24 ++++++++++----------
 arch/m32r/kernel/io_mappi.c    |  2 +-
 arch/m32r/kernel/io_mappi2.c   | 24 ++++++++++----------
 arch/m32r/kernel/io_mappi3.c   | 51 +++++++++++++++++++++++++-----------------
 arch/m32r/kernel/io_oaks32r.c  |  2 +-
 arch/m32r/kernel/io_opsput.c   |  6 ++---
 include/asm-m32r/m32r.h        |  2 +-
 8 files changed, 72 insertions(+), 63 deletions(-)

diff --git a/arch/m32r/kernel/io_m32104ut.c b/arch/m32r/kernel/io_m32104ut.c
index 3df4215..d26adab 100644
--- a/arch/m32r/kernel/io_m32104ut.c
+++ b/arch/m32r/kernel/io_m32104ut.c
@@ -32,7 +32,7 @@ extern void pcc_iowrite_word(int, unsigned long, void *, size_t, size_t, int);
 
 static inline void *_port2addr(unsigned long port)
 {
-	return (void *)(port + NONCACHE_OFFSET);
+	return (void *)(port | NONCACHE_OFFSET);
 }
 
 #if defined(CONFIG_IDE) && !defined(CONFIG_M32R_CFC)
@@ -41,15 +41,15 @@ static inline void *__port2addr_ata(unsigned long port)
 	static int	dummy_reg;
 
 	switch (port) {
-	case 0x1f0:	return (void *)0xac002000;
-	case 0x1f1:	return (void *)0xac012800;
-	case 0x1f2:	return (void *)0xac012002;
-	case 0x1f3:	return (void *)0xac012802;
-	case 0x1f4:	return (void *)0xac012004;
-	case 0x1f5:	return (void *)0xac012804;
-	case 0x1f6:	return (void *)0xac012006;
-	case 0x1f7:	return (void *)0xac012806;
-	case 0x3f6:	return (void *)0xac01200e;
+	case 0x1f0:	return (void *)(0x0c002000 | NONCACHE_OFFSET);
+	case 0x1f1:	return (void *)(0x0c012800 | NONCACHE_OFFSET);
+	case 0x1f2:	return (void *)(0x0c012002 | NONCACHE_OFFSET);
+	case 0x1f3:	return (void *)(0x0c012802 | NONCACHE_OFFSET);
+	case 0x1f4:	return (void *)(0x0c012004 | NONCACHE_OFFSET);
+	case 0x1f5:	return (void *)(0x0c012804 | NONCACHE_OFFSET);
+	case 0x1f6:	return (void *)(0x0c012006 | NONCACHE_OFFSET);
+	case 0x1f7:	return (void *)(0x0c012806 | NONCACHE_OFFSET);
+	case 0x3f6:	return (void *)(0x0c01200e | NONCACHE_OFFSET);
 	default: 	return (void *)&dummy_reg;
 	}
 }
@@ -60,8 +60,8 @@ static inline void *__port2addr_ata(unsigned long port)
  * from 0x01000000 to 0x01ffffff on physical address.
  * The base address of LAN controller(LAN91C111) is 0x300.
  */
-#define LAN_IOSTART	0x300
-#define LAN_IOEND	0x320
+#define LAN_IOSTART	(0x300 | NONCACHE_OFFSET)
+#define LAN_IOEND	(0x320 | NONCACHE_OFFSET)
 static inline void *_port2addr_ne(unsigned long port)
 {
 	return (void *)(port + NONCACHE_OFFSET + 0x01000000);
diff --git a/arch/m32r/kernel/io_m32700ut.c b/arch/m32r/kernel/io_m32700ut.c
index eda9f96..939932d 100644
--- a/arch/m32r/kernel/io_m32700ut.c
+++ b/arch/m32r/kernel/io_m32700ut.c
@@ -36,7 +36,7 @@ extern void pcc_iowrite_word(int, unsigned long, void *, size_t, size_t, int);
 
 static inline void *_port2addr(unsigned long port)
 {
-	return (void *)(port + NONCACHE_OFFSET);
+	return (void *)(port | NONCACHE_OFFSET);
 }
 
 #if defined(CONFIG_IDE) && !defined(CONFIG_M32R_CFC)
@@ -45,15 +45,15 @@ static inline void *__port2addr_ata(unsigned long port)
 	static int	dummy_reg;
 
 	switch (port) {
-	case 0x1f0:	return (void *)0xac002000;
-	case 0x1f1:	return (void *)0xac012800;
-	case 0x1f2:	return (void *)0xac012002;
-	case 0x1f3:	return (void *)0xac012802;
-	case 0x1f4:	return (void *)0xac012004;
-	case 0x1f5:	return (void *)0xac012804;
-	case 0x1f6:	return (void *)0xac012006;
-	case 0x1f7:	return (void *)0xac012806;
-	case 0x3f6:	return (void *)0xac01200e;
+	case 0x1f0:	return (void *)(0x0c002000 | NONCACHE_OFFSET);
+	case 0x1f1:	return (void *)(0x0c012800 | NONCACHE_OFFSET);
+	case 0x1f2:	return (void *)(0x0c012002 | NONCACHE_OFFSET);
+	case 0x1f3:	return (void *)(0x0c012802 | NONCACHE_OFFSET);
+	case 0x1f4:	return (void *)(0x0c012004 | NONCACHE_OFFSET);
+	case 0x1f5:	return (void *)(0x0c012804 | NONCACHE_OFFSET);
+	case 0x1f6:	return (void *)(0x0c012006 | NONCACHE_OFFSET);
+	case 0x1f7:	return (void *)(0x0c012806 | NONCACHE_OFFSET);
+	case 0x3f6:	return (void *)(0x0c01200e | NONCACHE_OFFSET);
 	default: 	return (void *)&dummy_reg;
 	}
 }
@@ -64,8 +64,8 @@ static inline void *__port2addr_ata(unsigned long port)
  * from 0x10000000 to 0x13ffffff on physical address.
  * The base address of LAN controller(LAN91C111) is 0x300.
  */
-#define LAN_IOSTART	0xa0000300
-#define LAN_IOEND	0xa0000320
+#define LAN_IOSTART	(0x300 | NONCACHE_OFFSET)
+#define LAN_IOEND	(0x320 | NONCACHE_OFFSET)
 static inline void *_port2addr_ne(unsigned long port)
 {
 	return (void *)(port + 0x10000000);
diff --git a/arch/m32r/kernel/io_mappi.c b/arch/m32r/kernel/io_mappi.c
index 3c3da04..a662b53 100644
--- a/arch/m32r/kernel/io_mappi.c
+++ b/arch/m32r/kernel/io_mappi.c
@@ -31,7 +31,7 @@ extern void pcc_iowrite(int, unsigned long, void *, size_t, size_t, int);
 
 static inline void *_port2addr(unsigned long port)
 {
-	return (void *)(port | (NONCACHE_OFFSET));
+	return (void *)(port | NONCACHE_OFFSET);
 }
 
 static inline void *_port2addr_ne(unsigned long port)
diff --git a/arch/m32r/kernel/io_mappi2.c b/arch/m32r/kernel/io_mappi2.c
index df3c729..e72d725 100644
--- a/arch/m32r/kernel/io_mappi2.c
+++ b/arch/m32r/kernel/io_mappi2.c
@@ -33,7 +33,7 @@ extern void pcc_iowrite_word(int, unsigned long, void *, size_t, size_t, int);
 
 static inline void *_port2addr(unsigned long port)
 {
-	return (void *)(port | (NONCACHE_OFFSET));
+	return (void *)(port | NONCACHE_OFFSET);
 }
 
 #if defined(CONFIG_IDE) && !defined(CONFIG_M32R_CFC)
@@ -42,22 +42,22 @@ static inline void *__port2addr_ata(unsigned long port)
 	static int	dummy_reg;
 
 	switch (port) {
-	case 0x1f0:	return (void *)0xac002000;
-	case 0x1f1:	return (void *)0xac012800;
-	case 0x1f2:	return (void *)0xac012002;
-	case 0x1f3:	return (void *)0xac012802;
-	case 0x1f4:	return (void *)0xac012004;
-	case 0x1f5:	return (void *)0xac012804;
-	case 0x1f6:	return (void *)0xac012006;
-	case 0x1f7:	return (void *)0xac012806;
-	case 0x3f6:	return (void *)0xac01200e;
+	case 0x1f0:	return (void *)(0x0c002000 | NONCACHE_OFFSET);
+	case 0x1f1:	return (void *)(0x0c012800 | NONCACHE_OFFSET);
+	case 0x1f2:	return (void *)(0x0c012002 | NONCACHE_OFFSET);
+	case 0x1f3:	return (void *)(0x0c012802 | NONCACHE_OFFSET);
+	case 0x1f4:	return (void *)(0x0c012004 | NONCACHE_OFFSET);
+	case 0x1f5:	return (void *)(0x0c012804 | NONCACHE_OFFSET);
+	case 0x1f6:	return (void *)(0x0c012006 | NONCACHE_OFFSET);
+	case 0x1f7:	return (void *)(0x0c012806 | NONCACHE_OFFSET);
+	case 0x3f6:	return (void *)(0x0c01200e | NONCACHE_OFFSET);
 	default: 	return (void *)&dummy_reg;
 	}
 }
 #endif
 
-#define LAN_IOSTART	0xa0000300
-#define LAN_IOEND	0xa0000320
+#define LAN_IOSTART	(0x300 | NONCACHE_OFFSET)
+#define LAN_IOEND	(0x320 | NONCACHE_OFFSET)
 #ifdef CONFIG_CHIP_OPSP
 static inline void *_port2addr_ne(unsigned long port)
 {
diff --git a/arch/m32r/kernel/io_mappi3.c b/arch/m32r/kernel/io_mappi3.c
index f80321a..ed6da93 100644
--- a/arch/m32r/kernel/io_mappi3.c
+++ b/arch/m32r/kernel/io_mappi3.c
@@ -33,7 +33,7 @@ extern void pcc_iowrite_word(int, unsigned long, void *, size_t, size_t, int);
 
 static inline void *_port2addr(unsigned long port)
 {
-	return (void *)(port + NONCACHE_OFFSET);
+	return (void *)(port | NONCACHE_OFFSET);
 }
 
 #if defined(CONFIG_IDE)
@@ -43,33 +43,42 @@ static inline void *__port2addr_ata(unsigned long port)
 
 	switch (port) {
 	  /* IDE0 CF */
-	case 0x1f0:	return (void *)0xb4002000;
-	case 0x1f1:	return (void *)0xb4012800;
-	case 0x1f2:	return (void *)0xb4012002;
-	case 0x1f3:	return (void *)0xb4012802;
-	case 0x1f4:	return (void *)0xb4012004;
-	case 0x1f5:	return (void *)0xb4012804;
-	case 0x1f6:	return (void *)0xb4012006;
-	case 0x1f7:	return (void *)0xb4012806;
-	case 0x3f6:	return (void *)0xb401200e;
+	case 0x1f0:	return (void *)(0x14002000 | NONCACHE_OFFSET);
+	case 0x1f1:	return (void *)(0x14012800 | NONCACHE_OFFSET);
+	case 0x1f2:	return (void *)(0x14012002 | NONCACHE_OFFSET);
+	case 0x1f3:	return (void *)(0x14012802 | NONCACHE_OFFSET);
+	case 0x1f4:	return (void *)(0x14012004 | NONCACHE_OFFSET);
+	case 0x1f5:	return (void *)(0x14012804 | NONCACHE_OFFSET);
+	case 0x1f6:	return (void *)(0x14012006 | NONCACHE_OFFSET);
+	case 0x1f7:	return (void *)(0x14012806 | NONCACHE_OFFSET);
+	case 0x3f6:	return (void *)(0x1401200e | NONCACHE_OFFSET);
 	  /* IDE1 IDE */
-	case 0x170:	return (void *)0xb4810000;  /* Data 16bit */
-	case 0x171:	return (void *)0xb4810002;  /* Features / Error */
-	case 0x172:	return (void *)0xb4810004;  /* Sector count */
-	case 0x173:	return (void *)0xb4810006;  /* Sector number */
-	case 0x174:	return (void *)0xb4810008;  /* Cylinder low */
-	case 0x175:	return (void *)0xb481000a;  /* Cylinder high */
-	case 0x176:	return (void *)0xb481000c;  /* Device head */
-	case 0x177:	return (void *)0xb481000e;  /* Command     */
-	case 0x376:	return (void *)0xb480800c;  /* Device control / Alt status */
+	case 0x170:	/* Data 16bit */
+			return (void *)(0x14810000 | NONCACHE_OFFSET);
+	case 0x171:	/* Features / Error */
+			return (void *)(0x14810002 | NONCACHE_OFFSET);
+	case 0x172:	/* Sector count */
+			return (void *)(0x14810004 | NONCACHE_OFFSET);
+	case 0x173:	/* Sector number */
+			return (void *)(0x14810006 | NONCACHE_OFFSET);
+	case 0x174:	/* Cylinder low */
+			return (void *)(0x14810008 | NONCACHE_OFFSET);
+	case 0x175:	/* Cylinder high */
+			return (void *)(0x1481000a | NONCACHE_OFFSET);
+	case 0x176:	/* Device head */
+			return (void *)(0x1481000c | NONCACHE_OFFSET);
+	case 0x177:	/* Command     */
+			return (void *)(0x1481000e | NONCACHE_OFFSET);
+	case 0x376:	/* Device control / Alt status */
+			return (void *)(0x1480800c | NONCACHE_OFFSET);
 
 	default: 	return (void *)&dummy_reg;
 	}
 }
 #endif
 
-#define LAN_IOSTART	0xa0000300
-#define LAN_IOEND	0xa0000320
+#define LAN_IOSTART	(0x300 | NONCACHE_OFFSET)
+#define LAN_IOEND	(0x320 | NONCACHE_OFFSET)
 static inline void *_port2addr_ne(unsigned long port)
 {
 	return (void *)(port + 0x10000000);
diff --git a/arch/m32r/kernel/io_oaks32r.c b/arch/m32r/kernel/io_oaks32r.c
index 8be3239..910dd13 100644
--- a/arch/m32r/kernel/io_oaks32r.c
+++ b/arch/m32r/kernel/io_oaks32r.c
@@ -16,7 +16,7 @@
 
 static inline void *_port2addr(unsigned long port)
 {
-	return (void *)(port | (NONCACHE_OFFSET));
+	return (void *)(port | NONCACHE_OFFSET);
 }
 
 static inline  void *_port2addr_ne(unsigned long port)
diff --git a/arch/m32r/kernel/io_opsput.c b/arch/m32r/kernel/io_opsput.c
index 4793bd1..bec6929 100644
--- a/arch/m32r/kernel/io_opsput.c
+++ b/arch/m32r/kernel/io_opsput.c
@@ -36,7 +36,7 @@ extern void pcc_iowrite_word(int, unsigned long, void *, size_t, size_t, int);
 
 static inline void *_port2addr(unsigned long port)
 {
-	return (void *)(port | (NONCACHE_OFFSET));
+	return (void *)(port | NONCACHE_OFFSET);
 }
 
 /*
@@ -44,8 +44,8 @@ static inline void *_port2addr(unsigned long port)
  * from 0x10000000 to 0x13ffffff on physical address.
  * The base address of LAN controller(LAN91C111) is 0x300.
  */
-#define LAN_IOSTART	0xa0000300
-#define LAN_IOEND	0xa0000320
+#define LAN_IOSTART	(0x300 | NONCACHE_OFFSET)
+#define LAN_IOEND	(0x320 | NONCACHE_OFFSET)
 static inline void *_port2addr_ne(unsigned long port)
 {
 	return (void *)(port + 0x10000000);
diff --git a/include/asm-m32r/m32r.h b/include/asm-m32r/m32r.h
index f9bb48a..b133ca6 100644
--- a/include/asm-m32r/m32r.h
+++ b/include/asm-m32r/m32r.h
@@ -126,7 +126,7 @@
 
 #include <asm/page.h>
 #ifdef CONFIG_MMU
-#define NONCACHE_OFFSET  __PAGE_OFFSET+0x20000000
+#define NONCACHE_OFFSET  (__PAGE_OFFSET + 0x20000000)
 #else
 #define NONCACHE_OFFSET  __PAGE_OFFSET
 #endif /* CONFIG_MMU */
-- 
cgit v1.1


From 9b791d4766c19ac014a7b81a551efe4a7511e12a Mon Sep 17 00:00:00 2001
From: Hirokazu Takata <takata@linux-m32r.org>
Date: Fri, 6 Jan 2006 00:18:44 -0800
Subject: [PATCH] m32r: Fix M32104 cache flushing routines

This patch fixes cache memory parameter setting for the M32104 target.  So
far, its performance seemed to have been degraded due to incorrect cache
parameter setting.

  * arch/m32r/boot/setup.S: Set SFR(Special Function Registers) region
    to be non-cacheable explicitly.
  * arch/m32r/mm/cache.c: Fix cache flushing routines not to switch off
    the M32104 cache.

Signed-off-by: Hayato Fujiwara <fujiwara@linux-m32r.org>
Signed-off-by: Hirokazu Takata <takata@linux-m32r.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/m32r/boot/setup.S | 15 ++++++++++++---
 arch/m32r/mm/cache.c   | 28 +++++++++++++++++++++-------
 2 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/arch/m32r/boot/setup.S b/arch/m32r/boot/setup.S
index 742669f..3985425 100644
--- a/arch/m32r/boot/setup.S
+++ b/arch/m32r/boot/setup.S
@@ -1,11 +1,10 @@
 /*
  *  linux/arch/m32r/boot/setup.S -- A setup code.
  *
- *  Copyright (C) 2001, 2002  Hiroyuki Kondo, Hirokazu Takata,
- *  and Hitoshi Yamamoto
+ *  Copyright (C) 2001-2005   Hiroyuki Kondo, Hirokazu Takata,
+ *                            Hitoshi Yamamoto, Hayato Fujiwara
  *
  */
-/* $Id$ */
 
 #include <linux/linkage.h>
 #include <asm/segment.h>
@@ -81,6 +80,16 @@ ENTRY(boot)
 ;	ldi	r1, #0x00		; cache off
 	st	r1, @r0
 #elif defined(CONFIG_CHIP_M32104)
+	ldi	r0, #-96		; DNCR0
+	seth	r1, #0x0060		;  from 0x00600000
+	or3	r1, r1, #0x0005		;  size 2MB
+	st	r1, @r0
+	seth	r1, #0x0100		;  from 0x01000000
+	or3	r1, r1, #0x0003		;  size 16MB
+	st	r1, @+r0
+	seth	r1, #0x0200		;  from 0x02000000
+	or3	r1, r1, #0x0002		;  size 32MB
+	st	r1, @+r0
 	ldi	r0, #-4              ;LDIMM	(r0, M32R_MCCR)
 	ldi	r1, #0x703		; cache on (with invalidation)
 	st	r1, @r0
diff --git a/arch/m32r/mm/cache.c b/arch/m32r/mm/cache.c
index c6f72a6..9f54dd9 100644
--- a/arch/m32r/mm/cache.c
+++ b/arch/m32r/mm/cache.c
@@ -1,7 +1,7 @@
 /*
  *  linux/arch/m32r/mm/cache.c
  *
- *  Copyright (C) 2002  Hirokazu Takata
+ *  Copyright (C) 2002-2005  Hirokazu Takata, Hayato Fujiwara
  */
 
 #include <linux/config.h>
@@ -9,7 +9,8 @@
 
 #undef MCCR
 
-#if defined(CONFIG_CHIP_XNUX2) || defined(CONFIG_CHIP_M32700) || defined(CONFIG_CHIP_VDEC2) || defined(CONFIG_CHIP_OPSP)
+#if defined(CONFIG_CHIP_XNUX2) || defined(CONFIG_CHIP_M32700) \
+	|| defined(CONFIG_CHIP_VDEC2) || defined(CONFIG_CHIP_OPSP)
 /* Cache Control Register */
 #define MCCR		((volatile unsigned long*)0xfffffffc)
 #define MCCR_CC		(1UL << 7)	/* Cache mode modify bit */
@@ -27,7 +28,7 @@
 #define MCCR_IIV	(1UL << 0)	/* I-cache invalidate */
 #define MCCR_ICACHE_INV		MCCR_IIV
 #elif defined(CONFIG_CHIP_M32104)
-#define MCCR		((volatile unsigned long*)0xfffffffc)
+#define MCCR		((volatile unsigned short*)0xfffffffe)
 #define MCCR_IIV	(1UL << 8)	/* I-cache invalidate */
 #define MCCR_DIV	(1UL << 9)	/* D-cache invalidate */
 #define MCCR_DCB	(1UL << 10)	/* D-cache copy back */
@@ -36,7 +37,7 @@
 #define MCCR_ICACHE_INV		MCCR_IIV
 #define MCCR_DCACHE_CB		MCCR_DCB
 #define MCCR_DCACHE_CBINV	(MCCR_DIV|MCCR_DCB)
-#endif /* CONFIG_CHIP_XNUX2 || CONFIG_CHIP_M32700 */
+#endif
 
 #ifndef MCCR
 #error Unknown cache type.
@@ -47,29 +48,42 @@
 void _flush_cache_all(void)
 {
 #if defined(CONFIG_CHIP_M32102)
+	unsigned char mccr;
 	*MCCR = MCCR_ICACHE_INV;
+#elif defined(CONFIG_CHIP_M32104)
+	unsigned short mccr;
+
+	/* Copyback and invalidate D-cache */
+	/* Invalidate I-cache */
+	*MCCR |= (MCCR_ICACHE_INV | MCCR_DCACHE_CBINV);
 #else
 	unsigned long mccr;
 
 	/* Copyback and invalidate D-cache */
 	/* Invalidate I-cache */
 	*MCCR = MCCR_ICACHE_INV | MCCR_DCACHE_CBINV;
-	while ((mccr = *MCCR) & MCCR_IIV); /* loop while invalidating... */
 #endif
+	while ((mccr = *MCCR) & MCCR_IIV); /* loop while invalidating... */
 }
 
 /* Copy back D-cache and invalidate I-cache all */
 void _flush_cache_copyback_all(void)
 {
 #if defined(CONFIG_CHIP_M32102)
+	unsigned char mccr;
 	*MCCR = MCCR_ICACHE_INV;
+#elif defined(CONFIG_CHIP_M32104)
+	unsigned short mccr;
+
+	/* Copyback D-cache */
+	/* Invalidate I-cache */
+	*MCCR |= (MCCR_ICACHE_INV | MCCR_DCACHE_CB);
 #else
 	unsigned long mccr;
 
 	/* Copyback D-cache */
 	/* Invalidate I-cache */
 	*MCCR = MCCR_ICACHE_INV | MCCR_DCACHE_CB;
-	while ((mccr = *MCCR) & MCCR_IIV); /* loop while invalidating... */
-
 #endif
+	while ((mccr = *MCCR) & MCCR_IIV); /* loop while invalidating... */
 }
-- 
cgit v1.1


From adfc31c67f4515ed4bad1ef9555cbacdfc24e8d3 Mon Sep 17 00:00:00 2001
From: Hirokazu Takata <takata@linux-m32r.org>
Date: Fri, 6 Jan 2006 00:18:45 -0800
Subject: [PATCH] m32r: Remove unnecessary icu_data_t definitions

This patch removes the unnecessary per-board struct icu_data_t definitions
from arch/m32r/kernel/setup_*.c; the type is now defined unconditionally in
include/asm-m32r/m32102.h.

Signed-off-by: Hayato Fujiwara <fujiwara@linux-m32r.org>
Signed-off-by: Hirokazu Takata <takata@linux-m32r.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/m32r/kernel/setup_m32104ut.c | 6 ------
 arch/m32r/kernel/setup_m32700ut.c | 8 --------
 arch/m32r/kernel/setup_mappi.c    | 6 ------
 arch/m32r/kernel/setup_mappi2.c   | 6 ------
 arch/m32r/kernel/setup_mappi3.c   | 6 ------
 arch/m32r/kernel/setup_oaks32r.c  | 6 ------
 arch/m32r/kernel/setup_opsput.c   | 8 --------
 arch/m32r/kernel/setup_usrv.c     | 6 ------
 include/asm-m32r/m32102.h         | 7 ++-----
 9 files changed, 2 insertions(+), 57 deletions(-)

diff --git a/arch/m32r/kernel/setup_m32104ut.c b/arch/m32r/kernel/setup_m32104ut.c
index ab16c66..6328e13 100644
--- a/arch/m32r/kernel/setup_m32104ut.c
+++ b/arch/m32r/kernel/setup_m32104ut.c
@@ -20,12 +20,6 @@
 
 #define irq2port(x) (M32R_ICU_CR1_PORTL + ((x - 1) * sizeof(unsigned long)))
 
-#ifndef CONFIG_SMP
-typedef struct {
-	unsigned long icucr;  /* ICU Control Register */
-} icu_data_t;
-#endif /* CONFIG_SMP */
-
 icu_data_t icu_data[NR_IRQS];
 
 static void disable_m32104ut_irq(unsigned int irq)
diff --git a/arch/m32r/kernel/setup_m32700ut.c b/arch/m32r/kernel/setup_m32700ut.c
index cb76916..fad1fc9 100644
--- a/arch/m32r/kernel/setup_m32700ut.c
+++ b/arch/m32r/kernel/setup_m32700ut.c
@@ -26,15 +26,7 @@
  */
 #define irq2port(x) (M32R_ICU_CR1_PORTL + ((x - 1) * sizeof(unsigned long)))
 
-#ifndef CONFIG_SMP
-typedef struct {
-	unsigned long icucr;  /* ICU Control Register */
-} icu_data_t;
-static icu_data_t icu_data[M32700UT_NUM_CPU_IRQ];
-#else
 icu_data_t icu_data[M32700UT_NUM_CPU_IRQ];
-#endif /* CONFIG_SMP */
-
 
 static void disable_m32700ut_irq(unsigned int irq)
 {
diff --git a/arch/m32r/kernel/setup_mappi.c b/arch/m32r/kernel/setup_mappi.c
index 501d798..00f2532 100644
--- a/arch/m32r/kernel/setup_mappi.c
+++ b/arch/m32r/kernel/setup_mappi.c
@@ -19,12 +19,6 @@
 
 #define irq2port(x) (M32R_ICU_CR1_PORTL + ((x - 1) * sizeof(unsigned long)))
 
-#ifndef CONFIG_SMP
-typedef struct {
-	unsigned long icucr;  /* ICU Control Register */
-} icu_data_t;
-#endif /* CONFIG_SMP */
-
 icu_data_t icu_data[NR_IRQS];
 
 static void disable_mappi_irq(unsigned int irq)
diff --git a/arch/m32r/kernel/setup_mappi2.c b/arch/m32r/kernel/setup_mappi2.c
index 7f2db5b..eebc9d8 100644
--- a/arch/m32r/kernel/setup_mappi2.c
+++ b/arch/m32r/kernel/setup_mappi2.c
@@ -19,12 +19,6 @@
 
 #define irq2port(x) (M32R_ICU_CR1_PORTL + ((x - 1) * sizeof(unsigned long)))
 
-#ifndef CONFIG_SMP
-typedef struct {
-	unsigned long icucr;  /* ICU Control Register */
-} icu_data_t;
-#endif /* CONFIG_SMP */
-
 icu_data_t icu_data[NR_IRQS];
 
 static void disable_mappi2_irq(unsigned int irq)
diff --git a/arch/m32r/kernel/setup_mappi3.c b/arch/m32r/kernel/setup_mappi3.c
index f6ecdf7..d2ff021 100644
--- a/arch/m32r/kernel/setup_mappi3.c
+++ b/arch/m32r/kernel/setup_mappi3.c
@@ -19,12 +19,6 @@
 
 #define irq2port(x) (M32R_ICU_CR1_PORTL + ((x - 1) * sizeof(unsigned long)))
 
-#ifndef CONFIG_SMP
-typedef struct {
-	unsigned long icucr;  /* ICU Control Register */
-} icu_data_t;
-#endif /* CONFIG_SMP */
-
 icu_data_t icu_data[NR_IRQS];
 
 static void disable_mappi3_irq(unsigned int irq)
diff --git a/arch/m32r/kernel/setup_oaks32r.c b/arch/m32r/kernel/setup_oaks32r.c
index 45add5b..0e9e635 100644
--- a/arch/m32r/kernel/setup_oaks32r.c
+++ b/arch/m32r/kernel/setup_oaks32r.c
@@ -18,12 +18,6 @@
 
 #define irq2port(x) (M32R_ICU_CR1_PORTL + ((x - 1) * sizeof(unsigned long)))
 
-#ifndef CONFIG_SMP
-typedef struct {
-	unsigned long icucr;  /* ICU Control Register */
-} icu_data_t;
-#endif /* CONFIG_SMP */
-
 icu_data_t icu_data[NR_IRQS];
 
 static void disable_oaks32r_irq(unsigned int irq)
diff --git a/arch/m32r/kernel/setup_opsput.c b/arch/m32r/kernel/setup_opsput.c
index 1fbb140..548e8fc 100644
--- a/arch/m32r/kernel/setup_opsput.c
+++ b/arch/m32r/kernel/setup_opsput.c
@@ -27,15 +27,7 @@
  */
 #define irq2port(x) (M32R_ICU_CR1_PORTL + ((x - 1) * sizeof(unsigned long)))
 
-#ifndef CONFIG_SMP
-typedef struct {
-	unsigned long icucr;  /* ICU Control Register */
-} icu_data_t;
-static icu_data_t icu_data[OPSPUT_NUM_CPU_IRQ];
-#else
 icu_data_t icu_data[OPSPUT_NUM_CPU_IRQ];
-#endif /* CONFIG_SMP */
-
 
 static void disable_opsput_irq(unsigned int irq)
 {
diff --git a/arch/m32r/kernel/setup_usrv.c b/arch/m32r/kernel/setup_usrv.c
index 634741b..64be659 100644
--- a/arch/m32r/kernel/setup_usrv.c
+++ b/arch/m32r/kernel/setup_usrv.c
@@ -18,12 +18,6 @@
 
 #define irq2port(x) (M32R_ICU_CR1_PORTL + ((x - 1) * sizeof(unsigned long)))
 
-#if !defined(CONFIG_SMP)
-typedef struct {
-	unsigned long icucr;	/* ICU Control Register */
-} icu_data_t;
-#endif /* CONFIG_SMP */
-
 icu_data_t icu_data[M32700UT_NUM_CPU_IRQ];
 
 static void disable_mappi_irq(unsigned int irq)
diff --git a/include/asm-m32r/m32102.h b/include/asm-m32r/m32102.h
index 0bd0a3f..a1f0d1f 100644
--- a/include/asm-m32r/m32102.h
+++ b/include/asm-m32r/m32102.h
@@ -302,15 +302,12 @@
 #define M32R_FPGA_VERSION0_PORTL    (0x30+M32R_FPGA_TOP)
 #define M32R_FPGA_VERSION1_PORTL    (0x34+M32R_FPGA_TOP)
 
+#endif /* CONFIG_SMP */
+
 #ifndef __ASSEMBLY__
-/* For NETDEV WATCHDOG */
 typedef struct {
 	unsigned long icucr;	/* ICU Control Register */
 } icu_data_t;
-
-extern icu_data_t icu_data[];
 #endif
 
-#endif /* CONFIG_SMP */
-
 #endif /* _M32102_H_ */
-- 
cgit v1.1


From 32588918254cff7c03651dcbd3d8cc2301aba5bd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 6 Jan 2006 00:18:45 -0800
Subject: [PATCH] m68knommu: enable_irq/disable_irq

mach_enable_irq/mach_disable_irq are never actually set, so let's remove
them.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Greg Ungerer <gerg@uclinux.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/m68knommu/kernel/m68k_ksyms.c | 2 --
 arch/m68knommu/kernel/setup.c      | 2 --
 include/asm-m68knommu/irq.h        | 4 ++--
 3 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/arch/m68knommu/kernel/m68k_ksyms.c b/arch/m68knommu/kernel/m68k_ksyms.c
index e93a5ad..b2c62ee 100644
--- a/arch/m68knommu/kernel/m68k_ksyms.c
+++ b/arch/m68knommu/kernel/m68k_ksyms.c
@@ -38,8 +38,6 @@ EXPORT_SYMBOL(strncmp);
 
 EXPORT_SYMBOL(ip_fast_csum);
 
-EXPORT_SYMBOL(mach_enable_irq);
-EXPORT_SYMBOL(mach_disable_irq);
 EXPORT_SYMBOL(kernel_thread);
 
 /* Networking helper routines. */
diff --git a/arch/m68knommu/kernel/setup.c b/arch/m68knommu/kernel/setup.c
index abb80fa..93120b9 100644
--- a/arch/m68knommu/kernel/setup.c
+++ b/arch/m68knommu/kernel/setup.c
@@ -65,8 +65,6 @@ void (*mach_kbd_leds) (unsigned int) = NULL;
 /* machine dependent irq functions */
 void (*mach_init_IRQ) (void) = NULL;
 irqreturn_t (*(*mach_default_handler)[]) (int, void *, struct pt_regs *) = NULL;
-void (*mach_enable_irq) (unsigned int) = NULL;
-void (*mach_disable_irq) (unsigned int) = NULL;
 int (*mach_get_irq_list) (struct seq_file *, void *) = NULL;
 void (*mach_process_int) (int irq, struct pt_regs *fp) = NULL;
 void (*mach_trap_init) (void);
diff --git a/include/asm-m68knommu/irq.h b/include/asm-m68knommu/irq.h
index a08fa9b..993046b 100644
--- a/include/asm-m68knommu/irq.h
+++ b/include/asm-m68knommu/irq.h
@@ -84,8 +84,8 @@ extern void (*mach_disable_irq)(unsigned int);
 /*
  * Some drivers want these entry points
  */
-#define enable_irq(x)	(mach_enable_irq  ? (*mach_enable_irq)(x)  : 0)
-#define disable_irq(x)	(mach_disable_irq ? (*mach_disable_irq)(x) : 0)
+#define enable_irq(x)	0
+#define disable_irq(x)	do { } while (0)
 
 #define enable_irq_nosync(x)	enable_irq(x)
 #define disable_irq_nosync(x)	disable_irq(x)
-- 
cgit v1.1


From b14a72d6cbf73443b975ffb04871e0ffb240df58 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 6 Jan 2006 00:18:46 -0800
Subject: [PATCH] m68knommu: remove enable_irq_nosync()

m68k, m68knommu and h8300 define this, but it's not actually used
anywhere.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Greg Ungerer <gerg@uclinux.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-h8300/irq.h     | 5 -----
 include/asm-m68k/irq.h      | 2 --
 include/asm-m68knommu/irq.h | 2 --
 3 files changed, 9 deletions(-)

diff --git a/include/asm-h8300/irq.h b/include/asm-h8300/irq.h
index 5027181..73065f5 100644
--- a/include/asm-h8300/irq.h
+++ b/include/asm-h8300/irq.h
@@ -61,11 +61,6 @@ static __inline__ int irq_canonicalize(int irq)
 
 extern void enable_irq(unsigned int);
 extern void disable_irq(unsigned int);
-
-/*
- * Some drivers want these entry points
- */
-#define enable_irq_nosync(x)	enable_irq(x)
 #define disable_irq_nosync(x)	disable_irq(x)
 
 struct irqaction;
diff --git a/include/asm-m68k/irq.h b/include/asm-m68k/irq.h
index 1f56990..127ad19 100644
--- a/include/asm-m68k/irq.h
+++ b/include/asm-m68k/irq.h
@@ -70,8 +70,6 @@ static __inline__ int irq_canonicalize(int irq)
 
 extern void (*enable_irq)(unsigned int);
 extern void (*disable_irq)(unsigned int);
-
-#define disable_irq_nosync	disable_irq
 #define enable_irq_nosync	enable_irq
 
 struct pt_regs;
diff --git a/include/asm-m68knommu/irq.h b/include/asm-m68knommu/irq.h
index 993046b..20c48ec 100644
--- a/include/asm-m68knommu/irq.h
+++ b/include/asm-m68knommu/irq.h
@@ -86,8 +86,6 @@ extern void (*mach_disable_irq)(unsigned int);
  */
 #define enable_irq(x)	0
 #define disable_irq(x)	do { } while (0)
-
-#define enable_irq_nosync(x)	enable_irq(x)
 #define disable_irq_nosync(x)	disable_irq(x)
 
 struct irqaction;
-- 
cgit v1.1


From 118c1f27b838c5d1cf5338dc5abff52ceb364826 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 6 Jan 2006 00:18:47 -0800
Subject: [PATCH] cris: kgdb: remove double_this()

double_this() doesn't make much sense and is unused.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: Mikael Starvik <starvik@axis.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/cris/arch-v10/kernel/kgdb.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/arch/cris/arch-v10/kernel/kgdb.c b/arch/cris/arch-v10/kernel/kgdb.c
index b72e6a9..34528da 100644
--- a/arch/cris/arch-v10/kernel/kgdb.c
+++ b/arch/cris/arch-v10/kernel/kgdb.c
@@ -569,12 +569,6 @@ gdb_cris_strtol (const char *s, char **endptr, int base)
 	return x;
 }
 
-int
-double_this(int x)
-{
-        return 2 * x;
-}
-
 /********************************* Register image ****************************/
 /* Copy the content of a register image into another. The size n is
    the size of the register image. Due to struct assignment generation of
-- 
cgit v1.1


From 970d6e3a3461ebc62bc3fc6d4962c936cb2ed97c Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:18:48 -0800
Subject: [PATCH] uml: use kstrdup

There were a bunch of calls to uml_strdup dating from before kstrdup was
introduced.  This changes those calls.  It doesn't eliminate the definition
of uml_strdup since there are still a couple of calls in userspace code
(which should probably call the libc strdup).

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/line.c          | 10 +++++-----
 arch/um/drivers/mconsole_kern.c |  2 +-
 arch/um/drivers/net_kern.c      |  2 +-
 arch/um/drivers/ubd_kern.c      |  2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index e0fdffa..c31fc54 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -562,10 +562,11 @@ int line_setup(struct line *lines, unsigned int num, char *init, int all_allowed
 
 int line_config(struct line *lines, unsigned int num, char *str)
 {
-	char *new = uml_strdup(str);
+	char *new;
 
+	new = kstrdup(str, GFP_KERNEL);
 	if(new == NULL){
-		printk("line_config - uml_strdup failed\n");
+		printk("line_config - kstrdup failed\n");
 		return -ENOMEM;
 	}
 	return !line_setup(lines, num, new, 0);
@@ -677,10 +678,9 @@ void lines_init(struct line *lines, int nlines)
 		INIT_LIST_HEAD(&line->chan_list);
 		spin_lock_init(&line->lock);
 		if(line->init_str != NULL){
-			line->init_str = uml_strdup(line->init_str);
+			line->init_str = kstrdup(line->init_str, GFP_KERNEL);
 			if(line->init_str == NULL)
-				printk("lines_init - uml_strdup returned "
-				       "NULL\n");
+				printk("lines_init - kstrdup returned NULL\n");
 		}
 	}
 }
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 12c9536..b367864 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -563,7 +563,7 @@ int mconsole_init(void)
 	}
 
 	if(notify_socket != NULL){
-		notify_socket = uml_strdup(notify_socket);
+		notify_socket = kstrdup(notify_socket, GFP_KERNEL);
 		if(notify_socket != NULL)
 			mconsole_notify(notify_socket, MCONSOLE_SOCKET,
 					mconsole_socket_name, 
diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
index 84c73a3..29785f6 100644
--- a/arch/um/drivers/net_kern.c
+++ b/arch/um/drivers/net_kern.c
@@ -586,7 +586,7 @@ static int net_config(char *str)
 	err = eth_parse(str, &n, &str);
 	if(err) return(err);
 
-	str = uml_strdup(str);
+	str = kstrdup(str, GFP_KERNEL);
 	if(str == NULL){
 		printk(KERN_ERR "net_config failed to strdup string\n");
 		return(-1);
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 9389891..1fe0dcd 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -706,7 +706,7 @@ static int ubd_config(char *str)
 {
 	int n, err;
 
-	str = uml_strdup(str);
+	str = kstrdup(str, GFP_KERNEL);
 	if(str == NULL){
 		printk(KERN_ERR "ubd_config failed to strdup string\n");
 		return(1);
-- 
cgit v1.1


From 1b57e9c27882a908f180d4daf72ee12c6f137178 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:18:49 -0800
Subject: [PATCH] uml: non-void functions should return something

There are a few functions which are declared to return something, but don't.
These are actually infinite loops which are forced to be declared as non-void.
This patch makes them all return 0.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/ubd_kern.c  | 13 ++-----------
 arch/um/kernel/sigio_user.c |  2 ++
 arch/um/os-Linux/aio.c      |  2 ++
 3 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 1fe0dcd..73f9652 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -1387,15 +1387,6 @@ int io_thread(void *arg)
 			printk("io_thread - write failed, fd = %d, err = %d\n",
 			       kernel_fd, -n);
 	}
-}
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
+	return 0;
+}
diff --git a/arch/um/kernel/sigio_user.c b/arch/um/kernel/sigio_user.c
index 48b1f64..62e5cfd 100644
--- a/arch/um/kernel/sigio_user.c
+++ b/arch/um/kernel/sigio_user.c
@@ -216,6 +216,8 @@ static int write_sigio_thread(void *unused)
 				       "err = %d\n", -n);
 		}
 	}
+
+	return 0;
 }
 
 static int need_poll(int n)
diff --git a/arch/um/os-Linux/aio.c b/arch/um/os-Linux/aio.c
index ffa759a..0b78bb7 100644
--- a/arch/um/os-Linux/aio.c
+++ b/arch/um/os-Linux/aio.c
@@ -210,6 +210,8 @@ static int not_aio_thread(void *arg)
                         printk("not_aio_thread - write failed, fd = %d, "
                                "err = %d\n", aio_req_fd_r, -err);
         }
+
+	return 0;
 }
 
 static int aio_pid = -1;
-- 
cgit v1.1


From d50084a2991f3d9490d5c0f3af72e6fe1515a493 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:18:50 -0800
Subject: [PATCH] uml: Formatting changes

This patch makes a bunch of non-functional changes -
    return(foo); becomes return foo;
    some statements are broken across lines for readability
    some trailing whitespace is cleaned up
    open_one_chan took four arguments, three of which could be
       deduced from the first.  Accordingly, they were eliminated.
    some examples of "} else {" had a newline added
    some whitespace cleanup in the indentation
    lines_init got some control flow cleanup
    some long lines were broken
    removed another emacs-specific C formatting comment

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/chan_kern.c     | 124 ++++++-----
 arch/um/drivers/line.c          |  47 ++--
 arch/um/drivers/mconsole_kern.c |  25 ++-
 arch/um/drivers/ssl.c           |  29 +--
 arch/um/drivers/stdio_console.c |  26 +--
 arch/um/include/chan_kern.h     |  11 -
 arch/um/include/line.h          |  22 +-
 arch/um/os-Linux/aio.c          | 465 ++++++++++++++++++++--------------------
 8 files changed, 381 insertions(+), 368 deletions(-)

diff --git a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c
index 5b58fad..8b1262e 100644
--- a/arch/um/drivers/chan_kern.c
+++ b/arch/um/drivers/chan_kern.c
@@ -58,7 +58,7 @@ static void *not_configged_init(char *str, int device, struct chan_opts *opts)
 {
 	my_puts("Using a channel type which is configured out of "
 	       "UML\n");
-	return(NULL);
+	return NULL;
 }
 
 static int not_configged_open(int input, int output, int primary, void *data,
@@ -66,7 +66,7 @@ static int not_configged_open(int input, int output, int primary, void *data,
 {
 	my_puts("Using a channel type which is configured out of "
 	       "UML\n");
-	return(-ENODEV);
+	return -ENODEV;
 }
 
 static void not_configged_close(int fd, void *data)
@@ -79,21 +79,21 @@ static int not_configged_read(int fd, char *c_out, void *data)
 {
 	my_puts("Using a channel type which is configured out of "
 	       "UML\n");
-	return(-EIO);
+	return -EIO;
 }
 
 static int not_configged_write(int fd, const char *buf, int len, void *data)
 {
 	my_puts("Using a channel type which is configured out of "
 	       "UML\n");
-	return(-EIO);
+	return -EIO;
 }
 
 static int not_configged_console_write(int fd, const char *buf, int len)
 {
 	my_puts("Using a channel type which is configured out of "
 	       "UML\n");
-	return(-EIO);
+	return -EIO;
 }
 
 static int not_configged_window_size(int fd, void *data, unsigned short *rows,
@@ -101,7 +101,7 @@ static int not_configged_window_size(int fd, void *data, unsigned short *rows,
 {
 	my_puts("Using a channel type which is configured out of "
 	       "UML\n");
-	return(-ENODEV);
+	return -ENODEV;
 }
 
 static void not_configged_free(void *data)
@@ -135,17 +135,17 @@ int generic_read(int fd, char *c_out, void *unused)
 	n = os_read_file(fd, c_out, sizeof(*c_out));
 
 	if(n == -EAGAIN)
-		return(0);
+		return 0;
 	else if(n == 0)
-		return(-EIO);
-	return(n);
+		return -EIO;
+	return n;
 }
 
 /* XXX Trivial wrapper around os_write_file */
 
 int generic_write(int fd, const char *buf, int n, void *unused)
 {
-	return(os_write_file(fd, buf, n));
+	return os_write_file(fd, buf, n);
 }
 
 int generic_window_size(int fd, void *unused, unsigned short *rows_out,
@@ -156,14 +156,14 @@ int generic_window_size(int fd, void *unused, unsigned short *rows_out,
 
 	ret = os_window_size(fd, &rows, &cols);
 	if(ret < 0)
-		return(ret);
+		return ret;
 
 	ret = ((*rows_out != rows) || (*cols_out != cols));
 
 	*rows_out = rows;
 	*cols_out = cols;
 
-	return(ret);
+	return ret;
 }
 
 void generic_free(void *data)
@@ -186,25 +186,29 @@ static void tty_receive_char(struct tty_struct *tty, char ch)
 		}
 	}
 
-	if((tty->flip.flag_buf_ptr == NULL) || 
+	if((tty->flip.flag_buf_ptr == NULL) ||
 	   (tty->flip.char_buf_ptr == NULL))
 		return;
 	tty_insert_flip_char(tty, ch, TTY_NORMAL);
 }
 
-static int open_one_chan(struct chan *chan, int input, int output, int primary)
+static int open_one_chan(struct chan *chan)
 {
 	int fd;
 
-	if(chan->opened) return(0);
-	if(chan->ops->open == NULL) fd = 0;
-	else fd = (*chan->ops->open)(input, output, primary, chan->data,
-				     &chan->dev);
-	if(fd < 0) return(fd);
+	if(chan->opened)
+		return 0;
+
+	if(chan->ops->open == NULL)
+		fd = 0;
+	else fd = (*chan->ops->open)(chan->input, chan->output, chan->primary,
+				     chan->data, &chan->dev);
+	if(fd < 0)
+		return fd;
 	chan->fd = fd;
 
 	chan->opened = 1;
-	return(0);
+	return 0;
 }
 
 int open_chan(struct list_head *chans)
@@ -215,11 +219,11 @@ int open_chan(struct list_head *chans)
 
 	list_for_each(ele, chans){
 		chan = list_entry(ele, struct chan, list);
-		ret = open_one_chan(chan, chan->input, chan->output,
-				    chan->primary);
-		if(chan->primary) err = ret;
+		ret = open_one_chan(chan);
+		if(chan->primary)
+			err = ret;
 	}
-	return(err);
+	return err;
 }
 
 void chan_enable_winch(struct list_head *chans, struct tty_struct *tty)
@@ -267,7 +271,7 @@ void close_chan(struct list_head *chans)
 	}
 }
 
-int write_chan(struct list_head *chans, const char *buf, int len, 
+int write_chan(struct list_head *chans, const char *buf, int len,
 	       int write_irq)
 {
 	struct list_head *ele;
@@ -285,7 +289,7 @@ int write_chan(struct list_head *chans, const char *buf, int len,
 				reactivate_fd(chan->fd, write_irq);
 		}
 	}
-	return(ret);
+	return ret;
 }
 
 int console_write_chan(struct list_head *chans, const char *buf, int len)
@@ -301,10 +305,11 @@ int console_write_chan(struct list_head *chans, const char *buf, int len)
 		n = chan->ops->console_write(chan->fd, buf, len);
 		if(chan->primary) ret = n;
 	}
-	return(ret);
+	return ret;
 }
 
-int console_open_chan(struct line *line, struct console *co, struct chan_opts *opts)
+int console_open_chan(struct line *line, struct console *co,
+		      struct chan_opts *opts)
 {
 	if (!list_empty(&line->chan_list))
 		return 0;
@@ -327,12 +332,13 @@ int chan_window_size(struct list_head *chans, unsigned short *rows_out,
 	list_for_each(ele, chans){
 		chan = list_entry(ele, struct chan, list);
 		if(chan->primary){
-			if(chan->ops->window_size == NULL) return(0);
-			return(chan->ops->window_size(chan->fd, chan->data,
-						      rows_out, cols_out));
+			if(chan->ops->window_size == NULL)
+				return 0;
+			return chan->ops->window_size(chan->fd, chan->data,
+						      rows_out, cols_out);
 		}
 	}
-	return(0);
+	return 0;
 }
 
 void free_one_chan(struct chan *chan)
@@ -363,23 +369,23 @@ static int one_chan_config_string(struct chan *chan, char *str, int size,
 
 	if(chan == NULL){
 		CONFIG_CHUNK(str, size, n, "none", 1);
-		return(n);
+		return n;
 	}
 
 	CONFIG_CHUNK(str, size, n, chan->ops->type, 0);
 
 	if(chan->dev == NULL){
 		CONFIG_CHUNK(str, size, n, "", 1);
-		return(n);
+		return n;
 	}
 
 	CONFIG_CHUNK(str, size, n, ":", 0);
 	CONFIG_CHUNK(str, size, n, chan->dev, 0);
 
-	return(n);
+	return n;
 }
 
-static int chan_pair_config_string(struct chan *in, struct chan *out, 
+static int chan_pair_config_string(struct chan *in, struct chan *out,
 				   char *str, int size, char **error_out)
 {
 	int n;
@@ -390,7 +396,7 @@ static int chan_pair_config_string(struct chan *in, struct chan *out,
 
 	if(in == out){
 		CONFIG_CHUNK(str, size, n, "", 1);
-		return(n);
+		return n;
 	}
 
 	CONFIG_CHUNK(str, size, n, ",", 1);
@@ -399,10 +405,10 @@ static int chan_pair_config_string(struct chan *in, struct chan *out,
 	size -= n;
 	CONFIG_CHUNK(str, size, n, "", 1);
 
-	return(n);
+	return n;
 }
 
-int chan_config_string(struct list_head *chans, char *str, int size, 
+int chan_config_string(struct list_head *chans, char *str, int size,
 		       char **error_out)
 {
 	struct list_head *ele;
@@ -418,7 +424,7 @@ int chan_config_string(struct list_head *chans, char *str, int size,
 			out = chan;
 	}
 
-	return(chan_pair_config_string(in, out, str, size, error_out));
+	return chan_pair_config_string(in, out, str, size, error_out);
 }
 
 struct chan_type {
@@ -462,7 +468,7 @@ struct chan_type chan_table[] = {
 #endif
 };
 
-static struct chan *parse_chan(char *str, int pri, int device, 
+static struct chan *parse_chan(char *str, int pri, int device,
 			       struct chan_opts *opts)
 {
 	struct chan_type *entry;
@@ -484,14 +490,17 @@ static struct chan *parse_chan(char *str, int pri, int device,
 	if(ops == NULL){
 		my_printf("parse_chan couldn't parse \"%s\"\n",
 		       str);
-		return(NULL);
+		return NULL;
 	}
-	if(ops->init == NULL) return(NULL); 
+	if(ops->init == NULL)
+		return NULL;
 	data = (*ops->init)(str, device, opts);
-	if(data == NULL) return(NULL);
+	if(data == NULL)
+		return NULL;
 
 	chan = kmalloc(sizeof(*chan), GFP_ATOMIC);
-	if(chan == NULL) return(NULL);
+	if(chan == NULL)
+		return NULL;
 	*chan = ((struct chan) { .list	 	= LIST_HEAD_INIT(chan->list),
 				 .primary	= 1,
 				 .input		= 0,
@@ -501,7 +510,7 @@ static struct chan *parse_chan(char *str, int pri, int device,
 				 .pri 		= pri,
 				 .ops 		= ops,
 				 .data 		= data });
-	return(chan);
+	return chan;
 }
 
 int parse_chan_pair(char *str, struct list_head *chans, int pri, int device,
@@ -512,7 +521,8 @@ int parse_chan_pair(char *str, struct list_head *chans, int pri, int device,
 
 	if(!list_empty(chans)){
 		chan = list_entry(chans->next, struct chan, list);
-		if(chan->pri >= pri) return(0);
+		if(chan->pri >= pri)
+			return 0;
 		free_chan(chans);
 		INIT_LIST_HEAD(chans);
 	}
@@ -523,23 +533,29 @@ int parse_chan_pair(char *str, struct list_head *chans, int pri, int device,
 		*out = '\0';
 		out++;
 		new = parse_chan(in, pri, device, opts);
-		if(new == NULL) return(-1);
+		if(new == NULL)
+			return -1;
+
 		new->input = 1;
 		list_add(&new->list, chans);
 
 		new = parse_chan(out, pri, device, opts);
-		if(new == NULL) return(-1);
+		if(new == NULL)
+			return -1;
+
 		list_add(&new->list, chans);
 		new->output = 1;
 	}
 	else {
 		new = parse_chan(str, pri, device, opts);
-		if(new == NULL) return(-1);
+		if(new == NULL)
+			return -1;
+
 		list_add(&new->list, chans);
 		new->input = 1;
 		new->output = 1;
 	}
-	return(0);
+	return 0;
 }
 
 int chan_out_fd(struct list_head *chans)
@@ -550,9 +566,9 @@ int chan_out_fd(struct list_head *chans)
 	list_for_each(ele, chans){
 		chan = list_entry(ele, struct chan, list);
 		if(chan->primary && chan->output)
-			return(chan->fd);
+			return chan->fd;
 	}
-	return(-1);
+	return -1;
 }
 
 void chan_interrupt(struct list_head *chans, struct work_struct *task,
@@ -567,7 +583,7 @@ void chan_interrupt(struct list_head *chans, struct work_struct *task,
 		chan = list_entry(ele, struct chan, list);
 		if(!chan->input || (chan->ops->read == NULL)) continue;
 		do {
-			if((tty != NULL) && 
+			if((tty != NULL) &&
 			   (tty->flip.count >= TTY_FLIPBUF_SIZE)){
 				schedule_work(task);
 				goto out;
diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index c31fc54..2ee00cb 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -124,7 +124,8 @@ static int buffer_data(struct line *line, const char *buf, int len)
 	if (len < end){
 		memcpy(line->tail, buf, len);
 		line->tail += len;
-	} else {
+	}
+	else {
 		/* The circular buffer is wrapping */
 		memcpy(line->tail, buf, end);
 		buf += end;
@@ -170,7 +171,7 @@ static int flush_buffer(struct line *line)
 	}
 
 	count = line->tail - line->head;
-	n = write_chan(&line->chan_list, line->head, count, 
+	n = write_chan(&line->chan_list, line->head, count,
 		       line->driver->write_irq);
 
 	if(n < 0)
@@ -227,7 +228,7 @@ int line_write(struct tty_struct *tty, const unsigned char *buf, int len)
 		if (err <= 0 && (err != -EAGAIN || !ret))
 			ret = err;
 	} else {
-		n = write_chan(&line->chan_list, buf, len, 
+		n = write_chan(&line->chan_list, buf, len,
 			       line->driver->write_irq);
 		if (n < 0) {
 			ret = n;
@@ -384,13 +385,13 @@ int line_setup_irq(int fd, int input, int output, struct tty_struct *tty)
 
 	if (input)
 		err = um_request_irq(driver->read_irq, fd, IRQ_READ,
-				       line_interrupt, flags, 
+				       line_interrupt, flags,
 				       driver->read_irq_name, tty);
 	if (err)
 		return err;
 	if (output)
 		err = um_request_irq(driver->write_irq, fd, IRQ_WRITE,
-					line_write_interrupt, flags, 
+					line_write_interrupt, flags,
 					driver->write_irq_name, tty);
 	line->have_irq = 1;
 	return err;
@@ -512,10 +513,11 @@ int line_setup(struct line *lines, unsigned int num, char *init, int all_allowed
 		/* We said con=/ssl= instead of con#=, so we are configuring all
 		 * consoles at once.*/
 		n = -1;
-	} else {
+	}
+	else {
 		n = simple_strtoul(init, &end, 0);
 		if(*end != '='){
-			printk(KERN_ERR "line_setup failed to parse \"%s\"\n", 
+			printk(KERN_ERR "line_setup failed to parse \"%s\"\n",
 			       init);
 			return 0;
 		}
@@ -527,7 +529,8 @@ int line_setup(struct line *lines, unsigned int num, char *init, int all_allowed
 		printk("line_setup - %d out of range ((0 ... %d) allowed)\n",
 		       n, num - 1);
 		return 0;
-	} else if (n >= 0){
+	}
+	else if (n >= 0){
 		if (lines[n].count > 0) {
 			printk("line_setup - device %d is open\n", n);
 			return 0;
@@ -541,11 +544,13 @@ int line_setup(struct line *lines, unsigned int num, char *init, int all_allowed
 				lines[n].valid = 1;
 			}	
 		}
-	} else if(!all_allowed){
+	}
+	else if(!all_allowed){
 		printk("line_setup - can't configure all devices from "
 		       "mconsole\n");
 		return 0;
-	} else {
+	}
+	else {
 		for(i = 0; i < num; i++){
 			if(lines[i].init_pri <= INIT_ALL){
 				lines[i].init_pri = INIT_ALL;
@@ -627,7 +632,7 @@ int line_remove(struct line *lines, unsigned int num, int n)
 }
 
 struct tty_driver *line_register_devfs(struct lines *set,
-			 struct line_driver *line_driver, 
+			 struct line_driver *line_driver,
 			 struct tty_operations *ops, struct line *lines,
 			 int nlines)
 {
@@ -656,7 +661,7 @@ struct tty_driver *line_register_devfs(struct lines *set,
 	}
 
 	for(i = 0; i < nlines; i++){
-		if(!lines[i].valid) 
+		if(!lines[i].valid)
 			tty_unregister_device(driver, i);
 	}
 
@@ -677,11 +682,12 @@ void lines_init(struct line *lines, int nlines)
 		line = &lines[i];
 		INIT_LIST_HEAD(&line->chan_list);
 		spin_lock_init(&line->lock);
-		if(line->init_str != NULL){
-			line->init_str = kstrdup(line->init_str, GFP_KERNEL);
-			if(line->init_str == NULL)
-				printk("lines_init - kstrdup returned NULL\n");
-		}
+		if(line->init_str == NULL)
+			continue;
+
+		line->init_str = kstrdup(line->init_str, GFP_KERNEL);
+		if(line->init_str == NULL)
+			printk("lines_init - kstrdup returned NULL\n");
 	}
 }
 
@@ -717,8 +723,7 @@ irqreturn_t winch_interrupt(int irq, void *data, struct pt_regs *unused)
 	tty  = winch->tty;
 	if (tty != NULL) {
 		line = tty->driver_data;
-		chan_window_size(&line->chan_list,
-				 &tty->winsize.ws_row, 
+		chan_window_size(&line->chan_list, &tty->winsize.ws_row,
 				 &tty->winsize.ws_col);
 		kill_pg(tty->pgrp, SIGWINCH, 1);
 	}
@@ -749,7 +754,7 @@ void register_winch_irq(int fd, int tty_fd, int pid, struct tty_struct *tty)
 	spin_unlock(&winch_handler_lock);
 
 	if(um_request_irq(WINCH_IRQ, fd, IRQ_READ, winch_interrupt,
-			  SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM, 
+			  SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM,
 			  "winch", winch) < 0)
 		printk("register_winch_irq - failed to register IRQ\n");
 }
@@ -800,7 +805,7 @@ static void winch_cleanup(void)
 			deactivate_fd(winch->fd, WINCH_IRQ);
 			os_close_file(winch->fd);
 		}
-		if(winch->pid != -1) 
+		if(winch->pid != -1)
 			os_kill_process(winch->pid, 1);
 	}
 }
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index b367864..355866a 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -34,7 +34,7 @@
 #include "irq_kern.h"
 #include "choose-mode.h"
 
-static int do_unlink_socket(struct notifier_block *notifier, 
+static int do_unlink_socket(struct notifier_block *notifier,
 			    unsigned long what, void *data)
 {
 	return(mconsole_unlink_socket());
@@ -46,7 +46,7 @@ static struct notifier_block reboot_notifier = {
 	.priority		= 0,
 };
 
-/* Safe without explicit locking for now.  Tasklets provide their own 
+/* Safe without explicit locking for now.  Tasklets provide their own
  * locking, and the interrupt handler is safe because it can't interrupt
  * itself and it can only happen on CPU 0.
  */
@@ -60,7 +60,7 @@ static void mc_work_proc(void *unused)
 
 	while(!list_empty(&mc_requests)){
 		local_save_flags(flags);
-		req = list_entry(mc_requests.next, struct mconsole_entry, 
+		req = list_entry(mc_requests.next, struct mconsole_entry,
 				 list);
 		list_del(&req->list);
 		local_irq_restore(flags);
@@ -103,8 +103,8 @@ void mconsole_version(struct mc_request *req)
 {
 	char version[256];
 
-	sprintf(version, "%s %s %s %s %s", system_utsname.sysname, 
-		system_utsname.nodename, system_utsname.release, 
+	sprintf(version, "%s %s %s %s %s", system_utsname.sysname,
+		system_utsname.nodename, system_utsname.release,
 		system_utsname.version, system_utsname.machine);
 	mconsole_reply(req, version, 0, 0);
 }
@@ -348,7 +348,7 @@ static struct mc_device *mconsole_find_dev(char *name)
 
 #define CONFIG_BUF_SIZE 64
 
-static void mconsole_get_config(int (*get_config)(char *, char *, int, 
+static void mconsole_get_config(int (*get_config)(char *, char *, int,
 						  char **),
 				struct mc_request *req, char *name)
 {
@@ -389,7 +389,6 @@ static void mconsole_get_config(int (*get_config)(char *, char *, int,
  out:
 	if(buf != default_buf)
 		kfree(buf);
-	
 }
 
 void mconsole_config(struct mc_request *req)
@@ -420,7 +419,7 @@ void mconsole_config(struct mc_request *req)
 
 void mconsole_remove(struct mc_request *req)
 {
-	struct mc_device *dev;	
+	struct mc_device *dev;
 	char *ptr = req->request.data, *err_msg = "";
         char error[256];
 	int err, start, end, n;
@@ -534,7 +533,7 @@ void mconsole_stack(struct mc_request *req)
 /* Changed by mconsole_setup, which is __setup, and called before SMP is
  * active.
  */
-static char *notify_socket = NULL; 
+static char *notify_socket = NULL;
 
 int mconsole_init(void)
 {
@@ -566,13 +565,13 @@ int mconsole_init(void)
 		notify_socket = kstrdup(notify_socket, GFP_KERNEL);
 		if(notify_socket != NULL)
 			mconsole_notify(notify_socket, MCONSOLE_SOCKET,
-					mconsole_socket_name, 
+					mconsole_socket_name,
 					strlen(mconsole_socket_name) + 1);
 		else printk(KERN_ERR "mconsole_setup failed to strdup "
 			    "string\n");
 	}
 
-	printk("mconsole (version %d) initialized on %s\n", 
+	printk("mconsole (version %d) initialized on %s\n",
 	       MCONSOLE_VERSION, mconsole_socket_name);
 	return(0);
 }
@@ -585,7 +584,7 @@ static int write_proc_mconsole(struct file *file, const char __user *buffer,
 	char *buf;
 
 	buf = kmalloc(count + 1, GFP_KERNEL);
-	if(buf == NULL) 
+	if(buf == NULL)
 		return(-ENOMEM);
 
 	if(copy_from_user(buf, buffer, count)){
@@ -661,7 +660,7 @@ static int notify_panic(struct notifier_block *self, unsigned long unused1,
 
 	if(notify_socket == NULL) return(0);
 
-	mconsole_notify(notify_socket, MCONSOLE_PANIC, message, 
+	mconsole_notify(notify_socket, MCONSOLE_PANIC, message,
 			strlen(message) + 1);
 	return(0);
 }
diff --git a/arch/um/drivers/ssl.c b/arch/um/drivers/ssl.c
index 62e04ec..95a3eaa 100644
--- a/arch/um/drivers/ssl.c
+++ b/arch/um/drivers/ssl.c
@@ -69,7 +69,7 @@ static struct line_driver driver = {
 		.name  		= "ssl",
 		.config 	= ssl_config,
 		.get_config 	= ssl_get_config,
-                .id		= line_id,
+		.id		= line_id,
 		.remove 	= ssl_remove,
 	},
 };
@@ -84,21 +84,21 @@ static struct lines lines = LINES_INIT(NR_PORTS);
 
 static int ssl_config(char *str)
 {
-	return(line_config(serial_lines, 
-			   sizeof(serial_lines)/sizeof(serial_lines[0]), str));
+	return line_config(serial_lines,
+			   sizeof(serial_lines)/sizeof(serial_lines[0]), str);
 }
 
 static int ssl_get_config(char *dev, char *str, int size, char **error_out)
 {
-	return(line_get_config(dev, serial_lines, 
-			       sizeof(serial_lines)/sizeof(serial_lines[0]), 
-			       str, size, error_out));
+	return line_get_config(dev, serial_lines,
+			       sizeof(serial_lines)/sizeof(serial_lines[0]),
+			       str, size, error_out);
 }
 
 static int ssl_remove(int n)
 {
-        return line_remove(serial_lines,
-                           sizeof(serial_lines)/sizeof(serial_lines[0]), n);
+	return line_remove(serial_lines,
+			   sizeof(serial_lines)/sizeof(serial_lines[0]), n);
 }
 
 int ssl_open(struct tty_struct *tty, struct file *filp)
@@ -183,7 +183,7 @@ static int ssl_console_setup(struct console *co, char *options)
 {
 	struct line *line = &serial_lines[co->index];
 
-	return console_open_chan(line,co,&opts);
+	return console_open_chan(line, co, &opts);
 }
 
 static struct console ssl_cons = {
@@ -199,10 +199,11 @@ int ssl_init(void)
 {
 	char *new_title;
 
-	printk(KERN_INFO "Initializing software serial port version %d\n", 
+	printk(KERN_INFO "Initializing software serial port version %d\n",
 	       ssl_version);
 	ssl_driver = line_register_devfs(&lines, &driver, &ssl_ops,
-					 serial_lines, ARRAY_SIZE(serial_lines));
+					 serial_lines,
+					 ARRAY_SIZE(serial_lines));
 
 	lines_init(serial_lines, sizeof(serial_lines)/sizeof(serial_lines[0]));
 
@@ -212,7 +213,7 @@ int ssl_init(void)
 
 	ssl_init_done = 1;
 	register_console(&ssl_cons);
-	return(0);
+	return 0;
 }
 late_initcall(ssl_init);
 
@@ -227,9 +228,9 @@ __uml_exitcall(ssl_exit);
 
 static int ssl_chan_setup(char *str)
 {
-	return(line_setup(serial_lines,
+	return line_setup(serial_lines,
 			  sizeof(serial_lines)/sizeof(serial_lines[0]),
-			  str, 1));
+			  str, 1);
 }
 
 __setup("ssl", ssl_chan_setup);
diff --git a/arch/um/drivers/stdio_console.c b/arch/um/drivers/stdio_console.c
index 005aa63..8f3b168 100644
--- a/arch/um/drivers/stdio_console.c
+++ b/arch/um/drivers/stdio_console.c
@@ -75,7 +75,7 @@ static struct line_driver driver = {
 		.name  		= "con",
 		.config 	= con_config,
 		.get_config 	= con_get_config,
-                .id		= line_id,
+		.id		= line_id,
 		.remove 	= con_remove,
 	},
 };
@@ -86,23 +86,23 @@ static struct lines console_lines = LINES_INIT(MAX_TTYS);
  * individual elements are protected by individual semaphores.
  */
 struct line vts[MAX_TTYS] = { LINE_INIT(CONFIG_CON_ZERO_CHAN, &driver),
-			      [ 1 ... MAX_TTYS - 1 ] = 
+			      [ 1 ... MAX_TTYS - 1 ] =
 			      LINE_INIT(CONFIG_CON_CHAN, &driver) };
 
 static int con_config(char *str)
 {
-	return(line_config(vts, sizeof(vts)/sizeof(vts[0]), str));
+	return line_config(vts, sizeof(vts)/sizeof(vts[0]), str);
 }
 
 static int con_get_config(char *dev, char *str, int size, char **error_out)
 {
-	return(line_get_config(dev, vts, sizeof(vts)/sizeof(vts[0]), str, 
-			       size, error_out));
+	return line_get_config(dev, vts, sizeof(vts)/sizeof(vts[0]), str,
+			       size, error_out);
 }
 
 static int con_remove(int n)
 {
-        return line_remove(vts, sizeof(vts)/sizeof(vts[0]), n);
+	return line_remove(vts, sizeof(vts)/sizeof(vts[0]), n);
 }
 
 static int con_open(struct tty_struct *tty, struct file *filp)
@@ -117,7 +117,7 @@ static struct tty_operations console_ops = {
 	.close 	 		= line_close,
 	.write 	 		= line_write,
 	.put_char 		= line_put_char,
- 	.write_room		= line_write_room,
+	.write_room		= line_write_room,
 	.chars_in_buffer 	= line_chars_in_buffer,
 	.flush_buffer 		= line_flush_buffer,
 	.flush_chars 		= line_flush_chars,
@@ -126,7 +126,7 @@ static struct tty_operations console_ops = {
 };
 
 static void uml_console_write(struct console *console, const char *string,
-			  unsigned len)
+			      unsigned len)
 {
 	struct line *line = &vts[console->index];
 	unsigned long flags;
@@ -146,7 +146,7 @@ static int uml_console_setup(struct console *co, char *options)
 {
 	struct line *line = &vts[co->index];
 
-	return console_open_chan(line,co,&opts);
+	return console_open_chan(line, co, &opts);
 }
 
 static struct console stdiocons = {
@@ -156,7 +156,7 @@ static struct console stdiocons = {
 	.setup		= uml_console_setup,
 	.flags		= CON_PRINTBUFFER,
 	.index		= -1,
-	.data           = &vts,
+	.data		= &vts,
 };
 
 int stdio_init(void)
@@ -166,7 +166,7 @@ int stdio_init(void)
 	console_driver = line_register_devfs(&console_lines, &driver,
 					     &console_ops, vts,
 					     ARRAY_SIZE(vts));
-	if (NULL == console_driver)
+	if (console_driver == NULL)
 		return -1;
 	printk(KERN_INFO "Initialized stdio console driver\n");
 
@@ -178,7 +178,7 @@ int stdio_init(void)
 
 	con_init_done = 1;
 	register_console(&stdiocons);
-	return(0);
+	return 0;
 }
 late_initcall(stdio_init);
 
@@ -192,7 +192,7 @@ __uml_exitcall(console_exit);
 
 static int console_chan_setup(char *str)
 {
-	return(line_setup(vts, sizeof(vts)/sizeof(vts[0]), str, 1));
+	return line_setup(vts, sizeof(vts)/sizeof(vts[0]), str, 1);
 }
 __setup("con", console_chan_setup);
 __channel_help(console_chan_setup, "con");
diff --git a/arch/um/include/chan_kern.h b/arch/um/include/chan_kern.h
index da9a671..9ac0691 100644
--- a/arch/um/include/chan_kern.h
+++ b/arch/um/include/chan_kern.h
@@ -47,14 +47,3 @@ extern int chan_config_string(struct list_head *chans, char *str, int size,
 			      char **error_out);
 
 #endif
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/include/line.h b/arch/um/include/line.h
index 5323d22..315788c 100644
--- a/arch/um/include/line.h
+++ b/arch/um/include/line.h
@@ -64,8 +64,8 @@ struct line {
 	  head :	NULL, \
 	  tail :	NULL, \
 	  sigio :	0, \
- 	  driver :	d, \
-          have_irq :	0 }
+	  driver :	d, \
+	  have_irq :	0 }
 
 struct lines {
 	int num;
@@ -74,11 +74,12 @@ struct lines {
 #define LINES_INIT(n) {  num :		n }
 
 extern void line_close(struct tty_struct *tty, struct file * filp);
-extern int line_open(struct line *lines, struct tty_struct *tty, 
+extern int line_open(struct line *lines, struct tty_struct *tty,
 		     struct chan_opts *opts);
 extern int line_setup(struct line *lines, unsigned int sizeof_lines, char *init,
 		      int all_allowed);
-extern int line_write(struct tty_struct *tty, const unsigned char *buf, int len);
+extern int line_write(struct tty_struct *tty, const unsigned char *buf,
+		      int len);
 extern void line_put_char(struct tty_struct *tty, unsigned char ch);
 extern void line_set_termios(struct tty_struct *tty, struct termios * old);
 extern int line_chars_in_buffer(struct tty_struct *tty);
@@ -89,21 +90,24 @@ extern int line_ioctl(struct tty_struct *tty, struct file * file,
 		      unsigned int cmd, unsigned long arg);
 
 extern char *add_xterm_umid(char *base);
-extern int line_setup_irq(int fd, int input, int output, struct tty_struct *tty);
+extern int line_setup_irq(int fd, int input, int output,
+			  struct tty_struct *tty);
 extern void line_close_chan(struct line *line);
 extern void line_disable(struct tty_struct *tty, int current_irq);
-extern struct tty_driver * line_register_devfs(struct lines *set, 
-				struct line_driver *line_driver, 
+extern struct tty_driver * line_register_devfs(struct lines *set,
+				struct line_driver *line_driver,
 				struct tty_operations *driver,
 				struct line *lines,
 				int nlines);
 extern void lines_init(struct line *lines, int nlines);
 extern void close_lines(struct line *lines, int nlines);
 
-extern int line_config(struct line *lines, unsigned int sizeof_lines, char *str);
+extern int line_config(struct line *lines, unsigned int sizeof_lines,
+		       char *str);
 extern int line_id(char **str, int *start_out, int *end_out);
 extern int line_remove(struct line *lines, unsigned int sizeof_lines, int n);
-extern int line_get_config(char *dev, struct line *lines, unsigned int sizeof_lines, char *str,
+extern int line_get_config(char *dev, struct line *lines,
+			   unsigned int sizeof_lines, char *str,
 			   int size, char **error_out);
 
 #endif
diff --git a/arch/um/os-Linux/aio.c b/arch/um/os-Linux/aio.c
index 0b78bb7..f897140 100644
--- a/arch/um/os-Linux/aio.c
+++ b/arch/um/os-Linux/aio.c
@@ -16,12 +16,12 @@
 #include "mode.h"
 
 struct aio_thread_req {
-        enum aio_type type;
-        int io_fd;
-        unsigned long long offset;
-        char *buf;
-        int len;
-        struct aio_context *aio;
+	enum aio_type type;
+	int io_fd;
+	unsigned long long offset;
+	char *buf;
+	int len;
+	struct aio_context *aio;
 };
 
 static int aio_req_fd_r = -1;
@@ -38,18 +38,18 @@ static int aio_req_fd_w = -1;
 
 static long io_setup(int n, aio_context_t *ctxp)
 {
-        return syscall(__NR_io_setup, n, ctxp);
+	return syscall(__NR_io_setup, n, ctxp);
 }
 
 static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp)
 {
-        return syscall(__NR_io_submit, ctx, nr, iocbpp);
+	return syscall(__NR_io_submit, ctx, nr, iocbpp);
 }
 
 static long io_getevents(aio_context_t ctx_id, long min_nr, long nr,
-                         struct io_event *events, struct timespec *timeout)
+			 struct io_event *events, struct timespec *timeout)
 {
-        return syscall(__NR_io_getevents, ctx_id, min_nr, nr, events, timeout);
+	return syscall(__NR_io_getevents, ctx_id, min_nr, nr, events, timeout);
 }
 
 #endif
@@ -66,150 +66,150 @@ static long io_getevents(aio_context_t ctx_id, long min_nr, long nr,
  */
 
 static int do_aio(aio_context_t ctx, enum aio_type type, int fd, char *buf,
-                  int len, unsigned long long offset, struct aio_context *aio)
+		  int len, unsigned long long offset, struct aio_context *aio)
 {
-        struct iocb iocb, *iocbp = &iocb;
-        char c;
-        int err;
-
-        iocb = ((struct iocb) { .aio_data 	= (unsigned long) aio,
-                                .aio_reqprio	= 0,
-                                .aio_fildes	= fd,
-                                .aio_buf	= (unsigned long) buf,
-                                .aio_nbytes	= len,
-                                .aio_offset	= offset,
-                                .aio_reserved1	= 0,
-                                .aio_reserved2	= 0,
-                                .aio_reserved3	= 0 });
-
-        switch(type){
-        case AIO_READ:
-                iocb.aio_lio_opcode = IOCB_CMD_PREAD;
-                err = io_submit(ctx, 1, &iocbp);
-                break;
-        case AIO_WRITE:
-                iocb.aio_lio_opcode = IOCB_CMD_PWRITE;
-                err = io_submit(ctx, 1, &iocbp);
-                break;
-        case AIO_MMAP:
-                iocb.aio_lio_opcode = IOCB_CMD_PREAD;
-                iocb.aio_buf = (unsigned long) &c;
-                iocb.aio_nbytes = sizeof(c);
-                err = io_submit(ctx, 1, &iocbp);
-                break;
-        default:
-                printk("Bogus op in do_aio - %d\n", type);
-                err = -EINVAL;
-                break;
-        }
-
-        if(err > 0)
-                err = 0;
+	struct iocb iocb, *iocbp = &iocb;
+	char c;
+	int err;
+
+	iocb = ((struct iocb) { .aio_data 	= (unsigned long) aio,
+				.aio_reqprio	= 0,
+				.aio_fildes	= fd,
+				.aio_buf	= (unsigned long) buf,
+				.aio_nbytes	= len,
+				.aio_offset	= offset,
+				.aio_reserved1	= 0,
+				.aio_reserved2	= 0,
+				.aio_reserved3	= 0 });
+
+	switch(type){
+	case AIO_READ:
+		iocb.aio_lio_opcode = IOCB_CMD_PREAD;
+		err = io_submit(ctx, 1, &iocbp);
+		break;
+	case AIO_WRITE:
+		iocb.aio_lio_opcode = IOCB_CMD_PWRITE;
+		err = io_submit(ctx, 1, &iocbp);
+		break;
+	case AIO_MMAP:
+		iocb.aio_lio_opcode = IOCB_CMD_PREAD;
+		iocb.aio_buf = (unsigned long) &c;
+		iocb.aio_nbytes = sizeof(c);
+		err = io_submit(ctx, 1, &iocbp);
+		break;
+	default:
+		printk("Bogus op in do_aio - %d\n", type);
+		err = -EINVAL;
+		break;
+	}
+
+	if(err > 0)
+		err = 0;
 	else
 		err = -errno;
 
-        return err;
+	return err;
 }
 
 static aio_context_t ctx = 0;
 
 static int aio_thread(void *arg)
 {
-        struct aio_thread_reply reply;
-        struct io_event event;
-        int err, n, reply_fd;
-
-        signal(SIGWINCH, SIG_IGN);
-
-        while(1){
-                n = io_getevents(ctx, 1, 1, &event, NULL);
-                if(n < 0){
-                        if(errno == EINTR)
-                                continue;
-                        printk("aio_thread - io_getevents failed, "
-                               "errno = %d\n", errno);
-                }
-                else {
-                        reply = ((struct aio_thread_reply)
-				 { .data = (void *) (long) event.data,
-				   .err	= event.res });
+	struct aio_thread_reply reply;
+	struct io_event event;
+	int err, n, reply_fd;
+
+	signal(SIGWINCH, SIG_IGN);
+
+	while(1){
+		n = io_getevents(ctx, 1, 1, &event, NULL);
+		if(n < 0){
+			if(errno == EINTR)
+				continue;
+			printk("aio_thread - io_getevents failed, "
+			       "errno = %d\n", errno);
+		}
+		else {
+			reply = ((struct aio_thread_reply)
+				{ .data = (void *) (long) event.data,
+						.err	= event.res });
 			reply_fd = ((struct aio_context *) reply.data)->reply_fd;
 			err = os_write_file(reply_fd, &reply, sizeof(reply));
-                        if(err != sizeof(reply))
+			if(err != sizeof(reply))
 				printk("aio_thread - write failed, fd = %d, "
-                                       "err = %d\n", aio_req_fd_r, -err);
-                }
-        }
-        return 0;
+				       "err = %d\n", aio_req_fd_r, -err);
+		}
+	}
+	return 0;
 }
 
 #endif
 
 static int do_not_aio(struct aio_thread_req *req)
 {
-        char c;
-        int err;
-
-        switch(req->type){
-        case AIO_READ:
-                err = os_seek_file(req->io_fd, req->offset);
-                if(err)
-                        goto out;
-
-                err = os_read_file(req->io_fd, req->buf, req->len);
-                break;
-        case AIO_WRITE:
-                err = os_seek_file(req->io_fd, req->offset);
-                if(err)
-                        goto out;
-
-                err = os_write_file(req->io_fd, req->buf, req->len);
-                break;
-        case AIO_MMAP:
-                err = os_seek_file(req->io_fd, req->offset);
-                if(err)
-                        goto out;
-
-                err = os_read_file(req->io_fd, &c, sizeof(c));
-                break;
-        default:
-                printk("do_not_aio - bad request type : %d\n", req->type);
-                err = -EINVAL;
-                break;
-        }
-
- out:
-        return err;
+	char c;
+	int err;
+
+	switch(req->type){
+	case AIO_READ:
+		err = os_seek_file(req->io_fd, req->offset);
+		if(err)
+			goto out;
+
+		err = os_read_file(req->io_fd, req->buf, req->len);
+		break;
+	case AIO_WRITE:
+		err = os_seek_file(req->io_fd, req->offset);
+		if(err)
+			goto out;
+
+		err = os_write_file(req->io_fd, req->buf, req->len);
+		break;
+	case AIO_MMAP:
+		err = os_seek_file(req->io_fd, req->offset);
+		if(err)
+			goto out;
+
+		err = os_read_file(req->io_fd, &c, sizeof(c));
+		break;
+	default:
+		printk("do_not_aio - bad request type : %d\n", req->type);
+		err = -EINVAL;
+		break;
+	}
+
+out:
+	return err;
 }
 
 static int not_aio_thread(void *arg)
 {
-        struct aio_thread_req req;
-        struct aio_thread_reply reply;
-        int err;
-
-        signal(SIGWINCH, SIG_IGN);
-        while(1){
-                err = os_read_file(aio_req_fd_r, &req, sizeof(req));
-                if(err != sizeof(req)){
-                        if(err < 0)
-                                printk("not_aio_thread - read failed, "
-                                       "fd = %d, err = %d\n", aio_req_fd_r,
-                                       -err);
-                        else {
-                                printk("not_aio_thread - short read, fd = %d, "
-                                       "length = %d\n", aio_req_fd_r, err);
-                        }
-                        continue;
-                }
-                err = do_not_aio(&req);
-                reply = ((struct aio_thread_reply) { .data 	= req.aio,
-                                                     .err	= err });
-                err = os_write_file(req.aio->reply_fd, &reply, sizeof(reply));
-                if(err != sizeof(reply))
-                        printk("not_aio_thread - write failed, fd = %d, "
-                               "err = %d\n", aio_req_fd_r, -err);
-        }
+	struct aio_thread_req req;
+	struct aio_thread_reply reply;
+	int err;
+
+	signal(SIGWINCH, SIG_IGN);
+	while(1){
+		err = os_read_file(aio_req_fd_r, &req, sizeof(req));
+		if(err != sizeof(req)){
+			if(err < 0)
+				printk("not_aio_thread - read failed, "
+				       "fd = %d, err = %d\n", aio_req_fd_r,
+				       -err);
+			else {
+				printk("not_aio_thread - short read, fd = %d, "
+				       "length = %d\n", aio_req_fd_r, err);
+			}
+			continue;
+		}
+		err = do_not_aio(&req);
+		reply = ((struct aio_thread_reply) { .data 	= req.aio,
+					 .err	= err });
+		err = os_write_file(req.aio->reply_fd, &reply, sizeof(reply));
+		if(err != sizeof(reply))
+			printk("not_aio_thread - write failed, fd = %d, "
+			       "err = %d\n", aio_req_fd_r, -err);
+	}
 
 	return 0;
 }
@@ -218,93 +218,93 @@ static int aio_pid = -1;
 
 static int init_aio_24(void)
 {
-        unsigned long stack;
-        int fds[2], err;
-
-        err = os_pipe(fds, 1, 1);
-        if(err)
-                goto out;
-
-        aio_req_fd_w = fds[0];
-        aio_req_fd_r = fds[1];
-        err = run_helper_thread(not_aio_thread, NULL,
-                                CLONE_FILES | CLONE_VM | SIGCHLD, &stack, 0);
-        if(err < 0)
-                goto out_close_pipe;
-
-        aio_pid = err;
-        goto out;
-
- out_close_pipe:
-        os_close_file(fds[0]);
-        os_close_file(fds[1]);
-        aio_req_fd_w = -1;
-        aio_req_fd_r = -1;
- out:
+	unsigned long stack;
+	int fds[2], err;
+
+	err = os_pipe(fds, 1, 1);
+	if(err)
+		goto out;
+
+	aio_req_fd_w = fds[0];
+	aio_req_fd_r = fds[1];
+	err = run_helper_thread(not_aio_thread, NULL,
+				CLONE_FILES | CLONE_VM | SIGCHLD, &stack, 0);
+	if(err < 0)
+		goto out_close_pipe;
+
+	aio_pid = err;
+	goto out;
+
+out_close_pipe:
+	os_close_file(fds[0]);
+	os_close_file(fds[1]);
+	aio_req_fd_w = -1;
+	aio_req_fd_r = -1;
+out:
 #ifndef HAVE_AIO_ABI
 	printk("/usr/include/linux/aio_abi.h not present during build\n");
 #endif
 	printk("2.6 host AIO support not used - falling back to I/O "
 	       "thread\n");
-        return 0;
+	return 0;
 }
 
 #ifdef HAVE_AIO_ABI
 #define DEFAULT_24_AIO 0
 static int init_aio_26(void)
 {
-        unsigned long stack;
-        int err;
+	unsigned long stack;
+	int err;
 
-        if(io_setup(256, &ctx)){
+	if(io_setup(256, &ctx)){
 		err = -errno;
-                printk("aio_thread failed to initialize context, err = %d\n",
-                       errno);
-                return err;
-        }
+		printk("aio_thread failed to initialize context, err = %d\n",
+		       errno);
+		return err;
+	}
 
-        err = run_helper_thread(aio_thread, NULL,
-                                CLONE_FILES | CLONE_VM | SIGCHLD, &stack, 0);
-        if(err < 0)
-                return err;
+	err = run_helper_thread(aio_thread, NULL,
+				CLONE_FILES | CLONE_VM | SIGCHLD, &stack, 0);
+	if(err < 0)
+		return err;
 
-        aio_pid = err;
+	aio_pid = err;
 
 	printk("Using 2.6 host AIO\n");
-        return 0;
+	return 0;
 }
 
 static int submit_aio_26(enum aio_type type, int io_fd, char *buf, int len,
 			 unsigned long long offset, struct aio_context *aio)
 {
-        struct aio_thread_reply reply;
-        int err;
-
-        err = do_aio(ctx, type, io_fd, buf, len, offset, aio);
-        if(err){
-                reply = ((struct aio_thread_reply) { .data = aio,
-                                                     .err  = err });
-                err = os_write_file(aio->reply_fd, &reply, sizeof(reply));
-                if(err != sizeof(reply))
-                        printk("submit_aio_26 - write failed, "
-                               "fd = %d, err = %d\n", aio->reply_fd, -err);
-                else err = 0;
-        }
-
-        return err;
+	struct aio_thread_reply reply;
+	int err;
+
+	err = do_aio(ctx, type, io_fd, buf, len, offset, aio);
+	if(err){
+		reply = ((struct aio_thread_reply) { .data = aio,
+					 .err  = err });
+		err = os_write_file(aio->reply_fd, &reply, sizeof(reply));
+		if(err != sizeof(reply))
+			printk("submit_aio_26 - write failed, "
+			       "fd = %d, err = %d\n", aio->reply_fd, -err);
+		else err = 0;
+	}
+
+	return err;
 }
 
 #else
 #define DEFAULT_24_AIO 1
 static int init_aio_26(void)
 {
-        return -ENOSYS;
+	return -ENOSYS;
 }
 
 static int submit_aio_26(enum aio_type type, int io_fd, char *buf, int len,
 			 unsigned long long offset, struct aio_context *aio)
 {
-        return -ENOSYS;
+	return -ENOSYS;
 }
 #endif
 
@@ -312,8 +312,8 @@ static int aio_24 = DEFAULT_24_AIO;
 
 static int __init set_aio_24(char *name, int *add)
 {
-        aio_24 = 1;
-        return 0;
+	aio_24 = 1;
+	return 0;
 }
 
 __uml_setup("aio=2.4", set_aio_24,
@@ -330,28 +330,27 @@ __uml_setup("aio=2.4", set_aio_24,
 
 static int init_aio(void)
 {
-        int err;
-
-        CHOOSE_MODE(({
-                if(!aio_24){
-                        printk("Disabling 2.6 AIO in tt mode\n");
-                        aio_24 = 1;
-                } }), (void) 0);
-
-        if(!aio_24){
-                err = init_aio_26();
-                if(err && (errno == ENOSYS)){
-                        printk("2.6 AIO not supported on the host - "
-                               "reverting to 2.4 AIO\n");
-                        aio_24 = 1;
-                }
-                else return err;
-        }
-
-        if(aio_24)
-                return init_aio_24();
-
-        return 0;
+	int err;
+
+	CHOOSE_MODE(({ if(!aio_24){
+			    printk("Disabling 2.6 AIO in tt mode\n");
+			    aio_24 = 1;
+		    } }), (void) 0);
+
+	if(!aio_24){
+		err = init_aio_26();
+		if(err && (errno == ENOSYS)){
+			printk("2.6 AIO not supported on the host - "
+			       "reverting to 2.4 AIO\n");
+			aio_24 = 1;
+		}
+		else return err;
+	}
+
+	if(aio_24)
+		return init_aio_24();
+
+	return 0;
 }
 
 /* The reason for the __initcall/__uml_exitcall asymmetry is that init_aio
@@ -364,8 +363,8 @@ __initcall(init_aio);
 
 static void exit_aio(void)
 {
-        if(aio_pid != -1)
-                os_kill_process(aio_pid, 1);
+	if(aio_pid != -1)
+		os_kill_process(aio_pid, 1);
 }
 
 __uml_exitcall(exit_aio);
@@ -373,30 +372,30 @@ __uml_exitcall(exit_aio);
 static int submit_aio_24(enum aio_type type, int io_fd, char *buf, int len,
 			 unsigned long long offset, struct aio_context *aio)
 {
-        struct aio_thread_req req = { .type 		= type,
-                                      .io_fd		= io_fd,
-                                      .offset		= offset,
-                                      .buf		= buf,
-                                      .len		= len,
-                                      .aio		= aio,
-        };
-        int err;
-
-        err = os_write_file(aio_req_fd_w, &req, sizeof(req));
-        if(err == sizeof(req))
-                err = 0;
-
-        return err;
+	struct aio_thread_req req = { .type 		= type,
+				      .io_fd		= io_fd,
+				      .offset		= offset,
+				      .buf		= buf,
+				      .len		= len,
+				      .aio		= aio,
+	};
+	int err;
+
+	err = os_write_file(aio_req_fd_w, &req, sizeof(req));
+	if(err == sizeof(req))
+		err = 0;
+
+	return err;
 }
 
 int submit_aio(enum aio_type type, int io_fd, char *buf, int len,
-               unsigned long long offset, int reply_fd,
-               struct aio_context *aio)
+	       unsigned long long offset, int reply_fd,
+	       struct aio_context *aio)
 {
-        aio->reply_fd = reply_fd;
-        if(aio_24)
-                return submit_aio_24(type, io_fd, buf, len, offset, aio);
-        else {
-                return submit_aio_26(type, io_fd, buf, len, offset, aio);
-        }
+	aio->reply_fd = reply_fd;
+	if(aio_24)
+		return submit_aio_24(type, io_fd, buf, len, offset, aio);
+	else {
+		return submit_aio_26(type, io_fd, buf, len, offset, aio);
+	}
 }
-- 
cgit v1.1


From 0834cc77af6a8a650f803d4a7c3c0f134b366f87 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:18:51 -0800
Subject: [PATCH] uml: use ARRAY_SIZE

This patch replaces instances of "sizeof(foo)/sizeof(foo[0])" with
ARRAY_SIZE(foo), which expands to the same thing.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/ssl.c           | 20 +++++++-------------
 arch/um/drivers/stdio_console.c | 13 ++++++-------
 2 files changed, 13 insertions(+), 20 deletions(-)

diff --git a/arch/um/drivers/ssl.c b/arch/um/drivers/ssl.c
index 95a3eaa..8564784 100644
--- a/arch/um/drivers/ssl.c
+++ b/arch/um/drivers/ssl.c
@@ -84,21 +84,18 @@ static struct lines lines = LINES_INIT(NR_PORTS);
 
 static int ssl_config(char *str)
 {
-	return line_config(serial_lines,
-			   sizeof(serial_lines)/sizeof(serial_lines[0]), str);
+	return line_config(serial_lines, ARRAY_SIZE(serial_lines), str);
 }
 
 static int ssl_get_config(char *dev, char *str, int size, char **error_out)
 {
-	return line_get_config(dev, serial_lines,
-			       sizeof(serial_lines)/sizeof(serial_lines[0]),
-			       str, size, error_out);
+	return line_get_config(dev, serial_lines, ARRAY_SIZE(serial_lines), str,
+			       size, error_out);
 }
 
 static int ssl_remove(int n)
 {
-	return line_remove(serial_lines,
-			   sizeof(serial_lines)/sizeof(serial_lines[0]), n);
+	return line_remove(serial_lines, ARRAY_SIZE(serial_lines), n);
 }
 
 int ssl_open(struct tty_struct *tty, struct file *filp)
@@ -205,7 +202,7 @@ int ssl_init(void)
 					 serial_lines,
 					 ARRAY_SIZE(serial_lines));
 
-	lines_init(serial_lines, sizeof(serial_lines)/sizeof(serial_lines[0]));
+	lines_init(serial_lines, ARRAY_SIZE(serial_lines));
 
 	new_title = add_xterm_umid(opts.xterm_title);
 	if (new_title != NULL)
@@ -221,16 +218,13 @@ static void ssl_exit(void)
 {
 	if (!ssl_init_done)
 		return;
-	close_lines(serial_lines,
-		    sizeof(serial_lines)/sizeof(serial_lines[0]));
+	close_lines(serial_lines, ARRAY_SIZE(serial_lines));
 }
 __uml_exitcall(ssl_exit);
 
 static int ssl_chan_setup(char *str)
 {
-	return line_setup(serial_lines,
-			  sizeof(serial_lines)/sizeof(serial_lines[0]),
-			  str, 1);
+	return line_setup(serial_lines, ARRAY_SIZE(serial_lines), str, 1);
 }
 
 __setup("ssl", ssl_chan_setup);
diff --git a/arch/um/drivers/stdio_console.c b/arch/um/drivers/stdio_console.c
index 8f3b168..b77f7d2 100644
--- a/arch/um/drivers/stdio_console.c
+++ b/arch/um/drivers/stdio_console.c
@@ -91,18 +91,17 @@ struct line vts[MAX_TTYS] = { LINE_INIT(CONFIG_CON_ZERO_CHAN, &driver),
 
 static int con_config(char *str)
 {
-	return line_config(vts, sizeof(vts)/sizeof(vts[0]), str);
+	return line_config(vts, ARRAY_SIZE(vts), str);
 }
 
 static int con_get_config(char *dev, char *str, int size, char **error_out)
 {
-	return line_get_config(dev, vts, sizeof(vts)/sizeof(vts[0]), str,
-			       size, error_out);
+	return line_get_config(dev, vts, ARRAY_SIZE(vts), str, size, error_out);
 }
 
 static int con_remove(int n)
 {
-	return line_remove(vts, sizeof(vts)/sizeof(vts[0]), n);
+	return line_remove(vts, ARRAY_SIZE(vts), n);
 }
 
 static int con_open(struct tty_struct *tty, struct file *filp)
@@ -170,7 +169,7 @@ int stdio_init(void)
 		return -1;
 	printk(KERN_INFO "Initialized stdio console driver\n");
 
-	lines_init(vts, sizeof(vts)/sizeof(vts[0]));
+	lines_init(vts, ARRAY_SIZE(vts));
 
 	new_title = add_xterm_umid(opts.xterm_title);
 	if(new_title != NULL)
@@ -186,13 +185,13 @@ static void console_exit(void)
 {
 	if (!con_init_done)
 		return;
-	close_lines(vts, sizeof(vts)/sizeof(vts[0]));
+	close_lines(vts, ARRAY_SIZE(vts));
 }
 __uml_exitcall(console_exit);
 
 static int console_chan_setup(char *str)
 {
-	return line_setup(vts, sizeof(vts)/sizeof(vts[0]), str, 1);
+	return line_setup(vts, ARRAY_SIZE(vts), str, 1);
 }
 __setup("con", console_chan_setup);
 __channel_help(console_chan_setup, "con");
-- 
cgit v1.1


From 88890b88742debb97006df264b653d18acdc80d0 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:18:52 -0800
Subject: [PATCH] uml: Remove unneeded structure field

This removes a structure field which turned out to be pointless, and
references to it.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/chan_kern.c | 16 ++++++----------
 arch/um/drivers/line.c      |  2 +-
 arch/um/include/chan_kern.h |  5 ++---
 3 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c
index 8b1262e..59c9b3f 100644
--- a/arch/um/drivers/chan_kern.c
+++ b/arch/um/drivers/chan_kern.c
@@ -315,7 +315,7 @@ int console_open_chan(struct line *line, struct console *co,
 		return 0;
 
 	if (0 != parse_chan_pair(line->init_str, &line->chan_list,
-				 line->init_pri, co->index, opts))
+				 co->index, opts))
 		return -1;
 	if (0 != open_chan(&line->chan_list))
 		return -1;
@@ -468,8 +468,7 @@ struct chan_type chan_table[] = {
 #endif
 };
 
-static struct chan *parse_chan(char *str, int pri, int device,
-			       struct chan_opts *opts)
+static struct chan *parse_chan(char *str, int device, struct chan_opts *opts)
 {
 	struct chan_type *entry;
 	struct chan_ops *ops;
@@ -507,13 +506,12 @@ static struct chan *parse_chan(char *str, int pri, int device,
 				 .output 	= 0,
 				 .opened  	= 0,
 				 .fd 		= -1,
-				 .pri 		= pri,
 				 .ops 		= ops,
 				 .data 		= data });
 	return chan;
 }
 
-int parse_chan_pair(char *str, struct list_head *chans, int pri, int device,
+int parse_chan_pair(char *str, struct list_head *chans, int device,
 		    struct chan_opts *opts)
 {
 	struct chan *new, *chan;
@@ -521,8 +519,6 @@ int parse_chan_pair(char *str, struct list_head *chans, int pri, int device,
 
 	if(!list_empty(chans)){
 		chan = list_entry(chans->next, struct chan, list);
-		if(chan->pri >= pri)
-			return 0;
 		free_chan(chans);
 		INIT_LIST_HEAD(chans);
 	}
@@ -532,14 +528,14 @@ int parse_chan_pair(char *str, struct list_head *chans, int pri, int device,
 		in = str;
 		*out = '\0';
 		out++;
-		new = parse_chan(in, pri, device, opts);
+		new = parse_chan(in, device, opts);
 		if(new == NULL)
 			return -1;
 
 		new->input = 1;
 		list_add(&new->list, chans);
 
-		new = parse_chan(out, pri, device, opts);
+		new = parse_chan(out, device, opts);
 		if(new == NULL)
 			return -1;
 
@@ -547,7 +543,7 @@ int parse_chan_pair(char *str, struct list_head *chans, int pri, int device,
 		new->output = 1;
 	}
 	else {
-		new = parse_chan(str, pri, device, opts);
+		new = parse_chan(str, device, opts);
 		if(new == NULL)
 			return -1;
 
diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 2ee00cb..80ade22 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -438,7 +438,7 @@ int line_open(struct line *lines, struct tty_struct *tty,
 		}
 		if (list_empty(&line->chan_list)) {
 			err = parse_chan_pair(line->init_str, &line->chan_list,
-					      line->init_pri, tty->index, opts);
+					      tty->index, opts);
 			if(err) goto out;
 			err = open_chan(&line->chan_list);
 			if(err) goto out;
diff --git a/arch/um/include/chan_kern.h b/arch/um/include/chan_kern.h
index 9ac0691..22bf3a7 100644
--- a/arch/um/include/chan_kern.h
+++ b/arch/um/include/chan_kern.h
@@ -20,15 +20,14 @@ struct chan {
 	unsigned int output:1;
 	unsigned int opened:1;
 	int fd;
-	enum chan_init_pri pri;
 	struct chan_ops *ops;
 	void *data;
 };
 
 extern void chan_interrupt(struct list_head *chans, struct work_struct *task,
 			   struct tty_struct *tty, int irq);
-extern int parse_chan_pair(char *str, struct list_head *chans, int pri, 
-			   int device, struct chan_opts *opts);
+extern int parse_chan_pair(char *str, struct list_head *chans, int device,
+			   struct chan_opts *opts);
 extern int open_chan(struct list_head *chans);
 extern int write_chan(struct list_head *chans, const char *buf, int len,
 			     int write_irq);
-- 
cgit v1.1


From d571cd18f225542460b5d9b83e5e0d507be71656 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:18:53 -0800
Subject: [PATCH] uml: Move mconsole support out of generic code

A bit of restructuring which eliminates the all_allowed argument (which is
mconsole-specific) to line_setup.  That logic is moved to the mconsole
callback.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/line.c          | 21 ++++++++++-----------
 arch/um/drivers/ssl.c           |  2 +-
 arch/um/drivers/stdio_console.c |  2 +-
 arch/um/include/line.h          |  4 ++--
 4 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 80ade22..9af55ec 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -500,11 +500,9 @@ void close_lines(struct line *lines, int nlines)
 /* Common setup code for both startup command line and mconsole initialization.
  * @lines contains the the array (of size @num) to modify;
  * @init is the setup string;
- * @all_allowed is a boolean saying if we can setup the whole @lines
- * at once. For instance, it will be usually true for startup init. (where we
- * can use con=xterm) and false for mconsole.*/
+ */
 
-int line_setup(struct line *lines, unsigned int num, char *init, int all_allowed)
+int line_setup(struct line *lines, unsigned int num, char *init)
 {
 	int i, n;
 	char *end;
@@ -545,11 +543,6 @@ int line_setup(struct line *lines, unsigned int num, char *init, int all_allowed
 			}	
 		}
 	}
-	else if(!all_allowed){
-		printk("line_setup - can't configure all devices from "
-		       "mconsole\n");
-		return 0;
-	}
 	else {
 		for(i = 0; i < num; i++){
 			if(lines[i].init_pri <= INIT_ALL){
@@ -569,12 +562,18 @@ int line_config(struct line *lines, unsigned int num, char *str)
 {
 	char *new;
 
+	if(*str == '='){
+		printk("line_config - can't configure all devices from "
+		       "mconsole\n");
+		return 1;
+	}
+
 	new = kstrdup(str, GFP_KERNEL);
 	if(new == NULL){
 		printk("line_config - kstrdup failed\n");
 		return -ENOMEM;
 	}
-	return !line_setup(lines, num, new, 0);
+	return !line_setup(lines, num, new);
 }
 
 int line_get_config(char *name, struct line *lines, unsigned int num, char *str,
@@ -628,7 +627,7 @@ int line_remove(struct line *lines, unsigned int num, int n)
 	char config[sizeof("conxxxx=none\0")];
 
 	sprintf(config, "%d=none", n);
-	return !line_setup(lines, num, config, 0);
+	return !line_setup(lines, num, config);
 }
 
 struct tty_driver *line_register_devfs(struct lines *set,
diff --git a/arch/um/drivers/ssl.c b/arch/um/drivers/ssl.c
index 8564784..e1895d9 100644
--- a/arch/um/drivers/ssl.c
+++ b/arch/um/drivers/ssl.c
@@ -224,7 +224,7 @@ __uml_exitcall(ssl_exit);
 
 static int ssl_chan_setup(char *str)
 {
-	return line_setup(serial_lines, ARRAY_SIZE(serial_lines), str, 1);
+	return line_setup(serial_lines, ARRAY_SIZE(serial_lines), str);
 }
 
 __setup("ssl", ssl_chan_setup);
diff --git a/arch/um/drivers/stdio_console.c b/arch/um/drivers/stdio_console.c
index b77f7d2..72000d3 100644
--- a/arch/um/drivers/stdio_console.c
+++ b/arch/um/drivers/stdio_console.c
@@ -191,7 +191,7 @@ __uml_exitcall(console_exit);
 
 static int console_chan_setup(char *str)
 {
-	return line_setup(vts, ARRAY_SIZE(vts), str, 1);
+	return line_setup(vts, ARRAY_SIZE(vts), str);
 }
 __setup("con", console_chan_setup);
 __channel_help(console_chan_setup, "con");
diff --git a/arch/um/include/line.h b/arch/um/include/line.h
index 315788c..e22c9e0 100644
--- a/arch/um/include/line.h
+++ b/arch/um/include/line.h
@@ -76,8 +76,8 @@ struct lines {
 extern void line_close(struct tty_struct *tty, struct file * filp);
 extern int line_open(struct line *lines, struct tty_struct *tty,
 		     struct chan_opts *opts);
-extern int line_setup(struct line *lines, unsigned int sizeof_lines, char *init,
-		      int all_allowed);
+extern int line_setup(struct line *lines, unsigned int sizeof_lines,
+		      char *init);
 extern int line_write(struct tty_struct *tty, const unsigned char *buf,
 		      int len);
 extern void line_put_char(struct tty_struct *tty, unsigned char ch);
-- 
cgit v1.1


From 9010772cdff36072dd509ec72c1a55fccde8e58e Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:18:54 -0800
Subject: [PATCH] uml: Add static initializations and declarations

Some structure fields were being dynamically initialized when they could be
initialized at compile-time instead.  This also makes some declarations static
(in the C sense).

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/line.c          | 7 +++----
 arch/um/drivers/mconsole_kern.c | 6 +++---
 arch/um/drivers/net_kern.c      | 4 ++--
 arch/um/include/line.h          | 2 +-
 4 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 9af55ec..1c2cc5d 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -668,19 +668,18 @@ struct tty_driver *line_register_devfs(struct lines *set,
 	return driver;
 }
 
-static spinlock_t winch_handler_lock;
-LIST_HEAD(winch_handlers);
+static DEFINE_SPINLOCK(winch_handler_lock);
+static LIST_HEAD(winch_handlers);
 
 void lines_init(struct line *lines, int nlines)
 {
 	struct line *line;
 	int i;
 
-	spin_lock_init(&winch_handler_lock);
 	for(i = 0; i < nlines; i++){
 		line = &lines[i];
 		INIT_LIST_HEAD(&line->chan_list);
-		spin_lock_init(&line->lock);
+
 		if(line->init_str == NULL)
 			continue;
 
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 355866a..b5217bd 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -51,7 +51,7 @@ static struct notifier_block reboot_notifier = {
  * itself and it can only happen on CPU 0.
  */
 
-LIST_HEAD(mc_requests);
+static LIST_HEAD(mc_requests);
 
 static void mc_work_proc(void *unused)
 {
@@ -69,7 +69,7 @@ static void mc_work_proc(void *unused)
 	}
 }
 
-DECLARE_WORK(mconsole_work, mc_work_proc, NULL);
+static DECLARE_WORK(mconsole_work, mc_work_proc, NULL);
 
 static irqreturn_t mconsole_interrupt(int irq, void *dev_id,
 				      struct pt_regs *regs)
@@ -535,7 +535,7 @@ void mconsole_stack(struct mc_request *req)
  */
 static char *notify_socket = NULL;
 
-int mconsole_init(void)
+static int mconsole_init(void)
 {
 	/* long to avoid size mismatch warnings from gcc */
 	long sock;
diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
index 29785f6..deb2482 100644
--- a/arch/um/drivers/net_kern.c
+++ b/arch/um/drivers/net_kern.c
@@ -34,7 +34,7 @@
 #define DRIVER_NAME "uml-netdev"
 
 static DEFINE_SPINLOCK(opened_lock);
-LIST_HEAD(opened);
+static LIST_HEAD(opened);
 
 static int uml_net_rx(struct net_device *dev)
 {
@@ -266,7 +266,7 @@ void uml_net_user_timer_expire(unsigned long _conn)
 }
 
 static DEFINE_SPINLOCK(devices_lock);
-static struct list_head devices = LIST_HEAD_INIT(devices);
+static LIST_HEAD(devices);
 
 static struct platform_driver uml_net_driver = {
 	.driver = {
diff --git a/arch/um/include/line.h b/arch/um/include/line.h
index e22c9e0..351d3ac 100644
--- a/arch/um/include/line.h
+++ b/arch/um/include/line.h
@@ -58,8 +58,8 @@ struct line {
 #define LINE_INIT(str, d) \
 	{ init_str :	str, \
 	  init_pri :	INIT_STATIC, \
-	  chan_list : 	{ }, \
 	  valid :	1, \
+	  lock :	SPIN_LOCK_UNLOCKED, \
 	  buffer :	NULL, \
 	  head :	NULL, \
 	  tail :	NULL, \
-- 
cgit v1.1


From 418e55d49b0ec7d2e7a033f2dd083f5b2ab7d119 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:18:54 -0800
Subject: [PATCH] uml: line_setup interface change

line_setup is changed to return the device which it set up, rather than just
success or failure.  This will be important in the line-config patch.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/line.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 1c2cc5d..1352a21 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -555,12 +555,13 @@ int line_setup(struct line *lines, unsigned int num, char *init)
 			}
 		}
 	}
-	return 1;
+	return n == -1 ? num : n;
 }
 
 int line_config(struct line *lines, unsigned int num, char *str)
 {
 	char *new;
+	int n;
 
 	if(*str == '='){
 		printk("line_config - can't configure all devices from "
@@ -573,7 +574,8 @@ int line_config(struct line *lines, unsigned int num, char *str)
 		printk("line_config - kstrdup failed\n");
 		return -ENOMEM;
 	}
-	return !line_setup(lines, num, new);
+	n = line_setup(lines, num, new);
+	return n < 0 ? n : 0;
 }
 
 int line_get_config(char *name, struct line *lines, unsigned int num, char *str,
@@ -624,10 +626,14 @@ int line_id(char **str, int *start_out, int *end_out)
 
 int line_remove(struct line *lines, unsigned int num, int n)
 {
+	int err;
 	char config[sizeof("conxxxx=none\0")];
 
 	sprintf(config, "%d=none", n);
-	return !line_setup(lines, num, config);
+	err = line_setup(lines, num, config);
+	if(err >= 0)
+		err = 0;
+	return err;
 }
 
 struct tty_driver *line_register_devfs(struct lines *set,
-- 
cgit v1.1


From 1f80171e81ed0d08dcdb6efe239d7b929aef498f Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:18:55 -0800
Subject: [PATCH] uml: move console configuration

This patch changes when console devices are configured in order to prepare the
ground for the next patch.

parse_chan_pair is now done earlier, when initcalls are run, rather than when
the device is opened.

When a host device disappears, the channel list is closed, but not freed.
This is required by the previous change.  line_config now takes the options
structure as an argument, and line_open doesn't.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/chan_kern.c     | 14 +++++---------
 arch/um/drivers/line.c          | 34 +++++++++++++++++++++-------------
 arch/um/drivers/ssl.c           |  6 +++---
 arch/um/drivers/stdio_console.c |  6 +++---
 arch/um/include/line.h          |  7 +++----
 5 files changed, 35 insertions(+), 32 deletions(-)

diff --git a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c
index 59c9b3f..31b69c4 100644
--- a/arch/um/drivers/chan_kern.c
+++ b/arch/um/drivers/chan_kern.c
@@ -311,14 +311,12 @@ int console_write_chan(struct list_head *chans, const char *buf, int len)
 int console_open_chan(struct line *line, struct console *co,
 		      struct chan_opts *opts)
 {
-	if (!list_empty(&line->chan_list))
-		return 0;
+	int err;
+
+	err = open_chan(&line->chan_list);
+	if(err)
+		return err;
 
-	if (0 != parse_chan_pair(line->init_str, &line->chan_list,
-				 co->index, opts))
-		return -1;
-	if (0 != open_chan(&line->chan_list))
-		return -1;
 	printk("Console initialized on /dev/%s%d\n",co->name,co->index);
 	return 0;
 }
@@ -596,13 +594,11 @@ void chan_interrupt(struct list_head *chans, struct work_struct *task,
 					tty_hangup(tty);
 				line_disable(tty, irq);
 				close_chan(chans);
-				free_chan(chans);
 				return;
 			}
 			else {
 				if(chan->ops->close != NULL)
 					chan->ops->close(chan->fd, chan->data);
-				free_one_chan(chan);
 			}
 		}
 	}
diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 1352a21..da81d22 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -419,8 +419,7 @@ void line_disable(struct tty_struct *tty, int current_irq)
 	line->have_irq = 0;
 }
 
-int line_open(struct line *lines, struct tty_struct *tty,
-	      struct chan_opts *opts)
+int line_open(struct line *lines, struct tty_struct *tty)
 {
 	struct line *line;
 	int err = 0;
@@ -436,13 +435,11 @@ int line_open(struct line *lines, struct tty_struct *tty,
 			err = -ENODEV;
 			goto out;
 		}
-		if (list_empty(&line->chan_list)) {
-			err = parse_chan_pair(line->init_str, &line->chan_list,
-					      tty->index, opts);
-			if(err) goto out;
-			err = open_chan(&line->chan_list);
-			if(err) goto out;
-		}
+
+		err = open_chan(&line->chan_list);
+		if(err)
+			goto out;
+
 		/* Here the interrupt is registered.*/
 		enable_chan(&line->chan_list, tty);
 		INIT_WORK(&line->task, line_timer_cb, tty);
@@ -558,8 +555,10 @@ int line_setup(struct line *lines, unsigned int num, char *init)
 	return n == -1 ? num : n;
 }
 
-int line_config(struct line *lines, unsigned int num, char *str)
+int line_config(struct line *lines, unsigned int num, char *str,
+		struct chan_opts *opts)
 {
+	struct line *line;
 	char *new;
 	int n;
 
@@ -572,10 +571,14 @@ int line_config(struct line *lines, unsigned int num, char *str)
 	new = kstrdup(str, GFP_KERNEL);
 	if(new == NULL){
 		printk("line_config - kstrdup failed\n");
-		return -ENOMEM;
+		return 1;
 	}
 	n = line_setup(lines, num, new);
-	return n < 0 ? n : 0;
+	if(n < 0)
+		return 1;
+
+	line = &lines[n];
+	return parse_chan_pair(line->init_str, &line->chan_list, n, opts);
 }
 
 int line_get_config(char *name, struct line *lines, unsigned int num, char *str,
@@ -677,7 +680,7 @@ struct tty_driver *line_register_devfs(struct lines *set,
 static DEFINE_SPINLOCK(winch_handler_lock);
 static LIST_HEAD(winch_handlers);
 
-void lines_init(struct line *lines, int nlines)
+void lines_init(struct line *lines, int nlines, struct chan_opts *opts)
 {
 	struct line *line;
 	int i;
@@ -692,6 +695,11 @@ void lines_init(struct line *lines, int nlines)
 		line->init_str = kstrdup(line->init_str, GFP_KERNEL);
 		if(line->init_str == NULL)
 			printk("lines_init - kstrdup returned NULL\n");
+
+		if(parse_chan_pair(line->init_str, &line->chan_list, i, opts)){
+			printk("parse_chan_pair failed for device %d\n", i);
+			line->valid = 0;
+		}
 	}
 }
 
diff --git a/arch/um/drivers/ssl.c b/arch/um/drivers/ssl.c
index e1895d9..6823dc5 100644
--- a/arch/um/drivers/ssl.c
+++ b/arch/um/drivers/ssl.c
@@ -84,7 +84,7 @@ static struct lines lines = LINES_INIT(NR_PORTS);
 
 static int ssl_config(char *str)
 {
-	return line_config(serial_lines, ARRAY_SIZE(serial_lines), str);
+	return line_config(serial_lines, ARRAY_SIZE(serial_lines), str, &opts);
 }
 
 static int ssl_get_config(char *dev, char *str, int size, char **error_out)
@@ -100,7 +100,7 @@ static int ssl_remove(int n)
 
 int ssl_open(struct tty_struct *tty, struct file *filp)
 {
-	return line_open(serial_lines, tty, &opts);
+	return line_open(serial_lines, tty);
 }
 
 #if 0
@@ -202,7 +202,7 @@ int ssl_init(void)
 					 serial_lines,
 					 ARRAY_SIZE(serial_lines));
 
-	lines_init(serial_lines, ARRAY_SIZE(serial_lines));
+	lines_init(serial_lines, ARRAY_SIZE(serial_lines), &opts);
 
 	new_title = add_xterm_umid(opts.xterm_title);
 	if (new_title != NULL)
diff --git a/arch/um/drivers/stdio_console.c b/arch/um/drivers/stdio_console.c
index 72000d3..6d4edda 100644
--- a/arch/um/drivers/stdio_console.c
+++ b/arch/um/drivers/stdio_console.c
@@ -91,7 +91,7 @@ struct line vts[MAX_TTYS] = { LINE_INIT(CONFIG_CON_ZERO_CHAN, &driver),
 
 static int con_config(char *str)
 {
-	return line_config(vts, ARRAY_SIZE(vts), str);
+	return line_config(vts, ARRAY_SIZE(vts), str, &opts);
 }
 
 static int con_get_config(char *dev, char *str, int size, char **error_out)
@@ -106,7 +106,7 @@ static int con_remove(int n)
 
 static int con_open(struct tty_struct *tty, struct file *filp)
 {
-	return line_open(vts, tty, &opts);
+	return line_open(vts, tty);
 }
 
 static int con_init_done = 0;
@@ -169,7 +169,7 @@ int stdio_init(void)
 		return -1;
 	printk(KERN_INFO "Initialized stdio console driver\n");
 
-	lines_init(vts, ARRAY_SIZE(vts));
+	lines_init(vts, ARRAY_SIZE(vts), &opts);
 
 	new_title = add_xterm_umid(opts.xterm_title);
 	if(new_title != NULL)
diff --git a/arch/um/include/line.h b/arch/um/include/line.h
index 351d3ac..474398b 100644
--- a/arch/um/include/line.h
+++ b/arch/um/include/line.h
@@ -74,8 +74,7 @@ struct lines {
 #define LINES_INIT(n) {  num :		n }
 
 extern void line_close(struct tty_struct *tty, struct file * filp);
-extern int line_open(struct line *lines, struct tty_struct *tty,
-		     struct chan_opts *opts);
+extern int line_open(struct line *lines, struct tty_struct *tty);
 extern int line_setup(struct line *lines, unsigned int sizeof_lines,
 		      char *init);
 extern int line_write(struct tty_struct *tty, const unsigned char *buf,
@@ -99,11 +98,11 @@ extern struct tty_driver * line_register_devfs(struct lines *set,
 				struct tty_operations *driver,
 				struct line *lines,
 				int nlines);
-extern void lines_init(struct line *lines, int nlines);
+extern void lines_init(struct line *lines, int nlines, struct chan_opts *opts);
 extern void close_lines(struct line *lines, int nlines);
 
 extern int line_config(struct line *lines, unsigned int sizeof_lines,
-		       char *str);
+		       char *str, struct chan_opts *opts);
 extern int line_id(char **str, int *start_out, int *end_out);
 extern int line_remove(struct line *lines, unsigned int sizeof_lines, int n);
 extern int line_get_config(char *dev, struct line *lines,
-- 
cgit v1.1


From 165dc5911627a9c4752e909a0da661b96b6fd269 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:18:57 -0800
Subject: [PATCH] uml: Simplify console opening/closing and irq registration

This patch simplifies the opening and closing of host console devices and the
registration and deregistration of IRQs.  The intent is to make it obvious
that an IRQ can't exist without an open file descriptor.

enable_chan will now open the channel, and when both opening and IRQ
registration are desired, this should be used.  Opening only is done for the
initial console, so that interface still needs to exist.

The free_irqs_later interface is now gone.  It was intended to avoid freeing
an IRQ while it was being processed.  It did this, but it didn't eliminate the
possibility of free_irq being called from an interrupt, which is bad.  In its
place is a list of irqs to be freed, which is processed by the signal handler
just before exiting.  close_one_chan now disables irqs.

When a host device disappears, it is just closed, and that disables IRQs.

The device id registered with the IRQ is now the chan structure, not the tty.
This is because the interrupt arrives on a descriptor associated with the
channel.  This caused equivalent changes in the arguments to line_timer_cb.
line_disable is gone since it is not used any more.

The count field in the line structure is gone.  tty->count is used instead.

The complicated logic in sigio_handler with freeing IRQs when necessary and
making sure its idea of the next irq is correct is now much simpler.  The irq
list can't be rearranged underneath it, so it is now a simple list walk.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/chan_kern.c | 111 +++++++++++++++++++++++++++---------------
 arch/um/drivers/line.c      | 116 ++++++++++++++++++--------------------------
 arch/um/include/chan_kern.h |   9 ++--
 arch/um/include/irq_user.h  |  13 +----
 arch/um/include/line.h      |   6 +--
 arch/um/kernel/irq_user.c   |  48 +++---------------
 6 files changed, 137 insertions(+), 166 deletions(-)

diff --git a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c
index 31b69c4..1bb920c 100644
--- a/arch/um/drivers/chan_kern.c
+++ b/arch/um/drivers/chan_kern.c
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
  * Licensed under the GPL
  */
@@ -240,20 +240,65 @@ void chan_enable_winch(struct list_head *chans, struct tty_struct *tty)
 	}
 }
 
-void enable_chan(struct list_head *chans, struct tty_struct *tty)
+void enable_chan(struct line *line)
 {
 	struct list_head *ele;
 	struct chan *chan;
 
-	list_for_each(ele, chans){
+	list_for_each(ele, &line->chan_list){
 		chan = list_entry(ele, struct chan, list);
-		if(!chan->opened) continue;
+		if(open_one_chan(chan))
+			continue;
+
+		if(chan->enabled)
+			continue;
+		line_setup_irq(chan->fd, chan->input, chan->output, line,
+			       chan);
+		chan->enabled = 1;
+	}
+}
+
+static LIST_HEAD(irqs_to_free);
+
+void free_irqs(void)
+{
+	struct chan *chan;
+
+	while(!list_empty(&irqs_to_free)){
+		chan = list_entry(irqs_to_free.next, struct chan, free_list);
+		list_del(&chan->free_list);
+
+		if(chan->input)
+			free_irq(chan->line->driver->read_irq, chan);
+		if(chan->output)
+			free_irq(chan->line->driver->write_irq, chan);
+		chan->enabled = 0;
+	}
+}
+
+static void close_one_chan(struct chan *chan, int delay_free_irq)
+{
+	if(!chan->opened)
+		return;
 
-		line_setup_irq(chan->fd, chan->input, chan->output, tty);
+	if(delay_free_irq){
+		list_add(&chan->free_list, &irqs_to_free);
+	}
+	else {
+		if(chan->input)
+			free_irq(chan->line->driver->read_irq, chan);
+		if(chan->output)
+			free_irq(chan->line->driver->write_irq, chan);
+		chan->enabled = 0;
 	}
+	if(chan->ops->close != NULL)
+		(*chan->ops->close)(chan->fd, chan->data);
+
+	chan->opened = 0;
+	chan->fd = -1;
 }
 
-void close_chan(struct list_head *chans)
+void close_chan(struct list_head *chans, int delay_free_irq)
 {
 	struct chan *chan;
 
@@ -263,11 +308,7 @@ void close_chan(struct list_head *chans)
 	 * so it must be the last closed.
 	 */
 	list_for_each_entry_reverse(chan, chans, list) {
-		if(!chan->opened) continue;
-		if(chan->ops->close != NULL)
-			(*chan->ops->close)(chan->fd, chan->data);
-		chan->opened = 0;
-		chan->fd = -1;
+		close_one_chan(chan, delay_free_irq);
 	}
 }
 
@@ -339,24 +380,27 @@ int chan_window_size(struct list_head *chans, unsigned short *rows_out,
 	return 0;
 }
 
-void free_one_chan(struct chan *chan)
+void free_one_chan(struct chan *chan, int delay_free_irq)
 {
 	list_del(&chan->list);
+
+	close_one_chan(chan, delay_free_irq);
+
 	if(chan->ops->free != NULL)
 		(*chan->ops->free)(chan->data);
-	free_irq_by_fd(chan->fd);
+
 	if(chan->primary && chan->output) ignore_sigio_fd(chan->fd);
 	kfree(chan);
 }
 
-void free_chan(struct list_head *chans)
+void free_chan(struct list_head *chans, int delay_free_irq)
 {
 	struct list_head *ele, *next;
 	struct chan *chan;
 
 	list_for_each_safe(ele, next, chans){
 		chan = list_entry(ele, struct chan, list);
-		free_one_chan(chan);
+		free_one_chan(chan, delay_free_irq);
 	}
 }
 
@@ -466,7 +510,8 @@ struct chan_type chan_table[] = {
 #endif
 };
 
-static struct chan *parse_chan(char *str, int device, struct chan_opts *opts)
+static struct chan *parse_chan(struct line *line, char *str, int device,
+			       struct chan_opts *opts)
 {
 	struct chan_type *entry;
 	struct chan_ops *ops;
@@ -499,25 +544,30 @@ static struct chan *parse_chan(char *str, int device, struct chan_opts *opts)
 	if(chan == NULL)
 		return NULL;
 	*chan = ((struct chan) { .list	 	= LIST_HEAD_INIT(chan->list),
+				 .free_list 	=
+				 	LIST_HEAD_INIT(chan->free_list),
+				 .line		= line,
 				 .primary	= 1,
 				 .input		= 0,
 				 .output 	= 0,
 				 .opened  	= 0,
+				 .enabled  	= 0,
 				 .fd 		= -1,
 				 .ops 		= ops,
 				 .data 		= data });
 	return chan;
 }
 
-int parse_chan_pair(char *str, struct list_head *chans, int device,
+int parse_chan_pair(char *str, struct line *line, int device,
 		    struct chan_opts *opts)
 {
+	struct list_head *chans = &line->chan_list;
 	struct chan *new, *chan;
 	char *in, *out;
 
 	if(!list_empty(chans)){
 		chan = list_entry(chans->next, struct chan, list);
-		free_chan(chans);
+		free_chan(chans, 0);
 		INIT_LIST_HEAD(chans);
 	}
 
@@ -526,14 +576,14 @@ int parse_chan_pair(char *str, struct list_head *chans, int device,
 		in = str;
 		*out = '\0';
 		out++;
-		new = parse_chan(in, device, opts);
+		new = parse_chan(line, in, device, opts);
 		if(new == NULL)
 			return -1;
 
 		new->input = 1;
 		list_add(&new->list, chans);
 
-		new = parse_chan(out, device, opts);
+		new = parse_chan(line, out, device, opts);
 		if(new == NULL)
 			return -1;
 
@@ -541,7 +591,7 @@ int parse_chan_pair(char *str, struct list_head *chans, int device,
 		new->output = 1;
 	}
 	else {
-		new = parse_chan(str, device, opts);
+		new = parse_chan(line, str, device, opts);
 		if(new == NULL)
 			return -1;
 
@@ -592,27 +642,12 @@ void chan_interrupt(struct list_head *chans, struct work_struct *task,
 			if(chan->primary){
 				if(tty != NULL)
 					tty_hangup(tty);
-				line_disable(tty, irq);
-				close_chan(chans);
+				close_chan(chans, 1);
 				return;
 			}
-			else {
-				if(chan->ops->close != NULL)
-					chan->ops->close(chan->fd, chan->data);
-			}
+			else close_one_chan(chan, 1);
 		}
 	}
  out:
 	if(tty) tty_flip_buffer_push(tty);
 }
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index da81d22..851a7c8 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
  * Licensed under the GPL
  */
@@ -23,8 +23,9 @@
 
 static irqreturn_t line_interrupt(int irq, void *data, struct pt_regs *unused)
 {
-	struct tty_struct *tty = data;
-	struct line *line = tty->driver_data;
+	struct chan *chan = data;
+	struct line *line = chan->line;
+	struct tty_struct *tty = line->tty;
 
 	if (line)
 		chan_interrupt(&line->chan_list, &line->task, tty, irq);
@@ -33,10 +34,10 @@ static irqreturn_t line_interrupt(int irq, void *data, struct pt_regs *unused)
 
 static void line_timer_cb(void *arg)
 {
-	struct tty_struct *tty = arg;
-	struct line *line = tty->driver_data;
+	struct line *line = arg;
 
-	line_interrupt(line->driver->read_irq, arg, NULL);
+	chan_interrupt(&line->chan_list, &line->task, line->tty,
+		       line->driver->read_irq);
 }
 
 /* Returns the free space inside the ring buffer of this line.
@@ -342,8 +343,9 @@ int line_ioctl(struct tty_struct *tty, struct file * file,
 static irqreturn_t line_write_interrupt(int irq, void *data,
 					struct pt_regs *unused)
 {
-	struct tty_struct *tty = data;
-	struct line *line = tty->driver_data;
+	struct chan *chan = data;
+	struct line *line = chan->line;
+	struct tty_struct *tty = line->tty;
 	int err;
 
 	/* Interrupts are enabled here because we registered the interrupt with
@@ -365,7 +367,7 @@ static irqreturn_t line_write_interrupt(int irq, void *data,
 	if (test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags) &&
 	   (tty->ldisc.write_wakeup != NULL))
 		(tty->ldisc.write_wakeup)(tty);
-	
+
 	/* BLOCKING mode
 	 * In blocking mode, everything sleeps on tty->write_wait.
 	 * Sleeping in the console driver would break non-blocking
@@ -377,52 +379,29 @@ static irqreturn_t line_write_interrupt(int irq, void *data,
 	return IRQ_HANDLED;
 }
 
-int line_setup_irq(int fd, int input, int output, struct tty_struct *tty)
+int line_setup_irq(int fd, int input, int output, struct line *line, void *data)
 {
-	struct line *line = tty->driver_data;
 	struct line_driver *driver = line->driver;
 	int err = 0, flags = SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM;
 
 	if (input)
 		err = um_request_irq(driver->read_irq, fd, IRQ_READ,
 				       line_interrupt, flags,
-				       driver->read_irq_name, tty);
+				       driver->read_irq_name, data);
 	if (err)
 		return err;
 	if (output)
 		err = um_request_irq(driver->write_irq, fd, IRQ_WRITE,
 					line_write_interrupt, flags,
-					driver->write_irq_name, tty);
+					driver->write_irq_name, data);
 	line->have_irq = 1;
 	return err;
 }
 
-void line_disable(struct tty_struct *tty, int current_irq)
-{
-	struct line *line = tty->driver_data;
-
-	if(!line->have_irq)
-		return;
-
-	if(line->driver->read_irq == current_irq)
-		free_irq_later(line->driver->read_irq, tty);
-	else {
-		free_irq(line->driver->read_irq, tty);
-	}
-
-	if(line->driver->write_irq == current_irq)
-		free_irq_later(line->driver->write_irq, tty);
-	else {
-		free_irq(line->driver->write_irq, tty);
-	}
-
-	line->have_irq = 0;
-}
-
 int line_open(struct line *lines, struct tty_struct *tty)
 {
 	struct line *line;
-	int err = 0;
+	int err = -ENODEV;
 
 	line = &lines[tty->index];
 	tty->driver_data = line;
@@ -430,29 +409,29 @@ int line_open(struct line *lines, struct tty_struct *tty)
 	/* The IRQ which takes this lock is not yet enabled and won't be run
 	 * before the end, so we don't need to use spin_lock_irq.*/
 	spin_lock(&line->lock);
-	if (tty->count == 1) {
-		if (!line->valid) {
-			err = -ENODEV;
-			goto out;
-		}
 
-		err = open_chan(&line->chan_list);
-		if(err)
-			goto out;
-
-		/* Here the interrupt is registered.*/
-		enable_chan(&line->chan_list, tty);
-		INIT_WORK(&line->task, line_timer_cb, tty);
-	}
+	tty->driver_data = line;
+	line->tty = tty;
+	if(!line->valid)
+		goto out;
+
+	if(tty->count == 1){
+		/* Here the device is opened, if necessary, and interrupt
+		 * is registered.
+		 */
+		enable_chan(line);
+		INIT_WORK(&line->task, line_timer_cb, line);
+
+		if(!line->sigio){
+			chan_enable_winch(&line->chan_list, tty);
+			line->sigio = 1;
+		}
 
-	if(!line->sigio){
-		chan_enable_winch(&line->chan_list, tty);
-		line->sigio = 1;
+		chan_window_size(&line->chan_list, &tty->winsize.ws_row,
+				 &tty->winsize.ws_col);
 	}
-	chan_window_size(&line->chan_list, &tty->winsize.ws_row,
-			 &tty->winsize.ws_col);
-	line->count++;
 
+	err = 0;
 out:
 	spin_unlock(&line->lock);
 	return err;
@@ -472,15 +451,14 @@ void line_close(struct tty_struct *tty, struct file * filp)
 	/* We ignore the error anyway! */
 	flush_buffer(line);
 
-	line->count--;
-	if (tty->count == 1) {
-		line_disable(tty, -1);
+	if(tty->count == 1){
+		line->tty = NULL;
 		tty->driver_data = NULL;
-	}
 
-        if((line->count == 0) && line->sigio){
-                unregister_winch(tty);
-                line->sigio = 0;
+		if(line->sigio){
+			unregister_winch(tty);
+			line->sigio = 0;
+		}
         }
 
 	spin_unlock_irq(&line->lock);
@@ -491,7 +469,7 @@ void close_lines(struct line *lines, int nlines)
 	int i;
 
 	for(i = 0; i < nlines; i++)
-		close_chan(&lines[i].chan_list);
+		close_chan(&lines[i].chan_list, 0);
 }
 
 /* Common setup code for both startup command line and mconsole initialization.
@@ -526,7 +504,7 @@ int line_setup(struct line *lines, unsigned int num, char *init)
 		return 0;
 	}
 	else if (n >= 0){
-		if (lines[n].count > 0) {
+		if (lines[n].tty != NULL) {
 			printk("line_setup - device %d is open\n", n);
 			return 0;
 		}
@@ -537,7 +515,7 @@ int line_setup(struct line *lines, unsigned int num, char *init)
 			else {
 				lines[n].init_str = init;
 				lines[n].valid = 1;
-			}	
+			}
 		}
 	}
 	else {
@@ -578,7 +556,7 @@ int line_config(struct line *lines, unsigned int num, char *str,
 		return 1;
 
 	line = &lines[n];
-	return parse_chan_pair(line->init_str, &line->chan_list, n, opts);
+	return parse_chan_pair(line->init_str, line, n, opts);
 }
 
 int line_get_config(char *name, struct line *lines, unsigned int num, char *str,
@@ -604,7 +582,7 @@ int line_get_config(char *name, struct line *lines, unsigned int num, char *str,
 	spin_lock(&line->lock);
 	if(!line->valid)
 		CONFIG_CHUNK(str, size, n, "none", 1);
-	else if(line->count == 0)
+	else if(line->tty == NULL)
 		CONFIG_CHUNK(str, size, n, line->init_str, 1);
 	else n = chan_config_string(&line->chan_list, str, size, error_out);
 	spin_unlock(&line->lock);
@@ -696,7 +674,7 @@ void lines_init(struct line *lines, int nlines, struct chan_opts *opts)
 		if(line->init_str == NULL)
 			printk("lines_init - kstrdup returned NULL\n");
 
-		if(parse_chan_pair(line->init_str, &line->chan_list, i, opts)){
+		if(parse_chan_pair(line->init_str, line, i, opts)){
 			printk("parse_chan_pair failed for device %d\n", i);
 			line->valid = 0;
 		}
@@ -831,7 +809,7 @@ char *add_xterm_umid(char *base)
 	umid = get_umid(1);
 	if(umid == NULL)
 		return base;
-	
+
 	len = strlen(base) + strlen(" ()") + strlen(umid) + 1;
 	title = kmalloc(len, GFP_KERNEL);
 	if(title == NULL){
diff --git a/arch/um/include/chan_kern.h b/arch/um/include/chan_kern.h
index 22bf3a7..84d1f64 100644
--- a/arch/um/include/chan_kern.h
+++ b/arch/um/include/chan_kern.h
@@ -14,11 +14,14 @@
 
 struct chan {
 	struct list_head list;
+	struct list_head free_list;
+	struct line *line;
 	char *dev;
 	unsigned int primary:1;
 	unsigned int input:1;
 	unsigned int output:1;
 	unsigned int opened:1;
+	unsigned int enabled:1;
 	int fd;
 	struct chan_ops *ops;
 	void *data;
@@ -26,7 +29,7 @@ struct chan {
 
 extern void chan_interrupt(struct list_head *chans, struct work_struct *task,
 			   struct tty_struct *tty, int irq);
-extern int parse_chan_pair(char *str, struct list_head *chans, int device,
+extern int parse_chan_pair(char *str, struct line *line, int device,
 			   struct chan_opts *opts);
 extern int open_chan(struct list_head *chans);
 extern int write_chan(struct list_head *chans, const char *buf, int len,
@@ -35,9 +38,9 @@ extern int console_write_chan(struct list_head *chans, const char *buf,
 			      int len);
 extern int console_open_chan(struct line *line, struct console *co,
 			     struct chan_opts *opts);
-extern void close_chan(struct list_head *chans);
 extern void chan_enable_winch(struct list_head *chans, struct tty_struct *tty);
-extern void enable_chan(struct list_head *chans, struct tty_struct *tty);
+extern void enable_chan(struct line *line);
+extern void close_chan(struct list_head *chans, int delay_free_irq);
 extern int chan_window_size(struct list_head *chans, 
 			     unsigned short *rows_out, 
 			     unsigned short *cols_out);
diff --git a/arch/um/include/irq_user.h b/arch/um/include/irq_user.h
index f724b71..b61deb8 100644
--- a/arch/um/include/irq_user.h
+++ b/arch/um/include/irq_user.h
@@ -18,19 +18,8 @@ extern int deactivate_all_fds(void);
 extern void forward_interrupts(int pid);
 extern void init_irq_signals(int on_sigstack);
 extern void forward_ipi(int fd, int pid);
-extern void free_irq_later(int irq, void *dev_id);
 extern int activate_ipi(int fd, int pid);
 extern unsigned long irq_lock(void);
 extern void irq_unlock(unsigned long flags);
-#endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
+#endif
diff --git a/arch/um/include/line.h b/arch/um/include/line.h
index 474398b..e6cc3ab 100644
--- a/arch/um/include/line.h
+++ b/arch/um/include/line.h
@@ -32,6 +32,7 @@ struct line_driver {
 };
 
 struct line {
+	struct tty_struct *tty;
 	char *init_str;
 	int init_pri;
 	struct list_head chan_list;
@@ -89,10 +90,9 @@ extern int line_ioctl(struct tty_struct *tty, struct file * file,
 		      unsigned int cmd, unsigned long arg);
 
 extern char *add_xterm_umid(char *base);
-extern int line_setup_irq(int fd, int input, int output,
-			  struct tty_struct *tty);
+extern int line_setup_irq(int fd, int input, int output, struct line *line,
+			  void *data);
 extern void line_close_chan(struct line *line);
-extern void line_disable(struct tty_struct *tty, int current_irq);
 extern struct tty_driver * line_register_devfs(struct lines *set,
 				struct line_driver *line_driver,
 				struct tty_operations *driver,
diff --git a/arch/um/kernel/irq_user.c b/arch/um/kernel/irq_user.c
index c3ccaf2..50a2aa3 100644
--- a/arch/um/kernel/irq_user.c
+++ b/arch/um/kernel/irq_user.c
@@ -29,7 +29,6 @@ struct irq_fd {
 	int pid;
 	int events;
 	int current_events;
-	int freed;
 };
 
 static struct irq_fd *active_fds = NULL;
@@ -41,9 +40,11 @@ static int pollfds_size = 0;
 
 extern int io_count, intr_count;
 
+extern void free_irqs(void);
+
 void sigio_handler(int sig, union uml_pt_regs *regs)
 {
-	struct irq_fd *irq_fd, *next;
+	struct irq_fd *irq_fd;
 	int i, n;
 
 	if(smp_sigio_handler()) return;
@@ -66,29 +67,15 @@ void sigio_handler(int sig, union uml_pt_regs *regs)
 			irq_fd = irq_fd->next;
 		}
 
-		for(irq_fd = active_fds; irq_fd != NULL; irq_fd = next){
-			next = irq_fd->next;
+		for(irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next){
 			if(irq_fd->current_events != 0){
 				irq_fd->current_events = 0;
 				do_IRQ(irq_fd->irq, regs);
-
-				/* This is here because the next irq may be
-				 * freed in the handler.  If a console goes
-				 * away, both the read and write irqs will be
-				 * freed.  After do_IRQ, ->next will point to
-				 * a good IRQ.
-				 * Irqs can't be freed inside their handlers,
-				 * so the next best thing is to have them
-				 * marked as needing freeing, so that they
-				 * can be freed here.
-				 */
-				next = irq_fd->next;
-				if(irq_fd->freed){
-					free_irq(irq_fd->irq, irq_fd->id);
-				}
 			}
 		}
 	}
+
+	free_irqs();
 }
 
 int activate_ipi(int fd, int pid)
@@ -136,8 +123,7 @@ int activate_fd(int irq, int fd, int type, void *dev_id)
 				     .irq 		= irq,
 				     .pid  		= pid,
 				     .events 		= events,
-				     .current_events 	= 0,
-				     .freed 		= 0  } );
+				     .current_events 	= 0 } );
 
 	/* Critical section - locked by a spinlock because this stuff can
 	 * be changed from interrupt handlers.  The stuff above is done 
@@ -313,26 +299,6 @@ static struct irq_fd *find_irq_by_fd(int fd, int irqnum, int *index_out)
 	return(irq);
 }
 
-void free_irq_later(int irq, void *dev_id)
-{
-	struct irq_fd *irq_fd;
-	unsigned long flags;
-
-	flags = irq_lock();
-	for(irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next){
-		if((irq_fd->irq == irq) && (irq_fd->id == dev_id))
-			break;
-	}
-	if(irq_fd == NULL){
-		printk("free_irq_later found no irq, irq = %d, "
-		       "dev_id = 0x%p\n", irq, dev_id);
-		goto out;
-	}
-	irq_fd->freed = 1;
- out:
-	irq_unlock(flags);
-}
-
 void reactivate_fd(int fd, int irqnum)
 {
 	struct irq_fd *irq;
-- 
cgit v1.1


From 9159c9dfffe1746d58b015ceaa3b7b8e99ee9d5c Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:18:58 -0800
Subject: [PATCH] uml: Fix flip_buf full handling

When the tty flip_buf is full, it's a good idea to delay the input processing
for a jiffy, rather than just scheduling the tasklet immediately.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/chan_kern.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c
index 1bb920c..36df55a 100644
--- a/arch/um/drivers/chan_kern.c
+++ b/arch/um/drivers/chan_kern.c
@@ -629,7 +629,7 @@ void chan_interrupt(struct list_head *chans, struct work_struct *task,
 		do {
 			if((tty != NULL) &&
 			   (tty->flip.count >= TTY_FLIPBUF_SIZE)){
-				schedule_work(task);
+				schedule_delayed_work(task, 1);
 				goto out;
 			}
 			err = chan->ops->read(chan->fd, &c, chan->data);
-- 
cgit v1.1


From e4dcee8099802c71437a15b940f66106d9f88b2f Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:18:58 -0800
Subject: [PATCH] uml: Add throttling to console driver

This patch adds support for throttling and unthrottling input when the tty
driver can't handle it.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/chan_kern.c     | 26 ++++++++++++++++++++++++++
 arch/um/drivers/line.c          | 29 +++++++++++++++++++++++++++--
 arch/um/drivers/ssl.c           | 14 ++------------
 arch/um/drivers/stdio_console.c |  2 ++
 arch/um/include/chan_kern.h     |  2 ++
 arch/um/include/line.h          |  4 ++++
 6 files changed, 63 insertions(+), 14 deletions(-)

diff --git a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c
index 36df55a..cd13b91 100644
--- a/arch/um/drivers/chan_kern.c
+++ b/arch/um/drivers/chan_kern.c
@@ -312,6 +312,32 @@ void close_chan(struct list_head *chans, int delay_free_irq)
 	}
 }
 
+void deactivate_chan(struct list_head *chans, int irq)
+{
+	struct list_head *ele;
+
+	struct chan *chan;
+	list_for_each(ele, chans) {
+		chan = list_entry(ele, struct chan, list);
+
+		if(chan->enabled && chan->input)
+			deactivate_fd(chan->fd, irq);
+	}
+}
+
+void reactivate_chan(struct list_head *chans, int irq)
+{
+	struct list_head *ele;
+	struct chan *chan;
+
+	list_for_each(ele, chans) {
+		chan = list_entry(ele, struct chan, list);
+
+		if(chan->enabled && chan->input)
+			reactivate_fd(chan->fd, irq);
+	}
+}
+
 int write_chan(struct list_head *chans, const char *buf, int len,
 	       int write_irq)
 {
diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 851a7c8..b8e3e80 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -36,8 +36,9 @@ static void line_timer_cb(void *arg)
 {
 	struct line *line = arg;
 
-	chan_interrupt(&line->chan_list, &line->task, line->tty,
-		       line->driver->read_irq);
+	if(!line->throttled)
+		chan_interrupt(&line->chan_list, &line->task, line->tty,
+			       line->driver->read_irq);
 }
 
 /* Returns the free space inside the ring buffer of this line.
@@ -340,6 +341,30 @@ int line_ioctl(struct tty_struct *tty, struct file * file,
 	return ret;
 }
 
+void line_throttle(struct tty_struct *tty)
+{
+	struct line *line = tty->driver_data;
+
+	deactivate_chan(&line->chan_list, line->driver->read_irq);
+	line->throttled = 1;
+}
+
+void line_unthrottle(struct tty_struct *tty)
+{
+	struct line *line = tty->driver_data;
+
+	line->throttled = 0;
+	chan_interrupt(&line->chan_list, &line->task, tty,
+		       line->driver->read_irq);
+
+	/* Maybe there is enough stuff pending that calling the interrupt
+	 * throttles us again.  In this case, line->throttled will be 1
+	 * again and we shouldn't turn the interrupt back on.
+	 */
+	if(!line->throttled)
+		reactivate_chan(&line->chan_list, line->driver->read_irq);
+}
+
 static irqreturn_t line_write_interrupt(int irq, void *data,
 					struct pt_regs *unused)
 {
diff --git a/arch/um/drivers/ssl.c b/arch/um/drivers/ssl.c
index 6823dc5..a32ef55 100644
--- a/arch/um/drivers/ssl.c
+++ b/arch/um/drivers/ssl.c
@@ -109,16 +109,6 @@ static void ssl_flush_buffer(struct tty_struct *tty)
 	return;
 }
 
-static void ssl_throttle(struct tty_struct * tty)
-{
-	printk(KERN_ERR "Someone should implement ssl_throttle\n");
-}
-
-static void ssl_unthrottle(struct tty_struct * tty)
-{
-	printk(KERN_ERR "Someone should implement ssl_unthrottle\n");
-}
-
 static void ssl_stop(struct tty_struct *tty)
 {
 	printk(KERN_ERR "Someone should implement ssl_stop\n");
@@ -145,9 +135,9 @@ static struct tty_operations ssl_ops = {
 	.flush_chars 		= line_flush_chars,
 	.set_termios 		= line_set_termios,
 	.ioctl 	 		= line_ioctl,
+	.throttle 		= line_throttle,
+	.unthrottle 		= line_unthrottle,
 #if 0
-	.throttle 		= ssl_throttle,
-	.unthrottle 		= ssl_unthrottle,
 	.stop 	 		= ssl_stop,
 	.start 	 		= ssl_start,
 	.hangup 	 	= ssl_hangup,
diff --git a/arch/um/drivers/stdio_console.c b/arch/um/drivers/stdio_console.c
index 6d4edda..61db8b2 100644
--- a/arch/um/drivers/stdio_console.c
+++ b/arch/um/drivers/stdio_console.c
@@ -122,6 +122,8 @@ static struct tty_operations console_ops = {
 	.flush_chars 		= line_flush_chars,
 	.set_termios 		= line_set_termios,
 	.ioctl 	 		= line_ioctl,
+	.throttle 		= line_throttle,
+	.unthrottle 		= line_unthrottle,
 };
 
 static void uml_console_write(struct console *console, const char *string,
diff --git a/arch/um/include/chan_kern.h b/arch/um/include/chan_kern.h
index 84d1f64..1bb5e9d 100644
--- a/arch/um/include/chan_kern.h
+++ b/arch/um/include/chan_kern.h
@@ -38,6 +38,8 @@ extern int console_write_chan(struct list_head *chans, const char *buf,
 			      int len);
 extern int console_open_chan(struct line *line, struct console *co,
 			     struct chan_opts *opts);
+extern void deactivate_chan(struct list_head *chans, int irq);
+extern void reactivate_chan(struct list_head *chans, int irq);
 extern void chan_enable_winch(struct list_head *chans, struct tty_struct *tty);
 extern void enable_chan(struct line *line);
 extern void close_chan(struct list_head *chans, int delay_free_irq);
diff --git a/arch/um/include/line.h b/arch/um/include/line.h
index e6cc3ab..6f4d680 100644
--- a/arch/um/include/line.h
+++ b/arch/um/include/line.h
@@ -38,6 +38,7 @@ struct line {
 	struct list_head chan_list;
 	int valid;
 	int count;
+	int throttled;
 	/*This lock is actually, mostly, local to*/
 	spinlock_t lock;
 
@@ -60,6 +61,7 @@ struct line {
 	{ init_str :	str, \
 	  init_pri :	INIT_STATIC, \
 	  valid :	1, \
+	  throttled :	0, \
 	  lock :	SPIN_LOCK_UNLOCKED, \
 	  buffer :	NULL, \
 	  head :	NULL, \
@@ -88,6 +90,8 @@ extern void line_flush_chars(struct tty_struct *tty);
 extern int line_write_room(struct tty_struct *tty);
 extern int line_ioctl(struct tty_struct *tty, struct file * file,
 		      unsigned int cmd, unsigned long arg);
+extern void line_throttle(struct tty_struct *tty);
+extern void line_unthrottle(struct tty_struct *tty);
 
 extern char *add_xterm_umid(char *base);
 extern int line_setup_irq(int fd, int input, int output, struct line *line,
-- 
cgit v1.1


From 2264c475e4bf7427e59921953c89a5693ecb506f Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:18:59 -0800
Subject: [PATCH] uml: separate libc-dependent umid code

I reworked Gennady's umid OS abstraction patch because the code shouldn't
be moved entirely to os.  As it turns out, I moved most of it anyway.  This
patch is the minimal one needed to move the code and have it work.
It turns out that the concept of the umid is OS-independent, but
almost everything else about the implementation is OS-dependent.

This is code movement without cleanup - a follow-on patch tidies
everything up without shuffling code around.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/include/kern.h        |  13 +-
 arch/um/include/os.h          |  17 +--
 arch/um/kernel/Makefile       |   6 +-
 arch/um/kernel/process_kern.c |   4 -
 arch/um/kernel/umid.c         | 327 +++---------------------------------------
 arch/um/os-Linux/Makefile     |   4 +-
 arch/um/os-Linux/umid.c       | 292 +++++++++++++++++++++++++++++++++++++
 7 files changed, 324 insertions(+), 339 deletions(-)
 create mode 100644 arch/um/os-Linux/umid.c

diff --git a/arch/um/include/kern.h b/arch/um/include/kern.h
index 1e31707..7d223be 100644
--- a/arch/um/include/kern.h
+++ b/arch/um/include/kern.h
@@ -17,7 +17,7 @@ extern int errno;
 
 extern int clone(int (*proc)(void *), void *sp, int flags, void *data);
 extern int sleep(int);
-extern int printf(char *fmt, ...);
+extern int printf(const char *fmt, ...);
 extern char *strerror(int errnum);
 extern char *ptsname(int __fd);
 extern int munmap(void *, int);
@@ -35,15 +35,6 @@ extern int read(unsigned int, char *, int);
 extern int pipe(int *);
 extern int sched_yield(void);
 extern int ptrace(int op, int pid, long addr, long data);
+
 #endif
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/include/os.h b/arch/um/include/os.h
index 2cccfa5..258444e 100644
--- a/arch/um/include/os.h
+++ b/arch/um/include/os.h
@@ -213,15 +213,10 @@ extern int run_helper_thread(int (*proc)(void *), void *arg,
 			     int stack_order);
 extern int helper_wait(int pid);
 
-#endif
+/* umid.c */
 
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
+extern int umid_file_name(char *name, char *buf, int len);
+extern int set_umid(char *name, int (*printer)(const char *fmt, ...));
+extern char *get_umid(int only_if_set);
+
+#endif
diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile
index 3de9d21..6f77005 100644
--- a/arch/um/kernel/Makefile
+++ b/arch/um/kernel/Makefile
@@ -10,8 +10,8 @@ obj-y = config.o exec_kern.o exitcode.o \
 	init_task.o irq.o irq_user.o ksyms.o mem.o physmem.o \
 	process_kern.o ptrace.o reboot.o resource.o sigio_user.o sigio_kern.o \
 	signal_kern.o signal_user.o smp.o syscall_kern.o sysrq.o time.o \
-	time_kern.o tlb.o trap_kern.o trap_user.o uaccess.o um_arch.o \
-	umid.o user_util.o
+	time_kern.o tlb.o trap_kern.o trap_user.o uaccess.o um_arch.o umid.o \
+	user_util.o
 
 obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o
 obj-$(CONFIG_GPROF)	+= gprof_syms.o
@@ -24,7 +24,7 @@ obj-$(CONFIG_MODE_SKAS) += skas/
 
 user-objs-$(CONFIG_TTY_LOG) += tty_log.o
 
-USER_OBJS := $(user-objs-y) config.o time.o tty_log.o umid.o user_util.o
+USER_OBJS := $(user-objs-y) config.o time.o tty_log.o user_util.o
 
 include arch/um/scripts/Makefile.rules
 
diff --git a/arch/um/kernel/process_kern.c b/arch/um/kernel/process_kern.c
index 34b54a3..651abf2 100644
--- a/arch/um/kernel/process_kern.c
+++ b/arch/um/kernel/process_kern.c
@@ -324,10 +324,6 @@ int user_context(unsigned long sp)
 	return(stack != (unsigned long) current_thread);
 }
 
-extern void remove_umid_dir(void);
-
-__uml_exitcall(remove_umid_dir);
-
 extern exitcall_t __uml_exitcall_begin, __uml_exitcall_end;
 
 void do_uml_exitcalls(void)
diff --git a/arch/um/kernel/umid.c b/arch/um/kernel/umid.c
index 0b21d59..772c7cf 100644
--- a/arch/um/kernel/umid.c
+++ b/arch/um/kernel/umid.c
@@ -3,61 +3,34 @@
  * Licensed under the GPL
  */
 
-#include <stdio.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include <stdlib.h>
-#include <dirent.h>
-#include <signal.h>
-#include <sys/stat.h>
-#include <sys/param.h>
-#include "user.h"
-#include "umid.h"
+#include "linux/stddef.h"
+#include "linux/kernel.h"
+#include "asm/errno.h"
 #include "init.h"
 #include "os.h"
-#include "user_util.h"
-#include "choose-mode.h"
+#include "kern.h"
 
-#define UMID_LEN 64
-#define UML_DIR "~/.uml/"
-
-/* Changed by set_umid and make_umid, which are run early in boot */
-static char umid[UMID_LEN] = { 0 };
-
-/* Changed by set_uml_dir and make_uml_dir, which are run early in boot */
-static char *uml_dir = UML_DIR;
-
-/* Changed by set_umid */
-static int umid_is_random = 1;
+/* Changed by set_umid_arg and umid_file_name */
+int umid_is_random = 0;
 static int umid_inited = 0;
-/* Have we created the files? Should we remove them? */
-static int umid_owned = 0;
 
-static int make_umid(int (*printer)(const char *fmt, ...));
-
-static int __init set_umid(char *name, int is_random,
-			   int (*printer)(const char *fmt, ...))
+static int __init set_umid_arg(char *name, int *add)
 {
-	if(umid_inited){
-		(*printer)("Unique machine name can't be set twice\n");
-		return(-1);
-	}
+	int err;
 
-	if(strlen(name) > UMID_LEN - 1)
-		(*printer)("Unique machine name is being truncated to %d "
-			   "characters\n", UMID_LEN);
-	strlcpy(umid, name, sizeof(umid));
+	if(umid_inited)
+		return 0;
 
-	umid_is_random = is_random;
-	umid_inited = 1;
-	return 0;
-}
-
-static int __init set_umid_arg(char *name, int *add)
-{
 	*add = 0;
-	return(set_umid(name, 0, printf));
+	err = set_umid(name, printf);
+	if(err == -EEXIST){
+		printf("umid '%s' already in use\n", name);
+		umid_is_random = 1;
+	}
+	else if(!err)
+		umid_inited = 1;
+
+	return 0;
 }
 
 __uml_setup("umid=", set_umid_arg,
@@ -66,265 +39,3 @@ __uml_setup("umid=", set_umid_arg,
 "    is used for naming the pid file and management console socket.\n\n"
 );
 
-int __init umid_file_name(char *name, char *buf, int len)
-{
-	int n;
-
-	if(!umid_inited && make_umid(printk)) return(-1);
-
-	n = strlen(uml_dir) + strlen(umid) + strlen(name) + 1;
-	if(n > len){
-		printk("umid_file_name : buffer too short\n");
-		return(-1);
-	}
-
-	sprintf(buf, "%s%s/%s", uml_dir, umid, name);
-	return(0);
-}
-
-extern int tracing_pid;
-
-static void __init create_pid_file(void)
-{
-	char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")];
-	char pid[sizeof("nnnnn\0")];
-	int fd, n;
-
-	if(umid_file_name("pid", file, sizeof(file)))
-		return;
-
-	fd = os_open_file(file, of_create(of_excl(of_rdwr(OPENFLAGS()))), 
-			  0644);
-	if(fd < 0){
-		printf("Open of machine pid file \"%s\" failed: %s\n",
-		       file, strerror(-fd));
-		return;
-	}
-
-	sprintf(pid, "%d\n", os_getpid());
-	n = os_write_file(fd, pid, strlen(pid));
-	if(n != strlen(pid))
-		printf("Write of pid file failed - err = %d\n", -n);
-	os_close_file(fd);
-}
-
-static int actually_do_remove(char *dir)
-{
-	DIR *directory;
-	struct dirent *ent;
-	int len;
-	char file[256];
-
-	directory = opendir(dir);
-	if(directory == NULL){
-		printk("actually_do_remove : couldn't open directory '%s', "
-		       "errno = %d\n", dir, errno);
-		return(1);
-	}
-	while((ent = readdir(directory)) != NULL){
-		if(!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, ".."))
-			continue;
-		len = strlen(dir) + sizeof("/") + strlen(ent->d_name) + 1;
-		if(len > sizeof(file)){
-			printk("Not deleting '%s' from '%s' - name too long\n",
-			       ent->d_name, dir);
-			continue;
-		}
-		sprintf(file, "%s/%s", dir, ent->d_name);
-		if(unlink(file) < 0){
-			printk("actually_do_remove : couldn't remove '%s' "
-			       "from '%s', errno = %d\n", ent->d_name, dir, 
-			       errno);
-			return(1);
-		}
-	}
-	if(rmdir(dir) < 0){
-		printk("actually_do_remove : couldn't rmdir '%s', "
-		       "errno = %d\n", dir, errno);
-		return(1);
-	}
-	return(0);
-}
-
-void remove_umid_dir(void)
-{
-	char dir[strlen(uml_dir) + UMID_LEN + 1];
-	if (!umid_owned)
-		return;
-
-	sprintf(dir, "%s%s", uml_dir, umid);
-	actually_do_remove(dir);
-}
-
-char *get_umid(int only_if_set)
-{
-	if(only_if_set && umid_is_random)
-		return NULL;
-	return umid;
-}
-
-static int not_dead_yet(char *dir)
-{
-	char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")];
-	char pid[sizeof("nnnnn\0")], *end;
-	int dead, fd, p, n;
-
-	sprintf(file, "%s/pid", dir);
-	dead = 0;
-	fd = os_open_file(file, of_read(OPENFLAGS()), 0);
-	if(fd < 0){
-		if(fd != -ENOENT){
-			printk("not_dead_yet : couldn't open pid file '%s', "
-			       "err = %d\n", file, -fd);
-			return(1);
-		}
-		dead = 1;
-	}
-	if(fd > 0){
-		n = os_read_file(fd, pid, sizeof(pid));
-		if(n < 0){
-			printk("not_dead_yet : couldn't read pid file '%s', "
-			       "err = %d\n", file, -n);
-			return(1);
-		}
-		p = strtoul(pid, &end, 0);
-		if(end == pid){
-			printk("not_dead_yet : couldn't parse pid file '%s', "
-			       "errno = %d\n", file, errno);
-			dead = 1;
-		}
-		if(((kill(p, 0) < 0) && (errno == ESRCH)) ||
-		   (p == CHOOSE_MODE(tracing_pid, os_getpid())))
-			dead = 1;
-	}
-	if(!dead)
-		return(1);
-	return(actually_do_remove(dir));
-}
-
-static int __init set_uml_dir(char *name, int *add)
-{
-	if((strlen(name) > 0) && (name[strlen(name) - 1] != '/')){
-		uml_dir = malloc(strlen(name) + 2);
-		if(uml_dir == NULL){
-			printf("Failed to malloc uml_dir - error = %d\n",
-			       errno);
-			uml_dir = name;
-			/* Return 0 here because do_initcalls doesn't look at
-			 * the return value.
-			 */
-			return(0);
-		}
-		sprintf(uml_dir, "%s/", name);
-	}
-	else uml_dir = name;
-	return(0);
-}
-
-static int __init make_uml_dir(void)
-{
-	char dir[MAXPATHLEN + 1] = { '\0' };
-	int len;
-
-	if(*uml_dir == '~'){
-		char *home = getenv("HOME");
-
-		if(home == NULL){
-			printf("make_uml_dir : no value in environment for "
-			       "$HOME\n");
-			exit(1);
-		}
-		strlcpy(dir, home, sizeof(dir));
-		uml_dir++;
-	}
-	strlcat(dir, uml_dir, sizeof(dir));
-	len = strlen(dir);
-	if (len > 0 && dir[len - 1] != '/')
-		strlcat(dir, "/", sizeof(dir));
-
-	uml_dir = malloc(strlen(dir) + 1);
-	if (uml_dir == NULL) {
-		printf("make_uml_dir : malloc failed, errno = %d\n", errno);
-		exit(1);
-	}
-	strcpy(uml_dir, dir);
-	
-	if((mkdir(uml_dir, 0777) < 0) && (errno != EEXIST)){
-	        printf("Failed to mkdir %s: %s\n", uml_dir, strerror(errno));
-		return(-1);
-	}
-	return 0;
-}
-
-static int __init make_umid(int (*printer)(const char *fmt, ...))
-{
-	int fd, err;
-	char tmp[strlen(uml_dir) + UMID_LEN + 1];
-
-	strlcpy(tmp, uml_dir, sizeof(tmp));
-
-	if(!umid_inited){
-		strcat(tmp, "XXXXXX");
-		fd = mkstemp(tmp);
-		if(fd < 0){
-			(*printer)("make_umid - mkstemp(%s) failed: %s\n",
-				   tmp,strerror(errno));
-			return(1);
-		}
-
-		os_close_file(fd);
-		/* There's a nice tiny little race between this unlink and
-		 * the mkdir below.  It'd be nice if there were a mkstemp
-		 * for directories.
-		 */
-		unlink(tmp);
-		set_umid(&tmp[strlen(uml_dir)], 1, printer);
-	}
-	
-	sprintf(tmp, "%s%s", uml_dir, umid);
-
-	err = mkdir(tmp, 0777);
-	if(err < 0){
-		if(errno == EEXIST){
-			if(not_dead_yet(tmp)){
-				(*printer)("umid '%s' is in use\n", umid);
-				umid_owned = 0;
-				return(-1);
-			}
-			err = mkdir(tmp, 0777);
-		}
-	}
-	if(err < 0){
-		(*printer)("Failed to create %s - errno = %d\n", umid, errno);
-		return(-1);
-	}
-
-	umid_owned = 1;
-	return 0;
-}
-
-__uml_setup("uml_dir=", set_uml_dir,
-"uml_dir=<directory>\n"
-"    The location to place the pid and umid files.\n\n"
-);
-
-static int __init make_umid_setup(void)
-{
-	/* one function with the ordering we need ... */
-	make_uml_dir();
-	make_umid(printf);
-	create_pid_file();
-	return 0;
-}
-__uml_postsetup(make_umid_setup);
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile
index b83ac8e..11e30b13 100644
--- a/arch/um/os-Linux/Makefile
+++ b/arch/um/os-Linux/Makefile
@@ -4,11 +4,11 @@
 #
 
 obj-y = aio.o elf_aux.o file.o helper.o main.o mem.o process.o signal.o \
-	start_up.o time.o tt.o tty.o uaccess.o user_syms.o drivers/ \
+	start_up.o time.o tt.o tty.o uaccess.o umid.o user_syms.o drivers/ \
 	sys-$(SUBARCH)/
 
 USER_OBJS := aio.o elf_aux.o file.o helper.o main.o mem.o process.o signal.o \
-	start_up.o time.o tt.o tty.o uaccess.o
+	start_up.o time.o tt.o tty.o uaccess.o umid.o
 
 elf_aux.o: $(ARCH_DIR)/kernel-offsets.h
 CFLAGS_elf_aux.o += -I$(objtree)/arch/um
diff --git a/arch/um/os-Linux/umid.c b/arch/um/os-Linux/umid.c
new file mode 100644
index 0000000..77d69a3
--- /dev/null
+++ b/arch/um/os-Linux/umid.c
@@ -0,0 +1,292 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <signal.h>
+#include <dirent.h>
+#include <sys/stat.h>
+#include <sys/param.h>
+#include "init.h"
+#include "os.h"
+#include "user.h"
+#include "mode.h"
+
+#define UML_DIR "~/.uml/"
+
+#define UMID_LEN 64
+
+/* Changed by set_umid, which is run early in boot */
+char umid[UMID_LEN] = { 0 };
+
+/* Changed by set_uml_dir and make_uml_dir, which are run early in boot */
+static char *uml_dir = UML_DIR;
+
+static int __init make_uml_dir(void)
+{
+	char dir[512] = { '\0' };
+	int len;
+
+	if(*uml_dir == '~'){
+		char *home = getenv("HOME");
+
+		if(home == NULL){
+			printf("make_uml_dir : no value in environment for "
+			       "$HOME\n");
+			exit(1);
+		}
+		strlcpy(dir, home, sizeof(dir));
+		uml_dir++;
+	}
+	strlcat(dir, uml_dir, sizeof(dir));
+	len = strlen(dir);
+	if (len > 0 && dir[len - 1] != '/')
+		strlcat(dir, "/", sizeof(dir));
+
+	uml_dir = malloc(strlen(dir) + 1);
+	if (uml_dir == NULL) {
+		printf("make_uml_dir : malloc failed, errno = %d\n", errno);
+		exit(1);
+	}
+	strcpy(uml_dir, dir);
+
+	if((mkdir(uml_dir, 0777) < 0) && (errno != EEXIST)){
+	        printf("Failed to mkdir '%s': %s\n", uml_dir, strerror(errno));
+		return(-1);
+	}
+	return 0;
+}
+
+static int actually_do_remove(char *dir)
+{
+	DIR *directory;
+	struct dirent *ent;
+	int len;
+	char file[256];
+
+	directory = opendir(dir);
+	if(directory == NULL){
+		printk("actually_do_remove : couldn't open directory '%s', "
+		       "errno = %d\n", dir, errno);
+		return(1);
+	}
+	while((ent = readdir(directory)) != NULL){
+		if(!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, ".."))
+			continue;
+		len = strlen(dir) + sizeof("/") + strlen(ent->d_name) + 1;
+		if(len > sizeof(file)){
+			printk("Not deleting '%s' from '%s' - name too long\n",
+			       ent->d_name, dir);
+			continue;
+		}
+		sprintf(file, "%s/%s", dir, ent->d_name);
+		if(unlink(file) < 0){
+			printk("actually_do_remove : couldn't remove '%s' "
+			       "from '%s', errno = %d\n", ent->d_name, dir,
+			       errno);
+			return(1);
+		}
+	}
+	if(rmdir(dir) < 0){
+		printk("actually_do_remove : couldn't rmdir '%s', "
+		       "errno = %d\n", dir, errno);
+		return(1);
+	}
+	return(0);
+}
+
+extern int tracing_pid;
+
+static int not_dead_yet(char *dir)
+{
+	char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")];
+	char pid[sizeof("nnnnn\0")], *end;
+	int dead, fd, p, n;
+
+	sprintf(file, "%s/pid", dir);
+	dead = 0;
+	fd = os_open_file(file, of_read(OPENFLAGS()), 0);
+	if(fd < 0){
+		if(fd != -ENOENT){
+			printk("not_dead_yet : couldn't open pid file '%s', "
+			       "err = %d\n", file, -fd);
+			return(1);
+		}
+		dead = 1;
+	}
+	if(fd > 0){
+		n = os_read_file(fd, pid, sizeof(pid));
+		if(n < 0){
+			printk("not_dead_yet : couldn't read pid file '%s', "
+			       "err = %d\n", file, -n);
+			return(1);
+		}
+		p = strtoul(pid, &end, 0);
+		if(end == pid){
+			printk("not_dead_yet : couldn't parse pid file '%s', "
+			       "errno = %d\n", file, errno);
+			dead = 1;
+		}
+		if(((kill(p, 0) < 0) && (errno == ESRCH)) ||
+		   (p == CHOOSE_MODE(tracing_pid, os_getpid())))
+			dead = 1;
+	}
+	if(!dead)
+		return(1);
+	return(actually_do_remove(dir));
+}
+
+static void __init create_pid_file(void)
+{
+	char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")];
+	char pid[sizeof("nnnnn\0")];
+	int fd, n;
+
+	if(umid_file_name("pid", file, sizeof(file)))
+		return;
+
+	fd = os_open_file(file, of_create(of_excl(of_rdwr(OPENFLAGS()))),
+			  0644);
+	if(fd < 0){
+		printf("Open of machine pid file \"%s\" failed: %s\n",
+		       file, strerror(-fd));
+		return;
+	}
+
+	sprintf(pid, "%d\n", os_getpid());
+	n = os_write_file(fd, pid, strlen(pid));
+	if(n != strlen(pid))
+		printf("Write of pid file failed - err = %d\n", -n);
+	os_close_file(fd);
+}
+
+int __init set_umid(char *name, int (*printer)(const char *fmt, ...))
+{
+	if(strlen(name) > UMID_LEN - 1)
+		(*printer)("Unique machine name is being truncated to %d "
+			   "characters\n", UMID_LEN);
+	strlcpy(umid, name, sizeof(umid));
+
+	return 0;
+}
+
+static int umid_setup = 0;
+
+int __init make_umid(int (*printer)(const char *fmt, ...))
+{
+	int fd, err;
+	char tmp[256];
+
+	make_uml_dir();
+
+	if(*umid == '\0'){
+		strlcpy(tmp, uml_dir, sizeof(tmp));
+		strcat(tmp, "XXXXXX");
+		fd = mkstemp(tmp);
+		if(fd < 0){
+			(*printer)("make_umid - mkstemp(%s) failed: %s\n",
+				   tmp,strerror(errno));
+			return(1);
+		}
+
+		os_close_file(fd);
+		/* There's a nice tiny little race between this unlink and
+		 * the mkdir below.  It'd be nice if there were a mkstemp
+		 * for directories.
+		 */
+		unlink(tmp);
+		set_umid(&tmp[strlen(uml_dir)], printer);
+	}
+
+	sprintf(tmp, "%s%s", uml_dir, umid);
+	err = mkdir(tmp, 0777);
+	if(err < 0){
+		if(errno == EEXIST){
+			if(not_dead_yet(tmp))
+				return -EEXIST;
+			err = mkdir(tmp, 0777);
+		}
+	}
+	if(err < 0){
+		(*printer)("Failed to create %s - errno = %d\n", umid, errno);
+		return(-1);
+	}
+
+	umid_setup = 1;
+
+	create_pid_file();
+
+	return 0;
+}
+
+static int __init make_umid_init(void)
+{
+	make_umid(printk);
+
+	return(0);
+}
+
+__initcall(make_umid_init);
+
+int __init umid_file_name(char *name, char *buf, int len)
+{
+	int n, err;
+
+	if(!umid_setup){
+		err = make_umid(printk);
+		if(err)
+			return err;
+	}
+
+	n = strlen(uml_dir) + strlen(umid) + strlen("/") + strlen(name) + 1;
+	if(n > len){
+		printk("umid_file_name : buffer too short\n");
+		return(-1);
+	}
+
+	sprintf(buf, "%s%s/%s", uml_dir, umid, name);
+	return(0);
+}
+
+extern int umid_is_random;
+
+char *get_umid(int only_if_set)
+{
+	if(only_if_set && umid_is_random)
+		return NULL;
+	return umid;
+}
+
+static int __init set_uml_dir(char *name, int *add)
+{
+	if((strlen(name) > 0) && (name[strlen(name) - 1] != '/')){
+		uml_dir = malloc(strlen(name) + 2);
+		if(uml_dir == NULL){
+			printf("Failed to malloc uml_dir - error = %d\n",
+			       errno);
+			uml_dir = name;
+			/* Return 0 here because do_initcalls doesn't look at
+			 * the return value.
+			 */
+			return(0);
+		}
+		sprintf(uml_dir, "%s/", name);
+	}
+	else uml_dir = name;
+	return(0);
+}
+
+__uml_setup("uml_dir=", set_uml_dir,
+"uml_dir=<directory>\n"
+"    The location to place the pid and umid files.\n\n"
+);
+
+static void remove_umid_dir(void)
+{
+	char dir[strlen(uml_dir) + UMID_LEN + 1];
+
+	sprintf(dir, "%s%s", uml_dir, umid);
+	actually_do_remove(dir);
+}
+
+__uml_exitcall(remove_umid_dir);
-- 
cgit v1.1


From 7eebe8a9c51686927709a57b1f2725d371014abc Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:19:01 -0800
Subject: [PATCH] uml: umid cleanup

This patch cleans up the umid code:

- The only_if_set argument to get_umid is gone.

- get_umid returns an empty string rather than NULL if there is no umid.

- umid_is_random is gone since its users went away.

- Some printfs were turned into printks because the code runs late enough
  that printk is working.

- Error paths were cleaned up.

- Some functions now return an error and let the caller print the error
  message rather than printing it themselves.  This eliminates the practice of
  passing a pointer to printf or printk in, depending on where in the boot
  process we are.

- Major tidying of not_dead_yet - mostly error path cleanup, plus a comment
  explaining why it doesn't react to errors the way you might expect.

- Calls to os_* interfaces in code that was moved under os-Linux are changed
  back to their native libc forms.

- snprintf, strlcpy, and their bounds-checking friends are used more often,
  replacing by-hand bounds checking in some places.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/line.c      |   4 +-
 arch/um/include/os.h        |   4 +-
 arch/um/include/user_util.h |   1 -
 arch/um/kernel/um_arch.c    |   4 +-
 arch/um/kernel/umid.c       |  12 +-
 arch/um/os-Linux/umid.c     | 265 +++++++++++++++++++++++++-------------------
 6 files changed, 164 insertions(+), 126 deletions(-)

diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index b8e3e80..a3c3937 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -831,8 +831,8 @@ char *add_xterm_umid(char *base)
 	char *umid, *title;
 	int len;
 
-	umid = get_umid(1);
-	if(umid == NULL)
+	umid = get_umid();
+	if(*umid == '\0')
 		return base;
 
 	len = strlen(base) + strlen(" ()") + strlen(umid) + 1;
diff --git a/arch/um/include/os.h b/arch/um/include/os.h
index 258444e..c279ee6 100644
--- a/arch/um/include/os.h
+++ b/arch/um/include/os.h
@@ -216,7 +216,7 @@ extern int helper_wait(int pid);
 /* umid.c */
 
 extern int umid_file_name(char *name, char *buf, int len);
-extern int set_umid(char *name, int (*printer)(const char *fmt, ...));
-extern char *get_umid(int only_if_set);
+extern int set_umid(char *name);
+extern char *get_umid(void);
 
 #endif
diff --git a/arch/um/include/user_util.h b/arch/um/include/user_util.h
index bb505e0..b998400 100644
--- a/arch/um/include/user_util.h
+++ b/arch/um/include/user_util.h
@@ -64,7 +64,6 @@ extern void setup_machinename(char *machine_out);
 extern void setup_hostinfo(void);
 extern void do_exec(int old_pid, int new_pid);
 extern void tracer_panic(char *msg, ...);
-extern char *get_umid(int only_if_set);
 extern void do_longjmp(void *p, int val);
 extern int detach(int pid, int sig);
 extern int attach(int pid);
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index 142a949..26626b2 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -146,8 +146,8 @@ void set_cmdline(char *cmd)
 
 	if(CHOOSE_MODE(honeypot, 0)) return;
 
-	umid = get_umid(1);
-	if(umid != NULL){
+	umid = get_umid();
+	if(*umid != '\0'){
 		snprintf(argv1_begin, 
 			 (argv1_end - argv1_begin) * sizeof(*ptr), 
 			 "(%s) ", umid);
diff --git a/arch/um/kernel/umid.c b/arch/um/kernel/umid.c
index 772c7cf..4eaee82 100644
--- a/arch/um/kernel/umid.c
+++ b/arch/um/kernel/umid.c
@@ -3,15 +3,13 @@
  * Licensed under the GPL
  */
 
-#include "linux/stddef.h"
-#include "linux/kernel.h"
 #include "asm/errno.h"
 #include "init.h"
 #include "os.h"
 #include "kern.h"
+#include "linux/kernel.h"
 
-/* Changed by set_umid_arg and umid_file_name */
-int umid_is_random = 0;
+/* Changed by set_umid_arg */
 static int umid_inited = 0;
 
 static int __init set_umid_arg(char *name, int *add)
@@ -22,11 +20,9 @@ static int __init set_umid_arg(char *name, int *add)
 		return 0;
 
 	*add = 0;
-	err = set_umid(name, printf);
-	if(err == -EEXIST){
+	err = set_umid(name);
+	if(err == -EEXIST)
 		printf("umid '%s' already in use\n", name);
-		umid_is_random = 1;
-	}
 	else if(!err)
 		umid_inited = 1;
 
diff --git a/arch/um/os-Linux/umid.c b/arch/um/os-Linux/umid.c
index 77d69a3..ecf107a 100644
--- a/arch/um/os-Linux/umid.c
+++ b/arch/um/os-Linux/umid.c
@@ -5,6 +5,7 @@
 #include <errno.h>
 #include <signal.h>
 #include <dirent.h>
+#include <sys/fcntl.h>
 #include <sys/stat.h>
 #include <sys/param.h>
 #include "init.h"
@@ -25,15 +26,16 @@ static char *uml_dir = UML_DIR;
 static int __init make_uml_dir(void)
 {
 	char dir[512] = { '\0' };
-	int len;
+	int len, err;
 
 	if(*uml_dir == '~'){
 		char *home = getenv("HOME");
 
+		err = -ENOENT;
 		if(home == NULL){
-			printf("make_uml_dir : no value in environment for "
+			printk("make_uml_dir : no value in environment for "
 			       "$HOME\n");
-			exit(1);
+			goto err;
 		}
 		strlcpy(dir, home, sizeof(dir));
 		uml_dir++;
@@ -43,18 +45,26 @@ static int __init make_uml_dir(void)
 	if (len > 0 && dir[len - 1] != '/')
 		strlcat(dir, "/", sizeof(dir));
 
+	err = -ENOMEM;
 	uml_dir = malloc(strlen(dir) + 1);
 	if (uml_dir == NULL) {
 		printf("make_uml_dir : malloc failed, errno = %d\n", errno);
-		exit(1);
+		goto err;
 	}
 	strcpy(uml_dir, dir);
 
 	if((mkdir(uml_dir, 0777) < 0) && (errno != EEXIST)){
 	        printf("Failed to mkdir '%s': %s\n", uml_dir, strerror(errno));
-		return(-1);
+		err = -errno;
+		goto err_free;
 	}
 	return 0;
+
+err_free:
+	free(uml_dir);
+err:
+	uml_dir = NULL;
+	return err;
 }
 
 static int actually_do_remove(char *dir)
@@ -65,75 +75,88 @@ static int actually_do_remove(char *dir)
 	char file[256];
 
 	directory = opendir(dir);
-	if(directory == NULL){
-		printk("actually_do_remove : couldn't open directory '%s', "
-		       "errno = %d\n", dir, errno);
-		return(1);
-	}
+	if(directory == NULL)
+		return -errno;
+
 	while((ent = readdir(directory)) != NULL){
 		if(!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, ".."))
 			continue;
 		len = strlen(dir) + sizeof("/") + strlen(ent->d_name) + 1;
-		if(len > sizeof(file)){
-			printk("Not deleting '%s' from '%s' - name too long\n",
-			       ent->d_name, dir);
-			continue;
-		}
+		if(len > sizeof(file))
+			return -E2BIG;
+
 		sprintf(file, "%s/%s", dir, ent->d_name);
-		if(unlink(file) < 0){
-			printk("actually_do_remove : couldn't remove '%s' "
-			       "from '%s', errno = %d\n", ent->d_name, dir,
-			       errno);
-			return(1);
-		}
-	}
-	if(rmdir(dir) < 0){
-		printk("actually_do_remove : couldn't rmdir '%s', "
-		       "errno = %d\n", dir, errno);
-		return(1);
+		if(unlink(file) < 0)
+			return -errno;
 	}
-	return(0);
+	if(rmdir(dir) < 0)
+		return -errno;
+
+	return 0;
 }
 
-extern int tracing_pid;
+/* This says that there isn't already a user of the specified directory even if
+ * there are errors during the checking.  This is because if these errors
+ * happen, the directory is unusable by the pre-existing UML, so we might as
+ * well take it over.  This could happen either by
+ * 	the existing UML somehow corrupting its umid directory
+ * 	something other than UML sticking stuff in the directory
+ *	this boot racing with a shutdown of the other UML
+ * In any of these cases, the directory isn't useful for anything else.
+ */
 
 static int not_dead_yet(char *dir)
 {
 	char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")];
 	char pid[sizeof("nnnnn\0")], *end;
-	int dead, fd, p, n;
+	int dead, fd, p, n, err;
+
+	n = snprintf(file, sizeof(file), "%s/pid", dir);
+	if(n >= sizeof(file)){
+		printk("not_dead_yet - pid filename too long\n");
+		err = -E2BIG;
+		goto out;
+	}
 
-	sprintf(file, "%s/pid", dir);
 	dead = 0;
-	fd = os_open_file(file, of_read(OPENFLAGS()), 0);
+	fd = open(file, O_RDONLY);
 	if(fd < 0){
 		if(fd != -ENOENT){
 			printk("not_dead_yet : couldn't open pid file '%s', "
 			       "err = %d\n", file, -fd);
-			return(1);
 		}
-		dead = 1;
+		goto out;
 	}
-	if(fd > 0){
-		n = os_read_file(fd, pid, sizeof(pid));
-		if(n < 0){
-			printk("not_dead_yet : couldn't read pid file '%s', "
-			       "err = %d\n", file, -n);
-			return(1);
-		}
-		p = strtoul(pid, &end, 0);
-		if(end == pid){
-			printk("not_dead_yet : couldn't parse pid file '%s', "
-			       "errno = %d\n", file, errno);
-			dead = 1;
-		}
-		if(((kill(p, 0) < 0) && (errno == ESRCH)) ||
-		   (p == CHOOSE_MODE(tracing_pid, os_getpid())))
-			dead = 1;
+
+	err = 0;
+	n = read(fd, pid, sizeof(pid));
+	if(n <= 0){
+		printk("not_dead_yet : couldn't read pid file '%s', "
+		       "err = %d\n", file, -n);
+		goto out_close;
+	}
+
+	p = strtoul(pid, &end, 0);
+	if(end == pid){
+		printk("not_dead_yet : couldn't parse pid file '%s', "
+		       "errno = %d\n", file, errno);
+		goto out_close;
 	}
-	if(!dead)
-		return(1);
-	return(actually_do_remove(dir));
+
+	if((kill(p, 0) == 0) || (errno != ESRCH))
+		return 1;
+
+	err = actually_do_remove(dir);
+	if(err)
+		printk("not_dead_yet - actually_do_remove failed with "
+		       "err = %d\n", err);
+
+	return err;
+
+ out_close:
+	close(fd);
+ out:
+	return 0;
 }
 
 static void __init create_pid_file(void)
@@ -145,26 +168,26 @@ static void __init create_pid_file(void)
 	if(umid_file_name("pid", file, sizeof(file)))
 		return;
 
-	fd = os_open_file(file, of_create(of_excl(of_rdwr(OPENFLAGS()))),
-			  0644);
+	fd = open(file, O_RDWR | O_CREAT | O_EXCL, 0644);
 	if(fd < 0){
-		printf("Open of machine pid file \"%s\" failed: %s\n",
+		printk("Open of machine pid file \"%s\" failed: %s\n",
 		       file, strerror(-fd));
 		return;
 	}
 
-	sprintf(pid, "%d\n", os_getpid());
-	n = os_write_file(fd, pid, strlen(pid));
+	snprintf(pid, sizeof(pid), "%d\n", getpid());
+	n = write(fd, pid, strlen(pid));
 	if(n != strlen(pid))
-		printf("Write of pid file failed - err = %d\n", -n);
-	os_close_file(fd);
+		printk("Write of pid file failed - err = %d\n", -n);
+
+	close(fd);
 }
 
-int __init set_umid(char *name, int (*printer)(const char *fmt, ...))
+int __init set_umid(char *name)
 {
 	if(strlen(name) > UMID_LEN - 1)
-		(*printer)("Unique machine name is being truncated to %d "
-			   "characters\n", UMID_LEN);
+		return -E2BIG;
+
 	strlcpy(umid, name, sizeof(umid));
 
 	return 0;
@@ -172,44 +195,56 @@ int __init set_umid(char *name, int (*printer)(const char *fmt, ...))
 
 static int umid_setup = 0;
 
-int __init make_umid(int (*printer)(const char *fmt, ...))
+int __init make_umid(void)
 {
 	int fd, err;
 	char tmp[256];
 
+	if(umid_setup)
+		return 0;
+
 	make_uml_dir();
 
 	if(*umid == '\0'){
 		strlcpy(tmp, uml_dir, sizeof(tmp));
-		strcat(tmp, "XXXXXX");
+		strlcat(tmp, "XXXXXX", sizeof(tmp));
 		fd = mkstemp(tmp);
 		if(fd < 0){
-			(*printer)("make_umid - mkstemp(%s) failed: %s\n",
-				   tmp,strerror(errno));
-			return(1);
+			printk("make_umid - mkstemp(%s) failed: %s\n",
+			       tmp, strerror(errno));
+			err = -errno;
+			goto err;
 		}
 
-		os_close_file(fd);
+		close(fd);
+
+		set_umid(&tmp[strlen(uml_dir)]);
+
 		/* There's a nice tiny little race between this unlink and
 		 * the mkdir below.  It'd be nice if there were a mkstemp
 		 * for directories.
 		 */
-		unlink(tmp);
-		set_umid(&tmp[strlen(uml_dir)], printer);
+		if(unlink(tmp)){
+			err = -errno;
+			goto err;
+		}
 	}
 
-	sprintf(tmp, "%s%s", uml_dir, umid);
+	snprintf(tmp, sizeof(tmp), "%s%s", uml_dir, umid);
 	err = mkdir(tmp, 0777);
 	if(err < 0){
-		if(errno == EEXIST){
-			if(not_dead_yet(tmp))
-				return -EEXIST;
-			err = mkdir(tmp, 0777);
-		}
+		err = -errno;
+		if(errno != EEXIST)
+			goto err;
+
+		if(not_dead_yet(tmp) < 0)
+			goto err;
+
+		err = mkdir(tmp, 0777);
 	}
 	if(err < 0){
-		(*printer)("Failed to create %s - errno = %d\n", umid, errno);
-		return(-1);
+		printk("Failed to create '%s' - err = %d\n", umid, err);
+		goto err_rmdir;
 	}
 
 	umid_setup = 1;
@@ -217,13 +252,18 @@ int __init make_umid(int (*printer)(const char *fmt, ...))
 	create_pid_file();
 
 	return 0;
+
+ err_rmdir:
+	rmdir(tmp);
+ err:
+	return err;
 }
 
 static int __init make_umid_init(void)
 {
-	make_umid(printk);
+	make_umid();
 
-	return(0);
+	return 0;
 }
 
 __initcall(make_umid_init);
@@ -232,48 +272,48 @@ int __init umid_file_name(char *name, char *buf, int len)
 {
 	int n, err;
 
-	if(!umid_setup){
-		err = make_umid(printk);
-		if(err)
-			return err;
-	}
+	err = make_umid();
+	if(err)
+		return err;
 
-	n = strlen(uml_dir) + strlen(umid) + strlen("/") + strlen(name) + 1;
-	if(n > len){
+	n = snprintf(buf, len, "%s%s/%s", uml_dir, umid, name);
+	if(n >= len){
 		printk("umid_file_name : buffer too short\n");
-		return(-1);
+		return -E2BIG;
 	}
 
-	sprintf(buf, "%s%s/%s", uml_dir, umid, name);
-	return(0);
+	return 0;
 }
 
-extern int umid_is_random;
-
-char *get_umid(int only_if_set)
+char *get_umid(void)
 {
-	if(only_if_set && umid_is_random)
-		return NULL;
 	return umid;
 }
 
 static int __init set_uml_dir(char *name, int *add)
 {
-	if((strlen(name) > 0) && (name[strlen(name) - 1] != '/')){
-		uml_dir = malloc(strlen(name) + 2);
-		if(uml_dir == NULL){
-			printf("Failed to malloc uml_dir - error = %d\n",
-			       errno);
-			uml_dir = name;
-			/* Return 0 here because do_initcalls doesn't look at
-			 * the return value.
-			 */
-			return(0);
-		}
-		sprintf(uml_dir, "%s/", name);
+	if(*name == '\0'){
+		printf("uml_dir can't be an empty string\n");
+		return 0;
 	}
-	else uml_dir = name;
-	return(0);
+
+	if(name[strlen(name) - 1] == '/'){
+		uml_dir = name;
+		return 0;
+	}
+
+	uml_dir = malloc(strlen(name) + 2);
+	if(uml_dir == NULL){
+		printf("Failed to malloc uml_dir - error = %d\n", errno);
+
+		/* Return 0 here because do_initcalls doesn't look at
+		 * the return value.
+		 */
+		return 0;
+	}
+	sprintf(uml_dir, "%s/", name);
+
+	return 0;
 }
 
 __uml_setup("uml_dir=", set_uml_dir,
@@ -283,10 +323,13 @@ __uml_setup("uml_dir=", set_uml_dir,
 
 static void remove_umid_dir(void)
 {
-	char dir[strlen(uml_dir) + UMID_LEN + 1];
+	char dir[strlen(uml_dir) + UMID_LEN + 1], err;
 
 	sprintf(dir, "%s%s", uml_dir, umid);
-	actually_do_remove(dir);
+	err = actually_do_remove(dir);
+	if(err)
+		printf("remove_umid_dir - actually_do_remove failed with "
+		       "err = %d\n", err);
 }
 
 __uml_exitcall(remove_umid_dir);
-- 
cgit v1.1


From e464bf2bed027ea185992b44bf4b0326387a520d Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:19:01 -0800
Subject: [PATCH] uml: SIGWINCH handling cleanup

Code cleanup - unregister_winch and winch_cleanup had some duplicate code.
This is now abstracted out into free_winch.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/line.c | 54 ++++++++++++++++++++++----------------------------
 1 file changed, 24 insertions(+), 30 deletions(-)

diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index a3c3937..46ceb25 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -774,55 +774,49 @@ void register_winch_irq(int fd, int tty_fd, int pid, struct tty_struct *tty)
 		printk("register_winch_irq - failed to register IRQ\n");
 }
 
+static void free_winch(struct winch *winch)
+{
+	list_del(&winch->list);
+
+	if(winch->pid != -1)
+		os_kill_process(winch->pid, 1);
+	if(winch->fd != -1)
+		os_close_file(winch->fd);
+
+	free_irq(WINCH_IRQ, winch);
+	kfree(winch);
+}
+
 static void unregister_winch(struct tty_struct *tty)
 {
 	struct list_head *ele;
-	struct winch *winch, *found = NULL;
+	struct winch *winch;
 
 	spin_lock(&winch_handler_lock);
+
 	list_for_each(ele, &winch_handlers){
 		winch = list_entry(ele, struct winch, list);
                 if(winch->tty == tty){
-                        found = winch;
-                        break;
+			free_winch(winch);
+			break;
                 }
         }
-        if(found == NULL)
-		goto err;
-
-	list_del(&winch->list);
-	spin_unlock(&winch_handler_lock);
-
-        if(winch->pid != -1)
-                os_kill_process(winch->pid, 1);
-
-        free_irq(WINCH_IRQ, winch);
-        kfree(winch);
-
-	return;
-err:
 	spin_unlock(&winch_handler_lock);
 }
 
-/* XXX: No lock as it's an exitcall... is this valid? Depending on cleanup
- * order... are we sure that nothing else is done on the list? */
 static void winch_cleanup(void)
 {
-	struct list_head *ele;
+	struct list_head *ele, *next;
 	struct winch *winch;
 
-	list_for_each(ele, &winch_handlers){
+	spin_lock(&winch_handler_lock);
+
+	list_for_each_safe(ele, next, &winch_handlers){
 		winch = list_entry(ele, struct winch, list);
-		if(winch->fd != -1){
-			/* Why is this different from the above free_irq(),
-			 * which deactivates SIGIO? This searches the FD
-			 * somewhere else and removes it from the list... */
-			deactivate_fd(winch->fd, WINCH_IRQ);
-			os_close_file(winch->fd);
-		}
-		if(winch->pid != -1)
-			os_kill_process(winch->pid, 1);
+		free_winch(winch);
 	}
+
+	spin_unlock(&winch_handler_lock);
 }
 __uml_exitcall(winch_cleanup);
 
-- 
cgit v1.1


From 44700a4469b6bb89e6f1edd32b8a4a915dd967c6 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:19:02 -0800
Subject: [PATCH] uml: better diagnostics for broken configs

Produce a compile-time error if both MODE_SKAS and MODE_TT are disabled.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/include/choose-mode.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/um/include/choose-mode.h b/arch/um/include/choose-mode.h
index f25fa83..b87b36a 100644
--- a/arch/um/include/choose-mode.h
+++ b/arch/um/include/choose-mode.h
@@ -23,6 +23,9 @@ static inline void *__choose_mode(void *tt, void *skas) {
 
 #elif defined(UML_CONFIG_MODE_TT)
 #define CHOOSE_MODE(tt, skas) (tt)
+
+#else
+#error CONFIG_MODE_SKAS and CONFIG_MODE_TT are both disabled
 #endif
 
 #define CHOOSE_MODE_PROC(tt, skas, args...) \
-- 
cgit v1.1


From 7b033e1fdeef3d8bacac3cd5cfa53c9d670d1f3d Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:19:03 -0800
Subject: [PATCH] uml: add mconsole_reply variant with length param

This is needed for the console output patch, since we have a possibly
non-NUL-terminated string there.  So, the new interface takes a string and a
length, and the old interface calls strlen on its string and calls the new
interface with the length.

There's also a bit of whitespace cleanup.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/mconsole_user.c | 12 +++++++++---
 arch/um/include/mconsole.h      |  8 +++++---
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/arch/um/drivers/mconsole_user.c b/arch/um/drivers/mconsole_user.c
index 310c1f8..4b109fe 100644
--- a/arch/um/drivers/mconsole_user.c
+++ b/arch/um/drivers/mconsole_user.c
@@ -122,12 +122,12 @@ int mconsole_get_request(int fd, struct mc_request *req)
 	return(1);
 }
 
-int mconsole_reply(struct mc_request *req, char *str, int err, int more)
+int mconsole_reply_len(struct mc_request *req, const char *str, int total,
+		       int err, int more)
 {
 	struct mconsole_reply reply;
-	int total, len, n;
+	int len, n;
 
-	total = strlen(str);
 	do {
 		reply.err = err;
 
@@ -155,6 +155,12 @@ int mconsole_reply(struct mc_request *req, char *str, int err, int more)
 	return(0);
 }
 
+int mconsole_reply(struct mc_request *req, const char *str, int err, int more)
+{
+	return mconsole_reply_len(req, str, strlen(str), err, more);
+}
+
+
 int mconsole_unlink_socket(void)
 {
 	unlink(mconsole_socket_name);
diff --git a/arch/um/include/mconsole.h b/arch/um/include/mconsole.h
index b1b512f..58f67d3 100644
--- a/arch/um/include/mconsole.h
+++ b/arch/um/include/mconsole.h
@@ -32,7 +32,7 @@ struct mconsole_reply {
 
 struct mconsole_notify {
 	u32 magic;
-	u32 version;	
+	u32 version;
 	enum { MCONSOLE_SOCKET, MCONSOLE_PANIC, MCONSOLE_HANG,
 	       MCONSOLE_USER_NOTIFY } type;
 	u32 len;
@@ -66,7 +66,9 @@ struct mc_request
 extern char mconsole_socket_name[];
 
 extern int mconsole_unlink_socket(void);
-extern int mconsole_reply(struct mc_request *req, char *reply, int err,
+extern int mconsole_reply_len(struct mc_request *req, const char *reply,
+			      int len, int err, int more);
+extern int mconsole_reply(struct mc_request *req, const char *str, int err,
 			  int more);
 
 extern void mconsole_version(struct mc_request *req);
@@ -84,7 +86,7 @@ extern void mconsole_proc(struct mc_request *req);
 extern void mconsole_stack(struct mc_request *req);
 
 extern int mconsole_get_request(int fd, struct mc_request *req);
-extern int mconsole_notify(char *sock_name, int type, const void *data, 
+extern int mconsole_notify(char *sock_name, int type, const void *data,
 			   int len);
 extern char *mconsole_notify_socket(void);
 extern void lock_notify(void);
-- 
cgit v1.1


From 6f517d3fc862d3c8d8ba65c0b2472d399aceb9ed Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:19:04 -0800
Subject: [PATCH] uml: capture printk output for mconsole stack

The stack command now sends the printk output back to the mconsole client.
This is done by registering a special console for the mconsole driver.  This
receives all printk output.  Normally, it is ignored, but when a stack command
is issued, any printk output will be sent back to the client.

This will capture any printk output, whether it is stack output or not, since
we can't tell the difference.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/mconsole_kern.c | 87 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 81 insertions(+), 6 deletions(-)

diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index b5217bd..e9bbc14 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -20,6 +20,7 @@
 #include "linux/namei.h"
 #include "linux/proc_fs.h"
 #include "linux/syscalls.h"
+#include "linux/console.h"
 #include "asm/irq.h"
 #include "asm/uaccess.h"
 #include "user_util.h"
@@ -480,6 +481,82 @@ void mconsole_sysrq(struct mc_request *req)
 }
 #endif
 
+static DEFINE_SPINLOCK(console_lock);
+static LIST_HEAD(clients);
+static char console_buf[MCONSOLE_MAX_DATA];
+static int console_index = 0;
+
+static void console_write(struct console *console, const char *string,
+			  unsigned len)
+{
+	struct list_head *ele;
+	int n;
+
+	if(list_empty(&clients))
+		return;
+
+	while(1){
+		n = min(len, ARRAY_SIZE(console_buf) - console_index);
+		strncpy(&console_buf[console_index], string, n);
+		console_index += n;
+		string += n;
+		len -= n;
+		if(len == 0)
+			return;
+
+		list_for_each(ele, &clients){
+			struct mconsole_entry *entry;
+
+			entry = list_entry(ele, struct mconsole_entry, list);
+			mconsole_reply_len(&entry->request, console_buf,
+					   console_index, 0, 1);
+		}
+
+		console_index = 0;
+	}
+}
+
+static struct console mc_console = { .name	= "mc",
+				     .write	= console_write,
+				     .flags	= CON_PRINTBUFFER | CON_ENABLED,
+				     .index	= -1 };
+
+static int mc_add_console(void)
+{
+	register_console(&mc_console);
+	return 0;
+}
+
+late_initcall(mc_add_console);
+
+static void with_console(struct mc_request *req, void (*proc)(void *),
+			 void *arg)
+{
+	struct mconsole_entry entry;
+	unsigned long flags;
+
+	INIT_LIST_HEAD(&entry.list);
+	entry.request = *req;
+	list_add(&entry.list, &clients);
+	spin_lock_irqsave(&console_lock, flags);
+
+	(*proc)(arg);
+
+	mconsole_reply_len(req, console_buf, console_index, 0, 0);
+	console_index = 0;
+
+	spin_unlock_irqrestore(&console_lock, flags);
+	list_del(&entry.list);
+}
+
+static void stack_proc(void *arg)
+{
+	struct task_struct *from = current, *to = arg;
+
+	to->thread.saved_task = from;
+	switch_to(from, to, from);
+}
+
 /* Mconsole stack trace
  *  Added by Allan Graves, Jeff Dike
  *  Dumps a stacks registers to the linux console.
@@ -489,7 +566,7 @@ void do_stack(struct mc_request *req)
 {
         char *ptr = req->request.data;
         int pid_requested= -1;
-        struct task_struct *from = NULL;
+	struct task_struct *from = NULL;
 	struct task_struct *to = NULL;
 
         /* Would be nice:
@@ -507,17 +584,15 @@ void do_stack(struct mc_request *req)
                 return;
         }
 
-        from = current;
-        to = find_task_by_pid(pid_requested);
+	from = current;
 
+	to = find_task_by_pid(pid_requested);
         if((to == NULL) || (pid_requested == 0)) {
                 mconsole_reply(req, "Couldn't find that pid", 1, 0);
                 return;
         }
-        to->thread.saved_task = current;
 
-        switch_to(from, to, from);
-        mconsole_reply(req, "Stack Dumped to console and message log", 0, 0);
+	with_console(req, stack_proc, to);
 }
 
 void mconsole_stack(struct mc_request *req)
-- 
cgit v1.1


From 4111b025dc64f33803d2147565147428dc51d014 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:19:05 -0800
Subject: [PATCH] uml: capture printk output for mconsole sysrq

Pass sysrq output back to the mconsole client using the mechanism
introduced for stack output.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/mconsole_kern.c | 48 +++++++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index e9bbc14..8b453a7 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -463,24 +463,6 @@ void mconsole_remove(struct mc_request *req)
 	mconsole_reply(req, err_msg, err, 0);
 }
 
-#ifdef CONFIG_MAGIC_SYSRQ
-void mconsole_sysrq(struct mc_request *req)
-{
-	char *ptr = req->request.data;
-
-	ptr += strlen("sysrq");
-	while(isspace(*ptr)) ptr++;
-
-	mconsole_reply(req, "", 0, 0);
-	handle_sysrq(*ptr, &current->thread.regs, NULL);
-}
-#else
-void mconsole_sysrq(struct mc_request *req)
-{
-	mconsole_reply(req, "Sysrq not compiled in", 1, 0);
-}
-#endif
-
 static DEFINE_SPINLOCK(console_lock);
 static LIST_HEAD(clients);
 static char console_buf[MCONSOLE_MAX_DATA];
@@ -549,6 +531,36 @@ static void with_console(struct mc_request *req, void (*proc)(void *),
 	list_del(&entry.list);
 }
 
+#ifdef CONFIG_MAGIC_SYSRQ
+static void sysrq_proc(void *arg)
+{
+	char *op = arg;
+
+	handle_sysrq(*op, &current->thread.regs, NULL);
+}
+
+void mconsole_sysrq(struct mc_request *req)
+{
+	char *ptr = req->request.data;
+
+	ptr += strlen("sysrq");
+	while(isspace(*ptr)) ptr++;
+
+	/* With 'b', the system will shut down without a chance to reply,
+	 * so in this case, we reply first.
+	 */
+	if(*ptr == 'b')
+		mconsole_reply(req, "", 0, 0);
+
+	with_console(req, sysrq_proc, ptr);
+}
+#else
+void mconsole_sysrq(struct mc_request *req)
+{
+	mconsole_reply(req, "Sysrq not compiled in", 1, 0);
+}
+#endif
+
 static void stack_proc(void *arg)
 {
 	struct task_struct *from = current, *to = arg;
-- 
cgit v1.1


From 3a331a511a2fe522034f3958eecf58751be434ac Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:19:05 -0800
Subject: [PATCH] uml: fix whitespace in mconsole driver

Fix up some bogus spacing in the mconsole driver.  Also delete the
emacs formatting comment at the end.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/mconsole_kern.c | 96 ++++++++++++++++++-----------------------
 1 file changed, 42 insertions(+), 54 deletions(-)

diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 8b453a7..be61012 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -422,7 +422,7 @@ void mconsole_remove(struct mc_request *req)
 {
 	struct mc_device *dev;
 	char *ptr = req->request.data, *err_msg = "";
-        char error[256];
+	char error[256];
 	int err, start, end, n;
 
 	ptr += strlen("remove");
@@ -433,33 +433,33 @@ void mconsole_remove(struct mc_request *req)
 		return;
 	}
 
-        ptr = &ptr[strlen(dev->name)];
-
-        err = 1;
-        n = (*dev->id)(&ptr, &start, &end);
-        if(n < 0){
-                err_msg = "Couldn't parse device number";
-                goto out;
-        }
-        else if((n < start) || (n > end)){
-                sprintf(error, "Invalid device number - must be between "
-                        "%d and %d", start, end);
-                err_msg = error;
-                goto out;
-        }
+	ptr = &ptr[strlen(dev->name)];
+
+	err = 1;
+	n = (*dev->id)(&ptr, &start, &end);
+	if(n < 0){
+		err_msg = "Couldn't parse device number";
+		goto out;
+	}
+	else if((n < start) || (n > end)){
+		sprintf(error, "Invalid device number - must be between "
+			"%d and %d", start, end);
+		err_msg = error;
+		goto out;
+	}
 
 	err = (*dev->remove)(n);
-        switch(err){
-        case -ENODEV:
-                err_msg = "Device doesn't exist";
-                break;
-        case -EBUSY:
-                err_msg = "Device is currently open";
-                break;
-        default:
-                break;
-        }
- out:
+	switch(err){
+	case -ENODEV:
+		err_msg = "Device doesn't exist";
+		break;
+	case -EBUSY:
+		err_msg = "Device is currently open";
+		break;
+	default:
+		break;
+	}
+out:
 	mconsole_reply(req, err_msg, err, 0);
 }
 
@@ -576,34 +576,33 @@ static void stack_proc(void *arg)
  */
 void do_stack(struct mc_request *req)
 {
-        char *ptr = req->request.data;
-        int pid_requested= -1;
+	char *ptr = req->request.data;
+	int pid_requested= -1;
 	struct task_struct *from = NULL;
 	struct task_struct *to = NULL;
 
-        /* Would be nice:
-         * 1) Send showregs output to mconsole.
+	/* Would be nice:
+	 * 1) Send showregs output to mconsole.
 	 * 2) Add a way to stack dump all pids.
 	 */
 
-        ptr += strlen("stack");
-        while(isspace(*ptr)) ptr++;
+	ptr += strlen("stack");
+	while(isspace(*ptr)) ptr++;
 
-        /* Should really check for multiple pids or reject bad args here */
-        /* What do the arguments in mconsole_reply mean? */
-        if(sscanf(ptr, "%d", &pid_requested) == 0){
-                mconsole_reply(req, "Please specify a pid", 1, 0);
-                return;
-        }
+	/* Should really check for multiple pids or reject bad args here */
+	/* What do the arguments in mconsole_reply mean? */
+	if(sscanf(ptr, "%d", &pid_requested) == 0){
+		mconsole_reply(req, "Please specify a pid", 1, 0);
+		return;
+	}
 
 	from = current;
 
 	to = find_task_by_pid(pid_requested);
-        if((to == NULL) || (pid_requested == 0)) {
-                mconsole_reply(req, "Couldn't find that pid", 1, 0);
-                return;
-        }
-
+	if((to == NULL) || (pid_requested == 0)) {
+		mconsole_reply(req, "Couldn't find that pid", 1, 0);
+		return;
+	}
 	with_console(req, stack_proc, to);
 }
 
@@ -772,14 +771,3 @@ char *mconsole_notify_socket(void)
 }
 
 EXPORT_SYMBOL(mconsole_notify_socket);
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
-- 
cgit v1.1


From 8d93c700a489eba08514222df414a23852a85d2b Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Fri, 6 Jan 2006 00:19:06 -0800
Subject: [PATCH] uml: free network IRQ correctly

Free the network IRQ when closing down the network devices at shutdown.
Delete the device from the opened devices list on close.

These changes prevent an -EBADF error when later disabling SIGIO on all extant
descriptors and a complaint from free_irq about freeing the IRQ twice.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/net_kern.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
index deb2482..fb1f9fb 100644
--- a/arch/um/drivers/net_kern.c
+++ b/arch/um/drivers/net_kern.c
@@ -150,6 +150,7 @@ static int uml_net_close(struct net_device *dev)
 	if(lp->close != NULL)
 		(*lp->close)(lp->fd, &lp->user);
 	lp->fd = -1;
+	list_del(&lp->list);
 
 	spin_unlock(&lp->lock);
 	return 0;
@@ -715,6 +716,7 @@ static void close_devices(void)
 
 	list_for_each(ele, &opened){
 		lp = list_entry(ele, struct uml_net_private, list);
+		free_irq(lp->dev->irq, lp->dev);
 		if((lp->close != NULL) && (lp->fd >= 0))
 			(*lp->close)(lp->fd, &lp->user);
 		if(lp->remove != NULL) (*lp->remove)(&lp->user);
-- 
cgit v1.1


From 973bd9937569146de0917f54f05b2942f8257912 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:07 -0800
Subject: [PATCH] s390: atomic primitives

      Hugh Dickins <hugh@veritas.com>

Fix the broken atomic_cmpxchg primitive.  Add atomic_sub_and_test,
atomic64_sub_return, atomic64_sub_and_test, atomic64_cmpxchg,
atomic64_add_unless and atomic64_inc_not_zero.  Replace old style
atomic_compare_and_swap by atomic_cmpxchg.  Shorten the whole header by
defining most primitives with the two inline functions atomic_add_return and
atomic_sub_return.

In addition this patch contains the s390 related fixes of Hugh's "mm: fill
arch atomic64 gaps" patch.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/kernel/machine_kexec.c |   2 +-
 arch/s390/kernel/smp.c           |   6 +-
 drivers/s390/block/dasd.c        |   4 +-
 drivers/s390/char/sclp_quiesce.c |   2 +-
 drivers/s390/char/tape_block.c   |   2 +-
 drivers/s390/cio/ccwgroup.c      |   6 +-
 drivers/s390/cio/device.c        |   4 +-
 drivers/s390/net/iucv.c          |   8 +-
 drivers/s390/net/qeth_main.c     |  20 ++---
 include/asm-s390/atomic.h        | 173 ++++++++++++++++-----------------------
 10 files changed, 96 insertions(+), 131 deletions(-)

diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index 5aa71b0..f0ed5c6 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -85,7 +85,7 @@ kexec_halt_all_cpus(void *kernel_image)
 		pfault_fini();
 #endif
 
-	if (atomic_compare_and_swap(-1, smp_processor_id(), &cpuid))
+	if (atomic_cmpxchg(&cpuid, -1, smp_processor_id()) != -1)
 		signal_processor(smp_processor_id(), sigp_stop);
 
 	/* Wait for all other cpus to enter stopped state */
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 5856b3f..bd5b311 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -263,7 +263,7 @@ static void do_machine_restart(void * __unused)
 	int cpu;
 	static atomic_t cpuid = ATOMIC_INIT(-1);
 
-	if (atomic_compare_and_swap(-1, smp_processor_id(), &cpuid))
+	if (atomic_cmpxchg(&cpuid, -1, smp_processor_id()) != -1)
 		signal_processor(smp_processor_id(), sigp_stop);
 
 	/* Wait for all other cpus to enter stopped state */
@@ -313,7 +313,7 @@ static void do_machine_halt(void * __unused)
 {
 	static atomic_t cpuid = ATOMIC_INIT(-1);
 
-	if (atomic_compare_and_swap(-1, smp_processor_id(), &cpuid) == 0) {
+	if (atomic_cmpxchg(&cpuid, -1, smp_processor_id()) == -1) {
 		smp_send_stop();
 		if (MACHINE_IS_VM && strlen(vmhalt_cmd) > 0)
 			cpcmd(vmhalt_cmd, NULL, 0, NULL);
@@ -332,7 +332,7 @@ static void do_machine_power_off(void * __unused)
 {
 	static atomic_t cpuid = ATOMIC_INIT(-1);
 
-	if (atomic_compare_and_swap(-1, smp_processor_id(), &cpuid) == 0) {
+	if (atomic_cmpxchg(&cpuid, -1, smp_processor_id()) == -1) {
 		smp_send_stop();
 		if (MACHINE_IS_VM && strlen(vmpoff_cmd) > 0)
 			cpcmd(vmpoff_cmd, NULL, 0, NULL);
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 7008d32..6278739 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -7,7 +7,7 @@
  * Bugreports.to..: <Linux390@de.ibm.com>
  * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999-2001
  *
- * $Revision: 1.167 $
+ * $Revision: 1.169 $
  */
 
 #include <linux/config.h>
@@ -1323,7 +1323,7 @@ void
 dasd_schedule_bh(struct dasd_device * device)
 {
 	/* Protect against rescheduling. */
-	if (atomic_compare_and_swap (0, 1, &device->tasklet_scheduled))
+	if (atomic_cmpxchg (&device->tasklet_scheduled, 0, 1) != 0)
 		return;
 	dasd_get_device(device);
 	tasklet_hi_schedule(&device->tasklet);
diff --git a/drivers/s390/char/sclp_quiesce.c b/drivers/s390/char/sclp_quiesce.c
index 83f7577..56fa691 100644
--- a/drivers/s390/char/sclp_quiesce.c
+++ b/drivers/s390/char/sclp_quiesce.c
@@ -32,7 +32,7 @@ do_load_quiesce_psw(void * __unused)
 	psw_t quiesce_psw;
 	int cpu;
 
-	if (atomic_compare_and_swap(-1, smp_processor_id(), &cpuid))
+	if (atomic_cmpxchg(&cpuid, -1, smp_processor_id()) != -1)
 		signal_processor(smp_processor_id(), sigp_stop);
 	/* Wait for all other cpus to enter stopped state */
 	for_each_online_cpu(cpu) {
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index 1efc9f2..482e07e 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -65,7 +65,7 @@ static void
 tapeblock_trigger_requeue(struct tape_device *device)
 {
 	/* Protect against rescheduling. */
-	if (atomic_compare_and_swap(0, 1, &device->blk_data.requeue_scheduled))
+	if (atomic_cmpxchg(&device->blk_data.requeue_scheduled, 0, 1) != 0)
 		return;
 	schedule_work(&device->blk_data.requeue_task);
 }
diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c
index be9d2d6..e849289 100644
--- a/drivers/s390/cio/ccwgroup.c
+++ b/drivers/s390/cio/ccwgroup.c
@@ -1,7 +1,7 @@
 /*
  *  drivers/s390/cio/ccwgroup.c
  *  bus driver for ccwgroup
- *   $Revision: 1.32 $
+ *   $Revision: 1.33 $
  *
  *    Copyright (C) 2002 IBM Deutschland Entwicklung GmbH,
  *                       IBM Corporation
@@ -263,7 +263,7 @@ ccwgroup_set_online(struct ccwgroup_device *gdev)
 	struct ccwgroup_driver *gdrv;
 	int ret;
 
-	if (atomic_compare_and_swap(0, 1, &gdev->onoff))
+	if (atomic_cmpxchg(&gdev->onoff, 0, 1) != 0)
 		return -EAGAIN;
 	if (gdev->state == CCWGROUP_ONLINE) {
 		ret = 0;
@@ -289,7 +289,7 @@ ccwgroup_set_offline(struct ccwgroup_device *gdev)
 	struct ccwgroup_driver *gdrv;
 	int ret;
 
-	if (atomic_compare_and_swap(0, 1, &gdev->onoff))
+	if (atomic_cmpxchg(&gdev->onoff, 0, 1) != 0)
 		return -EAGAIN;
 	if (gdev->state == CCWGROUP_OFFLINE) {
 		ret = 0;
diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index 85908ca..0590cff 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -1,7 +1,7 @@
 /*
  *  drivers/s390/cio/device.c
  *  bus driver for ccw devices
- *   $Revision: 1.131 $
+ *   $Revision: 1.137 $
  *
  *    Copyright (C) 2002 IBM Deutschland Entwicklung GmbH,
  *			 IBM Corporation
@@ -374,7 +374,7 @@ online_store (struct device *dev, struct device_attribute *attr, const char *buf
 	int i, force, ret;
 	char *tmp;
 
-	if (atomic_compare_and_swap(0, 1, &cdev->private->onoff))
+	if (atomic_cmpxchg(&cdev->private->onoff, 0, 1) != 0)
 		return -EAGAIN;
 
 	if (cdev->drv && !try_module_get(cdev->drv->owner)) {
diff --git a/drivers/s390/net/iucv.c b/drivers/s390/net/iucv.c
index df7647c..ecb2f8f 100644
--- a/drivers/s390/net/iucv.c
+++ b/drivers/s390/net/iucv.c
@@ -1,5 +1,5 @@
 /* 
- * $Id: iucv.c,v 1.45 2005/04/26 22:59:06 braunu Exp $
+ * $Id: iucv.c,v 1.47 2005/11/21 11:35:22 mschwide Exp $
  *
  * IUCV network driver
  *
@@ -29,7 +29,7 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  *
- * RELEASE-TAG: IUCV lowlevel driver $Revision: 1.45 $
+ * RELEASE-TAG: IUCV lowlevel driver $Revision: 1.47 $
  *
  */
 
@@ -355,7 +355,7 @@ do { \
 static void
 iucv_banner(void)
 {
-	char vbuf[] = "$Revision: 1.45 $";
+	char vbuf[] = "$Revision: 1.47 $";
 	char *version = vbuf;
 
 	if ((version = strchr(version, ':'))) {
@@ -477,7 +477,7 @@ grab_param(void)
 		ptr++;
 		if (ptr >= iucv_param_pool + PARAM_POOL_SIZE)
 			ptr = iucv_param_pool;
-	} while (atomic_compare_and_swap(0, 1, &ptr->in_use));
+	} while (atomic_cmpxchg(&ptr->in_use, 0, 1) != 0);
 	hint = ptr - iucv_param_pool;
 
 	memset(&ptr->param, 0, sizeof(ptr->param));
diff --git a/drivers/s390/net/qeth_main.c b/drivers/s390/net/qeth_main.c
index f8f55cc..7b2663f 100644
--- a/drivers/s390/net/qeth_main.c
+++ b/drivers/s390/net/qeth_main.c
@@ -1396,7 +1396,7 @@ qeth_idx_activate_get_answer(struct qeth_channel *channel,
 	channel->ccw.cda = (__u32) __pa(iob->data);
 
 	wait_event(card->wait_q,
-		   atomic_compare_and_swap(0,1,&channel->irq_pending) == 0);
+		   atomic_cmpxchg(&channel->irq_pending, 0, 1) == 0);
 	QETH_DBF_TEXT(setup, 6, "noirqpnd");
 	spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags);
 	rc = ccw_device_start(channel->ccwdev,
@@ -1463,7 +1463,7 @@ qeth_idx_activate_channel(struct qeth_channel *channel,
 	memcpy(QETH_IDX_ACT_QDIO_DEV_REALADDR(iob->data), &temp, 2);
 
 	wait_event(card->wait_q,
-		   atomic_compare_and_swap(0,1,&channel->irq_pending) == 0);
+		   atomic_cmpxchg(&channel->irq_pending, 0, 1) == 0);
 	QETH_DBF_TEXT(setup, 6, "noirqpnd");
 	spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags);
 	rc = ccw_device_start(channel->ccwdev,
@@ -1616,7 +1616,7 @@ qeth_issue_next_read(struct qeth_card *card)
 	}
 	qeth_setup_ccw(&card->read, iob->data, QETH_BUFSIZE);
 	wait_event(card->wait_q,
-		   atomic_compare_and_swap(0,1,&card->read.irq_pending) == 0);
+		   atomic_cmpxchg(&card->read.irq_pending, 0, 1) == 0);
 	QETH_DBF_TEXT(trace, 6, "noirqpnd");
 	rc = ccw_device_start(card->read.ccwdev, &card->read.ccw,
 			      (addr_t) iob, 0, 0);
@@ -1882,7 +1882,7 @@ qeth_send_control_data(struct qeth_card *card, int len,
 	spin_unlock_irqrestore(&card->lock, flags);
 	QETH_DBF_HEX(control, 2, iob->data, QETH_DBF_CONTROL_LEN);
 	wait_event(card->wait_q,
-		   atomic_compare_and_swap(0,1,&card->write.irq_pending) == 0);
+		   atomic_cmpxchg(&card->write.irq_pending, 0, 1) == 0);
 	qeth_prepare_control_data(card, len, iob);
 	if (IS_IPA(iob->data))
 		timer.expires = jiffies + QETH_IPA_TIMEOUT;
@@ -1924,7 +1924,7 @@ qeth_osn_send_control_data(struct qeth_card *card, int len,
 	QETH_DBF_TEXT(trace, 5, "osndctrd");
 
 	wait_event(card->wait_q,
-		   atomic_compare_and_swap(0,1,&card->write.irq_pending) == 0);
+		   atomic_cmpxchg(&card->write.irq_pending, 0, 1) == 0);
 	qeth_prepare_control_data(card, len, iob);
 	QETH_DBF_TEXT(trace, 6, "osnoirqp");
 	spin_lock_irqsave(get_ccwdev_lock(card->write.ccwdev), flags);
@@ -4236,9 +4236,8 @@ qeth_do_send_packet_fast(struct qeth_card *card, struct qeth_qdio_out_q *queue,
 	QETH_DBF_TEXT(trace, 6, "dosndpfa");
 
 	/* spin until we get the queue ... */
-	while (atomic_compare_and_swap(QETH_OUT_Q_UNLOCKED,
-				       QETH_OUT_Q_LOCKED,
-				       &queue->state));
+	while (atomic_cmpxchg(&queue->state, QETH_OUT_Q_UNLOCKED,
+			      QETH_OUT_Q_LOCKED) != QETH_OUT_Q_UNLOCKED);
 	/* ... now we've got the queue */
 	index = queue->next_buf_to_fill;
 	buffer = &queue->bufs[queue->next_buf_to_fill];
@@ -4292,9 +4291,8 @@ qeth_do_send_packet(struct qeth_card *card, struct qeth_qdio_out_q *queue,
 	QETH_DBF_TEXT(trace, 6, "dosndpkt");
 
 	/* spin until we get the queue ... */
-	while (atomic_compare_and_swap(QETH_OUT_Q_UNLOCKED,
-				       QETH_OUT_Q_LOCKED,
-				       &queue->state));
+	while (atomic_cmpxchg(&queue->state, QETH_OUT_Q_UNLOCKED,
+			      QETH_OUT_Q_LOCKED) != QETH_OUT_Q_UNLOCKED);
 	start_index = queue->next_buf_to_fill;
 	buffer = &queue->bufs[queue->next_buf_to_fill];
 	/*
diff --git a/include/asm-s390/atomic.h b/include/asm-s390/atomic.h
index 6d07c7d..d82aedf 100644
--- a/include/asm-s390/atomic.h
+++ b/include/asm-s390/atomic.h
@@ -5,7 +5,7 @@
  *  include/asm-s390/atomic.h
  *
  *  S390 version
- *    Copyright (C) 1999-2003 IBM Deutschland Entwicklung GmbH, IBM Corporation
+ *    Copyright (C) 1999-2005 IBM Deutschland Entwicklung GmbH, IBM Corporation
  *    Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com),
  *               Denis Joseph Barrow,
  *		 Arnd Bergmann (arndb@de.ibm.com)
@@ -45,59 +45,57 @@ typedef struct {
 #define atomic_read(v)          ((v)->counter)
 #define atomic_set(v,i)         (((v)->counter) = (i))
 
-static __inline__ void atomic_add(int i, atomic_t * v)
-{
-	       __CS_LOOP(v, i, "ar");
-}
 static __inline__ int atomic_add_return(int i, atomic_t * v)
 {
 	return __CS_LOOP(v, i, "ar");
 }
-static __inline__ int atomic_add_negative(int i, atomic_t * v)
-{
-	return __CS_LOOP(v, i, "ar") < 0;
-}
-static __inline__ void atomic_sub(int i, atomic_t * v)
-{
-	       __CS_LOOP(v, i, "sr");
-}
+#define atomic_add(_i, _v)		atomic_add_return(_i, _v)
+#define atomic_add_negative(_i, _v)	(atomic_add_return(_i, _v) < 0)
+#define atomic_inc(_v)			atomic_add_return(1, _v)
+#define atomic_inc_return(_v)		atomic_add_return(1, _v)
+#define atomic_inc_and_test(_v)		(atomic_add_return(1, _v) == 0)
+
 static __inline__ int atomic_sub_return(int i, atomic_t * v)
 {
 	return __CS_LOOP(v, i, "sr");
 }
-static __inline__ void atomic_inc(volatile atomic_t * v)
-{
-	       __CS_LOOP(v, 1, "ar");
-}
-static __inline__ int atomic_inc_return(volatile atomic_t * v)
-{
-	return __CS_LOOP(v, 1, "ar");
-}
+#define atomic_sub(_i, _v)		atomic_sub_return(_i, _v)
+#define atomic_sub_and_test(_i, _v)	(atomic_sub_return(_i, _v) == 0)
+#define atomic_dec(_v)			atomic_sub_return(1, _v)
+#define atomic_dec_return(_v)		atomic_sub_return(1, _v)
+#define atomic_dec_and_test(_v)		(atomic_sub_return(1, _v) == 0)
 
-static __inline__ int atomic_inc_and_test(volatile atomic_t * v)
-{
-	return __CS_LOOP(v, 1, "ar") == 0;
-}
-static __inline__ void atomic_dec(volatile atomic_t * v)
-{
-	       __CS_LOOP(v, 1, "sr");
-}
-static __inline__ int atomic_dec_return(volatile atomic_t * v)
-{
-	return __CS_LOOP(v, 1, "sr");
-}
-static __inline__ int atomic_dec_and_test(volatile atomic_t * v)
-{
-	return __CS_LOOP(v, 1, "sr") == 0;
-}
 static __inline__ void atomic_clear_mask(unsigned long mask, atomic_t * v)
 {
 	       __CS_LOOP(v, ~mask, "nr");
 }
+
 static __inline__ void atomic_set_mask(unsigned long mask, atomic_t * v)
 {
 	       __CS_LOOP(v, mask, "or");
 }
+
+static __inline__ int atomic_cmpxchg(atomic_t *v, int old, int new)
+{
+	__asm__ __volatile__("  cs   %0,%3,0(%2)\n"
+			     : "+d" (old), "=m" (v->counter)
+			     : "a" (v), "d" (new), "m" (v->counter)
+			     : "cc", "memory" );
+	return old;
+}
+
+static __inline__ int atomic_add_unless(atomic_t *v, int a, int u)
+{
+	int c, old;
+
+	c = atomic_read(v);
+	while (c != u && (old = atomic_cmpxchg(v, c, c + a)) != c)
+		c = old;
+	return c != u;
+}
+
+#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
+
 #undef __CS_LOOP
 
 #ifdef __s390x__
@@ -123,92 +121,61 @@ typedef struct {
 #define atomic64_read(v)          ((v)->counter)
 #define atomic64_set(v,i)         (((v)->counter) = (i))
 
-static __inline__ void atomic64_add(long long i, atomic64_t * v)
-{
-	       __CSG_LOOP(v, i, "agr");
-}
 static __inline__ long long atomic64_add_return(long long i, atomic64_t * v)
 {
 	return __CSG_LOOP(v, i, "agr");
 }
-static __inline__ long long atomic64_add_negative(long long i, atomic64_t * v)
-{
-	return __CSG_LOOP(v, i, "agr") < 0;
-}
-static __inline__ void atomic64_sub(long long i, atomic64_t * v)
-{
-	       __CSG_LOOP(v, i, "sgr");
-}
-static __inline__ void atomic64_inc(volatile atomic64_t * v)
-{
-	       __CSG_LOOP(v, 1, "agr");
-}
-static __inline__ long long atomic64_inc_return(volatile atomic64_t * v)
-{
-	return __CSG_LOOP(v, 1, "agr");
-}
-static __inline__ long long atomic64_inc_and_test(volatile atomic64_t * v)
-{
-	return __CSG_LOOP(v, 1, "agr") == 0;
-}
-static __inline__ void atomic64_dec(volatile atomic64_t * v)
-{
-	       __CSG_LOOP(v, 1, "sgr");
-}
-static __inline__ long long atomic64_dec_return(volatile atomic64_t * v)
-{
-	return __CSG_LOOP(v, 1, "sgr");
-}
-static __inline__ long long atomic64_dec_and_test(volatile atomic64_t * v)
+#define atomic64_add(_i, _v)		atomic64_add_return(_i, _v)
+#define atomic64_add_negative(_i, _v)	(atomic64_add_return(_i, _v) < 0)
+#define atomic64_inc(_v)		atomic64_add_return(1, _v)
+#define atomic64_inc_return(_v)		atomic64_add_return(1, _v)
+#define atomic64_inc_and_test(_v)	(atomic64_add_return(1, _v) == 0)
+
+static __inline__ long long atomic64_sub_return(long long i, atomic64_t * v)
 {
-	return __CSG_LOOP(v, 1, "sgr") == 0;
+	return __CSG_LOOP(v, i, "sgr");
 }
+#define atomic64_sub(_i, _v)		atomic64_sub_return(_i, _v)
+#define atomic64_sub_and_test(_i, _v)	(atomic64_sub_return(_i, _v) == 0)
+#define atomic64_dec(_v)		atomic64_sub_return(1, _v)
+#define atomic64_dec_return(_v)		atomic64_sub_return(1, _v)
+#define atomic64_dec_and_test(_v)	(atomic64_sub_return(1, _v) == 0)
+
 static __inline__ void atomic64_clear_mask(unsigned long mask, atomic64_t * v)
 {
 	       __CSG_LOOP(v, ~mask, "ngr");
 }
+
 static __inline__ void atomic64_set_mask(unsigned long mask, atomic64_t * v)
 {
 	       __CSG_LOOP(v, mask, "ogr");
 }
 
-#undef __CSG_LOOP
-#endif
-
-/*
-  returns 0  if expected_oldval==value in *v ( swap was successful )
-  returns 1  if unsuccessful.
+static __inline__ long long atomic64_cmpxchg(atomic64_t *v,
+					     long long old, long long new)
+{
+	__asm__ __volatile__("  csg  %0,%3,0(%2)\n"
+			     : "+d" (old), "=m" (v->counter)
+			     : "a" (v), "d" (new), "m" (v->counter)
+			     : "cc", "memory" );
+	return old;
+}
 
-  This is non-portable, use bitops or spinlocks instead!
-*/
-static __inline__ int
-atomic_compare_and_swap(int expected_oldval,int new_val,atomic_t *v)
+static __inline__ int atomic64_add_unless(atomic64_t *v,
+					  long long a, long long u)
 {
-        int retval;
-
-        __asm__ __volatile__(
-                "  lr   %0,%3\n"
-                "  cs   %0,%4,0(%2)\n"
-                "  ipm  %0\n"
-                "  srl  %0,28\n"
-                "0:"
-                : "=&d" (retval), "=m" (v->counter)
-                : "a" (v), "d" (expected_oldval) , "d" (new_val),
-		  "m" (v->counter) : "cc", "memory" );
-        return retval;
+	long long c, old;
+
+	c = atomic64_read(v);
+	while (c != u && (old = atomic64_cmpxchg(v, c, c + a)) != c)
+		c = old;
+	return c != u;
 }
 
-#define atomic_cmpxchg(v, o, n) (atomic_compare_and_swap((o), (n), &((v)->counter)))
+#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
 
-#define atomic_add_unless(v, a, u)				\
-({								\
-	int c, old;						\
-	c = atomic_read(v);					\
-	while (c != (u) && (old = atomic_cmpxchg((v), c, c + (a))) != c) \
-		c = old;					\
-	c != (u);						\
-})
-#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
+#undef __CSG_LOOP
+#endif
 
 #define smp_mb__before_atomic_dec()	smp_mb()
 #define smp_mb__after_atomic_dec()	smp_mb()
-- 
cgit v1.1


From 56dc6a88ec76019e0d0729165cb5b98536270e1d Mon Sep 17 00:00:00 2001
From: Peter Oberparleiter <peter.oberparleiter@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:09 -0800
Subject: [PATCH] s390: cms volume label definitions

Move the definition of the CMS volume label to vtoc.h and modify
partitions/ibm.c to use this volume label definition instead of an anonymous
array.

Signed-off-by: Peter Oberparleiter <peter.oberparleiter@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/s390/block/dasd_diag.c |  7 ++++---
 drivers/s390/block/dasd_diag.h | 23 +----------------------
 fs/partitions/ibm.c            | 30 +++++++++++++++---------------
 include/asm-s390/vtoc.h        | 24 ++++++++++++++++++++++++
 4 files changed, 44 insertions(+), 40 deletions(-)

diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c
index ab8754e..16c4b7d 100644
--- a/drivers/s390/block/dasd_diag.c
+++ b/drivers/s390/block/dasd_diag.c
@@ -6,7 +6,7 @@
  * Bugreports.to..: <Linux390@de.ibm.com>
  * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000
  *
- * $Revision: 1.51 $
+ * $Revision: 1.52 $
  */
 
 #include <linux/config.h>
@@ -25,6 +25,7 @@
 #include <asm/io.h>
 #include <asm/s390_ext.h>
 #include <asm/todclk.h>
+#include <asm/vtoc.h>
 
 #include "dasd_int.h"
 #include "dasd_diag.h"
@@ -329,7 +330,7 @@ dasd_diag_check_device(struct dasd_device *device)
 	struct dasd_diag_private *private;
 	struct dasd_diag_characteristics *rdc_data;
 	struct dasd_diag_bio bio;
-	struct dasd_diag_cms_label *label;
+	struct vtoc_cms_label *label;
 	blocknum_t end_block;
 	unsigned int sb, bsize;
 	int rc;
@@ -380,7 +381,7 @@ dasd_diag_check_device(struct dasd_device *device)
 	mdsk_term_io(device);
 
 	/* figure out blocksize of device */
-	label = (struct dasd_diag_cms_label *) get_zeroed_page(GFP_KERNEL);
+	label = (struct vtoc_cms_label *) get_zeroed_page(GFP_KERNEL);
 	if (label == NULL)  {
 		DEV_MESSAGE(KERN_WARNING, device, "%s",
 			    "No memory to allocate initialization request");
diff --git a/drivers/s390/block/dasd_diag.h b/drivers/s390/block/dasd_diag.h
index df31484..37edf6e 100644
--- a/drivers/s390/block/dasd_diag.h
+++ b/drivers/s390/block/dasd_diag.h
@@ -6,7 +6,7 @@
  * Bugreports.to..: <Linux390@de.ibm.com>
  * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000
  *
- * $Revision: 1.8 $
+ * $Revision: 1.9 $
  */
 
 #define MDSK_WRITE_REQ 0x01
@@ -44,27 +44,6 @@ struct dasd_diag_characteristics {
 	u8 rdev_features;
 } __attribute__ ((packed, aligned(4)));
 
-struct dasd_diag_cms_label {
-	u8 label_id[4];
-	u8 vol_id[6];
-	u16 version_id;
-	u32 block_size;
-	u32 origin_ptr;
-	u32 usable_count;
-	u32 formatted_count;
-	u32 block_count;
-	u32 used_count;
-	u32 fst_size;
-	u32 fst_count;
-	u8 format_date[6];
-	u8 reserved1[2];
-	u32 disk_offset;
-	u32 map_block;
-	u32 hblk_disp;
-	u32 user_disp;
-	u8 reserved2[4];
-	u8 segment_name[8];
-} __attribute__ ((packed));
 
 #ifdef CONFIG_ARCH_S390X
 #define DASD_DIAG_FLAGA_DEFAULT		DASD_DIAG_FLAGA_FORMAT_64BIT
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 6327bcb..78010ad 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -56,7 +56,10 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	struct hd_geometry *geo;
 	char type[5] = {0,};
 	char name[7] = {0,};
-	struct vtoc_volume_label *vlabel;
+	union label_t {
+		struct vtoc_volume_label vol;
+		struct vtoc_cms_label cms;
+	} *label;
 	unsigned char *data;
 	Sector sect;
 
@@ -64,9 +67,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		goto out_noinfo;
 	if ((geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL)) == NULL)
 		goto out_nogeo;
-	if ((vlabel = kmalloc(sizeof(struct vtoc_volume_label),
-			      GFP_KERNEL)) == NULL)
-		goto out_novlab;
+	if ((label = kmalloc(sizeof(union label_t), GFP_KERNEL)) == NULL)
+		goto out_nolab;
 	
 	if (ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)info) != 0 ||
 	    ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0)
@@ -87,7 +89,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		strncpy(name, data + 8, 6);
 	else
 		strncpy(name, data + 4, 6);
-	memcpy (vlabel, data, sizeof(struct vtoc_volume_label));
+	memcpy(label, data, sizeof(union label_t));
 	put_dev_sector(sect);
 
 	EBCASC(type, 4);
@@ -100,14 +102,12 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		/*
 		 * VM style CMS1 labeled disk
 		 */
-		int *label = (int *) vlabel;
-
-		if (label[13] != 0) {
+		if (label->cms.disk_offset != 0) {
 			printk("CMS1/%8s(MDSK):", name);
 			/* disk is reserved minidisk */
-			blocksize = label[3];
-			offset = label[13];
-			size = (label[7] - 1)*(blocksize >> 9);
+			blocksize = label->cms.block_size;
+			offset = label->cms.disk_offset;
+			size = (label->cms.block_count - 1) * (blocksize >> 9);
 		} else {
 			printk("CMS1/%8s:", name);
 			offset = (info->label_block + 1);
@@ -126,7 +126,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		printk("VOL1/%8s:", name);
 
 		/* get block number and read then go through format1 labels */
-		blk = cchhb2blk(&vlabel->vtoc, geo) + 1;
+		blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
 		counter = 0;
 		while ((data = read_dev_sector(bdev, blk*(blocksize/512),
 					       &sect)) != NULL) {
@@ -174,7 +174,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	}
 
 	printk("\n");
-	kfree(vlabel);
+	kfree(label);
 	kfree(geo);
 	kfree(info);
 	return 1;
@@ -182,8 +182,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 out_readerr:
 out_badsect:
 out_noioctl:
-	kfree(vlabel);
-out_novlab:
+	kfree(label);
+out_nolab:
 	kfree(geo);
 out_nogeo:
 	kfree(info);
diff --git a/include/asm-s390/vtoc.h b/include/asm-s390/vtoc.h
index 41d369f..d1de5b7 100644
--- a/include/asm-s390/vtoc.h
+++ b/include/asm-s390/vtoc.h
@@ -176,4 +176,28 @@ struct vtoc_format7_label
 	struct vtoc_cchhb DS7PTRDS; /* pointer to next FMT7 DSCB */
 } __attribute__ ((packed));
 
+struct vtoc_cms_label {
+	u8 label_id[4];		/* Label identifier */
+	u8 vol_id[6];		/* Volid */
+	u16 version_id;		/* Version identifier */
+	u32 block_size;		/* Disk block size */
+	u32 origin_ptr;		/* Disk origin pointer */
+	u32 usable_count;	/* Number of usable cylinders/blocks */
+	u32 formatted_count;	/* Maximum number of formatted cylinders/
+				 * blocks */
+	u32 block_count;	/* Disk size in CMS blocks */
+	u32 used_count;		/* Number of CMS blocks in use */
+	u32 fst_size;		/* File Status Table (FST) size */
+	u32 fst_count;		/* Number of FSTs per CMS block */
+	u8 format_date[6];	/* Disk FORMAT date */
+	u8 reserved1[2];
+	u32 disk_offset;	/* Disk offset when reserved*/
+	u32 map_block;		/* Allocation Map Block with next hole */
+	u32 hblk_disp;		/* Displacement into HBLK data of next hole */
+	u32 user_disp;		/* Displacement into user part of Allocation
+				 * map */
+	u8 reserved2[4];
+	u8 segment_name[8];	/* Name of shared segment */
+} __attribute__ ((packed));
+
 #endif /* _ASM_S390_VTOC_H */
-- 
cgit v1.1


From a63a4931c301a14ca79c41fec0b99d898dbba1fb Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:09 -0800
Subject: [PATCH] s390: uaccess warnings

Convert __access_ok to an inline C function and change __get_user primitive to
avoid uaccess compiler warnings.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/kernel/compat_linux.c |  2 +-
 include/asm-s390/uaccess.h      | 14 ++++++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index ed877d0..41b197a 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -279,7 +279,7 @@ asmlinkage long sys32_getegid16(void)
 
 static inline long get_tv32(struct timeval *o, struct compat_timeval *i)
 {
-	return (!access_ok(VERIFY_READ, tv32, sizeof(*tv32)) ||
+	return (!access_ok(VERIFY_READ, o, sizeof(*o)) ||
 		(__get_user(o->tv_sec, &i->tv_sec) ||
 		 __get_user(o->tv_usec, &i->tv_usec)));
 }
diff --git a/include/asm-s390/uaccess.h b/include/asm-s390/uaccess.h
index 10a619d..be104f2 100644
--- a/include/asm-s390/uaccess.h
+++ b/include/asm-s390/uaccess.h
@@ -61,8 +61,10 @@
 #define segment_eq(a,b) ((a).ar4 == (b).ar4)
 
 
-#define __access_ok(addr,size) (1)
-
+static inline int __access_ok(const void *addr, unsigned long size)
+{
+	return 1;
+}
 #define access_ok(type,addr,size) __access_ok(addr,size)
 
 /*
@@ -206,25 +208,25 @@ extern int __put_user_bad(void) __attribute__((noreturn));
 	case 1: {						\
 		unsigned char __x;				\
 		__get_user_asm(__x, ptr, __gu_err);		\
-		(x) = (__typeof__(*(ptr))) __x;			\
+		(x) = *(__typeof__(*(ptr)) *) &__x;		\
 		break;						\
 	};							\
 	case 2: {						\
 		unsigned short __x;				\
 		__get_user_asm(__x, ptr, __gu_err);		\
-		(x) = (__typeof__(*(ptr))) __x;			\
+		(x) = *(__typeof__(*(ptr)) *) &__x;		\
 		break;						\
 	};							\
 	case 4: {						\
 		unsigned int __x;				\
 		__get_user_asm(__x, ptr, __gu_err);		\
-		(x) = (__typeof__(*(ptr))) __x;			\
+		(x) = *(__typeof__(*(ptr)) *) &__x;		\
 		break;						\
 	};							\
 	case 8: {						\
 		unsigned long long __x;				\
 		__get_user_asm(__x, ptr, __gu_err);		\
-		(x) = (__typeof__(*(ptr))) __x;			\
+		(x) = *(__typeof__(*(ptr)) *) &__x;		\
 		break;						\
 	};							\
 	default:						\
-- 
cgit v1.1


From 4e3df37e7fb4e41bec84465ff31949737160ed58 Mon Sep 17 00:00:00 2001
From: Cedric Le Goater <clg@fr.ibm.com>
Date: Fri, 6 Jan 2006 00:19:10 -0800
Subject: [PATCH] s390: rt_sigreturn fix

Check return code of do_sigaltstack and force a SIGSEGV if it is -EFAULT.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/kernel/compat_signal.c | 2 --
 arch/s390/kernel/signal.c        | 6 +++---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index 4ff6808..fa2b3bc 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -467,8 +467,6 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
 	if (err)
 		goto badframe; 
 
-	/* It is more difficult to avoid calling this function than to
-	   call it and ignore errors.  */
 	set_fs (KERNEL_DS);
 	do_sigaltstack((stack_t __user *)&st, NULL, regs->gprs[15]);
 	set_fs (old_fs);
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 6e0110d..13592d0 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -254,9 +254,9 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
 	if (restore_sigregs(regs, &frame->uc.uc_mcontext))
 		goto badframe;
 
-	/* It is more difficult to avoid calling this function than to
-	   call it and ignore errors.  */
-	do_sigaltstack(&frame->uc.uc_stack, NULL, regs->gprs[15]);
+	if (do_sigaltstack(&frame->uc.uc_stack, NULL,
+			   regs->gprs[15]) == -EFAULT)
+		goto badframe;
 	return regs->gprs[2];
 
 badframe:
-- 
cgit v1.1


From 088c4ec16aa6b865dcf690051ddac30eb2bf6bcc Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:11 -0800
Subject: [PATCH] s390: update default configuration

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/defconfig | 55 +++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 39 insertions(+), 16 deletions(-)

diff --git a/arch/s390/defconfig b/arch/s390/defconfig
index 45d44c6..0c495fe 100644
--- a/arch/s390/defconfig
+++ b/arch/s390/defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.14-rc1
-# Wed Sep 14 16:46:19 2005
+# Linux kernel version: 2.6.15-rc2
+# Mon Nov 21 13:51:30 2005
 #
 CONFIG_MMU=y
 CONFIG_RWSEM_XCHGADD_ALGORITHM=y
@@ -65,6 +65,24 @@ CONFIG_KMOD=y
 CONFIG_STOP_MACHINE=y
 
 #
+# Block layer
+#
+# CONFIG_LBD is not set
+
+#
+# IO Schedulers
+#
+CONFIG_IOSCHED_NOOP=y
+CONFIG_IOSCHED_AS=y
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+CONFIG_DEFAULT_AS=y
+# CONFIG_DEFAULT_DEADLINE is not set
+# CONFIG_DEFAULT_CFQ is not set
+# CONFIG_DEFAULT_NOOP is not set
+CONFIG_DEFAULT_IOSCHED="anticipatory"
+
+#
 # Base setup
 #
 
@@ -97,6 +115,7 @@ CONFIG_FLATMEM_MANUAL=y
 CONFIG_FLATMEM=y
 CONFIG_FLAT_NODE_MEM_MAP=y
 # CONFIG_SPARSEMEM_STATIC is not set
+CONFIG_SPLIT_PTLOCK_CPUS=4
 
 #
 # I/O subsystem configuration
@@ -188,10 +207,18 @@ CONFIG_IPV6=y
 # CONFIG_NET_DIVERT is not set
 # CONFIG_ECONET is not set
 # CONFIG_WAN_ROUTER is not set
+
+#
+# QoS and/or fair queueing
+#
 CONFIG_NET_SCHED=y
 CONFIG_NET_SCH_CLK_JIFFIES=y
 # CONFIG_NET_SCH_CLK_GETTIMEOFDAY is not set
 # CONFIG_NET_SCH_CLK_CPU is not set
+
+#
+# Queueing/Scheduling
+#
 CONFIG_NET_SCH_CBQ=m
 # CONFIG_NET_SCH_HTB is not set
 # CONFIG_NET_SCH_HFSC is not set
@@ -204,8 +231,10 @@ CONFIG_NET_SCH_GRED=m
 CONFIG_NET_SCH_DSMARK=m
 # CONFIG_NET_SCH_NETEM is not set
 # CONFIG_NET_SCH_INGRESS is not set
-CONFIG_NET_QOS=y
-CONFIG_NET_ESTIMATOR=y
+
+#
+# Classification
+#
 CONFIG_NET_CLS=y
 # CONFIG_NET_CLS_BASIC is not set
 CONFIG_NET_CLS_TCINDEX=m
@@ -214,18 +243,18 @@ CONFIG_NET_CLS_ROUTE=y
 CONFIG_NET_CLS_FW=m
 CONFIG_NET_CLS_U32=m
 # CONFIG_CLS_U32_PERF is not set
-# CONFIG_NET_CLS_IND is not set
 CONFIG_NET_CLS_RSVP=m
 CONFIG_NET_CLS_RSVP6=m
 # CONFIG_NET_EMATCH is not set
 # CONFIG_NET_CLS_ACT is not set
 CONFIG_NET_CLS_POLICE=y
+# CONFIG_NET_CLS_IND is not set
+CONFIG_NET_ESTIMATOR=y
 
 #
 # Network testing
 #
 # CONFIG_NET_PKTGEN is not set
-# CONFIG_NETFILTER_NETLINK is not set
 # CONFIG_HAMRADIO is not set
 # CONFIG_IRDA is not set
 # CONFIG_BT is not set
@@ -276,6 +305,7 @@ CONFIG_SCSI_FC_ATTRS=y
 #
 # SCSI low-level drivers
 #
+# CONFIG_ISCSI_TCP is not set
 # CONFIG_SCSI_SATA is not set
 # CONFIG_SCSI_DEBUG is not set
 CONFIG_ZFCP=y
@@ -292,7 +322,6 @@ CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_COUNT=16
 CONFIG_BLK_DEV_RAM_SIZE=4096
 CONFIG_BLK_DEV_INITRD=y
-# CONFIG_LBD is not set
 # CONFIG_CDROM_PKTCDVD is not set
 
 #
@@ -305,15 +334,8 @@ CONFIG_DASD_PROFILE=y
 CONFIG_DASD_ECKD=y
 CONFIG_DASD_FBA=y
 CONFIG_DASD_DIAG=y
+CONFIG_DASD_EER=m
 # CONFIG_DASD_CMB is not set
-
-#
-# IO Schedulers
-#
-CONFIG_IOSCHED_NOOP=y
-CONFIG_IOSCHED_AS=y
-CONFIG_IOSCHED_DEADLINE=y
-CONFIG_IOSCHED_CFQ=y
 # CONFIG_ATA_OVER_ETH is not set
 
 #
@@ -378,7 +400,6 @@ CONFIG_S390_TAPE_34XX=m
 # CONFIG_VMLOGRDR is not set
 # CONFIG_VMCP is not set
 # CONFIG_MONREADER is not set
-# CONFIG_DCSS_SHM is not set
 
 #
 # Cryptographic devices
@@ -593,6 +614,8 @@ CONFIG_DEBUG_PREEMPT=y
 # CONFIG_DEBUG_KOBJECT is not set
 # CONFIG_DEBUG_INFO is not set
 CONFIG_DEBUG_FS=y
+# CONFIG_DEBUG_VM is not set
+# CONFIG_RCU_TORTURE_TEST is not set
 
 #
 # Security options
-- 
cgit v1.1


From 089545f0c71bab6511395c2a060d7f81a99bad58 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:12 -0800
Subject: [PATCH] s390: cputime_t fixes

There are some more places where the use of cputime_t instead of an integer
type and the associated macros is necessary for the virtual cputime accounting
on s390.  Affected are the s390 specific appldata code and BSD process
accounting.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/appldata/appldata_os.c | 14 +++++++-------
 kernel/acct.c                    | 16 +++++++++-------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c
index e0a476b..99ddd3b 100644
--- a/arch/s390/appldata/appldata_os.c
+++ b/arch/s390/appldata/appldata_os.c
@@ -141,19 +141,19 @@ static void appldata_get_os_data(void *data)
 	j = 0;
 	for_each_online_cpu(i) {
 		os_data->os_cpu[j].per_cpu_user =
-					kstat_cpu(i).cpustat.user;
+			cputime_to_jiffies(kstat_cpu(i).cpustat.user);
 		os_data->os_cpu[j].per_cpu_nice =
-					kstat_cpu(i).cpustat.nice;
+			cputime_to_jiffies(kstat_cpu(i).cpustat.nice);
 		os_data->os_cpu[j].per_cpu_system =
-					kstat_cpu(i).cpustat.system;
+			cputime_to_jiffies(kstat_cpu(i).cpustat.system);
 		os_data->os_cpu[j].per_cpu_idle =
-					kstat_cpu(i).cpustat.idle;
+			cputime_to_jiffies(kstat_cpu(i).cpustat.idle);
 		os_data->os_cpu[j].per_cpu_irq =
-					kstat_cpu(i).cpustat.irq;
+			cputime_to_jiffies(kstat_cpu(i).cpustat.irq);
 		os_data->os_cpu[j].per_cpu_softirq =
-					kstat_cpu(i).cpustat.softirq;
+			cputime_to_jiffies(kstat_cpu(i).cpustat.softirq);
 		os_data->os_cpu[j].per_cpu_iowait =
-					kstat_cpu(i).cpustat.iowait;
+			cputime_to_jiffies(kstat_cpu(i).cpustat.iowait);
 		j++;
 	}
 
diff --git a/kernel/acct.c b/kernel/acct.c
index 6312d6b..38d57fa 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -427,6 +427,7 @@ static void do_acct_process(long exitcode, struct file *file)
 	u64 elapsed;
 	u64 run_time;
 	struct timespec uptime;
+	unsigned long jiffies;
 
 	/*
 	 * First check to see if there is enough free_space to continue
@@ -467,12 +468,12 @@ static void do_acct_process(long exitcode, struct file *file)
 #endif
 	do_div(elapsed, AHZ);
 	ac.ac_btime = xtime.tv_sec - elapsed;
-	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(
-					    current->signal->utime +
-					    current->group_leader->utime));
-	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(
-					    current->signal->stime +
-					    current->group_leader->stime));
+	jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime,
+						 current->signal->utime));
+	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
+	jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime,
+						 current->signal->stime));
+	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
 	/* we really need to bite the bullet and change layout */
 	ac.ac_uid = current->uid;
 	ac.ac_gid = current->gid;
@@ -580,7 +581,8 @@ void acct_process(long exitcode)
 void acct_update_integrals(struct task_struct *tsk)
 {
 	if (likely(tsk->mm)) {
-		long delta = tsk->stime - tsk->acct_stimexpd;
+		long delta =
+			cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd;
 
 		if (delta == 0)
 			return;
-- 
cgit v1.1


From 6810a2bce3aa6573faa9920487274f166fe95c6e Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cohuck@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:13 -0800
Subject: [PATCH] s390: re-activated path detection

If we receive path not operational indications (pnom in pmcw nonzero), we
switch off those paths.  To catch them becoming available again, we have to
recalculate the lpm from the pmcw each time we start path verification.

Signed-off-by: Cornelia Huck <cohuck@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/s390/cio/device_pgid.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/drivers/s390/cio/device_pgid.c b/drivers/s390/cio/device_pgid.c
index 0adac8a..757b270 100644
--- a/drivers/s390/cio/device_pgid.c
+++ b/drivers/s390/cio/device_pgid.c
@@ -22,6 +22,7 @@
 #include "cio_debug.h"
 #include "css.h"
 #include "device.h"
+#include "ioasm.h"
 
 /*
  * Start Sense Path Group ID helper function. Used in ccw_device_recog
@@ -364,8 +365,22 @@ ccw_device_verify_irq(struct ccw_device *cdev, enum dev_event dev_event)
 void
 ccw_device_verify_start(struct ccw_device *cdev)
 {
+	struct subchannel *sch = to_subchannel(cdev->dev.parent);
+
 	cdev->private->flags.pgid_single = 0;
 	cdev->private->iretry = 5;
+	/*
+	 * Update sch->lpm with current values to catch paths becoming
+	 * available again.
+	 */
+	if (stsch(sch->irq, &sch->schib)) {
+		ccw_device_verify_done(cdev, -ENODEV);
+		return;
+	}
+	sch->lpm = sch->schib.pmcw.pim &
+		sch->schib.pmcw.pam &
+		sch->schib.pmcw.pom &
+		sch->opm;
 	__ccw_device_verify_start(cdev);
 }
 
-- 
cgit v1.1


From cfb1b55595a0dfd87b5849e8d0216c029f34445f Mon Sep 17 00:00:00 2001
From: Carsten Otte <cotte@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:14 -0800
Subject: [PATCH] s390: move s390_root_dev_* out of the cio layer

Extract the s390_root_dev_* functions from the common I/O layer as they are
also used by non-ccw device drivers.

Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/s390/Makefile        |  2 +-
 drivers/s390/block/dcssblk.c |  2 +-
 drivers/s390/cio/css.c       | 41 ----------------------------------
 drivers/s390/net/cu3088.c    |  3 ++-
 drivers/s390/net/iucv.c      |  2 +-
 drivers/s390/net/qeth_main.c |  1 +
 drivers/s390/s390_rdev.c     | 53 ++++++++++++++++++++++++++++++++++++++++++++
 include/asm-s390/ccwdev.h    |  3 ---
 include/asm-s390/s390_rdev.h | 15 +++++++++++++
 9 files changed, 74 insertions(+), 48 deletions(-)
 create mode 100644 drivers/s390/s390_rdev.c
 create mode 100644 include/asm-s390/s390_rdev.h

diff --git a/drivers/s390/Makefile b/drivers/s390/Makefile
index c99a2fe..9803c93 100644
--- a/drivers/s390/Makefile
+++ b/drivers/s390/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the S/390 specific device drivers
 #
 
-obj-y += s390mach.o sysinfo.o
+obj-y += s390mach.o sysinfo.o s390_rdev.o
 obj-y += cio/ block/ char/ crypto/ net/ scsi/
 
 drivers-y += drivers/s390/built-in.o
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 4fde411..2e727f4 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -15,7 +15,7 @@
 #include <asm/io.h>
 #include <linux/completion.h>
 #include <linux/interrupt.h>
-#include <asm/ccwdev.h> 	// for s390_root_dev_(un)register()
+#include <asm/s390_rdev.h>
 
 //#define DCSSBLK_DEBUG		/* Debug messages on/off */
 #define DCSSBLK_NAME "dcssblk"
diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
index 555119c..7e4d57b 100644
--- a/drivers/s390/cio/css.c
+++ b/drivers/s390/cio/css.c
@@ -481,45 +481,6 @@ struct bus_type css_bus_type = {
 
 subsys_initcall(init_channel_subsystem);
 
-/*
- * Register root devices for some drivers. The release function must not be
- * in the device drivers, so we do it here.
- */
-static void
-s390_root_dev_release(struct device *dev)
-{
-	kfree(dev);
-}
-
-struct device *
-s390_root_dev_register(const char *name)
-{
-	struct device *dev;
-	int ret;
-
-	if (!strlen(name))
-		return ERR_PTR(-EINVAL);
-	dev = kmalloc(sizeof(struct device), GFP_KERNEL);
-	if (!dev)
-		return ERR_PTR(-ENOMEM);
-	memset(dev, 0, sizeof(struct device));
-	strncpy(dev->bus_id, name, min(strlen(name), (size_t)BUS_ID_SIZE));
-	dev->release = s390_root_dev_release;
-	ret = device_register(dev);
-	if (ret) {
-		kfree(dev);
-		return ERR_PTR(ret);
-	}
-	return dev;
-}
-
-void
-s390_root_dev_unregister(struct device *dev)
-{
-	if (dev)
-		device_unregister(dev);
-}
-
 int
 css_enqueue_subchannel_slow(unsigned long schid)
 {
@@ -564,6 +525,4 @@ css_slow_subchannels_exist(void)
 
 MODULE_LICENSE("GPL");
 EXPORT_SYMBOL(css_bus_type);
-EXPORT_SYMBOL(s390_root_dev_register);
-EXPORT_SYMBOL(s390_root_dev_unregister);
 EXPORT_SYMBOL_GPL(css_characteristics_avail);
diff --git a/drivers/s390/net/cu3088.c b/drivers/s390/net/cu3088.c
index 0075894..77dacb4 100644
--- a/drivers/s390/net/cu3088.c
+++ b/drivers/s390/net/cu3088.c
@@ -1,5 +1,5 @@
 /*
- * $Id: cu3088.c,v 1.35 2005/03/30 19:28:52 richtera Exp $
+ * $Id: cu3088.c,v 1.36 2005/10/25 14:37:17 cohuck Exp $
  *
  * CTC / LCS ccw_device driver
  *
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/err.h>
 
+#include <asm/s390_rdev.h>
 #include <asm/ccwdev.h>
 #include <asm/ccwgroup.h>
 
diff --git a/drivers/s390/net/iucv.c b/drivers/s390/net/iucv.c
index ecb2f8f..ea81773 100644
--- a/drivers/s390/net/iucv.c
+++ b/drivers/s390/net/iucv.c
@@ -54,7 +54,7 @@
 #include <asm/s390_ext.h>
 #include <asm/ebcdic.h>
 #include <asm/smp.h>
-#include <asm/ccwdev.h> //for root device stuff
+#include <asm/s390_rdev.h>
 
 /* FLAGS:
  * All flags are defined in the field IPFLAGS1 of each function
diff --git a/drivers/s390/net/qeth_main.c b/drivers/s390/net/qeth_main.c
index 7b2663f..97f927c 100644
--- a/drivers/s390/net/qeth_main.c
+++ b/drivers/s390/net/qeth_main.c
@@ -65,6 +65,7 @@
 #include <asm/timex.h>
 #include <asm/semaphore.h>
 #include <asm/uaccess.h>
+#include <asm/s390_rdev.h>
 
 #include "qeth.h"
 #include "qeth_mpc.h"
diff --git a/drivers/s390/s390_rdev.c b/drivers/s390/s390_rdev.c
new file mode 100644
index 0000000..566cc3d
--- /dev/null
+++ b/drivers/s390/s390_rdev.c
@@ -0,0 +1,53 @@
+/*
+ *  drivers/s390/s390_rdev.c
+ *  s390 root device
+ *   $Revision: 1.2 $
+ *
+ *    Copyright (C) 2002, 2005 IBM Deutschland Entwicklung GmbH,
+ *			 IBM Corporation
+ *    Author(s): Cornelia Huck (cohuck@de.ibm.com)
+ *		  Carsten Otte  (cotte@de.ibm.com)
+ */
+
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/device.h>
+#include <asm/s390_rdev.h>
+
+static void
+s390_root_dev_release(struct device *dev)
+{
+	kfree(dev);
+}
+
+struct device *
+s390_root_dev_register(const char *name)
+{
+	struct device *dev;
+	int ret;
+
+	if (!strlen(name))
+		return ERR_PTR(-EINVAL);
+	dev = kmalloc(sizeof(struct device), GFP_KERNEL);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+	memset(dev, 0, sizeof(struct device));
+	strncpy(dev->bus_id, name, min(strlen(name), (size_t)BUS_ID_SIZE));
+	dev->release = s390_root_dev_release;
+	ret = device_register(dev);
+	if (ret) {
+		kfree(dev);
+		return ERR_PTR(ret);
+	}
+	return dev;
+}
+
+void
+s390_root_dev_unregister(struct device *dev)
+{
+	if (dev)
+		device_unregister(dev);
+}
+
+EXPORT_SYMBOL(s390_root_dev_register);
+EXPORT_SYMBOL(s390_root_dev_unregister);
diff --git a/include/asm-s390/ccwdev.h b/include/asm-s390/ccwdev.h
index 3eb231a..12456cb 100644
--- a/include/asm-s390/ccwdev.h
+++ b/include/asm-s390/ccwdev.h
@@ -185,8 +185,5 @@ extern struct ccw_device *ccw_device_probe_console(void);
 extern int _ccw_device_get_device_number(struct ccw_device *);
 extern int _ccw_device_get_subchannel_number(struct ccw_device *);
 
-extern struct device *s390_root_dev_register(const char *);
-extern void s390_root_dev_unregister(struct device *);
-
 extern void *ccw_device_get_chp_desc(struct ccw_device *, int);
 #endif /* _S390_CCWDEV_H_ */
diff --git a/include/asm-s390/s390_rdev.h b/include/asm-s390/s390_rdev.h
new file mode 100644
index 0000000..3ad78f2
--- /dev/null
+++ b/include/asm-s390/s390_rdev.h
@@ -0,0 +1,15 @@
+/*
+ *  include/asm-s390/s390_rdev.h
+ *
+ *    Copyright (C) 2002,2005 IBM Deutschland Entwicklung GmbH, IBM Corporation
+ *    Author(s): Cornelia Huck <cohuck@de.ibm.com>
+ *               Carsten Otte  <cotte@de.ibm.com>
+ *
+ *  Interface for s390 root device
+ */
+
+#ifndef _S390_RDEV_H_
+#define _S390_RDEV_H_
+extern struct device *s390_root_dev_register(const char *);
+extern void s390_root_dev_unregister(struct device *);
+#endif /* _S390_RDEV_H_ */
-- 
cgit v1.1


From 9a7af289660dc749d7c58234191601046a9bf488 Mon Sep 17 00:00:00 2001
From: Horst Hummel <horst.hummel@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:14 -0800
Subject: [PATCH] s390: BIODASDPRRD ioctl return code

The BIODASDPRRD ioctl had no return code for 'profiling is inactive' and
therefore tunedasd wrote a misleading message for request-counter = 0.
Introduce return code -EIO for inactive profiling.

Signed-off-by: Horst Hummel <horst.hummel@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/s390/block/dasd_ioctl.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c
index 789595b..044b753 100644
--- a/drivers/s390/block/dasd_ioctl.c
+++ b/drivers/s390/block/dasd_ioctl.c
@@ -7,7 +7,7 @@
  * Bugreports.to..: <Linux390@de.ibm.com>
  * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999-2001
  *
- * $Revision: 1.47 $
+ * $Revision: 1.50 $
  *
  * i/o controls for the dasd driver.
  */
@@ -352,6 +352,9 @@ dasd_ioctl_read_profile(struct block_device *bdev, int no, long args)
 	if (device == NULL)
 		return -ENODEV;
 
+	if (dasd_profile_level == DASD_PROFILE_OFF)
+		return -EIO;
+
 	if (copy_to_user((long __user *) args, (long *) &device->profile,
 			 sizeof (struct dasd_profile_info_t)))
 		return -EFAULT;
-- 
cgit v1.1


From 1c01b8a5963aec60488c1c97d67cffd8b5275e3f Mon Sep 17 00:00:00 2001
From: Horst Hummel <horst.hummel@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:15 -0800
Subject: [PATCH] s390: dasd failfast support

To properly support multipath-failover handling, the linux block layer has
introduced a special request flag, 'REQ_FAILFAST'.  This flag is now used to
return requests immediately in case the device is not operational.

Signed-off-by: Horst Hummel <horst.hummel@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/s390/block/dasd.c      | 28 ++++++++++++++++++++--------
 drivers/s390/block/dasd_diag.c |  4 +++-
 drivers/s390/block/dasd_eckd.c |  7 ++++++-
 drivers/s390/block/dasd_fba.c  |  4 +++-
 drivers/s390/block/dasd_int.h  |  3 ++-
 5 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 6278739..1141a59 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -7,7 +7,7 @@
  * Bugreports.to..: <Linux390@de.ibm.com>
  * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999-2001
  *
- * $Revision: 1.169 $
+ * $Revision: 1.172 $
  */
 
 #include <linux/config.h>
@@ -1224,6 +1224,12 @@ __dasd_start_head(struct dasd_device * device)
 	if (list_empty(&device->ccw_queue))
 		return;
 	cqr = list_entry(device->ccw_queue.next, struct dasd_ccw_req, list);
+	/* FAILFAST: fail cqr if stopped by anything but DASD_STOPPED_PENDING */
+	if (device->stopped & ~DASD_STOPPED_PENDING &&
+	    test_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags)) {
+		cqr->status = DASD_CQR_FAILED;
+		dasd_schedule_bh(device);
+	}
 	if ((cqr->status == DASD_CQR_QUEUED) &&
 	    (!device->stopped)) {
 		/* try to start the first I/O that can be started */
@@ -1750,8 +1756,10 @@ dasd_exit(void)
  * SECTION: common functions for ccw_driver use
  */
 
-/* initial attempt at a probe function. this can be simplified once
- * the other detection code is gone */
+/*
+ * Initial attempt at a probe function. This can be simplified once
+ * the other detection code is gone.
+ */
 int
 dasd_generic_probe (struct ccw_device *cdev,
 		    struct dasd_discipline *discipline)
@@ -1770,8 +1778,10 @@ dasd_generic_probe (struct ccw_device *cdev,
 	return ret;
 }
 
-/* this will one day be called from a global not_oper handler.
- * It is also used by driver_unregister during module unload */
+/*
+ * This will one day be called from a global not_oper handler.
+ * It is also used by driver_unregister during module unload.
+ */
 void
 dasd_generic_remove (struct ccw_device *cdev)
 {
@@ -1798,9 +1808,11 @@ dasd_generic_remove (struct ccw_device *cdev)
 	dasd_delete_device(device);
 }
 
-/* activate a device. This is called from dasd_{eckd,fba}_probe() when either
+/*
+ * Activate a device. This is called from dasd_{eckd,fba}_probe() when either
  * the device is detected for the first time and is supposed to be used
- * or the user has started activation through sysfs */
+ * or the user has started activation through sysfs.
+ */
 int
 dasd_generic_set_online (struct ccw_device *cdev,
 			 struct dasd_discipline *discipline)
@@ -1917,7 +1929,6 @@ dasd_generic_notify(struct ccw_device *cdev, int event)
 				if (cqr->status == DASD_CQR_IN_IO)
 					cqr->status = DASD_CQR_FAILED;
 			device->stopped |= DASD_STOPPED_DC_EIO;
-			dasd_schedule_bh(device);
 		} else {
 			list_for_each_entry(cqr, &device->ccw_queue, list)
 				if (cqr->status == DASD_CQR_IN_IO) {
@@ -1927,6 +1938,7 @@ dasd_generic_notify(struct ccw_device *cdev, int event)
 			device->stopped |= DASD_STOPPED_DC_WAIT;
 			dasd_set_timer(device, 0);
 		}
+		dasd_schedule_bh(device);
 		ret = 1;
 		break;
 	case CIO_OPER:
diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c
index 16c4b7d..a33d406 100644
--- a/drivers/s390/block/dasd_diag.c
+++ b/drivers/s390/block/dasd_diag.c
@@ -6,7 +6,7 @@
  * Bugreports.to..: <Linux390@de.ibm.com>
  * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000
  *
- * $Revision: 1.52 $
+ * $Revision: 1.53 $
  */
 
 #include <linux/config.h>
@@ -549,6 +549,8 @@ dasd_diag_build_cp(struct dasd_device * device, struct request *req)
 	}
 	cqr->retries = DIAG_MAX_RETRIES;
 	cqr->buildclk = get_clock();
+	if (req->flags & REQ_FAILFAST)
+		set_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags);
 	cqr->device = device;
 	cqr->expires = DIAG_TIMEOUT;
 	cqr->status = DASD_CQR_FILLED;
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index 811060e..efc4cf6 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -7,7 +7,7 @@
  * Bugreports.to..: <Linux390@de.ibm.com>
  * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000
  *
- * $Revision: 1.71 $
+ * $Revision: 1.74 $
  */
 
 #include <linux/config.h>
@@ -1136,6 +1136,8 @@ dasd_eckd_build_cp(struct dasd_device * device, struct request *req)
 			recid++;
 		}
 	}
+	if (req->flags & REQ_FAILFAST)
+		set_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags);
 	cqr->device = device;
 	cqr->expires = 5 * 60 * HZ;	/* 5 minutes */
 	cqr->lpm = private->path_data.ppm;
@@ -1252,6 +1254,7 @@ dasd_eckd_release(struct block_device *bdev, int no, long args)
 	cqr->cpaddr->cda = (__u32)(addr_t) cqr->data;
 	cqr->device = device;
 	clear_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags);
+	set_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags);
 	cqr->retries = 0;
 	cqr->expires = 2 * HZ;
 	cqr->buildclk = get_clock();
@@ -1296,6 +1299,7 @@ dasd_eckd_reserve(struct block_device *bdev, int no, long args)
 	cqr->cpaddr->cda = (__u32)(addr_t) cqr->data;
 	cqr->device = device;
 	clear_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags);
+	set_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags);
 	cqr->retries = 0;
 	cqr->expires = 2 * HZ;
 	cqr->buildclk = get_clock();
@@ -1339,6 +1343,7 @@ dasd_eckd_steal_lock(struct block_device *bdev, int no, long args)
 	cqr->cpaddr->cda = (__u32)(addr_t) cqr->data;
 	cqr->device = device;
 	clear_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags);
+	set_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags);
 	cqr->retries = 0;
 	cqr->expires = 2 * HZ;
 	cqr->buildclk = get_clock();
diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c
index 28cb461..9bac8d8 100644
--- a/drivers/s390/block/dasd_fba.c
+++ b/drivers/s390/block/dasd_fba.c
@@ -4,7 +4,7 @@
  * Bugreports.to..: <Linux390@de.ibm.com>
  * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000
  *
- * $Revision: 1.40 $
+ * $Revision: 1.41 $
  */
 
 #include <linux/config.h>
@@ -352,6 +352,8 @@ dasd_fba_build_cp(struct dasd_device * device, struct request *req)
 			recid++;
 		}
 	}
+	if (req->flags & REQ_FAILFAST)
+		set_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags);
 	cqr->device = device;
 	cqr->expires = 5 * 60 * HZ;	/* 5 minutes */
 	cqr->retries = 32;
diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h
index 9fab04f..2fb05c4 100644
--- a/drivers/s390/block/dasd_int.h
+++ b/drivers/s390/block/dasd_int.h
@@ -6,7 +6,7 @@
  * Bugreports.to..: <Linux390@de.ibm.com>
  * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000
  *
- * $Revision: 1.65 $
+ * $Revision: 1.68 $
  */
 
 #ifndef DASD_INT_H
@@ -208,6 +208,7 @@ struct dasd_ccw_req {
 
 /* per dasd_ccw_req flags */
 #define DASD_CQR_FLAGS_USE_ERP   0	/* use ERP for this request */
+#define DASD_CQR_FLAGS_FAILFAST  1	/* FAILFAST */
 
 /* Signature for error recovery functions. */
 typedef struct dasd_ccw_req *(*dasd_erp_fn_t) (struct dasd_ccw_req *);
-- 
cgit v1.1


From d0f4c16febf258ba8c0f917ac3ba935fc5459566 Mon Sep 17 00:00:00 2001
From: Andreas Krebbel <krebbel1@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:16 -0800
Subject: [PATCH] s390: add oprofile callgraph support

Signed-off-by: Andreas Krebbel <krebbel1@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/oprofile/Makefile    |  2 +-
 arch/s390/oprofile/backtrace.c | 79 ++++++++++++++++++++++++++++++++++++++++++
 arch/s390/oprofile/init.c      |  4 +++
 3 files changed, 84 insertions(+), 1 deletion(-)
 create mode 100644 arch/s390/oprofile/backtrace.c

diff --git a/arch/s390/oprofile/Makefile b/arch/s390/oprofile/Makefile
index ec34927..537b2d8 100644
--- a/arch/s390/oprofile/Makefile
+++ b/arch/s390/oprofile/Makefile
@@ -6,4 +6,4 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
 		oprofilefs.o oprofile_stats.o  \
 		timer_int.o )
 
-oprofile-y				:= $(DRIVER_OBJS) init.o
+oprofile-y				:= $(DRIVER_OBJS) init.o backtrace.o
diff --git a/arch/s390/oprofile/backtrace.c b/arch/s390/oprofile/backtrace.c
new file mode 100644
index 0000000..bc4b84a
--- /dev/null
+++ b/arch/s390/oprofile/backtrace.c
@@ -0,0 +1,79 @@
+/**
+ * arch/s390/oprofile/backtrace.c
+ *
+ * S390 Version
+ *   Copyright (C) 2005 IBM Corporation, IBM Deutschland Entwicklung GmbH.
+ *   Author(s): Andreas Krebbel <Andreas.Krebbel@de.ibm.com>
+ */
+
+#include <linux/oprofile.h>
+
+#include <asm/processor.h> /* for struct stack_frame */
+
+static unsigned long
+__show_trace(unsigned int *depth, unsigned long sp,
+	     unsigned long low, unsigned long high)
+{
+	struct stack_frame *sf;
+	struct pt_regs *regs;
+	/* Walk frames between low and high, adding one trace entry per frame while *depth > 0; returns the sp where the walk stopped. */
+	while (*depth) {
+		sp = sp & PSW_ADDR_INSN;
+		if (sp < low || sp > high - sizeof(*sf))
+			return sp;
+		sf = (struct stack_frame *) sp;
+		(*depth)--;
+		oprofile_add_trace(sf->gprs[8] & PSW_ADDR_INSN);
+
+		/* Follow the backchain; every next frame must lie strictly above the previous one (sp > low).  */
+		while (*depth) {
+			low = sp;
+			sp = sf->back_chain & PSW_ADDR_INSN;
+			if (!sp)
+				break;
+			if (sp <= low || sp > high - sizeof(*sf))
+				return sp;
+			sf = (struct stack_frame *) sp;
+			(*depth)--;
+			oprofile_add_trace(sf->gprs[8] & PSW_ADDR_INSN);
+
+		}
+
+		if (*depth == 0)
+			break;
+		/* NOTE(review): the entry added below is sf->gprs[8] again (already reported above), not a value read from regs - confirm intent. */
+		/* Zero backchain detected, check for interrupt frame: a struct pt_regs is expected directly behind sf.  */
+		sp = (unsigned long) (sf + 1);
+		if (sp <= low || sp > high - sizeof(*regs))
+			return sp;
+		regs = (struct pt_regs *) sp;
+		(*depth)--;
+		oprofile_add_trace(sf->gprs[8] & PSW_ADDR_INSN);
+		low = sp;
+		sp = regs->gprs[15];
+	}
+	return sp;
+}
+
+void s390_backtrace(struct pt_regs * const regs, unsigned int depth)
+{
+	unsigned long head;
+	struct stack_frame* head_sf;
+	/* Only non-user-mode samples are unwound; user-mode regs produce no trace entries. */
+	if (user_mode (regs))
+		return;
+	/* gprs[15] of the sampled registers is used as the address of the topmost stack frame. */
+	head = regs->gprs[15];
+	head_sf = (struct stack_frame*)head;
+	/* A zero backchain in the topmost frame leaves nothing to follow. */
+	if (!head_sf->back_chain)
+		return;
+	/* The topmost frame itself is skipped: the walk starts at the frame its backchain points to. */
+	head = head_sf->back_chain;
+	/* First walk within [async_stack - ASYNC_SIZE, async_stack] ... */
+	head = __show_trace(&depth, head, S390_lowcore.async_stack - ASYNC_SIZE,
+			    S390_lowcore.async_stack);
+	/* ... then continue from where that walk stopped within [thread_info, thread_info + THREAD_SIZE]. */
+	__show_trace(&depth, head, S390_lowcore.thread_info,
+		     S390_lowcore.thread_info + THREAD_SIZE);
+}
diff --git a/arch/s390/oprofile/init.c b/arch/s390/oprofile/init.c
index a65ead0..7a99511 100644
--- a/arch/s390/oprofile/init.c
+++ b/arch/s390/oprofile/init.c
@@ -12,8 +12,12 @@
 #include <linux/init.h>
 #include <linux/errno.h>
 
+
+extern void s390_backtrace(struct pt_regs * const regs, unsigned int depth);
+
 int __init oprofile_arch_init(struct oprofile_operations* ops)
 {
+	ops->backtrace = s390_backtrace;
 	return -ENODEV;
 }
 
-- 
cgit v1.1


From c1e26e1ef7ab50f30e5fbf004fe96ed44321ca78 Mon Sep 17 00:00:00 2001
From: Jan Glauber <jan.glauber@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:17 -0800
Subject: [PATCH] s390: in-kernel crypto rename

Replace all references to z990 by s390 in the in-kernel crypto files in
arch/s390/crypto.  The code is not specific to a particular machine (z990) but
to the s390 platform.  Big diff, does nothing..

Signed-off-by: Jan Glauber <jan.glauber@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/crypto/Makefile           |   6 +-
 arch/s390/crypto/crypt_s390.h       | 383 ++++++++++++++++++++++++++++++++++++
 arch/s390/crypto/crypt_s390_query.c | 113 +++++++++++
 arch/s390/crypto/crypt_z990.h       | 374 -----------------------------------
 arch/s390/crypto/crypt_z990_query.c | 111 -----------
 arch/s390/crypto/des_s390.c         | 284 ++++++++++++++++++++++++++
 arch/s390/crypto/des_z990.c         | 284 --------------------------
 arch/s390/crypto/sha1_s390.c        | 167 ++++++++++++++++
 arch/s390/crypto/sha1_z990.c        | 167 ----------------
 arch/s390/defconfig                 |   4 +-
 crypto/Kconfig                      |   8 +-
 11 files changed, 956 insertions(+), 945 deletions(-)
 create mode 100644 arch/s390/crypto/crypt_s390.h
 create mode 100644 arch/s390/crypto/crypt_s390_query.c
 delete mode 100644 arch/s390/crypto/crypt_z990.h
 delete mode 100644 arch/s390/crypto/crypt_z990_query.c
 create mode 100644 arch/s390/crypto/des_s390.c
 delete mode 100644 arch/s390/crypto/des_z990.c
 create mode 100644 arch/s390/crypto/sha1_s390.c
 delete mode 100644 arch/s390/crypto/sha1_z990.c

diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile
index 96a05e6..50843f8 100644
--- a/arch/s390/crypto/Makefile
+++ b/arch/s390/crypto/Makefile
@@ -2,7 +2,7 @@
 # Cryptographic API
 #
 
-obj-$(CONFIG_CRYPTO_SHA1_Z990) += sha1_z990.o
-obj-$(CONFIG_CRYPTO_DES_Z990) += des_z990.o des_check_key.o
+obj-$(CONFIG_CRYPTO_SHA1_S390) += sha1_s390.o
+obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o des_check_key.o
 
-obj-$(CONFIG_CRYPTO_TEST) += crypt_z990_query.o
+obj-$(CONFIG_CRYPTO_TEST) += crypt_s390_query.o
diff --git a/arch/s390/crypto/crypt_s390.h b/arch/s390/crypto/crypt_s390.h
new file mode 100644
index 0000000..4d24f66
--- /dev/null
+++ b/arch/s390/crypto/crypt_s390.h
@@ -0,0 +1,383 @@
+/*
+ * Cryptographic API.
+ *
+ * Support for s390 cryptographic instructions.
+ *
+ *   Copyright (C) 2003 IBM Deutschland GmbH, IBM Corporation
+ *   Author(s): Thomas Spatzier (tspat@de.ibm.com)
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#ifndef _CRYPTO_ARCH_S390_CRYPT_S390_H
+#define _CRYPTO_ARCH_S390_CRYPT_S390_H
+
+#include <asm/errno.h>
+
+#define CRYPT_S390_OP_MASK 0xFF00
+#define CRYPT_S390_FUNC_MASK 0x00FF
+
+/* s390 cryptographic operations */
+enum crypt_s390_operations {
+	CRYPT_S390_KM   = 0x0100,
+	CRYPT_S390_KMC  = 0x0200,
+	CRYPT_S390_KIMD = 0x0300,
+	CRYPT_S390_KLMD = 0x0400,
+	CRYPT_S390_KMAC = 0x0500
+};
+
+/* function codes for KM (CIPHER MESSAGE) instruction
+ * 0x80 is the decipher modifier bit
+ */
+enum crypt_s390_km_func {
+	KM_QUERY            = CRYPT_S390_KM | 0,
+	KM_DEA_ENCRYPT      = CRYPT_S390_KM | 1,
+	KM_DEA_DECRYPT      = CRYPT_S390_KM | 1 | 0x80,
+	KM_TDEA_128_ENCRYPT = CRYPT_S390_KM | 2,
+	KM_TDEA_128_DECRYPT = CRYPT_S390_KM | 2 | 0x80,
+	KM_TDEA_192_ENCRYPT = CRYPT_S390_KM | 3,
+	KM_TDEA_192_DECRYPT = CRYPT_S390_KM | 3 | 0x80,
+};
+
+/* function codes for KMC (CIPHER MESSAGE WITH CHAINING)
+ * instruction
+ */
+enum crypt_s390_kmc_func {
+	KMC_QUERY            = CRYPT_S390_KMC | 0,
+	KMC_DEA_ENCRYPT      = CRYPT_S390_KMC | 1,
+	KMC_DEA_DECRYPT      = CRYPT_S390_KMC | 1 | 0x80,
+	KMC_TDEA_128_ENCRYPT = CRYPT_S390_KMC | 2,
+	KMC_TDEA_128_DECRYPT = CRYPT_S390_KMC | 2 | 0x80,
+	KMC_TDEA_192_ENCRYPT = CRYPT_S390_KMC | 3,
+	KMC_TDEA_192_DECRYPT = CRYPT_S390_KMC | 3 | 0x80,
+};
+
+/* function codes for KIMD (COMPUTE INTERMEDIATE MESSAGE DIGEST)
+ * instruction
+ */
+enum crypt_s390_kimd_func {
+	KIMD_QUERY   = CRYPT_S390_KIMD | 0,
+	KIMD_SHA_1   = CRYPT_S390_KIMD | 1,
+};
+
+/* function codes for KLMD (COMPUTE LAST MESSAGE DIGEST)
+ * instruction
+ */
+enum crypt_s390_klmd_func {
+	KLMD_QUERY   = CRYPT_S390_KLMD | 0,
+	KLMD_SHA_1   = CRYPT_S390_KLMD | 1,
+};
+
+/* function codes for KMAC (COMPUTE MESSAGE AUTHENTICATION CODE)
+ * instruction
+ */
+enum crypt_s390_kmac_func {
+	KMAC_QUERY    = CRYPT_S390_KMAC | 0,
+	KMAC_DEA      = CRYPT_S390_KMAC | 1,
+	KMAC_TDEA_128 = CRYPT_S390_KMAC | 2,
+	KMAC_TDEA_192 = CRYPT_S390_KMAC | 3
+};
+
+/* status word for s390 crypto instructions' QUERY functions */
+struct crypt_s390_query_status {
+	u64 high;
+	u64 low;
+};
+
+/*
+ * Standard fixup and ex_table sections for crypt_s390 inline functions.
+ * label 0: the s390 crypto operation
+ * label 1: just after 0 to catch illegal operation exception
+ *          (unsupported model)
+ * label 6: the return point after fixup
+ * label 7: set error value if exception _in_ crypto operation
+ * label 8: set error value if illegal operation exception
+ * %0 is the variable to receive the error code
+ * [e1] and [e2] are the error code values loaded at labels 7 and 8
+ */
+#ifndef __s390x__
+#define __crypt_s390_fixup \
+	".section .fixup,\"ax\" \n"	\
+	"7:	lhi	%0,%h[e1] \n"	\
+	"	bras	1,9f \n"	\
+	"	.long	6b \n"		\
+	"8:	lhi	%0,%h[e2] \n"	\
+	"	bras	1,9f \n"	\
+	"	.long	6b \n"		\
+	"9:	l	1,0(1) \n"	\
+	"	br	1 \n"		\
+	".previous \n"			\
+	".section __ex_table,\"a\" \n"	\
+	"	.align	4 \n"		\
+	"	.long	0b,7b \n"	\
+	"	.long	1b,8b \n"	\
+	".previous"
+#else /* __s390x__ */
+#define __crypt_s390_fixup \
+	".section .fixup,\"ax\" \n"	\
+	"7:	lhi	%0,%h[e1] \n"	\
+	"	jg	6b \n"		\
+	"8:	lhi	%0,%h[e2] \n"	\
+	"	jg	6b \n"		\
+	".previous\n"			\
+	".section __ex_table,\"a\" \n"	\
+	"	.align	8 \n"		\
+	"	.quad	0b,7b \n"	\
+	"	.quad	1b,8b \n"	\
+	".previous"
+#endif /* __s390x__ */
+
+/*
+ * Standard code for setting the result of s390 crypto instructions.
+ * %0: the register which will receive the result
+ * [result]: the register containing the result (e.g. second operand length
+ * to compute number of processed bytes).
+ */
+#ifndef __s390x__
+#define __crypt_s390_set_result \
+	"	lr	%0,%[result] \n"
+#else /* __s390x__ */
+#define __crypt_s390_set_result \
+	"	lgr	%0,%[result] \n"
+#endif
+
+/*
+ * Executes the KM (CIPHER MESSAGE) operation of the CPU.
+ * @param func: the function code passed to KM; see crypt_s390_km_func
+ * @param param: address of parameter block; see POP for details on each func
+ * @param dest: address of destination memory area
+ * @param src: address of source memory area
+ * @param src_len: length of src operand in bytes
+ * @returns < zero for failure, 0 for the query func, number of processed bytes
+ * 	for encryption/decryption funcs
+ */
+static inline int
+crypt_s390_km(long func, void* param, u8* dest, const u8* src, long src_len)
+{
+	register long __func asm("0") = func & CRYPT_S390_FUNC_MASK;
+	register void* __param asm("1") = param;
+	register u8* __dest asm("4") = dest;
+	register const u8* __src asm("2") = src;
+	register long __src_len asm("3") = src_len;
+	int ret;
+
+	ret = 0;
+	__asm__ __volatile__ (
+		"0:	.insn	rre,0xB92E0000,%1,%2 \n" /* KM opcode */
+		"1:	brc	1,0b \n" /* handle partial completion */
+		__crypt_s390_set_result
+		"6:	\n"
+		__crypt_s390_fixup
+		: "+d" (ret), "+a" (__dest), "+a" (__src),
+		  [result] "+d" (__src_len)
+		: [e1] "K" (-EFAULT), [e2] "K" (-ENOSYS), "d" (__func),
+		  "a" (__param)
+		: "cc", "memory"
+	);
+	if (ret >= 0 && func & CRYPT_S390_FUNC_MASK){
+		ret = src_len - ret;
+	}
+	return ret;
+}
+
+/*
+ * Executes the KMC (CIPHER MESSAGE WITH CHAINING) operation of the CPU.
+ * @param func: the function code passed to KMC; see crypt_s390_kmc_func
+ * @param param: address of parameter block; see POP for details on each func
+ * @param dest: address of destination memory area
+ * @param src: address of source memory area
+ * @param src_len: length of src operand in bytes
+ * @returns < zero for failure, 0 for the query func, number of processed bytes
+ * 	for encryption/decryption funcs
+ */
+static inline int
+crypt_s390_kmc(long func, void* param, u8* dest, const u8* src, long src_len)
+{
+	register long __func asm("0") = func & CRYPT_S390_FUNC_MASK;
+	register void* __param asm("1") = param;
+	register u8* __dest asm("4") = dest;
+	register const u8* __src asm("2") = src;
+	register long __src_len asm("3") = src_len;
+	int ret;
+
+	ret = 0;
+	__asm__ __volatile__ (
+		"0:	.insn	rre,0xB92F0000,%1,%2 \n" /* KMC opcode */
+		"1:	brc	1,0b \n" /* handle partial completion */
+		__crypt_s390_set_result
+		"6:	\n"
+		__crypt_s390_fixup
+		: "+d" (ret), "+a" (__dest), "+a" (__src),
+		  [result] "+d" (__src_len)
+		: [e1] "K" (-EFAULT), [e2] "K" (-ENOSYS), "d" (__func),
+		  "a" (__param)
+		: "cc", "memory"
+	);
+	if (ret >= 0 && func & CRYPT_S390_FUNC_MASK){
+		ret = src_len - ret;
+	}
+	return ret;
+}
+
+/*
+ * Executes the KIMD (COMPUTE INTERMEDIATE MESSAGE DIGEST) operation
+ * of the CPU.
+ * @param func: the function code passed to KIMD; see crypt_s390_kimd_func
+ * @param param: address of parameter block; see POP for details on each func
+ * @param src: address of source memory area
+ * @param src_len: length of src operand in bytes
+ * @returns < zero for failure, 0 for the query func, number of processed bytes
+ * 	for digest funcs
+ */
+static inline int
+crypt_s390_kimd(long func, void* param, const u8* src, long src_len)
+{
+	register long __func asm("0") = func & CRYPT_S390_FUNC_MASK;
+	register void* __param asm("1") = param;
+	register const u8* __src asm("2") = src;
+	register long __src_len asm("3") = src_len;
+	int ret;
+
+	ret = 0;
+	__asm__ __volatile__ (
+		"0:	.insn	rre,0xB93E0000,%1,%1 \n" /* KIMD opcode */
+		"1:	brc	1,0b \n" /* handle partial completion */
+		__crypt_s390_set_result
+		"6:	\n"
+		__crypt_s390_fixup
+		: "+d" (ret), "+a" (__src), [result] "+d" (__src_len)
+		: [e1] "K" (-EFAULT), [e2] "K" (-ENOSYS), "d" (__func),
+		  "a" (__param)
+		: "cc", "memory"
+	);
+	if (ret >= 0 && (func & CRYPT_S390_FUNC_MASK)){
+		ret = src_len - ret;
+	}
+	return ret;
+}
+
+/*
+ * Executes the KLMD (COMPUTE LAST MESSAGE DIGEST) operation of the CPU.
+ * @param func: the function code passed to KLMD; see crypt_s390_klmd_func
+ * @param param: address of parameter block; see POP for details on each func
+ * @param src: address of source memory area
+ * @param src_len: length of src operand in bytes
+ * @returns < zero for failure, 0 for the query func, number of processed bytes
+ * 	for digest funcs
+ */
+static inline int
+crypt_s390_klmd(long func, void* param, const u8* src, long src_len)
+{
+	register long __func asm("0") = func & CRYPT_S390_FUNC_MASK;
+	register void* __param asm("1") = param;
+	register const u8* __src asm("2") = src;
+	register long __src_len asm("3") = src_len;
+	int ret;
+
+	ret = 0;
+	__asm__ __volatile__ (
+		"0:	.insn	rre,0xB93F0000,%1,%1 \n" /* KLMD opcode */
+		"1:	brc	1,0b \n" /* handle partial completion */
+		__crypt_s390_set_result
+		"6:	\n"
+		__crypt_s390_fixup
+		: "+d" (ret), "+a" (__src), [result] "+d" (__src_len)
+		: [e1] "K" (-EFAULT), [e2] "K" (-ENOSYS), "d" (__func),
+		  "a" (__param)
+		: "cc", "memory"
+	);
+	if (ret >= 0 && func & CRYPT_S390_FUNC_MASK){
+		ret = src_len - ret;
+	}
+	return ret;
+}
+
+/*
+ * Executes the KMAC (COMPUTE MESSAGE AUTHENTICATION CODE) operation
+ * of the CPU.
+ * @param func: the function code passed to KMAC; see crypt_s390_kmac_func
+ * @param param: address of parameter block; see POP for details on each func
+ * @param src: address of source memory area
+ * @param src_len: length of src operand in bytes
+ * @returns < zero for failure, 0 for the query func, number of processed bytes
+ * 	for digest funcs
+ */
+static inline int
+crypt_s390_kmac(long func, void* param, const u8* src, long src_len)
+{
+	register long __func asm("0") = func & CRYPT_S390_FUNC_MASK;
+	register void* __param asm("1") = param;
+	register const u8* __src asm("2") = src;
+	register long __src_len asm("3") = src_len;
+	int ret;
+	/* The register fields must name the src/src_len pair (%1 = __src in r2) as KIMD/KLMD do, not %5 (__func in r0). */
+	ret = 0;
+	__asm__ __volatile__ (
+		"0:	.insn	rre,0xB91E0000,%1,%1 \n" /* KMAC opcode */
+		"1:	brc	1,0b \n" /* handle partial completion */
+		__crypt_s390_set_result
+		"6:	\n"
+		__crypt_s390_fixup
+		: "+d" (ret), "+a" (__src), [result] "+d" (__src_len)
+		: [e1] "K" (-EFAULT), [e2] "K" (-ENOSYS), "d" (__func),
+		  "a" (__param)
+		: "cc", "memory"
+	);
+	if (ret >= 0 && func & CRYPT_S390_FUNC_MASK){
+		ret = src_len - ret;
+	}
+	return ret;
+}
+
+/**
+ * Tests if a specific crypto function is implemented on the machine.
+ * @param func:	the function code of the specific function; 0 if op in general
+ * @return	1 if func available; 0 if func or op in general not available
+ */
+static inline int
+crypt_s390_func_available(int func)
+{
+	int ret;
+
+	struct crypt_s390_query_status status = {
+		.high = 0,
+		.low = 0
+	};
+	switch (func & CRYPT_S390_OP_MASK){
+		case CRYPT_S390_KM:
+			ret = crypt_s390_km(KM_QUERY, &status, NULL, NULL, 0);
+			break;
+		case CRYPT_S390_KMC:
+			ret = crypt_s390_kmc(KMC_QUERY, &status, NULL, NULL, 0);
+			break;
+		case CRYPT_S390_KIMD:
+			ret = crypt_s390_kimd(KIMD_QUERY, &status, NULL, 0);
+			break;
+		case CRYPT_S390_KLMD:
+			ret = crypt_s390_klmd(KLMD_QUERY, &status, NULL, 0);
+			break;
+		case CRYPT_S390_KMAC:
+			ret = crypt_s390_kmac(KMAC_QUERY, &status, NULL, 0);
+			break;
+		default:
+			ret = 0;
+			return ret;
+	}
+	if (ret >= 0){
+		func &= CRYPT_S390_FUNC_MASK;
+		func &= 0x7f; //mask modifier bit
+		if (func < 64){
+			ret = (status.high >> (64 - func - 1)) & 0x1;
+		} else {
+			ret = (status.low >> (128 - func - 1)) & 0x1;
+		}
+	} else {
+		ret = 0;
+	}
+	return ret;
+}
+
+#endif // _CRYPTO_ARCH_S390_CRYPT_S390_H
diff --git a/arch/s390/crypto/crypt_s390_query.c b/arch/s390/crypto/crypt_s390_query.c
new file mode 100644
index 0000000..0fa6bdf
--- /dev/null
+++ b/arch/s390/crypto/crypt_s390_query.c
@@ -0,0 +1,113 @@
+/*
+ * Cryptographic API.
+ *
+ * Support for s390 cryptographic instructions.
+ * Testing module for querying processor crypto capabilities.
+ *
+ * Copyright (c) 2003 IBM Deutschland Entwicklung GmbH, IBM Corporation
+ * Author(s): Thomas Spatzier (tspat@de.ibm.com)
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <asm/errno.h>
+#include "crypt_s390.h"
+
+static void query_available_functions(void)
+{
+	printk(KERN_INFO "#####################\n");
+	/* Log the result of crypt_s390_func_available() for the QUERY code and every function code of each instruction. */
+	/* query available KM functions */
+	printk(KERN_INFO "KM_QUERY: %d\n",
+		crypt_s390_func_available(KM_QUERY));
+	printk(KERN_INFO "KM_DEA: %d\n",
+		crypt_s390_func_available(KM_DEA_ENCRYPT));
+	printk(KERN_INFO "KM_TDEA_128: %d\n",
+		crypt_s390_func_available(KM_TDEA_128_ENCRYPT));
+	printk(KERN_INFO "KM_TDEA_192: %d\n",
+		crypt_s390_func_available(KM_TDEA_192_ENCRYPT));
+
+	/* query available KMC functions */
+	printk(KERN_INFO "KMC_QUERY: %d\n",
+		crypt_s390_func_available(KMC_QUERY));
+	printk(KERN_INFO "KMC_DEA: %d\n",
+		crypt_s390_func_available(KMC_DEA_ENCRYPT));
+	printk(KERN_INFO "KMC_TDEA_128: %d\n",
+		crypt_s390_func_available(KMC_TDEA_128_ENCRYPT));
+	printk(KERN_INFO "KMC_TDEA_192: %d\n",
+		crypt_s390_func_available(KMC_TDEA_192_ENCRYPT));
+
+	/* query available KIMD functions */
+	printk(KERN_INFO "KIMD_QUERY: %d\n",
+		crypt_s390_func_available(KIMD_QUERY));
+	printk(KERN_INFO "KIMD_SHA_1: %d\n",
+		crypt_s390_func_available(KIMD_SHA_1));
+
+	/* query available KLMD functions */
+	printk(KERN_INFO "KLMD_QUERY: %d\n",
+		crypt_s390_func_available(KLMD_QUERY));
+	printk(KERN_INFO "KLMD_SHA_1: %d\n",
+		crypt_s390_func_available(KLMD_SHA_1));
+
+	/* query available KMAC functions */
+	printk(KERN_INFO "KMAC_QUERY: %d\n",
+		crypt_s390_func_available(KMAC_QUERY));
+	printk(KERN_INFO "KMAC_DEA: %d\n",
+		crypt_s390_func_available(KMAC_DEA));
+	printk(KERN_INFO "KMAC_TDEA_128: %d\n",
+		crypt_s390_func_available(KMAC_TDEA_128));
+	printk(KERN_INFO "KMAC_TDEA_192: %d\n",
+		crypt_s390_func_available(KMAC_TDEA_192));
+}
+
+static int init(void)
+{
+	struct crypt_s390_query_status status = {
+		.high = 0,
+		.low = 0
+	};
+	/* For each instruction: run its QUERY function with a zeroed status block as parameter, then print both 64-bit halves. */
+	printk(KERN_INFO "crypt_s390: querying available crypto functions\n");
+	crypt_s390_km(KM_QUERY, &status, NULL, NULL, 0);
+	printk(KERN_INFO "KM:\t%016llx %016llx\n",
+			(unsigned long long) status.high,
+			(unsigned long long) status.low);
+	status.high = status.low = 0;
+	crypt_s390_kmc(KMC_QUERY, &status, NULL, NULL, 0);
+	printk(KERN_INFO "KMC:\t%016llx %016llx\n",
+			(unsigned long long) status.high,
+			(unsigned long long) status.low);
+	status.high = status.low = 0;
+	crypt_s390_kimd(KIMD_QUERY, &status, NULL, 0);
+	printk(KERN_INFO "KIMD:\t%016llx %016llx\n",
+			(unsigned long long) status.high,
+			(unsigned long long) status.low);
+	status.high = status.low = 0;
+	crypt_s390_klmd(KLMD_QUERY, &status, NULL, 0);
+	printk(KERN_INFO "KLMD:\t%016llx %016llx\n",
+			(unsigned long long) status.high,
+			(unsigned long long) status.low);
+	status.high = status.low = 0;
+	crypt_s390_kmac(KMAC_QUERY, &status, NULL, 0);
+	printk(KERN_INFO "KMAC:\t%016llx %016llx\n",
+			(unsigned long long) status.high,
+			(unsigned long long) status.low);
+	/* Always returns an error after printing - presumably so this test module never stays loaded; confirm. */
+	query_available_functions();
+	return -ECANCELED;
+}
+
+static void __exit cleanup(void)
+{
+}
+
+module_init(init);
+module_exit(cleanup);
+
+MODULE_LICENSE("GPL");
diff --git a/arch/s390/crypto/crypt_z990.h b/arch/s390/crypto/crypt_z990.h
deleted file mode 100644
index 4df660b..0000000
--- a/arch/s390/crypto/crypt_z990.h
+++ /dev/null
@@ -1,374 +0,0 @@
-/*
- * Cryptographic API.
- *
- * Support for z990 cryptographic instructions.
- *
- *   Copyright (C) 2003 IBM Deutschland GmbH, IBM Corporation
- *   Author(s): Thomas Spatzier (tspat@de.ibm.com)
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- */
-#ifndef _CRYPTO_ARCH_S390_CRYPT_Z990_H
-#define _CRYPTO_ARCH_S390_CRYPT_Z990_H
-
-#include <asm/errno.h>
-
-#define CRYPT_Z990_OP_MASK 0xFF00
-#define CRYPT_Z990_FUNC_MASK 0x00FF
-
-
-/*z990 cryptographic operations*/
-enum crypt_z990_operations {
-	CRYPT_Z990_KM   = 0x0100,
-	CRYPT_Z990_KMC  = 0x0200,
-	CRYPT_Z990_KIMD = 0x0300,
-	CRYPT_Z990_KLMD = 0x0400,
-	CRYPT_Z990_KMAC = 0x0500
-};
-
-/*function codes for KM (CIPHER MESSAGE) instruction*/
-enum crypt_z990_km_func {
-	KM_QUERY            = CRYPT_Z990_KM | 0,
-	KM_DEA_ENCRYPT      = CRYPT_Z990_KM | 1,
-	KM_DEA_DECRYPT      = CRYPT_Z990_KM | 1 | 0x80, //modifier bit->decipher
-	KM_TDEA_128_ENCRYPT = CRYPT_Z990_KM | 2,
-	KM_TDEA_128_DECRYPT = CRYPT_Z990_KM | 2 | 0x80,
-	KM_TDEA_192_ENCRYPT = CRYPT_Z990_KM | 3,
-	KM_TDEA_192_DECRYPT = CRYPT_Z990_KM | 3 | 0x80,
-};
-
-/*function codes for KMC (CIPHER MESSAGE WITH CHAINING) instruction*/
-enum crypt_z990_kmc_func {
-	KMC_QUERY            = CRYPT_Z990_KMC | 0,
-	KMC_DEA_ENCRYPT      = CRYPT_Z990_KMC | 1,
-	KMC_DEA_DECRYPT      = CRYPT_Z990_KMC | 1 | 0x80, //modifier bit->decipher
-	KMC_TDEA_128_ENCRYPT = CRYPT_Z990_KMC | 2,
-	KMC_TDEA_128_DECRYPT = CRYPT_Z990_KMC | 2 | 0x80,
-	KMC_TDEA_192_ENCRYPT = CRYPT_Z990_KMC | 3,
-	KMC_TDEA_192_DECRYPT = CRYPT_Z990_KMC | 3 | 0x80,
-};
-
-/*function codes for KIMD (COMPUTE INTERMEDIATE MESSAGE DIGEST) instruction*/
-enum crypt_z990_kimd_func {
-	KIMD_QUERY   = CRYPT_Z990_KIMD | 0,
-	KIMD_SHA_1   = CRYPT_Z990_KIMD | 1,
-};
-
-/*function codes for KLMD (COMPUTE LAST MESSAGE DIGEST) instruction*/
-enum crypt_z990_klmd_func {
-	KLMD_QUERY   = CRYPT_Z990_KLMD | 0,
-	KLMD_SHA_1   = CRYPT_Z990_KLMD | 1,
-};
-
-/*function codes for KMAC (COMPUTE MESSAGE AUTHENTICATION CODE) instruction*/
-enum crypt_z990_kmac_func {
-	KMAC_QUERY    = CRYPT_Z990_KMAC | 0,
-	KMAC_DEA      = CRYPT_Z990_KMAC | 1,
-	KMAC_TDEA_128 = CRYPT_Z990_KMAC | 2,
-	KMAC_TDEA_192 = CRYPT_Z990_KMAC | 3
-};
-
-/*status word for z990 crypto instructions' QUERY functions*/
-struct crypt_z990_query_status {
-	u64 high;
-	u64 low;
-};
-
-/*
- * Standard fixup and ex_table sections for crypt_z990 inline functions.
- * label 0: the z990 crypto operation
- * label 1: just after 1 to catch illegal operation exception on non-z990
- * label 6: the return point after fixup
- * label 7: set error value if exception _in_ crypto operation
- * label 8: set error value if illegal operation exception
- * [ret] is the variable to receive the error code
- * [ERR] is the error code value
- */
-#ifndef __s390x__
-#define __crypt_z990_fixup \
-	".section .fixup,\"ax\" \n"	\
-	"7:	lhi	%0,%h[e1] \n"	\
-	"	bras	1,9f \n"	\
-	"	.long	6b \n"		\
-	"8:	lhi	%0,%h[e2] \n"	\
-	"	bras	1,9f \n"	\
-	"	.long	6b \n"		\
-	"9:	l	1,0(1) \n"	\
-	"	br	1 \n"		\
-	".previous \n"			\
-	".section __ex_table,\"a\" \n"	\
-	"	.align	4 \n"		\
-	"	.long	0b,7b \n"	\
-	"	.long	1b,8b \n"	\
-	".previous"
-#else /* __s390x__ */
-#define __crypt_z990_fixup \
-	".section .fixup,\"ax\" \n"	\
-	"7:	lhi	%0,%h[e1] \n"	\
-	"	jg	6b \n"		\
-	"8:	lhi	%0,%h[e2] \n"	\
-	"	jg	6b \n"		\
-	".previous\n"			\
-	".section __ex_table,\"a\" \n"	\
-	"	.align	8 \n"		\
-	"	.quad	0b,7b \n"	\
-	"	.quad	1b,8b \n"	\
-	".previous"
-#endif /* __s390x__ */
-
-/*
- * Standard code for setting the result of z990 crypto instructions.
- * %0: the register which will receive the result
- * [result]: the register containing the result (e.g. second operand length
- * to compute number of processed bytes].
- */
-#ifndef __s390x__
-#define __crypt_z990_set_result \
-	"	lr	%0,%[result] \n"
-#else /* __s390x__ */
-#define __crypt_z990_set_result \
-	"	lgr	%0,%[result] \n"
-#endif
-
-/*
- * Executes the KM (CIPHER MESSAGE) operation of the z990 CPU.
- * @param func: the function code passed to KM; see crypt_z990_km_func
- * @param param: address of parameter block; see POP for details on each func
- * @param dest: address of destination memory area
- * @param src: address of source memory area
- * @param src_len: length of src operand in bytes
- * @returns < zero for failure, 0 for the query func, number of processed bytes
- * 	for encryption/decryption funcs
- */
-static inline int
-crypt_z990_km(long func, void* param, u8* dest, const u8* src, long src_len)
-{
-	register long __func asm("0") = func & CRYPT_Z990_FUNC_MASK;
-	register void* __param asm("1") = param;
-	register u8* __dest asm("4") = dest;
-	register const u8* __src asm("2") = src;
-	register long __src_len asm("3") = src_len;
-	int ret;
-
-	ret = 0;
-	__asm__ __volatile__ (
-		"0:	.insn	rre,0xB92E0000,%1,%2 \n" //KM opcode
-		"1:	brc	1,0b \n" //handle partial completion
-		__crypt_z990_set_result
-		"6:	\n"
-		__crypt_z990_fixup
-		: "+d" (ret), "+a" (__dest), "+a" (__src),
-		  [result] "+d" (__src_len)
-		: [e1] "K" (-EFAULT), [e2] "K" (-ENOSYS), "d" (__func),
-		  "a" (__param)
-		: "cc", "memory"
-	);
-	if (ret >= 0 && func & CRYPT_Z990_FUNC_MASK){
-		ret = src_len - ret;
-	}
-	return ret;
-}
-
-/*
- * Executes the KMC (CIPHER MESSAGE WITH CHAINING) operation of the z990 CPU.
- * @param func: the function code passed to KM; see crypt_z990_kmc_func
- * @param param: address of parameter block; see POP for details on each func
- * @param dest: address of destination memory area
- * @param src: address of source memory area
- * @param src_len: length of src operand in bytes
- * @returns < zero for failure, 0 for the query func, number of processed bytes
- * 	for encryption/decryption funcs
- */
-static inline int
-crypt_z990_kmc(long func, void* param, u8* dest, const u8* src, long src_len)
-{
-	register long __func asm("0") = func & CRYPT_Z990_FUNC_MASK;
-	register void* __param asm("1") = param;
-	register u8* __dest asm("4") = dest;
-	register const u8* __src asm("2") = src;
-	register long __src_len asm("3") = src_len;
-	int ret;
-
-	ret = 0;
-	__asm__ __volatile__ (
-		"0:	.insn	rre,0xB92F0000,%1,%2 \n" //KMC opcode
-		"1:	brc	1,0b \n" //handle partial completion
-		__crypt_z990_set_result
-		"6:	\n"
-		__crypt_z990_fixup
-		: "+d" (ret), "+a" (__dest), "+a" (__src),
-		  [result] "+d" (__src_len)
-		: [e1] "K" (-EFAULT), [e2] "K" (-ENOSYS), "d" (__func),
-		  "a" (__param)
-		: "cc", "memory"
-	);
-	if (ret >= 0 && func & CRYPT_Z990_FUNC_MASK){
-		ret = src_len - ret;
-	}
-	return ret;
-}
-
-/*
- * Executes the KIMD (COMPUTE INTERMEDIATE MESSAGE DIGEST) operation
- * of the z990 CPU.
- * @param func: the function code passed to KM; see crypt_z990_kimd_func
- * @param param: address of parameter block; see POP for details on each func
- * @param src: address of source memory area
- * @param src_len: length of src operand in bytes
- * @returns < zero for failure, 0 for the query func, number of processed bytes
- * 	for digest funcs
- */
-static inline int
-crypt_z990_kimd(long func, void* param, const u8* src, long src_len)
-{
-	register long __func asm("0") = func & CRYPT_Z990_FUNC_MASK;
-	register void* __param asm("1") = param;
-	register const u8* __src asm("2") = src;
-	register long __src_len asm("3") = src_len;
-	int ret;
-
-	ret = 0;
-	__asm__ __volatile__ (
-		"0:	.insn	rre,0xB93E0000,%1,%1 \n" //KIMD opcode
-		"1:	brc	1,0b \n" /*handle partical completion of kimd*/
-		__crypt_z990_set_result
-		"6:	\n"
-		__crypt_z990_fixup
-		: "+d" (ret), "+a" (__src), [result] "+d" (__src_len)
-		: [e1] "K" (-EFAULT), [e2] "K" (-ENOSYS), "d" (__func),
-		  "a" (__param)
-		: "cc", "memory"
-	);
-	if (ret >= 0 && (func & CRYPT_Z990_FUNC_MASK)){
-		ret = src_len - ret;
-	}
-	return ret;
-}
-
-/*
- * Executes the KLMD (COMPUTE LAST MESSAGE DIGEST) operation of the z990 CPU.
- * @param func: the function code passed to KM; see crypt_z990_klmd_func
- * @param param: address of parameter block; see POP for details on each func
- * @param src: address of source memory area
- * @param src_len: length of src operand in bytes
- * @returns < zero for failure, 0 for the query func, number of processed bytes
- * 	for digest funcs
- */
-static inline int
-crypt_z990_klmd(long func, void* param, const u8* src, long src_len)
-{
-	register long __func asm("0") = func & CRYPT_Z990_FUNC_MASK;
-	register void* __param asm("1") = param;
-	register const u8* __src asm("2") = src;
-	register long __src_len asm("3") = src_len;
-	int ret;
-
-	ret = 0;
-	__asm__ __volatile__ (
-		"0:	.insn	rre,0xB93F0000,%1,%1 \n" //KLMD opcode
-		"1:	brc	1,0b \n" /*handle partical completion of klmd*/
-		__crypt_z990_set_result
-		"6:	\n"
-		__crypt_z990_fixup
-		: "+d" (ret), "+a" (__src), [result] "+d" (__src_len)
-		: [e1] "K" (-EFAULT), [e2] "K" (-ENOSYS), "d" (__func),
-		  "a" (__param)
-		: "cc", "memory"
-	);
-	if (ret >= 0 && func & CRYPT_Z990_FUNC_MASK){
-		ret = src_len - ret;
-	}
-	return ret;
-}
-
-/*
- * Executes the KMAC (COMPUTE MESSAGE AUTHENTICATION CODE) operation
- * of the z990 CPU.
- * @param func: the function code passed to KM; see crypt_z990_klmd_func
- * @param param: address of parameter block; see POP for details on each func
- * @param src: address of source memory area
- * @param src_len: length of src operand in bytes
- * @returns < zero for failure, 0 for the query func, number of processed bytes
- * 	for digest funcs
- */
-static inline int
-crypt_z990_kmac(long func, void* param, const u8* src, long src_len)
-{
-	register long __func asm("0") = func & CRYPT_Z990_FUNC_MASK;
-	register void* __param asm("1") = param;
-	register const u8* __src asm("2") = src;
-	register long __src_len asm("3") = src_len;
-	int ret;
-
-	ret = 0;
-	__asm__ __volatile__ (
-		"0:	.insn	rre,0xB91E0000,%5,%5 \n" //KMAC opcode
-		"1:	brc	1,0b \n" /*handle partical completion of klmd*/
-		__crypt_z990_set_result
-		"6:	\n"
-		__crypt_z990_fixup
-		: "+d" (ret), "+a" (__src), [result] "+d" (__src_len)
-		: [e1] "K" (-EFAULT), [e2] "K" (-ENOSYS), "d" (__func),
-		  "a" (__param)
-		: "cc", "memory"
-	);
-	if (ret >= 0 && func & CRYPT_Z990_FUNC_MASK){
-		ret = src_len - ret;
-	}
-	return ret;
-}
-
-/**
- * Tests if a specific z990 crypto function is implemented on the machine.
- * @param func:	the function code of the specific function; 0 if op in general
- * @return	1 if func available; 0 if func or op in general not available
- */
-static inline int
-crypt_z990_func_available(int func)
-{
-	int ret;
-
-	struct crypt_z990_query_status status = {
-		.high = 0,
-		.low = 0
-	};
-	switch (func & CRYPT_Z990_OP_MASK){
-		case CRYPT_Z990_KM:
-			ret = crypt_z990_km(KM_QUERY, &status, NULL, NULL, 0);
-			break;
-		case CRYPT_Z990_KMC:
-			ret = crypt_z990_kmc(KMC_QUERY, &status, NULL, NULL, 0);
-			break;
-		case CRYPT_Z990_KIMD:
-			ret = crypt_z990_kimd(KIMD_QUERY, &status, NULL, 0);
-			break;
-		case CRYPT_Z990_KLMD:
-			ret = crypt_z990_klmd(KLMD_QUERY, &status, NULL, 0);
-			break;
-		case CRYPT_Z990_KMAC:
-			ret = crypt_z990_kmac(KMAC_QUERY, &status, NULL, 0);
-			break;
-		default:
-			ret = 0;
-			return ret;
-	}
-	if (ret >= 0){
-		func &= CRYPT_Z990_FUNC_MASK;
-		func &= 0x7f; //mask modifier bit
-		if (func < 64){
-			ret = (status.high >> (64 - func - 1)) & 0x1;
-		} else {
-			ret = (status.low >> (128 - func - 1)) & 0x1;
-		}
-	} else {
-		ret = 0;
-	}
-	return ret;
-}
-
-
-#endif // _CRYPTO_ARCH_S390_CRYPT_Z990_H
diff --git a/arch/s390/crypto/crypt_z990_query.c b/arch/s390/crypto/crypt_z990_query.c
deleted file mode 100644
index 7133983..0000000
--- a/arch/s390/crypto/crypt_z990_query.c
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Cryptographic API.
- *
- * Support for z990 cryptographic instructions.
- * Testing module for querying processor crypto capabilities.
- *
- * Copyright (c) 2003 IBM Deutschland Entwicklung GmbH, IBM Corporation
- * Author(s): Thomas Spatzier (tspat@de.ibm.com)
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- */
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <asm/errno.h>
-#include "crypt_z990.h"
-
-static void
-query_available_functions(void)
-{
-	printk(KERN_INFO "#####################\n");
-	//query available KM functions
-	printk(KERN_INFO "KM_QUERY: %d\n",
-			crypt_z990_func_available(KM_QUERY));
-	printk(KERN_INFO "KM_DEA: %d\n",
-			crypt_z990_func_available(KM_DEA_ENCRYPT));
-	printk(KERN_INFO "KM_TDEA_128: %d\n",
-			crypt_z990_func_available(KM_TDEA_128_ENCRYPT));
-	printk(KERN_INFO "KM_TDEA_192: %d\n",
-			crypt_z990_func_available(KM_TDEA_192_ENCRYPT));
-	//query available KMC functions
-	printk(KERN_INFO "KMC_QUERY: %d\n",
-			crypt_z990_func_available(KMC_QUERY));
-	printk(KERN_INFO "KMC_DEA: %d\n",
-			crypt_z990_func_available(KMC_DEA_ENCRYPT));
-	printk(KERN_INFO "KMC_TDEA_128: %d\n",
-			crypt_z990_func_available(KMC_TDEA_128_ENCRYPT));
-	printk(KERN_INFO "KMC_TDEA_192: %d\n",
-			crypt_z990_func_available(KMC_TDEA_192_ENCRYPT));
-	//query available KIMD fucntions
-	printk(KERN_INFO "KIMD_QUERY: %d\n",
-			crypt_z990_func_available(KIMD_QUERY));
-	printk(KERN_INFO "KIMD_SHA_1: %d\n",
-			crypt_z990_func_available(KIMD_SHA_1));
-	//query available KLMD functions
-	printk(KERN_INFO "KLMD_QUERY: %d\n",
-			crypt_z990_func_available(KLMD_QUERY));
-	printk(KERN_INFO "KLMD_SHA_1: %d\n",
-			crypt_z990_func_available(KLMD_SHA_1));
-	//query available KMAC functions
-	printk(KERN_INFO "KMAC_QUERY: %d\n",
-			crypt_z990_func_available(KMAC_QUERY));
-	printk(KERN_INFO "KMAC_DEA: %d\n",
-			crypt_z990_func_available(KMAC_DEA));
-	printk(KERN_INFO "KMAC_TDEA_128: %d\n",
-			crypt_z990_func_available(KMAC_TDEA_128));
-	printk(KERN_INFO "KMAC_TDEA_192: %d\n",
-			crypt_z990_func_available(KMAC_TDEA_192));
-}
-
-static int
-init(void)
-{
-	struct crypt_z990_query_status status = {
-		.high = 0,
-		.low = 0
-	};
-
-	printk(KERN_INFO "crypt_z990: querying available crypto functions\n");
-	crypt_z990_km(KM_QUERY, &status, NULL, NULL, 0);
-	printk(KERN_INFO "KM: %016llx %016llx\n",
-			(unsigned long long) status.high,
-			(unsigned long long) status.low);
-	status.high = status.low = 0;
-	crypt_z990_kmc(KMC_QUERY, &status, NULL, NULL, 0);
-	printk(KERN_INFO "KMC: %016llx %016llx\n",
-			(unsigned long long) status.high,
-			(unsigned long long) status.low);
-	status.high = status.low = 0;
-	crypt_z990_kimd(KIMD_QUERY, &status, NULL, 0);
-	printk(KERN_INFO "KIMD: %016llx %016llx\n",
-			(unsigned long long) status.high,
-			(unsigned long long) status.low);
-	status.high = status.low = 0;
-	crypt_z990_klmd(KLMD_QUERY, &status, NULL, 0);
-	printk(KERN_INFO "KLMD: %016llx %016llx\n",
-			(unsigned long long) status.high,
-			(unsigned long long) status.low);
-	status.high = status.low = 0;
-	crypt_z990_kmac(KMAC_QUERY, &status, NULL, 0);
-	printk(KERN_INFO "KMAC: %016llx %016llx\n",
-			(unsigned long long) status.high,
-			(unsigned long long) status.low);
-
-	query_available_functions();
-	return -1;
-}
-
-static void __exit
-cleanup(void)
-{
-}
-
-module_init(init);
-module_exit(cleanup);
-
-MODULE_LICENSE("GPL");
diff --git a/arch/s390/crypto/des_s390.c b/arch/s390/crypto/des_s390.c
new file mode 100644
index 0000000..a38bb2a
--- /dev/null
+++ b/arch/s390/crypto/des_s390.c
@@ -0,0 +1,284 @@
+/*
+ * Cryptographic API.
+ *
+ * s390 implementation of the DES Cipher Algorithm.
+ *
+ * Copyright (c) 2003 IBM Deutschland Entwicklung GmbH, IBM Corporation
+ * Author(s): Thomas Spatzier (tspat@de.ibm.com)
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <asm/scatterlist.h>
+#include <linux/crypto.h>
+#include "crypt_s390.h"
+#include "crypto_des.h"
+
+#define DES_BLOCK_SIZE 8
+#define DES_KEY_SIZE 8
+
+#define DES3_128_KEY_SIZE	(2 * DES_KEY_SIZE)
+#define DES3_128_BLOCK_SIZE	DES_BLOCK_SIZE
+
+#define DES3_192_KEY_SIZE	(3 * DES_KEY_SIZE)
+#define DES3_192_BLOCK_SIZE	DES_BLOCK_SIZE
+
+struct crypt_s390_des_ctx {
+	u8 iv[DES_BLOCK_SIZE];
+	u8 key[DES_KEY_SIZE];
+};
+
+struct crypt_s390_des3_128_ctx {
+	u8 iv[DES_BLOCK_SIZE];
+	u8 key[DES3_128_KEY_SIZE];
+};
+
+struct crypt_s390_des3_192_ctx {
+	u8 iv[DES_BLOCK_SIZE];
+	u8 key[DES3_192_KEY_SIZE];
+};
+
+static int
+des_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *flags)
+{
+	struct crypt_s390_des_ctx *dctx;
+	int ret;
+
+	dctx = ctx;
+	//test if key is valid (not a weak key)
+	ret = crypto_des_check_key(key, keylen, flags);
+	if (ret == 0){
+		memcpy(dctx->key, key, keylen);
+	}
+	return ret;
+}
+
+
+static void
+des_encrypt(void *ctx, u8 *dst, const u8 *src)
+{
+	struct crypt_s390_des_ctx *dctx;
+
+	dctx = ctx;
+	crypt_s390_km(KM_DEA_ENCRYPT, dctx->key, dst, src, DES_BLOCK_SIZE);
+}
+
+static void
+des_decrypt(void *ctx, u8 *dst, const u8 *src)
+{
+	struct crypt_s390_des_ctx *dctx;
+
+	dctx = ctx;
+	crypt_s390_km(KM_DEA_DECRYPT, dctx->key, dst, src, DES_BLOCK_SIZE);
+}
+
+static struct crypto_alg des_alg = {
+	.cra_name		=	"des",
+	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		=	DES_BLOCK_SIZE,
+	.cra_ctxsize		=	sizeof(struct crypt_s390_des_ctx),
+	.cra_module		=	THIS_MODULE,
+	.cra_list		=	LIST_HEAD_INIT(des_alg.cra_list),
+	.cra_u			=	{ .cipher = {
+	.cia_min_keysize	=	DES_KEY_SIZE,
+	.cia_max_keysize	=	DES_KEY_SIZE,
+	.cia_setkey		= 	des_setkey,
+	.cia_encrypt		=	des_encrypt,
+	.cia_decrypt		=	des_decrypt } }
+};
+
+/*
+ * RFC2451:
+ *
+ *   For DES-EDE3, there is no known need to reject weak or
+ *   complementation keys.  Any weakness is obviated by the use of
+ *   multiple keys.
+ *
+ *   However, if the two  independent 64-bit keys are equal,
+ *   then the DES3 operation is simply the same as DES.
+ *   Implementers MUST reject keys that exhibit this property.
+ *
+ */
+static int
+des3_128_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *flags)
+{
+	int i, ret;
+	struct crypt_s390_des3_128_ctx *dctx;
+	const u8* temp_key = key;
+
+	dctx = ctx;
+	if (!(memcmp(key, &key[DES_KEY_SIZE], DES_KEY_SIZE))) {
+
+		*flags |= CRYPTO_TFM_RES_BAD_KEY_SCHED;
+		return -EINVAL;
+	}
+	for (i = 0; i < 2; i++,	temp_key += DES_KEY_SIZE) {
+		ret = crypto_des_check_key(temp_key, DES_KEY_SIZE, flags);
+		if (ret < 0)
+			return ret;
+	}
+	memcpy(dctx->key, key, keylen);
+	return 0;
+}
+
+static void
+des3_128_encrypt(void *ctx, u8 *dst, const u8 *src)
+{
+	struct crypt_s390_des3_128_ctx *dctx;
+
+	dctx = ctx;
+	crypt_s390_km(KM_TDEA_128_ENCRYPT, dctx->key, dst, (void*)src,
+			DES3_128_BLOCK_SIZE);
+}
+
+static void
+des3_128_decrypt(void *ctx, u8 *dst, const u8 *src)
+{
+	struct crypt_s390_des3_128_ctx *dctx;
+
+	dctx = ctx;
+	crypt_s390_km(KM_TDEA_128_DECRYPT, dctx->key, dst, (void*)src,
+			DES3_128_BLOCK_SIZE);
+}
+
+static struct crypto_alg des3_128_alg = {
+	.cra_name		=	"des3_ede128",
+	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		=	DES3_128_BLOCK_SIZE,
+	.cra_ctxsize		=	sizeof(struct crypt_s390_des3_128_ctx),
+	.cra_module		=	THIS_MODULE,
+	.cra_list		=	LIST_HEAD_INIT(des3_128_alg.cra_list),
+	.cra_u			=	{ .cipher = {
+	.cia_min_keysize	=	DES3_128_KEY_SIZE,
+	.cia_max_keysize	=	DES3_128_KEY_SIZE,
+	.cia_setkey		= 	des3_128_setkey,
+	.cia_encrypt		=	des3_128_encrypt,
+	.cia_decrypt		=	des3_128_decrypt } }
+};
+
+/*
+ * RFC2451:
+ *
+ *   For DES-EDE3, there is no known need to reject weak or
+ *   complementation keys.  Any weakness is obviated by the use of
+ *   multiple keys.
+ *
+ *   However, if the first two or last two independent 64-bit keys are
+ *   equal (k1 == k2 or k2 == k3), then the DES3 operation is simply the
+ *   same as DES.  Implementers MUST reject keys that exhibit this
+ *   property.
+ *
+ */
+static int
+des3_192_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *flags)
+{
+	int i, ret;
+	struct crypt_s390_des3_192_ctx *dctx;
+	const u8* temp_key;
+
+	dctx = ctx;
+	temp_key = key;
+	if (!(memcmp(key, &key[DES_KEY_SIZE], DES_KEY_SIZE) &&
+	    memcmp(&key[DES_KEY_SIZE], &key[DES_KEY_SIZE * 2],
+	    					DES_KEY_SIZE))) {
+
+		*flags |= CRYPTO_TFM_RES_BAD_KEY_SCHED;
+		return -EINVAL;
+	}
+	for (i = 0; i < 3; i++, temp_key += DES_KEY_SIZE) {
+		ret = crypto_des_check_key(temp_key, DES_KEY_SIZE, flags);
+		if (ret < 0){
+			return ret;
+		}
+	}
+	memcpy(dctx->key, key, keylen);
+	return 0;
+}
+
+static void
+des3_192_encrypt(void *ctx, u8 *dst, const u8 *src)
+{
+	struct crypt_s390_des3_192_ctx *dctx;
+
+	dctx = ctx;
+	crypt_s390_km(KM_TDEA_192_ENCRYPT, dctx->key, dst, (void*)src,
+			DES3_192_BLOCK_SIZE);
+}
+
+static void
+des3_192_decrypt(void *ctx, u8 *dst, const u8 *src)
+{
+	struct crypt_s390_des3_192_ctx *dctx;
+
+	dctx = ctx;
+	crypt_s390_km(KM_TDEA_192_DECRYPT, dctx->key, dst, (void*)src,
+			DES3_192_BLOCK_SIZE);
+}
+
+static struct crypto_alg des3_192_alg = {
+	.cra_name		=	"des3_ede",
+	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		=	DES3_192_BLOCK_SIZE,
+	.cra_ctxsize		=	sizeof(struct crypt_s390_des3_192_ctx),
+	.cra_module		=	THIS_MODULE,
+	.cra_list		=	LIST_HEAD_INIT(des3_192_alg.cra_list),
+	.cra_u			=	{ .cipher = {
+	.cia_min_keysize	=	DES3_192_KEY_SIZE,
+	.cia_max_keysize	=	DES3_192_KEY_SIZE,
+	.cia_setkey		= 	des3_192_setkey,
+	.cia_encrypt		=	des3_192_encrypt,
+	.cia_decrypt		=	des3_192_decrypt } }
+};
+
+
+
+static int
+init(void)
+{
+	int ret;
+
+	if (!crypt_s390_func_available(KM_DEA_ENCRYPT) ||
+	    !crypt_s390_func_available(KM_TDEA_128_ENCRYPT) ||
+	    !crypt_s390_func_available(KM_TDEA_192_ENCRYPT)){
+		return -ENOSYS;
+	}
+
+	ret = 0;
+	ret |= (crypto_register_alg(&des_alg) == 0)? 0:1;
+	ret |= (crypto_register_alg(&des3_128_alg) == 0)? 0:2;
+	ret |= (crypto_register_alg(&des3_192_alg) == 0)? 0:4;
+	if (ret){
+		crypto_unregister_alg(&des3_192_alg);
+		crypto_unregister_alg(&des3_128_alg);
+		crypto_unregister_alg(&des_alg);
+		return -EEXIST;
+	}
+
+	printk(KERN_INFO "crypt_s390: des_s390 loaded.\n");
+	return 0;
+}
+
+static void __exit
+fini(void)
+{
+	crypto_unregister_alg(&des3_192_alg);
+	crypto_unregister_alg(&des3_128_alg);
+	crypto_unregister_alg(&des_alg);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_ALIAS("des");
+MODULE_ALIAS("des3_ede");
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("DES & Triple DES EDE Cipher Algorithms");
diff --git a/arch/s390/crypto/des_z990.c b/arch/s390/crypto/des_z990.c
deleted file mode 100644
index 813cf37..0000000
--- a/arch/s390/crypto/des_z990.c
+++ /dev/null
@@ -1,284 +0,0 @@
-/*
- * Cryptographic API.
- *
- * z990 implementation of the DES Cipher Algorithm.
- *
- * Copyright (c) 2003 IBM Deutschland Entwicklung GmbH, IBM Corporation
- * Author(s): Thomas Spatzier (tspat@de.ibm.com)
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- */
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/errno.h>
-#include <asm/scatterlist.h>
-#include <linux/crypto.h>
-#include "crypt_z990.h"
-#include "crypto_des.h"
-
-#define DES_BLOCK_SIZE 8
-#define DES_KEY_SIZE 8
-
-#define DES3_128_KEY_SIZE	(2 * DES_KEY_SIZE)
-#define DES3_128_BLOCK_SIZE	DES_BLOCK_SIZE
-
-#define DES3_192_KEY_SIZE	(3 * DES_KEY_SIZE)
-#define DES3_192_BLOCK_SIZE	DES_BLOCK_SIZE
-
-struct crypt_z990_des_ctx {
-	u8 iv[DES_BLOCK_SIZE];
-	u8 key[DES_KEY_SIZE];
-};
-
-struct crypt_z990_des3_128_ctx {
-	u8 iv[DES_BLOCK_SIZE];
-	u8 key[DES3_128_KEY_SIZE];
-};
-
-struct crypt_z990_des3_192_ctx {
-	u8 iv[DES_BLOCK_SIZE];
-	u8 key[DES3_192_KEY_SIZE];
-};
-
-static int
-des_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *flags)
-{
-	struct crypt_z990_des_ctx *dctx;
-	int ret;
-
-	dctx = ctx;
-	//test if key is valid (not a weak key)
-	ret = crypto_des_check_key(key, keylen, flags);
-	if (ret == 0){
-		memcpy(dctx->key, key, keylen);
-	}
-	return ret;
-}
-
-
-static void
-des_encrypt(void *ctx, u8 *dst, const u8 *src)
-{
-	struct crypt_z990_des_ctx *dctx;
-
-	dctx = ctx;
-	crypt_z990_km(KM_DEA_ENCRYPT, dctx->key, dst, src, DES_BLOCK_SIZE);
-}
-
-static void
-des_decrypt(void *ctx, u8 *dst, const u8 *src)
-{
-	struct crypt_z990_des_ctx *dctx;
-
-	dctx = ctx;
-	crypt_z990_km(KM_DEA_DECRYPT, dctx->key, dst, src, DES_BLOCK_SIZE);
-}
-
-static struct crypto_alg des_alg = {
-	.cra_name		=	"des",
-	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
-	.cra_blocksize		=	DES_BLOCK_SIZE,
-	.cra_ctxsize		=	sizeof(struct crypt_z990_des_ctx),
-	.cra_module		=	THIS_MODULE,
-	.cra_list		=	LIST_HEAD_INIT(des_alg.cra_list),
-	.cra_u			=	{ .cipher = {
-	.cia_min_keysize	=	DES_KEY_SIZE,
-	.cia_max_keysize	=	DES_KEY_SIZE,
-	.cia_setkey		= 	des_setkey,
-	.cia_encrypt		=	des_encrypt,
-	.cia_decrypt		=	des_decrypt } }
-};
-
-/*
- * RFC2451:
- *
- *   For DES-EDE3, there is no known need to reject weak or
- *   complementation keys.  Any weakness is obviated by the use of
- *   multiple keys.
- *
- *   However, if the two  independent 64-bit keys are equal,
- *   then the DES3 operation is simply the same as DES.
- *   Implementers MUST reject keys that exhibit this property.
- *
- */
-static int
-des3_128_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *flags)
-{
-	int i, ret;
-	struct crypt_z990_des3_128_ctx *dctx;
-	const u8* temp_key = key;
-
-	dctx = ctx;
-	if (!(memcmp(key, &key[DES_KEY_SIZE], DES_KEY_SIZE))) {
-
-		*flags |= CRYPTO_TFM_RES_BAD_KEY_SCHED;
-		return -EINVAL;
-	}
-	for (i = 0; i < 2; i++,	temp_key += DES_KEY_SIZE) {
-		ret = crypto_des_check_key(temp_key, DES_KEY_SIZE, flags);
-		if (ret < 0)
-			return ret;
-	}
-	memcpy(dctx->key, key, keylen);
-	return 0;
-}
-
-static void
-des3_128_encrypt(void *ctx, u8 *dst, const u8 *src)
-{
-	struct crypt_z990_des3_128_ctx *dctx;
-
-	dctx = ctx;
-	crypt_z990_km(KM_TDEA_128_ENCRYPT, dctx->key, dst, (void*)src,
-			DES3_128_BLOCK_SIZE);
-}
-
-static void
-des3_128_decrypt(void *ctx, u8 *dst, const u8 *src)
-{
-	struct crypt_z990_des3_128_ctx *dctx;
-
-	dctx = ctx;
-	crypt_z990_km(KM_TDEA_128_DECRYPT, dctx->key, dst, (void*)src,
-			DES3_128_BLOCK_SIZE);
-}
-
-static struct crypto_alg des3_128_alg = {
-	.cra_name		=	"des3_ede128",
-	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
-	.cra_blocksize		=	DES3_128_BLOCK_SIZE,
-	.cra_ctxsize		=	sizeof(struct crypt_z990_des3_128_ctx),
-	.cra_module		=	THIS_MODULE,
-	.cra_list		=	LIST_HEAD_INIT(des3_128_alg.cra_list),
-	.cra_u			=	{ .cipher = {
-	.cia_min_keysize	=	DES3_128_KEY_SIZE,
-	.cia_max_keysize	=	DES3_128_KEY_SIZE,
-	.cia_setkey		= 	des3_128_setkey,
-	.cia_encrypt		=	des3_128_encrypt,
-	.cia_decrypt		=	des3_128_decrypt } }
-};
-
-/*
- * RFC2451:
- *
- *   For DES-EDE3, there is no known need to reject weak or
- *   complementation keys.  Any weakness is obviated by the use of
- *   multiple keys.
- *
- *   However, if the first two or last two independent 64-bit keys are
- *   equal (k1 == k2 or k2 == k3), then the DES3 operation is simply the
- *   same as DES.  Implementers MUST reject keys that exhibit this
- *   property.
- *
- */
-static int
-des3_192_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *flags)
-{
-	int i, ret;
-	struct crypt_z990_des3_192_ctx *dctx;
-	const u8* temp_key;
-
-	dctx = ctx;
-	temp_key = key;
-	if (!(memcmp(key, &key[DES_KEY_SIZE], DES_KEY_SIZE) &&
-	    memcmp(&key[DES_KEY_SIZE], &key[DES_KEY_SIZE * 2],
-	    					DES_KEY_SIZE))) {
-
-		*flags |= CRYPTO_TFM_RES_BAD_KEY_SCHED;
-		return -EINVAL;
-	}
-	for (i = 0; i < 3; i++, temp_key += DES_KEY_SIZE) {
-		ret = crypto_des_check_key(temp_key, DES_KEY_SIZE, flags);
-		if (ret < 0){
-			return ret;
-		}
-	}
-	memcpy(dctx->key, key, keylen);
-	return 0;
-}
-
-static void
-des3_192_encrypt(void *ctx, u8 *dst, const u8 *src)
-{
-	struct crypt_z990_des3_192_ctx *dctx;
-
-	dctx = ctx;
-	crypt_z990_km(KM_TDEA_192_ENCRYPT, dctx->key, dst, (void*)src,
-			DES3_192_BLOCK_SIZE);
-}
-
-static void
-des3_192_decrypt(void *ctx, u8 *dst, const u8 *src)
-{
-	struct crypt_z990_des3_192_ctx *dctx;
-
-	dctx = ctx;
-	crypt_z990_km(KM_TDEA_192_DECRYPT, dctx->key, dst, (void*)src,
-			DES3_192_BLOCK_SIZE);
-}
-
-static struct crypto_alg des3_192_alg = {
-	.cra_name		=	"des3_ede",
-	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
-	.cra_blocksize		=	DES3_192_BLOCK_SIZE,
-	.cra_ctxsize		=	sizeof(struct crypt_z990_des3_192_ctx),
-	.cra_module		=	THIS_MODULE,
-	.cra_list		=	LIST_HEAD_INIT(des3_192_alg.cra_list),
-	.cra_u			=	{ .cipher = {
-	.cia_min_keysize	=	DES3_192_KEY_SIZE,
-	.cia_max_keysize	=	DES3_192_KEY_SIZE,
-	.cia_setkey		= 	des3_192_setkey,
-	.cia_encrypt		=	des3_192_encrypt,
-	.cia_decrypt		=	des3_192_decrypt } }
-};
-
-
-
-static int
-init(void)
-{
-	int ret;
-
-	if (!crypt_z990_func_available(KM_DEA_ENCRYPT) ||
-	    !crypt_z990_func_available(KM_TDEA_128_ENCRYPT) ||
-	    !crypt_z990_func_available(KM_TDEA_192_ENCRYPT)){
-		return -ENOSYS;
-	}
-
-	ret = 0;
-	ret |= (crypto_register_alg(&des_alg) == 0)? 0:1;
-	ret |= (crypto_register_alg(&des3_128_alg) == 0)? 0:2;
-	ret |= (crypto_register_alg(&des3_192_alg) == 0)? 0:4;
-	if (ret){
-		crypto_unregister_alg(&des3_192_alg);
-		crypto_unregister_alg(&des3_128_alg);
-		crypto_unregister_alg(&des_alg);
-		return -EEXIST;
-	}
-
-	printk(KERN_INFO "crypt_z990: des_z990 loaded.\n");
-	return 0;
-}
-
-static void __exit
-fini(void)
-{
-	crypto_unregister_alg(&des3_192_alg);
-	crypto_unregister_alg(&des3_128_alg);
-	crypto_unregister_alg(&des_alg);
-}
-
-module_init(init);
-module_exit(fini);
-
-MODULE_ALIAS("des");
-MODULE_ALIAS("des3_ede");
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("DES & Triple DES EDE Cipher Algorithms");
diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c
new file mode 100644
index 0000000..98c896b
--- /dev/null
+++ b/arch/s390/crypto/sha1_s390.c
@@ -0,0 +1,167 @@
+/*
+ * Cryptographic API.
+ *
+ * s390 implementation of the SHA1 Secure Hash Algorithm.
+ *
+ * Derived from cryptoapi implementation, adapted for in-place
+ * scatterlist interface.  Originally based on the public domain
+ * implementation written by Steve Reid.
+ *
+ * s390 Version:
+ *   Copyright (C) 2003 IBM Deutschland GmbH, IBM Corporation
+ *   Author(s): Thomas Spatzier (tspat@de.ibm.com)
+ *
+ * Derived from "crypto/sha1.c"
+ *   Copyright (c) Alan Smithee.
+ *   Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ *   Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/crypto.h>
+#include <asm/scatterlist.h>
+#include <asm/byteorder.h>
+#include "crypt_s390.h"
+
+#define SHA1_DIGEST_SIZE	20
+#define SHA1_BLOCK_SIZE		64
+
+struct crypt_s390_sha1_ctx {
+	u64 count;
+	u32 state[5];
+	u32 buf_len;
+	u8 buffer[2 * SHA1_BLOCK_SIZE];
+};
+
+static void
+sha1_init(void *ctx)
+{
+	static const struct crypt_s390_sha1_ctx initstate = {
+		.state = {
+			0x67452301,
+			0xEFCDAB89,
+			0x98BADCFE,
+			0x10325476,
+			0xC3D2E1F0
+		},
+	};
+	memcpy(ctx, &initstate, sizeof(initstate));
+}
+
+static void
+sha1_update(void *ctx, const u8 *data, unsigned int len)
+{
+	struct crypt_s390_sha1_ctx *sctx;
+	long imd_len;
+
+	sctx = ctx;
+	sctx->count += len * 8; //message bit length
+
+	//anything in buffer yet? -> must be completed
+	if (sctx->buf_len && (sctx->buf_len + len) >= SHA1_BLOCK_SIZE) {
+		//complete full block and hash
+		memcpy(sctx->buffer + sctx->buf_len, data,
+				SHA1_BLOCK_SIZE - sctx->buf_len);
+		crypt_s390_kimd(KIMD_SHA_1, sctx->state, sctx->buffer,
+				SHA1_BLOCK_SIZE);
+		data += SHA1_BLOCK_SIZE - sctx->buf_len;
+		len -= SHA1_BLOCK_SIZE - sctx->buf_len;
+		sctx->buf_len = 0;
+	}
+
+	//rest of data contains full blocks?
+	imd_len = len & ~0x3ful;
+	if (imd_len){
+		crypt_s390_kimd(KIMD_SHA_1, sctx->state, data, imd_len);
+		data += imd_len;
+		len -= imd_len;
+	}
+	//anything left? store in buffer
+	if (len){
+		memcpy(sctx->buffer + sctx->buf_len , data, len);
+		sctx->buf_len += len;
+	}
+}
+
+
+static void
+pad_message(struct crypt_s390_sha1_ctx* sctx)
+{
+	int index;
+
+	index = sctx->buf_len;
+	sctx->buf_len = (sctx->buf_len < 56)?
+		SHA1_BLOCK_SIZE:2 * SHA1_BLOCK_SIZE;
+	//start pad with 1
+	sctx->buffer[index] = 0x80;
+	//pad with zeros
+	index++;
+	memset(sctx->buffer + index, 0x00, sctx->buf_len - index);
+	//append length
+	memcpy(sctx->buffer + sctx->buf_len - 8, &sctx->count,
+			sizeof sctx->count);
+}
+
+/* Add padding and return the message digest. */
+static void
+sha1_final(void* ctx, u8 *out)
+{
+	struct crypt_s390_sha1_ctx *sctx = ctx;
+
+	//must perform manual padding
+	pad_message(sctx);
+	crypt_s390_kimd(KIMD_SHA_1, sctx->state, sctx->buffer, sctx->buf_len);
+	//copy digest to out
+	memcpy(out, sctx->state, SHA1_DIGEST_SIZE);
+	/* Wipe context */
+	memset(sctx, 0, sizeof *sctx);
+}
+
+static struct crypto_alg alg = {
+	.cra_name	=	"sha1",
+	.cra_flags	=	CRYPTO_ALG_TYPE_DIGEST,
+	.cra_blocksize	=	SHA1_BLOCK_SIZE,
+	.cra_ctxsize	=	sizeof(struct crypt_s390_sha1_ctx),
+	.cra_module	=	THIS_MODULE,
+	.cra_list       =       LIST_HEAD_INIT(alg.cra_list),
+	.cra_u		=	{ .digest = {
+	.dia_digestsize	=	SHA1_DIGEST_SIZE,
+	.dia_init   	= 	sha1_init,
+	.dia_update 	=	sha1_update,
+	.dia_final  	=	sha1_final } }
+};
+
+static int
+init(void)
+{
+	int ret = -ENOSYS;
+
+	if (crypt_s390_func_available(KIMD_SHA_1)){
+		ret = crypto_register_alg(&alg);
+		if (ret == 0){
+			printk(KERN_INFO "crypt_s390: sha1_s390 loaded.\n");
+		}
+	}
+	return ret;
+}
+
+static void __exit
+fini(void)
+{
+	crypto_unregister_alg(&alg);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_ALIAS("sha1");
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm");
diff --git a/arch/s390/crypto/sha1_z990.c b/arch/s390/crypto/sha1_z990.c
deleted file mode 100644
index 298174d..0000000
--- a/arch/s390/crypto/sha1_z990.c
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Cryptographic API.
- *
- * z990 implementation of the SHA1 Secure Hash Algorithm.
- *
- * Derived from cryptoapi implementation, adapted for in-place
- * scatterlist interface.  Originally based on the public domain
- * implementation written by Steve Reid.
- *
- * s390 Version:
- *   Copyright (C) 2003 IBM Deutschland GmbH, IBM Corporation
- *   Author(s): Thomas Spatzier (tspat@de.ibm.com)
- *
- * Derived from "crypto/sha1.c"
- *   Copyright (c) Alan Smithee.
- *   Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- *   Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- */
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/crypto.h>
-#include <asm/scatterlist.h>
-#include <asm/byteorder.h>
-#include "crypt_z990.h"
-
-#define SHA1_DIGEST_SIZE	20
-#define SHA1_BLOCK_SIZE		64
-
-struct crypt_z990_sha1_ctx {
-        u64 count;
-        u32 state[5];
-	u32 buf_len;
-        u8 buffer[2 * SHA1_BLOCK_SIZE];
-};
-
-static void
-sha1_init(void *ctx)
-{
-	static const struct crypt_z990_sha1_ctx initstate = {
-		.state = {
-			0x67452301,
-			0xEFCDAB89,
-			0x98BADCFE,
-			0x10325476,
-			0xC3D2E1F0
-		},
-	};
-	memcpy(ctx, &initstate, sizeof(initstate));
-}
-
-static void
-sha1_update(void *ctx, const u8 *data, unsigned int len)
-{
-	struct crypt_z990_sha1_ctx *sctx;
-	long imd_len;
-
-	sctx = ctx;
-	sctx->count += len * 8; //message bit length
-
-	//anything in buffer yet? -> must be completed
-	if (sctx->buf_len && (sctx->buf_len + len) >= SHA1_BLOCK_SIZE) {
-		//complete full block and hash
-		memcpy(sctx->buffer + sctx->buf_len, data,
-				SHA1_BLOCK_SIZE - sctx->buf_len);
-		crypt_z990_kimd(KIMD_SHA_1, sctx->state, sctx->buffer,
-				SHA1_BLOCK_SIZE);
-		data += SHA1_BLOCK_SIZE - sctx->buf_len;
-		len -= SHA1_BLOCK_SIZE - sctx->buf_len;
-		sctx->buf_len = 0;
-	}
-
-	//rest of data contains full blocks?
-	imd_len = len & ~0x3ful;
-	if (imd_len){
-		crypt_z990_kimd(KIMD_SHA_1, sctx->state, data, imd_len);
-		data += imd_len;
-		len -= imd_len;
-	}
-	//anything left? store in buffer
-	if (len){
-		memcpy(sctx->buffer + sctx->buf_len , data, len);
-		sctx->buf_len += len;
-	}
-}
-
-
-static void
-pad_message(struct crypt_z990_sha1_ctx* sctx)
-{
-	int index;
-
-	index = sctx->buf_len;
-	sctx->buf_len = (sctx->buf_len < 56)?
-		SHA1_BLOCK_SIZE:2 * SHA1_BLOCK_SIZE;
-	//start pad with 1
-	sctx->buffer[index] = 0x80;
-	//pad with zeros
-	index++;
-	memset(sctx->buffer + index, 0x00, sctx->buf_len - index);
-	//append length
-	memcpy(sctx->buffer + sctx->buf_len - 8, &sctx->count,
-			sizeof sctx->count);
-}
-
-/* Add padding and return the message digest. */
-static void
-sha1_final(void* ctx, u8 *out)
-{
-	struct crypt_z990_sha1_ctx *sctx = ctx;
-
-	//must perform manual padding
-	pad_message(sctx);
-	crypt_z990_kimd(KIMD_SHA_1, sctx->state, sctx->buffer, sctx->buf_len);
-	//copy digest to out
-	memcpy(out, sctx->state, SHA1_DIGEST_SIZE);
-	/* Wipe context */
-	memset(sctx, 0, sizeof *sctx);
-}
-
-static struct crypto_alg alg = {
-	.cra_name	=	"sha1",
-	.cra_flags	=	CRYPTO_ALG_TYPE_DIGEST,
-	.cra_blocksize	=	SHA1_BLOCK_SIZE,
-	.cra_ctxsize	=	sizeof(struct crypt_z990_sha1_ctx),
-	.cra_module	=	THIS_MODULE,
-	.cra_list       =       LIST_HEAD_INIT(alg.cra_list),
-	.cra_u		=	{ .digest = {
-	.dia_digestsize	=	SHA1_DIGEST_SIZE,
-	.dia_init   	= 	sha1_init,
-	.dia_update 	=	sha1_update,
-	.dia_final  	=	sha1_final } }
-};
-
-static int
-init(void)
-{
-	int ret = -ENOSYS;
-
-	if (crypt_z990_func_available(KIMD_SHA_1)){
-		ret = crypto_register_alg(&alg);
-		if (ret == 0){
-			printk(KERN_INFO "crypt_z990: sha1_z990 loaded.\n");
-		}
-	}
-	return ret;
-}
-
-static void __exit
-fini(void)
-{
-	crypto_unregister_alg(&alg);
-}
-
-module_init(init);
-module_exit(fini);
-
-MODULE_ALIAS("sha1");
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm");
diff --git a/arch/s390/defconfig b/arch/s390/defconfig
index 0c495fe..0cb2995 100644
--- a/arch/s390/defconfig
+++ b/arch/s390/defconfig
@@ -632,13 +632,13 @@ CONFIG_CRYPTO=y
 # CONFIG_CRYPTO_MD4 is not set
 # CONFIG_CRYPTO_MD5 is not set
 # CONFIG_CRYPTO_SHA1 is not set
-# CONFIG_CRYPTO_SHA1_Z990 is not set
+# CONFIG_CRYPTO_SHA1_S390 is not set
 # CONFIG_CRYPTO_SHA256 is not set
 # CONFIG_CRYPTO_SHA512 is not set
 # CONFIG_CRYPTO_WP512 is not set
 # CONFIG_CRYPTO_TGR192 is not set
 # CONFIG_CRYPTO_DES is not set
-# CONFIG_CRYPTO_DES_Z990 is not set
+# CONFIG_CRYPTO_DES_S390 is not set
 # CONFIG_CRYPTO_BLOWFISH is not set
 # CONFIG_CRYPTO_TWOFISH is not set
 # CONFIG_CRYPTO_SERPENT is not set
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 89299f4..85af571 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -40,8 +40,8 @@ config CRYPTO_SHA1
 	help
 	  SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2).
 
-config CRYPTO_SHA1_Z990
-	tristate "SHA1 digest algorithm for IBM zSeries z990"
+config CRYPTO_SHA1_S390
+	tristate "SHA1 digest algorithm (s390)"
 	depends on CRYPTO && ARCH_S390
 	help
 	  SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2).
@@ -98,8 +98,8 @@ config CRYPTO_DES
 	help
 	  DES cipher algorithm (FIPS 46-2), and Triple DES EDE (FIPS 46-3).
 
-config CRYPTO_DES_Z990
-	tristate "DES and Triple DES cipher algorithms for IBM zSeries z990"
+config CRYPTO_DES_S390
+	tristate "DES and Triple DES cipher algorithms (s390)"
 	depends on CRYPTO && ARCH_S390
 	help
 	  DES cipher algorithm (FIPS 46-2), and Triple DES EDE (FIPS 46-3).
-- 
cgit v1.1


From 0a497c17fee428604e06320272ff74415eacdc31 Mon Sep 17 00:00:00 2001
From: Jan Glauber <jan.glauber@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:18 -0800
Subject: [PATCH] s390: sha256 support

Add support for the hardware accelerated sha256 crypto algorithm.

Signed-off-by: Jan Glauber <jan.glauber@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/crypto/Makefile           |   1 +
 arch/s390/crypto/crypt_s390.h       |   2 +
 arch/s390/crypto/crypt_s390_query.c |   6 +-
 arch/s390/crypto/sha256_s390.c      | 151 ++++++++++++++++++++++++++++++++++++
 arch/s390/defconfig                 |   1 +
 crypto/Kconfig                      |  11 +++
 6 files changed, 171 insertions(+), 1 deletion(-)
 create mode 100644 arch/s390/crypto/sha256_s390.c

diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile
index 50843f8..3fccf61 100644
--- a/arch/s390/crypto/Makefile
+++ b/arch/s390/crypto/Makefile
@@ -3,6 +3,7 @@
 #
 
 obj-$(CONFIG_CRYPTO_SHA1_S390) += sha1_s390.o
+obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256_s390.o
 obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o des_check_key.o
 
 obj-$(CONFIG_CRYPTO_TEST) += crypt_s390_query.o
diff --git a/arch/s390/crypto/crypt_s390.h b/arch/s390/crypto/crypt_s390.h
index 4d24f66..b70a410 100644
--- a/arch/s390/crypto/crypt_s390.h
+++ b/arch/s390/crypto/crypt_s390.h
@@ -61,6 +61,7 @@ enum crypt_s390_kmc_func {
 enum crypt_s390_kimd_func {
 	KIMD_QUERY   = CRYPT_S390_KIMD | 0,
 	KIMD_SHA_1   = CRYPT_S390_KIMD | 1,
+	KIMD_SHA_256 = CRYPT_S390_KIMD | 2,
 };
 
 /* function codes for KLMD (COMPUTE LAST MESSAGE DIGEST)
@@ -69,6 +70,7 @@ enum crypt_s390_kimd_func {
 enum crypt_s390_klmd_func {
 	KLMD_QUERY   = CRYPT_S390_KLMD | 0,
 	KLMD_SHA_1   = CRYPT_S390_KLMD | 1,
+	KLMD_SHA_256 = CRYPT_S390_KLMD | 2,
 };
 
 /* function codes for KMAC (COMPUTE MESSAGE AUTHENTICATION CODE)
diff --git a/arch/s390/crypto/crypt_s390_query.c b/arch/s390/crypto/crypt_s390_query.c
index 0fa6bdf..67081b8 100644
--- a/arch/s390/crypto/crypt_s390_query.c
+++ b/arch/s390/crypto/crypt_s390_query.c
@@ -48,16 +48,20 @@ static void query_available_functions(void)
 		crypt_s390_func_available(KIMD_QUERY));
 	printk(KERN_INFO "KIMD_SHA_1: %d\n",
 		crypt_s390_func_available(KIMD_SHA_1));
+	printk(KERN_INFO "KIMD_SHA_256: %d\n",
+		crypt_s390_func_available(KIMD_SHA_256));
 
 	/* query available KLMD functions */
 	printk(KERN_INFO "KLMD_QUERY: %d\n",
 		crypt_s390_func_available(KLMD_QUERY));
 	printk(KERN_INFO "KLMD_SHA_1: %d\n",
 		crypt_s390_func_available(KLMD_SHA_1));
+	printk(KERN_INFO "KLMD_SHA_256: %d\n",
+		crypt_s390_func_available(KLMD_SHA_256));
 
 	/* query available KMAC functions */
 	printk(KERN_INFO "KMAC_QUERY: %d\n",
-		crypt_s3990_func_available(KMAC_QUERY));
+		crypt_s390_func_available(KMAC_QUERY));
 	printk(KERN_INFO "KMAC_DEA: %d\n",
 		crypt_s390_func_available(KMAC_DEA));
 	printk(KERN_INFO "KMAC_TDEA_128: %d\n",
diff --git a/arch/s390/crypto/sha256_s390.c b/arch/s390/crypto/sha256_s390.c
new file mode 100644
index 0000000..b75bdbd
--- /dev/null
+++ b/arch/s390/crypto/sha256_s390.c
@@ -0,0 +1,151 @@
+/*
+ * Cryptographic API.
+ *
+ * s390 implementation of the SHA256 Secure Hash Algorithm.
+ *
+ * s390 Version:
+ *   Copyright (C) 2005 IBM Deutschland GmbH, IBM Corporation
+ *   Author(s): Jan Glauber (jang@de.ibm.com)
+ *
+ * Derived from "crypto/sha256.c"
+ * and "arch/s390/crypto/sha1_s390.c"
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/crypto.h>
+
+#include "crypt_s390.h"
+
+#define SHA256_DIGEST_SIZE	32
+#define SHA256_BLOCK_SIZE	64
+
+struct s390_sha256_ctx {
+	u64 count;
+	u32 state[8];
+	u8 buf[2 * SHA256_BLOCK_SIZE];
+};
+
+static void sha256_init(void *ctx)
+{
+	struct s390_sha256_ctx *sctx = ctx;
+
+	sctx->state[0] = 0x6a09e667;
+	sctx->state[1] = 0xbb67ae85;
+	sctx->state[2] = 0x3c6ef372;
+	sctx->state[3] = 0xa54ff53a;
+	sctx->state[4] = 0x510e527f;
+	sctx->state[5] = 0x9b05688c;
+	sctx->state[6] = 0x1f83d9ab;
+	sctx->state[7] = 0x5be0cd19;
+	sctx->count = 0;
+	memset(sctx->buf, 0, sizeof(sctx->buf));
+}
+
+static void sha256_update(void *ctx, const u8 *data, unsigned int len)
+{
+	struct s390_sha256_ctx *sctx = ctx;
+	unsigned int index;
+
+	/* how much is already in the buffer? (count is kept in bits) */
+	index = sctx->count / 8 & 0x3f;
+
+	/* update message bit length; widen first so len * 8 cannot wrap */
+	sctx->count += (u64)len * 8;
+
+	/* hash every complete block; the buffer restarts at 0 afterwards */
+	while ((index + len) >= SHA256_BLOCK_SIZE) {
+		memcpy(sctx->buf + index, data, SHA256_BLOCK_SIZE - index);
+		crypt_s390_kimd(KIMD_SHA_256, sctx->state, sctx->buf,
+				SHA256_BLOCK_SIZE);
+		data += SHA256_BLOCK_SIZE - index;
+		len -= SHA256_BLOCK_SIZE - index;
+		index = 0;
+	}
+	/* keep the tail (now less than one block) for the next call */
+	if (len)
+		memcpy(sctx->buf + index, data, len);
+}
+
+static void pad_message(struct s390_sha256_ctx* sctx)
+{
+	int index, end;
+
+	index = sctx->count / 8 & 0x3f;
+	end = index < 56 ? SHA256_BLOCK_SIZE : 2 * SHA256_BLOCK_SIZE;
+
+	/* start pad with 1 */
+	sctx->buf[index] = 0x80;
+
+	/* pad with zeros */
+	index++;
+	memset(sctx->buf + index, 0x00, end - index - 8);
+
+	/* append message length */
+	memcpy(sctx->buf + end - 8, &sctx->count, sizeof sctx->count);
+
+	sctx->count = end * 8;
+}
+
+/* Add padding and return the message digest */
+static void sha256_final(void* ctx, u8 *out)
+{
+	struct s390_sha256_ctx *sctx = ctx;
+
+	/* must perform manual padding */
+	pad_message(sctx);
+
+	crypt_s390_kimd(KIMD_SHA_256, sctx->state, sctx->buf,
+			sctx->count / 8);
+
+	/* copy digest to out */
+	memcpy(out, sctx->state, SHA256_DIGEST_SIZE);
+
+	/* wipe context */
+	memset(sctx, 0, sizeof *sctx);
+}
+
+static struct crypto_alg alg = {
+	.cra_name	=	"sha256",
+	.cra_flags	=	CRYPTO_ALG_TYPE_DIGEST,
+	.cra_blocksize	=	SHA256_BLOCK_SIZE,
+	.cra_ctxsize	=	sizeof(struct s390_sha256_ctx),
+	.cra_module	=	THIS_MODULE,
+	.cra_list	=	LIST_HEAD_INIT(alg.cra_list),
+	.cra_u		=	{ .digest = {
+	.dia_digestsize	=	SHA256_DIGEST_SIZE,
+	.dia_init   	= 	sha256_init,
+	.dia_update 	=	sha256_update,
+	.dia_final  	=	sha256_final } }
+};
+
+static int __init init(void)
+{
+	int ret;
+
+	if (!crypt_s390_func_available(KIMD_SHA_256))
+		return -ENOSYS;	/* KIMD SHA-256 function not reported */
+	/* register with the crypto API; a failure is logged and returned */
+	ret = crypto_register_alg(&alg);
+	if (ret != 0)
+		printk(KERN_INFO "crypt_s390: sha256_s390 couldn't be loaded.\n");
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_alg(&alg);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_ALIAS("sha256");
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm");
diff --git a/arch/s390/defconfig b/arch/s390/defconfig
index 0cb2995..fd00a9f 100644
--- a/arch/s390/defconfig
+++ b/arch/s390/defconfig
@@ -634,6 +634,7 @@ CONFIG_CRYPTO=y
 # CONFIG_CRYPTO_SHA1 is not set
 # CONFIG_CRYPTO_SHA1_S390 is not set
 # CONFIG_CRYPTO_SHA256 is not set
+# CONFIG_CRYPTO_SHA256_S390 is not set
 # CONFIG_CRYPTO_SHA512 is not set
 # CONFIG_CRYPTO_WP512 is not set
 # CONFIG_CRYPTO_TGR192 is not set
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 85af571..9fdab74 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -44,6 +44,7 @@ config CRYPTO_SHA1_S390
 	tristate "SHA1 digest algorithm (s390)"
 	depends on CRYPTO && ARCH_S390
 	help
+	  This is the s390 hardware accelerated implementation of the
 	  SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2).
 
 config CRYPTO_SHA256
@@ -55,6 +56,16 @@ config CRYPTO_SHA256
 	  This version of SHA implements a 256 bit hash with 128 bits of
 	  security against collision attacks.
 
+config CRYPTO_SHA256_S390
+	tristate "SHA256 digest algorithm (s390)"
+	depends on CRYPTO && ARCH_S390
+	help
+	  This is the s390 hardware accelerated implementation of the
+	  SHA256 secure hash standard (DFIPS 180-2).
+
+	  This version of SHA implements a 256 bit hash with 128 bits of
+	  security against collision attacks.
+
 config CRYPTO_SHA512
 	tristate "SHA384 and SHA512 digest algorithms"
 	depends on CRYPTO
-- 
cgit v1.1


From bf754ae8ef8bc443c067601d9401103e4001e7c5 Mon Sep 17 00:00:00 2001
From: Jan Glauber <jan.glauber@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:18 -0800
Subject: [PATCH] s390: aes support

Add support for the hardware accelerated AES crypto algorithm.

Signed-off-by: Jan Glauber <jan.glauber@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/crypto/Makefile           |   1 +
 arch/s390/crypto/aes_s390.c         | 248 ++++++++++++++++++++++++++++++++++++
 arch/s390/crypto/crypt_s390.h       |  40 ++++--
 arch/s390/crypto/crypt_s390_query.c |  12 ++
 arch/s390/defconfig                 |   1 +
 crypto/Kconfig                      |  20 +++
 6 files changed, 308 insertions(+), 14 deletions(-)
 create mode 100644 arch/s390/crypto/aes_s390.c

diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile
index 3fccf61..bfe2541 100644
--- a/arch/s390/crypto/Makefile
+++ b/arch/s390/crypto/Makefile
@@ -5,5 +5,6 @@
 obj-$(CONFIG_CRYPTO_SHA1_S390) += sha1_s390.o
 obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256_s390.o
 obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o des_check_key.o
+obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o
 
 obj-$(CONFIG_CRYPTO_TEST) += crypt_s390_query.o
diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c
new file mode 100644
index 0000000..7a1033d
--- /dev/null
+++ b/arch/s390/crypto/aes_s390.c
@@ -0,0 +1,248 @@
+/*
+ * Cryptographic API.
+ *
+ * s390 implementation of the AES Cipher Algorithm.
+ *
+ * s390 Version:
+ *   Copyright (C) 2005 IBM Deutschland GmbH, IBM Corporation
+ *   Author(s): Jan Glauber (jang@de.ibm.com)
+ *
+ * Derived from "crypto/aes.c"
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/crypto.h>
+#include "crypt_s390.h"
+
+#define AES_MIN_KEY_SIZE	16
+#define AES_MAX_KEY_SIZE	32
+
+/* data block size for all key lengths */
+#define AES_BLOCK_SIZE		16
+
+static int has_aes_128;	/* set by aes_init() if KM_AES_128_ENCRYPT is available */
+static int has_aes_192;	/* set by aes_init() if KM_AES_192_ENCRYPT is available */
+static int has_aes_256;	/* set by aes_init() if KM_AES_256_ENCRYPT is available */
+
+struct s390_aes_ctx {
+	u8 iv[AES_BLOCK_SIZE];
+	u8 key[AES_MAX_KEY_SIZE];
+	int key_len;
+};
+
+static int aes_set_key(void *ctx, const u8 *in_key, unsigned int key_len,
+		       u32 *flags)
+{
+	struct s390_aes_ctx *sctx = ctx;
+
+	switch (key_len) {
+	case 16:
+		if (!has_aes_128)
+			goto fail;
+		break;
+	case 24:
+		if (!has_aes_192)
+			goto fail;
+
+		break;
+	case 32:
+		if (!has_aes_256)
+			goto fail;
+		break;
+	default:
+		/* invalid key length */
+		goto fail;
+		break;
+	}
+
+	sctx->key_len = key_len;
+	memcpy(sctx->key, in_key, key_len);
+	return 0;
+fail:
+	*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+	return -EINVAL;
+}
+
+static void aes_encrypt(void *ctx, u8 *out, const u8 *in)
+{
+	const struct s390_aes_ctx *sctx = ctx;
+
+	switch (sctx->key_len) {
+	case 16:
+		crypt_s390_km(KM_AES_128_ENCRYPT, &sctx->key, out, in,
+			      AES_BLOCK_SIZE);
+		break;
+	case 24:
+		crypt_s390_km(KM_AES_192_ENCRYPT, &sctx->key, out, in,
+			      AES_BLOCK_SIZE);
+		break;
+	case 32:
+		crypt_s390_km(KM_AES_256_ENCRYPT, &sctx->key, out, in,
+			      AES_BLOCK_SIZE);
+		break;
+	}
+}
+
+static void aes_decrypt(void *ctx, u8 *out, const u8 *in)
+{
+	const struct s390_aes_ctx *sctx = ctx;
+
+	switch (sctx->key_len) {
+	case 16:
+		crypt_s390_km(KM_AES_128_DECRYPT, &sctx->key, out, in,
+			      AES_BLOCK_SIZE);
+		break;
+	case 24:
+		crypt_s390_km(KM_AES_192_DECRYPT, &sctx->key, out, in,
+			      AES_BLOCK_SIZE);
+		break;
+	case 32:
+		crypt_s390_km(KM_AES_256_DECRYPT, &sctx->key, out, in,
+			      AES_BLOCK_SIZE);
+		break;
+	}
+}
+
+static unsigned int aes_encrypt_ecb(const struct cipher_desc *desc, u8 *out,
+				    const u8 *in, unsigned int nbytes)
+{
+	struct s390_aes_ctx *sctx = crypto_tfm_ctx(desc->tfm);
+
+	switch (sctx->key_len) {
+	case 16:
+		crypt_s390_km(KM_AES_128_ENCRYPT, &sctx->key, out, in, nbytes);
+		break;
+	case 24:
+		crypt_s390_km(KM_AES_192_ENCRYPT, &sctx->key, out, in, nbytes);
+		break;
+	case 32:
+		crypt_s390_km(KM_AES_256_ENCRYPT, &sctx->key, out, in, nbytes);
+		break;
+	}
+	return nbytes & ~(AES_BLOCK_SIZE - 1);
+}
+
+static unsigned int aes_decrypt_ecb(const struct cipher_desc *desc, u8 *out,
+				    const u8 *in, unsigned int nbytes)
+{
+	struct s390_aes_ctx *sctx = crypto_tfm_ctx(desc->tfm);
+
+	switch (sctx->key_len) {
+	case 16:
+		crypt_s390_km(KM_AES_128_DECRYPT, &sctx->key, out, in, nbytes);
+		break;
+	case 24:
+		crypt_s390_km(KM_AES_192_DECRYPT, &sctx->key, out, in, nbytes);
+		break;
+	case 32:
+		crypt_s390_km(KM_AES_256_DECRYPT, &sctx->key, out, in, nbytes);
+		break;
+	}
+	return nbytes & ~(AES_BLOCK_SIZE - 1);
+}
+
+static unsigned int aes_encrypt_cbc(const struct cipher_desc *desc, u8 *out,
+				    const u8 *in, unsigned int nbytes)
+{
+	struct s390_aes_ctx *sctx = crypto_tfm_ctx(desc->tfm);
+
+	memcpy(&sctx->iv, desc->info, AES_BLOCK_SIZE);
+	switch (sctx->key_len) {
+	case 16:
+		crypt_s390_kmc(KMC_AES_128_ENCRYPT, &sctx->iv, out, in, nbytes);
+		break;
+	case 24:
+		crypt_s390_kmc(KMC_AES_192_ENCRYPT, &sctx->iv, out, in, nbytes);
+		break;
+	case 32:
+		crypt_s390_kmc(KMC_AES_256_ENCRYPT, &sctx->iv, out, in, nbytes);
+		break;
+	}
+	memcpy(desc->info, &sctx->iv, AES_BLOCK_SIZE);
+
+	return nbytes & ~(AES_BLOCK_SIZE - 1);
+}
+
+static unsigned int aes_decrypt_cbc(const struct cipher_desc *desc, u8 *out,
+				    const u8 *in, unsigned int nbytes)
+{
+	struct s390_aes_ctx *sctx = crypto_tfm_ctx(desc->tfm);
+
+	memcpy(&sctx->iv, desc->info, AES_BLOCK_SIZE);
+	switch (sctx->key_len) {
+	case 16:
+		crypt_s390_kmc(KMC_AES_128_DECRYPT, &sctx->iv, out, in, nbytes);
+		break;
+	case 24:
+		crypt_s390_kmc(KMC_AES_192_DECRYPT, &sctx->iv, out, in, nbytes);
+		break;
+	case 32:
+		crypt_s390_kmc(KMC_AES_256_DECRYPT, &sctx->iv, out, in, nbytes);
+		break;
+	}
+	return nbytes & ~(AES_BLOCK_SIZE - 1);
+}
+
+
+static struct crypto_alg aes_alg = {
+	.cra_name		=	"aes",
+	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		=	AES_BLOCK_SIZE,
+	.cra_ctxsize		=	sizeof(struct s390_aes_ctx),
+	.cra_module		=	THIS_MODULE,
+	.cra_list		=	LIST_HEAD_INIT(aes_alg.cra_list),
+	.cra_u			=	{
+		.cipher = {
+			.cia_min_keysize	=	AES_MIN_KEY_SIZE,
+			.cia_max_keysize	=	AES_MAX_KEY_SIZE,
+			.cia_setkey		=	aes_set_key,
+			.cia_encrypt		=	aes_encrypt,
+			.cia_decrypt		=	aes_decrypt,
+			.cia_encrypt_ecb	=	aes_encrypt_ecb,
+			.cia_decrypt_ecb	=	aes_decrypt_ecb,
+			.cia_encrypt_cbc	=	aes_encrypt_cbc,
+			.cia_decrypt_cbc	=	aes_decrypt_cbc,
+		}
+	}
+};
+
+static int __init aes_init(void)
+{
+	int ret;
+
+	if (crypt_s390_func_available(KM_AES_128_ENCRYPT))
+		has_aes_128 = 1;
+	if (crypt_s390_func_available(KM_AES_192_ENCRYPT))
+		has_aes_192 = 1;
+	if (crypt_s390_func_available(KM_AES_256_ENCRYPT))
+		has_aes_256 = 1;
+
+	if (!has_aes_128 && !has_aes_192 && !has_aes_256)
+		return -ENOSYS;
+
+	ret = crypto_register_alg(&aes_alg);
+	if (ret != 0)
+		printk(KERN_INFO "crypt_s390: aes_s390 couldn't be loaded.\n");
+	return ret;
+}
+
+static void __exit aes_fini(void)
+{
+	crypto_unregister_alg(&aes_alg);
+}
+
+module_init(aes_init);
+module_exit(aes_fini);
+
+MODULE_ALIAS("aes");
+
+MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm");
+MODULE_LICENSE("GPL");
+
diff --git a/arch/s390/crypto/crypt_s390.h b/arch/s390/crypto/crypt_s390.h
index b70a410..d6712cf 100644
--- a/arch/s390/crypto/crypt_s390.h
+++ b/arch/s390/crypto/crypt_s390.h
@@ -33,26 +33,38 @@ enum crypt_s390_operations {
  * 0x80 is the decipher modifier bit
  */
 enum crypt_s390_km_func {
-	KM_QUERY            = CRYPT_S390_KM | 0,
-	KM_DEA_ENCRYPT      = CRYPT_S390_KM | 1,
-	KM_DEA_DECRYPT      = CRYPT_S390_KM | 1 | 0x80,
-	KM_TDEA_128_ENCRYPT = CRYPT_S390_KM | 2,
-	KM_TDEA_128_DECRYPT = CRYPT_S390_KM | 2 | 0x80,
-	KM_TDEA_192_ENCRYPT = CRYPT_S390_KM | 3,
-	KM_TDEA_192_DECRYPT = CRYPT_S390_KM | 3 | 0x80,
+	KM_QUERY	    = CRYPT_S390_KM | 0x0,
+	KM_DEA_ENCRYPT      = CRYPT_S390_KM | 0x1,
+	KM_DEA_DECRYPT      = CRYPT_S390_KM | 0x1 | 0x80,
+	KM_TDEA_128_ENCRYPT = CRYPT_S390_KM | 0x2,
+	KM_TDEA_128_DECRYPT = CRYPT_S390_KM | 0x2 | 0x80,
+	KM_TDEA_192_ENCRYPT = CRYPT_S390_KM | 0x3,
+	KM_TDEA_192_DECRYPT = CRYPT_S390_KM | 0x3 | 0x80,
+	KM_AES_128_ENCRYPT  = CRYPT_S390_KM | 0x12,
+	KM_AES_128_DECRYPT  = CRYPT_S390_KM | 0x12 | 0x80,
+	KM_AES_192_ENCRYPT  = CRYPT_S390_KM | 0x13,
+	KM_AES_192_DECRYPT  = CRYPT_S390_KM | 0x13 | 0x80,
+	KM_AES_256_ENCRYPT  = CRYPT_S390_KM | 0x14,
+	KM_AES_256_DECRYPT  = CRYPT_S390_KM | 0x14 | 0x80,
 };
 
 /* function codes for KMC (CIPHER MESSAGE WITH CHAINING)
  * instruction
  */
 enum crypt_s390_kmc_func {
-	KMC_QUERY            = CRYPT_S390_KMC | 0,
-	KMC_DEA_ENCRYPT      = CRYPT_S390_KMC | 1,
-	KMC_DEA_DECRYPT      = CRYPT_S390_KMC | 1 | 0x80,
-	KMC_TDEA_128_ENCRYPT = CRYPT_S390_KMC | 2,
-	KMC_TDEA_128_DECRYPT = CRYPT_S390_KMC | 2 | 0x80,
-	KMC_TDEA_192_ENCRYPT = CRYPT_S390_KMC | 3,
-	KMC_TDEA_192_DECRYPT = CRYPT_S390_KMC | 3 | 0x80,
+	KMC_QUERY            = CRYPT_S390_KMC | 0x0,
+	KMC_DEA_ENCRYPT      = CRYPT_S390_KMC | 0x1,
+	KMC_DEA_DECRYPT      = CRYPT_S390_KMC | 0x1 | 0x80,
+	KMC_TDEA_128_ENCRYPT = CRYPT_S390_KMC | 0x2,
+	KMC_TDEA_128_DECRYPT = CRYPT_S390_KMC | 0x2 | 0x80,
+	KMC_TDEA_192_ENCRYPT = CRYPT_S390_KMC | 0x3,
+	KMC_TDEA_192_DECRYPT = CRYPT_S390_KMC | 0x3 | 0x80,
+	KMC_AES_128_ENCRYPT  = CRYPT_S390_KMC | 0x12,
+	KMC_AES_128_DECRYPT  = CRYPT_S390_KMC | 0x12 | 0x80,
+	KMC_AES_192_ENCRYPT  = CRYPT_S390_KMC | 0x13,
+	KMC_AES_192_DECRYPT  = CRYPT_S390_KMC | 0x13 | 0x80,
+	KMC_AES_256_ENCRYPT  = CRYPT_S390_KMC | 0x14,
+	KMC_AES_256_DECRYPT  = CRYPT_S390_KMC | 0x14 | 0x80,
 };
 
 /* function codes for KIMD (COMPUTE INTERMEDIATE MESSAGE DIGEST)
diff --git a/arch/s390/crypto/crypt_s390_query.c b/arch/s390/crypto/crypt_s390_query.c
index 67081b8..def02bd 100644
--- a/arch/s390/crypto/crypt_s390_query.c
+++ b/arch/s390/crypto/crypt_s390_query.c
@@ -32,6 +32,12 @@ static void query_available_functions(void)
 		crypt_s390_func_available(KM_TDEA_128_ENCRYPT));
 	printk(KERN_INFO "KM_TDEA_192: %d\n",
 		crypt_s390_func_available(KM_TDEA_192_ENCRYPT));
+	printk(KERN_INFO "KM_AES_128: %d\n",
+		crypt_s390_func_available(KM_AES_128_ENCRYPT));
+	printk(KERN_INFO "KM_AES_192: %d\n",
+		crypt_s390_func_available(KM_AES_192_ENCRYPT));
+	printk(KERN_INFO "KM_AES_256: %d\n",
+		crypt_s390_func_available(KM_AES_256_ENCRYPT));
 
 	/* query available KMC functions */
 	printk(KERN_INFO "KMC_QUERY: %d\n",
@@ -42,6 +48,12 @@ static void query_available_functions(void)
 		crypt_s390_func_available(KMC_TDEA_128_ENCRYPT));
 	printk(KERN_INFO "KMC_TDEA_192: %d\n",
 		crypt_s390_func_available(KMC_TDEA_192_ENCRYPT));
+	printk(KERN_INFO "KMC_AES_128: %d\n",
+		crypt_s390_func_available(KMC_AES_128_ENCRYPT));
+	printk(KERN_INFO "KMC_AES_192: %d\n",
+		crypt_s390_func_available(KMC_AES_192_ENCRYPT));
+	printk(KERN_INFO "KMC_AES_256: %d\n",
+		crypt_s390_func_available(KMC_AES_256_ENCRYPT));
 
 	/* query available KIMD fucntions */
 	printk(KERN_INFO "KIMD_QUERY: %d\n",
diff --git a/arch/s390/defconfig b/arch/s390/defconfig
index fd00a9f..f195c7e 100644
--- a/arch/s390/defconfig
+++ b/arch/s390/defconfig
@@ -644,6 +644,7 @@ CONFIG_CRYPTO=y
 # CONFIG_CRYPTO_TWOFISH is not set
 # CONFIG_CRYPTO_SERPENT is not set
 # CONFIG_CRYPTO_AES is not set
+# CONFIG_CRYPTO_AES_S390 is not set
 # CONFIG_CRYPTO_CAST5 is not set
 # CONFIG_CRYPTO_CAST6 is not set
 # CONFIG_CRYPTO_TEA is not set
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 9fdab74..c696f7a 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -215,6 +215,26 @@ config CRYPTO_AES_X86_64
 
 	  See <http://csrc.nist.gov/encryption/aes/> for more information.
 
+config CRYPTO_AES_S390
+	tristate "AES cipher algorithms (s390)"
+	depends on CRYPTO && ARCH_S390
+	help
+	  This is the s390 hardware accelerated implementation of the
+	  AES cipher algorithms (FIPS-197). AES uses the Rijndael
+	  algorithm.
+
+	  Rijndael appears to be consistently a very good performer in
+	  both hardware and software across a wide range of computing
+	  environments regardless of its use in feedback or non-feedback
+	  modes. Its key setup time is excellent, and its key agility is
+	  good. Rijndael's very low memory requirements make it very well
+	  suited for restricted-space environments, in which it also
+	  demonstrates excellent performance. Rijndael's operations are
+	  among the easiest to defend against power and timing attacks.
+
+	  On s390 the System z9-109 currently only supports the key size
+	  of 128 bit.
+
 config CRYPTO_CAST5
 	tristate "CAST5 (CAST-128) cipher algorithm"
 	depends on CRYPTO
-- 
cgit v1.1


From 05f29fcdb0c6c99484c8bea5e244fe2f4edc9337 Mon Sep 17 00:00:00 2001
From: Jan Glauber <jan.glauber@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:19 -0800
Subject: [PATCH] s390: in-kernel crypto test vectors

Add new test vectors to the AES test suite for AES CBC and AES with plaintext
larger than AES blocksize.

Signed-off-by: Jan Glauber <jan.glauber@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 crypto/tcrypt.c |  4 ++++
 crypto/tcrypt.h | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 53f4ee8..49e344f 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -805,6 +805,8 @@ static void do_test(void)
 		//AES
 		test_cipher ("aes", MODE_ECB, ENCRYPT, aes_enc_tv_template, AES_ENC_TEST_VECTORS);
 		test_cipher ("aes", MODE_ECB, DECRYPT, aes_dec_tv_template, AES_DEC_TEST_VECTORS);
+		test_cipher ("aes", MODE_CBC, ENCRYPT, aes_cbc_enc_tv_template, AES_CBC_ENC_TEST_VECTORS);
+		test_cipher ("aes", MODE_CBC, DECRYPT, aes_cbc_dec_tv_template, AES_CBC_DEC_TEST_VECTORS);
 
 		//CAST5
 		test_cipher ("cast5", MODE_ECB, ENCRYPT, cast5_enc_tv_template, CAST5_ENC_TEST_VECTORS);
@@ -910,6 +912,8 @@ static void do_test(void)
 	case 10:
 		test_cipher ("aes", MODE_ECB, ENCRYPT, aes_enc_tv_template, AES_ENC_TEST_VECTORS);
 		test_cipher ("aes", MODE_ECB, DECRYPT, aes_dec_tv_template, AES_DEC_TEST_VECTORS);
+		test_cipher ("aes", MODE_CBC, ENCRYPT, aes_cbc_enc_tv_template, AES_CBC_ENC_TEST_VECTORS);
+		test_cipher ("aes", MODE_CBC, DECRYPT, aes_cbc_dec_tv_template, AES_CBC_DEC_TEST_VECTORS);
 		break;
 
 	case 11:
diff --git a/crypto/tcrypt.h b/crypto/tcrypt.h
index 522ffd4..733d07e 100644
--- a/crypto/tcrypt.h
+++ b/crypto/tcrypt.h
@@ -1836,6 +1836,8 @@ static struct cipher_testvec cast6_dec_tv_template[] = {
  */
 #define AES_ENC_TEST_VECTORS 3
 #define AES_DEC_TEST_VECTORS 3
+#define AES_CBC_ENC_TEST_VECTORS 2
+#define AES_CBC_DEC_TEST_VECTORS 2
 
 static struct cipher_testvec aes_enc_tv_template[] = {
 	{ /* From FIPS-197 */
@@ -1911,6 +1913,68 @@ static struct cipher_testvec aes_dec_tv_template[] = {
 	},
 };
 
+static struct cipher_testvec aes_cbc_enc_tv_template[] = {
+	{ /* From RFC 3602 */
+		.key    = { 0x06, 0xa9, 0x21, 0x40, 0x36, 0xb8, 0xa1, 0x5b,
+			    0x51, 0x2e, 0x03, 0xd5, 0x34, 0x12, 0x00, 0x06 },
+		.klen   = 16,
+		.iv	= { 0x3d, 0xaf, 0xba, 0x42, 0x9d, 0x9e, 0xb4, 0x30,
+			    0xb4, 0x22, 0xda, 0x80, 0x2c, 0x9f, 0xac, 0x41 },
+		.input	= { "Single block msg" },
+		.ilen   = 16,
+		.result = { 0xe3, 0x53, 0x77, 0x9c, 0x10, 0x79, 0xae, 0xb8,
+			    0x27, 0x08, 0x94, 0x2d, 0xbe, 0x77, 0x18, 0x1a },
+		.rlen   = 16,
+	}, {
+		.key    = { 0xc2, 0x86, 0x69, 0x6d, 0x88, 0x7c, 0x9a, 0xa0,
+			    0x61, 0x1b, 0xbb, 0x3e, 0x20, 0x25, 0xa4, 0x5a },
+		.klen   = 16,
+		.iv     = { 0x56, 0x2e, 0x17, 0x99, 0x6d, 0x09, 0x3d, 0x28,
+			    0xdd, 0xb3, 0xba, 0x69, 0x5a, 0x2e, 0x6f, 0x58 },
+		.input  = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+			    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+			    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+			    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f },
+		.ilen   = 32,
+		.result = { 0xd2, 0x96, 0xcd, 0x94, 0xc2, 0xcc, 0xcf, 0x8a,
+			    0x3a, 0x86, 0x30, 0x28, 0xb5, 0xe1, 0xdc, 0x0a,
+			    0x75, 0x86, 0x60, 0x2d, 0x25, 0x3c, 0xff, 0xf9,
+			    0x1b, 0x82, 0x66, 0xbe, 0xa6, 0xd6, 0x1a, 0xb1 },
+		.rlen   = 32,
+	},
+};
+
+static struct cipher_testvec aes_cbc_dec_tv_template[] = {
+	{ /* From RFC 3602 */
+		.key    = { 0x06, 0xa9, 0x21, 0x40, 0x36, 0xb8, 0xa1, 0x5b,
+			    0x51, 0x2e, 0x03, 0xd5, 0x34, 0x12, 0x00, 0x06 },
+		.klen   = 16,
+		.iv     = { 0x3d, 0xaf, 0xba, 0x42, 0x9d, 0x9e, 0xb4, 0x30,
+			    0xb4, 0x22, 0xda, 0x80, 0x2c, 0x9f, 0xac, 0x41 },
+		.input  = { 0xe3, 0x53, 0x77, 0x9c, 0x10, 0x79, 0xae, 0xb8,
+			    0x27, 0x08, 0x94, 0x2d, 0xbe, 0x77, 0x18, 0x1a },
+		.ilen   = 16,
+		.result = { "Single block msg" },
+		.rlen   = 16,
+	}, {
+		.key    = { 0xc2, 0x86, 0x69, 0x6d, 0x88, 0x7c, 0x9a, 0xa0,
+			    0x61, 0x1b, 0xbb, 0x3e, 0x20, 0x25, 0xa4, 0x5a },
+		.klen   = 16,
+		.iv     = { 0x56, 0x2e, 0x17, 0x99, 0x6d, 0x09, 0x3d, 0x28,
+			    0xdd, 0xb3, 0xba, 0x69, 0x5a, 0x2e, 0x6f, 0x58 },
+		.input  = { 0xd2, 0x96, 0xcd, 0x94, 0xc2, 0xcc, 0xcf, 0x8a,
+			    0x3a, 0x86, 0x30, 0x28, 0xb5, 0xe1, 0xdc, 0x0a,
+			    0x75, 0x86, 0x60, 0x2d, 0x25, 0x3c, 0xff, 0xf9,
+			    0x1b, 0x82, 0x66, 0xbe, 0xa6, 0xd6, 0x1a, 0xb1 },
+		.ilen   = 32,
+		.result = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+			    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+			    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+			    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f },
+		.rlen   = 32,
+	},
+};
+
 /* Cast5 test vectors from RFC 2144 */
 #define CAST5_ENC_TEST_VECTORS	3
 #define CAST5_DEC_TEST_VECTORS	3
-- 
cgit v1.1


From 8129ee164267dc030b8e1d541ee3643c0b9f2fa1 Mon Sep 17 00:00:00 2001
From: Frank Pavlic <pavlic@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:20 -0800
Subject: [PATCH] s390: qdio V=V pass-through

New feature V=V qdio pass-through.

QDIO and HiperSockets processing in z/VM V=V guest environments (as well as
V=R with z/VM running in LPAR mode) requires shadowing of all QDIO
architecture queue elements.  In particular, shadowing the SBAL and SLSB
structures in the hypervisor, together with the need to issue SIGA SYNC
operations to observe state changes, ultimately causes significant CPU
processing overhead in the hypervisor.

The QDIO pass-through support for V=V guests avoids the shadowing of SBALs and
SLSBs.  This significantly reduces the hypervisor overhead for QDIO based I/O.

Signed-off-by: Frank Pavlic <pavlic@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/Kconfig       |   7 +-
 drivers/s390/cio/chsc.h |   4 +-
 drivers/s390/cio/qdio.c | 589 ++++++++++++++++++++++++++++++++++++++----------
 drivers/s390/cio/qdio.h | 104 ++++++---
 include/asm-s390/qdio.h |   8 +-
 5 files changed, 556 insertions(+), 156 deletions(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 477ac27..1846fbf 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -240,8 +240,8 @@ config MACHCHK_WARNING
 config QDIO
 	tristate "QDIO support"
 	---help---
-	  This driver provides the Queued Direct I/O base support for the
-	  IBM S/390 (G5 and G6) and eServer zSeries (z800, z890, z900 and z990).
+	  This driver provides the Queued Direct I/O base support for
+	  IBM mainframes.
 
 	  For details please refer to the documentation provided by IBM at
 	  <http://www10.software.ibm.com/developerworks/opensource/linux390>
@@ -263,7 +263,8 @@ config QDIO_DEBUG
 	bool "Extended debugging information"
 	depends on QDIO
 	help
-	  Say Y here to get extended debugging output in /proc/s390dbf/qdio...
+	  Say Y here to get extended debugging output in
+	    /sys/kernel/debug/s390dbf/qdio...
 	  Warning: this option reduces the performance of the QDIO module.
 
 	  If unsure, say N.
diff --git a/drivers/s390/cio/chsc.h b/drivers/s390/cio/chsc.h
index be20da4..6945013 100644
--- a/drivers/s390/cio/chsc.h
+++ b/drivers/s390/cio/chsc.h
@@ -43,7 +43,9 @@ struct css_general_char {
 	u32 ext_mb : 1;  /* bit 48 */
 	u32 : 7;
 	u32 aif_tdd : 1; /* bit 56 */
-	u32 : 10;
+	u32 : 1;
+	u32 qebsm : 1;   /* bit 58 */
+	u32 : 8;
 	u32 aif_osa : 1; /* bit 67 */
 	u32 : 28;
 }__attribute__((packed));
diff --git a/drivers/s390/cio/qdio.c b/drivers/s390/cio/qdio.c
index eb39218..e8bdfcd 100644
--- a/drivers/s390/cio/qdio.c
+++ b/drivers/s390/cio/qdio.c
@@ -56,7 +56,7 @@
 #include "ioasm.h"
 #include "chsc.h"
 
-#define VERSION_QDIO_C "$Revision: 1.108 $"
+#define VERSION_QDIO_C "$Revision: 1.113 $"
 
 /****************** MODULE PARAMETER VARIABLES ********************/
 MODULE_AUTHOR("Utz Bacher <utz.bacher@de.ibm.com>");
@@ -76,6 +76,7 @@ static struct qdio_perf_stats perf_stats;
 #endif /* QDIO_PERFORMANCE_STATS */
 
 static int hydra_thinints;
+static int is_passthrough = 0;
 static int omit_svs;
 
 static int indicator_used[INDICATORS_PER_CACHELINE];
@@ -136,12 +137,126 @@ qdio_release_q(struct qdio_q *q)
 	atomic_dec(&q->use_count);
 }
 
-static volatile inline void 
-qdio_set_slsb(volatile char *slsb, unsigned char value)
+/*check ccq  */
+static inline int
+qdio_check_ccq(struct qdio_q *q, unsigned int ccq)
 {
-	xchg((char*)slsb,value);
+	char dbf_text[15];
+
+	if (ccq == 0 || ccq == 32 || ccq == 96)
+		return 0;
+	if (ccq == 97)
+		return 1;
+	/*notify devices immediately*/
+	sprintf(dbf_text,"%d", ccq);
+	QDIO_DBF_TEXT2(1,trace,dbf_text);
+	return -EIO;
 }
+/* EQBS: extract buffer states */
+static inline int
+qdio_do_eqbs(struct qdio_q *q, unsigned char *state,
+	     unsigned int *start, unsigned int *cnt)
+{
+	struct qdio_irq *irq;
+	unsigned int tmp_cnt, q_no, ccq;
+	int rc ;
+	char dbf_text[15];
 
+	ccq = 0;
+	tmp_cnt = *cnt;
+	irq = (struct qdio_irq*)q->irq_ptr;
+	q_no = q->q_no;
+	if(!q->is_input_q)
+		q_no += irq->no_input_qs;
+	ccq = do_eqbs(irq->sch_token, state, q_no, start, cnt);
+	rc = qdio_check_ccq(q, ccq);
+	if (rc < 0) {
+                QDIO_DBF_TEXT2(1,trace,"eqberr");
+                sprintf(dbf_text,"%2x,%2x,%d,%d",tmp_cnt, *cnt, ccq, q_no);
+                QDIO_DBF_TEXT2(1,trace,dbf_text);
+		q->handler(q->cdev,QDIO_STATUS_ACTIVATE_CHECK_CONDITION|
+				QDIO_STATUS_LOOK_FOR_ERROR,
+				0, 0, 0, -1, -1, q->int_parm);
+		return 0;
+	}
+	return (tmp_cnt - *cnt);
+}
+
+/* SQBS: set buffer states */
+static inline int
+qdio_do_sqbs(struct qdio_q *q, unsigned char state,
+	     unsigned int *start, unsigned int *cnt)
+{
+	struct qdio_irq *irq;
+	unsigned int tmp_cnt, q_no, ccq;
+	int rc;
+	char dbf_text[15];
+
+	ccq = 0;
+	tmp_cnt = *cnt;
+	irq = (struct qdio_irq*)q->irq_ptr;
+	q_no = q->q_no;
+	if(!q->is_input_q)
+		q_no += irq->no_input_qs;
+	ccq = do_sqbs(irq->sch_token, state, q_no, start, cnt);
+	rc = qdio_check_ccq(q, ccq);
+	if (rc < 0) {
+                QDIO_DBF_TEXT3(1,trace,"sqberr");
+                sprintf(dbf_text,"%2x,%2x,%d,%d",tmp_cnt,*cnt,ccq,q_no);
+                QDIO_DBF_TEXT3(1,trace,dbf_text);
+		q->handler(q->cdev,QDIO_STATUS_ACTIVATE_CHECK_CONDITION|
+				QDIO_STATUS_LOOK_FOR_ERROR,
+				0, 0, 0, -1, -1, q->int_parm);
+		return 0;
+	}
+	return (tmp_cnt - *cnt);
+}
+
+static inline int
+qdio_set_slsb(struct qdio_q *q, unsigned int *bufno,
+	      unsigned char state, unsigned int *count)
+{
+	volatile char *slsb;
+	struct qdio_irq *irq;
+
+	irq = (struct qdio_irq*)q->irq_ptr;
+	if (!irq->is_qebsm) {
+		slsb = (char *)&q->slsb.acc.val[(*bufno)];
+		xchg(slsb, state);
+		return 1;
+	}
+	return qdio_do_sqbs(q, state, bufno, count);
+}
+
+#ifdef CONFIG_QDIO_DEBUG
+static inline void
+qdio_trace_slsb(struct qdio_q *q)
+{
+	if (q->queue_type==QDIO_TRACE_QTYPE) {
+		if (q->is_input_q)
+			QDIO_DBF_HEX2(0,slsb_in,&q->slsb,
+				      QDIO_MAX_BUFFERS_PER_Q);
+		else
+			QDIO_DBF_HEX2(0,slsb_out,&q->slsb,
+				      QDIO_MAX_BUFFERS_PER_Q);
+	}
+}
+#endif
+
+static inline int
+set_slsb(struct qdio_q *q, unsigned int *bufno,
+	 unsigned char state, unsigned int *count)
+{
+	int rc;
+#ifdef CONFIG_QDIO_DEBUG
+	qdio_trace_slsb(q);
+#endif
+	rc = qdio_set_slsb(q, bufno, state, count);
+#ifdef CONFIG_QDIO_DEBUG
+	qdio_trace_slsb(q);
+#endif
+	return rc;
+}
 static inline int 
 qdio_siga_sync(struct qdio_q *q, unsigned int gpr2,
 	       unsigned int gpr3)
@@ -155,7 +270,7 @@ qdio_siga_sync(struct qdio_q *q, unsigned int gpr2,
 	perf_stats.siga_syncs++;
 #endif /* QDIO_PERFORMANCE_STATS */
 
-	cc = do_siga_sync(q->irq, gpr2, gpr3);
+	cc = do_siga_sync(0x10000|q->irq, gpr2, gpr3);
 	if (cc)
 		QDIO_DBF_HEX3(0,trace,&cc,sizeof(int*));
 
@@ -170,6 +285,19 @@ qdio_siga_sync_q(struct qdio_q *q)
 	return qdio_siga_sync(q, q->mask, 0);
 }
 
+static int
+__do_siga_output(struct qdio_q *q, unsigned int *busy_bit)
+{
+       struct qdio_irq *irq;
+       unsigned int fc = 0;
+
+       irq = (struct qdio_irq *) q->irq_ptr;
+       if (!irq->is_qebsm)
+               return do_siga_output(0x10000|q->irq, q->mask, busy_bit, fc);
+       fc |= 0x80;
+       return do_siga_output(irq->sch_token, q->mask, busy_bit, fc);
+}
+
 /* 
  * returns QDIO_SIGA_ERROR_ACCESS_EXCEPTION as cc, when SIGA returns
  * an access exception 
@@ -189,7 +317,7 @@ qdio_siga_output(struct qdio_q *q)
 	QDIO_DBF_HEX4(0,trace,&q,sizeof(void*));
 
 	for (;;) {
-		cc = do_siga_output(q->irq, q->mask, &busy_bit);
+		cc = __do_siga_output(q, &busy_bit);
 //QDIO_PRINT_ERR("cc=%x, busy=%x\n",cc,busy_bit);
 		if ((cc==2) && (busy_bit) && (q->is_iqdio_q)) {
 			if (!start_time) 
@@ -221,7 +349,7 @@ qdio_siga_input(struct qdio_q *q)
 	perf_stats.siga_ins++;
 #endif /* QDIO_PERFORMANCE_STATS */
 
-	cc = do_siga_input(q->irq, q->mask);
+	cc = do_siga_input(0x10000|q->irq, q->mask);
 	
 	if (cc)
 		QDIO_DBF_HEX3(0,trace,&cc,sizeof(int*));
@@ -230,7 +358,7 @@ qdio_siga_input(struct qdio_q *q)
 }
 
 /* locked by the locks in qdio_activate and qdio_cleanup */
-static __u32 volatile *
+static __u32 *
 qdio_get_indicator(void)
 {
 	int i;
@@ -258,7 +386,7 @@ qdio_put_indicator(__u32 *addr)
 		atomic_dec(&spare_indicator_usecount);
 }
 
-static inline volatile void 
+static inline void
 tiqdio_clear_summary_bit(__u32 *location)
 {
 	QDIO_DBF_TEXT5(0,trace,"clrsummb");
@@ -267,7 +395,7 @@ tiqdio_clear_summary_bit(__u32 *location)
 	xchg(location,0);
 }
 
-static inline volatile void
+static inline  void
 tiqdio_set_summary_bit(__u32 *location)
 {
 	QDIO_DBF_TEXT5(0,trace,"setsummb");
@@ -336,7 +464,9 @@ static inline int
 qdio_stop_polling(struct qdio_q *q)
 {
 #ifdef QDIO_USE_PROCESSING_STATE
-	int gsf;
+       unsigned int tmp, gsf, count = 1;
+       unsigned char state = 0;
+       struct qdio_irq *irq = (struct qdio_irq *) q->irq_ptr;
 
 	if (!atomic_swap(&q->polling,0)) 
 		return 1;
@@ -348,17 +478,22 @@ qdio_stop_polling(struct qdio_q *q)
 	if (!q->is_input_q)
 		return 1;
 
-	gsf=GET_SAVED_FRONTIER(q);
-	set_slsb(&q->slsb.acc.val[(gsf+QDIO_MAX_BUFFERS_PER_Q-1)&
-				  (QDIO_MAX_BUFFERS_PER_Q-1)],
-		 SLSB_P_INPUT_NOT_INIT);
+       tmp = gsf = GET_SAVED_FRONTIER(q);
+       tmp = ((tmp + QDIO_MAX_BUFFERS_PER_Q-1) & (QDIO_MAX_BUFFERS_PER_Q-1) );
+       set_slsb(q, &tmp, SLSB_P_INPUT_NOT_INIT, &count);
+
 	/* 
 	 * we don't issue this SYNC_MEMORY, as we trust Rick T and
 	 * moreover will not use the PROCESSING state under VM, so
 	 * q->polling was 0 anyway
 	 */
 	/*SYNC_MEMORY;*/
-	if (q->slsb.acc.val[gsf]!=SLSB_P_INPUT_PRIMED)
+       if (irq->is_qebsm) {
+               count = 1;
+               qdio_do_eqbs(q, &state, &gsf, &count);
+       } else
+               state = q->slsb.acc.val[gsf];
+       if (state != SLSB_P_INPUT_PRIMED)
 		return 1;
 	/* 
 	 * set our summary bit again, as otherwise there is a
@@ -431,18 +566,136 @@ tiqdio_clear_global_summary(void)
 
 
 /************************* OUTBOUND ROUTINES *******************************/
+static int
+qdio_qebsm_get_outbound_buffer_frontier(struct qdio_q *q)
+{
+        struct qdio_irq *irq;
+        unsigned char state;
+        unsigned int cnt, count, ftc;
+
+        irq = (struct qdio_irq *) q->irq_ptr;
+        if ((!q->is_iqdio_q) && (!q->hydra_gives_outbound_pcis))
+                SYNC_MEMORY;
+
+        ftc = q->first_to_check;
+        count = qdio_min(atomic_read(&q->number_of_buffers_used),
+                        (QDIO_MAX_BUFFERS_PER_Q-1));
+        if (count == 0)
+                return q->first_to_check;
+        cnt = qdio_do_eqbs(q, &state, &ftc, &count);
+        if (cnt == 0)
+                return q->first_to_check;
+        switch (state) {
+        case SLSB_P_OUTPUT_ERROR:
+                QDIO_DBF_TEXT3(0,trace,"outperr");
+                atomic_sub(cnt , &q->number_of_buffers_used);
+                if (q->qdio_error)
+                        q->error_status_flags |=
+                                QDIO_STATUS_MORE_THAN_ONE_QDIO_ERROR;
+                q->qdio_error = SLSB_P_OUTPUT_ERROR;
+                q->error_status_flags |= QDIO_STATUS_LOOK_FOR_ERROR;
+                q->first_to_check = ftc;
+                break;
+        case SLSB_P_OUTPUT_EMPTY:
+                QDIO_DBF_TEXT5(0,trace,"outpempt");
+                atomic_sub(cnt, &q->number_of_buffers_used);
+                q->first_to_check = ftc;
+                break;
+        case SLSB_CU_OUTPUT_PRIMED:
+                /* all buffers primed */
+                QDIO_DBF_TEXT5(0,trace,"outpprim");
+                break;
+        default:
+                break;
+        }
+        QDIO_DBF_HEX4(0,trace,&q->first_to_check,sizeof(int));
+        return q->first_to_check;
+}
+
+static int
+qdio_qebsm_get_inbound_buffer_frontier(struct qdio_q *q)
+{
+        struct qdio_irq *irq;
+        unsigned char state;
+        int tmp, ftc, count, cnt;
+        char dbf_text[15];
+
+
+        irq = (struct qdio_irq *) q->irq_ptr;
+        ftc = q->first_to_check;
+        count = qdio_min(atomic_read(&q->number_of_buffers_used),
+                        (QDIO_MAX_BUFFERS_PER_Q-1));
+        if (count == 0)
+                 return q->first_to_check;
+        cnt = qdio_do_eqbs(q, &state, &ftc, &count);
+        if (cnt == 0)
+                 return q->first_to_check;
+        switch (state) {
+        case SLSB_P_INPUT_ERROR :
+#ifdef CONFIG_QDIO_DEBUG
+                QDIO_DBF_TEXT3(1,trace,"inperr");
+                sprintf(dbf_text,"%2x,%2x",ftc,count);
+                QDIO_DBF_TEXT3(1,trace,dbf_text);
+#endif /* CONFIG_QDIO_DEBUG */
+                if (q->qdio_error)
+                        q->error_status_flags |=
+                                QDIO_STATUS_MORE_THAN_ONE_QDIO_ERROR;
+                q->qdio_error = SLSB_P_INPUT_ERROR;
+                q->error_status_flags |= QDIO_STATUS_LOOK_FOR_ERROR;
+                atomic_sub(cnt, &q->number_of_buffers_used);
+                q->first_to_check = ftc;
+                break;
+        case SLSB_P_INPUT_PRIMED :
+                QDIO_DBF_TEXT3(0,trace,"inptprim");
+                sprintf(dbf_text,"%2x,%2x",ftc,count);
+                QDIO_DBF_TEXT3(1,trace,dbf_text);
+                tmp = 0;
+                ftc = q->first_to_check;
+#ifdef QDIO_USE_PROCESSING_STATE
+		if (cnt > 1) {
+			cnt -= 1;
+			tmp = set_slsb(q, &ftc, SLSB_P_INPUT_NOT_INIT, &cnt);
+			if (!tmp)
+				break;
+		}
+		cnt = 1;
+		tmp += set_slsb(q, &ftc,
+			       SLSB_P_INPUT_PROCESSING, &cnt);
+		atomic_set(&q->polling, 1);
+#else
+                tmp = set_slsb(q, &ftc, SLSB_P_INPUT_NOT_INIT, &cnt);
+#endif
+                atomic_sub(tmp, &q->number_of_buffers_used);
+                q->first_to_check = ftc;
+                break;
+        case SLSB_CU_INPUT_EMPTY:
+        case SLSB_P_INPUT_NOT_INIT:
+        case SLSB_P_INPUT_PROCESSING:
+                QDIO_DBF_TEXT5(0,trace,"inpnipro");
+                break;
+        default:
+                break;
+        }
+        QDIO_DBF_HEX4(0,trace,&q->first_to_check,sizeof(int));
+        return q->first_to_check;
+}
 
 static inline int
 qdio_get_outbound_buffer_frontier(struct qdio_q *q)
 {
-	int f,f_mod_no;
-	volatile char *slsb;
-	int first_not_to_check;
+	struct qdio_irq *irq;
+        volatile char *slsb;
+        unsigned int count = 1;
+        int first_not_to_check, f, f_mod_no;
 	char dbf_text[15];
 
 	QDIO_DBF_TEXT4(0,trace,"getobfro");
 	QDIO_DBF_HEX4(0,trace,&q,sizeof(void*));
 
+	irq = (struct qdio_irq *) q->irq_ptr;
+	if (irq->is_qebsm)
+		return qdio_qebsm_get_outbound_buffer_frontier(q);
+
 	slsb=&q->slsb.acc.val[0];
 	f_mod_no=f=q->first_to_check;
 	/* 
@@ -484,7 +737,7 @@ check_next:
 		QDIO_DBF_HEX2(1,sbal,q->sbal[f_mod_no],256);
 
 		/* kind of process the buffer */
-		set_slsb(&q->slsb.acc.val[f_mod_no], SLSB_P_OUTPUT_NOT_INIT);
+		set_slsb(q, &f_mod_no, SLSB_P_OUTPUT_NOT_INIT, &count);
 
 		/* 
 		 * we increment the frontier, as this buffer
@@ -597,48 +850,48 @@ qdio_kick_outbound_q(struct qdio_q *q)
 
 	result=qdio_siga_output(q);
 
-		switch (result) {
-		case 0:
-			/* went smooth this time, reset timestamp */
+	switch (result) {
+	case 0:
+		/* went smooth this time, reset timestamp */
 #ifdef CONFIG_QDIO_DEBUG
-			QDIO_DBF_TEXT3(0,trace,"cc2reslv");
-			sprintf(dbf_text,"%4x%2x%2x",q->irq,q->q_no,
-				atomic_read(&q->busy_siga_counter));
-			QDIO_DBF_TEXT3(0,trace,dbf_text);
+		QDIO_DBF_TEXT3(0,trace,"cc2reslv");
+		sprintf(dbf_text,"%4x%2x%2x",q->irq,q->q_no,
+			atomic_read(&q->busy_siga_counter));
+		QDIO_DBF_TEXT3(0,trace,dbf_text);
 #endif /* CONFIG_QDIO_DEBUG */
-			q->timing.busy_start=0;
+		q->timing.busy_start=0;
+		break;
+	case (2|QDIO_SIGA_ERROR_B_BIT_SET):
+		/* cc=2 and busy bit: */
+		atomic_inc(&q->busy_siga_counter);
+
+		/* if the last siga was successful, save
+		 * timestamp here */
+		if (!q->timing.busy_start)
+			q->timing.busy_start=NOW;
+
+		/* if we're in time, don't touch error_status_flags
+		 * and siga_error */
+		if (NOW-q->timing.busy_start<QDIO_BUSY_BIT_GIVE_UP) {
+			qdio_mark_q(q);
 			break;
-		case (2|QDIO_SIGA_ERROR_B_BIT_SET):
-			/* cc=2 and busy bit: */
-			atomic_inc(&q->busy_siga_counter);
-
-			/* if the last siga was successful, save
-			 * timestamp here */
-			if (!q->timing.busy_start)
-				q->timing.busy_start=NOW;
-
-			/* if we're in time, don't touch error_status_flags
-			 * and siga_error */
-			if (NOW-q->timing.busy_start<QDIO_BUSY_BIT_GIVE_UP) {
-				qdio_mark_q(q);
-				break;
-			}
-			QDIO_DBF_TEXT2(0,trace,"cc2REPRT");
+		}
+		QDIO_DBF_TEXT2(0,trace,"cc2REPRT");
 #ifdef CONFIG_QDIO_DEBUG
-			sprintf(dbf_text,"%4x%2x%2x",q->irq,q->q_no,
-				atomic_read(&q->busy_siga_counter));
-			QDIO_DBF_TEXT3(0,trace,dbf_text);
+		sprintf(dbf_text,"%4x%2x%2x",q->irq,q->q_no,
+			atomic_read(&q->busy_siga_counter));
+		QDIO_DBF_TEXT3(0,trace,dbf_text);
 #endif /* CONFIG_QDIO_DEBUG */
-			/* else fallthrough and report error */
-		default:
-			/* for plain cc=1, 2 or 3: */
-			if (q->siga_error)
-				q->error_status_flags|=
-					QDIO_STATUS_MORE_THAN_ONE_SIGA_ERROR;
+		/* else fallthrough and report error */
+	default:
+		/* for plain cc=1, 2 or 3: */
+		if (q->siga_error)
 			q->error_status_flags|=
-				QDIO_STATUS_LOOK_FOR_ERROR;
-			q->siga_error=result;
-		}
+				QDIO_STATUS_MORE_THAN_ONE_SIGA_ERROR;
+		q->error_status_flags|=
+			QDIO_STATUS_LOOK_FOR_ERROR;
+		q->siga_error=result;
+	}
 }
 
 static inline void
@@ -743,8 +996,10 @@ qdio_outbound_processing(struct qdio_q *q)
 static inline int
 qdio_get_inbound_buffer_frontier(struct qdio_q *q)
 {
+	struct qdio_irq *irq;
 	int f,f_mod_no;
 	volatile char *slsb;
+	unsigned int count = 1;
 	int first_not_to_check;
 #ifdef CONFIG_QDIO_DEBUG
 	char dbf_text[15];
@@ -756,6 +1011,10 @@ qdio_get_inbound_buffer_frontier(struct qdio_q *q)
 	QDIO_DBF_TEXT4(0,trace,"getibfro");
 	QDIO_DBF_HEX4(0,trace,&q,sizeof(void*));
 
+	irq = (struct qdio_irq *) q->irq_ptr;
+	if (irq->is_qebsm)
+		return qdio_qebsm_get_inbound_buffer_frontier(q);
+
 	slsb=&q->slsb.acc.val[0];
 	f_mod_no=f=q->first_to_check;
 	/* 
@@ -792,19 +1051,19 @@ check_next:
 		 * kill VM in terms of CP overhead 
 		 */
 		if (q->siga_sync) {
-			set_slsb(&slsb[f_mod_no],SLSB_P_INPUT_NOT_INIT);
+			set_slsb(q, &f_mod_no, SLSB_P_INPUT_NOT_INIT, &count);
 		} else {
 			/* set the previous buffer to NOT_INIT. The current
 			 * buffer will be set to PROCESSING at the end of
 			 * this function to avoid further interrupts. */
 			if (last_position>=0)
-				set_slsb(&slsb[last_position],
-					 SLSB_P_INPUT_NOT_INIT);
+				set_slsb(q, &last_position,
+					 SLSB_P_INPUT_NOT_INIT, &count);
 			atomic_set(&q->polling,1);
 			last_position=f_mod_no;
 		}
 #else /* QDIO_USE_PROCESSING_STATE */
-		set_slsb(&slsb[f_mod_no],SLSB_P_INPUT_NOT_INIT);
+		set_slsb(q, &f_mod_no, SLSB_P_INPUT_NOT_INIT, &count);
 #endif /* QDIO_USE_PROCESSING_STATE */
 		/* 
 		 * not needed, as the inbound queue will be synced on the next
@@ -829,7 +1088,7 @@ check_next:
 		QDIO_DBF_HEX2(1,sbal,q->sbal[f_mod_no],256);
 
 		/* kind of process the buffer */
-		set_slsb(&slsb[f_mod_no],SLSB_P_INPUT_NOT_INIT);
+		set_slsb(q, &f_mod_no, SLSB_P_INPUT_NOT_INIT, &count);
 
 		if (q->qdio_error)
 			q->error_status_flags|=
@@ -857,7 +1116,7 @@ out:
 
 #ifdef QDIO_USE_PROCESSING_STATE
 	if (last_position>=0)
-		set_slsb(&slsb[last_position],SLSB_P_INPUT_PROCESSING);
+		set_slsb(q, &last_position, SLSB_P_INPUT_NOT_INIT, &count);
 #endif /* QDIO_USE_PROCESSING_STATE */
 
 	QDIO_DBF_HEX4(0,trace,&q->first_to_check,sizeof(int));
@@ -902,6 +1161,10 @@ static inline int
 tiqdio_is_inbound_q_done(struct qdio_q *q)
 {
 	int no_used;
+	unsigned int start_buf, count;
+	unsigned char state = 0;
+	struct qdio_irq *irq = (struct qdio_irq *) q->irq_ptr;
+
 #ifdef CONFIG_QDIO_DEBUG
 	char dbf_text[15];
 #endif
@@ -927,8 +1190,13 @@ tiqdio_is_inbound_q_done(struct qdio_q *q)
 	if (!q->siga_sync)
 		/* we'll check for more primed buffers in qeth_stop_polling */
 		return 0;
-
-	if (q->slsb.acc.val[q->first_to_check]!=SLSB_P_INPUT_PRIMED)
+	if (irq->is_qebsm) {
+		count = 1;
+		start_buf = q->first_to_check;
+		qdio_do_eqbs(q, &state, &start_buf, &count);
+	} else
+		state = q->slsb.acc.val[q->first_to_check];
+	if (state != SLSB_P_INPUT_PRIMED)
 		/* 
 		 * nothing more to do, if next buffer is not PRIMED.
 		 * note that we did a SYNC_MEMORY before, that there
@@ -955,6 +1223,10 @@ static inline int
 qdio_is_inbound_q_done(struct qdio_q *q)
 {
 	int no_used;
+	unsigned int start_buf, count;
+	unsigned char state = 0;
+	struct qdio_irq *irq = (struct qdio_irq *) q->irq_ptr;
+
 #ifdef CONFIG_QDIO_DEBUG
 	char dbf_text[15];
 #endif
@@ -973,8 +1245,13 @@ qdio_is_inbound_q_done(struct qdio_q *q)
 		QDIO_DBF_TEXT4(0,trace,dbf_text);
 		return 1;
 	}
-
-	if (q->slsb.acc.val[q->first_to_check]==SLSB_P_INPUT_PRIMED) {
+	if (irq->is_qebsm) {
+		count = 1;
+		start_buf = q->first_to_check;
+		qdio_do_eqbs(q, &state, &start_buf, &count);
+	} else
+		state = q->slsb.acc.val[q->first_to_check];
+	if (state == SLSB_P_INPUT_PRIMED) {
 		/* we got something to do */
 		QDIO_DBF_TEXT4(0,trace,"inqisntA");
 		QDIO_DBF_HEX4(0,trace,&q,sizeof(void*));
@@ -1523,11 +1800,11 @@ qdio_fill_qs(struct qdio_irq *irq_ptr, struct ccw_device *cdev,
 		QDIO_DBF_HEX2(0,setup,&ptr,sizeof(void*));
 
 		/* fill in slsb */
-		for (j=0;j<QDIO_MAX_BUFFERS_PER_Q;j++) {
-			set_slsb(&q->slsb.acc.val[j],
-		   		 SLSB_P_INPUT_NOT_INIT);
-/*			q->sbal[j]->element[1].sbalf.i1.key=QDIO_STORAGE_KEY;*/
-		}
+		if (!irq_ptr->is_qebsm) {
+                        unsigned int count = 1;
+                        for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; j++)
+                                set_slsb(q, &j, SLSB_P_INPUT_NOT_INIT, &count);
+                }
 	}
 
 	for (i=0;i<no_output_qs;i++) {
@@ -1584,11 +1861,11 @@ qdio_fill_qs(struct qdio_irq *irq_ptr, struct ccw_device *cdev,
 		QDIO_DBF_HEX2(0,setup,&ptr,sizeof(void*));
 
 		/* fill in slsb */
-		for (j=0;j<QDIO_MAX_BUFFERS_PER_Q;j++) {
-			set_slsb(&q->slsb.acc.val[j],
-		   		 SLSB_P_OUTPUT_NOT_INIT);
-/*			q->sbal[j]->element[1].sbalf.i1.key=QDIO_STORAGE_KEY;*/
-		}
+                if (!irq_ptr->is_qebsm) {
+                        unsigned int count = 1;
+                        for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; j++)
+                                set_slsb(q, &j, SLSB_P_OUTPUT_NOT_INIT, &count);
+                }
 	}
 }
 
@@ -1905,7 +2182,7 @@ int
 qdio_synchronize(struct ccw_device *cdev, unsigned int flags,
 		 unsigned int queue_number)
 {
-	int cc;
+	int cc = 0;
 	struct qdio_q *q;
 	struct qdio_irq *irq_ptr;
 	void *ptr;
@@ -1929,12 +2206,14 @@ qdio_synchronize(struct ccw_device *cdev, unsigned int flags,
 		q=irq_ptr->input_qs[queue_number];
 		if (!q)
 			return -EINVAL;
-		cc = do_siga_sync(q->irq, 0, q->mask);
+		if (!(irq_ptr->is_qebsm))
+			cc = do_siga_sync(0x10000|q->irq, 0, q->mask);
 	} else if (flags&QDIO_FLAG_SYNC_OUTPUT) {
 		q=irq_ptr->output_qs[queue_number];
 		if (!q)
 			return -EINVAL;
-		cc = do_siga_sync(q->irq, q->mask, 0);
+		if (!(irq_ptr->is_qebsm))
+			cc = do_siga_sync(0x10000|q->irq, q->mask, 0);
 	} else 
 		return -EINVAL;
 
@@ -1945,12 +2224,49 @@ qdio_synchronize(struct ccw_device *cdev, unsigned int flags,
 	return cc;
 }
 
-static unsigned char
-qdio_check_siga_needs(int sch)
+static inline void
+qdio_check_subchannel_qebsm(struct qdio_irq *irq_ptr, unsigned char qdioac,
+			    unsigned long token)
+{
+	struct qdio_q *q;
+	int i;
+	unsigned int count, start_buf;
+	char dbf_text[15];
+
+	/*check if QEBSM is disabled */
+	if (!(irq_ptr->is_qebsm) || !(qdioac & 0x01)) {
+		irq_ptr->is_qebsm  = 0;
+		irq_ptr->sch_token = 0;
+		irq_ptr->qib.rflags &= ~QIB_RFLAGS_ENABLE_QEBSM;
+		QDIO_DBF_TEXT0(0,setup,"noV=V");
+		return;
+	}
+	irq_ptr->sch_token = token;
+	/*input queue*/
+	for (i = 0; i < irq_ptr->no_input_qs;i++) {
+		q = irq_ptr->input_qs[i];
+		count = QDIO_MAX_BUFFERS_PER_Q;
+		start_buf = 0;
+		set_slsb(q, &start_buf, SLSB_P_INPUT_NOT_INIT, &count);
+	}
+	sprintf(dbf_text,"V=V:%2x",irq_ptr->is_qebsm);
+	QDIO_DBF_TEXT0(0,setup,dbf_text);
+	sprintf(dbf_text,"%8lx",irq_ptr->sch_token);
+	QDIO_DBF_TEXT0(0,setup,dbf_text);
+	/*output queue*/
+	for (i = 0; i < irq_ptr->no_output_qs; i++) {
+		q = irq_ptr->output_qs[i];
+		count = QDIO_MAX_BUFFERS_PER_Q;
+		start_buf = 0;
+		set_slsb(q, &start_buf, SLSB_P_OUTPUT_NOT_INIT, &count);
+	}
+}
+
+static void
+qdio_get_ssqd_information(struct qdio_irq *irq_ptr)
 {
 	int result;
 	unsigned char qdioac;
-
 	struct {
 		struct chsc_header request;
 		u16 reserved1;
@@ -1964,67 +2280,80 @@ qdio_check_siga_needs(int sch)
 		u8  reserved5;
 		u16 sch;
 		u8  qfmt;
-		u8  reserved6;
-		u8  qdioac;
+		u8  parm;
+		u8  qdioac1;
 		u8  sch_class;
 		u8  reserved7;
 		u8  icnt;
 		u8  reserved8;
 		u8  ocnt;
+		u8 reserved9;
+		u8 mbccnt;
+		u16 qdioac2;
+		u64 sch_token;
 	} *ssqd_area;
 
+	QDIO_DBF_TEXT0(0,setup,"getssqd");
+	qdioac = 0;
 	ssqd_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
 	if (!ssqd_area) {
 	        QDIO_PRINT_WARN("Could not get memory for chsc. Using all " \
-				"SIGAs for sch x%x.\n", sch);
-		return CHSC_FLAG_SIGA_INPUT_NECESSARY ||
-			CHSC_FLAG_SIGA_OUTPUT_NECESSARY ||
-			CHSC_FLAG_SIGA_SYNC_NECESSARY; /* all flags set */
+				"SIGAs for sch x%x.\n", irq_ptr->irq);
+		irq_ptr->qdioac = CHSC_FLAG_SIGA_INPUT_NECESSARY ||
+				  CHSC_FLAG_SIGA_OUTPUT_NECESSARY ||
+				  CHSC_FLAG_SIGA_SYNC_NECESSARY; /* all flags set */
+		irq_ptr->is_qebsm = 0;
+		irq_ptr->sch_token = 0;
+		irq_ptr->qib.rflags &= ~QIB_RFLAGS_ENABLE_QEBSM;
+		return;
 	}
+
 	ssqd_area->request = (struct chsc_header) {
 		.length = 0x0010,
 		.code   = 0x0024,
 	};
-
-	ssqd_area->first_sch = sch;
-	ssqd_area->last_sch = sch;
-
-	result=chsc(ssqd_area);
+	ssqd_area->first_sch = irq_ptr->irq;
+	ssqd_area->last_sch = irq_ptr->irq;
+	result = chsc(ssqd_area);
 
 	if (result) {
 		QDIO_PRINT_WARN("CHSC returned cc %i. Using all " \
 				"SIGAs for sch x%x.\n",
-				result,sch);
+				result, irq_ptr->irq);
 		qdioac = CHSC_FLAG_SIGA_INPUT_NECESSARY ||
 			CHSC_FLAG_SIGA_OUTPUT_NECESSARY ||
 			CHSC_FLAG_SIGA_SYNC_NECESSARY; /* all flags set */
+		irq_ptr->is_qebsm  = 0;
 		goto out;
 	}
 
 	if (ssqd_area->response.code != QDIO_CHSC_RESPONSE_CODE_OK) {
 		QDIO_PRINT_WARN("response upon checking SIGA needs " \
 				"is 0x%x. Using all SIGAs for sch x%x.\n",
-				ssqd_area->response.code, sch);
+				ssqd_area->response.code, irq_ptr->irq);
 		qdioac = CHSC_FLAG_SIGA_INPUT_NECESSARY ||
 			CHSC_FLAG_SIGA_OUTPUT_NECESSARY ||
 			CHSC_FLAG_SIGA_SYNC_NECESSARY; /* all flags set */
+		irq_ptr->is_qebsm  = 0;
 		goto out;
 	}
 	if (!(ssqd_area->flags & CHSC_FLAG_QDIO_CAPABILITY) ||
 	    !(ssqd_area->flags & CHSC_FLAG_VALIDITY) ||
-	    (ssqd_area->sch != sch)) {
+	    (ssqd_area->sch != irq_ptr->irq)) {
 		QDIO_PRINT_WARN("huh? problems checking out sch x%x... " \
-				"using all SIGAs.\n",sch);
+				"using all SIGAs.\n",irq_ptr->irq);
 		qdioac = CHSC_FLAG_SIGA_INPUT_NECESSARY |
 			CHSC_FLAG_SIGA_OUTPUT_NECESSARY |
 			CHSC_FLAG_SIGA_SYNC_NECESSARY; /* worst case */
+		irq_ptr->is_qebsm  = 0;
 		goto out;
 	}
-
-	qdioac = ssqd_area->qdioac;
+	qdioac = ssqd_area->qdioac1;
 out:
+	qdio_check_subchannel_qebsm(irq_ptr, qdioac,
+				    ssqd_area->sch_token);
 	free_page ((unsigned long) ssqd_area);
-	return qdioac;
+	irq_ptr->qdioac = qdioac;
 }
 
 static unsigned int
@@ -2055,6 +2384,13 @@ tiqdio_check_chsc_availability(void)
 	sprintf(dbf_text,"hydrati%1x", hydra_thinints);
 	QDIO_DBF_TEXT0(0,setup,dbf_text);
 
+#ifdef CONFIG_ARCH_S390X
+	/* Check for QEBSM support in general (bit 58). */
+	is_passthrough = css_general_characteristics.qebsm;
+#endif
+	sprintf(dbf_text,"cssQBS:%1x", is_passthrough);
+	QDIO_DBF_TEXT0(0,setup,dbf_text);
+
 	/* Check for aif time delay disablement fac (bit 56). If installed,
 	 * omit svs even under lpar (good point by rick again) */
 	omit_svs = css_general_characteristics.aif_tdd;
@@ -2698,7 +3034,7 @@ int qdio_fill_irq(struct qdio_initialize *init_data)
 	QDIO_DBF_TEXT2(0,setup,dbf_text);
 
 	if (irq_ptr->is_thinint_irq) {
-		irq_ptr->dev_st_chg_ind=qdio_get_indicator();
+		irq_ptr->dev_st_chg_ind = qdio_get_indicator();
 		QDIO_DBF_HEX1(0,setup,&irq_ptr->dev_st_chg_ind,sizeof(void*));
 		if (!irq_ptr->dev_st_chg_ind) {
 			QDIO_PRINT_WARN("no indicator location available " \
@@ -2747,6 +3083,10 @@ int qdio_fill_irq(struct qdio_initialize *init_data)
 	irq_ptr->qdr->qkey=QDIO_STORAGE_KEY;
 
 	/* fill in qib */
+	irq_ptr->is_qebsm = is_passthrough;
+	if (irq_ptr->is_qebsm)
+		irq_ptr->qib.rflags |= QIB_RFLAGS_ENABLE_QEBSM;
+
 	irq_ptr->qib.qfmt=init_data->q_format;
 	if (init_data->no_input_qs)
 		irq_ptr->qib.isliba=(unsigned long)(irq_ptr->input_qs[0]->slib);
@@ -2884,7 +3224,7 @@ qdio_establish(struct qdio_initialize *init_data)
 		return -EIO;
 	}
 
-	irq_ptr->qdioac=qdio_check_siga_needs(irq_ptr->irq);
+	qdio_get_ssqd_information(irq_ptr);
 	/* if this gets set once, we're running under VM and can omit SVSes */
 	if (irq_ptr->qdioac&CHSC_FLAG_SIGA_SYNC_NECESSARY)
 		omit_svs=1;
@@ -3015,30 +3355,40 @@ static inline void
 qdio_do_qdio_fill_input(struct qdio_q *q, unsigned int qidx,
 			unsigned int count, struct qdio_buffer *buffers)
 {
+	struct qdio_irq *irq = (struct qdio_irq *) q->irq_ptr;
+	qidx &= (QDIO_MAX_BUFFERS_PER_Q - 1);
+	if (irq->is_qebsm) {
+		while (count)
+			set_slsb(q, &qidx, SLSB_CU_INPUT_EMPTY, &count);
+		return;
+	}
 	for (;;) {
-		set_slsb(&q->slsb.acc.val[qidx],SLSB_CU_INPUT_EMPTY);
+		set_slsb(q, &qidx, SLSB_CU_INPUT_EMPTY, &count);
 		count--;
 		if (!count) break;
-		qidx=(qidx+1)&(QDIO_MAX_BUFFERS_PER_Q-1);
+		qidx = (qidx + 1) & (QDIO_MAX_BUFFERS_PER_Q - 1);
 	}
-
-	/* not necessary, as the queues are synced during the SIGA read */
-	/*SYNC_MEMORY;*/
 }
 
 static inline void
 qdio_do_qdio_fill_output(struct qdio_q *q, unsigned int qidx,
 			 unsigned int count, struct qdio_buffer *buffers)
 {
+	struct qdio_irq *irq = (struct qdio_irq *) q->irq_ptr;
+
+	qidx &= (QDIO_MAX_BUFFERS_PER_Q - 1);
+	if (irq->is_qebsm) {
+		while (count)
+			set_slsb(q, &qidx, SLSB_CU_OUTPUT_PRIMED, &count);
+		return;
+	}
+
 	for (;;) {
-		set_slsb(&q->slsb.acc.val[qidx],SLSB_CU_OUTPUT_PRIMED);
+		set_slsb(q, &qidx, SLSB_CU_OUTPUT_PRIMED, &count);
 		count--;
 		if (!count) break;
-		qidx=(qidx+1)&(QDIO_MAX_BUFFERS_PER_Q-1);
+		qidx = (qidx + 1) & (QDIO_MAX_BUFFERS_PER_Q - 1);
 	}
-
-	/* SIGA write will sync the queues */
-	/*SYNC_MEMORY;*/
 }
 
 static inline void
@@ -3083,6 +3433,9 @@ do_qdio_handle_outbound(struct qdio_q *q, unsigned int callflags,
 			struct qdio_buffer *buffers)
 {
 	int used_elements;
+	unsigned int cnt, start_buf;
+	unsigned char state = 0;
+	struct qdio_irq *irq = (struct qdio_irq *) q->irq_ptr;
 
 	/* This is the outbound handling of queues */
 #ifdef QDIO_PERFORMANCE_STATS
@@ -3115,9 +3468,15 @@ do_qdio_handle_outbound(struct qdio_q *q, unsigned int callflags,
 			 * SYNC_MEMORY :-/ ), we try to
 			 * fast-requeue buffers 
 			 */
-			if (q->slsb.acc.val[(qidx+QDIO_MAX_BUFFERS_PER_Q-1)
-					    &(QDIO_MAX_BUFFERS_PER_Q-1)]!=
-			    SLSB_CU_OUTPUT_PRIMED) {
+			if (irq->is_qebsm) {
+				cnt = 1;
+				start_buf = ((qidx+QDIO_MAX_BUFFERS_PER_Q-1) &
+					     (QDIO_MAX_BUFFERS_PER_Q-1));
+				qdio_do_eqbs(q, &state, &start_buf, &cnt);
+			} else
+				state = q->slsb.acc.val[(qidx+QDIO_MAX_BUFFERS_PER_Q-1)
+					&(QDIO_MAX_BUFFERS_PER_Q-1) ];
+			 if (state != SLSB_CU_OUTPUT_PRIMED) {
 				qdio_kick_outbound_q(q);
 			} else {
 				QDIO_DBF_TEXT3(0,trace, "fast-req");
diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h
index 328e31c..b5d303e 100644
--- a/drivers/s390/cio/qdio.h
+++ b/drivers/s390/cio/qdio.h
@@ -3,14 +3,13 @@
 
 #include <asm/page.h>
 
-#define VERSION_CIO_QDIO_H "$Revision: 1.33 $"
+#define VERSION_CIO_QDIO_H "$Revision: 1.37 $"
 
 #ifdef CONFIG_QDIO_DEBUG
 #define QDIO_VERBOSE_LEVEL 9
 #else /* CONFIG_QDIO_DEBUG */
 #define QDIO_VERBOSE_LEVEL 5
 #endif /* CONFIG_QDIO_DEBUG */
-
 #define QDIO_USE_PROCESSING_STATE
 
 #ifdef CONFIG_QDIO_PERF_STATS
@@ -265,6 +264,58 @@ QDIO_PRINT_##importance(header "%02x %02x %02x %02x  %02x %02x %02x %02x  " \
 /*
  * Some instructions as assembly
  */
+
+static inline int
+do_sqbs(unsigned long sch, unsigned char state, int queue,
+       unsigned int *start, unsigned int *count)
+{
+#ifdef CONFIG_ARCH_S390X
+       register unsigned long _ccq asm ("0") = *count;
+       register unsigned long _sch asm ("1") = sch;
+       unsigned long _queuestart = ((unsigned long)queue << 32) | *start;
+
+       asm volatile (
+              " .insn rsy,0xeb000000008A,%1,0,0(%2)\n\t"
+              : "+d" (_ccq), "+d" (_queuestart)
+              : "d" ((unsigned long)state), "d" (_sch)
+              : "memory", "cc"
+       );
+       *count = _ccq & 0xff;
+       *start = _queuestart & 0xff;
+
+       return (_ccq >> 32) & 0xff;
+#else
+       return 0;
+#endif
+}
+
+static inline int
+do_eqbs(unsigned long sch, unsigned char *state, int queue,
+	unsigned int *start, unsigned int *count)
+{
+#ifdef CONFIG_ARCH_S390X
+	register unsigned long _ccq asm ("0") = *count;
+	register unsigned long _sch asm ("1") = sch;
+	unsigned long _queuestart = ((unsigned long)queue << 32) | *start;
+	unsigned long _state = 0;
+
+	asm volatile (
+	      " .insn rrf,0xB99c0000,%1,%2,0,0  \n\t"
+	      : "+d" (_ccq), "+d" (_queuestart), "+d" (_state)
+	      : "d" (_sch)
+	      : "memory", "cc"
+	);
+	*count = _ccq & 0xff;
+	*start = _queuestart & 0xff;
+	*state = _state & 0xff;
+
+	return (_ccq >> 32) & 0xff;
+#else
+	return 0;
+#endif
+}
+
+
 static inline int
 do_siga_sync(unsigned int irq, unsigned int mask1, unsigned int mask2)
 {
@@ -280,7 +331,7 @@ do_siga_sync(unsigned int irq, unsigned int mask1, unsigned int mask2)
 		"ipm	%0	\n\t"
 		"srl	%0,28	\n\t"
 		: "=d" (cc)
-		: "d" (0x10000|irq), "d" (mask1), "d" (mask2)
+		: "d" (irq), "d" (mask1), "d" (mask2)
 		: "cc", "0", "1", "2", "3"
 		);
 #else /* CONFIG_ARCH_S390X */
@@ -293,7 +344,7 @@ do_siga_sync(unsigned int irq, unsigned int mask1, unsigned int mask2)
 		"ipm	%0	\n\t"
 		"srl	%0,28	\n\t"
 		: "=d" (cc)
-		: "d" (0x10000|irq), "d" (mask1), "d" (mask2)
+		: "d" (irq), "d" (mask1), "d" (mask2)
 		: "cc", "0", "1", "2", "3"
 		);
 #endif /* CONFIG_ARCH_S390X */
@@ -314,7 +365,7 @@ do_siga_input(unsigned int irq, unsigned int mask)
 		"ipm	%0	\n\t"
 		"srl	%0,28	\n\t"
 		: "=d" (cc)
-		: "d" (0x10000|irq), "d" (mask)
+		: "d" (irq), "d" (mask)
 		: "cc", "0", "1", "2", "memory"
 		);
 #else /* CONFIG_ARCH_S390X */
@@ -326,7 +377,7 @@ do_siga_input(unsigned int irq, unsigned int mask)
 		"ipm	%0	\n\t"
 		"srl	%0,28	\n\t"
 		: "=d" (cc)
-		: "d" (0x10000|irq), "d" (mask)
+		: "d" (irq), "d" (mask)
 		: "cc", "0", "1", "2", "memory"
 		);
 #endif /* CONFIG_ARCH_S390X */
@@ -335,7 +386,8 @@ do_siga_input(unsigned int irq, unsigned int mask)
 }
 
 static inline int
-do_siga_output(unsigned long irq, unsigned long mask, __u32 *bb)
+do_siga_output(unsigned long irq, unsigned long mask, __u32 *bb,
+	       unsigned int fc)
 {
 	int cc;
 	__u32 busy_bit;
@@ -366,14 +418,14 @@ do_siga_output(unsigned long irq, unsigned long mask, __u32 *bb)
 		".long	0b,2b	\n\t"
 		".previous	\n\t"
 		: "=d" (cc), "=d" (busy_bit)
-		: "d" (0x10000|irq), "d" (mask),
+		: "d" (irq), "d" (mask),
 		"i" (QDIO_SIGA_ERROR_ACCESS_EXCEPTION)
 		: "cc", "0", "1", "2", "memory"
 		);
 #else /* CONFIG_ARCH_S390X */
 	asm volatile (
-		"lghi	0,0	\n\t"
-		"llgfr	1,%2	\n\t"
+        	"llgfr  0,%5    \n\t"
+                "lgr    1,%2    \n\t"
 		"llgfr	2,%3	\n\t"
 		"siga	0	\n\t"
 		"0:"
@@ -391,8 +443,8 @@ do_siga_output(unsigned long irq, unsigned long mask, __u32 *bb)
 		".quad	0b,1b	\n\t"
 		".previous	\n\t"
 		: "=d" (cc), "=d" (busy_bit)
-		: "d" (0x10000|irq), "d" (mask),
-		"i" (QDIO_SIGA_ERROR_ACCESS_EXCEPTION)
+		: "d" (irq), "d" (mask),
+		"i" (QDIO_SIGA_ERROR_ACCESS_EXCEPTION), "d" (fc)
 		: "cc", "0", "1", "2", "memory"
 		);
 #endif /* CONFIG_ARCH_S390X */
@@ -494,33 +546,12 @@ struct qdio_perf_stats {
 #define QDIO_GET_ADDR(x) ((__u32)(long)x)
 #endif /* CONFIG_ARCH_S390X */
 
-#ifdef CONFIG_QDIO_DEBUG
-#define set_slsb(x,y) \
-  if(q->queue_type==QDIO_TRACE_QTYPE) { \
-        if(q->is_input_q) { \
-            QDIO_DBF_HEX2(0,slsb_in,&q->slsb,QDIO_MAX_BUFFERS_PER_Q); \
-        } else { \
-            QDIO_DBF_HEX2(0,slsb_out,&q->slsb,QDIO_MAX_BUFFERS_PER_Q); \
-        } \
-  } \
-  qdio_set_slsb(x,y); \
-  if(q->queue_type==QDIO_TRACE_QTYPE) { \
-        if(q->is_input_q) { \
-            QDIO_DBF_HEX2(0,slsb_in,&q->slsb,QDIO_MAX_BUFFERS_PER_Q); \
-        } else { \
-            QDIO_DBF_HEX2(0,slsb_out,&q->slsb,QDIO_MAX_BUFFERS_PER_Q); \
-        } \
-  }
-#else /* CONFIG_QDIO_DEBUG */
-#define set_slsb(x,y) qdio_set_slsb(x,y)
-#endif /* CONFIG_QDIO_DEBUG */
-
 struct qdio_q {
 	volatile struct slsb slsb;
 
 	char unused[QDIO_MAX_BUFFERS_PER_Q];
 
-	__u32 * volatile dev_st_chg_ind;
+	__u32 * dev_st_chg_ind;
 
 	int is_input_q;
 	int irq;
@@ -568,6 +599,7 @@ struct qdio_q {
 	struct tasklet_struct tasklet;
 #endif /* QDIO_USE_TIMERS_FOR_POLLING */
 
+
 	enum qdio_irq_states state;
 
 	/* used to store the error condition during a data transfer */
@@ -624,6 +656,10 @@ struct qdio_irq {
 	unsigned int hydra_gives_outbound_pcis;
 	unsigned int sync_done_on_outb_pcis;
 
+	/* QEBSM facility */
+	unsigned int is_qebsm;
+	unsigned long sch_token;
+
 	enum qdio_irq_states state;
 
 	unsigned int no_input_qs;
diff --git a/include/asm-s390/qdio.h b/include/asm-s390/qdio.h
index 0ddf0a8..7bc15f0 100644
--- a/include/asm-s390/qdio.h
+++ b/include/asm-s390/qdio.h
@@ -195,12 +195,14 @@ struct qdr {
 /*
  * queue information block (QIB)
  */
-#define QIB_AC_INBOUND_PCI_SUPPORTED 0x80
-#define QIB_AC_OUTBOUND_PCI_SUPPORTED 0x40
+#define QIB_AC_INBOUND_PCI_SUPPORTED 	0x80
+#define QIB_AC_OUTBOUND_PCI_SUPPORTED 	0x40
+#define QIB_RFLAGS_ENABLE_QEBSM		0x80
+
 struct qib {
 	unsigned int  qfmt    :  8;     /* queue format */
 	unsigned int  pfmt    :  8;     /* impl. dep. parameter format */
-	unsigned int  res1    :  8;     /* reserved */
+	unsigned int  rflags  :  8;	/* QEBSM */
 	unsigned int  ac      :  8;     /* adapter characteristics */
 	unsigned int  res2;             /* reserved */
 #ifdef QDIO_32_BIT
-- 
cgit v1.1


From a8237fc4108060402d904bea5e1062e22e731969 Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cohuck@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:21 -0800
Subject: [PATCH] s390: introduce struct subchannel_id

This patch introduces a struct subchannel_id containing the subchannel number
(formerly referred to as "irq") and switches code formerly relying on the
subchannel number over to it.

While we're touching inline assemblies anyway, make sure they have correct
memory constraints.

Signed-off-by: Cornelia Huck <cohuck@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/s390/cio/blacklist.c     |  23 +++++----
 drivers/s390/cio/chsc.c          |  63 +++++++++++++----------
 drivers/s390/cio/cio.c           |  84 ++++++++++++++++--------------
 drivers/s390/cio/cio.h           |  11 ++--
 drivers/s390/cio/cmf.c           |   8 +--
 drivers/s390/cio/css.c           |  69 +++++++++++++------------
 drivers/s390/cio/css.h           |  13 ++---
 drivers/s390/cio/device.c        |  17 +++++--
 drivers/s390/cio/device.h        |   1 +
 drivers/s390/cio/device_fsm.c    |  18 +++----
 drivers/s390/cio/device_id.c     |   6 +--
 drivers/s390/cio/device_ops.c    |   4 +-
 drivers/s390/cio/device_pgid.c   |  13 ++---
 drivers/s390/cio/device_status.c |   8 +--
 drivers/s390/cio/ioasm.h         |  55 +++++++++++---------
 drivers/s390/cio/qdio.c          | 107 ++++++++++++++++++++-------------------
 drivers/s390/cio/qdio.h          |  26 +++++-----
 drivers/s390/cio/schid.h         |  25 +++++++++
 18 files changed, 313 insertions(+), 238 deletions(-)
 create mode 100644 drivers/s390/cio/schid.h

diff --git a/drivers/s390/cio/blacklist.c b/drivers/s390/cio/blacklist.c
index a1c52a6..a4b0303 100644
--- a/drivers/s390/cio/blacklist.c
+++ b/drivers/s390/cio/blacklist.c
@@ -35,7 +35,7 @@
  */
 
 /* 65536 bits to indicate if a devno is blacklisted or not */
-#define __BL_DEV_WORDS ((__MAX_SUBCHANNELS + (8*sizeof(long) - 1)) / \
+#define __BL_DEV_WORDS ((__MAX_SUBCHANNEL + (8*sizeof(long) - 1)) / \
 			 (8*sizeof(long)))
 static unsigned long bl_dev[__BL_DEV_WORDS];
 typedef enum {add, free} range_action;
@@ -50,7 +50,7 @@ blacklist_range (range_action action, unsigned int from, unsigned int to)
 	if (!to)
 		to = from;
 
-	if (from > to || to > __MAX_SUBCHANNELS) {
+	if (from > to || to > __MAX_SUBCHANNEL) {
 		printk (KERN_WARNING "Invalid blacklist range "
 			"0x%04x to 0x%04x, skipping\n", from, to);
 		return;
@@ -143,7 +143,7 @@ blacklist_parse_parameters (char *str, range_action action)
 		if (strncmp(str,"all,",4) == 0 || strcmp(str,"all") == 0 ||
 		    strncmp(str,"all\n",4) == 0 || strncmp(str,"all ",4) == 0) {
 			from = 0;
-			to = __MAX_SUBCHANNELS;
+			to = __MAX_SUBCHANNEL;
 			str += 3;
 		} else {
 			int rc;
@@ -226,20 +226,21 @@ is_blacklisted (int devno)
 static inline void
 s390_redo_validation (void)
 {
-	unsigned int irq;
+	struct subchannel_id schid;
 
 	CIO_TRACE_EVENT (0, "redoval");
-	for (irq = 0; irq < __MAX_SUBCHANNELS; irq++) {
+	init_subchannel_id(&schid);
+	do {
 		int ret;
 		struct subchannel *sch;
 
-		sch = get_subchannel_by_schid(irq);
+		sch = get_subchannel_by_schid(schid);
 		if (sch) {
 			/* Already known. */
 			put_device(&sch->dev);
 			continue;
 		}
-		ret = css_probe_device(irq);
+		ret = css_probe_device(schid);
 		if (ret == -ENXIO)
 			break; /* We're through. */
 		if (ret == -ENOMEM)
@@ -248,7 +249,7 @@ s390_redo_validation (void)
 			 * panic.
 			 */
 			break;
-	}
+	} while (schid.sch_no++ < __MAX_SUBCHANNEL);
 }
 
 /*
@@ -289,12 +290,12 @@ static int cio_ignore_read (char *page, char **start, off_t off,
 	len = 0;
 	for (devno = off; /* abuse the page variable
 			   * as counter, see fs/proc/generic.c */
-	     devno < __MAX_SUBCHANNELS && len + entry_size < count; devno++) {
+	     devno < __MAX_SUBCHANNEL && len + entry_size < count; devno++) {
 		if (!test_bit(devno, bl_dev))
 			continue;
 		len += sprintf(page + len, "0.0.%04lx", devno);
 		if (test_bit(devno + 1, bl_dev)) { /* print range */
-			while (++devno < __MAX_SUBCHANNELS)
+			while (++devno < __MAX_SUBCHANNEL)
 				if (!test_bit(devno, bl_dev))
 					break;
 			len += sprintf(page + len, "-0.0.%04lx", --devno);
@@ -302,7 +303,7 @@ static int cio_ignore_read (char *page, char **start, off_t off,
 		len += sprintf(page + len, "\n");
 	}
 
-	if (devno < __MAX_SUBCHANNELS)
+	if (devno < __MAX_SUBCHANNEL)
 		*eof = 1;
 	*start = (char *) (devno - off); /* number of checked entries */
 	return len;
diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c
index fa3c23b..aff5d14 100644
--- a/drivers/s390/cio/chsc.c
+++ b/drivers/s390/cio/chsc.c
@@ -104,8 +104,8 @@ chsc_get_sch_desc_irq(struct subchannel *sch, void *page)
 		.code   = 0x0004,
 	};
 
-	ssd_area->f_sch = sch->irq;
-	ssd_area->l_sch = sch->irq;
+	ssd_area->f_sch = sch->schid.sch_no;
+	ssd_area->l_sch = sch->schid.sch_no;
 
 	ccode = chsc(ssd_area);
 	if (ccode > 0) {
@@ -147,7 +147,8 @@ chsc_get_sch_desc_irq(struct subchannel *sch, void *page)
 	 */
 	if (ssd_area->st > 3) { /* uhm, that looks strange... */
 		CIO_CRW_EVENT(0, "Strange subchannel type %d"
-			      " for sch %04x\n", ssd_area->st, sch->irq);
+			      " for sch %04x\n", ssd_area->st,
+			      sch->schid.sch_no);
 		/*
 		 * There may have been a new subchannel type defined in the
 		 * time since this code was written; since we don't know which
@@ -157,7 +158,7 @@ chsc_get_sch_desc_irq(struct subchannel *sch, void *page)
 	} else {
 		const char *type[4] = {"I/O", "chsc", "message", "ADM"};
 		CIO_CRW_EVENT(6, "ssd: sch %04x is %s subchannel\n",
-			      sch->irq, type[ssd_area->st]);
+			      sch->schid.sch_no, type[ssd_area->st]);
 
 		sch->ssd_info.valid = 1;
 		sch->ssd_info.type = ssd_area->st;
@@ -232,7 +233,7 @@ s390_subchannel_remove_chpid(struct device *dev, void *data)
 	mask = 0x80 >> j;
 	spin_lock(&sch->lock);
 
-	stsch(sch->irq, &schib);
+	stsch(sch->schid, &schib);
 	if (!schib.pmcw.dnv)
 		goto out_unreg;
 	memcpy(&sch->schib, &schib, sizeof(struct schib));
@@ -284,7 +285,7 @@ out_unlock:
 out_unreg:
 	spin_unlock(&sch->lock);
 	sch->lpm = 0;
-	if (css_enqueue_subchannel_slow(sch->irq)) {
+	if (css_enqueue_subchannel_slow(sch->schid)) {
 		css_clear_subchannel_slow_list();
 		need_rescan = 1;
 	}
@@ -337,7 +338,7 @@ s390_process_res_acc_sch(u8 chpid, __u16 fla, u32 fla_mask,
 	 * new path information and eventually check for logically
 	 * offline chpids.
 	 */
-	ccode = stsch(sch->irq, &sch->schib);
+	ccode = stsch(sch->schid, &sch->schib);
 	if (ccode > 0)
 		return 0;
 
@@ -348,7 +349,8 @@ static int
 s390_process_res_acc (u8 chpid, __u16 fla, u32 fla_mask)
 {
 	struct subchannel *sch;
-	int irq, rc;
+	int rc;
+	struct subchannel_id schid;
 	char dbf_txt[15];
 
 	sprintf(dbf_txt, "accpr%x", chpid);
@@ -370,10 +372,11 @@ s390_process_res_acc (u8 chpid, __u16 fla, u32 fla_mask)
 		return 0; /* no need to do the rest */
 
 	rc = 0;
-	for (irq = 0; irq < __MAX_SUBCHANNELS; irq++) {
+	init_subchannel_id(&schid);
+	do {
 		int chp_mask, old_lpm;
 
-		sch = get_subchannel_by_schid(irq);
+		sch = get_subchannel_by_schid(schid);
 		if (!sch) {
 			struct schib schib;
 			int ret;
@@ -385,7 +388,7 @@ s390_process_res_acc (u8 chpid, __u16 fla, u32 fla_mask)
 			 * that beast may be on we'll have to do a stsch
 			 * on all devices, grr...
 			 */
-			if (stsch(irq, &schib)) {
+			if (stsch(schid, &schib)) {
 				/* We're through */
 				if (need_rescan)
 					rc = -EAGAIN;
@@ -396,7 +399,7 @@ s390_process_res_acc (u8 chpid, __u16 fla, u32 fla_mask)
 				continue;
 			}
 			/* Put it on the slow path. */
-			ret = css_enqueue_subchannel_slow(irq);
+			ret = css_enqueue_subchannel_slow(schid);
 			if (ret) {
 				css_clear_subchannel_slow_list();
 				need_rescan = 1;
@@ -428,7 +431,7 @@ s390_process_res_acc (u8 chpid, __u16 fla, u32 fla_mask)
 		put_device(&sch->dev);
 		if (fla_mask == 0xffff)
 			break;
-	}
+	} while (schid.sch_no++ < __MAX_SUBCHANNEL);
 	return rc;
 }
 
@@ -608,7 +611,8 @@ static int
 chp_add(int chpid)
 {
 	struct subchannel *sch;
-	int irq, ret, rc;
+	int ret, rc;
+	struct subchannel_id schid;
 	char dbf_txt[15];
 
 	if (!get_chp_status(chpid))
@@ -618,14 +622,15 @@ chp_add(int chpid)
 	CIO_TRACE_EVENT(2, dbf_txt);
 
 	rc = 0;
-	for (irq = 0; irq < __MAX_SUBCHANNELS; irq++) {
+	init_subchannel_id(&schid);
+	do {
 		int i;
 
-		sch = get_subchannel_by_schid(irq);
+		sch = get_subchannel_by_schid(schid);
 		if (!sch) {
 			struct schib schib;
 
-			if (stsch(irq, &schib)) {
+			if (stsch(schid, &schib)) {
 				/* We're through */
 				if (need_rescan)
 					rc = -EAGAIN;
@@ -636,7 +641,7 @@ chp_add(int chpid)
 				continue;
 			}
 			/* Put it on the slow path. */
-			ret = css_enqueue_subchannel_slow(irq);
+			ret = css_enqueue_subchannel_slow(schid);
 			if (ret) {
 				css_clear_subchannel_slow_list();
 				need_rescan = 1;
@@ -648,7 +653,7 @@ chp_add(int chpid)
 		spin_lock(&sch->lock);
 		for (i=0; i<8; i++)
 			if (sch->schib.pmcw.chpid[i] == chpid) {
-				if (stsch(sch->irq, &sch->schib) != 0) {
+				if (stsch(sch->schid, &sch->schib) != 0) {
 					/* Endgame. */
 					spin_unlock(&sch->lock);
 					return rc;
@@ -669,7 +674,7 @@ chp_add(int chpid)
 
 		spin_unlock(&sch->lock);
 		put_device(&sch->dev);
-	}
+	} while (schid.sch_no++ < __MAX_SUBCHANNEL);
 	return rc;
 }
 
@@ -702,7 +707,7 @@ __check_for_io_and_kill(struct subchannel *sch, int index)
 	if (!device_is_online(sch))
 		/* cio could be doing I/O. */
 		return 0;
-	cc = stsch(sch->irq, &sch->schib);
+	cc = stsch(sch->schid, &sch->schib);
 	if (cc)
 		return 0;
 	if (sch->schib.scsw.actl && sch->schib.pmcw.lpum == (0x80 >> index)) {
@@ -743,7 +748,7 @@ __s390_subchannel_vary_chpid(struct subchannel *sch, __u8 chpid, int on)
 			 * just varied off path. Then kill it.
 			 */
 			if (!__check_for_io_and_kill(sch, chp) && !sch->lpm) {
-				if (css_enqueue_subchannel_slow(sch->irq)) {
+				if (css_enqueue_subchannel_slow(sch->schid)) {
 					css_clear_subchannel_slow_list();
 					need_rescan = 1;
 				}
@@ -789,7 +794,8 @@ static int
 s390_vary_chpid( __u8 chpid, int on)
 {
 	char dbf_text[15];
-	int status, irq, ret;
+	int status, ret;
+	struct subchannel_id schid;
 	struct subchannel *sch;
 
 	sprintf(dbf_text, on?"varyon%x":"varyoff%x", chpid);
@@ -818,26 +824,27 @@ s390_vary_chpid( __u8 chpid, int on)
 	if (!on)
 		goto out;
 	/* Scan for new devices on varied on path. */
-	for (irq = 0; irq < __MAX_SUBCHANNELS; irq++) {
+	init_subchannel_id(&schid);
+	do {
 		struct schib schib;
 
 		if (need_rescan)
 			break;
-		sch = get_subchannel_by_schid(irq);
+		sch = get_subchannel_by_schid(schid);
 		if (sch) {
 			put_device(&sch->dev);
 			continue;
 		}
-		if (stsch(irq, &schib))
+		if (stsch(schid, &schib))
 			/* We're through */
 			break;
 		/* Put it on the slow path. */
-		ret = css_enqueue_subchannel_slow(irq);
+		ret = css_enqueue_subchannel_slow(schid);
 		if (ret) {
 			css_clear_subchannel_slow_list();
 			need_rescan = 1;
 		}
-	}
+	} while (schid.sch_no++ < __MAX_SUBCHANNEL);
 out:
 	if (need_rescan || css_slow_subchannels_exist())
 		queue_work(slow_path_wq, &slow_path_work);
diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c
index 185bc73..396bada 100644
--- a/drivers/s390/cio/cio.c
+++ b/drivers/s390/cio/cio.c
@@ -135,7 +135,7 @@ cio_tpi(void)
 		return 0;
 	irb = (struct irb *) __LC_IRB;
 	/* Store interrupt response block to lowcore. */
-	if (tsch (tpi_info->irq, irb) != 0)
+	if (tsch (tpi_info->schid, irb) != 0)
 		/* Not status pending or not operational. */
 		return 1;
 	sch = (struct subchannel *)(unsigned long)tpi_info->intparm;
@@ -163,10 +163,10 @@ cio_start_handle_notoper(struct subchannel *sch, __u8 lpm)
 	else
 		sch->lpm = 0;
 
-	stsch (sch->irq, &sch->schib);
+	stsch (sch->schid, &sch->schib);
 
 	CIO_MSG_EVENT(0, "cio_start: 'not oper' status for "
-		      "subchannel %04x!\n", sch->irq);
+		      "subchannel %04x!\n", sch->schid.sch_no);
 	sprintf(dbf_text, "no%s", sch->dev.bus_id);
 	CIO_TRACE_EVENT(0, dbf_text);
 	CIO_HEX_EVENT(0, &sch->schib, sizeof (struct schib));
@@ -204,7 +204,7 @@ cio_start_key (struct subchannel *sch,	/* subchannel structure */
 	sch->orb.key = key >> 4;
 	/* issue "Start Subchannel" */
 	sch->orb.cpa = (__u32) __pa (cpa);
-	ccode = ssch (sch->irq, &sch->orb);
+	ccode = ssch (sch->schid, &sch->orb);
 
 	/* process condition code */
 	sprintf (dbf_txt, "ccode:%d", ccode);
@@ -243,7 +243,7 @@ cio_resume (struct subchannel *sch)
 	CIO_TRACE_EVENT (4, "resIO");
 	CIO_TRACE_EVENT (4, sch->dev.bus_id);
 
-	ccode = rsch (sch->irq);
+	ccode = rsch (sch->schid);
 
 	sprintf (dbf_txt, "ccode:%d", ccode);
 	CIO_TRACE_EVENT (4, dbf_txt);
@@ -283,7 +283,7 @@ cio_halt(struct subchannel *sch)
 	/*
 	 * Issue "Halt subchannel" and process condition code
 	 */
-	ccode = hsch (sch->irq);
+	ccode = hsch (sch->schid);
 
 	sprintf (dbf_txt, "ccode:%d", ccode);
 	CIO_TRACE_EVENT (2, dbf_txt);
@@ -318,7 +318,7 @@ cio_clear(struct subchannel *sch)
 	/*
 	 * Issue "Clear subchannel" and process condition code
 	 */
-	ccode = csch (sch->irq);
+	ccode = csch (sch->schid);
 
 	sprintf (dbf_txt, "ccode:%d", ccode);
 	CIO_TRACE_EVENT (2, dbf_txt);
@@ -351,7 +351,7 @@ cio_cancel (struct subchannel *sch)
 	CIO_TRACE_EVENT (2, "cancelIO");
 	CIO_TRACE_EVENT (2, sch->dev.bus_id);
 
-	ccode = xsch (sch->irq);
+	ccode = xsch (sch->schid);
 
 	sprintf (dbf_txt, "ccode:%d", ccode);
 	CIO_TRACE_EVENT (2, dbf_txt);
@@ -359,7 +359,7 @@ cio_cancel (struct subchannel *sch)
 	switch (ccode) {
 	case 0:		/* success */
 		/* Update information in scsw. */
-		stsch (sch->irq, &sch->schib);
+		stsch (sch->schid, &sch->schib);
 		return 0;
 	case 1:		/* status pending */
 		return -EBUSY;
@@ -381,7 +381,7 @@ cio_modify (struct subchannel *sch)
 
 	ret = 0;
 	for (retry = 0; retry < 5; retry++) {
-		ccode = msch_err (sch->irq, &sch->schib);
+		ccode = msch_err (sch->schid, &sch->schib);
 		if (ccode < 0)	/* -EIO if msch gets a program check. */
 			return ccode;
 		switch (ccode) {
@@ -414,7 +414,7 @@ cio_enable_subchannel (struct subchannel *sch, unsigned int isc)
 	CIO_TRACE_EVENT (2, "ensch");
 	CIO_TRACE_EVENT (2, sch->dev.bus_id);
 
-	ccode = stsch (sch->irq, &sch->schib);
+	ccode = stsch (sch->schid, &sch->schib);
 	if (ccode)
 		return -ENODEV;
 
@@ -432,13 +432,13 @@ cio_enable_subchannel (struct subchannel *sch, unsigned int isc)
 			 */
 			sch->schib.pmcw.csense = 0;
 		if (ret == 0) {
-			stsch (sch->irq, &sch->schib);
+			stsch (sch->schid, &sch->schib);
 			if (sch->schib.pmcw.ena)
 				break;
 		}
 		if (ret == -EBUSY) {
 			struct irb irb;
-			if (tsch(sch->irq, &irb) != 0)
+			if (tsch(sch->schid, &irb) != 0)
 				break;
 		}
 	}
@@ -461,7 +461,7 @@ cio_disable_subchannel (struct subchannel *sch)
 	CIO_TRACE_EVENT (2, "dissch");
 	CIO_TRACE_EVENT (2, sch->dev.bus_id);
 
-	ccode = stsch (sch->irq, &sch->schib);
+	ccode = stsch (sch->schid, &sch->schib);
 	if (ccode == 3)		/* Not operational. */
 		return -ENODEV;
 
@@ -485,7 +485,7 @@ cio_disable_subchannel (struct subchannel *sch)
 			 */
 			break;
 		if (ret == 0) {
-			stsch (sch->irq, &sch->schib);
+			stsch (sch->schid, &sch->schib);
 			if (!sch->schib.pmcw.ena)
 				break;
 		}
@@ -508,12 +508,12 @@ cio_disable_subchannel (struct subchannel *sch)
  *   -ENODEV for subchannels with invalid device number or blacklisted devices
  */
 int
-cio_validate_subchannel (struct subchannel *sch, unsigned int irq)
+cio_validate_subchannel (struct subchannel *sch, struct subchannel_id schid)
 {
 	char dbf_txt[15];
 	int ccode;
 
-	sprintf (dbf_txt, "valsch%x", irq);
+	sprintf (dbf_txt, "valsch%x", schid.sch_no);
 	CIO_TRACE_EVENT (4, dbf_txt);
 
 	/* Nuke all fields. */
@@ -522,17 +522,17 @@ cio_validate_subchannel (struct subchannel *sch, unsigned int irq)
 	spin_lock_init(&sch->lock);
 
 	/* Set a name for the subchannel */
-	snprintf (sch->dev.bus_id, BUS_ID_SIZE, "0.0.%04x", irq);
+	snprintf (sch->dev.bus_id, BUS_ID_SIZE, "0.0.%04x", schid.sch_no);
 
 	/*
 	 * The first subchannel that is not-operational (ccode==3)
 	 *  indicates that there aren't any more devices available.
 	 */
-	sch->irq = irq;
-	ccode = stsch (irq, &sch->schib);
+	ccode = stsch (schid, &sch->schib);
 	if (ccode)
 		return -ENXIO;
 
+	sch->schid = schid;
 	/* Copy subchannel type from path management control word. */
 	sch->st = sch->schib.pmcw.st;
 
@@ -543,7 +543,7 @@ cio_validate_subchannel (struct subchannel *sch, unsigned int irq)
 		CIO_DEBUG(KERN_INFO, 0,
 			  "Subchannel %04X reports "
 			  "non-I/O subchannel type %04X\n",
-			  sch->irq, sch->st);
+			  sch->schid.sch_no, sch->st);
 		/* We stop here for non-io subchannels. */
 		return sch->st;
 	}
@@ -573,7 +573,7 @@ cio_validate_subchannel (struct subchannel *sch, unsigned int irq)
 	CIO_DEBUG(KERN_INFO, 0,
 		  "Detected device %04X on subchannel %04X"
 		  " - PIM = %02X, PAM = %02X, POM = %02X\n",
-		  sch->schib.pmcw.dev, sch->irq, sch->schib.pmcw.pim,
+		  sch->schib.pmcw.dev, sch->schid.sch_no, sch->schib.pmcw.pim,
 		  sch->schib.pmcw.pam, sch->schib.pmcw.pom);
 
 	/*
@@ -632,7 +632,7 @@ do_IRQ (struct pt_regs *regs)
 		if (sch)
 			spin_lock(&sch->lock);
 		/* Store interrupt response block to lowcore. */
-		if (tsch (tpi_info->irq, irb) == 0 && sch) {
+		if (tsch (tpi_info->schid, irb) == 0 && sch) {
 			/* Keep subchannel information word up to date. */
 			memcpy (&sch->schib.scsw, &irb->scsw,
 				sizeof (irb->scsw));
@@ -693,26 +693,28 @@ wait_cons_dev (void)
 static int
 cio_console_irq(void)
 {
-	int irq;
+	struct subchannel_id schid;
 	
+	init_subchannel_id(&schid);
 	if (console_irq != -1) {
 		/* VM provided us with the irq number of the console. */
-		if (stsch(console_irq, &console_subchannel.schib) != 0 ||
+		schid.sch_no = console_irq;
+		if (stsch(schid, &console_subchannel.schib) != 0 ||
 		    !console_subchannel.schib.pmcw.dnv)
 			return -1;
 		console_devno = console_subchannel.schib.pmcw.dev;
 	} else if (console_devno != -1) {
 		/* At least the console device number is known. */
-		for (irq = 0; irq < __MAX_SUBCHANNELS; irq++) {
-			if (stsch(irq, &console_subchannel.schib) != 0)
+		do {
+			if (stsch(schid, &console_subchannel.schib) != 0)
 				break;
 			if (console_subchannel.schib.pmcw.dnv &&
 			    console_subchannel.schib.pmcw.dev ==
 			    console_devno) {
-				console_irq = irq;
+				console_irq = schid.sch_no;
 				break;
 			}
-		}
+		} while (schid.sch_no++ < __MAX_SUBCHANNEL);
 		if (console_irq == -1)
 			return -1;
 	} else {
@@ -729,6 +731,7 @@ struct subchannel *
 cio_probe_console(void)
 {
 	int irq, ret;
+	struct subchannel_id schid;
 
 	if (xchg(&console_subchannel_in_use, 1) != 0)
 		return ERR_PTR(-EBUSY);
@@ -738,7 +741,9 @@ cio_probe_console(void)
 		return ERR_PTR(-ENODEV);
 	}
 	memset(&console_subchannel, 0, sizeof(struct subchannel));
-	ret = cio_validate_subchannel(&console_subchannel, irq);
+	init_subchannel_id(&schid);
+	schid.sch_no = irq;
+	ret = cio_validate_subchannel(&console_subchannel, schid);
 	if (ret) {
 		console_subchannel_in_use = 0;
 		return ERR_PTR(-ENODEV);
@@ -770,11 +775,11 @@ cio_release_console(void)
 
 /* Bah... hack to catch console special sausages. */
 int
-cio_is_console(int irq)
+cio_is_console(struct subchannel_id schid)
 {
 	if (!console_subchannel_in_use)
 		return 0;
-	return (irq == console_subchannel.irq);
+	return schid_equal(&schid, &console_subchannel.schid);
 }
 
 struct subchannel *
@@ -787,7 +792,7 @@ cio_get_console_subchannel(void)
 
 #endif
 static inline int
-__disable_subchannel_easy(unsigned int schid, struct schib *schib)
+__disable_subchannel_easy(struct subchannel_id schid, struct schib *schib)
 {
 	int retry, cc;
 
@@ -805,7 +810,7 @@ __disable_subchannel_easy(unsigned int schid, struct schib *schib)
 }
 
 static inline int
-__clear_subchannel_easy(unsigned int schid)
+__clear_subchannel_easy(struct subchannel_id schid)
 {
 	int retry;
 
@@ -815,8 +820,8 @@ __clear_subchannel_easy(unsigned int schid)
 		struct tpi_info ti;
 
 		if (tpi(&ti)) {
-			tsch(ti.irq, (struct irb *)__LC_IRB);
-			if (ti.irq == schid)
+			tsch(ti.schid, (struct irb *)__LC_IRB);
+			if (schid_equal(&ti.schid, &schid))
 				return 0;
 		}
 		udelay(100);
@@ -830,10 +835,11 @@ extern void do_reipl(unsigned long devno);
 void
 clear_all_subchannels(void)
 {
-	unsigned int schid;
+	struct subchannel_id schid;
 
 	local_irq_disable();
-	for (schid=0;schid<=highest_subchannel;schid++) {
+	init_subchannel_id(&schid);
+	do {
 		struct schib schib;
 		if (stsch(schid, &schib))
 			break; /* break out of the loop */
@@ -849,7 +855,7 @@ clear_all_subchannels(void)
 			stsch(schid, &schib);
 			__disable_subchannel_easy(schid, &schib);
 		}
-	}
+	} while (schid.sch_no++ < __MAX_SUBCHANNEL);
 }
 
 /* Make sure all subchannels are quiet before we re-ipl an lpar. */
diff --git a/drivers/s390/cio/cio.h b/drivers/s390/cio/cio.h
index c50a9da..0ca9873 100644
--- a/drivers/s390/cio/cio.h
+++ b/drivers/s390/cio/cio.h
@@ -1,6 +1,8 @@
 #ifndef S390_CIO_H
 #define S390_CIO_H
 
+#include "schid.h"
+
 /*
  * where we put the ssd info
  */
@@ -83,7 +85,7 @@ struct orb {
 
 /* subchannel data structure used by I/O subroutines */
 struct subchannel {
-	unsigned int irq;	/* aka. subchannel number */
+	struct subchannel_id schid;
 	spinlock_t lock;	/* subchannel lock */
 
 	enum {
@@ -114,7 +116,7 @@ struct subchannel {
 
 #define to_subchannel(n) container_of(n, struct subchannel, dev)
 
-extern int cio_validate_subchannel (struct subchannel *, unsigned int);
+extern int cio_validate_subchannel (struct subchannel *, struct subchannel_id);
 extern int cio_enable_subchannel (struct subchannel *, unsigned int);
 extern int cio_disable_subchannel (struct subchannel *);
 extern int cio_cancel (struct subchannel *);
@@ -127,14 +129,15 @@ extern int cio_cancel (struct subchannel *);
 extern int cio_set_options (struct subchannel *, int);
 extern int cio_get_options (struct subchannel *);
 extern int cio_modify (struct subchannel *);
+
 /* Use with care. */
 #ifdef CONFIG_CCW_CONSOLE
 extern struct subchannel *cio_probe_console(void);
 extern void cio_release_console(void);
-extern int cio_is_console(int irq);
+extern int cio_is_console(struct subchannel_id);
 extern struct subchannel *cio_get_console_subchannel(void);
 #else
-#define cio_is_console(irq) 0
+#define cio_is_console(schid) 0
 #define cio_get_console_subchannel() NULL
 #endif
 
diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c
index b978f7f..0b03714 100644
--- a/drivers/s390/cio/cmf.c
+++ b/drivers/s390/cio/cmf.c
@@ -1,5 +1,5 @@
 /*
- * linux/drivers/s390/cio/cmf.c ($Revision: 1.16 $)
+ * linux/drivers/s390/cio/cmf.c ($Revision: 1.19 $)
  *
  * Linux on zSeries Channel Measurement Facility support
  *
@@ -178,7 +178,7 @@ set_schib(struct ccw_device *cdev, u32 mme, int mbfc, unsigned long address)
 	/* msch can silently fail, so do it again if necessary */
 	for (retry = 0; retry < 3; retry++) {
 		/* prepare schib */
-		stsch(sch->irq, schib);
+		stsch(sch->schid, schib);
 		schib->pmcw.mme  = mme;
 		schib->pmcw.mbfc = mbfc;
 		/* address can be either a block address or a block index */
@@ -188,7 +188,7 @@ set_schib(struct ccw_device *cdev, u32 mme, int mbfc, unsigned long address)
 			schib->pmcw.mbi = address;
 
 		/* try to submit it */
-		switch(ret = msch_err(sch->irq, schib)) {
+		switch(ret = msch_err(sch->schid, schib)) {
 			case 0:
 				break;
 			case 1:
@@ -202,7 +202,7 @@ set_schib(struct ccw_device *cdev, u32 mme, int mbfc, unsigned long address)
 				ret = -EINVAL;
 				break;
 		}
-		stsch(sch->irq, schib); /* restore the schib */
+		stsch(sch->schid, schib); /* restore the schib */
 
 		if (ret)
 			break;
diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
index 7e4d57b..5137daf 100644
--- a/drivers/s390/cio/css.c
+++ b/drivers/s390/cio/css.c
@@ -33,7 +33,7 @@ struct device css_bus_device = {
 };
 
 static struct subchannel *
-css_alloc_subchannel(int irq)
+css_alloc_subchannel(struct subchannel_id schid)
 {
 	struct subchannel *sch;
 	int ret;
@@ -41,13 +41,11 @@ css_alloc_subchannel(int irq)
 	sch = kmalloc (sizeof (*sch), GFP_KERNEL | GFP_DMA);
 	if (sch == NULL)
 		return ERR_PTR(-ENOMEM);
-	ret = cio_validate_subchannel (sch, irq);
+	ret = cio_validate_subchannel (sch, schid);
 	if (ret < 0) {
 		kfree(sch);
 		return ERR_PTR(ret);
 	}
-	if (irq > highest_subchannel)
-		highest_subchannel = irq;
 
 	if (sch->st != SUBCHANNEL_TYPE_IO) {
 		/* For now we ignore all non-io subchannels. */
@@ -87,7 +85,7 @@ css_subchannel_release(struct device *dev)
 	struct subchannel *sch;
 
 	sch = to_subchannel(dev);
-	if (!cio_is_console(sch->irq))
+	if (!cio_is_console(sch->schid))
 		kfree(sch);
 }
 
@@ -114,12 +112,12 @@ css_register_subchannel(struct subchannel *sch)
 }
 
 int
-css_probe_device(int irq)
+css_probe_device(struct subchannel_id schid)
 {
 	int ret;
 	struct subchannel *sch;
 
-	sch = css_alloc_subchannel(irq);
+	sch = css_alloc_subchannel(schid);
 	if (IS_ERR(sch))
 		return PTR_ERR(sch);
 	ret = css_register_subchannel(sch);
@@ -132,26 +130,26 @@ static int
 check_subchannel(struct device * dev, void * data)
 {
 	struct subchannel *sch;
-	int irq = (unsigned long)data;
+	struct subchannel_id *schid = data;
 
 	sch = to_subchannel(dev);
-	return (sch->irq == irq);
+	return schid_equal(&sch->schid, schid);
 }
 
 struct subchannel *
-get_subchannel_by_schid(int irq)
+get_subchannel_by_schid(struct subchannel_id schid)
 {
 	struct device *dev;
 
 	dev = bus_find_device(&css_bus_type, NULL,
-			      (void *)(unsigned long)irq, check_subchannel);
+			      (void *)&schid, check_subchannel);
 
 	return dev ? to_subchannel(dev) : NULL;
 }
 
 
 static inline int
-css_get_subchannel_status(struct subchannel *sch, int schid)
+css_get_subchannel_status(struct subchannel *sch, struct subchannel_id schid)
 {
 	struct schib schib;
 	int cc;
@@ -170,13 +168,13 @@ css_get_subchannel_status(struct subchannel *sch, int schid)
 }
 	
 static int
-css_evaluate_subchannel(int irq, int slow)
+css_evaluate_subchannel(struct subchannel_id schid, int slow)
 {
 	int event, ret, disc;
 	struct subchannel *sch;
 	unsigned long flags;
 
-	sch = get_subchannel_by_schid(irq);
+	sch = get_subchannel_by_schid(schid);
 	disc = sch ? device_is_disconnected(sch) : 0;
 	if (disc && slow) {
 		if (sch)
@@ -194,9 +192,10 @@ css_evaluate_subchannel(int irq, int slow)
 			put_device(&sch->dev);
 		return -EAGAIN; /* Will be done on the slow path. */
 	}
-	event = css_get_subchannel_status(sch, irq);
+	event = css_get_subchannel_status(sch, schid);
 	CIO_MSG_EVENT(4, "Evaluating schid %04x, event %d, %s, %s path.\n",
-		      irq, event, sch?(disc?"disconnected":"normal"):"unknown",
+		      schid.sch_no, event,
+		      sch?(disc?"disconnected":"normal"):"unknown",
 		      slow?"slow":"fast");
 	switch (event) {
 	case CIO_NO_PATH:
@@ -253,7 +252,7 @@ css_evaluate_subchannel(int irq, int slow)
 			sch->schib.pmcw.intparm = 0;
 			cio_modify(sch);
 			put_device(&sch->dev);
-			ret = css_probe_device(irq);
+			ret = css_probe_device(schid);
 		} else {
 			/*
 			 * We can't immediately deregister the disconnected
@@ -272,7 +271,7 @@ css_evaluate_subchannel(int irq, int slow)
 			device_trigger_reprobe(sch);
 			spin_unlock_irqrestore(&sch->lock, flags);
 		}
-		ret = sch ? 0 : css_probe_device(irq);
+		ret = sch ? 0 : css_probe_device(schid);
 		break;
 	default:
 		BUG();
@@ -284,10 +283,12 @@ css_evaluate_subchannel(int irq, int slow)
 static void
 css_rescan_devices(void)
 {
-	int irq, ret;
+	int ret;
+	struct subchannel_id schid;
 
-	for (irq = 0; irq < __MAX_SUBCHANNELS; irq++) {
-		ret = css_evaluate_subchannel(irq, 1);
+	init_subchannel_id(&schid);
+	do {
+		ret = css_evaluate_subchannel(schid, 1);
 		/* No more memory. It doesn't make sense to continue. No
 		 * panic because this can happen in midflight and just
 		 * because we can't use a new device is no reason to crash
@@ -297,12 +298,12 @@ css_rescan_devices(void)
 		/* -ENXIO indicates that there are no more subchannels. */
 		if (ret == -ENXIO)
 			break;
-	}
+	} while (schid.sch_no++ < __MAX_SUBCHANNEL);
 }
 
 struct slow_subchannel {
 	struct list_head slow_list;
-	unsigned long schid;
+	struct subchannel_id schid;
 };
 
 static LIST_HEAD(slow_subchannels_head);
@@ -357,20 +358,24 @@ int
 css_process_crw(int irq)
 {
 	int ret;
+	struct subchannel_id mchk_schid;
 
 	CIO_CRW_EVENT(2, "source is subchannel %04X\n", irq);
 
 	if (need_rescan)
 		/* We need to iterate all subchannels anyway. */
 		return -EAGAIN;
+
+	init_subchannel_id(&mchk_schid);
+	mchk_schid.sch_no = irq;
 	/* 
 	 * Since we are always presented with IPI in the CRW, we have to
 	 * use stsch() to find out if the subchannel in question has come
 	 * or gone.
 	 */
-	ret = css_evaluate_subchannel(irq, 0);
+	ret = css_evaluate_subchannel(mchk_schid, 0);
 	if (ret == -EAGAIN) {
-		if (css_enqueue_subchannel_slow(irq)) {
+		if (css_enqueue_subchannel_slow(mchk_schid)) {
 			css_clear_subchannel_slow_list();
 			need_rescan = 1;
 		}
@@ -404,7 +409,8 @@ css_generate_pgid(void)
 static int __init
 init_channel_subsystem (void)
 {
-	int ret, irq;
+	int ret;
+	struct subchannel_id schid;
 
 	if (chsc_determine_css_characteristics() == 0)
 		css_characteristics_avail = 1;
@@ -420,13 +426,14 @@ init_channel_subsystem (void)
 
 	ctl_set_bit(6, 28);
 
-	for (irq = 0; irq < __MAX_SUBCHANNELS; irq++) {
+	init_subchannel_id(&schid);
+	do {
 		struct subchannel *sch;
 
-		if (cio_is_console(irq))
+		if (cio_is_console(schid))
 			sch = cio_get_console_subchannel();
 		else {
-			sch = css_alloc_subchannel(irq);
+			sch = css_alloc_subchannel(schid);
 			if (IS_ERR(sch))
 				ret = PTR_ERR(sch);
 			else
@@ -448,7 +455,7 @@ init_channel_subsystem (void)
 		 * console subchannel.
 		 */
 		css_register_subchannel(sch);
-	}
+	} while (schid.sch_no++ < __MAX_SUBCHANNEL);
 	return 0;
 
 out_bus:
@@ -482,7 +489,7 @@ struct bus_type css_bus_type = {
 subsys_initcall(init_channel_subsystem);
 
 int
-css_enqueue_subchannel_slow(unsigned long schid)
+css_enqueue_subchannel_slow(struct subchannel_id schid)
 {
 	struct slow_subchannel *new_slow_sch;
 	unsigned long flags;
diff --git a/drivers/s390/cio/css.h b/drivers/s390/cio/css.h
index 2004a6c..f26e16d 100644
--- a/drivers/s390/cio/css.h
+++ b/drivers/s390/cio/css.h
@@ -6,6 +6,8 @@
 
 #include <asm/cio.h>
 
+#include "schid.h"
+
 /*
  * path grouping stuff
  */
@@ -68,7 +70,7 @@ struct ccw_device_private {
 	atomic_t onoff;
 	unsigned long registered;
 	__u16 devno;		/* device number */
-	__u16 irq;		/* subchannel number */
+	__u16 sch_no;		/* subchannel number */
 	__u8 imask;		/* lpm mask for SNID/SID/SPGID */
 	int iretry;		/* retry counter SNID/SID/SPGID */
 	struct {
@@ -121,12 +123,11 @@ struct css_driver {
 extern struct bus_type css_bus_type;
 extern struct css_driver io_subchannel_driver;
 
-int css_probe_device(int irq);
-extern struct subchannel * get_subchannel_by_schid(int irq);
-extern unsigned int highest_subchannel;
+extern int css_probe_device(struct subchannel_id);
+extern struct subchannel * get_subchannel_by_schid(struct subchannel_id);
 extern int css_init_done;
 
-#define __MAX_SUBCHANNELS 65536
+#define __MAX_SUBCHANNEL 65535
 
 extern struct bus_type css_bus_type;
 extern struct device css_bus_device;
@@ -144,7 +145,7 @@ void device_set_waiting(struct subchannel *);
 void device_kill_pending_timer(struct subchannel *);
 
 /* Helper functions to build lists for the slow path. */
-int css_enqueue_subchannel_slow(unsigned long schid);
+extern int css_enqueue_subchannel_slow(struct subchannel_id schid);
 void css_walk_subchannel_slow_list(void (*fn)(unsigned long));
 void css_clear_subchannel_slow_list(void);
 int css_slow_subchannels_exist(void);
diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index 0590cff..9ac07ae 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -622,7 +622,7 @@ ccw_device_do_unreg_rereg(void *data)
 
 			other_sch = to_subchannel(other_cdev->dev.parent);
 			if (get_device(&other_sch->dev)) {
-				stsch(other_sch->irq, &other_sch->schib);
+				stsch(other_sch->schid, &other_sch->schib);
 				if (other_sch->schib.pmcw.dnv) {
 					other_sch->schib.pmcw.intparm = 0;
 					cio_modify(other_sch);
@@ -772,7 +772,7 @@ io_subchannel_recog(struct ccw_device *cdev, struct subchannel *sch)
 	/* Init private data. */
 	priv = cdev->private;
 	priv->devno = sch->schib.pmcw.dev;
-	priv->irq = sch->irq;
+	priv->sch_no = sch->schid.sch_no;
 	priv->state = DEV_STATE_NOT_OPER;
 	INIT_LIST_HEAD(&priv->cmb_list);
 	init_waitqueue_head(&priv->wait_q);
@@ -951,7 +951,7 @@ io_subchannel_shutdown(struct device *dev)
 	sch = to_subchannel(dev);
 	cdev = dev->driver_data;
 
-	if (cio_is_console(sch->irq))
+	if (cio_is_console(sch->schid))
 		return;
 	if (!sch->schib.pmcw.ena)
 		/* Nothing to do. */
@@ -1146,6 +1146,16 @@ ccw_driver_unregister (struct ccw_driver *cdriver)
 	driver_unregister(&cdriver->driver);
 }
 
+/* Helper func for qdio. */
+struct subchannel_id
+ccw_device_get_subchannel_id(struct ccw_device *cdev)
+{
+	struct subchannel *sch;
+
+	sch = to_subchannel(cdev->dev.parent);
+	return sch->schid;
+}
+
 MODULE_LICENSE("GPL");
 EXPORT_SYMBOL(ccw_device_set_online);
 EXPORT_SYMBOL(ccw_device_set_offline);
@@ -1155,3 +1165,4 @@ EXPORT_SYMBOL(get_ccwdev_by_busid);
 EXPORT_SYMBOL(ccw_bus_type);
 EXPORT_SYMBOL(ccw_device_work);
 EXPORT_SYMBOL(ccw_device_notify_work);
+EXPORT_SYMBOL_GPL(ccw_device_get_subchannel_id);
diff --git a/drivers/s390/cio/device.h b/drivers/s390/cio/device.h
index a3aa056..11587eb 100644
--- a/drivers/s390/cio/device.h
+++ b/drivers/s390/cio/device.h
@@ -110,6 +110,7 @@ int ccw_device_stlck(struct ccw_device *);
 
 /* qdio needs this. */
 void ccw_device_set_timeout(struct ccw_device *, int);
+extern struct subchannel_id ccw_device_get_subchannel_id(struct ccw_device *);
 
 void retry_set_schib(struct ccw_device *cdev);
 #endif
diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c
index c1c89f4..9efeae7 100644
--- a/drivers/s390/cio/device_fsm.c
+++ b/drivers/s390/cio/device_fsm.c
@@ -133,7 +133,7 @@ ccw_device_cancel_halt_clear(struct ccw_device *cdev)
 	int ret;
 
 	sch = to_subchannel(cdev->dev.parent);
-	ret = stsch(sch->irq, &sch->schib);
+	ret = stsch(sch->schid, &sch->schib);
 	if (ret || !sch->schib.pmcw.dnv)
 		return -ENODEV; 
 	if (!sch->schib.pmcw.ena || sch->schib.scsw.actl == 0)
@@ -231,7 +231,7 @@ ccw_device_recog_done(struct ccw_device *cdev, int state)
 	 * through ssch() and the path information is up to date.
 	 */
 	old_lpm = sch->lpm;
-	stsch(sch->irq, &sch->schib);
+	stsch(sch->schid, &sch->schib);
 	sch->lpm = sch->schib.pmcw.pim &
 		sch->schib.pmcw.pam &
 		sch->schib.pmcw.pom &
@@ -258,7 +258,7 @@ ccw_device_recog_done(struct ccw_device *cdev, int state)
 	case DEV_STATE_NOT_OPER:
 		CIO_DEBUG(KERN_WARNING, 2,
 			  "SenseID : unknown device %04x on subchannel %04x\n",
-			  cdev->private->devno, sch->irq);
+			  cdev->private->devno, sch->schid.sch_no);
 		break;
 	case DEV_STATE_OFFLINE:
 		if (cdev->private->state == DEV_STATE_DISCONNECTED_SENSE_ID) {
@@ -291,7 +291,7 @@ ccw_device_recog_done(struct ccw_device *cdev, int state)
 	case DEV_STATE_BOXED:
 		CIO_DEBUG(KERN_WARNING, 2,
 			  "SenseID : boxed device %04x on subchannel %04x\n",
-			  cdev->private->devno, sch->irq);
+			  cdev->private->devno, sch->schid.sch_no);
 		break;
 	}
 	cdev->private->state = state;
@@ -359,7 +359,7 @@ ccw_device_done(struct ccw_device *cdev, int state)
 	if (state == DEV_STATE_BOXED)
 		CIO_DEBUG(KERN_WARNING, 2,
 			  "Boxed device %04x on subchannel %04x\n",
-			  cdev->private->devno, sch->irq);
+			  cdev->private->devno, sch->schid.sch_no);
 
 	if (cdev->private->flags.donotify) {
 		cdev->private->flags.donotify = 0;
@@ -592,7 +592,7 @@ ccw_device_offline(struct ccw_device *cdev)
 	struct subchannel *sch;
 
 	sch = to_subchannel(cdev->dev.parent);
-	if (stsch(sch->irq, &sch->schib) || !sch->schib.pmcw.dnv)
+	if (stsch(sch->schid, &sch->schib) || !sch->schib.pmcw.dnv)
 		return -ENODEV;
 	if (cdev->private->state != DEV_STATE_ONLINE) {
 		if (sch->schib.scsw.actl != 0)
@@ -711,7 +711,7 @@ ccw_device_online_verify(struct ccw_device *cdev, enum dev_event dev_event)
 	 * Since we might not just be coming from an interrupt from the
 	 * subchannel we have to update the schib.
 	 */
-	stsch(sch->irq, &sch->schib);
+	stsch(sch->schid, &sch->schib);
 
 	if (sch->schib.scsw.actl != 0 ||
 	    (cdev->private->irb.scsw.stctl & SCSW_STCTL_STATUS_PEND)) {
@@ -923,7 +923,7 @@ ccw_device_wait4io_irq(struct ccw_device *cdev, enum dev_event dev_event)
 
 	/* Iff device is idle, reset timeout. */
 	sch = to_subchannel(cdev->dev.parent);
-	if (!stsch(sch->irq, &sch->schib))
+	if (!stsch(sch->schid, &sch->schib))
 		if (sch->schib.scsw.actl == 0)
 			ccw_device_set_timeout(cdev, 0);
 	/* Call the handler. */
@@ -1035,7 +1035,7 @@ device_trigger_reprobe(struct subchannel *sch)
 		return;
 
 	/* Update some values. */
-	if (stsch(sch->irq, &sch->schib))
+	if (stsch(sch->schid, &sch->schib))
 		return;
 
 	/*
diff --git a/drivers/s390/cio/device_id.c b/drivers/s390/cio/device_id.c
index 0e68fb5..207881e 100644
--- a/drivers/s390/cio/device_id.c
+++ b/drivers/s390/cio/device_id.c
@@ -258,7 +258,7 @@ ccw_device_check_sense_id(struct ccw_device *cdev)
 		 */
 		CIO_MSG_EVENT(2, "SenseID : device %04x on Subchannel %04x "
 			      "reports cmd reject\n",
-			      cdev->private->devno, sch->irq);
+			      cdev->private->devno, sch->schid.sch_no);
 		return -EOPNOTSUPP;
 	}
 	if (irb->esw.esw0.erw.cons) {
@@ -280,13 +280,13 @@ ccw_device_check_sense_id(struct ccw_device *cdev)
 			CIO_MSG_EVENT(2, "SenseID : path %02X for device %04x on"
 				      " subchannel %04x is 'not operational'\n",
 				      sch->orb.lpm, cdev->private->devno,
-				      sch->irq);
+				      sch->schid.sch_no);
 		return -EACCES;
 	}
 	/* Hmm, whatever happened, try again. */
 	CIO_MSG_EVENT(2, "SenseID : start_IO() for device %04x on "
 		      "subchannel %04x returns status %02X%02X\n",
-		      cdev->private->devno, sch->irq,
+		      cdev->private->devno, sch->schid.sch_no,
 		      irb->scsw.dstat, irb->scsw.cstat);
 	return -EAGAIN;
 }
diff --git a/drivers/s390/cio/device_ops.c b/drivers/s390/cio/device_ops.c
index 85a3026..143b6c2 100644
--- a/drivers/s390/cio/device_ops.c
+++ b/drivers/s390/cio/device_ops.c
@@ -1,7 +1,7 @@
 /*
  *  drivers/s390/cio/device_ops.c
  *
- *   $Revision: 1.57 $
+ *   $Revision: 1.58 $
  *
  *    Copyright (C) 2002 IBM Deutschland Entwicklung GmbH,
  *			 IBM Corporation
@@ -570,7 +570,7 @@ ccw_device_get_chp_desc(struct ccw_device *cdev, int chp_no)
 int
 _ccw_device_get_subchannel_number(struct ccw_device *cdev)
 {
-	return cdev->private->irq;
+	return cdev->private->sch_no;
 }
 
 int
diff --git a/drivers/s390/cio/device_pgid.c b/drivers/s390/cio/device_pgid.c
index 757b270..f08e84c 100644
--- a/drivers/s390/cio/device_pgid.c
+++ b/drivers/s390/cio/device_pgid.c
@@ -59,7 +59,7 @@ __ccw_device_sense_pgid_start(struct ccw_device *cdev)
 			CIO_MSG_EVENT(2, "SNID - Device %04x on Subchannel "
 				      "%04x, lpm %02X, became 'not "
 				      "operational'\n",
-				      cdev->private->devno, sch->irq,
+				      cdev->private->devno, sch->schid.sch_no,
 				      cdev->private->imask);
 
 		}
@@ -121,13 +121,14 @@ __ccw_device_check_sense_pgid(struct ccw_device *cdev)
 	if (irb->scsw.cc == 3) {
 		CIO_MSG_EVENT(2, "SNID - Device %04x on Subchannel "
 			      "%04x, lpm %02X, became 'not operational'\n",
-			      cdev->private->devno, sch->irq, sch->orb.lpm);
+			      cdev->private->devno, sch->schid.sch_no,
+			      sch->orb.lpm);
 		return -EACCES;
 	}
 	if (cdev->private->pgid.inf.ps.state2 == SNID_STATE2_RESVD_ELSE) {
 		CIO_MSG_EVENT(2, "SNID - Device %04x on Subchannel %04x "
 			      "is reserved by someone else\n",
-			      cdev->private->devno, sch->irq);
+			      cdev->private->devno, sch->schid.sch_no);
 		return -EUSERS;
 	}
 	return 0;
@@ -237,7 +238,7 @@ __ccw_device_do_pgid(struct ccw_device *cdev, __u8 func)
 	sch->vpm &= ~cdev->private->imask;
 	CIO_MSG_EVENT(2, "SPID - Device %04x on Subchannel "
 		      "%04x, lpm %02X, became 'not operational'\n",
-		      cdev->private->devno, sch->irq, cdev->private->imask);
+		      cdev->private->devno, sch->schid.sch_no, cdev->private->imask);
 	return ret;
 }
 
@@ -271,7 +272,7 @@ __ccw_device_check_pgid(struct ccw_device *cdev)
 	if (irb->scsw.cc == 3) {
 		CIO_MSG_EVENT(2, "SPID - Device %04x on Subchannel "
 			      "%04x, lpm %02X, became 'not operational'\n",
-			      cdev->private->devno, sch->irq,
+			      cdev->private->devno, sch->schid.sch_no,
 			      cdev->private->imask);
 		return -EACCES;
 	}
@@ -373,7 +374,7 @@ ccw_device_verify_start(struct ccw_device *cdev)
 	 * Update sch->lpm with current values to catch paths becoming
 	 * available again.
 	 */
-	if (stsch(sch->irq, &sch->schib)) {
+	if (stsch(sch->schid, &sch->schib)) {
 		ccw_device_verify_done(cdev, -ENODEV);
 		return;
 	}
diff --git a/drivers/s390/cio/device_status.c b/drivers/s390/cio/device_status.c
index 12a24d4..929f8fb 100644
--- a/drivers/s390/cio/device_status.c
+++ b/drivers/s390/cio/device_status.c
@@ -38,13 +38,13 @@ ccw_device_msg_control_check(struct ccw_device *cdev, struct irb *irb)
 		      "received"
 		      " ... device %04X on subchannel %04X, dev_stat "
 		      ": %02X sch_stat : %02X\n",
-		      cdev->private->devno, cdev->private->irq,
+		      cdev->private->devno, cdev->private->sch_no,
 		      irb->scsw.dstat, irb->scsw.cstat);
 
 	if (irb->scsw.cc != 3) {
 		char dbf_text[15];
 
-		sprintf(dbf_text, "chk%x", cdev->private->irq);
+		sprintf(dbf_text, "chk%x", cdev->private->sch_no);
 		CIO_TRACE_EVENT(0, dbf_text);
 		CIO_HEX_EVENT(0, irb, sizeof (struct irb));
 	}
@@ -59,10 +59,10 @@ ccw_device_path_notoper(struct ccw_device *cdev)
 	struct subchannel *sch;
 
 	sch = to_subchannel(cdev->dev.parent);
-	stsch (sch->irq, &sch->schib);
+	stsch (sch->schid, &sch->schib);
 
 	CIO_MSG_EVENT(0, "%s(%04x) - path(s) %02x are "
-		      "not operational \n", __FUNCTION__, sch->irq,
+		      "not operational \n", __FUNCTION__, sch->schid.sch_no,
 		      sch->schib.pmcw.pnom);
 
 	sch->lpm &= ~sch->schib.pmcw.pnom;
diff --git a/drivers/s390/cio/ioasm.h b/drivers/s390/cio/ioasm.h
index 45480a2..66c882e 100644
--- a/drivers/s390/cio/ioasm.h
+++ b/drivers/s390/cio/ioasm.h
@@ -1,12 +1,13 @@
 #ifndef S390_CIO_IOASM_H
 #define S390_CIO_IOASM_H
 
+#include "schid.h"
+
 /*
  * TPI info structure
  */
 struct tpi_info {
-	__u32 reserved1	 : 16;	 /* reserved 0x00000001 */
-	__u32 irq	 : 16;	 /* aka. subchannel number */
+	struct subchannel_id schid;
 	__u32 intparm;		 /* interruption parameter */
 	__u32 adapter_IO : 1;
 	__u32 reserved2	 : 1;
@@ -21,7 +22,8 @@ struct tpi_info {
  * Some S390 specific IO instructions as inline
  */
 
-static inline int stsch(int irq, volatile struct schib *addr)
+static inline int stsch(struct subchannel_id schid,
+			    volatile struct schib *addr)
 {
 	int ccode;
 
@@ -31,12 +33,13 @@ static inline int stsch(int irq, volatile struct schib *addr)
 		"   ipm	  %0\n"
 		"   srl	  %0,28"
 		: "=d" (ccode)
-		: "d" (irq | 0x10000), "a" (addr)
+		: "d" (schid), "a" (addr), "m" (*addr)
 		: "cc", "1" );
 	return ccode;
 }
 
-static inline int msch(int irq, volatile struct schib *addr)
+static inline int msch(struct subchannel_id schid,
+			   volatile struct schib *addr)
 {
 	int ccode;
 
@@ -46,12 +49,13 @@ static inline int msch(int irq, volatile struct schib *addr)
 		"   ipm	  %0\n"
 		"   srl	  %0,28"
 		: "=d" (ccode)
-		: "d" (irq | 0x10000L), "a" (addr)
+		: "d" (schid), "a" (addr), "m" (*addr)
 		: "cc", "1" );
 	return ccode;
 }
 
-static inline int msch_err(int irq, volatile struct schib *addr)
+static inline int msch_err(struct subchannel_id schid,
+			       volatile struct schib *addr)
 {
 	int ccode;
 
@@ -74,12 +78,13 @@ static inline int msch_err(int irq, volatile struct schib *addr)
 		".previous"
 #endif
 		: "=&d" (ccode)
-		: "d" (irq | 0x10000L), "a" (addr), "K" (-EIO)
+		: "d" (schid), "a" (addr), "K" (-EIO), "m" (*addr)
 		: "cc", "1" );
 	return ccode;
 }
 
-static inline int tsch(int irq, volatile struct irb *addr)
+static inline int tsch(struct subchannel_id schid,
+			   volatile struct irb *addr)
 {
 	int ccode;
 
@@ -89,7 +94,7 @@ static inline int tsch(int irq, volatile struct irb *addr)
 		"   ipm	  %0\n"
 		"   srl	  %0,28"
 		: "=d" (ccode)
-		: "d" (irq | 0x10000L), "a" (addr)
+		: "d" (schid), "a" (addr), "m" (*addr)
 		: "cc", "1" );
 	return ccode;
 }
@@ -103,12 +108,13 @@ static inline int tpi( volatile struct tpi_info *addr)
 		"   ipm	  %0\n"
 		"   srl	  %0,28"
 		: "=d" (ccode)
-		: "a" (addr)
+		: "a" (addr), "m" (*addr)
 		: "cc", "1" );
 	return ccode;
 }
 
-static inline int ssch(int irq, volatile struct orb *addr)
+static inline int ssch(struct subchannel_id schid,
+			   volatile struct orb *addr)
 {
 	int ccode;
 
@@ -118,12 +124,12 @@ static inline int ssch(int irq, volatile struct orb *addr)
 		"   ipm	  %0\n"
 		"   srl	  %0,28"
 		: "=d" (ccode)
-		: "d" (irq | 0x10000L), "a" (addr)
+		: "d" (schid), "a" (addr), "m" (*addr)
 		: "cc", "1" );
 	return ccode;
 }
 
-static inline int rsch(int irq)
+static inline int rsch(struct subchannel_id schid)
 {
 	int ccode;
 
@@ -133,12 +139,12 @@ static inline int rsch(int irq)
 		"   ipm	  %0\n"
 		"   srl	  %0,28"
 		: "=d" (ccode)
-		: "d" (irq | 0x10000L)
+		: "d" (schid)
 		: "cc", "1" );
 	return ccode;
 }
 
-static inline int csch(int irq)
+static inline int csch(struct subchannel_id schid)
 {
 	int ccode;
 
@@ -148,12 +154,12 @@ static inline int csch(int irq)
 		"   ipm	  %0\n"
 		"   srl	  %0,28"
 		: "=d" (ccode)
-		: "d" (irq | 0x10000L)
+		: "d" (schid)
 		: "cc", "1" );
 	return ccode;
 }
 
-static inline int hsch(int irq)
+static inline int hsch(struct subchannel_id schid)
 {
 	int ccode;
 
@@ -163,12 +169,12 @@ static inline int hsch(int irq)
 		"   ipm	  %0\n"
 		"   srl	  %0,28"
 		: "=d" (ccode)
-		: "d" (irq | 0x10000L)
+		: "d" (schid)
 		: "cc", "1" );
 	return ccode;
 }
 
-static inline int xsch(int irq)
+static inline int xsch(struct subchannel_id schid)
 {
 	int ccode;
 
@@ -178,21 +184,22 @@ static inline int xsch(int irq)
 		"   ipm	  %0\n"
 		"   srl	  %0,28"
 		: "=d" (ccode)
-		: "d" (irq | 0x10000L)
+		: "d" (schid)
 		: "cc", "1" );
 	return ccode;
 }
 
 static inline int chsc(void *chsc_area)
 {
+	typedef struct { char _[4096]; } addr_type;
 	int cc;
 
 	__asm__ __volatile__ (
-		".insn	rre,0xb25f0000,%1,0	\n\t"
+		".insn	rre,0xb25f0000,%2,0	\n\t"
 		"ipm	%0	\n\t"
 		"srl	%0,28	\n\t"
-		: "=d" (cc)
-		: "d" (chsc_area)
+		: "=d" (cc), "=m" (*(addr_type *) chsc_area)
+		: "d" (chsc_area), "m" (*(addr_type *) chsc_area)
 		: "cc" );
 
 	return cc;
diff --git a/drivers/s390/cio/qdio.c b/drivers/s390/cio/qdio.c
index e8bdfcd..5c7001b 100644
--- a/drivers/s390/cio/qdio.c
+++ b/drivers/s390/cio/qdio.c
@@ -270,7 +270,7 @@ qdio_siga_sync(struct qdio_q *q, unsigned int gpr2,
 	perf_stats.siga_syncs++;
 #endif /* QDIO_PERFORMANCE_STATS */
 
-	cc = do_siga_sync(0x10000|q->irq, gpr2, gpr3);
+	cc = do_siga_sync(q->schid, gpr2, gpr3);
 	if (cc)
 		QDIO_DBF_HEX3(0,trace,&cc,sizeof(int*));
 
@@ -290,12 +290,16 @@ __do_siga_output(struct qdio_q *q, unsigned int *busy_bit)
 {
        struct qdio_irq *irq;
        unsigned int fc = 0;
+       unsigned long schid;
 
        irq = (struct qdio_irq *) q->irq_ptr;
        if (!irq->is_qebsm)
-               return do_siga_output(0x10000|q->irq, q->mask, busy_bit, fc);
-       fc |= 0x80;
-       return do_siga_output(irq->sch_token, q->mask, busy_bit, fc);
+	       schid = *((u32 *)&q->schid);
+       else {
+	       schid = irq->sch_token;
+	       fc |= 0x80;
+       }
+       return do_siga_output(schid, q->mask, busy_bit, fc);
 }
 
 /* 
@@ -349,7 +353,7 @@ qdio_siga_input(struct qdio_q *q)
 	perf_stats.siga_ins++;
 #endif /* QDIO_PERFORMANCE_STATS */
 
-	cc = do_siga_input(0x10000|q->irq, q->mask);
+	cc = do_siga_input(q->schid, q->mask);
 	
 	if (cc)
 		QDIO_DBF_HEX3(0,trace,&cc,sizeof(int*));
@@ -855,7 +859,7 @@ qdio_kick_outbound_q(struct qdio_q *q)
 		/* went smooth this time, reset timestamp */
 #ifdef CONFIG_QDIO_DEBUG
 		QDIO_DBF_TEXT3(0,trace,"cc2reslv");
-		sprintf(dbf_text,"%4x%2x%2x",q->irq,q->q_no,
+		sprintf(dbf_text,"%4x%2x%2x",q->schid.sch_no,q->q_no,
 			atomic_read(&q->busy_siga_counter));
 		QDIO_DBF_TEXT3(0,trace,dbf_text);
 #endif /* CONFIG_QDIO_DEBUG */
@@ -878,7 +882,7 @@ qdio_kick_outbound_q(struct qdio_q *q)
 		}
 		QDIO_DBF_TEXT2(0,trace,"cc2REPRT");
 #ifdef CONFIG_QDIO_DEBUG
-		sprintf(dbf_text,"%4x%2x%2x",q->irq,q->q_no,
+		sprintf(dbf_text,"%4x%2x%2x",q->schid.sch_no,q->q_no,
 			atomic_read(&q->busy_siga_counter));
 		QDIO_DBF_TEXT3(0,trace,dbf_text);
 #endif /* CONFIG_QDIO_DEBUG */
@@ -1733,7 +1737,7 @@ qdio_fill_qs(struct qdio_irq *irq_ptr, struct ccw_device *cdev,
 	void *ptr;
 	int available;
 
-	sprintf(dbf_text,"qfqs%4x",cdev->private->irq);
+	sprintf(dbf_text,"qfqs%4x",cdev->private->sch_no);
 	QDIO_DBF_TEXT0(0,setup,dbf_text);
 	for (i=0;i<no_input_qs;i++) {
 		q=irq_ptr->input_qs[i];
@@ -1753,7 +1757,7 @@ qdio_fill_qs(struct qdio_irq *irq_ptr, struct ccw_device *cdev,
 
                 q->queue_type=q_format;
 		q->int_parm=int_parm;
-		q->irq=irq_ptr->irq;
+		q->schid = irq_ptr->schid;
 		q->irq_ptr = irq_ptr;
 		q->cdev = cdev;
 		q->mask=1<<(31-i);
@@ -1826,7 +1830,7 @@ qdio_fill_qs(struct qdio_irq *irq_ptr, struct ccw_device *cdev,
                 q->queue_type=q_format;
 		q->int_parm=int_parm;
 		q->is_input_q=0;
-		q->irq=irq_ptr->irq;
+		q->schid = irq_ptr->schid;
 		q->cdev = cdev;
 		q->irq_ptr = irq_ptr;
 		q->mask=1<<(31-i);
@@ -1933,7 +1937,7 @@ qdio_set_state(struct qdio_irq *irq_ptr, enum qdio_irq_states state)
 	char dbf_text[15];
 
 	QDIO_DBF_TEXT5(0,trace,"newstate");
-	sprintf(dbf_text,"%4x%4x",irq_ptr->irq,state);
+	sprintf(dbf_text,"%4x%4x",irq_ptr->schid.sch_no,state);
 	QDIO_DBF_TEXT5(0,trace,dbf_text);
 #endif /* CONFIG_QDIO_DEBUG */
 
@@ -1946,12 +1950,12 @@ qdio_set_state(struct qdio_irq *irq_ptr, enum qdio_irq_states state)
 }
 
 static inline void
-qdio_irq_check_sense(int irq, struct irb *irb)
+qdio_irq_check_sense(struct subchannel_id schid, struct irb *irb)
 {
 	char dbf_text[15];
 
 	if (irb->esw.esw0.erw.cons) {
-		sprintf(dbf_text,"sens%4x",irq);
+		sprintf(dbf_text,"sens%4x",schid.sch_no);
 		QDIO_DBF_TEXT2(1,trace,dbf_text);
 		QDIO_DBF_HEX0(0,sense,irb,QDIO_DBF_SENSE_LEN);
 
@@ -2063,20 +2067,20 @@ qdio_timeout_handler(struct ccw_device *cdev)
 	switch (irq_ptr->state) {
 	case QDIO_IRQ_STATE_INACTIVE:
 		QDIO_PRINT_ERR("establish queues on irq %04x: timed out\n",
-			       irq_ptr->irq);
+			       irq_ptr->schid.sch_no);
 		QDIO_DBF_TEXT2(1,setup,"eq:timeo");
 		qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ERR);
 		break;
 	case QDIO_IRQ_STATE_CLEANUP:
 		QDIO_PRINT_INFO("Did not get interrupt on cleanup, irq=0x%x.\n",
-				irq_ptr->irq);
+				irq_ptr->schid.sch_no);
 		qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ERR);
 		break;
 	case QDIO_IRQ_STATE_ESTABLISHED:
 	case QDIO_IRQ_STATE_ACTIVE:
 		/* I/O has been terminated by common I/O layer. */
 		QDIO_PRINT_INFO("Queues on irq %04x killed by cio.\n",
-				irq_ptr->irq);
+				irq_ptr->schid.sch_no);
 		QDIO_DBF_TEXT2(1, trace, "cio:term");
 		qdio_set_state(irq_ptr, QDIO_IRQ_STATE_STOPPED);
 		if (get_device(&cdev->dev)) {
@@ -2139,7 +2143,7 @@ qdio_handler(struct ccw_device *cdev, unsigned long intparm, struct irb *irb)
 		}
 	}
 
-	qdio_irq_check_sense(irq_ptr->irq, irb);
+	qdio_irq_check_sense(irq_ptr->schid, irb);
 
 #ifdef CONFIG_QDIO_DEBUG
 	sprintf(dbf_text, "state:%d", irq_ptr->state);
@@ -2195,7 +2199,7 @@ qdio_synchronize(struct ccw_device *cdev, unsigned int flags,
 		return -ENODEV;
 
 #ifdef CONFIG_QDIO_DEBUG
-	*((int*)(&dbf_text[4])) = irq_ptr->irq;
+	*((int*)(&dbf_text[4])) = irq_ptr->schid.sch_no;
 	QDIO_DBF_HEX4(0,trace,dbf_text,QDIO_DBF_TRACE_LEN);
 	*((int*)(&dbf_text[0]))=flags;
 	*((int*)(&dbf_text[4]))=queue_number;
@@ -2207,13 +2211,13 @@ qdio_synchronize(struct ccw_device *cdev, unsigned int flags,
 		if (!q)
 			return -EINVAL;
 		if (!(irq_ptr->is_qebsm))
-			cc = do_siga_sync(0x10000|q->irq, 0, q->mask);
+			cc = do_siga_sync(q->schid, 0, q->mask);
 	} else if (flags&QDIO_FLAG_SYNC_OUTPUT) {
 		q=irq_ptr->output_qs[queue_number];
 		if (!q)
 			return -EINVAL;
 		if (!(irq_ptr->is_qebsm))
-			cc = do_siga_sync(0x10000|q->irq, q->mask, 0);
+			cc = do_siga_sync(q->schid, q->mask, 0);
 	} else 
 		return -EINVAL;
 
@@ -2298,7 +2302,7 @@ qdio_get_ssqd_information(struct qdio_irq *irq_ptr)
 	ssqd_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
 	if (!ssqd_area) {
 	        QDIO_PRINT_WARN("Could not get memory for chsc. Using all " \
-				"SIGAs for sch x%x.\n", irq_ptr->irq);
+				"SIGAs for sch x%x.\n", irq_ptr->schid.sch_no);
 		irq_ptr->qdioac = CHSC_FLAG_SIGA_INPUT_NECESSARY ||
 				  CHSC_FLAG_SIGA_OUTPUT_NECESSARY ||
 				  CHSC_FLAG_SIGA_SYNC_NECESSARY; /* all flags set */
@@ -2312,14 +2316,14 @@ qdio_get_ssqd_information(struct qdio_irq *irq_ptr)
 		.length = 0x0010,
 		.code   = 0x0024,
 	};
-	ssqd_area->first_sch = irq_ptr->irq;
-	ssqd_area->last_sch = irq_ptr->irq;
+	ssqd_area->first_sch = irq_ptr->schid.sch_no;
+	ssqd_area->last_sch = irq_ptr->schid.sch_no;
 	result = chsc(ssqd_area);
 
 	if (result) {
 		QDIO_PRINT_WARN("CHSC returned cc %i. Using all " \
 				"SIGAs for sch x%x.\n",
-				result, irq_ptr->irq);
+				result, irq_ptr->schid.sch_no);
 		qdioac = CHSC_FLAG_SIGA_INPUT_NECESSARY ||
 			CHSC_FLAG_SIGA_OUTPUT_NECESSARY ||
 			CHSC_FLAG_SIGA_SYNC_NECESSARY; /* all flags set */
@@ -2330,7 +2334,7 @@ qdio_get_ssqd_information(struct qdio_irq *irq_ptr)
 	if (ssqd_area->response.code != QDIO_CHSC_RESPONSE_CODE_OK) {
 		QDIO_PRINT_WARN("response upon checking SIGA needs " \
 				"is 0x%x. Using all SIGAs for sch x%x.\n",
-				ssqd_area->response.code, irq_ptr->irq);
+				ssqd_area->response.code, irq_ptr->schid.sch_no);
 		qdioac = CHSC_FLAG_SIGA_INPUT_NECESSARY ||
 			CHSC_FLAG_SIGA_OUTPUT_NECESSARY ||
 			CHSC_FLAG_SIGA_SYNC_NECESSARY; /* all flags set */
@@ -2339,9 +2343,9 @@ qdio_get_ssqd_information(struct qdio_irq *irq_ptr)
 	}
 	if (!(ssqd_area->flags & CHSC_FLAG_QDIO_CAPABILITY) ||
 	    !(ssqd_area->flags & CHSC_FLAG_VALIDITY) ||
-	    (ssqd_area->sch != irq_ptr->irq)) {
+	    (ssqd_area->sch != irq_ptr->schid.sch_no)) {
 		QDIO_PRINT_WARN("huh? problems checking out sch x%x... " \
-				"using all SIGAs.\n",irq_ptr->irq);
+				"using all SIGAs.\n",irq_ptr->schid.sch_no);
 		qdioac = CHSC_FLAG_SIGA_INPUT_NECESSARY |
 			CHSC_FLAG_SIGA_OUTPUT_NECESSARY |
 			CHSC_FLAG_SIGA_SYNC_NECESSARY; /* worst case */
@@ -2427,7 +2431,7 @@ tiqdio_set_subchannel_ind(struct qdio_irq *irq_ptr, int reset_to_zero)
 		/* set to 0x10000000 to enable
 		 * time delay disablement facility */
 		u32 reserved5;
-		u32 subsystem_id;
+		struct subchannel_id schid;
 		u32 reserved6[1004];
 		struct chsc_header response;
 		u32 reserved7;
@@ -2449,7 +2453,7 @@ tiqdio_set_subchannel_ind(struct qdio_irq *irq_ptr, int reset_to_zero)
 	scssc_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
 	if (!scssc_area) {
 		QDIO_PRINT_WARN("No memory for setting indicators on " \
-				"subchannel x%x.\n", irq_ptr->irq);
+				"subchannel x%x.\n", irq_ptr->schid.sch_no);
 		return -ENOMEM;
 	}
 	scssc_area->request = (struct chsc_header) {
@@ -2463,7 +2467,7 @@ tiqdio_set_subchannel_ind(struct qdio_irq *irq_ptr, int reset_to_zero)
 	scssc_area->ks = QDIO_STORAGE_KEY;
 	scssc_area->kc = QDIO_STORAGE_KEY;
 	scssc_area->isc = TIQDIO_THININT_ISC;
-	scssc_area->subsystem_id = (1<<16) + irq_ptr->irq;
+	scssc_area->schid = irq_ptr->schid;
 	/* enables the time delay disablement facility. Don't care
 	 * whether it is really there (i.e. we haven't checked for
 	 * it) */
@@ -2473,12 +2477,10 @@ tiqdio_set_subchannel_ind(struct qdio_irq *irq_ptr, int reset_to_zero)
 		QDIO_PRINT_WARN("Time delay disablement facility " \
 				"not available\n");
 
-
-
 	result = chsc(scssc_area);
 	if (result) {
 		QDIO_PRINT_WARN("could not set indicators on irq x%x, " \
-				"cc=%i.\n",irq_ptr->irq,result);
+				"cc=%i.\n",irq_ptr->schid.sch_no,result);
 		result = -EIO;
 		goto out;
 	}
@@ -2534,7 +2536,7 @@ tiqdio_set_delay_target(struct qdio_irq *irq_ptr, unsigned long delay_target)
 	scsscf_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
 	if (!scsscf_area) {
 		QDIO_PRINT_WARN("No memory for setting delay target on " \
-				"subchannel x%x.\n", irq_ptr->irq);
+				"subchannel x%x.\n", irq_ptr->schid.sch_no);
 		return -ENOMEM;
 	}
 	scsscf_area->request = (struct chsc_header) {
@@ -2547,7 +2549,8 @@ tiqdio_set_delay_target(struct qdio_irq *irq_ptr, unsigned long delay_target)
 	result=chsc(scsscf_area);
 	if (result) {
 		QDIO_PRINT_WARN("could not set delay target on irq x%x, " \
-				"cc=%i. Continuing.\n",irq_ptr->irq,result);
+				"cc=%i. Continuing.\n",irq_ptr->schid.sch_no,
+				result);
 		result = -EIO;
 		goto out;
 	}
@@ -2581,7 +2584,7 @@ qdio_cleanup(struct ccw_device *cdev, int how)
 	if (!irq_ptr)
 		return -ENODEV;
 
-	sprintf(dbf_text,"qcln%4x",irq_ptr->irq);
+	sprintf(dbf_text,"qcln%4x",irq_ptr->schid.sch_no);
 	QDIO_DBF_TEXT1(0,trace,dbf_text);
 	QDIO_DBF_TEXT0(0,setup,dbf_text);
 
@@ -2608,7 +2611,7 @@ qdio_shutdown(struct ccw_device *cdev, int how)
 
 	down(&irq_ptr->setting_up_sema);
 
-	sprintf(dbf_text,"qsqs%4x",irq_ptr->irq);
+	sprintf(dbf_text,"qsqs%4x",irq_ptr->schid.sch_no);
 	QDIO_DBF_TEXT1(0,trace,dbf_text);
 	QDIO_DBF_TEXT0(0,setup,dbf_text);
 
@@ -2714,7 +2717,7 @@ qdio_free(struct ccw_device *cdev)
 
 	down(&irq_ptr->setting_up_sema);
 
-	sprintf(dbf_text,"qfqs%4x",irq_ptr->irq);
+	sprintf(dbf_text,"qfqs%4x",irq_ptr->schid.sch_no);
 	QDIO_DBF_TEXT1(0,trace,dbf_text);
 	QDIO_DBF_TEXT0(0,setup,dbf_text);
 
@@ -2862,13 +2865,13 @@ qdio_establish_irq_check_for_errors(struct ccw_device *cdev, int cstat,
 	irq_ptr = cdev->private->qdio_data;
 
 	if (cstat || (dstat & ~(DEV_STAT_CHN_END|DEV_STAT_DEV_END))) {
-		sprintf(dbf_text,"ick1%4x",irq_ptr->irq);
+		sprintf(dbf_text,"ick1%4x",irq_ptr->schid.sch_no);
 		QDIO_DBF_TEXT2(1,trace,dbf_text);
 		QDIO_DBF_HEX2(0,trace,&dstat,sizeof(int));
 		QDIO_DBF_HEX2(0,trace,&cstat,sizeof(int));
 		QDIO_PRINT_ERR("received check condition on establish " \
 			       "queues on irq 0x%x (cs=x%x, ds=x%x).\n",
-			       irq_ptr->irq,cstat,dstat);
+			       irq_ptr->schid.sch_no,cstat,dstat);
 		qdio_set_state(irq_ptr,QDIO_IRQ_STATE_ERR);
 	}
 	
@@ -2878,7 +2881,7 @@ qdio_establish_irq_check_for_errors(struct ccw_device *cdev, int cstat,
 		QDIO_DBF_HEX2(0,setup,&cstat, sizeof(cstat));
 		QDIO_PRINT_ERR("establish queues on irq %04x: didn't get "
 			       "device end: dstat=%02x, cstat=%02x\n",
-			       irq_ptr->irq, dstat, cstat);
+			       irq_ptr->schid.sch_no, dstat, cstat);
 		qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ERR);
 		return 1;
 	}
@@ -2890,7 +2893,7 @@ qdio_establish_irq_check_for_errors(struct ccw_device *cdev, int cstat,
 		QDIO_PRINT_ERR("establish queues on irq %04x: got "
 			       "the following devstat: dstat=%02x, "
 			       "cstat=%02x\n",
-			       irq_ptr->irq, dstat, cstat);
+			       irq_ptr->schid.sch_no, dstat, cstat);
 		qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ERR);
 		return 1;
 	}
@@ -2905,7 +2908,7 @@ qdio_establish_handle_irq(struct ccw_device *cdev, int cstat, int dstat)
 
 	irq_ptr = cdev->private->qdio_data;
 
-	sprintf(dbf_text,"qehi%4x",cdev->private->irq);
+	sprintf(dbf_text,"qehi%4x",cdev->private->sch_no);
 	QDIO_DBF_TEXT0(0,setup,dbf_text);
 	QDIO_DBF_TEXT0(0,trace,dbf_text);
 
@@ -2924,7 +2927,7 @@ qdio_initialize(struct qdio_initialize *init_data)
 	int rc;
 	char dbf_text[15];
 
-	sprintf(dbf_text,"qini%4x",init_data->cdev->private->irq);
+	sprintf(dbf_text,"qini%4x",init_data->cdev->private->sch_no);
 	QDIO_DBF_TEXT0(0,setup,dbf_text);
 	QDIO_DBF_TEXT0(0,trace,dbf_text);
 
@@ -2945,7 +2948,7 @@ qdio_allocate(struct qdio_initialize *init_data)
 	struct qdio_irq *irq_ptr;
 	char dbf_text[15];
 
-	sprintf(dbf_text,"qalc%4x",init_data->cdev->private->irq);
+	sprintf(dbf_text,"qalc%4x",init_data->cdev->private->sch_no);
 	QDIO_DBF_TEXT0(0,setup,dbf_text);
 	QDIO_DBF_TEXT0(0,trace,dbf_text);
 	if ( (init_data->no_input_qs>QDIO_MAX_QUEUES_PER_IRQ) ||
@@ -3018,7 +3021,7 @@ int qdio_fill_irq(struct qdio_initialize *init_data)
 
 	irq_ptr->int_parm=init_data->int_parm;
 
-	irq_ptr->irq = init_data->cdev->private->irq;
+	irq_ptr->schid = ccw_device_get_subchannel_id(init_data->cdev);
 	irq_ptr->no_input_qs=init_data->no_input_qs;
 	irq_ptr->no_output_qs=init_data->no_output_qs;
 
@@ -3038,7 +3041,7 @@ int qdio_fill_irq(struct qdio_initialize *init_data)
 		QDIO_DBF_HEX1(0,setup,&irq_ptr->dev_st_chg_ind,sizeof(void*));
 		if (!irq_ptr->dev_st_chg_ind) {
 			QDIO_PRINT_WARN("no indicator location available " \
-					"for irq 0x%x\n",irq_ptr->irq);
+					"for irq 0x%x\n",irq_ptr->schid.sch_no);
 			qdio_release_irq_memory(irq_ptr);
 			return -ENOBUFS;
 		}
@@ -3169,7 +3172,7 @@ qdio_establish(struct qdio_initialize *init_data)
 		tiqdio_set_delay_target(irq_ptr,TIQDIO_DELAY_TARGET);
 	}
 
-	sprintf(dbf_text,"qest%4x",cdev->private->irq);
+	sprintf(dbf_text,"qest%4x",cdev->private->sch_no);
 	QDIO_DBF_TEXT0(0,setup,dbf_text);
 	QDIO_DBF_TEXT0(0,trace,dbf_text);
 
@@ -3197,7 +3200,7 @@ qdio_establish(struct qdio_initialize *init_data)
 		}
 		QDIO_PRINT_WARN("establish queues on irq %04x: do_IO " \
                            "returned %i, next try returned %i\n",
-                           irq_ptr->irq,result,result2);
+                           irq_ptr->schid.sch_no,result,result2);
 		result=result2;
 		if (result)
 			ccw_device_set_timeout(cdev, 0);
@@ -3270,7 +3273,7 @@ qdio_activate(struct ccw_device *cdev, int flags)
 		goto out;
 	}
 
-	sprintf(dbf_text,"qact%4x", irq_ptr->irq);
+	sprintf(dbf_text,"qact%4x", irq_ptr->schid.sch_no);
 	QDIO_DBF_TEXT2(0,setup,dbf_text);
 	QDIO_DBF_TEXT2(0,trace,dbf_text);
 
@@ -3297,7 +3300,7 @@ qdio_activate(struct ccw_device *cdev, int flags)
 		}
 		QDIO_PRINT_WARN("activate queues on irq %04x: do_IO " \
                            "returned %i, next try returned %i\n",
-                           irq_ptr->irq,result,result2);
+                           irq_ptr->schid.sch_no,result,result2);
 		result=result2;
 	}
 
@@ -3509,7 +3512,7 @@ do_QDIO(struct ccw_device *cdev,unsigned int callflags,
 #ifdef CONFIG_QDIO_DEBUG
 	char dbf_text[20];
 
-	sprintf(dbf_text,"doQD%04x",cdev->private->irq);
+	sprintf(dbf_text,"doQD%04x",cdev->private->sch_no);
  	QDIO_DBF_TEXT3(0,trace,dbf_text);
 #endif /* CONFIG_QDIO_DEBUG */
 
diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h
index b5d303e..43b840a 100644
--- a/drivers/s390/cio/qdio.h
+++ b/drivers/s390/cio/qdio.h
@@ -3,7 +3,9 @@
 
 #include <asm/page.h>
 
-#define VERSION_CIO_QDIO_H "$Revision: 1.37 $"
+#include "schid.h"
+
+#define VERSION_CIO_QDIO_H "$Revision: 1.40 $"
 
 #ifdef CONFIG_QDIO_DEBUG
 #define QDIO_VERBOSE_LEVEL 9
@@ -317,7 +319,7 @@ do_eqbs(unsigned long sch, unsigned char *state, int queue,
 
 
 static inline int
-do_siga_sync(unsigned int irq, unsigned int mask1, unsigned int mask2)
+do_siga_sync(struct subchannel_id schid, unsigned int mask1, unsigned int mask2)
 {
 	int cc;
 
@@ -331,7 +333,7 @@ do_siga_sync(unsigned int irq, unsigned int mask1, unsigned int mask2)
 		"ipm	%0	\n\t"
 		"srl	%0,28	\n\t"
 		: "=d" (cc)
-		: "d" (irq), "d" (mask1), "d" (mask2)
+		: "d" (schid), "d" (mask1), "d" (mask2)
 		: "cc", "0", "1", "2", "3"
 		);
 #else /* CONFIG_ARCH_S390X */
@@ -344,7 +346,7 @@ do_siga_sync(unsigned int irq, unsigned int mask1, unsigned int mask2)
 		"ipm	%0	\n\t"
 		"srl	%0,28	\n\t"
 		: "=d" (cc)
-		: "d" (irq), "d" (mask1), "d" (mask2)
+		: "d" (schid), "d" (mask1), "d" (mask2)
 		: "cc", "0", "1", "2", "3"
 		);
 #endif /* CONFIG_ARCH_S390X */
@@ -352,7 +354,7 @@ do_siga_sync(unsigned int irq, unsigned int mask1, unsigned int mask2)
 }
 
 static inline int
-do_siga_input(unsigned int irq, unsigned int mask)
+do_siga_input(struct subchannel_id schid, unsigned int mask)
 {
 	int cc;
 
@@ -365,7 +367,7 @@ do_siga_input(unsigned int irq, unsigned int mask)
 		"ipm	%0	\n\t"
 		"srl	%0,28	\n\t"
 		: "=d" (cc)
-		: "d" (irq), "d" (mask)
+		: "d" (schid), "d" (mask)
 		: "cc", "0", "1", "2", "memory"
 		);
 #else /* CONFIG_ARCH_S390X */
@@ -377,7 +379,7 @@ do_siga_input(unsigned int irq, unsigned int mask)
 		"ipm	%0	\n\t"
 		"srl	%0,28	\n\t"
 		: "=d" (cc)
-		: "d" (irq), "d" (mask)
+		: "d" (schid), "d" (mask)
 		: "cc", "0", "1", "2", "memory"
 		);
 #endif /* CONFIG_ARCH_S390X */
@@ -386,7 +388,7 @@ do_siga_input(unsigned int irq, unsigned int mask)
 }
 
 static inline int
-do_siga_output(unsigned long irq, unsigned long mask, __u32 *bb,
+do_siga_output(unsigned long schid, unsigned long mask, __u32 *bb,
 	       unsigned int fc)
 {
 	int cc;
@@ -418,7 +420,7 @@ do_siga_output(unsigned long irq, unsigned long mask, __u32 *bb,
 		".long	0b,2b	\n\t"
 		".previous	\n\t"
 		: "=d" (cc), "=d" (busy_bit)
-		: "d" (irq), "d" (mask),
+		: "d" (schid), "d" (mask),
 		"i" (QDIO_SIGA_ERROR_ACCESS_EXCEPTION)
 		: "cc", "0", "1", "2", "memory"
 		);
@@ -443,7 +445,7 @@ do_siga_output(unsigned long irq, unsigned long mask, __u32 *bb,
 		".quad	0b,1b	\n\t"
 		".previous	\n\t"
 		: "=d" (cc), "=d" (busy_bit)
-		: "d" (irq), "d" (mask),
+		: "d" (schid), "d" (mask),
 		"i" (QDIO_SIGA_ERROR_ACCESS_EXCEPTION), "d" (fc)
 		: "cc", "0", "1", "2", "memory"
 		);
@@ -554,7 +556,7 @@ struct qdio_q {
 	__u32 * dev_st_chg_ind;
 
 	int is_input_q;
-	int irq;
+	struct subchannel_id schid;
 	struct ccw_device *cdev;
 
 	unsigned int is_iqdio_q;
@@ -649,7 +651,7 @@ struct qdio_irq {
 	__u32 * volatile dev_st_chg_ind;
 
 	unsigned long int_parm;
-	int irq;
+	struct subchannel_id schid;
 
 	unsigned int is_iqdio_irq;
 	unsigned int is_thinint_irq;
diff --git a/drivers/s390/cio/schid.h b/drivers/s390/cio/schid.h
new file mode 100644
index 0000000..220d978
--- /dev/null
+++ b/drivers/s390/cio/schid.h
@@ -0,0 +1,25 @@
+#ifndef S390_SCHID_H
+#define S390_SCHID_H
+
+struct subchannel_id {
+	__u32 reserved:15;
+	__u32 one:1;
+	__u32 sch_no:16;
+} __attribute__ ((packed,aligned(4)));
+
+
+/* Helper function for sane state of pre-allocated subchannel_id. */
+static inline void
+init_subchannel_id(struct subchannel_id *schid)
+{
+	memset(schid, 0, sizeof(struct subchannel_id));
+	schid->one = 1;
+}
+
+static inline int
+schid_equal(struct subchannel_id *schid1, struct subchannel_id *schid2)
+{
+	return !memcmp(schid1, schid2, sizeof(struct subchannel_id));
+}
+
+#endif /* S390_SCHID_H */
-- 
cgit v1.1


From f97a56fb768e5fe9cd07c56ca47870136bb5530c Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cohuck@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:22 -0800
Subject: [PATCH] s390: introduce for_each_subchannel

for_each_subchannel() is an iterator calling a function for every possible
subchannel id until non-zero is returned.  Convert the current iterating
functions to it.

Signed-off-by: Cornelia Huck <cohuck@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/s390/cio/blacklist.c |  46 +++---
 drivers/s390/cio/chsc.c      | 372 ++++++++++++++++++++++---------------------
 drivers/s390/cio/cio.c       |  79 ++++-----
 drivers/s390/cio/css.c       | 110 +++++++------
 drivers/s390/cio/css.h       |   1 +
 5 files changed, 318 insertions(+), 290 deletions(-)

diff --git a/drivers/s390/cio/blacklist.c b/drivers/s390/cio/blacklist.c
index a4b0303..25e9848 100644
--- a/drivers/s390/cio/blacklist.c
+++ b/drivers/s390/cio/blacklist.c
@@ -219,6 +219,27 @@ is_blacklisted (int devno)
 }
 
 #ifdef CONFIG_PROC_FS
+static int
+__s390_redo_validation(struct subchannel_id schid, void *data)
+{
+	int ret;
+	struct subchannel *sch;
+
+	sch = get_subchannel_by_schid(schid);
+	if (sch) {
+		/* Already known. */
+		put_device(&sch->dev);
+		return 0;
+	}
+	ret = css_probe_device(schid);
+	if (ret == -ENXIO)
+		return ret; /* We're through. */
+	if (ret == -ENOMEM)
+		/* Stop validation for now. Bad, but no need for a panic. */
+		return ret;
+	return 0;
+}
+
 /*
  * Function: s390_redo_validation
  * Look for no longer blacklisted devices
@@ -226,30 +247,9 @@ is_blacklisted (int devno)
 static inline void
 s390_redo_validation (void)
 {
-	struct subchannel_id schid;
-
 	CIO_TRACE_EVENT (0, "redoval");
-	init_subchannel_id(&schid);
-	do {
-		int ret;
-		struct subchannel *sch;
-
-		sch = get_subchannel_by_schid(schid);
-		if (sch) {
-			/* Already known. */
-			put_device(&sch->dev);
-			continue;
-		}
-		ret = css_probe_device(schid);
-		if (ret == -ENXIO)
-			break; /* We're through. */
-		if (ret == -ENOMEM)
-			/*
-			 * Stop validation for now. Bad, but no need for a
-			 * panic.
-			 */
-			break;
-	} while (schid.sch_no++ < __MAX_SUBCHANNEL);
+
+	for_each_subchannel(__s390_redo_validation, NULL);
 }
 
 /*
diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c
index aff5d14..78e0823 100644
--- a/drivers/s390/cio/chsc.c
+++ b/drivers/s390/cio/chsc.c
@@ -310,9 +310,14 @@ s390_set_chpid_offline( __u8 chpid)
 		queue_work(slow_path_wq, &slow_path_work);
 }
 
+struct res_acc_data {
+	struct channel_path *chp;
+	u32 fla_mask;
+	u16 fla;
+};
+
 static int
-s390_process_res_acc_sch(u8 chpid, __u16 fla, u32 fla_mask,
-			 struct subchannel *sch)
+s390_process_res_acc_sch(struct res_acc_data *res_data, struct subchannel *sch)
 {
 	int found;
 	int chp;
@@ -324,8 +329,9 @@ s390_process_res_acc_sch(u8 chpid, __u16 fla, u32 fla_mask,
 		 * check if chpid is in information updated by ssd
 		 */
 		if (sch->ssd_info.valid &&
-		    sch->ssd_info.chpid[chp] == chpid &&
-		    (sch->ssd_info.fla[chp] & fla_mask) == fla) {
+		    sch->ssd_info.chpid[chp] == res_data->chp->id &&
+		    (sch->ssd_info.fla[chp] & res_data->fla_mask)
+		    == res_data->fla) {
 			found = 1;
 			break;
 		}
@@ -345,18 +351,80 @@ s390_process_res_acc_sch(u8 chpid, __u16 fla, u32 fla_mask,
 	return 0x80 >> chp;
 }
 
+static inline int
+s390_process_res_acc_new_sch(struct subchannel_id schid)
+{
+	struct schib schib;
+	int ret;
+	/*
+	 * We don't know the device yet, but since a path
+	 * may be available now to the device we'll have
+	 * to do recognition again.
+	 * Since we don't have any idea about which chpid
+	 * that beast may be on we'll have to do a stsch
+	 * on all devices, grr...
+	 */
+	if (stsch(schid, &schib))
+		/* We're through */
+		return need_rescan ? -EAGAIN : -ENXIO;
+
+	/* Put it on the slow path. */
+	ret = css_enqueue_subchannel_slow(schid);
+	if (ret) {
+		css_clear_subchannel_slow_list();
+		need_rescan = 1;
+		return -EAGAIN;
+	}
+	return 0;
+}
+
 static int
-s390_process_res_acc (u8 chpid, __u16 fla, u32 fla_mask)
+__s390_process_res_acc(struct subchannel_id schid, void *data)
 {
+	int chp_mask, old_lpm;
+	struct res_acc_data *res_data;
 	struct subchannel *sch;
+
+	res_data = (struct res_acc_data *)data;
+	sch = get_subchannel_by_schid(schid);
+	if (!sch)
+		/* Check if a subchannel is newly available. */
+		return s390_process_res_acc_new_sch(schid);
+
+	spin_lock_irq(&sch->lock);
+
+	chp_mask = s390_process_res_acc_sch(res_data, sch);
+	if (chp_mask == 0) {
+		spin_unlock_irq(&sch->lock);
+		put_device(&sch->dev); /* drop the lookup reference here too */
+		return 0;
+	}
+	old_lpm = sch->lpm;
+	sch->lpm = ((sch->schib.pmcw.pim &
+		     sch->schib.pmcw.pam &
+		     sch->schib.pmcw.pom)
+		    | chp_mask) & sch->opm;
+	if (!old_lpm && sch->lpm)
+		device_trigger_reprobe(sch);
+	else if (sch->driver && sch->driver->verify)
+		sch->driver->verify(&sch->dev);
+
+	spin_unlock_irq(&sch->lock);
+	put_device(&sch->dev);
+	return (res_data->fla_mask == 0xffff) ? -ENODEV : 0;
+}
+
+
+static int
+s390_process_res_acc (struct res_acc_data *res_data)
+{
 	int rc;
-	struct subchannel_id schid;
 	char dbf_txt[15];
 
-	sprintf(dbf_txt, "accpr%x", chpid);
+	sprintf(dbf_txt, "accpr%x", res_data->chp->id);
 	CIO_TRACE_EVENT( 2, dbf_txt);
-	if (fla != 0) {
-		sprintf(dbf_txt, "fla%x", fla);
+	if (res_data->fla != 0) {
+		sprintf(dbf_txt, "fla%x", res_data->fla);
 		CIO_TRACE_EVENT( 2, dbf_txt);
 	}
 
@@ -367,71 +435,11 @@ s390_process_res_acc (u8 chpid, __u16 fla, u32 fla_mask)
 	 * The more information we have (info), the less scanning
 	 * will we have to do.
 	 */
-
-	if (!get_chp_status(chpid))
-		return 0; /* no need to do the rest */
-
-	rc = 0;
-	init_subchannel_id(&schid);
-	do {
-		int chp_mask, old_lpm;
-
-		sch = get_subchannel_by_schid(schid);
-		if (!sch) {
-			struct schib schib;
-			int ret;
-			/*
-			 * We don't know the device yet, but since a path
-			 * may be available now to the device we'll have
-			 * to do recognition again.
-			 * Since we don't have any idea about which chpid
-			 * that beast may be on we'll have to do a stsch
-			 * on all devices, grr...
-			 */
-			if (stsch(schid, &schib)) {
-				/* We're through */
-				if (need_rescan)
-					rc = -EAGAIN;
-				break;
-			}
-			if (need_rescan) {
-				rc = -EAGAIN;
-				continue;
-			}
-			/* Put it on the slow path. */
-			ret = css_enqueue_subchannel_slow(schid);
-			if (ret) {
-				css_clear_subchannel_slow_list();
-				need_rescan = 1;
-			}
-			rc = -EAGAIN;
-			continue;
-		}
-	
-		spin_lock_irq(&sch->lock);
-
-		chp_mask = s390_process_res_acc_sch(chpid, fla, fla_mask, sch);
-
-		if (chp_mask == 0) {
-
-			spin_unlock_irq(&sch->lock);
-			continue;
-		}
-		old_lpm = sch->lpm;
-		sch->lpm = ((sch->schib.pmcw.pim &
-			     sch->schib.pmcw.pam &
-			     sch->schib.pmcw.pom)
-			    | chp_mask) & sch->opm;
-		if (!old_lpm && sch->lpm)
-			device_trigger_reprobe(sch);
-		else if (sch->driver && sch->driver->verify)
-			sch->driver->verify(&sch->dev);
-
-		spin_unlock_irq(&sch->lock);
-		put_device(&sch->dev);
-		if (fla_mask == 0xffff)
-			break;
-	} while (schid.sch_no++ < __MAX_SUBCHANNEL);
+	rc = for_each_subchannel(__s390_process_res_acc, res_data);
+	if (css_slow_subchannels_exist())
+		rc = -EAGAIN;
+	else if (rc != -EAGAIN)
+		rc = 0;
 	return rc;
 }
 
@@ -469,6 +477,7 @@ int
 chsc_process_crw(void)
 {
 	int chpid, ret;
+	struct res_acc_data res_data;
 	struct {
 		struct chsc_header request;
 		u32 reserved1;
@@ -503,7 +512,7 @@ chsc_process_crw(void)
 	do {
 		int ccode, status;
 		memset(sei_area, 0, sizeof(*sei_area));
-
+		memset(&res_data, 0, sizeof(struct res_acc_data));
 		sei_area->request = (struct chsc_header) {
 			.length = 0x0010,
 			.code   = 0x000e,
@@ -576,26 +585,23 @@ chsc_process_crw(void)
 			if (status < 0)
 				new_channel_path(sei_area->rsid);
 			else if (!status)
-				return 0;
-			if ((sei_area->vf & 0x80) == 0) {
-				pr_debug("chpid: %x\n", sei_area->rsid);
-				ret = s390_process_res_acc(sei_area->rsid,
-							   0, 0);
-			} else if ((sei_area->vf & 0xc0) == 0x80) {
-				pr_debug("chpid: %x link addr: %x\n",
-					 sei_area->rsid, sei_area->fla);
-				ret = s390_process_res_acc(sei_area->rsid,
-							   sei_area->fla,
-							   0xff00);
-			} else if ((sei_area->vf & 0xc0) == 0xc0) {
-				pr_debug("chpid: %x full link addr: %x\n",
-					 sei_area->rsid, sei_area->fla);
-				ret = s390_process_res_acc(sei_area->rsid,
-							   sei_area->fla,
-							   0xffff);
+				break;
+			res_data.chp = chps[sei_area->rsid];
+			pr_debug("chpid: %x", sei_area->rsid);
+			if ((sei_area->vf & 0xc0) != 0) {
+				res_data.fla = sei_area->fla;
+				if ((sei_area->vf & 0xc0) == 0xc0) {
+					pr_debug(" full link addr: %x",
+						 sei_area->fla);
+					res_data.fla_mask = 0xffff;
+				} else {
+					pr_debug(" link addr: %x",
+						 sei_area->fla);
+					res_data.fla_mask = 0xff00;
+				}
 			}
-			pr_debug("\n");
-			
+			ret = s390_process_res_acc(&res_data);
+			pr_debug("\n\n");
 			break;
 			
 		default: /* other stuff */
@@ -607,12 +613,70 @@ chsc_process_crw(void)
 	return ret;
 }
 
+static inline int
+__chp_add_new_sch(struct subchannel_id schid)
+{
+	struct schib schib;
+	int ret;
+
+	if (stsch(schid, &schib))
+		/* We're through */
+		return need_rescan ? -EAGAIN : -ENXIO;
+
+	/* Put it on the slow path. */
+	ret = css_enqueue_subchannel_slow(schid);
+	if (ret) {
+		css_clear_subchannel_slow_list();
+		need_rescan = 1;
+		return -EAGAIN;
+	}
+	return 0;
+}
+
+
 static int
-chp_add(int chpid)
+__chp_add(struct subchannel_id schid, void *data)
 {
+	int i, rc = 0;
+	struct channel_path *chp;
 	struct subchannel *sch;
-	int ret, rc;
-	struct subchannel_id schid;
+
+	chp = (struct channel_path *)data;
+	sch = get_subchannel_by_schid(schid);
+	if (!sch)
+		/* Check if the subchannel is now available. */
+		return __chp_add_new_sch(schid);
+	spin_lock(&sch->lock);
+	for (i=0; i<8; i++)
+		if (sch->schib.pmcw.chpid[i] == chp->id) {
+			if (stsch(sch->schid, &sch->schib) != 0) {
+				/* Endgame. */
+				rc = -ENXIO;
+				goto out;
+			}
+			break;
+		}
+	if (i==8)
+		/* chp is not one of this subchannel's paths. */
+		goto out;
+	sch->lpm = ((sch->schib.pmcw.pim &
+		     sch->schib.pmcw.pam &
+		     sch->schib.pmcw.pom)
+		    | 0x80 >> i) & sch->opm;
+
+	if (sch->driver && sch->driver->verify)
+		sch->driver->verify(&sch->dev);
+out:
+	/* Every exit must drop the get_subchannel_by_schid() reference. */
+	spin_unlock(&sch->lock);
+	put_device(&sch->dev);
+	return rc;
+}
+
+static int
+chp_add(int chpid)
+{
+	int rc;
 	char dbf_txt[15];
 
 	if (!get_chp_status(chpid))
@@ -621,60 +685,11 @@ chp_add(int chpid)
 	sprintf(dbf_txt, "cadd%x", chpid);
 	CIO_TRACE_EVENT(2, dbf_txt);
 
-	rc = 0;
-	init_subchannel_id(&schid);
-	do {
-		int i;
-
-		sch = get_subchannel_by_schid(schid);
-		if (!sch) {
-			struct schib schib;
-
-			if (stsch(schid, &schib)) {
-				/* We're through */
-				if (need_rescan)
-					rc = -EAGAIN;
-				break;
-			}
-			if (need_rescan) {
-				rc = -EAGAIN;
-				continue;
-			}
-			/* Put it on the slow path. */
-			ret = css_enqueue_subchannel_slow(schid);
-			if (ret) {
-				css_clear_subchannel_slow_list();
-				need_rescan = 1;
-			}
-			rc = -EAGAIN;
-			continue;
-		}
-	
-		spin_lock(&sch->lock);
-		for (i=0; i<8; i++)
-			if (sch->schib.pmcw.chpid[i] == chpid) {
-				if (stsch(sch->schid, &sch->schib) != 0) {
-					/* Endgame. */
-					spin_unlock(&sch->lock);
-					return rc;
-				}
-				break;
-			}
-		if (i==8) {
-			spin_unlock(&sch->lock);
-			return rc;
-		}
-		sch->lpm = ((sch->schib.pmcw.pim &
-			     sch->schib.pmcw.pam &
-			     sch->schib.pmcw.pom)
-			    | 0x80 >> i) & sch->opm;
-
-		if (sch->driver && sch->driver->verify)
-			sch->driver->verify(&sch->dev);
-
-		spin_unlock(&sch->lock);
-		put_device(&sch->dev);
-	} while (schid.sch_no++ < __MAX_SUBCHANNEL);
+	rc = for_each_subchannel(__chp_add, chps[chpid]);
+	if (css_slow_subchannels_exist())
+		rc = -EAGAIN;
+	if (rc != -EAGAIN)
+		rc = 0;
 	return rc;
 }
 
@@ -786,6 +801,29 @@ s390_subchannel_vary_chpid_on(struct device *dev, void *data)
 	return 0;
 }
 
+static int
+__s390_vary_chpid_on(struct subchannel_id schid, void *data)
+{
+	struct schib schib;
+	struct subchannel *sch;
+
+	sch = get_subchannel_by_schid(schid);
+	if (sch) {
+		put_device(&sch->dev);
+		return 0;
+	}
+	if (stsch(schid, &schib))
+		/* We're through */
+		return -ENXIO;
+	/* Put it on the slow path. */
+	if (css_enqueue_subchannel_slow(schid)) {
+		css_clear_subchannel_slow_list();
+		need_rescan = 1;
+		return -EAGAIN;
+	}
+	return 0;
+}
+
 /*
  * Function: s390_vary_chpid
  * Varies the specified chpid online or offline
@@ -794,9 +832,7 @@ static int
 s390_vary_chpid( __u8 chpid, int on)
 {
 	char dbf_text[15];
-	int status, ret;
-	struct subchannel_id schid;
-	struct subchannel *sch;
+	int status;
 
 	sprintf(dbf_text, on?"varyon%x":"varyoff%x", chpid);
 	CIO_TRACE_EVENT( 2, dbf_text);
@@ -821,31 +857,9 @@ s390_vary_chpid( __u8 chpid, int on)
 	bus_for_each_dev(&css_bus_type, NULL, &chpid, on ?
 			 s390_subchannel_vary_chpid_on :
 			 s390_subchannel_vary_chpid_off);
-	if (!on)
-		goto out;
-	/* Scan for new devices on varied on path. */
-	init_subchannel_id(&schid);
-	do {
-		struct schib schib;
-
-		if (need_rescan)
-			break;
-		sch = get_subchannel_by_schid(schid);
-		if (sch) {
-			put_device(&sch->dev);
-			continue;
-		}
-		if (stsch(schid, &schib))
-			/* We're through */
-			break;
-		/* Put it on the slow path. */
-		ret = css_enqueue_subchannel_slow(schid);
-		if (ret) {
-			css_clear_subchannel_slow_list();
-			need_rescan = 1;
-		}
-	} while (schid.sch_no++ < __MAX_SUBCHANNEL);
-out:
+	if (on)
+		/* Scan for new devices on varied on path. */
+		for_each_subchannel(__s390_vary_chpid_on, NULL);
 	if (need_rescan || css_slow_subchannels_exist())
 		queue_work(slow_path_wq, &slow_path_work);
 	return 0;
diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c
index 396bada..3eb6cb6 100644
--- a/drivers/s390/cio/cio.c
+++ b/drivers/s390/cio/cio.c
@@ -691,7 +691,22 @@ wait_cons_dev (void)
 }
 
 static int
-cio_console_irq(void)
+cio_test_for_console(struct subchannel_id schid, void *data)
+{
+	if (stsch(schid, &console_subchannel.schib) != 0)
+		return -ENXIO;
+	if (console_subchannel.schib.pmcw.dnv &&
+	    console_subchannel.schib.pmcw.dev ==
+	    console_devno) {
+		console_irq = schid.sch_no;
+		return 1; /* found */
+	}
+	return 0;
+}
+
+
+static int
+cio_get_console_sch_no(void)
 {
 	struct subchannel_id schid;
 	
@@ -705,16 +720,7 @@ cio_console_irq(void)
 		console_devno = console_subchannel.schib.pmcw.dev;
 	} else if (console_devno != -1) {
 		/* At least the console device number is known. */
-		do {
-			if (stsch(schid, &console_subchannel.schib) != 0)
-				break;
-			if (console_subchannel.schib.pmcw.dnv &&
-			    console_subchannel.schib.pmcw.dev ==
-			    console_devno) {
-				console_irq = schid.sch_no;
-				break;
-			}
-		} while (schid.sch_no++ < __MAX_SUBCHANNEL);
+		for_each_subchannel(cio_test_for_console, NULL);
 		if (console_irq == -1)
 			return -1;
 	} else {
@@ -730,19 +736,19 @@ cio_console_irq(void)
 struct subchannel *
 cio_probe_console(void)
 {
-	int irq, ret;
+	int sch_no, ret;
 	struct subchannel_id schid;
 
 	if (xchg(&console_subchannel_in_use, 1) != 0)
 		return ERR_PTR(-EBUSY);
-	irq = cio_console_irq();
-	if (irq == -1) {
+	sch_no = cio_get_console_sch_no();
+	if (sch_no == -1) {
 		console_subchannel_in_use = 0;
 		return ERR_PTR(-ENODEV);
 	}
 	memset(&console_subchannel, 0, sizeof(struct subchannel));
 	init_subchannel_id(&schid);
-	schid.sch_no = irq;
+	schid.sch_no = sch_no;
 	ret = cio_validate_subchannel(&console_subchannel, schid);
 	if (ret) {
 		console_subchannel_in_use = 0;
@@ -830,32 +836,33 @@ __clear_subchannel_easy(struct subchannel_id schid)
 }
 
 extern void do_reipl(unsigned long devno);
+static int
+__shutdown_subchannel_easy(struct subchannel_id schid, void *data)
+{
+	struct schib schib;
+
+	if (stsch(schid, &schib))
+		return -ENXIO;
+	if (!schib.pmcw.ena)
+		return 0;
+	switch(__disable_subchannel_easy(schid, &schib)) {
+	case 0:
+	case -ENODEV:
+		break;
+	default: /* -EBUSY */
+		if (__clear_subchannel_easy(schid))
+			break; /* give up... */
+		stsch(schid, &schib);
+		__disable_subchannel_easy(schid, &schib);
+	}
+	return 0;
+}
 
-/* Clear all subchannels. */
 void
 clear_all_subchannels(void)
 {
-	struct subchannel_id schid;
-
 	local_irq_disable();
-	init_subchannel_id(&schid);
-	do {
-		struct schib schib;
-		if (stsch(schid, &schib))
-			break; /* break out of the loop */
-		if (!schib.pmcw.ena)
-			continue;
-		switch(__disable_subchannel_easy(schid, &schib)) {
-		case 0:
-		case -ENODEV:
-			break;
-		default: /* -EBUSY */
-			if (__clear_subchannel_easy(schid))
-				break; /* give up... jump out of switch */
-			stsch(schid, &schib);
-			__disable_subchannel_easy(schid, &schib);
-		}
-	} while (schid.sch_no++ < __MAX_SUBCHANNEL);
+	for_each_subchannel(__shutdown_subchannel_easy, NULL);
 }
 
 /* Make sure all subchannels are quiet before we re-ipl an lpar. */
diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
index 5137daf..dba632a 100644
--- a/drivers/s390/cio/css.c
+++ b/drivers/s390/cio/css.c
@@ -21,7 +21,6 @@
 #include "ioasm.h"
 #include "chsc.h"
 
-unsigned int highest_subchannel;
 int need_rescan = 0;
 int css_init_done = 0;
 
@@ -32,6 +31,22 @@ struct device css_bus_device = {
 	.bus_id = "css0",
 };
 
+inline int
+for_each_subchannel(int(*fn)(struct subchannel_id, void *), void *data)
+{
+	struct subchannel_id schid;
+	int ret;
+
+	init_subchannel_id(&schid);
+	ret = -ENODEV;
+	do {
+		ret = fn(schid, data);
+		if (ret)
+			break;
+	} while (schid.sch_no++ < __MAX_SUBCHANNEL);
+	return ret;
+}
+
 static struct subchannel *
 css_alloc_subchannel(struct subchannel_id schid)
 {
@@ -280,25 +295,10 @@ css_evaluate_subchannel(struct subchannel_id schid, int slow)
 	return ret;
 }
 
-static void
-css_rescan_devices(void)
+static int
+css_rescan_devices(struct subchannel_id schid, void *data)
 {
-	int ret;
-	struct subchannel_id schid;
-
-	init_subchannel_id(&schid);
-	do {
-		ret = css_evaluate_subchannel(schid, 1);
-		/* No more memory. It doesn't make sense to continue. No
-		 * panic because this can happen in midflight and just
-		 * because we can't use a new device is no reason to crash
-		 * the system. */
-		if (ret == -ENOMEM)
-			break;
-		/* -ENXIO indicates that there are no more subchannels. */
-		if (ret == -ENXIO)
-			break;
-	} while (schid.sch_no++ < __MAX_SUBCHANNEL);
+	return css_evaluate_subchannel(schid, 1);
 }
 
 struct slow_subchannel {
@@ -316,7 +316,7 @@ css_trigger_slow_path(void)
 
 	if (need_rescan) {
 		need_rescan = 0;
-		css_rescan_devices();
+		for_each_subchannel(css_rescan_devices, NULL);
 		return;
 	}
 
@@ -383,6 +383,43 @@ css_process_crw(int irq)
 	return ret;
 }
 
+static int __init
+__init_channel_subsystem(struct subchannel_id schid, void *data)
+{
+	struct subchannel *sch;
+	int ret;
+
+	if (cio_is_console(schid))
+		sch = cio_get_console_subchannel();
+	else {
+		sch = css_alloc_subchannel(schid);
+		if (IS_ERR(sch))
+			ret = PTR_ERR(sch);
+		else
+			ret = 0;
+		switch (ret) {
+		case 0:
+			break;
+		case -ENOMEM:
+			panic("Out of memory in init_channel_subsystem\n");
+		/* -ENXIO: no more subchannels. */
+		case -ENXIO:
+			return ret;
+		default:
+			return 0;
+		}
+	}
+	/*
+	 * We register ALL valid subchannels in ioinfo, even those
+	 * that have been present before init_channel_subsystem.
+	 * These subchannels can't have been registered yet (kmalloc
+	 * not working) so we do it now. This is true e.g. for the
+	 * console subchannel.
+	 */
+	css_register_subchannel(sch);
+	return 0;
+}
+
 static void __init
 css_generate_pgid(void)
 {
@@ -410,7 +447,6 @@ static int __init
 init_channel_subsystem (void)
 {
 	int ret;
-	struct subchannel_id schid;
 
 	if (chsc_determine_css_characteristics() == 0)
 		css_characteristics_avail = 1;
@@ -426,38 +462,8 @@ init_channel_subsystem (void)
 
 	ctl_set_bit(6, 28);
 
-	init_subchannel_id(&schid);
-	do {
-		struct subchannel *sch;
-
-		if (cio_is_console(schid))
-			sch = cio_get_console_subchannel();
-		else {
-			sch = css_alloc_subchannel(schid);
-			if (IS_ERR(sch))
-				ret = PTR_ERR(sch);
-			else
-				ret = 0;
-			if (ret == -ENOMEM)
-				panic("Out of memory in "
-				      "init_channel_subsystem\n");
-			/* -ENXIO: no more subchannels. */
-			if (ret == -ENXIO)
-				break;
-			if (ret)
-				continue;
-		}
-		/*
-		 * We register ALL valid subchannels in ioinfo, even those
-		 * that have been present before init_channel_subsystem.
-		 * These subchannels can't have been registered yet (kmalloc
-		 * not working) so we do it now. This is true e.g. for the
-		 * console subchannel.
-		 */
-		css_register_subchannel(sch);
-	} while (schid.sch_no++ < __MAX_SUBCHANNEL);
+	for_each_subchannel(__init_channel_subsystem, NULL);
 	return 0;
-
 out_bus:
 	bus_unregister(&css_bus_type);
 out:
diff --git a/drivers/s390/cio/css.h b/drivers/s390/cio/css.h
index f26e16d..71efca2 100644
--- a/drivers/s390/cio/css.h
+++ b/drivers/s390/cio/css.h
@@ -126,6 +126,7 @@ extern struct css_driver io_subchannel_driver;
 extern int css_probe_device(struct subchannel_id);
 extern struct subchannel * get_subchannel_by_schid(struct subchannel_id);
 extern int css_init_done;
+extern int for_each_subchannel(int(*fn)(struct subchannel_id, void *), void *);
 
 #define __MAX_SUBCHANNEL 65535
 
-- 
cgit v1.1


From a28c69448154a0901e8815922030c5dcd2f8e388 Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cohuck@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:23 -0800
Subject: [PATCH] s390: introduce struct channel_subsystem

struct channel_subsystem encapsulates several per-channel-subsystem
properties, such as the status of chpids and the global path group id.

Signed-off-by: Cornelia Huck <cohuck@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/s390/cio/chsc.c        | 32 ++++++++++++--------
 drivers/s390/cio/chsc.h        |  5 ++--
 drivers/s390/cio/css.c         | 67 +++++++++++++++++++++++++++++-------------
 drivers/s390/cio/css.h         | 25 +++++++++++++---
 drivers/s390/cio/device.c      |  4 ---
 drivers/s390/cio/device_pgid.c |  2 +-
 6 files changed, 90 insertions(+), 45 deletions(-)

diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c
index 78e0823..ebd9249 100644
--- a/drivers/s390/cio/chsc.c
+++ b/drivers/s390/cio/chsc.c
@@ -24,8 +24,6 @@
 #include "ioasm.h"
 #include "chsc.h"
 
-static struct channel_path *chps[NR_CHPIDS];
-
 static void *sei_page;
 
 static int new_channel_path(int chpid);
@@ -33,13 +31,13 @@ static int new_channel_path(int chpid);
 static inline void
 set_chp_logically_online(int chp, int onoff)
 {
-	chps[chp]->state = onoff;
+	css[0]->chps[chp]->state = onoff;
 }
 
 static int
 get_chp_status(int chp)
 {
-	return (chps[chp] ? chps[chp]->state : -ENODEV);
+	return (css[0]->chps[chp] ? css[0]->chps[chp]->state : -ENODEV);
 }
 
 void
@@ -219,13 +217,13 @@ s390_subchannel_remove_chpid(struct device *dev, void *data)
 	int j;
 	int mask;
 	struct subchannel *sch;
-	__u8 *chpid;
+	struct channel_path *chpid;
 	struct schib schib;
 
 	sch = to_subchannel(dev);
 	chpid = data;
 	for (j = 0; j < 8; j++)
-		if (sch->schib.pmcw.chpid[j] == *chpid)
+		if (sch->schib.pmcw.chpid[j] == chpid->id)
 			break;
 	if (j >= 8)
 		return 0;
@@ -296,18 +294,20 @@ static inline void
 s390_set_chpid_offline( __u8 chpid)
 {
 	char dbf_txt[15];
+	struct device *dev;
 
 	sprintf(dbf_txt, "chpr%x", chpid);
 	CIO_TRACE_EVENT(2, dbf_txt);
 
 	if (get_chp_status(chpid) <= 0)
 		return;
-
-	bus_for_each_dev(&css_bus_type, NULL, &chpid,
+	dev = get_device(&css[0]->chps[chpid]->dev);
+	bus_for_each_dev(&css_bus_type, NULL, to_channelpath(dev),
 			 s390_subchannel_remove_chpid);
 
 	if (need_rescan || css_slow_subchannels_exist())
 		queue_work(slow_path_wq, &slow_path_work);
+	put_device(dev);
 }
 
 struct res_acc_data {
@@ -511,6 +511,7 @@ chsc_process_crw(void)
 	ret = 0;
 	do {
 		int ccode, status;
+		struct device *dev;
 		memset(sei_area, 0, sizeof(*sei_area));
 		memset(&res_data, 0, sizeof(struct res_acc_data));
 		sei_area->request = (struct chsc_header) {
@@ -586,7 +587,8 @@ chsc_process_crw(void)
 				new_channel_path(sei_area->rsid);
 			else if (!status)
 				break;
-			res_data.chp = chps[sei_area->rsid];
+			dev = get_device(&css[0]->chps[sei_area->rsid]->dev);
+			res_data.chp = to_channelpath(dev);
 			pr_debug("chpid: %x", sei_area->rsid);
 			if ((sei_area->vf & 0xc0) != 0) {
 				res_data.fla = sei_area->fla;
@@ -602,6 +604,7 @@ chsc_process_crw(void)
 			}
 			ret = s390_process_res_acc(&res_data);
 			pr_debug("\n\n");
+			put_device(dev);
 			break;
 			
 		default: /* other stuff */
@@ -678,6 +681,7 @@ chp_add(int chpid)
 {
 	int rc;
 	char dbf_txt[15];
+	struct device *dev;
 
 	if (!get_chp_status(chpid))
 		return 0; /* no need to do the rest */
@@ -685,11 +689,13 @@ chp_add(int chpid)
 	sprintf(dbf_txt, "cadd%x", chpid);
 	CIO_TRACE_EVENT(2, dbf_txt);
 
-	rc = for_each_subchannel(__chp_add, chps[chpid]);
+	dev = get_device(&css[0]->chps[chpid]->dev);
+	rc = for_each_subchannel(__chp_add, to_channelpath(dev));
 	if (css_slow_subchannels_exist())
 		rc = -EAGAIN;
 	if (rc != -EAGAIN)
 		rc = 0;
+	put_device(dev);
 	return rc;
 }
 
@@ -1016,7 +1022,7 @@ new_channel_path(int chpid)
 	chp->id = chpid;
 	chp->state = 1;
 	chp->dev = (struct device) {
-		.parent  = &css_bus_device,
+		.parent  = &css[0]->device,
 		.release = chp_release,
 	};
 	snprintf(chp->dev.bus_id, BUS_ID_SIZE, "chp0.%x", chpid);
@@ -1038,7 +1044,7 @@ new_channel_path(int chpid)
 		device_unregister(&chp->dev);
 		goto out_free;
 	} else
-		chps[chpid] = chp;
+		css[0]->chps[chpid] = chp;
 	return ret;
 out_free:
 	kfree(chp);
@@ -1051,7 +1057,7 @@ chsc_get_chp_desc(struct subchannel *sch, int chp_no)
 	struct channel_path *chp;
 	struct channel_path_desc *desc;
 
-	chp = chps[sch->schib.pmcw.chpid[chp_no]];
+	chp = css[0]->chps[sch->schib.pmcw.chpid[chp_no]];
 	if (!chp)
 		return NULL;
 	desc = kmalloc(sizeof(struct channel_path_desc), GFP_KERNEL);
diff --git a/drivers/s390/cio/chsc.h b/drivers/s390/cio/chsc.h
index 6945013..170083c 100644
--- a/drivers/s390/cio/chsc.h
+++ b/drivers/s390/cio/chsc.h
@@ -1,8 +1,6 @@
 #ifndef S390_CHSC_H
 #define S390_CHSC_H
 
-#define NR_CHPIDS 256
-
 #define CHSC_SEI_ACC_CHPID        1
 #define CHSC_SEI_ACC_LINKADDR     2
 #define CHSC_SEI_ACC_FULLLINKADDR 3
@@ -65,4 +63,7 @@ extern int chsc_determine_css_characteristics(void);
 extern int css_characteristics_avail;
 
 extern void *chsc_get_chp_desc(struct subchannel*, int);
+
+#define to_channelpath(dev) container_of(dev, struct channel_path, dev)
+
 #endif
diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
index dba632a..b6225cb 100644
--- a/drivers/s390/cio/css.c
+++ b/drivers/s390/cio/css.c
@@ -24,12 +24,9 @@
 int need_rescan = 0;
 int css_init_done = 0;
 
-struct pgid global_pgid;
-int css_characteristics_avail = 0;
+struct channel_subsystem *css[__MAX_CSSID + 1];
 
-struct device css_bus_device = {
-	.bus_id = "css0",
-};
+int css_characteristics_avail = 0;
 
 inline int
 for_each_subchannel(int(*fn)(struct subchannel_id, void *), void *data)
@@ -112,7 +109,7 @@ css_register_subchannel(struct subchannel *sch)
 	int ret;
 
 	/* Initialize the subchannel structure */
-	sch->dev.parent = &css_bus_device;
+	sch->dev.parent = &css[0]->device;
 	sch->dev.bus = &css_bus_type;
 	sch->dev.release = &css_subchannel_release;
 	
@@ -421,21 +418,35 @@ __init_channel_subsystem(struct subchannel_id schid, void *data)
 }
 
 static void __init
-css_generate_pgid(void)
+css_generate_pgid(struct channel_subsystem *css, u32 tod_high)
 {
-	/* Let's build our path group ID here. */
-	if (css_characteristics_avail && css_general_characteristics.mcss)
-		global_pgid.cpu_addr = 0x8000;
-	else {
+	if (css_characteristics_avail && css_general_characteristics.mcss) {
+		css->global_pgid.pgid_high.ext_cssid.version = 0x80;
+		css->global_pgid.pgid_high.ext_cssid.cssid = css->cssid;
+	} else {
 #ifdef CONFIG_SMP
-		global_pgid.cpu_addr = hard_smp_processor_id();
+		css->global_pgid.pgid_high.cpu_addr = hard_smp_processor_id();
 #else
-		global_pgid.cpu_addr = 0;
+		css->global_pgid.pgid_high.cpu_addr = 0;
 #endif
 	}
-	global_pgid.cpu_id = ((cpuid_t *) __LC_CPUID)->ident;
-	global_pgid.cpu_model = ((cpuid_t *) __LC_CPUID)->machine;
-	global_pgid.tod_high = (__u32) (get_clock() >> 32);
+	css->global_pgid.cpu_id = ((cpuid_t *) __LC_CPUID)->ident;
+	css->global_pgid.cpu_model = ((cpuid_t *) __LC_CPUID)->machine;
+	css->global_pgid.tod_high = tod_high;
+
+}
+
+static inline void __init
+setup_css(int nr)
+{
+	u32 tod_high;
+
+	memset(css[nr], 0, sizeof(struct channel_subsystem));
+	css[nr]->valid = 1;
+	css[nr]->cssid = nr;
+	sprintf(css[nr]->device.bus_id, "css%x", nr);
+	tod_high = (u32) (get_clock() >> 32);
+	css_generate_pgid(css[nr], tod_high);
 }
 
 /*
@@ -446,25 +457,39 @@ css_generate_pgid(void)
 static int __init
 init_channel_subsystem (void)
 {
-	int ret;
+	int ret, i;
 
 	if (chsc_determine_css_characteristics() == 0)
 		css_characteristics_avail = 1;
 
-	css_generate_pgid();
-
 	if ((ret = bus_register(&css_bus_type)))
 		goto out;
-	if ((ret = device_register (&css_bus_device)))
-		goto out_bus;
 
+	/* Setup css structure. */
+	for (i = 0; i <= __MAX_CSSID; i++) {
+		css[i] = kmalloc(sizeof(struct channel_subsystem), GFP_KERNEL);
+		if (!css[i]) {
+			ret = -ENOMEM;
+			goto out_bus;
+		}
+		setup_css(i);
+		ret = device_register(&css[i]->device);
+		if (ret)
+			goto out_free;
+	}
 	css_init_done = 1;
 
 	ctl_set_bit(6, 28);
 
 	for_each_subchannel(__init_channel_subsystem, NULL);
 	return 0;
+out_free:
+	kfree(css[i]);
 out_bus:
+	while (i > 0) {
+		i--;
+		device_unregister(&css[i]->device);
+	}
 	bus_unregister(&css_bus_type);
 out:
 	return ret;
diff --git a/drivers/s390/cio/css.h b/drivers/s390/cio/css.h
index 71efca2..b74659c 100644
--- a/drivers/s390/cio/css.h
+++ b/drivers/s390/cio/css.h
@@ -35,19 +35,25 @@ struct path_state {
 	__u8  resvd  : 3;	/* reserved */
 } __attribute__ ((packed));
 
+struct extended_cssid {
+	u8 version;
+	u8 cssid;
+} __attribute__ ((packed));
+
 struct pgid {
 	union {
 		__u8 fc;   	/* SPID function code */
 		struct path_state ps;	/* SNID path state */
 	} inf;
-	__u32 cpu_addr	: 16;	/* CPU address */
+	union {
+		__u32 cpu_addr	: 16;	/* CPU address */
+		struct extended_cssid ext_cssid;
+	} pgid_high;
 	__u32 cpu_id	: 24;	/* CPU identification */
 	__u32 cpu_model : 16;	/* CPU model */
 	__u32 tod_high;		/* high word TOD clock */
 } __attribute__ ((packed));
 
-extern struct pgid global_pgid;
-
 #define MAX_CIWS 8
 
 /*
@@ -129,9 +135,20 @@ extern int css_init_done;
 extern int for_each_subchannel(int(*fn)(struct subchannel_id, void *), void *);
 
 #define __MAX_SUBCHANNEL 65535
+#define __MAX_CHPID 255
+#define __MAX_CSSID 0
+
+struct channel_subsystem {
+	u8 cssid;		/* set by setup_css() to the css[] index */
+	int valid;		/* set to 1 by setup_css() */
+	struct channel_path *chps[__MAX_CHPID + 1]; /* one slot per chpid */
+	struct device device;
+	struct pgid global_pgid;
+};
+#define to_css(dev) container_of(dev, struct channel_subsystem, device)
 
 extern struct bus_type css_bus_type;
-extern struct device css_bus_device;
+extern struct channel_subsystem *css[];
 
 /* Some helper functions for disconnected state. */
 int device_is_disconnected(struct subchannel *);
diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index 9ac07ae..ba9f7c1 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -986,10 +986,6 @@ ccw_device_console_enable (struct ccw_device *cdev, struct subchannel *sch)
 	cdev->dev = (struct device) {
 		.parent = &sch->dev,
 	};
-	/* Initialize the subchannel structure */
-	sch->dev.parent = &css_bus_device;
-	sch->dev.bus = &css_bus_type;
-
 	rc = io_subchannel_recog(cdev, sch);
 	if (rc)
 		return rc;
diff --git a/drivers/s390/cio/device_pgid.c b/drivers/s390/cio/device_pgid.c
index f08e84c..3c89d70 100644
--- a/drivers/s390/cio/device_pgid.c
+++ b/drivers/s390/cio/device_pgid.c
@@ -164,7 +164,7 @@ ccw_device_sense_pgid_irq(struct ccw_device *cdev, enum dev_event dev_event)
 	/* 0, -ETIME, -EOPNOTSUPP, -EAGAIN, -EACCES or -EUSERS */
 	case 0:			/* Sense Path Group ID successful. */
 		if (cdev->private->pgid.inf.ps.state1 == SNID_STATE1_RESET)
-			memcpy(&cdev->private->pgid, &global_pgid,
+			memcpy(&cdev->private->pgid, &css[0]->global_pgid,
 			       sizeof(struct pgid));
 		ccw_device_sense_pgid_done(cdev, 0);
 		break;
-- 
cgit v1.1


From 678a395b356a98368a93c3640252502b70c3676f Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cohuck@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:24 -0800
Subject: [PATCH] s390: convert /proc/cio_ignore

Convert /proc/cio_ignore to a sequential file.  This makes multiple subchannel
sets support easier.

Signed-off-by: Cornelia Huck <cohuck@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/s390/cio/blacklist.c | 122 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 92 insertions(+), 30 deletions(-)

diff --git a/drivers/s390/cio/blacklist.c b/drivers/s390/cio/blacklist.c
index 25e9848..daea41c 100644
--- a/drivers/s390/cio/blacklist.c
+++ b/drivers/s390/cio/blacklist.c
@@ -15,6 +15,7 @@
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <linux/ctype.h>
 #include <linux/device.h>
 
@@ -279,41 +280,82 @@ blacklist_parse_proc_parameters (char *buf)
 	s390_redo_validation ();
 }
 
-/* FIXME: These should be real bus ids and not home-grown ones! */
-static int cio_ignore_read (char *page, char **start, off_t off,
-			    int count, int *eof, void *data)
+/* Iterator struct for all devices. */
+struct ccwdev_iter {
+	int devno;
+	int in_range;
+};
+
+static void *
+cio_ignore_proc_seq_start(struct seq_file *s, loff_t *offset)
 {
-	const unsigned int entry_size = 18; /* "0.0.ABCD-0.0.EFGH\n" */
-	long devno;
-	int len;
-
-	len = 0;
-	for (devno = off; /* abuse the page variable
-			   * as counter, see fs/proc/generic.c */
-	     devno < __MAX_SUBCHANNEL && len + entry_size < count; devno++) {
-		if (!test_bit(devno, bl_dev))
-			continue;
-		len += sprintf(page + len, "0.0.%04lx", devno);
-		if (test_bit(devno + 1, bl_dev)) { /* print range */
-			while (++devno < __MAX_SUBCHANNEL)
-				if (!test_bit(devno, bl_dev))
-					break;
-			len += sprintf(page + len, "-0.0.%04lx", --devno);
-		}
-		len += sprintf(page + len, "\n");
-	}
+	struct ccwdev_iter *iter;
+
+	if (*offset > __MAX_SUBCHANNEL)
+		return NULL;
+	iter = kmalloc(sizeof(struct ccwdev_iter), GFP_KERNEL);
+	if (!iter)
+		return ERR_PTR(-ENOMEM);
+	memset(iter, 0, sizeof(struct ccwdev_iter));
+	iter->devno = *offset;
+	return iter;
+}
+
+static void
+cio_ignore_proc_seq_stop(struct seq_file *s, void *it)
+{
+	if (!IS_ERR(it))
+		kfree(it);
+}
 
-	if (devno < __MAX_SUBCHANNEL)
-		*eof = 1;
-	*start = (char *) (devno - off); /* number of checked entries */
-	return len;
+static void *
+cio_ignore_proc_seq_next(struct seq_file *s, void *it, loff_t *offset)
+{
+	struct ccwdev_iter *iter;
+
+	if (*offset > __MAX_SUBCHANNEL)
+		return NULL;
+	iter = (struct ccwdev_iter *)it;
+	iter->devno++;
+	(*offset)++;
+	return iter;
 }
 
-static int cio_ignore_write(struct file *file, const char __user *user_buf,
-			     unsigned long user_len, void *data)
+static int
+cio_ignore_proc_seq_show(struct seq_file *s, void *it)
+{
+	struct ccwdev_iter *iter;
+
+	iter = (struct ccwdev_iter *)it;
+	if (!is_blacklisted(iter->devno))
+		/* Not blacklisted, nothing to output. */
+		return 0;
+	if (!iter->in_range) {
+		/* First device in range. */
+		if ((iter->devno == __MAX_SUBCHANNEL) ||
+		    !is_blacklisted(iter->devno + 1))
+			/* Singular device. */
+			return seq_printf(s, "0.0.%04x\n", iter->devno);
+		iter->in_range = 1;
+		return seq_printf(s, "0.0.%04x-", iter->devno);
+	}
+	if ((iter->devno == __MAX_SUBCHANNEL) ||
+	    !is_blacklisted(iter->devno + 1)) {
+		/* Last device in range. */
+		iter->in_range = 0;
+		return seq_printf(s, "0.0.%04x\n", iter->devno);
+	}
+	return 0;
+}
+
+static ssize_t
+cio_ignore_write(struct file *file, const char __user *user_buf,
+		 size_t user_len, loff_t *offset)
 {
 	char *buf;
 
+	if (*offset)
+		return -EINVAL;
 	if (user_len > 65536)
 		user_len = 65536;
 	buf = vmalloc (user_len + 1); /* maybe better use the stack? */
@@ -331,6 +373,27 @@ static int cio_ignore_write(struct file *file, const char __user *user_buf,
 	return user_len;
 }
 
+static struct seq_operations cio_ignore_proc_seq_ops = {
+	.start = cio_ignore_proc_seq_start,
+	.stop  = cio_ignore_proc_seq_stop,
+	.next  = cio_ignore_proc_seq_next,
+	.show  = cio_ignore_proc_seq_show,
+};
+
+static int
+cio_ignore_proc_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &cio_ignore_proc_seq_ops);
+}
+
+static struct file_operations cio_ignore_proc_fops = {
+	.open    = cio_ignore_proc_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write   = cio_ignore_write,
+};
+
 static int
 cio_ignore_proc_init (void)
 {
@@ -341,8 +404,7 @@ cio_ignore_proc_init (void)
 	if (!entry)
 		return 0;
 
-	entry->read_proc  = cio_ignore_read;
-	entry->write_proc = cio_ignore_write;
+	entry->proc_fops = &cio_ignore_proc_fops;
 
 	return 1;
 }
-- 
cgit v1.1


From fb6958a594da49ece869793e6ec163b89fc5f79f Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cohuck@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:25 -0800
Subject: [PATCH] s390: multiple subchannel sets support

Add support for multiple subchannel sets.  Works with arbitrary devices in
subchannel set 1 and is transparent to device drivers.  Although currently
only two subchannel sets are available, this will work with the architected
maximum number of subchannel sets as well.

Signed-off-by: Cornelia Huck <cohuck@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/s390/cio/blacklist.c     | 86 +++++++++++++++++++++++-----------------
 drivers/s390/cio/blacklist.h     |  2 +-
 drivers/s390/cio/chsc.c          | 68 +++++++++++++++++++++++++++----
 drivers/s390/cio/chsc.h          |  4 ++
 drivers/s390/cio/cio.c           | 35 +++++++++-------
 drivers/s390/cio/css.c           | 44 ++++++++++++++------
 drivers/s390/cio/css.h           |  2 +
 drivers/s390/cio/device.c        | 22 ++++++----
 drivers/s390/cio/device_fsm.c    | 15 ++++---
 drivers/s390/cio/device_id.c     | 22 +++++-----
 drivers/s390/cio/device_pgid.c   | 40 ++++++++++---------
 drivers/s390/cio/device_status.c | 10 +++--
 drivers/s390/cio/ioasm.h         | 29 ++++++++++++++
 drivers/s390/cio/qdio.c          | 81 ++++++++++++++++++++++---------------
 drivers/s390/cio/schid.h         |  3 +-
 drivers/s390/s390mach.c          | 56 +++++++++++++++++++-------
 16 files changed, 354 insertions(+), 165 deletions(-)

diff --git a/drivers/s390/cio/blacklist.c b/drivers/s390/cio/blacklist.c
index daea41c..2d444cb 100644
--- a/drivers/s390/cio/blacklist.c
+++ b/drivers/s390/cio/blacklist.c
@@ -1,7 +1,7 @@
 /*
  *  drivers/s390/cio/blacklist.c
  *   S/390 common I/O routines -- blacklisting of specific devices
- *   $Revision: 1.35 $
+ *   $Revision: 1.39 $
  *
  *    Copyright (C) 1999-2002 IBM Deutschland Entwicklung GmbH,
  *			      IBM Corporation
@@ -35,10 +35,10 @@
  * These can be single devices or ranges of devices
  */
 
-/* 65536 bits to indicate if a devno is blacklisted or not */
+/* 65536 bits for each set to indicate if a devno is blacklisted or not */
 #define __BL_DEV_WORDS ((__MAX_SUBCHANNEL + (8*sizeof(long) - 1)) / \
 			 (8*sizeof(long)))
-static unsigned long bl_dev[__BL_DEV_WORDS];
+static unsigned long bl_dev[__MAX_SSID + 1][__BL_DEV_WORDS];
 typedef enum {add, free} range_action;
 
 /*
@@ -46,21 +46,23 @@ typedef enum {add, free} range_action;
  * (Un-)blacklist the devices from-to
  */
 static inline void
-blacklist_range (range_action action, unsigned int from, unsigned int to)
+blacklist_range (range_action action, unsigned int from, unsigned int to,
+		 unsigned int ssid)
 {
 	if (!to)
 		to = from;
 
-	if (from > to || to > __MAX_SUBCHANNEL) {
+	if (from > to || to > __MAX_SUBCHANNEL || ssid > __MAX_SSID) {
 		printk (KERN_WARNING "Invalid blacklist range "
-			"0x%04x to 0x%04x, skipping\n", from, to);
+			"0.%x.%04x to 0.%x.%04x, skipping\n",
+			ssid, from, ssid, to);
 		return;
 	}
 	for (; from <= to; from++) {
 		if (action == add)
-			set_bit (from, bl_dev);
+			set_bit (from, bl_dev[ssid]);
 		else
-			clear_bit (from, bl_dev);
+			clear_bit (from, bl_dev[ssid]);
 	}
 }
 
@@ -70,7 +72,7 @@ blacklist_range (range_action action, unsigned int from, unsigned int to)
  * Shamelessly grabbed from dasd_devmap.c.
  */
 static inline int
-blacklist_busid(char **str, int *id0, int *id1, int *devno)
+blacklist_busid(char **str, int *id0, int *ssid, int *devno)
 {
 	int val, old_style;
 	char *sav;
@@ -87,7 +89,7 @@ blacklist_busid(char **str, int *id0, int *id1, int *devno)
 		goto confused;
 	val = simple_strtoul(*str, str, 16);
 	if (old_style || (*str)[0] != '.') {
-		*id0 = *id1 = 0;
+		*id0 = *ssid = 0;
 		if (val < 0 || val > 0xffff)
 			goto confused;
 		*devno = val;
@@ -106,7 +108,7 @@ blacklist_busid(char **str, int *id0, int *id1, int *devno)
 	val = simple_strtoul(*str, str, 16);
 	if (val < 0 || val > 0xff || (*str)++[0] != '.')
 		goto confused;
-	*id1 = val;
+	*ssid = val;
 	if (!isxdigit((*str)[0]))	/* We require at least one hex digit */
 		goto confused;
 	val = simple_strtoul(*str, str, 16);
@@ -126,7 +128,7 @@ confused:
 static inline int
 blacklist_parse_parameters (char *str, range_action action)
 {
-	unsigned int from, to, from_id0, to_id0, from_id1, to_id1;
+	unsigned int from, to, from_id0, to_id0, from_ssid, to_ssid;
 
 	while (*str != 0 && *str != '\n') {
 		range_action ra = action;
@@ -143,23 +145,25 @@ blacklist_parse_parameters (char *str, range_action action)
 		 */
 		if (strncmp(str,"all,",4) == 0 || strcmp(str,"all") == 0 ||
 		    strncmp(str,"all\n",4) == 0 || strncmp(str,"all ",4) == 0) {
-			from = 0;
-			to = __MAX_SUBCHANNEL;
+			int j;
+
 			str += 3;
+			for (j=0; j <= __MAX_SSID; j++)
+				blacklist_range(ra, 0, __MAX_SUBCHANNEL, j);
 		} else {
 			int rc;
 
 			rc = blacklist_busid(&str, &from_id0,
-					     &from_id1, &from);
+					     &from_ssid, &from);
 			if (rc)
 				continue;
 			to = from;
 			to_id0 = from_id0;
-			to_id1 = from_id1;
+			to_ssid = from_ssid;
 			if (*str == '-') {
 				str++;
 				rc = blacklist_busid(&str, &to_id0,
-						     &to_id1, &to);
+						     &to_ssid, &to);
 				if (rc)
 					continue;
 			}
@@ -169,18 +173,19 @@ blacklist_parse_parameters (char *str, range_action action)
 					strsep(&str, ",\n"));
 				continue;
 			}
-			if ((from_id0 != to_id0) || (from_id1 != to_id1)) {
+			if ((from_id0 != to_id0) ||
+			    (from_ssid != to_ssid)) {
 				printk(KERN_WARNING "invalid cio_ignore range "
 					"%x.%x.%04x-%x.%x.%04x\n",
-					from_id0, from_id1, from,
-					to_id0, to_id1, to);
+					from_id0, from_ssid, from,
+					to_id0, to_ssid, to);
 				continue;
 			}
+			pr_debug("blacklist_setup: adding range "
+				 "from %x.%x.%04x to %x.%x.%04x\n",
+				 from_id0, from_ssid, from, to_id0, to_ssid, to);
+			blacklist_range (ra, from, to, to_ssid);
 		}
-		/* FIXME: ignoring id0 and id1 here. */
-		pr_debug("blacklist_setup: adding range "
-			 "from 0.0.%04x to 0.0.%04x\n", from, to);
-		blacklist_range (ra, from, to);
 	}
 	return 1;
 }
@@ -214,9 +219,9 @@ __setup ("cio_ignore=", blacklist_setup);
  * Used by validate_subchannel()
  */
 int
-is_blacklisted (int devno)
+is_blacklisted (int ssid, int devno)
 {
-	return test_bit (devno, bl_dev);
+	return test_bit (devno, bl_dev[ssid]);
 }
 
 #ifdef CONFIG_PROC_FS
@@ -283,6 +288,7 @@ blacklist_parse_proc_parameters (char *buf)
 /* Iterator struct for all devices. */
 struct ccwdev_iter {
 	int devno;
+	int ssid;
 	int in_range;
 };
 
@@ -291,13 +297,14 @@ cio_ignore_proc_seq_start(struct seq_file *s, loff_t *offset)
 {
 	struct ccwdev_iter *iter;
 
-	if (*offset > __MAX_SUBCHANNEL)
+	if (*offset >= (__MAX_SUBCHANNEL + 1) * (__MAX_SSID + 1))
 		return NULL;
 	iter = kmalloc(sizeof(struct ccwdev_iter), GFP_KERNEL);
 	if (!iter)
 		return ERR_PTR(-ENOMEM);
 	memset(iter, 0, sizeof(struct ccwdev_iter));
-	iter->devno = *offset;
+	iter->ssid = *offset / (__MAX_SUBCHANNEL + 1);
+	iter->devno = *offset % (__MAX_SUBCHANNEL + 1);
 	return iter;
 }
 
@@ -313,10 +320,16 @@ cio_ignore_proc_seq_next(struct seq_file *s, void *it, loff_t *offset)
 {
 	struct ccwdev_iter *iter;
 
-	if (*offset > __MAX_SUBCHANNEL)
+	if (*offset >= (__MAX_SUBCHANNEL + 1) * (__MAX_SSID + 1))
 		return NULL;
 	iter = (struct ccwdev_iter *)it;
-	iter->devno++;
+	if (iter->devno == __MAX_SUBCHANNEL) {
+		iter->devno = 0;
+		iter->ssid++;
+		if (iter->ssid > __MAX_SSID)
+			return NULL;
+	} else
+		iter->devno++;
 	(*offset)++;
 	return iter;
 }
@@ -327,23 +340,24 @@ cio_ignore_proc_seq_show(struct seq_file *s, void *it)
 	struct ccwdev_iter *iter;
 
 	iter = (struct ccwdev_iter *)it;
-	if (!is_blacklisted(iter->devno))
+	if (!is_blacklisted(iter->ssid, iter->devno))
 		/* Not blacklisted, nothing to output. */
 		return 0;
 	if (!iter->in_range) {
 		/* First device in range. */
 		if ((iter->devno == __MAX_SUBCHANNEL) ||
-		    !is_blacklisted(iter->devno + 1))
+		    !is_blacklisted(iter->ssid, iter->devno + 1))
 			/* Singular device. */
-			return seq_printf(s, "0.0.%04x\n", iter->devno);
+			return seq_printf(s, "0.%x.%04x\n",
+					  iter->ssid, iter->devno);
 		iter->in_range = 1;
-		return seq_printf(s, "0.0.%04x-", iter->devno);
+		return seq_printf(s, "0.%x.%04x-", iter->ssid, iter->devno);
 	}
 	if ((iter->devno == __MAX_SUBCHANNEL) ||
-	    !is_blacklisted(iter->devno + 1)) {
+	    !is_blacklisted(iter->ssid, iter->devno + 1)) {
 		/* Last device in range. */
 		iter->in_range = 0;
-		return seq_printf(s, "0.0.%04x\n", iter->devno);
+		return seq_printf(s, "0.%x.%04x\n", iter->ssid, iter->devno);
 	}
 	return 0;
 }
diff --git a/drivers/s390/cio/blacklist.h b/drivers/s390/cio/blacklist.h
index fb42caf..95e25c1 100644
--- a/drivers/s390/cio/blacklist.h
+++ b/drivers/s390/cio/blacklist.h
@@ -1,6 +1,6 @@
 #ifndef S390_BLACKLIST_H
 #define S390_BLACKLIST_H
 
-extern int is_blacklisted (int devno);
+extern int is_blacklisted (int ssid, int devno);
 
 #endif
diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c
index ebd9249..7270808 100644
--- a/drivers/s390/cio/chsc.c
+++ b/drivers/s390/cio/chsc.c
@@ -1,7 +1,7 @@
 /*
  *  drivers/s390/cio/chsc.c
  *   S/390 common I/O routines -- channel subsystem call
- *   $Revision: 1.120 $
+ *   $Revision: 1.126 $
  *
  *    Copyright (C) 1999-2002 IBM Deutschland Entwicklung GmbH,
  *			      IBM Corporation
@@ -75,7 +75,9 @@ chsc_get_sch_desc_irq(struct subchannel *sch, void *page)
 
 	struct {
 		struct chsc_header request;
-		u16 reserved1;
+		u16 reserved1a:10;
+		u16 ssid:2;
+		u16 reserved1b:4;
 		u16 f_sch;	  /* first subchannel */
 		u16 reserved2;
 		u16 l_sch;	  /* last subchannel */
@@ -102,6 +104,7 @@ chsc_get_sch_desc_irq(struct subchannel *sch, void *page)
 		.code   = 0x0004,
 	};
 
+	ssd_area->ssid = sch->schid.ssid;
 	ssd_area->f_sch = sch->schid.sch_no;
 	ssd_area->l_sch = sch->schid.sch_no;
 
@@ -145,8 +148,8 @@ chsc_get_sch_desc_irq(struct subchannel *sch, void *page)
 	 */
 	if (ssd_area->st > 3) { /* uhm, that looks strange... */
 		CIO_CRW_EVENT(0, "Strange subchannel type %d"
-			      " for sch %04x\n", ssd_area->st,
-			      sch->schid.sch_no);
+			      " for sch 0.%x.%04x\n", ssd_area->st,
+			      sch->schid.ssid, sch->schid.sch_no);
 		/*
 		 * There may have been a new subchannel type defined in the
 		 * time since this code was written; since we don't know which
@@ -155,8 +158,9 @@ chsc_get_sch_desc_irq(struct subchannel *sch, void *page)
 		return 0;
 	} else {
 		const char *type[4] = {"I/O", "chsc", "message", "ADM"};
-		CIO_CRW_EVENT(6, "ssd: sch %04x is %s subchannel\n",
-			      sch->schid.sch_no, type[ssd_area->st]);
+		CIO_CRW_EVENT(6, "ssd: sch 0.%x.%04x is %s subchannel\n",
+			      sch->schid.ssid, sch->schid.sch_no,
+			      type[ssd_area->st]);
 
 		sch->ssd_info.valid = 1;
 		sch->ssd_info.type = ssd_area->st;
@@ -364,7 +368,7 @@ s390_process_res_acc_new_sch(struct subchannel_id schid)
 	 * that beast may be on we'll have to do a stsch
 	 * on all devices, grr...
 	 */
-	if (stsch(schid, &schib))
+	if (stsch_err(schid, &schib))
 		/* We're through */
 		return need_rescan ? -EAGAIN : -ENXIO;
 
@@ -818,7 +822,7 @@ __s390_vary_chpid_on(struct subchannel_id schid, void *data)
 		put_device(&sch->dev);
 		return 0;
 	}
-	if (stsch(schid, &schib))
+	if (stsch_err(schid, &schib))
 		/* We're through */
 		return -ENXIO;
 	/* Put it on the slow path. */
@@ -1078,6 +1082,54 @@ chsc_alloc_sei_area(void)
 	return (sei_page ? 0 : -ENOMEM);
 }
 
+int __init
+chsc_enable_facility(int operation_code)
+{
+	int ret;
+	struct {
+		struct chsc_header request;
+		u8 reserved1:4;
+		u8 format:4;
+		u8 reserved2;
+		u16 operation_code;
+		u32 reserved3;
+		u32 reserved4;
+		u32 operation_data_area[252];
+		struct chsc_header response;
+		u32 reserved5:4;
+		u32 format2:4;
+		u32 reserved6:24;
+	} *sda_area;
+
+	sda_area = (void *)get_zeroed_page(GFP_KERNEL|GFP_DMA);
+	if (!sda_area)
+		return -ENOMEM;
+	sda_area->request = (struct chsc_header) {
+		.length = 0x0400,
+		.code = 0x0031,
+	};
+	sda_area->operation_code = operation_code;
+
+	ret = chsc(sda_area);
+	if (ret > 0) {
+		ret = (ret == 3) ? -ENODEV : -EBUSY;
+		goto out;
+	}
+	switch (sda_area->response.code) {
+	case 0x0003: /* invalid request block */
+	case 0x0007:
+		ret = -EINVAL;
+		break;
+	case 0x0004: /* command not provided */
+	case 0x0101: /* facility not provided */
+		ret = -EOPNOTSUPP;
+		break;
+	}
+ out:
+	free_page((unsigned long)sda_area);
+	return ret;
+}
+
 subsys_initcall(chsc_alloc_sei_area);
 
 struct css_general_char css_general_characteristics;
diff --git a/drivers/s390/cio/chsc.h b/drivers/s390/cio/chsc.h
index 170083c..44e4b4b 100644
--- a/drivers/s390/cio/chsc.h
+++ b/drivers/s390/cio/chsc.h
@@ -5,6 +5,8 @@
 #define CHSC_SEI_ACC_LINKADDR     2
 #define CHSC_SEI_ACC_FULLLINKADDR 3
 
+#define CHSC_SDA_OC_MSS   0x2
+
 struct chsc_header {
 	u16 length;
 	u16 code;
@@ -64,6 +66,8 @@ extern int css_characteristics_avail;
 
 extern void *chsc_get_chp_desc(struct subchannel*, int);
 
+extern int chsc_enable_facility(int);
+
 #define to_channelpath(dev) container_of(dev, struct channel_path, dev)
 
 #endif
diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c
index 3eb6cb6..6f274f4 100644
--- a/drivers/s390/cio/cio.c
+++ b/drivers/s390/cio/cio.c
@@ -1,7 +1,7 @@
 /*
  *  drivers/s390/cio/cio.c
  *   S/390 common I/O routines -- low level i/o calls
- *   $Revision: 1.135 $
+ *   $Revision: 1.138 $
  *
  *    Copyright (C) 1999-2002 IBM Deutschland Entwicklung GmbH,
  *			      IBM Corporation
@@ -166,7 +166,8 @@ cio_start_handle_notoper(struct subchannel *sch, __u8 lpm)
 	stsch (sch->schid, &sch->schib);
 
 	CIO_MSG_EVENT(0, "cio_start: 'not oper' status for "
-		      "subchannel %04x!\n", sch->schid.sch_no);
+		      "subchannel 0.%x.%04x!\n", sch->schid.ssid,
+		      sch->schid.sch_no);
 	sprintf(dbf_text, "no%s", sch->dev.bus_id);
 	CIO_TRACE_EVENT(0, dbf_text);
 	CIO_HEX_EVENT(0, &sch->schib, sizeof (struct schib));
@@ -522,15 +523,18 @@ cio_validate_subchannel (struct subchannel *sch, struct subchannel_id schid)
 	spin_lock_init(&sch->lock);
 
 	/* Set a name for the subchannel */
-	snprintf (sch->dev.bus_id, BUS_ID_SIZE, "0.0.%04x", schid.sch_no);
+	snprintf (sch->dev.bus_id, BUS_ID_SIZE, "0.%x.%04x", schid.ssid,
+		  schid.sch_no);
 
 	/*
 	 * The first subchannel that is not-operational (ccode==3)
 	 *  indicates that there aren't any more devices available.
+	 * If stsch gets an exception, it means the current subchannel set
+	 *  is not valid.
 	 */
-	ccode = stsch (schid, &sch->schib);
+	ccode = stsch_err (schid, &sch->schib);
 	if (ccode)
-		return -ENXIO;
+		return (ccode == 3) ? -ENXIO : ccode;
 
 	sch->schid = schid;
 	/* Copy subchannel type from path management control word. */
@@ -541,9 +545,9 @@ cio_validate_subchannel (struct subchannel *sch, struct subchannel_id schid)
 	 */
 	if (sch->st != 0) {
 		CIO_DEBUG(KERN_INFO, 0,
-			  "Subchannel %04X reports "
+			  "Subchannel 0.%x.%04x reports "
 			  "non-I/O subchannel type %04X\n",
-			  sch->schid.sch_no, sch->st);
+			  sch->schid.ssid, sch->schid.sch_no, sch->st);
 		/* We stop here for non-io subchannels. */
 		return sch->st;
 	}
@@ -554,26 +558,29 @@ cio_validate_subchannel (struct subchannel *sch, struct subchannel_id schid)
 		return -ENODEV;
 
 	/* Devno is valid. */
-	if (is_blacklisted (sch->schib.pmcw.dev)) {
+	if (is_blacklisted (sch->schid.ssid, sch->schib.pmcw.dev)) {
 		/*
 		 * This device must not be known to Linux. So we simply
 		 * say that there is no device and return ENODEV.
 		 */
 		CIO_MSG_EVENT(0, "Blacklisted device detected "
-			      "at devno %04X\n", sch->schib.pmcw.dev);
+			      "at devno %04X, subchannel set %x\n",
+			      sch->schib.pmcw.dev, sch->schid.ssid);
 		return -ENODEV;
 	}
 	sch->opm = 0xff;
-	chsc_validate_chpids(sch);
+	if (!cio_is_console(sch->schid))
+		chsc_validate_chpids(sch);
 	sch->lpm = sch->schib.pmcw.pim &
 		sch->schib.pmcw.pam &
 		sch->schib.pmcw.pom &
 		sch->opm;
 
 	CIO_DEBUG(KERN_INFO, 0,
-		  "Detected device %04X on subchannel %04X"
+		  "Detected device %04x on subchannel 0.%x.%04X"
 		  " - PIM = %02X, PAM = %02X, POM = %02X\n",
-		  sch->schib.pmcw.dev, sch->schid.sch_no, sch->schib.pmcw.pim,
+		  sch->schib.pmcw.dev, sch->schid.ssid,
+		  sch->schid.sch_no, sch->schib.pmcw.pim,
 		  sch->schib.pmcw.pam, sch->schib.pmcw.pom);
 
 	/*
@@ -693,7 +700,7 @@ wait_cons_dev (void)
 static int
 cio_test_for_console(struct subchannel_id schid, void *data)
 {
-	if (stsch(schid, &console_subchannel.schib) != 0)
+	if (stsch_err(schid, &console_subchannel.schib) != 0)
 		return -ENXIO;
 	if (console_subchannel.schib.pmcw.dnv &&
 	    console_subchannel.schib.pmcw.dev ==
@@ -841,7 +848,7 @@ __shutdown_subchannel_easy(struct subchannel_id schid, void *data)
 {
 	struct schib schib;
 
-	if (stsch(schid, &schib))
+	if (stsch_err(schid, &schib))
 		return -ENXIO;
 	if (!schib.pmcw.ena)
 		return 0;
diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
index b6225cb..9e9d4a1 100644
--- a/drivers/s390/cio/css.c
+++ b/drivers/s390/cio/css.c
@@ -1,7 +1,7 @@
 /*
  *  drivers/s390/cio/css.c
  *  driver for channel subsystem
- *   $Revision: 1.85 $
+ *   $Revision: 1.93 $
  *
  *    Copyright (C) 2002 IBM Deutschland Entwicklung GmbH,
  *			 IBM Corporation
@@ -23,6 +23,7 @@
 
 int need_rescan = 0;
 int css_init_done = 0;
+static int max_ssid = 0;
 
 struct channel_subsystem *css[__MAX_CSSID + 1];
 
@@ -37,10 +38,13 @@ for_each_subchannel(int(*fn)(struct subchannel_id, void *), void *data)
 	init_subchannel_id(&schid);
 	ret = -ENODEV;
 	do {
-		ret = fn(schid, data);
-		if (ret)
-			break;
-	} while (schid.sch_no++ < __MAX_SUBCHANNEL);
+		do {
+			ret = fn(schid, data);
+			if (ret)
+				break;
+		} while (schid.sch_no++ < __MAX_SUBCHANNEL);
+		schid.sch_no = 0;
+	} while (schid.ssid++ < max_ssid);
 	return ret;
 }
 
@@ -205,8 +209,8 @@ css_evaluate_subchannel(struct subchannel_id schid, int slow)
 		return -EAGAIN; /* Will be done on the slow path. */
 	}
 	event = css_get_subchannel_status(sch, schid);
-	CIO_MSG_EVENT(4, "Evaluating schid %04x, event %d, %s, %s path.\n",
-		      schid.sch_no, event,
+	CIO_MSG_EVENT(4, "Evaluating schid 0.%x.%04x, event %d, %s, %s path.\n",
+		      schid.ssid, schid.sch_no, event,
 		      sch?(disc?"disconnected":"normal"):"unknown",
 		      slow?"slow":"fast");
 	switch (event) {
@@ -352,19 +356,23 @@ css_reiterate_subchannels(void)
  * Called from the machine check handler for subchannel report words.
  */
 int
-css_process_crw(int irq)
+css_process_crw(int rsid1, int rsid2)
 {
 	int ret;
 	struct subchannel_id mchk_schid;
 
-	CIO_CRW_EVENT(2, "source is subchannel %04X\n", irq);
+	CIO_CRW_EVENT(2, "source is subchannel %04X, subsystem id %x\n",
+		      rsid1, rsid2);
 
 	if (need_rescan)
 		/* We need to iterate all subchannels anyway. */
 		return -EAGAIN;
 
 	init_subchannel_id(&mchk_schid);
-	mchk_schid.sch_no = irq;
+	mchk_schid.sch_no = rsid1;
+	if (rsid2 != 0)
+		mchk_schid.ssid = (rsid2 >> 8) & 3;
+
 	/* 
 	 * Since we are always presented with IPI in the CRW, we have to
 	 * use stsch() to find out if the subchannel in question has come
@@ -465,12 +473,23 @@ init_channel_subsystem (void)
 	if ((ret = bus_register(&css_bus_type)))
 		goto out;
 
+	/* Try to enable MSS. */
+	ret = chsc_enable_facility(CHSC_SDA_OC_MSS);
+	switch (ret) {
+	case 0: /* Success. */
+		max_ssid = __MAX_SSID;
+		break;
+	case -ENOMEM:
+		goto out_bus;
+	default:
+		max_ssid = 0;
+	}
 	/* Setup css structure. */
 	for (i = 0; i <= __MAX_CSSID; i++) {
 		css[i] = kmalloc(sizeof(struct channel_subsystem), GFP_KERNEL);
 		if (!css[i]) {
 			ret = -ENOMEM;
-			goto out_bus;
+			goto out_unregister;
 		}
 		setup_css(i);
 		ret = device_register(&css[i]->device);
@@ -485,11 +504,12 @@ init_channel_subsystem (void)
 	return 0;
 out_free:
 	kfree(css[i]);
-out_bus:
+out_unregister:
 	while (i > 0) {
 		i--;
 		device_unregister(&css[i]->device);
 	}
+out_bus:
 	bus_unregister(&css_bus_type);
 out:
 	return ret;
diff --git a/drivers/s390/cio/css.h b/drivers/s390/cio/css.h
index b74659c..251ebd7 100644
--- a/drivers/s390/cio/css.h
+++ b/drivers/s390/cio/css.h
@@ -77,6 +77,7 @@ struct ccw_device_private {
 	unsigned long registered;
 	__u16 devno;		/* device number */
 	__u16 sch_no;		/* subchannel number */
+	__u8 ssid;              /* subchannel set id */
 	__u8 imask;		/* lpm mask for SNID/SID/SPGID */
 	int iretry;		/* retry counter SNID/SID/SPGID */
 	struct {
@@ -135,6 +136,7 @@ extern int css_init_done;
 extern int for_each_subchannel(int(*fn)(struct subchannel_id, void *), void *);
 
 #define __MAX_SUBCHANNEL 65535
+#define __MAX_SSID 3
 #define __MAX_CHPID 255
 #define __MAX_CSSID 0
 
diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index ba9f7c1..fa3e4c0 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -535,7 +535,8 @@ ccw_device_register(struct ccw_device *cdev)
 }
 
 struct match_data {
-	unsigned int  devno;
+	unsigned int devno;
+	unsigned int ssid;
 	struct ccw_device * sibling;
 };
 
@@ -548,6 +549,7 @@ match_devno(struct device * dev, void * data)
 	cdev = to_ccwdev(dev);
 	if ((cdev->private->state == DEV_STATE_DISCONNECTED) &&
 	    (cdev->private->devno == d->devno) &&
+	    (cdev->private->ssid == d->ssid) &&
 	    (cdev != d->sibling)) {
 		cdev->private->state = DEV_STATE_NOT_OPER;
 		return 1;
@@ -556,11 +558,13 @@ match_devno(struct device * dev, void * data)
 }
 
 static struct ccw_device *
-get_disc_ccwdev_by_devno(unsigned int devno, struct ccw_device *sibling)
+get_disc_ccwdev_by_devno(unsigned int devno, unsigned int ssid,
+			 struct ccw_device *sibling)
 {
 	struct device *dev;
 	struct match_data data = {
-		.devno  = devno,
+		.devno   = devno,
+		.ssid    = ssid,
 		.sibling = sibling,
 	};
 
@@ -616,7 +620,7 @@ ccw_device_do_unreg_rereg(void *data)
 
 		need_rename = 1;
 		other_cdev = get_disc_ccwdev_by_devno(sch->schib.pmcw.dev,
-						      cdev);
+						      sch->schid.ssid, cdev);
 		if (other_cdev) {
 			struct subchannel *other_sch;
 
@@ -639,8 +643,8 @@ ccw_device_do_unreg_rereg(void *data)
 	if (test_and_clear_bit(1, &cdev->private->registered))
 		device_del(&cdev->dev);
 	if (need_rename)
-		snprintf (cdev->dev.bus_id, BUS_ID_SIZE, "0.0.%04x",
-			  sch->schib.pmcw.dev);
+		snprintf (cdev->dev.bus_id, BUS_ID_SIZE, "0.%x.%04x",
+			  sch->schid.ssid, sch->schib.pmcw.dev);
 	PREPARE_WORK(&cdev->private->kick_work,
 		     ccw_device_add_changed, (void *)cdev);
 	queue_work(ccw_device_work, &cdev->private->kick_work);
@@ -769,9 +773,11 @@ io_subchannel_recog(struct ccw_device *cdev, struct subchannel *sch)
 	sch->dev.driver_data = cdev;
 	sch->driver = &io_subchannel_driver;
 	cdev->ccwlock = &sch->lock;
+
 	/* Init private data. */
 	priv = cdev->private;
 	priv->devno = sch->schib.pmcw.dev;
+	priv->ssid = sch->schid.ssid;
 	priv->sch_no = sch->schid.sch_no;
 	priv->state = DEV_STATE_NOT_OPER;
 	INIT_LIST_HEAD(&priv->cmb_list);
@@ -779,8 +785,8 @@ io_subchannel_recog(struct ccw_device *cdev, struct subchannel *sch)
 	init_timer(&priv->timer);
 
 	/* Set an initial name for the device. */
-	snprintf (cdev->dev.bus_id, BUS_ID_SIZE, "0.0.%04x",
-		  sch->schib.pmcw.dev);
+	snprintf (cdev->dev.bus_id, BUS_ID_SIZE, "0.%x.%04x",
+		  sch->schid.ssid, sch->schib.pmcw.dev);
 
 	/* Increase counter of devices currently in recognition. */
 	atomic_inc(&ccw_device_init_count);
diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c
index 9efeae7..23d12b6 100644
--- a/drivers/s390/cio/device_fsm.c
+++ b/drivers/s390/cio/device_fsm.c
@@ -257,8 +257,9 @@ ccw_device_recog_done(struct ccw_device *cdev, int state)
 	switch (state) {
 	case DEV_STATE_NOT_OPER:
 		CIO_DEBUG(KERN_WARNING, 2,
-			  "SenseID : unknown device %04x on subchannel %04x\n",
-			  cdev->private->devno, sch->schid.sch_no);
+			  "SenseID : unknown device %04x on subchannel "
+			  "0.%x.%04x\n", cdev->private->devno,
+			  sch->schid.ssid, sch->schid.sch_no);
 		break;
 	case DEV_STATE_OFFLINE:
 		if (cdev->private->state == DEV_STATE_DISCONNECTED_SENSE_ID) {
@@ -282,16 +283,18 @@ ccw_device_recog_done(struct ccw_device *cdev, int state)
 			return;
 		}
 		/* Issue device info message. */
-		CIO_DEBUG(KERN_INFO, 2, "SenseID : device %04x reports: "
+		CIO_DEBUG(KERN_INFO, 2, "SenseID : device 0.%x.%04x reports: "
 			  "CU  Type/Mod = %04X/%02X, Dev Type/Mod = "
-			  "%04X/%02X\n", cdev->private->devno,
+			  "%04X/%02X\n",
+			  cdev->private->ssid, cdev->private->devno,
 			  cdev->id.cu_type, cdev->id.cu_model,
 			  cdev->id.dev_type, cdev->id.dev_model);
 		break;
 	case DEV_STATE_BOXED:
 		CIO_DEBUG(KERN_WARNING, 2,
-			  "SenseID : boxed device %04x on subchannel %04x\n",
-			  cdev->private->devno, sch->schid.sch_no);
+			  "SenseID : boxed device %04x on subchannel "
+			  "0.%x.%04x\n", cdev->private->devno,
+			  sch->schid.ssid, sch->schid.sch_no);
 		break;
 	}
 	cdev->private->state = state;
diff --git a/drivers/s390/cio/device_id.c b/drivers/s390/cio/device_id.c
index 207881e..3c77c3f 100644
--- a/drivers/s390/cio/device_id.c
+++ b/drivers/s390/cio/device_id.c
@@ -256,16 +256,17 @@ ccw_device_check_sense_id(struct ccw_device *cdev)
 		 *     sense id information. So, for intervention required,
 		 *     we use the "whack it until it talks" strategy...
 		 */
-		CIO_MSG_EVENT(2, "SenseID : device %04x on Subchannel %04x "
-			      "reports cmd reject\n",
-			      cdev->private->devno, sch->schid.sch_no);
+		CIO_MSG_EVENT(2, "SenseID : device %04x on Subchannel "
+			      "0.%x.%04x reports cmd reject\n",
+			      cdev->private->devno, sch->schid.ssid,
+			      sch->schid.sch_no);
 		return -EOPNOTSUPP;
 	}
 	if (irb->esw.esw0.erw.cons) {
-		CIO_MSG_EVENT(2, "SenseID : UC on dev %04x, "
+		CIO_MSG_EVENT(2, "SenseID : UC on dev 0.%x.%04x, "
 			      "lpum %02X, cnt %02d, sns :"
 			      " %02X%02X%02X%02X %02X%02X%02X%02X ...\n",
-			      cdev->private->devno,
+			      cdev->private->ssid, cdev->private->devno,
 			      irb->esw.esw0.sublog.lpum,
 			      irb->esw.esw0.erw.scnt,
 			      irb->ecw[0], irb->ecw[1],
@@ -277,16 +278,17 @@ ccw_device_check_sense_id(struct ccw_device *cdev)
 	if (irb->scsw.cc == 3) {
 		if ((sch->orb.lpm &
 		     sch->schib.pmcw.pim & sch->schib.pmcw.pam) != 0)
-			CIO_MSG_EVENT(2, "SenseID : path %02X for device %04x on"
-				      " subchannel %04x is 'not operational'\n",
-				      sch->orb.lpm, cdev->private->devno,
+			CIO_MSG_EVENT(2, "SenseID : path %02X for device %04x "
+				      "on subchannel 0.%x.%04x is "
+				      "'not operational'\n", sch->orb.lpm,
+				      cdev->private->devno, sch->schid.ssid,
 				      sch->schid.sch_no);
 		return -EACCES;
 	}
 	/* Hmm, whatever happened, try again. */
 	CIO_MSG_EVENT(2, "SenseID : start_IO() for device %04x on "
-		      "subchannel %04x returns status %02X%02X\n",
-		      cdev->private->devno, sch->schid.sch_no,
+		      "subchannel 0.%x.%04x returns status %02X%02X\n",
+		      cdev->private->devno, sch->schid.ssid, sch->schid.sch_no,
 		      irb->scsw.dstat, irb->scsw.cstat);
 	return -EAGAIN;
 }
diff --git a/drivers/s390/cio/device_pgid.c b/drivers/s390/cio/device_pgid.c
index 3c89d70..052832d 100644
--- a/drivers/s390/cio/device_pgid.c
+++ b/drivers/s390/cio/device_pgid.c
@@ -57,10 +57,10 @@ __ccw_device_sense_pgid_start(struct ccw_device *cdev)
 			if (ret != -EACCES)
 				return ret;
 			CIO_MSG_EVENT(2, "SNID - Device %04x on Subchannel "
-				      "%04x, lpm %02X, became 'not "
+				      "0.%x.%04x, lpm %02X, became 'not "
 				      "operational'\n",
-				      cdev->private->devno, sch->schid.sch_no,
-				      cdev->private->imask);
+				      cdev->private->devno, sch->schid.ssid,
+				      sch->schid.sch_no, cdev->private->imask);
 
 		}
 		cdev->private->imask >>= 1;
@@ -106,10 +106,10 @@ __ccw_device_check_sense_pgid(struct ccw_device *cdev)
 		return -EOPNOTSUPP;
 	}
 	if (irb->esw.esw0.erw.cons) {
-		CIO_MSG_EVENT(2, "SNID - device %04x, unit check, "
+		CIO_MSG_EVENT(2, "SNID - device 0.%x.%04x, unit check, "
 			      "lpum %02X, cnt %02d, sns : "
 			      "%02X%02X%02X%02X %02X%02X%02X%02X ...\n",
-			      cdev->private->devno,
+			      cdev->private->ssid, cdev->private->devno,
 			      irb->esw.esw0.sublog.lpum,
 			      irb->esw.esw0.erw.scnt,
 			      irb->ecw[0], irb->ecw[1],
@@ -119,16 +119,17 @@ __ccw_device_check_sense_pgid(struct ccw_device *cdev)
 		return -EAGAIN;
 	}
 	if (irb->scsw.cc == 3) {
-		CIO_MSG_EVENT(2, "SNID - Device %04x on Subchannel "
-			      "%04x, lpm %02X, became 'not operational'\n",
-			      cdev->private->devno, sch->schid.sch_no,
-			      sch->orb.lpm);
+		CIO_MSG_EVENT(2, "SNID - Device %04x on Subchannel 0.%x.%04x,"
+			      " lpm %02X, became 'not operational'\n",
+			      cdev->private->devno, sch->schid.ssid,
+			      sch->schid.sch_no, sch->orb.lpm);
 		return -EACCES;
 	}
 	if (cdev->private->pgid.inf.ps.state2 == SNID_STATE2_RESVD_ELSE) {
-		CIO_MSG_EVENT(2, "SNID - Device %04x on Subchannel %04x "
+		CIO_MSG_EVENT(2, "SNID - Device %04x on Subchannel 0.%x.%04x "
 			      "is reserved by someone else\n",
-			      cdev->private->devno, sch->schid.sch_no);
+			      cdev->private->devno, sch->schid.ssid,
+			      sch->schid.sch_no);
 		return -EUSERS;
 	}
 	return 0;
@@ -237,8 +238,9 @@ __ccw_device_do_pgid(struct ccw_device *cdev, __u8 func)
 	sch->lpm &= ~cdev->private->imask;
 	sch->vpm &= ~cdev->private->imask;
 	CIO_MSG_EVENT(2, "SPID - Device %04x on Subchannel "
-		      "%04x, lpm %02X, became 'not operational'\n",
-		      cdev->private->devno, sch->schid.sch_no, cdev->private->imask);
+		      "0.%x.%04x, lpm %02X, became 'not operational'\n",
+		      cdev->private->devno, sch->schid.ssid,
+		      sch->schid.sch_no, cdev->private->imask);
 	return ret;
 }
 
@@ -260,8 +262,10 @@ __ccw_device_check_pgid(struct ccw_device *cdev)
 		if (irb->ecw[0] & SNS0_CMD_REJECT)
 			return -EOPNOTSUPP;
 		/* Hmm, whatever happened, try again. */
-		CIO_MSG_EVENT(2, "SPID - device %04x, unit check, cnt %02d, "
+		CIO_MSG_EVENT(2, "SPID - device 0.%x.%04x, unit check, "
+			      "cnt %02d, "
 			      "sns : %02X%02X%02X%02X %02X%02X%02X%02X ...\n",
+			      cdev->private->ssid,
 			      cdev->private->devno, irb->esw.esw0.erw.scnt,
 			      irb->ecw[0], irb->ecw[1],
 			      irb->ecw[2], irb->ecw[3],
@@ -270,10 +274,10 @@ __ccw_device_check_pgid(struct ccw_device *cdev)
 		return -EAGAIN;
 	}
 	if (irb->scsw.cc == 3) {
-		CIO_MSG_EVENT(2, "SPID - Device %04x on Subchannel "
-			      "%04x, lpm %02X, became 'not operational'\n",
-			      cdev->private->devno, sch->schid.sch_no,
-			      cdev->private->imask);
+		CIO_MSG_EVENT(2, "SPID - Device %04x on Subchannel 0.%x.%04x,"
+			      " lpm %02X, became 'not operational'\n",
+			      cdev->private->devno, sch->schid.ssid,
+			      sch->schid.sch_no, cdev->private->imask);
 		return -EACCES;
 	}
 	return 0;
diff --git a/drivers/s390/cio/device_status.c b/drivers/s390/cio/device_status.c
index 929f8fb..db09c20 100644
--- a/drivers/s390/cio/device_status.c
+++ b/drivers/s390/cio/device_status.c
@@ -36,9 +36,10 @@ ccw_device_msg_control_check(struct ccw_device *cdev, struct irb *irb)
 		
 	CIO_MSG_EVENT(0, "Channel-Check or Interface-Control-Check "
 		      "received"
-		      " ... device %04X on subchannel %04X, dev_stat "
+		      " ... device %04x on subchannel 0.%x.%04x, dev_stat "
 		      ": %02X sch_stat : %02X\n",
-		      cdev->private->devno, cdev->private->sch_no,
+		      cdev->private->devno, cdev->private->ssid,
+		      cdev->private->sch_no,
 		      irb->scsw.dstat, irb->scsw.cstat);
 
 	if (irb->scsw.cc != 3) {
@@ -61,8 +62,9 @@ ccw_device_path_notoper(struct ccw_device *cdev)
 	sch = to_subchannel(cdev->dev.parent);
 	stsch (sch->schid, &sch->schib);
 
-	CIO_MSG_EVENT(0, "%s(%04x) - path(s) %02x are "
-		      "not operational \n", __FUNCTION__, sch->schid.sch_no,
+	CIO_MSG_EVENT(0, "%s(0.%x.%04x) - path(s) %02x are "
+		      "not operational \n", __FUNCTION__,
+		      sch->schid.ssid, sch->schid.sch_no,
 		      sch->schib.pmcw.pnom);
 
 	sch->lpm &= ~sch->schib.pmcw.pnom;
diff --git a/drivers/s390/cio/ioasm.h b/drivers/s390/cio/ioasm.h
index 66c882e..62b0e2a 100644
--- a/drivers/s390/cio/ioasm.h
+++ b/drivers/s390/cio/ioasm.h
@@ -38,6 +38,35 @@ static inline int stsch(struct subchannel_id schid,
 	return ccode;
 }
 
+static inline int stsch_err(struct subchannel_id schid,
+				volatile struct schib *addr)
+{
+	int ccode;
+
+	__asm__ __volatile__(
+		"    lhi  %0,%3\n"
+		"    lr	  1,%1\n"
+		"    stsch 0(%2)\n"
+		"0:  ipm  %0\n"
+		"    srl  %0,28\n"
+		"1:\n"
+#ifdef CONFIG_ARCH_S390X
+		".section __ex_table,\"a\"\n"
+		"   .align 8\n"
+		"   .quad 0b,1b\n"
+		".previous"
+#else
+		".section __ex_table,\"a\"\n"
+		"   .align 4\n"
+		"   .long 0b,1b\n"
+		".previous"
+#endif
+		: "=&d" (ccode)
+		: "d" (schid), "a" (addr), "K" (-EIO), "m" (*addr)
+		: "cc", "1" );
+	return ccode;
+}
+
 static inline int msch(struct subchannel_id schid,
 			   volatile struct schib *addr)
 {
diff --git a/drivers/s390/cio/qdio.c b/drivers/s390/cio/qdio.c
index 5c7001b..035c77a 100644
--- a/drivers/s390/cio/qdio.c
+++ b/drivers/s390/cio/qdio.c
@@ -56,7 +56,7 @@
 #include "ioasm.h"
 #include "chsc.h"
 
-#define VERSION_QDIO_C "$Revision: 1.113 $"
+#define VERSION_QDIO_C "$Revision: 1.114 $"
 
 /****************** MODULE PARAMETER VARIABLES ********************/
 MODULE_AUTHOR("Utz Bacher <utz.bacher@de.ibm.com>");
@@ -2066,21 +2066,22 @@ qdio_timeout_handler(struct ccw_device *cdev)
 
 	switch (irq_ptr->state) {
 	case QDIO_IRQ_STATE_INACTIVE:
-		QDIO_PRINT_ERR("establish queues on irq %04x: timed out\n",
-			       irq_ptr->schid.sch_no);
+		QDIO_PRINT_ERR("establish queues on irq 0.%x.%04x: timed out\n",
+			       irq_ptr->schid.ssid, irq_ptr->schid.sch_no);
 		QDIO_DBF_TEXT2(1,setup,"eq:timeo");
 		qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ERR);
 		break;
 	case QDIO_IRQ_STATE_CLEANUP:
-		QDIO_PRINT_INFO("Did not get interrupt on cleanup, irq=0x%x.\n",
-				irq_ptr->schid.sch_no);
+		QDIO_PRINT_INFO("Did not get interrupt on cleanup, "
+				"irq=0.%x.%x.\n",
+				irq_ptr->schid.ssid, irq_ptr->schid.sch_no);
 		qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ERR);
 		break;
 	case QDIO_IRQ_STATE_ESTABLISHED:
 	case QDIO_IRQ_STATE_ACTIVE:
 		/* I/O has been terminated by common I/O layer. */
-		QDIO_PRINT_INFO("Queues on irq %04x killed by cio.\n",
-				irq_ptr->schid.sch_no);
+		QDIO_PRINT_INFO("Queues on irq 0.%x.%04x killed by cio.\n",
+				irq_ptr->schid.ssid, irq_ptr->schid.sch_no);
 		QDIO_DBF_TEXT2(1, trace, "cio:term");
 		qdio_set_state(irq_ptr, QDIO_IRQ_STATE_STOPPED);
 		if (get_device(&cdev->dev)) {
@@ -2273,7 +2274,9 @@ qdio_get_ssqd_information(struct qdio_irq *irq_ptr)
 	unsigned char qdioac;
 	struct {
 		struct chsc_header request;
-		u16 reserved1;
+		u16 reserved1:10;
+		u16 ssid:2;
+		u16 fmt:4;
 		u16 first_sch;
 		u16 reserved2;
 		u16 last_sch;
@@ -2318,12 +2321,13 @@ qdio_get_ssqd_information(struct qdio_irq *irq_ptr)
 	};
 	ssqd_area->first_sch = irq_ptr->schid.sch_no;
 	ssqd_area->last_sch = irq_ptr->schid.sch_no;
+	ssqd_area->ssid = irq_ptr->schid.ssid;
 	result = chsc(ssqd_area);
 
 	if (result) {
 		QDIO_PRINT_WARN("CHSC returned cc %i. Using all " \
-				"SIGAs for sch x%x.\n",
-				result, irq_ptr->schid.sch_no);
+				"SIGAs for sch 0.%x.%x.\n", result,
+				irq_ptr->schid.ssid, irq_ptr->schid.sch_no);
 		qdioac = CHSC_FLAG_SIGA_INPUT_NECESSARY ||
 			CHSC_FLAG_SIGA_OUTPUT_NECESSARY ||
 			CHSC_FLAG_SIGA_SYNC_NECESSARY; /* all flags set */
@@ -2333,8 +2337,9 @@ qdio_get_ssqd_information(struct qdio_irq *irq_ptr)
 
 	if (ssqd_area->response.code != QDIO_CHSC_RESPONSE_CODE_OK) {
 		QDIO_PRINT_WARN("response upon checking SIGA needs " \
-				"is 0x%x. Using all SIGAs for sch x%x.\n",
-				ssqd_area->response.code, irq_ptr->schid.sch_no);
+				"is 0x%x. Using all SIGAs for sch 0.%x.%x.\n",
+				ssqd_area->response.code,
+				irq_ptr->schid.ssid, irq_ptr->schid.sch_no);
 		qdioac = CHSC_FLAG_SIGA_INPUT_NECESSARY ||
 			CHSC_FLAG_SIGA_OUTPUT_NECESSARY ||
 			CHSC_FLAG_SIGA_SYNC_NECESSARY; /* all flags set */
@@ -2344,8 +2349,9 @@ qdio_get_ssqd_information(struct qdio_irq *irq_ptr)
 	if (!(ssqd_area->flags & CHSC_FLAG_QDIO_CAPABILITY) ||
 	    !(ssqd_area->flags & CHSC_FLAG_VALIDITY) ||
 	    (ssqd_area->sch != irq_ptr->schid.sch_no)) {
-		QDIO_PRINT_WARN("huh? problems checking out sch x%x... " \
-				"using all SIGAs.\n",irq_ptr->schid.sch_no);
+		QDIO_PRINT_WARN("huh? problems checking out sch 0.%x.%x... " \
+				"using all SIGAs.\n",
+				irq_ptr->schid.ssid, irq_ptr->schid.sch_no);
 		qdioac = CHSC_FLAG_SIGA_INPUT_NECESSARY |
 			CHSC_FLAG_SIGA_OUTPUT_NECESSARY |
 			CHSC_FLAG_SIGA_SYNC_NECESSARY; /* worst case */
@@ -2453,7 +2459,8 @@ tiqdio_set_subchannel_ind(struct qdio_irq *irq_ptr, int reset_to_zero)
 	scssc_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
 	if (!scssc_area) {
 		QDIO_PRINT_WARN("No memory for setting indicators on " \
-				"subchannel x%x.\n", irq_ptr->schid.sch_no);
+				"subchannel 0.%x.%x.\n",
+				irq_ptr->schid.ssid, irq_ptr->schid.sch_no);
 		return -ENOMEM;
 	}
 	scssc_area->request = (struct chsc_header) {
@@ -2479,8 +2486,9 @@ tiqdio_set_subchannel_ind(struct qdio_irq *irq_ptr, int reset_to_zero)
 
 	result = chsc(scssc_area);
 	if (result) {
-		QDIO_PRINT_WARN("could not set indicators on irq x%x, " \
-				"cc=%i.\n",irq_ptr->schid.sch_no,result);
+		QDIO_PRINT_WARN("could not set indicators on irq 0.%x.%x, " \
+				"cc=%i.\n",
+				irq_ptr->schid.ssid, irq_ptr->schid.sch_no,result);
 		result = -EIO;
 		goto out;
 	}
@@ -2536,7 +2544,8 @@ tiqdio_set_delay_target(struct qdio_irq *irq_ptr, unsigned long delay_target)
 	scsscf_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
 	if (!scsscf_area) {
 		QDIO_PRINT_WARN("No memory for setting delay target on " \
-				"subchannel x%x.\n", irq_ptr->schid.sch_no);
+				"subchannel 0.%x.%x.\n",
+				irq_ptr->schid.ssid, irq_ptr->schid.sch_no);
 		return -ENOMEM;
 	}
 	scsscf_area->request = (struct chsc_header) {
@@ -2548,8 +2557,9 @@ tiqdio_set_delay_target(struct qdio_irq *irq_ptr, unsigned long delay_target)
 
 	result=chsc(scsscf_area);
 	if (result) {
-		QDIO_PRINT_WARN("could not set delay target on irq x%x, " \
-				"cc=%i. Continuing.\n",irq_ptr->schid.sch_no,
+		QDIO_PRINT_WARN("could not set delay target on irq 0.%x.%x, " \
+				"cc=%i. Continuing.\n",
+				irq_ptr->schid.ssid, irq_ptr->schid.sch_no,
 				result);
 		result = -EIO;
 		goto out;
@@ -2870,8 +2880,9 @@ qdio_establish_irq_check_for_errors(struct ccw_device *cdev, int cstat,
 		QDIO_DBF_HEX2(0,trace,&dstat,sizeof(int));
 		QDIO_DBF_HEX2(0,trace,&cstat,sizeof(int));
 		QDIO_PRINT_ERR("received check condition on establish " \
-			       "queues on irq 0x%x (cs=x%x, ds=x%x).\n",
-			       irq_ptr->schid.sch_no,cstat,dstat);
+			       "queues on irq 0.%x.%x (cs=x%x, ds=x%x).\n",
+			       irq_ptr->schid.ssid, irq_ptr->schid.sch_no,
+			       cstat,dstat);
 		qdio_set_state(irq_ptr,QDIO_IRQ_STATE_ERR);
 	}
 	
@@ -2879,9 +2890,10 @@ qdio_establish_irq_check_for_errors(struct ccw_device *cdev, int cstat,
 		QDIO_DBF_TEXT2(1,setup,"eq:no de");
 		QDIO_DBF_HEX2(0,setup,&dstat, sizeof(dstat));
 		QDIO_DBF_HEX2(0,setup,&cstat, sizeof(cstat));
-		QDIO_PRINT_ERR("establish queues on irq %04x: didn't get "
+		QDIO_PRINT_ERR("establish queues on irq 0.%x.%04x: didn't get "
 			       "device end: dstat=%02x, cstat=%02x\n",
-			       irq_ptr->schid.sch_no, dstat, cstat);
+			       irq_ptr->schid.ssid, irq_ptr->schid.sch_no,
+			       dstat, cstat);
 		qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ERR);
 		return 1;
 	}
@@ -2890,9 +2902,9 @@ qdio_establish_irq_check_for_errors(struct ccw_device *cdev, int cstat,
 		QDIO_DBF_TEXT2(1,setup,"eq:badio");
 		QDIO_DBF_HEX2(0,setup,&dstat, sizeof(dstat));
 		QDIO_DBF_HEX2(0,setup,&cstat, sizeof(cstat));
-		QDIO_PRINT_ERR("establish queues on irq %04x: got "
+		QDIO_PRINT_ERR("establish queues on irq 0.%x.%04x: got "
 			       "the following devstat: dstat=%02x, "
-			       "cstat=%02x\n",
+			       "cstat=%02x\n", irq_ptr->schid.ssid,
 			       irq_ptr->schid.sch_no, dstat, cstat);
 		qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ERR);
 		return 1;
@@ -3041,7 +3053,8 @@ int qdio_fill_irq(struct qdio_initialize *init_data)
 		QDIO_DBF_HEX1(0,setup,&irq_ptr->dev_st_chg_ind,sizeof(void*));
 		if (!irq_ptr->dev_st_chg_ind) {
 			QDIO_PRINT_WARN("no indicator location available " \
-					"for irq 0x%x\n",irq_ptr->schid.sch_no);
+					"for irq 0.%x.%x\n",
+					irq_ptr->schid.ssid, irq_ptr->schid.sch_no);
 			qdio_release_irq_memory(irq_ptr);
 			return -ENOBUFS;
 		}
@@ -3198,9 +3211,10 @@ qdio_establish(struct qdio_initialize *init_data)
 			sprintf(dbf_text,"eq:io%4x",result);
 			QDIO_DBF_TEXT2(1,setup,dbf_text);
 		}
-		QDIO_PRINT_WARN("establish queues on irq %04x: do_IO " \
-                           "returned %i, next try returned %i\n",
-                           irq_ptr->schid.sch_no,result,result2);
+		QDIO_PRINT_WARN("establish queues on irq 0.%x.%04x: do_IO " \
+				"returned %i, next try returned %i\n",
+				irq_ptr->schid.ssid, irq_ptr->schid.sch_no,
+				result, result2);
 		result=result2;
 		if (result)
 			ccw_device_set_timeout(cdev, 0);
@@ -3298,9 +3312,10 @@ qdio_activate(struct ccw_device *cdev, int flags)
 			sprintf(dbf_text,"aq:io%4x",result);
 			QDIO_DBF_TEXT2(1,setup,dbf_text);
 		}
-		QDIO_PRINT_WARN("activate queues on irq %04x: do_IO " \
-                           "returned %i, next try returned %i\n",
-                           irq_ptr->schid.sch_no,result,result2);
+		QDIO_PRINT_WARN("activate queues on irq 0.%x.%04x: do_IO " \
+				"returned %i, next try returned %i\n",
+				irq_ptr->schid.ssid, irq_ptr->schid.sch_no,
+				result, result2);
 		result=result2;
 	}
 
diff --git a/drivers/s390/cio/schid.h b/drivers/s390/cio/schid.h
index 220d978..54328fe 100644
--- a/drivers/s390/cio/schid.h
+++ b/drivers/s390/cio/schid.h
@@ -2,7 +2,8 @@
 #define S390_SCHID_H
 
 struct subchannel_id {
-	__u32 reserved:15;
+	__u32 reserved:13;
+	__u32 ssid:2;
 	__u32 one:1;
 	__u32 sch_no:16;
 } __attribute__ ((packed,aligned(4)));
diff --git a/drivers/s390/s390mach.c b/drivers/s390/s390mach.c
index 4191fd9..7dad597 100644
--- a/drivers/s390/s390mach.c
+++ b/drivers/s390/s390mach.c
@@ -23,7 +23,7 @@
 
 static struct semaphore m_sem;
 
-extern int css_process_crw(int);
+extern int css_process_crw(int, int);
 extern int chsc_process_crw(void);
 extern int chp_process_crw(int, int);
 extern void css_reiterate_subchannels(void);
@@ -49,9 +49,10 @@ s390_handle_damage(char *msg)
 static int
 s390_collect_crw_info(void *param)
 {
-	struct crw crw;
+	struct crw crw[2];
 	int ccode, ret, slow;
 	struct semaphore *sem;
+	unsigned int chain;
 
 	sem = (struct semaphore *)param;
 	/* Set a nice name. */
@@ -59,25 +60,50 @@ s390_collect_crw_info(void *param)
 repeat:
 	down_interruptible(sem);
 	slow = 0;
+	chain = 0;
 	while (1) {
-		ccode = stcrw(&crw);
+		if (unlikely(chain > 1)) {
+			struct crw tmp_crw;
+
+			printk(KERN_WARNING"%s: Code does not support more "
+			       "than two chained crws; please report to "
+			       "linux390@de.ibm.com!\n", __FUNCTION__);
+			ccode = stcrw(&tmp_crw);
+			printk(KERN_WARNING"%s: crw reports slct=%d, oflw=%d, "
+			       "chn=%d, rsc=%X, anc=%d, erc=%X, rsid=%X\n",
+			       __FUNCTION__, tmp_crw.slct, tmp_crw.oflw,
+			       tmp_crw.chn, tmp_crw.rsc, tmp_crw.anc,
+			       tmp_crw.erc, tmp_crw.rsid);
+			printk(KERN_WARNING"%s: This was crw number %x in the "
+			       "chain\n", __FUNCTION__, chain);
+			if (ccode != 0)
+				break;
+			chain = tmp_crw.chn ? chain + 1 : 0;
+			continue;
+		}
+		ccode = stcrw(&crw[chain]);
 		if (ccode != 0)
 			break;
 		DBG(KERN_DEBUG "crw_info : CRW reports slct=%d, oflw=%d, "
 		    "chn=%d, rsc=%X, anc=%d, erc=%X, rsid=%X\n",
-		    crw.slct, crw.oflw, crw.chn, crw.rsc, crw.anc,
-		    crw.erc, crw.rsid);
+		    crw[chain].slct, crw[chain].oflw, crw[chain].chn,
+		    crw[chain].rsc, crw[chain].anc, crw[chain].erc,
+		    crw[chain].rsid);
 		/* Check for overflows. */
-		if (crw.oflw) {
+		if (crw[chain].oflw) {
 			pr_debug("%s: crw overflow detected!\n", __FUNCTION__);
 			css_reiterate_subchannels();
+			chain = 0;
 			slow = 1;
 			continue;
 		}
-		switch (crw.rsc) {
+		switch (crw[chain].rsc) {
 		case CRW_RSC_SCH:
-			pr_debug("source is subchannel %04X\n", crw.rsid);
-			ret = css_process_crw (crw.rsid);
+			if (crw[0].chn && !chain)
+				break;
+			pr_debug("source is subchannel %04X\n", crw[0].rsid);
+			ret = css_process_crw (crw[0].rsid,
+					       chain ? crw[1].rsid : 0);
 			if (ret == -EAGAIN)
 				slow = 1;
 			break;
@@ -85,18 +111,18 @@ repeat:
 			pr_debug("source is monitoring facility\n");
 			break;
 		case CRW_RSC_CPATH:
-			pr_debug("source is channel path %02X\n", crw.rsid);
-			switch (crw.erc) {
+			pr_debug("source is channel path %02X\n", crw[0].rsid);
+			switch (crw[0].erc) {
 			case CRW_ERC_IPARM: /* Path has come. */
-				ret = chp_process_crw(crw.rsid, 1);
+				ret = chp_process_crw(crw[0].rsid, 1);
 				break;
 			case CRW_ERC_PERRI: /* Path has gone. */
 			case CRW_ERC_PERRN:
-				ret = chp_process_crw(crw.rsid, 0);
+				ret = chp_process_crw(crw[0].rsid, 0);
 				break;
 			default:
 				pr_debug("Don't know how to handle erc=%x\n",
-					 crw.erc);
+					 crw[0].erc);
 				ret = 0;
 			}
 			if (ret == -EAGAIN)
@@ -115,6 +141,8 @@ repeat:
 			pr_debug("unknown source\n");
 			break;
 		}
+		/* chain is always 0 or 1 here. */
+		chain = crw[chain].chn ? chain + 1 : 0;
 	}
 	if (slow)
 		queue_work(slow_path_wq, &slow_path_work);
-- 
cgit v1.1


From 88fbf18399bde8f2900cf932acd40733dfa1effa Mon Sep 17 00:00:00 2001
From: Eric Rossman <edrossma@us.ibm.com>
Date: Fri, 6 Jan 2006 00:19:25 -0800
Subject: [PATCH] s390: add support for cex2a crypto cards

Signed-off-by: Eric Rossman <edrossma@us.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/s390/crypto/z90common.h   |   9 +-
 drivers/s390/crypto/z90crypt.h    |  13 +-
 drivers/s390/crypto/z90hardware.c | 301 +++++++++++++++++++++++++++++++++++++-
 drivers/s390/crypto/z90main.c     | 111 +++++++++-----
 4 files changed, 383 insertions(+), 51 deletions(-)

diff --git a/drivers/s390/crypto/z90common.h b/drivers/s390/crypto/z90common.h
index e319e78..f87c785 100644
--- a/drivers/s390/crypto/z90common.h
+++ b/drivers/s390/crypto/z90common.h
@@ -1,9 +1,9 @@
 /*
  *  linux/drivers/s390/crypto/z90common.h
  *
- *  z90crypt 1.3.2
+ *  z90crypt 1.3.3
  *
- *  Copyright (C)  2001, 2004 IBM Corporation
+ *  Copyright (C)  2001, 2005 IBM Corporation
  *  Author(s): Robert Burroughs (burrough@us.ibm.com)
  *             Eric Rossman (edrossma@us.ibm.com)
  *
@@ -91,12 +91,13 @@ enum hdstat {
 #define TSQ_FATAL_ERROR 34
 #define RSQ_FATAL_ERROR 35
 
-#define Z90CRYPT_NUM_TYPES	5
+#define Z90CRYPT_NUM_TYPES	6
 #define PCICA		0
 #define PCICC		1
 #define PCIXCC_MCL2	2
 #define PCIXCC_MCL3	3
 #define CEX2C		4
+#define CEX2A		5
 #define NILDEV		-1
 #define ANYDEV		-1
 #define PCIXCC_UNK	-2
@@ -105,7 +106,7 @@ enum hdevice_type {
 	PCICC_HW  = 3,
 	PCICA_HW  = 4,
 	PCIXCC_HW = 5,
-	OTHER_HW  = 6,
+	CEX2A_HW  = 6,
 	CEX2C_HW  = 7
 };
 
diff --git a/drivers/s390/crypto/z90crypt.h b/drivers/s390/crypto/z90crypt.h
index 0a3bb5a..3a18443 100644
--- a/drivers/s390/crypto/z90crypt.h
+++ b/drivers/s390/crypto/z90crypt.h
@@ -1,9 +1,9 @@
 /*
  *  linux/drivers/s390/crypto/z90crypt.h
  *
- *  z90crypt 1.3.2
+ *  z90crypt 1.3.3
  *
- *  Copyright (C)  2001, 2004 IBM Corporation
+ *  Copyright (C)  2001, 2005 IBM Corporation
  *  Author(s): Robert Burroughs (burrough@us.ibm.com)
  *             Eric Rossman (edrossma@us.ibm.com)
  *
@@ -29,11 +29,11 @@
 
 #include <linux/ioctl.h>
 
-#define VERSION_Z90CRYPT_H "$Revision: 1.11 $"
+#define VERSION_Z90CRYPT_H "$Revision: 1.2.2.4 $"
 
 #define z90crypt_VERSION 1
 #define z90crypt_RELEASE 3	// 2 = PCIXCC, 3 = rewrite for coding standards
-#define z90crypt_VARIANT 2	// 2 = added PCIXCC MCL3 and CEX2C support
+#define z90crypt_VARIANT 3	// 3 = CEX2A support
 
 /**
  * struct ica_rsa_modexpo
@@ -122,6 +122,9 @@ struct ica_rsa_modexpo_crt {
  *   Z90STAT_CEX2CCOUNT
  *     Return an integer count of all CEX2Cs.
  *
+ *   Z90STAT_CEX2ACOUNT
+ *     Return an integer count of all CEX2As.
+ *
  *   Z90STAT_REQUESTQ_COUNT
  *     Return an integer count of the number of entries waiting to be
  *     sent to a device.
@@ -144,6 +147,7 @@ struct ica_rsa_modexpo_crt {
  *       0x03: PCIXCC_MCL2
  *       0x04: PCIXCC_MCL3
  *       0x05: CEX2C
+ *       0x06: CEX2A
  *       0x0d: device is disabled via the proc filesystem
  *
  *   Z90STAT_QDEPTH_MASK
@@ -199,6 +203,7 @@ struct ica_rsa_modexpo_crt {
 #define Z90STAT_PCIXCCMCL2COUNT	_IOR(Z90_IOCTL_MAGIC, 0x4b, int)
 #define Z90STAT_PCIXCCMCL3COUNT	_IOR(Z90_IOCTL_MAGIC, 0x4c, int)
 #define Z90STAT_CEX2CCOUNT	_IOR(Z90_IOCTL_MAGIC, 0x4d, int)
+#define Z90STAT_CEX2ACOUNT	_IOR(Z90_IOCTL_MAGIC, 0x4e, int)
 #define Z90STAT_REQUESTQ_COUNT	_IOR(Z90_IOCTL_MAGIC, 0x44, int)
 #define Z90STAT_PENDINGQ_COUNT	_IOR(Z90_IOCTL_MAGIC, 0x45, int)
 #define Z90STAT_TOTALOPEN_COUNT _IOR(Z90_IOCTL_MAGIC, 0x46, int)
diff --git a/drivers/s390/crypto/z90hardware.c b/drivers/s390/crypto/z90hardware.c
index c215e08..7c3ed52 100644
--- a/drivers/s390/crypto/z90hardware.c
+++ b/drivers/s390/crypto/z90hardware.c
@@ -1,9 +1,9 @@
 /*
  *  linux/drivers/s390/crypto/z90hardware.c
  *
- *  z90crypt 1.3.2
+ *  z90crypt 1.3.3
  *
- *  Copyright (C)  2001, 2004 IBM Corporation
+ *  Copyright (C)  2001, 2005 IBM Corporation
  *  Author(s): Robert Burroughs (burrough@us.ibm.com)
  *             Eric Rossman (edrossma@us.ibm.com)
  *
@@ -648,6 +648,87 @@ static struct cca_public_sec static_cca_pub_sec = {
 #define RESPONSE_CPRB_SIZE  0x000006B8
 #define RESPONSE_CPRBX_SIZE 0x00000724
 
+struct type50_hdr {
+	u8    reserved1;
+	u8    msg_type_code;
+	u16   msg_len;
+	u8    reserved2;
+	u8    ignored;
+	u16   reserved3;
+};
+
+#define TYPE50_TYPE_CODE 0x50
+
+#define TYPE50_MEB1_LEN (sizeof(struct type50_meb1_msg))
+#define TYPE50_MEB2_LEN (sizeof(struct type50_meb2_msg))
+#define TYPE50_CRB1_LEN (sizeof(struct type50_crb1_msg))
+#define TYPE50_CRB2_LEN (sizeof(struct type50_crb2_msg))
+
+#define TYPE50_MEB1_FMT 0x0001
+#define TYPE50_MEB2_FMT 0x0002
+#define TYPE50_CRB1_FMT 0x0011
+#define TYPE50_CRB2_FMT 0x0012
+
+struct type50_meb1_msg {
+	struct type50_hdr	header;
+	u16			keyblock_type;
+	u8			reserved[6];
+	u8			exponent[128];
+	u8			modulus[128];
+	u8			message[128];
+};
+
+struct type50_meb2_msg {
+	struct type50_hdr	header;
+	u16			keyblock_type;
+	u8			reserved[6];
+	u8			exponent[256];
+	u8			modulus[256];
+	u8			message[256];
+};
+
+struct type50_crb1_msg {
+	struct type50_hdr	header;
+	u16			keyblock_type;
+	u8			reserved[6];
+	u8			p[64];
+	u8			q[64];
+	u8			dp[64];
+	u8			dq[64];
+	u8			u[64];
+	u8			message[128];
+};
+
+struct type50_crb2_msg {
+	struct type50_hdr	header;
+	u16			keyblock_type;
+	u8			reserved[6];
+	u8			p[128];
+	u8			q[128];
+	u8			dp[128];
+	u8			dq[128];
+	u8			u[128];
+	u8			message[256];
+};
+
+union type50_msg {
+	struct type50_meb1_msg meb1;
+	struct type50_meb2_msg meb2;
+	struct type50_crb1_msg crb1;
+	struct type50_crb2_msg crb2;
+};
+
+struct type80_hdr {
+	u8	reserved1;
+	u8	type;
+	u16	len;
+	u8	code;
+	u8	reserved2[3];
+	u8	reserved3[8];
+};
+
+#define TYPE80_RSP_CODE 0x80
+
 struct error_hdr {
 	unsigned char reserved1;
 	unsigned char type;
@@ -657,6 +738,7 @@ struct error_hdr {
 };
 
 #define TYPE82_RSP_CODE 0x82
+#define TYPE88_RSP_CODE 0x88
 
 #define REP82_ERROR_MACHINE_FAILURE  0x10
 #define REP82_ERROR_PREEMPT_FAILURE  0x12
@@ -679,6 +761,22 @@ struct error_hdr {
 #define REP82_ERROR_PACKET_TRUNCATED 0xA0
 #define REP82_ERROR_ZERO_BUFFER_LEN  0xB0
 
+#define REP88_ERROR_MODULE_FAILURE   0x10
+#define REP88_ERROR_MODULE_TIMEOUT   0x11
+#define REP88_ERROR_MODULE_NOTINIT   0x13
+#define REP88_ERROR_MODULE_NOTAVAIL  0x14
+#define REP88_ERROR_MODULE_DISABLED  0x15
+#define REP88_ERROR_MODULE_IN_DIAGN  0x17
+#define REP88_ERROR_FASTPATH_DISABLD 0x19
+#define REP88_ERROR_MESSAGE_TYPE     0x20
+#define REP88_ERROR_MESSAGE_MALFORMD 0x22
+#define REP88_ERROR_MESSAGE_LENGTH   0x23
+#define REP88_ERROR_RESERVED_FIELD   0x24
+#define REP88_ERROR_KEY_TYPE         0x34
+#define REP88_ERROR_INVALID_KEY      0x82
+#define REP88_ERROR_OPERAND          0x84
+#define REP88_ERROR_OPERAND_EVEN_MOD 0x85
+
 #define CALLER_HEADER 12
 
 static inline int
@@ -1029,10 +1127,6 @@ query_online(int deviceNr, int cdx, int resetNr, int *q_depth, int *dev_type)
 			stat = HD_ONLINE;
 			*q_depth = t_depth + 1;
 			switch (t_dev_type) {
-			case OTHER_HW:
-				stat = HD_NOT_THERE;
-				*dev_type = NILDEV;
-				break;
 			case PCICA_HW:
 				*dev_type = PCICA;
 				break;
@@ -1045,6 +1139,9 @@ query_online(int deviceNr, int cdx, int resetNr, int *q_depth, int *dev_type)
 			case CEX2C_HW:
 				*dev_type = CEX2C;
 				break;
+			case CEX2A_HW:
+				*dev_type = CEX2A;
+				break;
 			default:
 				*dev_type = NILDEV;
 				break;
@@ -2029,6 +2126,177 @@ ICACRT_msg_to_type6CRT_msgX(struct ica_rsa_modexpo_crt *icaMsg_p, int cdx,
 	return 0;
 }
 
+static int
+ICAMEX_msg_to_type50MEX_msg(struct ica_rsa_modexpo *icaMex_p, int *z90cMsg_l_p,
+			    union type50_msg *z90cMsg_p)
+{
+	int mod_len, msg_size, mod_tgt_len, exp_tgt_len, inp_tgt_len;
+	unsigned char *mod_tgt, *exp_tgt, *inp_tgt;
+	union type50_msg *tmp_type50_msg;
+
+	mod_len = icaMex_p->inputdatalength;
+
+	msg_size = ((mod_len <= 128) ? TYPE50_MEB1_LEN : TYPE50_MEB2_LEN) +
+		    CALLER_HEADER;
+
+	memset(z90cMsg_p, 0, msg_size);
+
+	tmp_type50_msg = (union type50_msg *)
+		((unsigned char *) z90cMsg_p + CALLER_HEADER);
+
+	tmp_type50_msg->meb1.header.msg_type_code = TYPE50_TYPE_CODE;
+
+	if (mod_len <= 128) {
+		tmp_type50_msg->meb1.header.msg_len = TYPE50_MEB1_LEN;
+		tmp_type50_msg->meb1.keyblock_type = TYPE50_MEB1_FMT;
+		mod_tgt = tmp_type50_msg->meb1.modulus;
+		mod_tgt_len = sizeof(tmp_type50_msg->meb1.modulus);
+		exp_tgt = tmp_type50_msg->meb1.exponent;
+		exp_tgt_len = sizeof(tmp_type50_msg->meb1.exponent);
+		inp_tgt = tmp_type50_msg->meb1.message;
+		inp_tgt_len = sizeof(tmp_type50_msg->meb1.message);
+	} else {
+		tmp_type50_msg->meb2.header.msg_len = TYPE50_MEB2_LEN;
+		tmp_type50_msg->meb2.keyblock_type = TYPE50_MEB2_FMT;
+		mod_tgt = tmp_type50_msg->meb2.modulus;
+		mod_tgt_len = sizeof(tmp_type50_msg->meb2.modulus);
+		exp_tgt = tmp_type50_msg->meb2.exponent;
+		exp_tgt_len = sizeof(tmp_type50_msg->meb2.exponent);
+		inp_tgt = tmp_type50_msg->meb2.message;
+		inp_tgt_len = sizeof(tmp_type50_msg->meb2.message);
+	}
+
+	mod_tgt += (mod_tgt_len - mod_len);
+	if (copy_from_user(mod_tgt, icaMex_p->n_modulus, mod_len))
+		return SEN_RELEASED;
+	if (is_empty(mod_tgt, mod_len))
+		return SEN_USER_ERROR;
+	exp_tgt += (exp_tgt_len - mod_len);
+	if (copy_from_user(exp_tgt, icaMex_p->b_key, mod_len))
+		return SEN_RELEASED;
+	if (is_empty(exp_tgt, mod_len))
+		return SEN_USER_ERROR;
+	inp_tgt += (inp_tgt_len - mod_len);
+	if (copy_from_user(inp_tgt, icaMex_p->inputdata, mod_len))
+		return SEN_RELEASED;
+	if (is_empty(inp_tgt, mod_len))
+		return SEN_USER_ERROR;
+
+	*z90cMsg_l_p = msg_size - CALLER_HEADER;
+
+	return 0;
+}
+
+static int
+ICACRT_msg_to_type50CRT_msg(struct ica_rsa_modexpo_crt *icaMsg_p,
+			    int *z90cMsg_l_p, union type50_msg *z90cMsg_p)
+{
+	int mod_len, short_len, long_len, tmp_size, p_tgt_len, q_tgt_len,
+	    dp_tgt_len, dq_tgt_len, u_tgt_len, inp_tgt_len, long_offset;
+	unsigned char *p_tgt, *q_tgt, *dp_tgt, *dq_tgt, *u_tgt, *inp_tgt,
+		      temp[8];
+	union type50_msg *tmp_type50_msg;
+
+	mod_len = icaMsg_p->inputdatalength;
+	short_len = mod_len / 2;
+	long_len = mod_len / 2 + 8;
+	long_offset = 0;
+
+	if (long_len > 128) {
+		memset(temp, 0x00, sizeof(temp));
+		if (copy_from_user(temp, icaMsg_p->np_prime, long_len-128))
+			return SEN_RELEASED;
+		if (!is_empty(temp, 8))
+			return SEN_NOT_AVAIL;
+		if (copy_from_user(temp, icaMsg_p->bp_key, long_len-128))
+			return SEN_RELEASED;
+		if (!is_empty(temp, 8))
+			return SEN_NOT_AVAIL;
+		if (copy_from_user(temp, icaMsg_p->u_mult_inv, long_len-128))
+			return SEN_RELEASED;
+		if (!is_empty(temp, 8))
+			return SEN_NOT_AVAIL;
+		long_offset = long_len - 128;
+		long_len = 128;
+	}
+
+	tmp_size = ((mod_len <= 128) ? TYPE50_CRB1_LEN : TYPE50_CRB2_LEN) +
+		    CALLER_HEADER;
+
+	memset(z90cMsg_p, 0, tmp_size);
+
+	tmp_type50_msg = (union type50_msg *)
+		((unsigned char *) z90cMsg_p + CALLER_HEADER);
+
+	tmp_type50_msg->crb1.header.msg_type_code = TYPE50_TYPE_CODE;
+	if (long_len <= 64) {
+		tmp_type50_msg->crb1.header.msg_len = TYPE50_CRB1_LEN;
+		tmp_type50_msg->crb1.keyblock_type = TYPE50_CRB1_FMT;
+		p_tgt = tmp_type50_msg->crb1.p;
+		p_tgt_len = sizeof(tmp_type50_msg->crb1.p);
+		q_tgt = tmp_type50_msg->crb1.q;
+		q_tgt_len = sizeof(tmp_type50_msg->crb1.q);
+		dp_tgt = tmp_type50_msg->crb1.dp;
+		dp_tgt_len = sizeof(tmp_type50_msg->crb1.dp);
+		dq_tgt = tmp_type50_msg->crb1.dq;
+		dq_tgt_len = sizeof(tmp_type50_msg->crb1.dq);
+		u_tgt = tmp_type50_msg->crb1.u;
+		u_tgt_len = sizeof(tmp_type50_msg->crb1.u);
+		inp_tgt = tmp_type50_msg->crb1.message;
+		inp_tgt_len = sizeof(tmp_type50_msg->crb1.message);
+	} else {
+		tmp_type50_msg->crb2.header.msg_len = TYPE50_CRB2_LEN;
+		tmp_type50_msg->crb2.keyblock_type = TYPE50_CRB2_FMT;
+		p_tgt = tmp_type50_msg->crb2.p;
+		p_tgt_len = sizeof(tmp_type50_msg->crb2.p);
+		q_tgt = tmp_type50_msg->crb2.q;
+		q_tgt_len = sizeof(tmp_type50_msg->crb2.q);
+		dp_tgt = tmp_type50_msg->crb2.dp;
+		dp_tgt_len = sizeof(tmp_type50_msg->crb2.dp);
+		dq_tgt = tmp_type50_msg->crb2.dq;
+		dq_tgt_len = sizeof(tmp_type50_msg->crb2.dq);
+		u_tgt = tmp_type50_msg->crb2.u;
+		u_tgt_len = sizeof(tmp_type50_msg->crb2.u);
+		inp_tgt = tmp_type50_msg->crb2.message;
+		inp_tgt_len = sizeof(tmp_type50_msg->crb2.message);
+	}
+
+	p_tgt += (p_tgt_len - long_len);
+	if (copy_from_user(p_tgt, icaMsg_p->np_prime + long_offset, long_len))
+		return SEN_RELEASED;
+	if (is_empty(p_tgt, long_len))
+		return SEN_USER_ERROR;
+	q_tgt += (q_tgt_len - short_len);
+	if (copy_from_user(q_tgt, icaMsg_p->nq_prime, short_len))
+		return SEN_RELEASED;
+	if (is_empty(q_tgt, short_len))
+		return SEN_USER_ERROR;
+	dp_tgt += (dp_tgt_len - long_len);
+	if (copy_from_user(dp_tgt, icaMsg_p->bp_key + long_offset, long_len))
+		return SEN_RELEASED;
+	if (is_empty(dp_tgt, long_len))
+		return SEN_USER_ERROR;
+	dq_tgt += (dq_tgt_len - short_len);
+	if (copy_from_user(dq_tgt, icaMsg_p->bq_key, short_len))
+		return SEN_RELEASED;
+	if (is_empty(dq_tgt, short_len))
+		return SEN_USER_ERROR;
+	u_tgt += (u_tgt_len - long_len);
+	if (copy_from_user(u_tgt, icaMsg_p->u_mult_inv + long_offset, long_len))
+		return SEN_RELEASED;
+	if (is_empty(u_tgt, long_len))
+		return SEN_USER_ERROR;
+	inp_tgt += (inp_tgt_len - mod_len);
+	if (copy_from_user(inp_tgt, icaMsg_p->inputdata, mod_len))
+		return SEN_RELEASED;
+	if (is_empty(inp_tgt, mod_len))
+		return SEN_USER_ERROR;
+
+	*z90cMsg_l_p = tmp_size - CALLER_HEADER;
+
+	return 0;
+}
+
 int
 convert_request(unsigned char *buffer, int func, unsigned short function,
 		int cdx, int dev_type, int *msg_l_p, unsigned char *msg_p)
@@ -2071,6 +2339,16 @@ convert_request(unsigned char *buffer, int func, unsigned short function,
 				cdx, msg_l_p, (struct type6_msg *) msg_p,
 				dev_type);
 	}
+	if (dev_type == CEX2A) {
+		if (func == ICARSACRT)
+			return ICACRT_msg_to_type50CRT_msg(
+				(struct ica_rsa_modexpo_crt *) buffer,
+				msg_l_p, (union type50_msg *) msg_p);
+		else
+			return ICAMEX_msg_to_type50MEX_msg(
+				(struct ica_rsa_modexpo *) buffer,
+				msg_l_p, (union type50_msg *) msg_p);
+	}
 
 	return 0;
 }
@@ -2081,8 +2359,8 @@ unset_ext_bitlens(void)
 {
 	if (!ext_bitlens_msg_count) {
 		PRINTK("Unable to use coprocessors for extended bitlengths. "
-		       "Using PCICAs (if present) for extended bitlengths. "
-		       "This is not an error.\n");
+		       "Using PCICAs/CEX2As (if present) for extended "
+		       "bitlengths. This is not an error.\n");
 		ext_bitlens_msg_count++;
 	}
 	ext_bitlens = 0;
@@ -2094,6 +2372,7 @@ convert_response(unsigned char *response, unsigned char *buffer,
 {
 	struct ica_rsa_modexpo *icaMsg_p = (struct ica_rsa_modexpo *) buffer;
 	struct error_hdr *errh_p = (struct error_hdr *) response;
+	struct type80_hdr *t80h_p = (struct type80_hdr *) response;
 	struct type84_hdr *t84h_p = (struct type84_hdr *) response;
 	struct type86_fmt2_msg *t86m_p =  (struct type86_fmt2_msg *) response;
 	int reply_code, service_rc, service_rs, src_l;
@@ -2108,6 +2387,7 @@ convert_response(unsigned char *response, unsigned char *buffer,
 	src_l = 0;
 	switch (errh_p->type) {
 	case TYPE82_RSP_CODE:
+	case TYPE88_RSP_CODE:
 		reply_code = errh_p->reply_code;
 		src_p = (unsigned char *)errh_p;
 		PRINTK("Hardware error: Type %02X Message Header: "
@@ -2116,6 +2396,10 @@ convert_response(unsigned char *response, unsigned char *buffer,
 		       src_p[0], src_p[1], src_p[2], src_p[3],
 		       src_p[4], src_p[5], src_p[6], src_p[7]);
 		break;
+	case TYPE80_RSP_CODE:
+		src_l = icaMsg_p->outputdatalength;
+		src_p = response + (int)t80h_p->len - src_l;
+		break;
 	case TYPE84_RSP_CODE:
 		src_l = icaMsg_p->outputdatalength;
 		src_p = response + (int)t84h_p->len - src_l;
@@ -2202,6 +2486,7 @@ convert_response(unsigned char *response, unsigned char *buffer,
 	if (reply_code)
 		switch (reply_code) {
 		case REP82_ERROR_OPERAND_INVALID:
+		case REP88_ERROR_MESSAGE_MALFORMD:
 			return REC_OPERAND_INV;
 		case REP82_ERROR_OPERAND_SIZE:
 			return REC_OPERAND_SIZE;
diff --git a/drivers/s390/crypto/z90main.c b/drivers/s390/crypto/z90main.c
index 790fcbb..135ae04 100644
--- a/drivers/s390/crypto/z90main.c
+++ b/drivers/s390/crypto/z90main.c
@@ -228,7 +228,7 @@ struct device_x {
  */
 struct device {
 	int		 dev_type;	    // PCICA, PCICC, PCIXCC_MCL2,
-					    // PCIXCC_MCL3, CEX2C
+					    // PCIXCC_MCL3, CEX2C, CEX2A
 	enum devstat	 dev_stat;	    // current device status
 	int		 dev_self_x;	    // Index in array
 	int		 disabled;	    // Set when device is in error
@@ -295,26 +295,30 @@ struct caller {
 /**
  * Function prototypes from z90hardware.c
  */
-enum hdstat query_online(int, int, int, int *, int *);
-enum devstat reset_device(int, int, int);
-enum devstat send_to_AP(int, int, int, unsigned char *);
-enum devstat receive_from_AP(int, int, int, unsigned char *, unsigned char *);
-int convert_request(unsigned char *, int, short, int, int, int *,
-		    unsigned char *);
-int convert_response(unsigned char *, unsigned char *, int *, unsigned char *);
+enum hdstat query_online(int deviceNr, int cdx, int resetNr, int *q_depth,
+			 int *dev_type);
+enum devstat reset_device(int deviceNr, int cdx, int resetNr);
+enum devstat send_to_AP(int dev_nr, int cdx, int msg_len, unsigned char *msg_ext);
+enum devstat receive_from_AP(int dev_nr, int cdx, int resplen,
+			     unsigned char *resp, unsigned char *psmid);
+int convert_request(unsigned char *buffer, int func, unsigned short function,
+		    int cdx, int dev_type, int *msg_l_p, unsigned char *msg_p);
+int convert_response(unsigned char *response, unsigned char *buffer,
+		     int *respbufflen_p, unsigned char *resp_buff);
 
 /**
  * Low level function prototypes
  */
-static int create_z90crypt(int *);
-static int refresh_z90crypt(int *);
-static int find_crypto_devices(struct status *);
-static int create_crypto_device(int);
-static int destroy_crypto_device(int);
+static int create_z90crypt(int *cdx_p);
+static int refresh_z90crypt(int *cdx_p);
+static int find_crypto_devices(struct status *deviceMask);
+static int create_crypto_device(int index);
+static int destroy_crypto_device(int index);
 static void destroy_z90crypt(void);
-static int refresh_index_array(struct status *, struct device_x *);
-static int probe_device_type(struct device *);
-static int probe_PCIXCC_type(struct device *);
+static int refresh_index_array(struct status *status_str,
+			       struct device_x *index_array);
+static int probe_device_type(struct device *devPtr);
+static int probe_PCIXCC_type(struct device *devPtr);
 
 /**
  * proc fs definitions
@@ -425,7 +429,7 @@ static struct miscdevice z90crypt_misc_device = {
 MODULE_AUTHOR("zSeries Linux Crypto Team: Robert H. Burroughs, Eric D. Rossman"
 	      "and Jochen Roehrig");
 MODULE_DESCRIPTION("zSeries Linux Cryptographic Coprocessor device driver, "
-		   "Copyright 2001, 2004 IBM Corporation");
+		   "Copyright 2001, 2005 IBM Corporation");
 MODULE_LICENSE("GPL");
 module_param(domain, int, 0);
 MODULE_PARM_DESC(domain, "domain index for device");
@@ -860,6 +864,12 @@ get_status_CEX2Ccount(void)
 }
 
 static inline int
+get_status_CEX2Acount(void)
+{
+	return z90crypt.hdware_info->type_mask[CEX2A].st_count;
+}
+
+static inline int
 get_status_requestq_count(void)
 {
 	return requestq_count;
@@ -1008,11 +1018,13 @@ static inline int
 select_device_type(int *dev_type_p, int bytelength)
 {
 	static int count = 0;
-	int PCICA_avail, PCIXCC_MCL3_avail, CEX2C_avail, index_to_use;
+	int PCICA_avail, PCIXCC_MCL3_avail, CEX2C_avail, CEX2A_avail,
+	    index_to_use;
 	struct status *stat;
 	if ((*dev_type_p != PCICC) && (*dev_type_p != PCICA) &&
 	    (*dev_type_p != PCIXCC_MCL2) && (*dev_type_p != PCIXCC_MCL3) &&
-	    (*dev_type_p != CEX2C) && (*dev_type_p != ANYDEV))
+	    (*dev_type_p != CEX2C) && (*dev_type_p != CEX2A) &&
+	    (*dev_type_p != ANYDEV))
 		return -1;
 	if (*dev_type_p != ANYDEV) {
 		stat = &z90crypt.hdware_info->type_mask[*dev_type_p];
@@ -1022,7 +1034,13 @@ select_device_type(int *dev_type_p, int bytelength)
 		return -1;
 	}
 
-	/* Assumption: PCICA, PCIXCC_MCL3, and CEX2C are all similar in speed */
+	/**
+	 * Assumption: PCICA, PCIXCC_MCL3, CEX2C, and CEX2A are all similar in
+	 * speed.
+	 *
+	 * PCICA and CEX2A do NOT co-exist, so it would be either one or the
+	 * other present.
+	 */
 	stat = &z90crypt.hdware_info->type_mask[PCICA];
 	PCICA_avail = stat->st_count -
 			(stat->disabled_count + stat->user_disabled_count);
@@ -1032,29 +1050,38 @@ select_device_type(int *dev_type_p, int bytelength)
 	stat = &z90crypt.hdware_info->type_mask[CEX2C];
 	CEX2C_avail = stat->st_count -
 			(stat->disabled_count + stat->user_disabled_count);
-	if (PCICA_avail || PCIXCC_MCL3_avail || CEX2C_avail) {
+	stat = &z90crypt.hdware_info->type_mask[CEX2A];
+	CEX2A_avail = stat->st_count -
+			(stat->disabled_count + stat->user_disabled_count);
+	if (PCICA_avail || PCIXCC_MCL3_avail || CEX2C_avail || CEX2A_avail) {
 		/**
-		 * bitlength is a factor, PCICA is the most capable, even with
-		 * the new MCL for PCIXCC.
+		 * bitlength is a factor, PCICA or CEX2A are the most capable,
+		 * even with the new MCL for PCIXCC.
 		 */
 		if ((bytelength < PCIXCC_MIN_MOD_SIZE) ||
 		    (!ext_bitlens && (bytelength < OLD_PCIXCC_MIN_MOD_SIZE))) {
-			if (!PCICA_avail)
-				return -1;
-			else {
+			if (PCICA_avail) {
 				*dev_type_p = PCICA;
 				return 0;
 			}
+			if (CEX2A_avail) {
+				*dev_type_p = CEX2A;
+				return 0;
+			}
+			return -1;
 		}
 
 		index_to_use = count % (PCICA_avail + PCIXCC_MCL3_avail +
-					CEX2C_avail);
+					CEX2C_avail + CEX2A_avail);
 		if (index_to_use < PCICA_avail)
 			*dev_type_p = PCICA;
 		else if (index_to_use < (PCICA_avail + PCIXCC_MCL3_avail))
 			*dev_type_p = PCIXCC_MCL3;
-		else
+		else if (index_to_use < (PCICA_avail + PCIXCC_MCL3_avail +
+					 CEX2C_avail))
 			*dev_type_p = CEX2C;
+		else
+			*dev_type_p = CEX2A;
 		count++;
 		return 0;
 	}
@@ -1359,7 +1386,7 @@ build_caller(struct work_element *we_p, short function)
 
 	if ((we_p->devtype != PCICC) && (we_p->devtype != PCICA) &&
 	    (we_p->devtype != PCIXCC_MCL2) && (we_p->devtype != PCIXCC_MCL3) &&
-	    (we_p->devtype != CEX2C))
+	    (we_p->devtype != CEX2C) && (we_p->devtype != CEX2A))
 		return SEN_NOT_AVAIL;
 
 	memcpy(caller_p->caller_id, we_p->caller_id,
@@ -1428,7 +1455,8 @@ get_crypto_request_buffer(struct work_element *we_p)
 
 	if ((we_p->devtype != PCICA) && (we_p->devtype != PCICC) &&
 	    (we_p->devtype != PCIXCC_MCL2) && (we_p->devtype != PCIXCC_MCL3) &&
-	    (we_p->devtype != CEX2C) && (we_p->devtype != ANYDEV)) {
+	    (we_p->devtype != CEX2C) && (we_p->devtype != CEX2A) &&
+	    (we_p->devtype != ANYDEV)) {
 		PRINTK("invalid device type\n");
 		return SEN_USER_ERROR;
 	}
@@ -1503,8 +1531,9 @@ get_crypto_request_buffer(struct work_element *we_p)
 
 	function = PCI_FUNC_KEY_ENCRYPT;
 	switch (we_p->devtype) {
-	/* PCICA does everything with a simple RSA mod-expo operation */
+	/* PCICA and CEX2A do everything with a simple RSA mod-expo operation */
 	case PCICA:
+	case CEX2A:
 		function = PCI_FUNC_KEY_ENCRYPT;
 		break;
 	/**
@@ -1662,7 +1691,8 @@ z90crypt_rsa(struct priv_data *private_data_p, pid_t pid,
 		 * trigger a fallback to software.
 		 */
 		case -EINVAL:
-			if (we_p->devtype != PCICA)
+			if ((we_p->devtype != PCICA) &&
+			    (we_p->devtype != CEX2A))
 				rv = -EGETBUFF;
 			break;
 		case -ETIMEOUT:
@@ -1779,6 +1809,12 @@ z90crypt_unlocked_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			ret = -EFAULT;
 		break;
 
+	case Z90STAT_CEX2ACOUNT:
+		tempstat = get_status_CEX2Acount();
+		if (copy_to_user((int __user *)arg, &tempstat, sizeof(int)) != 0)
+			ret = -EFAULT;
+		break;
+
 	case Z90STAT_REQUESTQ_COUNT:
 		tempstat = get_status_requestq_count();
 		if (copy_to_user((int __user *)arg, &tempstat, sizeof(int)) != 0)
@@ -2019,6 +2055,8 @@ z90crypt_status(char *resp_buff, char **start, off_t offset,
 		get_status_PCIXCCMCL3count());
 	len += sprintf(resp_buff+len, "CEX2C count: %d\n",
 		get_status_CEX2Ccount());
+	len += sprintf(resp_buff+len, "CEX2A count: %d\n",
+		get_status_CEX2Acount());
 	len += sprintf(resp_buff+len, "requestq count: %d\n",
 		get_status_requestq_count());
 	len += sprintf(resp_buff+len, "pendingq count: %d\n",
@@ -2026,8 +2064,8 @@ z90crypt_status(char *resp_buff, char **start, off_t offset,
 	len += sprintf(resp_buff+len, "Total open handles: %d\n\n",
 		get_status_totalopen_count());
 	len += sprinthx(
-		"Online devices: 1: PCICA, 2: PCICC, 3: PCIXCC (MCL2), "
-		"4: PCIXCC (MCL3), 5: CEX2C",
+		"Online devices: 1=PCICA 2=PCICC 3=PCIXCC(MCL2) "
+		"4=PCIXCC(MCL3) 5=CEX2C 6=CEX2A",
 		resp_buff+len,
 		get_status_status_mask(workarea),
 		Z90CRYPT_NUM_APS);
@@ -2140,6 +2178,7 @@ z90crypt_status_write(struct file *file, const char __user *buffer,
 		case '3':	// PCIXCC_MCL2
 		case '4':	// PCIXCC_MCL3
 		case '5':	// CEX2C
+		case '6':       // CEX2A
 			j++;
 			break;
 		case 'd':
@@ -3007,7 +3046,9 @@ create_crypto_device(int index)
 			z90crypt.hdware_info->device_type_array[index] = 4;
 		else if (deviceType == CEX2C)
 			z90crypt.hdware_info->device_type_array[index] = 5;
-		else
+		else if (deviceType == CEX2A)
+			z90crypt.hdware_info->device_type_array[index] = 6;
+		else // No idea how this would happen.
 			z90crypt.hdware_info->device_type_array[index] = -1;
 	}
 
-- 
cgit v1.1


From 3b793060e768197d525e892fd1f84dbc8767cada Mon Sep 17 00:00:00 2001
From: Cornelia Huck <huckc@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:26 -0800
Subject: [PATCH] s390: Fix missing release function and cosmetic changes

- Use kzalloc() in blacklist.c.
- Kill unwanted casts in blacklist.c.
- Provide release function for struct channel_subsystem.

Signed-off-by: Cornelia Huck <huckc@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/s390/cio/blacklist.c |  7 +++----
 drivers/s390/cio/css.c       | 10 ++++++++++
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/s390/cio/blacklist.c b/drivers/s390/cio/blacklist.c
index 2d444cb..daf21e0 100644
--- a/drivers/s390/cio/blacklist.c
+++ b/drivers/s390/cio/blacklist.c
@@ -299,10 +299,9 @@ cio_ignore_proc_seq_start(struct seq_file *s, loff_t *offset)
 
 	if (*offset >= (__MAX_SUBCHANNEL + 1) * (__MAX_SSID + 1))
 		return NULL;
-	iter = kmalloc(sizeof(struct ccwdev_iter), GFP_KERNEL);
+	iter = kzalloc(sizeof(struct ccwdev_iter), GFP_KERNEL);
 	if (!iter)
 		return ERR_PTR(-ENOMEM);
-	memset(iter, 0, sizeof(struct ccwdev_iter));
 	iter->ssid = *offset / (__MAX_SUBCHANNEL + 1);
 	iter->devno = *offset % (__MAX_SUBCHANNEL + 1);
 	return iter;
@@ -322,7 +321,7 @@ cio_ignore_proc_seq_next(struct seq_file *s, void *it, loff_t *offset)
 
 	if (*offset >= (__MAX_SUBCHANNEL + 1) * (__MAX_SSID + 1))
 		return NULL;
-	iter = (struct ccwdev_iter *)it;
+	iter = it;
 	if (iter->devno == __MAX_SUBCHANNEL) {
 		iter->devno = 0;
 		iter->ssid++;
@@ -339,7 +338,7 @@ cio_ignore_proc_seq_show(struct seq_file *s, void *it)
 {
 	struct ccwdev_iter *iter;
 
-	iter = (struct ccwdev_iter *)it;
+	iter = it;
 	if (!is_blacklisted(iter->ssid, iter->devno))
 		/* Not blacklisted, nothing to output. */
 		return 0;
diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
index 9e9d4a1..e565193 100644
--- a/drivers/s390/cio/css.c
+++ b/drivers/s390/cio/css.c
@@ -444,6 +444,15 @@ css_generate_pgid(struct channel_subsystem *css, u32 tod_high)
 
 }
 
+static void
+channel_subsystem_release(struct device *dev)
+{
+	struct channel_subsystem *css;
+
+	css = to_css(dev);
+	kfree(css);
+}
+
 static inline void __init
 setup_css(int nr)
 {
@@ -453,6 +462,7 @@ setup_css(int nr)
 	css[nr]->valid = 1;
 	css[nr]->cssid = nr;
 	sprintf(css[nr]->device.bus_id, "css%x", nr);
+	css[nr]->device.release = channel_subsystem_release;
 	tod_high = (u32) (get_clock() >> 32);
 	css_generate_pgid(css[nr], tod_high);
 }
-- 
cgit v1.1


From 9bbc8346fb21fad3f678220b067450e436e45dbf Mon Sep 17 00:00:00 2001
From: Peter Oberparleiter <peter.oberparleiter@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:27 -0800
Subject: [PATCH] s390: fix invalid return code in sclp_cpi

When the sclp_cpi module is loaded on a system which does not support the
required SCLP call (e.g.  on z/VM), ENOTSUPP is returned to user space.  The
correct return value is EOPNOTSUPP.

Signed-off-by: Peter Oberparleiter <peter.oberparleiter@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/s390/char/sclp_cpi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/s390/char/sclp_cpi.c b/drivers/s390/char/sclp_cpi.c
index 5a6cef2..80f7f31 100644
--- a/drivers/s390/char/sclp_cpi.c
+++ b/drivers/s390/char/sclp_cpi.c
@@ -204,7 +204,7 @@ cpi_module_init(void)
 		printk(KERN_WARNING "cpi: no control program identification "
 		       "support\n");
 		sclp_unregister(&sclp_cpi_event);
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 	}
 
 	req = cpi_prepare_req();
-- 
cgit v1.1


From 347a8dc3b815f0c0fa62a1df075184ffe4cbdcf1 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 6 Jan 2006 00:19:28 -0800
Subject: [PATCH] s390: cleanup Kconfig

Sanitize some s390 Kconfig options.  We have ARCH_S390, ARCH_S390X,
ARCH_S390_31, 64BIT, S390_SUPPORT and COMPAT.  Replace these 6 options by
S390, 64BIT and COMPAT.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/Kconfig                  | 27 +++++++--------------------
 arch/s390/Makefile                 |  6 ++----
 arch/s390/appldata/appldata_base.c |  8 ++++----
 arch/s390/crypto/crypt_s390.h      | 10 +++++-----
 arch/s390/defconfig                |  4 +---
 arch/s390/kernel/Makefile          | 15 +++++----------
 arch/s390/kernel/cpcmd.c           | 16 ++++++++--------
 arch/s390/kernel/entry64.S         | 18 +++++++++---------
 arch/s390/kernel/head.S            |  4 ++--
 arch/s390/kernel/module.c          | 12 ++++++------
 arch/s390/kernel/process.c         | 12 ++++++------
 arch/s390/kernel/ptrace.c          | 24 ++++++++++++------------
 arch/s390/kernel/reipl_diag.c      |  2 +-
 arch/s390/kernel/setup.c           | 14 +++++++-------
 arch/s390/kernel/signal.c          |  2 +-
 arch/s390/kernel/smp.c             |  8 ++++----
 arch/s390/kernel/sys_s390.c        | 12 +++++-------
 arch/s390/kernel/traps.c           | 10 +++++-----
 arch/s390/kernel/vmlinux.lds.S     |  2 +-
 arch/s390/lib/Makefile             |  5 ++---
 arch/s390/lib/spinlock.c           |  2 +-
 arch/s390/mm/extmem.c              |  2 +-
 arch/s390/mm/fault.c               | 18 +++++++++---------
 arch/s390/mm/init.c                |  8 ++++----
 arch/s390/mm/mmap.c                |  2 +-
 block/Kconfig                      |  2 +-
 crypto/Kconfig                     |  8 ++++----
 drivers/char/Kconfig               |  2 +-
 drivers/char/hangcheck-timer.c     |  2 +-
 drivers/char/watchdog/Kconfig      |  2 +-
 drivers/input/evdev.c              |  2 +-
 drivers/net/phy/Kconfig            |  2 +-
 drivers/s390/block/Kconfig         |  8 ++++----
 drivers/s390/block/dasd.c          |  2 +-
 drivers/s390/block/dasd_diag.c     |  2 +-
 drivers/s390/block/dasd_diag.h     |  6 +++---
 drivers/s390/block/dasd_eckd.c     |  2 +-
 drivers/s390/block/dasd_fba.c      |  2 +-
 drivers/s390/block/xpram.c         |  4 ++--
 drivers/s390/char/vmwatchdog.c     |  2 +-
 drivers/s390/cio/cio.c             |  2 +-
 drivers/s390/cio/device_id.c       |  2 +-
 drivers/s390/cio/ioasm.h           |  4 ++--
 drivers/s390/cio/qdio.c            |  2 +-
 drivers/s390/cio/qdio.h            | 34 +++++++++++++++++-----------------
 drivers/s390/crypto/z90hardware.c  |  8 ++++----
 drivers/s390/net/Kconfig           |  2 +-
 drivers/s390/net/claw.c            |  6 +++---
 drivers/s390/s390mach.c            | 10 +++++-----
 drivers/s390/sysinfo.c             |  2 +-
 drivers/scsi/Kconfig               |  2 +-
 fs/partitions/Kconfig              |  2 +-
 fs/proc/array.c                    |  2 +-
 include/asm-s390/unistd.h          |  2 +-
 include/linux/irq.h                |  2 +-
 init/Kconfig                       |  2 +-
 init/do_mounts_rd.c                |  4 ++--
 kernel/panic.c                     |  4 ++--
 kernel/sysctl.c                    |  6 +++---
 lib/Kconfig.debug                  |  2 +-
 60 files changed, 183 insertions(+), 208 deletions(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 1846fbf..6fe532d 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -23,14 +23,14 @@ config GENERIC_BUST_SPINLOCK
 
 mainmenu "Linux Kernel Configuration"
 
-config ARCH_S390
+config S390
 	bool
 	default y
 
 config UID16
 	bool
 	default y
-	depends on ARCH_S390X = 'n'
+	depends on !64BIT
 
 source "init/Kconfig"
 
@@ -38,20 +38,12 @@ menu "Base setup"
 
 comment "Processor type and features"
 
-config ARCH_S390X
+config 64BIT
 	bool "64 bit kernel"
 	help
 	  Select this option if you have a 64 bit IBM zSeries machine
 	  and want to use the 64 bit addressing mode.
 
-config 64BIT
-	def_bool ARCH_S390X
-
-config ARCH_S390_31
-	bool
-	depends on ARCH_S390X = 'n'
-	default y
-
 config SMP
 	bool "Symmetric multi-processing support"
 	---help---
@@ -101,20 +93,15 @@ config MATHEMU
 	  on older S/390 machines. Say Y unless you know your machine doesn't
 	  need this.
 
-config S390_SUPPORT
+config COMPAT
 	bool "Kernel support for 31 bit emulation"
-	depends on ARCH_S390X
+	depends on 64BIT
 	help
 	  Select this option if you want to enable your system kernel to
 	  handle system-calls from ELF binaries for 31 bit ESA.  This option
 	  (and some other stuff like libraries and such) is needed for
 	  executing 31 bit applications.  It is safe to say "Y".
 
-config COMPAT
-	bool
-	depends on S390_SUPPORT
-	default y
-
 config SYSVIPC_COMPAT
 	bool
 	depends on COMPAT && SYSVIPC
@@ -122,7 +109,7 @@ config SYSVIPC_COMPAT
 
 config BINFMT_ELF32
 	tristate "Kernel support for 31 bit ELF binaries"
-	depends on S390_SUPPORT
+	depends on COMPAT
 	help
 	  This allows you to run 32-bit Linux/ELF binaries on your zSeries
 	  in 64 bit mode. Everybody wants this; say Y.
@@ -135,7 +122,7 @@ choice
 
 config MARCH_G5
 	bool "S/390 model G5 and G6"
-	depends on ARCH_S390_31
+	depends on !64BIT
 	help
 	  Select this to build a 31 bit kernel that works
 	  on all S/390 and zSeries machines.
diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index 73a09a6..6c6b197 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -13,16 +13,14 @@
 # Copyright (C) 1994 by Linus Torvalds
 #
 
-ifdef CONFIG_ARCH_S390_31
+ifndef CONFIG_64BIT
 LDFLAGS		:= -m elf_s390
 CFLAGS		+= -m31
 AFLAGS		+= -m31
 UTS_MACHINE	:= s390
 STACK_SIZE	:= 8192
 CHECKFLAGS	+= -D__s390__
-endif
-
-ifdef CONFIG_ARCH_S390X
+else
 LDFLAGS		:= -m elf64_s390
 MODFLAGS	+= -fpic -D__PIC__
 CFLAGS		+= -m64
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index dee6ab5..d06a8d7 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -40,7 +40,7 @@
 
 #define TOD_MICRO	0x01000			/* nr. of TOD clock units
 						   for 1 microsecond */
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 
 #define APPLDATA_START_INTERVAL_REC 0x00   	/* Function codes for */
 #define APPLDATA_STOP_REC	    0x01	/* DIAG 0xDC	  */
@@ -54,13 +54,13 @@
 #define APPLDATA_GEN_EVENT_RECORD   0x82
 #define APPLDATA_START_CONFIG_REC   0x83
 
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
 
 /*
  * Parameter list for DIAGNOSE X'DC'
  */
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 struct appldata_parameter_list {
 	u16 diag;		/* The DIAGNOSE code X'00DC'          */
 	u8  function;		/* The function code for the DIAGNOSE */
@@ -82,7 +82,7 @@ struct appldata_parameter_list {
 	u64 product_id_addr;
 	u64 buffer_addr;
 };
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
 /*
  * /proc entries (sysctl)
diff --git a/arch/s390/crypto/crypt_s390.h b/arch/s390/crypto/crypt_s390.h
index d6712cf..d1c259a 100644
--- a/arch/s390/crypto/crypt_s390.h
+++ b/arch/s390/crypto/crypt_s390.h
@@ -112,7 +112,7 @@ struct crypt_s390_query_status {
  * [ret] is the variable to receive the error code
  * [ERR] is the error code value
  */
-#ifndef __s390x__
+#ifndef CONFIG_64BIT
 #define __crypt_s390_fixup \
 	".section .fixup,\"ax\" \n"	\
 	"7:	lhi	%0,%h[e1] \n"	\
@@ -129,7 +129,7 @@ struct crypt_s390_query_status {
 	"	.long	0b,7b \n"	\
 	"	.long	1b,8b \n"	\
 	".previous"
-#else /* __s390x__ */
+#else /* CONFIG_64BIT */
 #define __crypt_s390_fixup \
 	".section .fixup,\"ax\" \n"	\
 	"7:	lhi	%0,%h[e1] \n"	\
@@ -142,7 +142,7 @@ struct crypt_s390_query_status {
 	"	.quad	0b,7b \n"	\
 	"	.quad	1b,8b \n"	\
 	".previous"
-#endif /* __s390x__ */
+#endif /* CONFIG_64BIT */
 
 /*
  * Standard code for setting the result of s390 crypto instructions.
@@ -150,10 +150,10 @@ struct crypt_s390_query_status {
  * [result]: the register containing the result (e.g. second operand length
  * to compute number of processed bytes].
  */
-#ifndef __s390x__
+#ifndef CONFIG_64BIT
 #define __crypt_s390_set_result \
 	"	lr	%0,%[result] \n"
-#else /* __s390x__ */
+#else /* CONFIG_64BIT */
 #define __crypt_s390_set_result \
 	"	lgr	%0,%[result] \n"
 #endif
diff --git a/arch/s390/defconfig b/arch/s390/defconfig
index f195c7e..7d23edc 100644
--- a/arch/s390/defconfig
+++ b/arch/s390/defconfig
@@ -6,7 +6,7 @@
 CONFIG_MMU=y
 CONFIG_RWSEM_XCHGADD_ALGORITHM=y
 CONFIG_GENERIC_CALIBRATE_DELAY=y
-CONFIG_ARCH_S390=y
+CONFIG_S390=y
 CONFIG_UID16=y
 
 #
@@ -89,9 +89,7 @@ CONFIG_DEFAULT_IOSCHED="anticipatory"
 #
 # Processor type and features
 #
-# CONFIG_ARCH_S390X is not set
 # CONFIG_64BIT is not set
-CONFIG_ARCH_S390_31=y
 CONFIG_SMP=y
 CONFIG_NR_CPUS=32
 CONFIG_HOTPLUG_CPU=y
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 7434c32..4865e4b 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -8,31 +8,26 @@ obj-y	:=  bitmap.o traps.o time.o process.o \
             setup.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o \
             semaphore.o s390_ext.o debug.o profile.o irq.o reipl_diag.o
 
+obj-y	+= $(if $(CONFIG_64BIT),entry64.o,entry.o)
+obj-y	+= $(if $(CONFIG_64BIT),reipl64.o,reipl.o)
+
 extra-y				+= head.o init_task.o vmlinux.lds
 
 obj-$(CONFIG_MODULES)		+= s390_ksyms.o module.o
 obj-$(CONFIG_SMP)		+= smp.o
 
-obj-$(CONFIG_S390_SUPPORT)	+= compat_linux.o compat_signal.o \
+obj-$(CONFIG_COMPAT)		+= compat_linux.o compat_signal.o \
 					compat_ioctl.o compat_wrapper.o \
 					compat_exec_domain.o
 obj-$(CONFIG_BINFMT_ELF32)	+= binfmt_elf32.o
 
-obj-$(CONFIG_ARCH_S390_31)	+= entry.o reipl.o
-obj-$(CONFIG_ARCH_S390X)	+= entry64.o reipl64.o
-
 obj-$(CONFIG_VIRT_TIMER)	+= vtime.o
 
 # Kexec part
 S390_KEXEC_OBJS := machine_kexec.o crash.o
-ifeq ($(CONFIG_ARCH_S390X),y)
-S390_KEXEC_OBJS += relocate_kernel64.o
-else
-S390_KEXEC_OBJS += relocate_kernel.o
-endif
+S390_KEXEC_OBJS += $(if $(CONFIG_64BIT),relocate_kernel64.o,relocate_kernel.o)
 obj-$(CONFIG_KEXEC) += $(S390_KEXEC_OBJS)
 
-
 #
 # This is just to get the dependencies...
 #
diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c
index d47fecb..4ef44e5 100644
--- a/arch/s390/kernel/cpcmd.c
+++ b/arch/s390/kernel/cpcmd.c
@@ -39,7 +39,7 @@ int  __cpcmd(const char *cmd, char *response, int rlen, int *response_code)
 
 	if (response != NULL && rlen > 0) {
 		memset(response, 0, rlen);
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 		asm volatile (	"lra	2,0(%2)\n"
 				"lr	4,%3\n"
 				"o	4,%6\n"
@@ -55,7 +55,7 @@ int  __cpcmd(const char *cmd, char *response, int rlen, int *response_code)
 				: "a" (cpcmd_buf), "d" (cmdlen),
 				"a" (response), "d" (rlen), "m" (mask)
 				: "cc", "2", "3", "4", "5" );
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
                 asm volatile (	"lrag	2,0(%2)\n"
 				"lgr	4,%3\n"
 				"o	4,%6\n"
@@ -73,11 +73,11 @@ int  __cpcmd(const char *cmd, char *response, int rlen, int *response_code)
 				: "a" (cpcmd_buf), "d" (cmdlen),
 				"a" (response), "d" (rlen), "m" (mask)
 				: "cc", "2", "3", "4", "5" );
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
                 EBCASC(response, rlen);
         } else {
 		return_len = 0;
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
                 asm volatile (	"lra	2,0(%1)\n"
 				"lr	3,%2\n"
 				"diag	2,3,0x8\n"
@@ -85,7 +85,7 @@ int  __cpcmd(const char *cmd, char *response, int rlen, int *response_code)
 				: "=d" (return_code)
 				: "a" (cpcmd_buf), "d" (cmdlen)
 				: "2", "3"  );
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
                 asm volatile (	"lrag	2,0(%1)\n"
 				"lgr	3,%2\n"
 				"sam31\n"
@@ -95,7 +95,7 @@ int  __cpcmd(const char *cmd, char *response, int rlen, int *response_code)
 				: "=d" (return_code)
 				: "a" (cpcmd_buf), "d" (cmdlen)
 				: "2", "3" );
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
         }
 	spin_unlock_irqrestore(&cpcmd_lock, flags);
 	if (response_code != NULL)
@@ -105,7 +105,7 @@ int  __cpcmd(const char *cmd, char *response, int rlen, int *response_code)
 
 EXPORT_SYMBOL(__cpcmd);
 
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 int cpcmd(const char *cmd, char *response, int rlen, int *response_code)
 {
 	char *lowbuf;
@@ -129,4 +129,4 @@ int cpcmd(const char *cmd, char *response, int rlen, int *response_code)
 }
 
 EXPORT_SYMBOL(cpcmd);
-#endif		/* CONFIG_ARCH_S390X */
+#endif		/* CONFIG_64BIT */
diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S
index 4eb71ff..369ab44 100644
--- a/arch/s390/kernel/entry64.S
+++ b/arch/s390/kernel/entry64.S
@@ -213,7 +213,7 @@ sysc_nr_ok:
 	mvc	SP_ARGS(8,%r15),SP_R7(%r15)
 sysc_do_restart:
 	larl    %r10,sys_call_table
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 	tm	__TI_flags+5(%r9),(_TIF_31BIT>>16)  # running in 31 bit mode ?
 	jno	sysc_noemu
 	larl    %r10,sys_call_table_emu  # use 31 bit emulation system calls
@@ -361,7 +361,7 @@ sys_clone_glue:
         la      %r2,SP_PTREGS(%r15)    # load pt_regs
         jg      sys_clone              # branch to sys_clone
 
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 sys32_clone_glue: 
         la      %r2,SP_PTREGS(%r15)    # load pt_regs
         jg      sys32_clone            # branch to sys32_clone
@@ -383,7 +383,7 @@ sys_execve_glue:
         bnz     0(%r12)               # it did fail -> store result in gpr2
         b       6(%r12)               # SKIP STG 2,SP_R2(15) in
                                       # system_call/sysc_tracesys
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 sys32_execve_glue:        
         la      %r2,SP_PTREGS(%r15)   # load pt_regs
 	lgr     %r12,%r14             # save return address
@@ -398,7 +398,7 @@ sys_sigreturn_glue:
         la      %r2,SP_PTREGS(%r15)   # load pt_regs as parameter
         jg      sys_sigreturn         # branch to sys_sigreturn
 
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 sys32_sigreturn_glue:     
         la      %r2,SP_PTREGS(%r15)   # load pt_regs as parameter
         jg      sys32_sigreturn       # branch to sys32_sigreturn
@@ -408,7 +408,7 @@ sys_rt_sigreturn_glue:
         la      %r2,SP_PTREGS(%r15)   # load pt_regs as parameter
         jg      sys_rt_sigreturn      # branch to sys_sigreturn
 
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 sys32_rt_sigreturn_glue:     
         la      %r2,SP_PTREGS(%r15)   # load pt_regs as parameter
         jg      sys32_rt_sigreturn    # branch to sys32_sigreturn
@@ -429,7 +429,7 @@ sys_sigsuspend_glue:
 	la      %r14,6(%r14)          # skip store of return value
         jg      sys_sigsuspend        # branch to sys_sigsuspend
 
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 sys32_sigsuspend_glue:    
 	llgfr	%r4,%r4               # unsigned long			
         lgr     %r5,%r4               # move mask back
@@ -449,7 +449,7 @@ sys_rt_sigsuspend_glue:
 	la      %r14,6(%r14)          # skip store of return value
         jg      sys_rt_sigsuspend     # branch to sys_rt_sigsuspend
 
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 sys32_rt_sigsuspend_glue: 
 	llgfr	%r3,%r3               # size_t			
         lgr     %r4,%r3               # move sigsetsize parameter
@@ -464,7 +464,7 @@ sys_sigaltstack_glue:
         la      %r4,SP_PTREGS(%r15)   # load pt_regs as parameter
         jg      sys_sigaltstack       # branch to sys_sigreturn
 
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 sys32_sigaltstack_glue:
         la      %r4,SP_PTREGS(%r15)   # load pt_regs as parameter
         jg      sys32_sigaltstack_wrapper # branch to sys_sigreturn
@@ -1009,7 +1009,7 @@ sys_call_table:
 #include "syscalls.S"
 #undef SYSCALL
 
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 
 #define SYSCALL(esa,esame,emu)	.long emu
 	.globl  sys_call_table_emu
diff --git a/arch/s390/kernel/head.S b/arch/s390/kernel/head.S
index d31a97c..ea88d06 100644
--- a/arch/s390/kernel/head.S
+++ b/arch/s390/kernel/head.S
@@ -30,7 +30,7 @@
 #include <asm/thread_info.h>
 #include <asm/page.h>
 
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 #define ARCH_OFFSET	4
 #else
 #define ARCH_OFFSET	0
@@ -539,7 +539,7 @@ ipl_devno:
 	.word 0
 .endm
 
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 #include "head64.S"
 #else
 #include "head31.S"
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index 607d506..c271cda 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -37,11 +37,11 @@
 #define DEBUGP(fmt , ...)
 #endif
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 #define PLT_ENTRY_SIZE 12
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 #define PLT_ENTRY_SIZE 20
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
 void *module_alloc(unsigned long size)
 {
@@ -294,17 +294,17 @@ apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
 			unsigned int *ip;
 			ip = me->module_core + me->arch.plt_offset +
 				info->plt_offset;
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 			ip[0] = 0x0d105810; /* basr 1,0; l 1,6(1); br 1 */
 			ip[1] = 0x100607f1;
 			ip[2] = val;
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 			ip[0] = 0x0d10e310; /* basr 1,0; lg 1,10(1); br 1 */
 			ip[1] = 0x100a0004;
 			ip[2] = 0x07f10000;
 			ip[3] = (unsigned int) (val >> 32);
 			ip[4] = (unsigned int) val;
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 			info->plt_initialized = 1;
 		}
 		if (r_type == R_390_PLTOFF16 ||
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 78b64fe..a942bf2 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -235,7 +235,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp,
 	/* Save access registers to new thread structure. */
 	save_access_regs(&p->thread.acrs[0]);
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
         /*
 	 * save fprs to current->thread.fp_regs to merge them with
 	 * the emulated registers and then copy the result to the child.
@@ -247,7 +247,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp,
 	/* Set a new TLS ?  */
 	if (clone_flags & CLONE_SETTLS)
 		p->thread.acrs[0] = regs->gprs[6];
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 	/* Save the fpu registers to new thread structure. */
 	save_fp_regs(&p->thread.fp_regs);
         p->thread.user_seg = __pa((unsigned long) p->mm->pgd) | _REGION_TABLE;
@@ -260,7 +260,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp,
 			p->thread.acrs[1] = (unsigned int) regs->gprs[6];
 		}
 	}
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 	/* start new process with ar4 pointing to the correct address space */
 	p->thread.mm_segment = get_fs();
         /* Don't copy debug registers */
@@ -339,16 +339,16 @@ out:
  */
 int dump_fpu (struct pt_regs * regs, s390_fp_regs *fpregs)
 {
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
         /*
 	 * save fprs to current->thread.fp_regs to merge them with
 	 * the emulated registers and then copy the result to the dump.
 	 */
 	save_fp_regs(&current->thread.fp_regs);
 	memcpy(fpregs, &current->thread.fp_regs, sizeof(s390_fp_regs));
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 	save_fp_regs(fpregs);
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 	return 1;
 }
 
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 06afa31..8ecda6d 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -42,7 +42,7 @@
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 #include "compat_ptrace.h"
 #endif
 
@@ -59,7 +59,7 @@ FixPerRegisters(struct task_struct *task)
 	
 	if (per_info->single_step) {
 		per_info->control_regs.bits.starting_addr = 0;
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 		if (test_thread_flag(TIF_31BIT))
 			per_info->control_regs.bits.ending_addr = 0x7fffffffUL;
 		else
@@ -112,7 +112,7 @@ ptrace_disable(struct task_struct *child)
 	clear_single_step(child);
 }
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 # define __ADDR_MASK 3
 #else
 # define __ADDR_MASK 7
@@ -138,7 +138,7 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data)
 	 * an alignment of 4. Programmers from hell...
 	 */
 	mask = __ADDR_MASK;
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 	if (addr >= (addr_t) &dummy->regs.acrs &&
 	    addr < (addr_t) &dummy->regs.orig_gpr2)
 		mask = 3;
@@ -160,7 +160,7 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data)
 		 * access registers are stored in the thread structure
 		 */
 		offset = addr - (addr_t) &dummy->regs.acrs;
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 		/*
 		 * Very special case: old & broken 64 bit gdb reading
 		 * from acrs[15]. Result is a 64 bit value. Read the
@@ -218,7 +218,7 @@ poke_user(struct task_struct *child, addr_t addr, addr_t data)
 	 * an alignment of 4. Programmers from hell indeed...
 	 */
 	mask = __ADDR_MASK;
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 	if (addr >= (addr_t) &dummy->regs.acrs &&
 	    addr < (addr_t) &dummy->regs.orig_gpr2)
 		mask = 3;
@@ -231,13 +231,13 @@ poke_user(struct task_struct *child, addr_t addr, addr_t data)
 		 * psw and gprs are stored on the stack
 		 */
 		if (addr == (addr_t) &dummy->regs.psw.mask &&
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 		    data != PSW_MASK_MERGE(PSW_USER32_BITS, data) &&
 #endif
 		    data != PSW_MASK_MERGE(PSW_USER_BITS, data))
 			/* Invalid psw mask. */
 			return -EINVAL;
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 		if (addr == (addr_t) &dummy->regs.psw.addr)
 			/* I'd like to reject addresses without the
 			   high order bit but older gdb's rely on it */
@@ -250,7 +250,7 @@ poke_user(struct task_struct *child, addr_t addr, addr_t data)
 		 * access registers are stored in the thread structure
 		 */
 		offset = addr - (addr_t) &dummy->regs.acrs;
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 		/*
 		 * Very special case: old & broken 64 bit gdb writing
 		 * to acrs[15] with a 64 bit value. Ignore the lower
@@ -357,7 +357,7 @@ do_ptrace_normal(struct task_struct *child, long request, long addr, long data)
 	return ptrace_request(child, request, addr, data);
 }
 
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 /*
  * Now the fun part starts... a 31 bit program running in the
  * 31 bit emulation tracing another program. PTRACE_PEEKTEXT,
@@ -629,7 +629,7 @@ do_ptrace(struct task_struct *child, long request, long addr, long data)
 			return peek_user(child, addr, data);
 		if (request == PTRACE_POKEUSR && addr == PT_IEEE_IP)
 			return poke_user(child, addr, data);
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 		if (request == PTRACE_PEEKUSR &&
 		    addr == PT32_IEEE_IP && test_thread_flag(TIF_31BIT))
 			return peek_user_emu31(child, addr, data);
@@ -695,7 +695,7 @@ do_ptrace(struct task_struct *child, long request, long addr, long data)
 
 	/* Do requests that differ for 31/64 bit */
 	default:
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 		if (test_thread_flag(TIF_31BIT))
 			return do_ptrace_emu31(child, request, addr, data);
 #endif
diff --git a/arch/s390/kernel/reipl_diag.c b/arch/s390/kernel/reipl_diag.c
index 83cb42b..1f33951 100644
--- a/arch/s390/kernel/reipl_diag.c
+++ b/arch/s390/kernel/reipl_diag.c
@@ -26,7 +26,7 @@ void reipl_diag(void)
 		"   st   %%r4,%0\n"
 		"   st   %%r5,%1\n"
                 ".section __ex_table,\"a\"\n"
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
                 "   .align 8\n"
                 "   .quad 0b, 0b\n"
 #else
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 31e7b19..b03847d 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -427,7 +427,7 @@ setup_lowcore(void)
 		__alloc_bootmem(PAGE_SIZE, PAGE_SIZE, 0) + PAGE_SIZE;
 	lc->current_task = (unsigned long) init_thread_union.thread_info.task;
 	lc->thread_info = (unsigned long) &init_thread_union;
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 	if (MACHINE_HAS_IEEE) {
 		lc->extended_save_area_addr = (__u32)
 			__alloc_bootmem(PAGE_SIZE, PAGE_SIZE, 0);
@@ -562,21 +562,21 @@ setup_arch(char **cmdline_p)
         /*
          * print what head.S has found out about the machine
          */
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 	printk((MACHINE_IS_VM) ?
 	       "We are running under VM (31 bit mode)\n" :
 	       "We are running native (31 bit mode)\n");
 	printk((MACHINE_HAS_IEEE) ?
 	       "This machine has an IEEE fpu\n" :
 	       "This machine has no IEEE fpu\n");
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 	printk((MACHINE_IS_VM) ?
 	       "We are running under VM (64 bit mode)\n" :
 	       "We are running native (64 bit mode)\n");
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
         ROOT_DEV = Root_RAM0;
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 	memory_end = memory_size & ~0x400000UL;  /* align memory end to 4MB */
         /*
          * We need some free virtual space to be able to do vmalloc.
@@ -585,9 +585,9 @@ setup_arch(char **cmdline_p)
          */
         if (memory_end > 1920*1024*1024)
                 memory_end = 1920*1024*1024;
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 	memory_end = memory_size & ~0x200000UL;  /* detected in head.s */
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
 	init_mm.start_code = PAGE_OFFSET;
 	init_mm.end_code = (unsigned long) &_etext;
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 13592d0..6ae4a77 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -501,7 +501,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
 
 	if (signr > 0) {
 		/* Whee!  Actually deliver the signal.  */
-#ifdef CONFIG_S390_SUPPORT
+#ifdef CONFIG_COMPAT
 		if (test_thread_flag(TIF_31BIT)) {
 			extern void handle_signal32(unsigned long sig,
 						    struct k_sigaction *ka,
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index bd5b311..e10f4ca 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -402,7 +402,7 @@ static void smp_ext_bitcall_others(ec_bit_sig sig)
         }
 }
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 /*
  * this function sends a 'purge tlb' signal to another CPU.
  */
@@ -416,7 +416,7 @@ void smp_ptlb_all(void)
         on_each_cpu(smp_ptlb_callback, NULL, 0, 1);
 }
 EXPORT_SYMBOL(smp_ptlb_all);
-#endif /* ! CONFIG_ARCH_S390X */
+#endif /* ! CONFIG_64BIT */
 
 /*
  * this function sends a 'reschedule' IPI to another CPU.
@@ -783,7 +783,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 		if (stack == 0ULL)
 			panic("smp_boot_cpus failed to allocate memory\n");
 		lowcore_ptr[i]->panic_stack = stack + (PAGE_SIZE);
-#ifndef __s390x__
+#ifndef CONFIG_64BIT
 		if (MACHINE_HAS_IEEE) {
 			lowcore_ptr[i]->extended_save_area_addr =
 				(__u32) __get_free_pages(GFP_KERNEL,0);
@@ -793,7 +793,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 		}
 #endif
 	}
-#ifndef __s390x__
+#ifndef CONFIG_64BIT
 	if (MACHINE_HAS_IEEE)
 		ctl_set_bit(14, 29); /* enable extended save area */
 #endif
diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/sys_s390.c
index efe6b83..6a63553 100644
--- a/arch/s390/kernel/sys_s390.c
+++ b/arch/s390/kernel/sys_s390.c
@@ -26,9 +26,7 @@
 #include <linux/mman.h>
 #include <linux/file.h>
 #include <linux/utsname.h>
-#ifdef CONFIG_ARCH_S390X
 #include <linux/personality.h>
-#endif /* CONFIG_ARCH_S390X */
 
 #include <asm/uaccess.h>
 #include <asm/ipc.h>
@@ -121,7 +119,7 @@ out:
 	return error;
 }
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 struct sel_arg_struct {
 	unsigned long n;
 	fd_set *inp, *outp, *exp;
@@ -138,7 +136,7 @@ asmlinkage long old_select(struct sel_arg_struct __user *arg)
 	return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
 
 }
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
 /*
  * sys_ipc() is the de-multiplexer for the SysV IPC calls..
@@ -211,7 +209,7 @@ asmlinkage long sys_ipc(uint call, int first, unsigned long second,
 	return -EINVAL;
 }
 
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 asmlinkage long s390x_newuname(struct new_utsname __user *name)
 {
 	int ret = sys_newuname(name);
@@ -235,12 +233,12 @@ asmlinkage long s390x_personality(unsigned long personality)
 
 	return ret;
 }
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
 /*
  * Wrapper function for sys_fadvise64/fadvise64_64
  */
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 
 asmlinkage long
 s390_fadvise64(int fd, u32 offset_high, u32 offset_low, size_t len, int advice)
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index c5bd36f..95d1099 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -67,13 +67,13 @@ extern pgm_check_handler_t do_monitor_call;
 
 #define stack_pointer ({ void **sp; asm("la %0,0(15)" : "=&d" (sp)); sp; })
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 #define FOURLONG "%08lx %08lx %08lx %08lx\n"
 static int kstack_depth_to_print = 12;
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 #define FOURLONG "%016lx %016lx %016lx %016lx\n"
 static int kstack_depth_to_print = 20;
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
 /*
  * For show_trace we have tree different stack to consider:
@@ -702,12 +702,12 @@ void __init trap_init(void)
         pgm_check_table[0x11] = &do_dat_exception;
         pgm_check_table[0x12] = &translation_exception;
         pgm_check_table[0x13] = &special_op_exception;
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
         pgm_check_table[0x38] = &do_dat_exception;
 	pgm_check_table[0x39] = &do_dat_exception;
 	pgm_check_table[0x3A] = &do_dat_exception;
         pgm_check_table[0x3B] = &do_dat_exception;
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
         pgm_check_table[0x15] = &operand_exception;
         pgm_check_table[0x1C] = &space_switch_exception;
         pgm_check_table[0x1D] = &hfp_sqrt_exception;
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 89fdb38..9289fac 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -5,7 +5,7 @@
 #include <asm-generic/vmlinux.lds.h>
 #include <linux/config.h>
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390")
 OUTPUT_ARCH(s390)
 ENTRY(_start)
diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile
index b701efa..d9b97b3 100644
--- a/arch/s390/lib/Makefile
+++ b/arch/s390/lib/Makefile
@@ -4,6 +4,5 @@
 
 EXTRA_AFLAGS := -traditional
 
-lib-y += delay.o string.o
-lib-$(CONFIG_ARCH_S390_31) += uaccess.o spinlock.o
-lib-$(CONFIG_ARCH_S390X) += uaccess64.o spinlock.o
+lib-y += delay.o string.o spinlock.o
+lib-y += $(if $(CONFIG_64BIT),uaccess64.o,uaccess.o)
diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index 2dc14e9..68d79c5 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -29,7 +29,7 @@ __setup("spin_retry=", spin_retry_setup);
 static inline void
 _diag44(void)
 {
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
 	if (MACHINE_HAS_DIAG44)
 #endif
 		asm volatile("diag 0,0,0x44");
diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
index 506a33b..a9566bc 100644
--- a/arch/s390/mm/extmem.c
+++ b/arch/s390/mm/extmem.c
@@ -143,7 +143,7 @@ dcss_diag (__u8 func, void *parameter,
 	rx = (unsigned long) parameter;
 	ry = (unsigned long) func;
 	__asm__ __volatile__(
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 		"   sam31\n" // switch to 31 bit
 		"   diag    %0,%1,0x64\n"
 		"   sam64\n" // switch back to 64 bit
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index fb2607c..81ade40 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -31,17 +31,17 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 #define __FAIL_ADDR_MASK 0x7ffff000
 #define __FIXUP_MASK 0x7fffffff
 #define __SUBCODE_MASK 0x0200
 #define __PF_RES_FIELD 0ULL
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 #define __FAIL_ADDR_MASK -4096L
 #define __FIXUP_MASK ~0L
 #define __SUBCODE_MASK 0x0600
 #define __PF_RES_FIELD 0x8000000000000000ULL
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
 #ifdef CONFIG_SYSCTL
 extern int sysctl_userprocess_debug;
@@ -393,11 +393,11 @@ int pfault_init(void)
 		"2:\n"
 		".section __ex_table,\"a\"\n"
 		"   .align 4\n"
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 		"   .long  0b,1b\n"
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 		"   .quad  0b,1b\n"
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 		".previous"
                 : "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc" );
         __ctl_set_bit(0, 9);
@@ -417,11 +417,11 @@ void pfault_fini(void)
 		"0:\n"
 		".section __ex_table,\"a\"\n"
 		"   .align 4\n"
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 		"   .long  0b,0b\n"
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 		"   .quad  0b,0b\n"
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 		".previous"
 		: : "a" (&refbk), "m" (refbk) : "cc" );
 }
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 6ec5cd9..df95338 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -44,7 +44,7 @@ void diag10(unsigned long addr)
 {
         if (addr >= 0x7ff00000)
                 return;
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
         asm volatile (
 		"   sam31\n"
 		"   diag %0,%0,0x10\n"
@@ -106,7 +106,7 @@ extern unsigned long __initdata zholes_size[];
  * paging_init() sets up the page tables
  */
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 void __init paging_init(void)
 {
         pgd_t * pg_dir;
@@ -175,7 +175,7 @@ void __init paging_init(void)
         return;
 }
 
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 void __init paging_init(void)
 {
         pgd_t * pg_dir;
@@ -256,7 +256,7 @@ void __init paging_init(void)
 
         return;
 }
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
 void __init mem_init(void)
 {
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index fb187e5..356257c 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -50,7 +50,7 @@ static inline unsigned long mmap_base(void)
 
 static inline int mmap_is_legacy(void)
 {
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 	/*
 	 * Force standard allocation for 64 bit programs.
 	 */
diff --git a/block/Kconfig b/block/Kconfig
index eb48edb..377f6dd 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -5,7 +5,7 @@
 #for instance.
 config LBD
 	bool "Support for Large Block Devices"
-	depends on X86 || (MIPS && 32BIT) || PPC32 || ARCH_S390_31 || SUPERH || UML
+	depends on X86 || (MIPS && 32BIT) || PPC32 || (S390 && !64BIT) || SUPERH || UML
 	help
 	  Say Y here if you want to attach large (bigger than 2TB) discs to
 	  your machine, or if you want to have a raid or loopback device
diff --git a/crypto/Kconfig b/crypto/Kconfig
index c696f7a..52e1d41 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -42,7 +42,7 @@ config CRYPTO_SHA1
 
 config CRYPTO_SHA1_S390
 	tristate "SHA1 digest algorithm (s390)"
-	depends on CRYPTO && ARCH_S390
+	depends on CRYPTO && S390
 	help
 	  This is the s390 hardware accelerated implementation of the
 	  SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2).
@@ -58,7 +58,7 @@ config CRYPTO_SHA256
 
 config CRYPTO_SHA256_S390
 	tristate "SHA256 digest algorithm (s390)"
-	depends on CRYPTO && ARCH_S390
+	depends on CRYPTO && S390
 	help
 	  This is the s390 hardware accelerated implementation of the
 	  SHA256 secure hash standard (DFIPS 180-2).
@@ -111,7 +111,7 @@ config CRYPTO_DES
 
 config CRYPTO_DES_S390
 	tristate "DES and Triple DES cipher algorithms (s390)"
-	depends on CRYPTO && ARCH_S390
+	depends on CRYPTO && S390
 	help
 	  DES cipher algorithm (FIPS 46-2), and Triple DES EDE (FIPS 46-3).
 
@@ -217,7 +217,7 @@ config CRYPTO_AES_X86_64
 
 config CRYPTO_AES_S390
 	tristate "AES cipher algorithms (s390)"
-	depends on CRYPTO && ARCH_S390
+	depends on CRYPTO && S390
 	help
 	  This is the s390 hardware accelerated implementation of the
 	  AES cipher algorithms (FIPS-197). AES uses the Rijndael
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 84e68cd..5ebd06b 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -985,7 +985,7 @@ config HPET_MMAP
 
 config HANGCHECK_TIMER
 	tristate "Hangcheck timer"
-	depends on X86 || IA64 || PPC64 || ARCH_S390
+	depends on X86 || IA64 || PPC64 || S390
 	help
 	  The hangcheck-timer module detects when the system has gone
 	  out to lunch past a certain margin.  It can reboot the system
diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c
index 66e53dd..40a67c8 100644
--- a/drivers/char/hangcheck-timer.c
+++ b/drivers/char/hangcheck-timer.c
@@ -120,7 +120,7 @@ __setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks);
 #if defined(CONFIG_X86)
 # define HAVE_MONOTONIC
 # define TIMER_FREQ 1000000000ULL
-#elif defined(CONFIG_ARCH_S390)
+#elif defined(CONFIG_S390)
 /* FA240000 is 1 Second in the IBM time universe (Page 4-38 Principles of Op for zSeries */
 # define TIMER_FREQ 0xFA240000ULL
 #elif defined(CONFIG_IA64)
diff --git a/drivers/char/watchdog/Kconfig b/drivers/char/watchdog/Kconfig
index 344001b..a654479 100644
--- a/drivers/char/watchdog/Kconfig
+++ b/drivers/char/watchdog/Kconfig
@@ -438,7 +438,7 @@ config INDYDOG
 
 config ZVM_WATCHDOG
 	tristate "z/VM Watchdog Timer"
-	depends on WATCHDOG && ARCH_S390
+	depends on WATCHDOG && S390
 	help
 	  IBM s/390 and zSeries machines running under z/VM 5.1 or later
 	  provide a virtual watchdog timer to their guest that cause a
diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index 9f2352b..a1e660e 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -157,7 +157,7 @@ struct input_event_compat {
 #  define COMPAT_TEST test_thread_flag(TIF_IA32)
 #elif defined(CONFIG_IA64)
 #  define COMPAT_TEST IS_IA32_PROCESS(ia64_task_regs(current))
-#elif defined(CONFIG_ARCH_S390)
+#elif defined(CONFIG_S390)
 #  define COMPAT_TEST test_thread_flag(TIF_31BIT)
 #elif defined(CONFIG_MIPS)
 #  define COMPAT_TEST (current->thread.mflags & MF_32BIT_ADDR)
diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index c782a63..fa39b94 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -6,7 +6,7 @@ menu "PHY device support"
 
 config PHYLIB
 	tristate "PHY Device support and infrastructure"
-	depends on NET_ETHERNET && (BROKEN || !ARCH_S390)
+	depends on NET_ETHERNET && (BROKEN || !S390)
 	help
 	  Ethernet controllers are usually attached to PHY
 	  devices.  This option provides infrastructure for
diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig
index 6e7d7b0..6f50cc9 100644
--- a/drivers/s390/block/Kconfig
+++ b/drivers/s390/block/Kconfig
@@ -1,11 +1,11 @@
-if ARCH_S390
+if S390
 
 comment "S/390 block device drivers"
-	depends on ARCH_S390
+	depends on S390
 
 config BLK_DEV_XPRAM
 	tristate "XPRAM disk support"
-	depends on ARCH_S390
+	depends on S390
 	help
 	  Select this option if you want to use your expanded storage on S/390
 	  or zSeries as a disk.  This is useful as a _fast_ swap device if you
@@ -49,7 +49,7 @@ config DASD_FBA
 
 config DASD_DIAG
 	tristate "Support for DIAG access to Disks"
-	depends on DASD && ( ARCH_S390X = 'n' || EXPERIMENTAL)
+	depends on DASD && ( 64BIT = 'n' || EXPERIMENTAL)
 	help
 	  Select this option if you want to use Diagnose250 command to access
 	  Disks under VM.  If you are not running under VM or unsure what it is,
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 1141a59..041e1a6 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -604,7 +604,7 @@ dasd_smalloc_request(char *magic, int cplength, int datasize,
 void
 dasd_kfree_request(struct dasd_ccw_req * cqr, struct dasd_device * device)
 {
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 	struct ccw1 *ccw;
 
 	/* Clear any idals used for the request. */
diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c
index a33d406..ba80fde 100644
--- a/drivers/s390/block/dasd_diag.c
+++ b/drivers/s390/block/dasd_diag.c
@@ -75,7 +75,7 @@ dia250(void *iob, int cmd)
 	int rc;
 
 	__asm__ __volatile__(
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 		"	lghi	%0,3\n"
 		"	lgr	0,%3\n"
 		"	diag	0,%2,0x250\n"
diff --git a/drivers/s390/block/dasd_diag.h b/drivers/s390/block/dasd_diag.h
index 37edf6e..a4f80bd 100644
--- a/drivers/s390/block/dasd_diag.h
+++ b/drivers/s390/block/dasd_diag.h
@@ -45,7 +45,7 @@ struct dasd_diag_characteristics {
 } __attribute__ ((packed, aligned(4)));
 
 
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 #define DASD_DIAG_FLAGA_DEFAULT		DASD_DIAG_FLAGA_FORMAT_64BIT
 
 typedef u64 blocknum_t;
@@ -86,7 +86,7 @@ struct dasd_diag_rw_io {
 	struct dasd_diag_bio *bio_list;
 	u8  spare4[8];
 } __attribute__ ((packed, aligned(8)));
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 #define DASD_DIAG_FLAGA_DEFAULT		0x0
 
 typedef u32 blocknum_t;
@@ -125,4 +125,4 @@ struct dasd_diag_rw_io {
 	u32 interrupt_params;
 	u8 spare3[20];
 } __attribute__ ((packed, aligned(8)));
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index efc4cf6..96eb482 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -1041,7 +1041,7 @@ dasd_eckd_build_cp(struct dasd_device * device, struct request *req)
 				/* Eckd can only do full blocks. */
 				return ERR_PTR(-EINVAL);
 			count += bv->bv_len >> (device->s2b_shift + 9);
-#if defined(CONFIG_ARCH_S390X)
+#if defined(CONFIG_64BIT)
 			if (idal_is_needed (page_address(bv->bv_page),
 					    bv->bv_len))
 				cidaw += bv->bv_len >> (device->s2b_shift + 9);
diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c
index 9bac8d8..8ec75dc 100644
--- a/drivers/s390/block/dasd_fba.c
+++ b/drivers/s390/block/dasd_fba.c
@@ -271,7 +271,7 @@ dasd_fba_build_cp(struct dasd_device * device, struct request *req)
 				/* Fba can only do full blocks. */
 				return ERR_PTR(-EINVAL);
 			count += bv->bv_len >> (device->s2b_shift + 9);
-#if defined(CONFIG_ARCH_S390X)
+#if defined(CONFIG_64BIT)
 			if (idal_is_needed (page_address(bv->bv_page),
 					    bv->bv_len))
 				cidaw += bv->bv_len / blksize;
diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c
index d428c90..bf3a67c 100644
--- a/drivers/s390/block/xpram.c
+++ b/drivers/s390/block/xpram.c
@@ -160,7 +160,7 @@ static int xpram_page_in (unsigned long page_addr, unsigned int xpage_index)
                 "0: ipm   %0\n"
 		"   srl   %0,28\n"
 		"1:\n"
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 		".section __ex_table,\"a\"\n"
 		"   .align 4\n"
 		"   .long  0b,1b\n"
@@ -208,7 +208,7 @@ static long xpram_page_out (unsigned long page_addr, unsigned int xpage_index)
                 "0: ipm   %0\n"
 		"   srl   %0,28\n"
 		"1:\n"
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 		".section __ex_table,\"a\"\n"
 		"   .align 4\n"
 		"   .long  0b,1b\n"
diff --git a/drivers/s390/char/vmwatchdog.c b/drivers/s390/char/vmwatchdog.c
index 5473c23..5acc0ac 100644
--- a/drivers/s390/char/vmwatchdog.c
+++ b/drivers/s390/char/vmwatchdog.c
@@ -66,7 +66,7 @@ static int __diag288(enum vmwdt_func func, unsigned int timeout,
 	__cmdl = len;
 	err = 0;
 	asm volatile (
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
 		       "diag %2,%4,0x288\n"
 		"1:	\n"
 		".section .fixup,\"ax\"\n"
diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c
index 6f274f4..7376bc8 100644
--- a/drivers/s390/cio/cio.c
+++ b/drivers/s390/cio/cio.c
@@ -195,7 +195,7 @@ cio_start_key (struct subchannel *sch,	/* subchannel structure */
 	sch->orb.spnd = sch->options.suspend;
 	sch->orb.ssic = sch->options.suspend && sch->options.inter;
 	sch->orb.lpm = (lpm != 0) ? (lpm & sch->opm) : sch->lpm;
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 	/*
 	 * for 64 bit we always support 64 bit IDAWs with 4k page size only
 	 */
diff --git a/drivers/s390/cio/device_id.c b/drivers/s390/cio/device_id.c
index 3c77c3f..04ceba3 100644
--- a/drivers/s390/cio/device_id.c
+++ b/drivers/s390/cio/device_id.c
@@ -27,7 +27,7 @@
 /*
  * diag210 is used under VM to get information about a virtual device
  */
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 int
 diag210(struct diag210 * addr)
 {
diff --git a/drivers/s390/cio/ioasm.h b/drivers/s390/cio/ioasm.h
index 62b0e2a..95a9462 100644
--- a/drivers/s390/cio/ioasm.h
+++ b/drivers/s390/cio/ioasm.h
@@ -50,7 +50,7 @@ static inline int stsch_err(struct subchannel_id schid,
 		"0:  ipm  %0\n"
 		"    srl  %0,28\n"
 		"1:\n"
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 		".section __ex_table,\"a\"\n"
 		"   .align 8\n"
 		"   .quad 0b,1b\n"
@@ -95,7 +95,7 @@ static inline int msch_err(struct subchannel_id schid,
 		"0:  ipm  %0\n"
 		"    srl  %0,28\n"
 		"1:\n"
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 		".section __ex_table,\"a\"\n"
 		"   .align 8\n"
 		"   .quad 0b,1b\n"
diff --git a/drivers/s390/cio/qdio.c b/drivers/s390/cio/qdio.c
index 035c77a..30a836f 100644
--- a/drivers/s390/cio/qdio.c
+++ b/drivers/s390/cio/qdio.c
@@ -2394,7 +2394,7 @@ tiqdio_check_chsc_availability(void)
 	sprintf(dbf_text,"hydrati%1x", hydra_thinints);
 	QDIO_DBF_TEXT0(0,setup,dbf_text);
 
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 	/* Check for QEBSM support in general (bit 58). */
 	is_passthrough = css_general_characteristics.qebsm;
 #endif
diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h
index 43b840a..fa385e7 100644
--- a/drivers/s390/cio/qdio.h
+++ b/drivers/s390/cio/qdio.h
@@ -271,7 +271,7 @@ static inline int
 do_sqbs(unsigned long sch, unsigned char state, int queue,
        unsigned int *start, unsigned int *count)
 {
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
        register unsigned long _ccq asm ("0") = *count;
        register unsigned long _sch asm ("1") = sch;
        unsigned long _queuestart = ((unsigned long)queue << 32) | *start;
@@ -295,7 +295,7 @@ static inline int
 do_eqbs(unsigned long sch, unsigned char *state, int queue,
 	unsigned int *start, unsigned int *count)
 {
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 	register unsigned long _ccq asm ("0") = *count;
 	register unsigned long _sch asm ("1") = sch;
 	unsigned long _queuestart = ((unsigned long)queue << 32) | *start;
@@ -323,7 +323,7 @@ do_siga_sync(struct subchannel_id schid, unsigned int mask1, unsigned int mask2)
 {
 	int cc;
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 	asm volatile (
 		"lhi	0,2	\n\t"
 		"lr	1,%1	\n\t"
@@ -336,7 +336,7 @@ do_siga_sync(struct subchannel_id schid, unsigned int mask1, unsigned int mask2)
 		: "d" (schid), "d" (mask1), "d" (mask2)
 		: "cc", "0", "1", "2", "3"
 		);
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 	asm volatile (
 		"lghi	0,2	\n\t"
 		"llgfr	1,%1	\n\t"
@@ -349,7 +349,7 @@ do_siga_sync(struct subchannel_id schid, unsigned int mask1, unsigned int mask2)
 		: "d" (schid), "d" (mask1), "d" (mask2)
 		: "cc", "0", "1", "2", "3"
 		);
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 	return cc;
 }
 
@@ -358,7 +358,7 @@ do_siga_input(struct subchannel_id schid, unsigned int mask)
 {
 	int cc;
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 	asm volatile (
 		"lhi	0,1	\n\t"
 		"lr	1,%1	\n\t"
@@ -370,7 +370,7 @@ do_siga_input(struct subchannel_id schid, unsigned int mask)
 		: "d" (schid), "d" (mask)
 		: "cc", "0", "1", "2", "memory"
 		);
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 	asm volatile (
 		"lghi	0,1	\n\t"
 		"llgfr	1,%1	\n\t"
@@ -382,7 +382,7 @@ do_siga_input(struct subchannel_id schid, unsigned int mask)
 		: "d" (schid), "d" (mask)
 		: "cc", "0", "1", "2", "memory"
 		);
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 	
 	return cc;
 }
@@ -394,7 +394,7 @@ do_siga_output(unsigned long schid, unsigned long mask, __u32 *bb,
 	int cc;
 	__u32 busy_bit;
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 	asm volatile (
 		"lhi	0,0	\n\t"
 		"lr	1,%2	\n\t"
@@ -424,7 +424,7 @@ do_siga_output(unsigned long schid, unsigned long mask, __u32 *bb,
 		"i" (QDIO_SIGA_ERROR_ACCESS_EXCEPTION)
 		: "cc", "0", "1", "2", "memory"
 		);
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 	asm volatile (
         	"llgfr  0,%5    \n\t"
                 "lgr    1,%2    \n\t"
@@ -449,7 +449,7 @@ do_siga_output(unsigned long schid, unsigned long mask, __u32 *bb,
 		"i" (QDIO_SIGA_ERROR_ACCESS_EXCEPTION), "d" (fc)
 		: "cc", "0", "1", "2", "memory"
 		);
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 	
 	(*bb) = busy_bit;
 	return cc;
@@ -461,21 +461,21 @@ do_clear_global_summary(void)
 
 	unsigned long time;
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 	asm volatile (
 		"lhi	1,3	\n\t"
 		".insn	rre,0xb2650000,2,0	\n\t"
 		"lr	%0,3	\n\t"
 		: "=d" (time) : : "cc", "1", "2", "3"
 		);
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 	asm volatile (
 		"lghi	1,3	\n\t"
 		".insn	rre,0xb2650000,2,0	\n\t"
 		"lgr	%0,3	\n\t"
 		: "=d" (time) : : "cc", "1", "2", "3"
 		);
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 	
 	return time;
 }
@@ -542,11 +542,11 @@ struct qdio_perf_stats {
 
 #define MY_MODULE_STRING(x) #x
 
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 #define QDIO_GET_ADDR(x) ((__u32)(unsigned long)x)
-#else /* CONFIG_ARCH_S390X */
+#else /* CONFIG_64BIT */
 #define QDIO_GET_ADDR(x) ((__u32)(long)x)
-#endif /* CONFIG_ARCH_S390X */
+#endif /* CONFIG_64BIT */
 
 struct qdio_q {
 	volatile struct slsb slsb;
diff --git a/drivers/s390/crypto/z90hardware.c b/drivers/s390/crypto/z90hardware.c
index 7c3ed52..d7f7494 100644
--- a/drivers/s390/crypto/z90hardware.c
+++ b/drivers/s390/crypto/z90hardware.c
@@ -785,7 +785,7 @@ testq(int q_nr, int *q_depth, int *dev_type, struct ap_status_word *stat)
 	int ccode;
 
 	asm volatile
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
 	("	llgfr	0,%4		\n"
 	 "	slgr	1,1		\n"
 	 "	lgr	2,1		\n"
@@ -855,7 +855,7 @@ resetq(int q_nr, struct ap_status_word *stat_p)
 	int ccode;
 
 	asm volatile
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
 	("	llgfr	0,%2		\n"
 	 "	lghi	1,1		\n"
 	 "	sll	1,24		\n"
@@ -921,7 +921,7 @@ sen(int msg_len, unsigned char *msg_ext, struct ap_status_word *stat)
 	int ccode;
 
 	asm volatile
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
 	("	lgr	6,%3		\n"
 	 "	llgfr	7,%2		\n"
 	 "	llgt	0,0(6)		\n"
@@ -1000,7 +1000,7 @@ rec(int q_nr, int buff_l, unsigned char *rsp, unsigned char *id,
 	int ccode;
 
 	asm volatile
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
 	("	llgfr	0,%2		\n"
 	 "	lgr	3,%4		\n"
 	 "	lgr	6,%3		\n"
diff --git a/drivers/s390/net/Kconfig b/drivers/s390/net/Kconfig
index a7efc39..5488547 100644
--- a/drivers/s390/net/Kconfig
+++ b/drivers/s390/net/Kconfig
@@ -1,5 +1,5 @@
 menu "S/390 network device drivers"
-	depends on NETDEVICES && ARCH_S390
+	depends on NETDEVICES && S390
 
 config LCS
 	tristate "Lan Channel Station Interface"
diff --git a/drivers/s390/net/claw.c b/drivers/s390/net/claw.c
index 6b63d21..e70af7f 100644
--- a/drivers/s390/net/claw.c
+++ b/drivers/s390/net/claw.c
@@ -1603,7 +1603,7 @@ dumpit(char* buf, int len)
         __u32      ct, sw, rm, dup;
         char       *ptr, *rptr;
         char       tbuf[82], tdup[82];
-#if (CONFIG_ARCH_S390X)
+#if (CONFIG_64BIT)
         char       addr[22];
 #else
         char       addr[12];
@@ -1619,7 +1619,7 @@ dumpit(char* buf, int len)
         dup = 0;
         for ( ct=0; ct < len; ct++, ptr++, rptr++ )  {
                 if (sw == 0) {
-#if (CONFIG_ARCH_S390X)
+#if (CONFIG_64BIT)
                         sprintf(addr, "%16.16lX",(unsigned long)rptr);
 #else
                         sprintf(addr, "%8.8X",(__u32)rptr);
@@ -1634,7 +1634,7 @@ dumpit(char* buf, int len)
                 if (sw == 8) {
                         strcat(bhex, "  ");
                 }
-#if (CONFIG_ARCH_S390X)
+#if (CONFIG_64BIT)
                 sprintf(tbuf,"%2.2lX", (unsigned long)*ptr);
 #else
                 sprintf(tbuf,"%2.2X", (__u32)*ptr);
diff --git a/drivers/s390/s390mach.c b/drivers/s390/s390mach.c
index 7dad597..3bf4666 100644
--- a/drivers/s390/s390mach.c
+++ b/drivers/s390/s390mach.c
@@ -246,7 +246,7 @@ s390_revalidate_registers(struct mci *mci)
 		 */
 		kill_task = 1;
 
-#ifndef __s390x__
+#ifndef CONFIG_64BIT
 	asm volatile("ld 0,0(%0)\n"
 		     "ld 2,8(%0)\n"
 		     "ld 4,16(%0)\n"
@@ -255,7 +255,7 @@ s390_revalidate_registers(struct mci *mci)
 #endif
 
 	if (MACHINE_HAS_IEEE) {
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
 		fpt_save_area = &S390_lowcore.floating_pt_save_area;
 		fpt_creg_save_area = &S390_lowcore.fpt_creg_save_area;
 #else
@@ -314,7 +314,7 @@ s390_revalidate_registers(struct mci *mci)
 		 */
 		s390_handle_damage("invalid control registers.");
 	else
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
 		asm volatile("lctlg 0,15,0(%0)"
 			     : : "a" (&S390_lowcore.cregs_save_area));
 #else
@@ -327,7 +327,7 @@ s390_revalidate_registers(struct mci *mci)
 	 * can't write something sensible into that register.
 	 */
 
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
 	/*
 	 * See if we can revalidate the TOD programmable register with its
 	 * old contents (should be zero) otherwise set it to zero.
@@ -384,7 +384,7 @@ s390_do_machine_check(struct pt_regs *regs)
 		if (mci->b) {
 			/* Processing backup -> verify if we can survive this */
 			u64 z_mcic, o_mcic, t_mcic;
-#ifdef __s390x__
+#ifdef CONFIG_64BIT
 			z_mcic = (1ULL<<63 | 1ULL<<59 | 1ULL<<29);
 			o_mcic = (1ULL<<43 | 1ULL<<42 | 1ULL<<41 | 1ULL<<40 |
 				  1ULL<<36 | 1ULL<<35 | 1ULL<<34 | 1ULL<<32 |
diff --git a/drivers/s390/sysinfo.c b/drivers/s390/sysinfo.c
index 87c2db1..66da840 100644
--- a/drivers/s390/sysinfo.c
+++ b/drivers/s390/sysinfo.c
@@ -106,7 +106,7 @@ static inline int stsi (void *sysinfo,
 {
 	int cc, retv;
 
-#ifndef CONFIG_ARCH_S390X
+#ifndef CONFIG_64BIT
 	__asm__ __volatile__ (	"lr\t0,%2\n"
 				"\tlr\t1,%3\n"
 				"\tstsi\t0(%4)\n"
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 4c42065..9e8254f 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -1815,7 +1815,7 @@ config SCSI_SUNESP
 
 config ZFCP
 	tristate "FCP host bus adapter driver for IBM eServer zSeries"
-	depends on ARCH_S390 && QDIO && SCSI
+	depends on S390 && QDIO && SCSI
 	select SCSI_FC_ATTRS
 	help
           If you want to access SCSI devices attached to your IBM eServer
diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig
index 656bc43..e227a04 100644
--- a/fs/partitions/Kconfig
+++ b/fs/partitions/Kconfig
@@ -85,7 +85,7 @@ config ATARI_PARTITION
 
 config IBM_PARTITION
 	bool "IBM disk label and partition support"
-	depends on PARTITION_ADVANCED && ARCH_S390
+	depends on PARTITION_ADVANCED && S390
 	help
 	  Say Y here if you would like to be able to read the hard disk
 	  partition table format used by IBM DASD disks operating under CMS.
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 3e1239e..5e9251f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -308,7 +308,7 @@ int proc_pid_status(struct task_struct *task, char * buffer)
 	buffer = task_sig(task, buffer);
 	buffer = task_cap(task, buffer);
 	buffer = cpuset_task_status_allowed(task, buffer);
-#if defined(CONFIG_ARCH_S390)
+#if defined(CONFIG_S390)
 	buffer = task_show_regs(task, buffer);
 #endif
 	return buffer - orig;
diff --git a/include/asm-s390/unistd.h b/include/asm-s390/unistd.h
index f97d926..2861cdc 100644
--- a/include/asm-s390/unistd.h
+++ b/include/asm-s390/unistd.h
@@ -539,7 +539,7 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4,    \
 #define __ARCH_WANT_SYS_SIGPENDING
 #define __ARCH_WANT_SYS_SIGPROCMASK
 #define __ARCH_WANT_SYS_RT_SIGACTION
-# ifdef CONFIG_ARCH_S390_31
+# ifndef CONFIG_64BIT
 #   define __ARCH_WANT_STAT64
 #   define __ARCH_WANT_SYS_TIME
 # endif
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 60f8bc7..6c5d4c8 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -12,7 +12,7 @@
 #include <linux/config.h>
 #include <linux/smp.h>
 
-#if !defined(CONFIG_ARCH_S390)
+#if !defined(CONFIG_S390)
 
 #include <linux/linkage.h>
 #include <linux/cache.h>
diff --git a/init/Kconfig b/init/Kconfig
index 24e0f7c..ba42f37 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -189,7 +189,7 @@ config AUDIT
 
 config AUDITSYSCALL
 	bool "Enable system-call auditing support"
-	depends on AUDIT && (X86 || PPC || PPC64 || ARCH_S390 || IA64 || UML || SPARC64)
+	depends on AUDIT && (X86 || PPC || PPC64 || S390 || IA64 || UML || SPARC64)
 	default y if SECURITY_SELINUX
 	help
 	  Enable low-overhead system-call auditing infrastructure that
diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c
index c10b08a..c2683fc 100644
--- a/init/do_mounts_rd.c
+++ b/init/do_mounts_rd.c
@@ -145,7 +145,7 @@ int __init rd_load_image(char *from)
 	int nblocks, i, disk;
 	char *buf = NULL;
 	unsigned short rotate = 0;
-#if !defined(CONFIG_ARCH_S390) && !defined(CONFIG_PPC_ISERIES)
+#if !defined(CONFIG_S390) && !defined(CONFIG_PPC_ISERIES)
 	char rotator[4] = { '|' , '/' , '-' , '\\' };
 #endif
 
@@ -237,7 +237,7 @@ int __init rd_load_image(char *from)
 		}
 		sys_read(in_fd, buf, BLOCK_SIZE);
 		sys_write(out_fd, buf, BLOCK_SIZE);
-#if !defined(CONFIG_ARCH_S390) && !defined(CONFIG_PPC_ISERIES)
+#if !defined(CONFIG_S390) && !defined(CONFIG_PPC_ISERIES)
 		if (!(i % 16)) {
 			printk("%c\b", rotator[rotate & 0x3]);
 			rotate++;
diff --git a/kernel/panic.c b/kernel/panic.c
index aabc5f8..c5c4ab2 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -60,7 +60,7 @@ NORET_TYPE void panic(const char * fmt, ...)
 	long i;
 	static char buf[1024];
 	va_list args;
-#if defined(CONFIG_ARCH_S390)
+#if defined(CONFIG_S390)
         unsigned long caller = (unsigned long) __builtin_return_address(0);
 #endif
 
@@ -125,7 +125,7 @@ NORET_TYPE void panic(const char * fmt, ...)
 		printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n");
 	}
 #endif
-#if defined(CONFIG_ARCH_S390)
+#if defined(CONFIG_S390)
         disabled_wait(caller);
 #endif
 	local_irq_enable();
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 345f4a1..a85047b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -108,7 +108,7 @@ extern int pwrsw_enabled;
 extern int unaligned_enabled;
 #endif
 
-#ifdef CONFIG_ARCH_S390
+#ifdef CONFIG_S390
 #ifdef CONFIG_MATHEMU
 extern int sysctl_ieee_emulation_warnings;
 #endif
@@ -542,7 +542,7 @@ static ctl_table kern_table[] = {
 		.extra1		= &minolduid,
 		.extra2		= &maxolduid,
 	},
-#ifdef CONFIG_ARCH_S390
+#ifdef CONFIG_S390
 #ifdef CONFIG_MATHEMU
 	{
 		.ctl_name	= KERN_IEEE_EMULATION_WARNINGS,
@@ -644,7 +644,7 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-#if defined(CONFIG_ARCH_S390)
+#if defined(CONFIG_S390)
 	{
 		.ctl_name	= KERN_SPIN_RETRY,
 		.procname	= "spin_retry",
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 1cedc23..80598cf 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -32,7 +32,7 @@ config MAGIC_SYSRQ
 config LOG_BUF_SHIFT
 	int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" if DEBUG_KERNEL
 	range 12 21
-	default 17 if ARCH_S390
+	default 17 if S390
 	default 16 if X86_NUMAQ || IA64
 	default 15 if SMP
 	default 14
-- 
cgit v1.1


From a1a5ea70a6e9db6332b27fe2d96666e17aa1436b Mon Sep 17 00:00:00 2001
From: Markus Lidel <Markus.Lidel@shadowconnect.com>
Date: Fri, 6 Jan 2006 00:19:29 -0800
Subject: [PATCH] I2O: changed I2O API to create I2O messages in kernel memory

Changed the I2O API to create I2O messages in kernel memory first and then
transfer them over the PCI bus in one go, instead of sending each quad-word
over the PCI bus individually.

Signed-off-by: Markus Lidel <Markus.Lidel@shadowconnect.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/message/i2o/bus-osm.c    |   21 +-
 drivers/message/i2o/device.c     |   51 +-
 drivers/message/i2o/exec-osm.c   |   93 +-
 drivers/message/i2o/i2o_block.c  |  157 +--
 drivers/message/i2o/i2o_config.c |  169 ++--
 drivers/message/i2o/i2o_scsi.c   |   50 +-
 drivers/message/i2o/iop.c        |  296 +++---
 drivers/message/i2o/pci.c        |    1 +
 include/linux/i2o.h              | 1947 ++++++++++++++++++++------------------
 9 files changed, 1450 insertions(+), 1335 deletions(-)

diff --git a/drivers/message/i2o/bus-osm.c b/drivers/message/i2o/bus-osm.c
index 151b228..ce039d3 100644
--- a/drivers/message/i2o/bus-osm.c
+++ b/drivers/message/i2o/bus-osm.c
@@ -39,18 +39,18 @@ static struct i2o_class_id i2o_bus_class_id[] = {
  */
 static int i2o_bus_scan(struct i2o_device *dev)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 
-	m = i2o_msg_get_wait(dev->iop, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
+	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
 		return -ETIMEDOUT;
 
-	writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_BUS_SCAN << 24 | HOST_TID << 12 | dev->lct_data.tid,
-	       &msg->u.head[1]);
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_BUS_SCAN << 24 | HOST_TID << 12 | dev->lct_data.
+			tid);
 
-	return i2o_msg_post_wait(dev->iop, m, 60);
+	return i2o_msg_post_wait(dev->iop, msg, 60);
 };
 
 /**
@@ -59,8 +59,9 @@ static int i2o_bus_scan(struct i2o_device *dev)
  *
  *	Returns count.
  */
-static ssize_t i2o_bus_store_scan(struct device *d, struct device_attribute *attr, const char *buf,
-				  size_t count)
+static ssize_t i2o_bus_store_scan(struct device *d,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
 {
 	struct i2o_device *i2o_dev = to_i2o_device(d);
 	int rc;
diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c
index 8eb50cd..002ae0e 100644
--- a/drivers/message/i2o/device.c
+++ b/drivers/message/i2o/device.c
@@ -35,18 +35,18 @@
 static inline int i2o_device_issue_claim(struct i2o_device *dev, u32 cmd,
 					 u32 type)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 
-	m = i2o_msg_get_wait(dev->iop, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(cmd << 24 | HOST_TID << 12 | dev->lct_data.tid, &msg->u.head[1]);
-	writel(type, &msg->body[0]);
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+		cpu_to_le32(cmd << 24 | HOST_TID << 12 | dev->lct_data.tid);
+	msg->body[0] = cpu_to_le32(type);
 
-	return i2o_msg_post_wait(dev->iop, m, 60);
+	return i2o_msg_post_wait(dev->iop, msg, 60);
 }
 
 /**
@@ -419,10 +419,9 @@ int i2o_device_parse_lct(struct i2o_controller *c)
  *	ResultCount, ErrorInfoSize, BlockStatus and BlockSize.
  */
 int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist,
-			  int oplen, void *reslist, int reslen)
+		   int oplen, void *reslist, int reslen)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	u32 *res32 = (u32 *) reslist;
 	u32 *restmp = (u32 *) reslist;
 	int len = 0;
@@ -437,26 +436,28 @@ int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist,
 	if (i2o_dma_alloc(dev, &res, reslen, GFP_KERNEL))
 		return -ENOMEM;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY) {
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg)) {
 		i2o_dma_free(dev, &res);
-		return -ETIMEDOUT;
+		return PTR_ERR(msg);
 	}
 
 	i = 0;
-	writel(cmd << 24 | HOST_TID << 12 | i2o_dev->lct_data.tid,
-	       &msg->u.head[1]);
-	writel(0, &msg->body[i++]);
-	writel(0x4C000000 | oplen, &msg->body[i++]);	/* OperationList */
-	memcpy_toio(&msg->body[i], oplist, oplen);
+	msg->u.head[1] =
+	    cpu_to_le32(cmd << 24 | HOST_TID << 12 | i2o_dev->lct_data.tid);
+	msg->body[i++] = cpu_to_le32(0x00000000);
+	msg->body[i++] = cpu_to_le32(0x4C000000 | oplen);	/* OperationList */
+	memcpy(&msg->body[i], oplist, oplen);
+
 	i += (oplen / 4 + (oplen % 4 ? 1 : 0));
-	writel(0xD0000000 | res.len, &msg->body[i++]);	/* ResultList */
-	writel(res.phys, &msg->body[i++]);
+	msg->body[i++] = cpu_to_le32(0xD0000000 | res.len);	/* ResultList */
+	msg->body[i++] = cpu_to_le32(res.phys);
 
-	writel(I2O_MESSAGE_SIZE(i + sizeof(struct i2o_message) / 4) |
-	       SGL_OFFSET_5, &msg->u.head[0]);
+	msg->u.head[0] =
+	    cpu_to_le32(I2O_MESSAGE_SIZE(i + sizeof(struct i2o_message) / 4) |
+			SGL_OFFSET_5);
 
-	rc = i2o_msg_post_wait_mem(c, m, 10, &res);
+	rc = i2o_msg_post_wait_mem(c, msg, 10, &res);
 
 	/* This only looks like a memory leak - don't "fix" it. */
 	if (rc == -ETIMEDOUT)
diff --git a/drivers/message/i2o/exec-osm.c b/drivers/message/i2o/exec-osm.c
index 9c339a2..71a0933 100644
--- a/drivers/message/i2o/exec-osm.c
+++ b/drivers/message/i2o/exec-osm.c
@@ -114,13 +114,12 @@ static void i2o_exec_wait_free(struct i2o_exec_wait *wait)
  *	Returns 0 on success, negative error code on timeout or positive error
  *	code from reply.
  */
-int i2o_msg_post_wait_mem(struct i2o_controller *c, u32 m, unsigned long
-			  timeout, struct i2o_dma *dma)
+int i2o_msg_post_wait_mem(struct i2o_controller *c, struct i2o_message *msg,
+			  unsigned long timeout, struct i2o_dma *dma)
 {
 	DECLARE_WAIT_QUEUE_HEAD(wq);
 	struct i2o_exec_wait *wait;
 	static u32 tcntxt = 0x80000000;
-	struct i2o_message __iomem *msg = i2o_msg_in_to_virt(c, m);
 	int rc = 0;
 
 	wait = i2o_exec_wait_alloc();
@@ -138,15 +137,15 @@ int i2o_msg_post_wait_mem(struct i2o_controller *c, u32 m, unsigned long
 	 * We will only use transaction contexts >= 0x80000000 for POST WAIT,
 	 * so we could find a POST WAIT reply easier in the reply handler.
 	 */
-	writel(i2o_exec_driver.context, &msg->u.s.icntxt);
+	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
 	wait->tcntxt = tcntxt++;
-	writel(wait->tcntxt, &msg->u.s.tcntxt);
+	msg->u.s.tcntxt = cpu_to_le32(wait->tcntxt);
 
 	/*
 	 * Post the message to the controller. At some point later it will
 	 * return. If we time out before it returns then complete will be zero.
 	 */
-	i2o_msg_post(c, m);
+	i2o_msg_post(c, msg);
 
 	if (!wait->complete) {
 		wait->wq = &wq;
@@ -266,7 +265,8 @@ static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m,
  *
  *	Returns number of bytes printed into buffer.
  */
-static ssize_t i2o_exec_show_vendor_id(struct device *d, struct device_attribute *attr, char *buf)
+static ssize_t i2o_exec_show_vendor_id(struct device *d,
+				       struct device_attribute *attr, char *buf)
 {
 	struct i2o_device *dev = to_i2o_device(d);
 	u16 id;
@@ -286,7 +286,9 @@ static ssize_t i2o_exec_show_vendor_id(struct device *d, struct device_attribute
  *
  *	Returns number of bytes printed into buffer.
  */
-static ssize_t i2o_exec_show_product_id(struct device *d, struct device_attribute *attr, char *buf)
+static ssize_t i2o_exec_show_product_id(struct device *d,
+					struct device_attribute *attr,
+					char *buf)
 {
 	struct i2o_device *dev = to_i2o_device(d);
 	u16 id;
@@ -385,23 +387,22 @@ static int i2o_exec_reply(struct i2o_controller *c, u32 m,
 	u32 context;
 
 	if (le32_to_cpu(msg->u.head[0]) & MSG_FAIL) {
+		struct i2o_message __iomem *pmsg;
+		u32 pm;
+
 		/*
 		 * If Fail bit is set we must take the transaction context of
 		 * the preserved message to find the right request again.
 		 */
-		struct i2o_message __iomem *pmsg;
-		u32 pm;
 
 		pm = le32_to_cpu(msg->body[3]);
-
 		pmsg = i2o_msg_in_to_virt(c, pm);
+		context = readl(&pmsg->u.s.tcntxt);
 
 		i2o_report_status(KERN_INFO, "i2o_core", msg);
 
-		context = readl(&pmsg->u.s.tcntxt);
-
 		/* Release the preserved msg */
-		i2o_msg_nop(c, pm);
+		i2o_msg_nop_mfa(c, pm);
 	} else
 		context = le32_to_cpu(msg->u.s.tcntxt);
 
@@ -462,25 +463,26 @@ static void i2o_exec_event(struct i2o_event *evt)
  */
 int i2o_exec_lct_get(struct i2o_controller *c)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	int i = 0;
 	int rc = -EAGAIN;
 
 	for (i = 1; i <= I2O_LCT_GET_TRIES; i++) {
-		m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-		if (m == I2O_QUEUE_EMPTY)
-			return -ETIMEDOUT;
-
-		writel(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6, &msg->u.head[0]);
-		writel(I2O_CMD_LCT_NOTIFY << 24 | HOST_TID << 12 | ADAPTER_TID,
-		       &msg->u.head[1]);
-		writel(0xffffffff, &msg->body[0]);
-		writel(0x00000000, &msg->body[1]);
-		writel(0xd0000000 | c->dlct.len, &msg->body[2]);
-		writel(c->dlct.phys, &msg->body[3]);
-
-		rc = i2o_msg_post_wait(c, m, I2O_TIMEOUT_LCT_GET);
+		msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+		if (IS_ERR(msg))
+			return PTR_ERR(msg);
+
+		msg->u.head[0] =
+		    cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6);
+		msg->u.head[1] =
+		    cpu_to_le32(I2O_CMD_LCT_NOTIFY << 24 | HOST_TID << 12 |
+				ADAPTER_TID);
+		msg->body[0] = cpu_to_le32(0xffffffff);
+		msg->body[1] = cpu_to_le32(0x00000000);
+		msg->body[2] = cpu_to_le32(0xd0000000 | c->dlct.len);
+		msg->body[3] = cpu_to_le32(c->dlct.phys);
+
+		rc = i2o_msg_post_wait(c, msg, I2O_TIMEOUT_LCT_GET);
 		if (rc < 0)
 			break;
 
@@ -506,29 +508,28 @@ static int i2o_exec_lct_notify(struct i2o_controller *c, u32 change_ind)
 {
 	i2o_status_block *sb = c->status_block.virt;
 	struct device *dev;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 
 	dev = &c->pdev->dev;
 
 	if (i2o_dma_realloc(dev, &c->dlct, sb->expected_lct_size, GFP_KERNEL))
 		return -ENOMEM;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
-
-	writel(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6, &msg->u.head[0]);
-	writel(I2O_CMD_LCT_NOTIFY << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
-	writel(i2o_exec_driver.context, &msg->u.s.icntxt);
-	writel(0, &msg->u.s.tcntxt);	/* FIXME */
-	writel(0xffffffff, &msg->body[0]);
-	writel(change_ind, &msg->body[1]);
-	writel(0xd0000000 | c->dlct.len, &msg->body[2]);
-	writel(c->dlct.phys, &msg->body[3]);
-
-	i2o_msg_post(c, m);
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6);
+	msg->u.head[1] = cpu_to_le32(I2O_CMD_LCT_NOTIFY << 24 | HOST_TID << 12 |
+				     ADAPTER_TID);
+	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
+	msg->u.s.tcntxt = cpu_to_le32(0x00000000);
+	msg->body[0] = cpu_to_le32(0xffffffff);
+	msg->body[1] = cpu_to_le32(change_ind);
+	msg->body[2] = cpu_to_le32(0xd0000000 | c->dlct.len);
+	msg->body[3] = cpu_to_le32(c->dlct.phys);
+
+	i2o_msg_post(c, msg);
 
 	return 0;
 };
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index f283b5b..2bd15c7 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -130,20 +130,20 @@ static int i2o_block_remove(struct device *dev)
  */
 static int i2o_block_device_flush(struct i2o_device *dev)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 
-	m = i2o_msg_get_wait(dev->iop, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_BLOCK_CFLUSH << 24 | HOST_TID << 12 | dev->lct_data.tid,
-	       &msg->u.head[1]);
-	writel(60 << 16, &msg->body[0]);
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_BLOCK_CFLUSH << 24 | HOST_TID << 12 | dev->
+			lct_data.tid);
+	msg->body[0] = cpu_to_le32(60 << 16);
 	osm_debug("Flushing...\n");
 
-	return i2o_msg_post_wait(dev->iop, m, 60);
+	return i2o_msg_post_wait(dev->iop, msg, 60);
 };
 
 /**
@@ -181,21 +181,21 @@ static int i2o_block_issue_flush(request_queue_t * queue, struct gendisk *disk,
  */
 static int i2o_block_device_mount(struct i2o_device *dev, u32 media_id)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
-
-	m = i2o_msg_get_wait(dev->iop, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
-
-	writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_BLOCK_MMOUNT << 24 | HOST_TID << 12 | dev->lct_data.tid,
-	       &msg->u.head[1]);
-	writel(-1, &msg->body[0]);
-	writel(0, &msg->body[1]);
+	struct i2o_message *msg;
+
+	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_BLOCK_MMOUNT << 24 | HOST_TID << 12 | dev->
+			lct_data.tid);
+	msg->body[0] = cpu_to_le32(-1);
+	msg->body[1] = cpu_to_le32(0x00000000);
 	osm_debug("Mounting...\n");
 
-	return i2o_msg_post_wait(dev->iop, m, 2);
+	return i2o_msg_post_wait(dev->iop, msg, 2);
 };
 
 /**
@@ -210,20 +210,20 @@ static int i2o_block_device_mount(struct i2o_device *dev, u32 media_id)
  */
 static int i2o_block_device_lock(struct i2o_device *dev, u32 media_id)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 
-	m = i2o_msg_get_wait(dev->iop, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_BLOCK_MLOCK << 24 | HOST_TID << 12 | dev->lct_data.tid,
-	       &msg->u.head[1]);
-	writel(-1, &msg->body[0]);
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_BLOCK_MLOCK << 24 | HOST_TID << 12 | dev->
+			lct_data.tid);
+	msg->body[0] = cpu_to_le32(-1);
 	osm_debug("Locking...\n");
 
-	return i2o_msg_post_wait(dev->iop, m, 2);
+	return i2o_msg_post_wait(dev->iop, msg, 2);
 };
 
 /**
@@ -238,20 +238,20 @@ static int i2o_block_device_lock(struct i2o_device *dev, u32 media_id)
  */
 static int i2o_block_device_unlock(struct i2o_device *dev, u32 media_id)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 
-	m = i2o_msg_get_wait(dev->iop, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_BLOCK_MUNLOCK << 24 | HOST_TID << 12 | dev->lct_data.tid,
-	       &msg->u.head[1]);
-	writel(media_id, &msg->body[0]);
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_BLOCK_MUNLOCK << 24 | HOST_TID << 12 | dev->
+			lct_data.tid);
+	msg->body[0] = cpu_to_le32(media_id);
 	osm_debug("Unlocking...\n");
 
-	return i2o_msg_post_wait(dev->iop, m, 2);
+	return i2o_msg_post_wait(dev->iop, msg, 2);
 };
 
 /**
@@ -267,21 +267,21 @@ static int i2o_block_device_power(struct i2o_block_device *dev, u8 op)
 {
 	struct i2o_device *i2o_dev = dev->i2o_dev;
 	struct i2o_controller *c = i2o_dev->iop;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	int rc;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_BLOCK_POWER << 24 | HOST_TID << 12 | i2o_dev->lct_data.
-	       tid, &msg->u.head[1]);
-	writel(op << 24, &msg->body[0]);
+	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_BLOCK_POWER << 24 | HOST_TID << 12 | i2o_dev->
+			lct_data.tid);
+	msg->body[0] = cpu_to_le32(op << 24);
 	osm_debug("Power...\n");
 
-	rc = i2o_msg_post_wait(c, m, 60);
+	rc = i2o_msg_post_wait(c, msg, 60);
 	if (!rc)
 		dev->power = op;
 
@@ -331,7 +331,7 @@ static inline void i2o_block_request_free(struct i2o_block_request *ireq)
  */
 static inline int i2o_block_sglist_alloc(struct i2o_controller *c,
 					 struct i2o_block_request *ireq,
-					 u32 __iomem ** mptr)
+					 u32 ** mptr)
 {
 	int nents;
 	enum dma_data_direction direction;
@@ -745,10 +745,9 @@ static int i2o_block_transfer(struct request *req)
 	struct i2o_block_device *dev = req->rq_disk->private_data;
 	struct i2o_controller *c;
 	int tid = dev->i2o_dev->lct_data.tid;
-	struct i2o_message __iomem *msg;
-	u32 __iomem *mptr;
+	struct i2o_message *msg;
+	u32 *mptr;
 	struct i2o_block_request *ireq = req->special;
-	u32 m;
 	u32 tcntxt;
 	u32 sgl_offset = SGL_OFFSET_8;
 	u32 ctl_flags = 0x00000000;
@@ -763,9 +762,9 @@ static int i2o_block_transfer(struct request *req)
 
 	c = dev->i2o_dev->iop;
 
-	m = i2o_msg_get(c, &msg);
-	if (m == I2O_QUEUE_EMPTY) {
-		rc = -EBUSY;
+	msg = i2o_msg_get(c);
+	if (IS_ERR(msg)) {
+		rc = PTR_ERR(msg);
 		goto exit;
 	}
 
@@ -775,8 +774,8 @@ static int i2o_block_transfer(struct request *req)
 		goto nop_msg;
 	}
 
-	writel(i2o_block_driver.context, &msg->u.s.icntxt);
-	writel(tcntxt, &msg->u.s.tcntxt);
+	msg->u.s.icntxt = cpu_to_le32(i2o_block_driver.context);
+	msg->u.s.tcntxt = cpu_to_le32(tcntxt);
 
 	mptr = &msg->body[0];
 
@@ -834,11 +833,11 @@ static int i2o_block_transfer(struct request *req)
 
 		sgl_offset = SGL_OFFSET_12;
 
-		writel(I2O_CMD_PRIVATE << 24 | HOST_TID << 12 | tid,
-		       &msg->u.head[1]);
+		msg->u.head[1] =
+		    cpu_to_le32(I2O_CMD_PRIVATE << 24 | HOST_TID << 12 | tid);
 
-		writel(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC, mptr++);
-		writel(tid, mptr++);
+		*mptr++ = cpu_to_le32(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC);
+		*mptr++ = cpu_to_le32(tid);
 
 		/*
 		 * ENABLE_DISCONNECT
@@ -853,22 +852,24 @@ static int i2o_block_transfer(struct request *req)
 			scsi_flags = 0xa0a0000a;
 		}
 
-		writel(scsi_flags, mptr++);
+		*mptr++ = cpu_to_le32(scsi_flags);
 
 		*((u32 *) & cmd[2]) = cpu_to_be32(req->sector * hwsec);
 		*((u16 *) & cmd[7]) = cpu_to_be16(req->nr_sectors * hwsec);
 
-		memcpy_toio(mptr, cmd, 10);
+		memcpy(mptr, cmd, 10);
 		mptr += 4;
-		writel(req->nr_sectors << KERNEL_SECTOR_SHIFT, mptr++);
+		*mptr++ = cpu_to_le32(req->nr_sectors << KERNEL_SECTOR_SHIFT);
 	} else
 #endif
 	{
-		writel(cmd | HOST_TID << 12 | tid, &msg->u.head[1]);
-		writel(ctl_flags, mptr++);
-		writel(req->nr_sectors << KERNEL_SECTOR_SHIFT, mptr++);
-		writel((u32) (req->sector << KERNEL_SECTOR_SHIFT), mptr++);
-		writel(req->sector >> (32 - KERNEL_SECTOR_SHIFT), mptr++);
+		msg->u.head[1] = cpu_to_le32(cmd | HOST_TID << 12 | tid);
+		*mptr++ = cpu_to_le32(ctl_flags);
+		*mptr++ = cpu_to_le32(req->nr_sectors << KERNEL_SECTOR_SHIFT);
+		*mptr++ =
+		    cpu_to_le32((u32) (req->sector << KERNEL_SECTOR_SHIFT));
+		*mptr++ =
+		    cpu_to_le32(req->sector >> (32 - KERNEL_SECTOR_SHIFT));
 	}
 
 	if (!i2o_block_sglist_alloc(c, ireq, &mptr)) {
@@ -876,13 +877,13 @@ static int i2o_block_transfer(struct request *req)
 		goto context_remove;
 	}
 
-	writel(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) |
-	       sgl_offset, &msg->u.head[0]);
+	msg->u.head[0] =
+	    cpu_to_le32(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | sgl_offset);
 
 	list_add_tail(&ireq->queue, &dev->open_queue);
 	dev->open_queue_depth++;
 
-	i2o_msg_post(c, m);
+	i2o_msg_post(c, msg);
 
 	return 0;
 
@@ -890,7 +891,7 @@ static int i2o_block_transfer(struct request *req)
 	i2o_cntxt_list_remove(c, req);
 
       nop_msg:
-	i2o_msg_nop(c, m);
+	i2o_msg_nop(c, msg);
 
       exit:
 	return rc;
diff --git a/drivers/message/i2o/i2o_config.c b/drivers/message/i2o/i2o_config.c
index 3c3a7ab..4fe73d6 100644
--- a/drivers/message/i2o/i2o_config.c
+++ b/drivers/message/i2o/i2o_config.c
@@ -230,8 +230,7 @@ static int i2o_cfg_swdl(unsigned long arg)
 	struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg;
 	unsigned char maxfrag = 0, curfrag = 1;
 	struct i2o_dma buffer;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	unsigned int status = 0, swlen = 0, fragsize = 8192;
 	struct i2o_controller *c;
 
@@ -257,31 +256,34 @@ static int i2o_cfg_swdl(unsigned long arg)
 	if (!c)
 		return -ENXIO;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -EBUSY;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
 	if (i2o_dma_alloc(&c->pdev->dev, &buffer, fragsize, GFP_KERNEL)) {
-		i2o_msg_nop(c, m);
+		i2o_msg_nop(c, msg);
 		return -ENOMEM;
 	}
 
 	__copy_from_user(buffer.virt, kxfer.buf, fragsize);
 
-	writel(NINE_WORD_MSG_SIZE | SGL_OFFSET_7, &msg->u.head[0]);
-	writel(I2O_CMD_SW_DOWNLOAD << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
-	writel(i2o_config_driver.context, &msg->u.head[2]);
-	writel(0, &msg->u.head[3]);
-	writel((((u32) kxfer.flags) << 24) | (((u32) kxfer.sw_type) << 16) |
-	       (((u32) maxfrag) << 8) | (((u32) curfrag)), &msg->body[0]);
-	writel(swlen, &msg->body[1]);
-	writel(kxfer.sw_id, &msg->body[2]);
-	writel(0xD0000000 | fragsize, &msg->body[3]);
-	writel(buffer.phys, &msg->body[4]);
+	msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_7);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SW_DOWNLOAD << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
+	msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
+	msg->u.head[3] = cpu_to_le32(0);
+	msg->body[0] =
+	    cpu_to_le32((((u32) kxfer.flags) << 24) | (((u32) kxfer.
+							sw_type) << 16) |
+			(((u32) maxfrag) << 8) | (((u32) curfrag)));
+	msg->body[1] = cpu_to_le32(swlen);
+	msg->body[2] = cpu_to_le32(kxfer.sw_id);
+	msg->body[3] = cpu_to_le32(0xD0000000 | fragsize);
+	msg->body[4] = cpu_to_le32(buffer.phys);
 
 	osm_debug("swdl frag %d/%d (size %d)\n", curfrag, maxfrag, fragsize);
-	status = i2o_msg_post_wait_mem(c, m, 60, &buffer);
+	status = i2o_msg_post_wait_mem(c, msg, 60, &buffer);
 
 	if (status != -ETIMEDOUT)
 		i2o_dma_free(&c->pdev->dev, &buffer);
@@ -302,8 +304,7 @@ static int i2o_cfg_swul(unsigned long arg)
 	struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg;
 	unsigned char maxfrag = 0, curfrag = 1;
 	struct i2o_dma buffer;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	unsigned int status = 0, swlen = 0, fragsize = 8192;
 	struct i2o_controller *c;
 	int ret = 0;
@@ -330,30 +331,30 @@ static int i2o_cfg_swul(unsigned long arg)
 	if (!c)
 		return -ENXIO;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -EBUSY;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
 	if (i2o_dma_alloc(&c->pdev->dev, &buffer, fragsize, GFP_KERNEL)) {
-		i2o_msg_nop(c, m);
+		i2o_msg_nop(c, msg);
 		return -ENOMEM;
 	}
 
-	writel(NINE_WORD_MSG_SIZE | SGL_OFFSET_7, &msg->u.head[0]);
-	writel(I2O_CMD_SW_UPLOAD << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
-	writel(i2o_config_driver.context, &msg->u.head[2]);
-	writel(0, &msg->u.head[3]);
-	writel((u32) kxfer.flags << 24 | (u32) kxfer.
-	       sw_type << 16 | (u32) maxfrag << 8 | (u32) curfrag,
-	       &msg->body[0]);
-	writel(swlen, &msg->body[1]);
-	writel(kxfer.sw_id, &msg->body[2]);
-	writel(0xD0000000 | fragsize, &msg->body[3]);
-	writel(buffer.phys, &msg->body[4]);
+	msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_7);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SW_UPLOAD << 24 | HOST_TID << 12 | ADAPTER_TID);
+	msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
+	msg->u.head[3] = cpu_to_le32(0);
+	msg->body[0] =
+	    cpu_to_le32((u32) kxfer.flags << 24 | (u32) kxfer.
+			sw_type << 16 | (u32) maxfrag << 8 | (u32) curfrag);
+	msg->body[1] = cpu_to_le32(swlen);
+	msg->body[2] = cpu_to_le32(kxfer.sw_id);
+	msg->body[3] = cpu_to_le32(0xD0000000 | fragsize);
+	msg->body[4] = cpu_to_le32(buffer.phys);
 
 	osm_debug("swul frag %d/%d (size %d)\n", curfrag, maxfrag, fragsize);
-	status = i2o_msg_post_wait_mem(c, m, 60, &buffer);
+	status = i2o_msg_post_wait_mem(c, msg, 60, &buffer);
 
 	if (status != I2O_POST_WAIT_OK) {
 		if (status != -ETIMEDOUT)
@@ -380,8 +381,7 @@ static int i2o_cfg_swdel(unsigned long arg)
 	struct i2o_controller *c;
 	struct i2o_sw_xfer kxfer;
 	struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	unsigned int swlen;
 	int token;
 
@@ -395,21 +395,21 @@ static int i2o_cfg_swdel(unsigned long arg)
 	if (!c)
 		return -ENXIO;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -EBUSY;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(SEVEN_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_SW_REMOVE << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
-	writel(i2o_config_driver.context, &msg->u.head[2]);
-	writel(0, &msg->u.head[3]);
-	writel((u32) kxfer.flags << 24 | (u32) kxfer.sw_type << 16,
-	       &msg->body[0]);
-	writel(swlen, &msg->body[1]);
-	writel(kxfer.sw_id, &msg->body[2]);
+	msg->u.head[0] = cpu_to_le32(SEVEN_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SW_REMOVE << 24 | HOST_TID << 12 | ADAPTER_TID);
+	msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
+	msg->u.head[3] = cpu_to_le32(0);
+	msg->body[0] =
+	    cpu_to_le32((u32) kxfer.flags << 24 | (u32) kxfer.sw_type << 16);
+	msg->body[1] = cpu_to_le32(swlen);
+	msg->body[2] = cpu_to_le32(kxfer.sw_id);
 
-	token = i2o_msg_post_wait(c, m, 10);
+	token = i2o_msg_post_wait(c, msg, 10);
 
 	if (token != I2O_POST_WAIT_OK) {
 		osm_info("swdel failed, DetailedStatus = %d\n", token);
@@ -423,25 +423,24 @@ static int i2o_cfg_validate(unsigned long arg)
 {
 	int token;
 	int iop = (int)arg;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	struct i2o_controller *c;
 
 	c = i2o_find_iop(iop);
 	if (!c)
 		return -ENXIO;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -EBUSY;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_CONFIG_VALIDATE << 24 | HOST_TID << 12 | iop,
-	       &msg->u.head[1]);
-	writel(i2o_config_driver.context, &msg->u.head[2]);
-	writel(0, &msg->u.head[3]);
+	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_CONFIG_VALIDATE << 24 | HOST_TID << 12 | iop);
+	msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
+	msg->u.head[3] = cpu_to_le32(0);
 
-	token = i2o_msg_post_wait(c, m, 10);
+	token = i2o_msg_post_wait(c, msg, 10);
 
 	if (token != I2O_POST_WAIT_OK) {
 		osm_info("Can't validate configuration, ErrorStatus = %d\n",
@@ -454,8 +453,7 @@ static int i2o_cfg_validate(unsigned long arg)
 
 static int i2o_cfg_evt_reg(unsigned long arg, struct file *fp)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	struct i2o_evt_id __user *pdesc = (struct i2o_evt_id __user *)arg;
 	struct i2o_evt_id kdesc;
 	struct i2o_controller *c;
@@ -474,18 +472,19 @@ static int i2o_cfg_evt_reg(unsigned long arg, struct file *fp)
 	if (!d)
 		return -ENODEV;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -EBUSY;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_UTIL_EVT_REGISTER << 24 | HOST_TID << 12 | kdesc.tid,
-	       &msg->u.head[1]);
-	writel(i2o_config_driver.context, &msg->u.head[2]);
-	writel(i2o_cntxt_list_add(c, fp->private_data), &msg->u.head[3]);
-	writel(kdesc.evt_mask, &msg->body[0]);
+	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_UTIL_EVT_REGISTER << 24 | HOST_TID << 12 |
+			kdesc.tid);
+	msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
+	msg->u.head[3] = cpu_to_le32(i2o_cntxt_list_add(c, fp->private_data));
+	msg->body[0] = cpu_to_le32(kdesc.evt_mask);
 
-	i2o_msg_post(c, m);
+	i2o_msg_post(c, msg);
 
 	return 0;
 }
@@ -537,7 +536,6 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 	u32 sg_index = 0;
 	i2o_status_block *sb;
 	struct i2o_message *msg;
-	u32 m;
 	unsigned int iop;
 
 	cmd = (struct i2o_cmd_passthru32 __user *)arg;
@@ -553,7 +551,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 		return -ENXIO;
 	}
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
 
 	sb = c->status_block.virt;
 
@@ -595,8 +593,8 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 
 	sg_offset = (msg->u.head[0] >> 4) & 0x0f;
 
-	writel(i2o_config_driver.context, &msg->u.s.icntxt);
-	writel(i2o_cntxt_list_add(c, reply), &msg->u.s.tcntxt);
+	msg->u.s.icntxt = cpu_to_le32(i2o_config_driver.context);
+	msg->u.s.tcntxt = cpu_to_le32(i2o_cntxt_list_add(c, reply));
 
 	memset(sg_list, 0, sizeof(sg_list[0]) * SG_TABLESIZE);
 	if (sg_offset) {
@@ -662,7 +660,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 		}
 	}
 
-	rcode = i2o_msg_post_wait(c, m, 60);
+	rcode = i2o_msg_post_wait(c, msg, 60);
 	if (rcode)
 		goto sg_list_cleanup;
 
@@ -780,8 +778,7 @@ static int i2o_cfg_passthru(unsigned long arg)
 	u32 i = 0;
 	void *p = NULL;
 	i2o_status_block *sb;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	unsigned int iop;
 
 	if (get_user(iop, &cmd->iop) || get_user(user_msg, &cmd->msg))
@@ -793,7 +790,7 @@ static int i2o_cfg_passthru(unsigned long arg)
 		return -ENXIO;
 	}
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
 
 	sb = c->status_block.virt;
 
@@ -830,8 +827,8 @@ static int i2o_cfg_passthru(unsigned long arg)
 
 	sg_offset = (msg->u.head[0] >> 4) & 0x0f;
 
-	writel(i2o_config_driver.context, &msg->u.s.icntxt);
-	writel(i2o_cntxt_list_add(c, reply), &msg->u.s.tcntxt);
+	msg->u.s.icntxt = cpu_to_le32(i2o_config_driver.context);
+	msg->u.s.tcntxt = cpu_to_le32(i2o_cntxt_list_add(c, reply));
 
 	memset(sg_list, 0, sizeof(sg_list[0]) * SG_TABLESIZE);
 	if (sg_offset) {
@@ -894,7 +891,7 @@ static int i2o_cfg_passthru(unsigned long arg)
 		}
 	}
 
-	rcode = i2o_msg_post_wait(c, m, 60);
+	rcode = i2o_msg_post_wait(c, msg, 60);
 	if (rcode)
 		goto sg_list_cleanup;
 
diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c
index 9f1744c..7a784fd 100644
--- a/drivers/message/i2o/i2o_scsi.c
+++ b/drivers/message/i2o/i2o_scsi.c
@@ -510,8 +510,7 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	struct i2o_controller *c;
 	struct i2o_device *i2o_dev;
 	int tid;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	/*
 	 * ENABLE_DISCONNECT
 	 * SIMPLE_TAG
@@ -519,7 +518,7 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	 */
 	u32 scsi_flags = 0x20a00000;
 	u32 sgl_offset;
-	u32 __iomem *mptr;
+	u32 *mptr;
 	u32 cmd = I2O_CMD_SCSI_EXEC << 24;
 	int rc = 0;
 
@@ -576,8 +575,8 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	 *      throw it back to the scsi layer
 	 */
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY) {
+	msg = i2o_msg_get(c);
+	if (IS_ERR(msg)) {
 		rc = SCSI_MLQUEUE_HOST_BUSY;
 		goto exit;
 	}
@@ -617,16 +616,16 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 		if (sgl_offset == SGL_OFFSET_10)
 			sgl_offset = SGL_OFFSET_12;
 		cmd = I2O_CMD_PRIVATE << 24;
-		writel(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC, mptr++);
-		writel(adpt_flags | tid, mptr++);
+		*mptr++ = cpu_to_le32(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC);
+		*mptr++ = cpu_to_le32(adpt_flags | tid);
 	}
 #endif
 
-	writel(cmd | HOST_TID << 12 | tid, &msg->u.head[1]);
-	writel(i2o_scsi_driver.context, &msg->u.s.icntxt);
+	msg->u.head[1] = cpu_to_le32(cmd | HOST_TID << 12 | tid);
+	msg->u.s.icntxt = cpu_to_le32(i2o_scsi_driver.context);
 
 	/* We want the SCSI control block back */
-	writel(i2o_cntxt_list_add(c, SCpnt), &msg->u.s.tcntxt);
+	msg->u.s.tcntxt = cpu_to_le32(i2o_cntxt_list_add(c, SCpnt));
 
 	/* LSI_920_PCI_QUIRK
 	 *
@@ -649,15 +648,15 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	   }
 	 */
 
-	writel(scsi_flags | SCpnt->cmd_len, mptr++);
+	*mptr++ = cpu_to_le32(scsi_flags | SCpnt->cmd_len);
 
 	/* Write SCSI command into the message - always 16 byte block */
-	memcpy_toio(mptr, SCpnt->cmnd, 16);
+	memcpy(mptr, SCpnt->cmnd, 16);
 	mptr += 4;
 
 	if (sgl_offset != SGL_OFFSET_0) {
 		/* write size of data addressed by SGL */
-		writel(SCpnt->request_bufflen, mptr++);
+		*mptr++ = cpu_to_le32(SCpnt->request_bufflen);
 
 		/* Now fill in the SGList and command */
 		if (SCpnt->use_sg) {
@@ -676,11 +675,11 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	}
 
 	/* Stick the headers on */
-	writel(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | sgl_offset,
-	       &msg->u.head[0]);
+	msg->u.head[0] =
+	    cpu_to_le32(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | sgl_offset);
 
 	/* Queue the message */
-	i2o_msg_post(c, m);
+	i2o_msg_post(c, msg);
 
 	osm_debug("Issued %ld\n", SCpnt->serial_number);
 
@@ -688,7 +687,7 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 
       nomem:
 	rc = -ENOMEM;
-	i2o_msg_nop(c, m);
+	i2o_msg_nop(c, msg);
 
       exit:
 	return rc;
@@ -709,8 +708,7 @@ static int i2o_scsi_abort(struct scsi_cmnd *SCpnt)
 {
 	struct i2o_device *i2o_dev;
 	struct i2o_controller *c;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	int tid;
 	int status = FAILED;
 
@@ -720,16 +718,16 @@ static int i2o_scsi_abort(struct scsi_cmnd *SCpnt)
 	c = i2o_dev->iop;
 	tid = i2o_dev->lct_data.tid;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
 		return SCSI_MLQUEUE_HOST_BUSY;
 
-	writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_SCSI_ABORT << 24 | HOST_TID << 12 | tid,
-	       &msg->u.head[1]);
-	writel(i2o_cntxt_list_get_ptr(c, SCpnt), &msg->body[0]);
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SCSI_ABORT << 24 | HOST_TID << 12 | tid);
+	msg->body[0] = cpu_to_le32(i2o_cntxt_list_get_ptr(c, SCpnt));
 
-	if (i2o_msg_post_wait(c, m, I2O_TIMEOUT_SCSI_SCB_ABORT))
+	if (i2o_msg_post_wait(c, msg, I2O_TIMEOUT_SCSI_SCB_ABORT))
 		status = SUCCESS;
 
 	return status;
diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c
index 4eb5325..f86abb4 100644
--- a/drivers/message/i2o/iop.c
+++ b/drivers/message/i2o/iop.c
@@ -47,27 +47,6 @@ static struct i2o_dma i2o_systab;
 static int i2o_hrt_get(struct i2o_controller *c);
 
 /**
- *	i2o_msg_nop - Returns a message which is not used
- *	@c: I2O controller from which the message was created
- *	@m: message which should be returned
- *
- *	If you fetch a message via i2o_msg_get, and can't use it, you must
- *	return the message with this function. Otherwise the message frame
- *	is lost.
- */
-void i2o_msg_nop(struct i2o_controller *c, u32 m)
-{
-	struct i2o_message __iomem *msg = i2o_msg_in_to_virt(c, m);
-
-	writel(THREE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_UTIL_NOP << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
-	writel(0, &msg->u.head[2]);
-	writel(0, &msg->u.head[3]);
-	i2o_msg_post(c, m);
-};
-
-/**
  *	i2o_msg_get_wait - obtain an I2O message from the IOP
  *	@c: I2O controller
  *	@msg: pointer to a I2O message pointer
@@ -81,22 +60,21 @@ void i2o_msg_nop(struct i2o_controller *c, u32 m)
  *	address from the read port (see the i2o spec). If no message is
  *	available returns I2O_QUEUE_EMPTY and msg is leaved untouched.
  */
-u32 i2o_msg_get_wait(struct i2o_controller *c,
-		     struct i2o_message __iomem ** msg, int wait)
+struct i2o_message *i2o_msg_get_wait(struct i2o_controller *c, int wait)
 {
 	unsigned long timeout = jiffies + wait * HZ;
-	u32 m;
+	struct i2o_message *msg;
 
-	while ((m = i2o_msg_get(c, msg)) == I2O_QUEUE_EMPTY) {
+	while (IS_ERR(msg = i2o_msg_get(c))) {
 		if (time_after(jiffies, timeout)) {
 			osm_debug("%s: Timeout waiting for message frame.\n",
 				  c->name);
-			return I2O_QUEUE_EMPTY;
+			return ERR_PTR(-ETIMEDOUT);
 		}
 		schedule_timeout_uninterruptible(1);
 	}
 
-	return m;
+	return msg;
 };
 
 #if BITS_PER_LONG == 64
@@ -301,8 +279,7 @@ struct i2o_device *i2o_iop_find_device(struct i2o_controller *c, u16 tid)
  */
 static int i2o_iop_quiesce(struct i2o_controller *c)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	i2o_status_block *sb = c->status_block.virt;
 	int rc;
 
@@ -313,16 +290,17 @@ static int i2o_iop_quiesce(struct i2o_controller *c)
 	    (sb->iop_state != ADAPTER_STATE_OPERATIONAL))
 		return 0;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_SYS_QUIESCE << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
+	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SYS_QUIESCE << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
 
 	/* Long timeout needed for quiesce if lots of devices */
-	if ((rc = i2o_msg_post_wait(c, m, 240)))
+	if ((rc = i2o_msg_post_wait(c, msg, 240)))
 		osm_info("%s: Unable to quiesce (status=%#x).\n", c->name, -rc);
 	else
 		osm_debug("%s: Quiesced.\n", c->name);
@@ -342,8 +320,7 @@ static int i2o_iop_quiesce(struct i2o_controller *c)
  */
 static int i2o_iop_enable(struct i2o_controller *c)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	i2o_status_block *sb = c->status_block.virt;
 	int rc;
 
@@ -353,16 +330,17 @@ static int i2o_iop_enable(struct i2o_controller *c)
 	if (sb->iop_state != ADAPTER_STATE_READY)
 		return -EINVAL;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_SYS_ENABLE << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
+	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SYS_ENABLE << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
 
 	/* How long of a timeout do we need? */
-	if ((rc = i2o_msg_post_wait(c, m, 240)))
+	if ((rc = i2o_msg_post_wait(c, msg, 240)))
 		osm_err("%s: Could not enable (status=%#x).\n", c->name, -rc);
 	else
 		osm_debug("%s: Enabled.\n", c->name);
@@ -413,22 +391,22 @@ static inline void i2o_iop_enable_all(void)
  */
 static int i2o_iop_clear(struct i2o_controller *c)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	int rc;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
 	/* Quiesce all IOPs first */
 	i2o_iop_quiesce_all();
 
-	writel(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_ADAPTER_CLEAR << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
+	msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_ADAPTER_CLEAR << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
 
-	if ((rc = i2o_msg_post_wait(c, m, 30)))
+	if ((rc = i2o_msg_post_wait(c, msg, 30)))
 		osm_info("%s: Unable to clear (status=%#x).\n", c->name, -rc);
 	else
 		osm_debug("%s: Cleared.\n", c->name);
@@ -446,13 +424,13 @@ static int i2o_iop_clear(struct i2o_controller *c)
  *	Clear and (re)initialize IOP's outbound queue and post the message
  *	frames to the IOP.
  *
- *	Returns 0 on success or a negative errno code on failure.
+ *	Returns 0 on success or negative error code on failure.
  */
 static int i2o_iop_init_outbound_queue(struct i2o_controller *c)
 {
-	volatile u8 *status = c->status.virt;
 	u32 m;
-	struct i2o_message __iomem *msg;
+	volatile u8 *status = c->status.virt;
+	struct i2o_message *msg;
 	ulong timeout;
 	int i;
 
@@ -460,23 +438,24 @@ static int i2o_iop_init_outbound_queue(struct i2o_controller *c)
 
 	memset(c->status.virt, 0, 4);
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
-
-	writel(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6, &msg->u.head[0]);
-	writel(I2O_CMD_OUTBOUND_INIT << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
-	writel(i2o_exec_driver.context, &msg->u.s.icntxt);
-	writel(0x00000000, &msg->u.s.tcntxt);
-	writel(PAGE_SIZE, &msg->body[0]);
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_OUTBOUND_INIT << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
+	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
+	msg->u.s.tcntxt = cpu_to_le32(0x00000000);
+	msg->body[0] = cpu_to_le32(PAGE_SIZE);
 	/* Outbound msg frame size in words and Initcode */
-	writel(I2O_OUTBOUND_MSG_FRAME_SIZE << 16 | 0x80, &msg->body[1]);
-	writel(0xd0000004, &msg->body[2]);
-	writel(i2o_dma_low(c->status.phys), &msg->body[3]);
-	writel(i2o_dma_high(c->status.phys), &msg->body[4]);
+	msg->body[1] = cpu_to_le32(I2O_OUTBOUND_MSG_FRAME_SIZE << 16 | 0x80);
+	msg->body[2] = cpu_to_le32(0xd0000004);
+	msg->body[3] = cpu_to_le32(i2o_dma_low(c->status.phys));
+	msg->body[4] = cpu_to_le32(i2o_dma_high(c->status.phys));
 
-	i2o_msg_post(c, m);
+	i2o_msg_post(c, msg);
 
 	timeout = jiffies + I2O_TIMEOUT_INIT_OUTBOUND_QUEUE * HZ;
 	while (*status <= I2O_CMD_IN_PROGRESS) {
@@ -511,34 +490,34 @@ static int i2o_iop_init_outbound_queue(struct i2o_controller *c)
 static int i2o_iop_reset(struct i2o_controller *c)
 {
 	volatile u8 *status = c->status.virt;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	unsigned long timeout;
 	i2o_status_block *sb = c->status_block.virt;
 	int rc = 0;
 
 	osm_debug("%s: Resetting controller\n", c->name);
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
 	memset(c->status_block.virt, 0, 8);
 
 	/* Quiesce all IOPs first */
 	i2o_iop_quiesce_all();
 
-	writel(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_ADAPTER_RESET << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
-	writel(i2o_exec_driver.context, &msg->u.s.icntxt);
-	writel(0, &msg->u.s.tcntxt);	//FIXME: use reasonable transaction context
-	writel(0, &msg->body[0]);
-	writel(0, &msg->body[1]);
-	writel(i2o_dma_low(c->status.phys), &msg->body[2]);
-	writel(i2o_dma_high(c->status.phys), &msg->body[3]);
+	msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_ADAPTER_RESET << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
+	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
+	msg->u.s.tcntxt = cpu_to_le32(0x00000000);
+	msg->body[0] = cpu_to_le32(0x00000000);
+	msg->body[1] = cpu_to_le32(0x00000000);
+	msg->body[2] = cpu_to_le32(i2o_dma_low(c->status.phys));
+	msg->body[3] = cpu_to_le32(i2o_dma_high(c->status.phys));
 
-	i2o_msg_post(c, m);
+	i2o_msg_post(c, msg);
 
 	/* Wait for a reply */
 	timeout = jiffies + I2O_TIMEOUT_RESET * HZ;
@@ -567,18 +546,15 @@ static int i2o_iop_reset(struct i2o_controller *c)
 		osm_debug("%s: Reset in progress, waiting for reboot...\n",
 			  c->name);
 
-		m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_RESET);
-		while (m == I2O_QUEUE_EMPTY) {
+		while (IS_ERR(msg = i2o_msg_get_wait(c, I2O_TIMEOUT_RESET))) {
 			if (time_after(jiffies, timeout)) {
 				osm_err("%s: IOP reset timeout.\n", c->name);
-				rc = -ETIMEDOUT;
+				rc = PTR_ERR(msg);
 				goto exit;
 			}
 			schedule_timeout_uninterruptible(1);
-
-			m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_RESET);
 		}
-		i2o_msg_nop(c, m);
+		i2o_msg_nop(c, msg);
 
 		/* from here all quiesce commands are safe */
 		c->no_quiesce = 0;
@@ -686,8 +662,7 @@ static int i2o_iop_activate(struct i2o_controller *c)
  */
 static int i2o_iop_systab_set(struct i2o_controller *c)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	i2o_status_block *sb = c->status_block.virt;
 	struct device *dev = &c->pdev->dev;
 	struct resource *root;
@@ -735,20 +710,21 @@ static int i2o_iop_systab_set(struct i2o_controller *c)
 		}
 	}
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
 	i2o_systab.phys = dma_map_single(dev, i2o_systab.virt, i2o_systab.len,
 					 PCI_DMA_TODEVICE);
 	if (!i2o_systab.phys) {
-		i2o_msg_nop(c, m);
+		i2o_msg_nop(c, msg);
 		return -ENOMEM;
 	}
 
-	writel(I2O_MESSAGE_SIZE(12) | SGL_OFFSET_6, &msg->u.head[0]);
-	writel(I2O_CMD_SYS_TAB_SET << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
+	msg->u.head[0] = cpu_to_le32(I2O_MESSAGE_SIZE(12) | SGL_OFFSET_6);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_SYS_TAB_SET << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
 
 	/*
 	 * Provide three SGL-elements:
@@ -760,16 +736,16 @@ static int i2o_iop_systab_set(struct i2o_controller *c)
 	 * same table to everyone. We have to go remap it for them all
 	 */
 
-	writel(c->unit + 2, &msg->body[0]);
-	writel(0, &msg->body[1]);
-	writel(0x54000000 | i2o_systab.len, &msg->body[2]);
-	writel(i2o_systab.phys, &msg->body[3]);
-	writel(0x54000000 | sb->current_mem_size, &msg->body[4]);
-	writel(sb->current_mem_base, &msg->body[5]);
-	writel(0xd4000000 | sb->current_io_size, &msg->body[6]);
-	writel(sb->current_io_base, &msg->body[6]);
+	msg->body[0] = cpu_to_le32(c->unit + 2);
+	msg->body[1] = cpu_to_le32(0x00000000);
+	msg->body[2] = cpu_to_le32(0x54000000 | i2o_systab.len);
+	msg->body[3] = cpu_to_le32(i2o_systab.phys);
+	msg->body[4] = cpu_to_le32(0x54000000 | sb->current_mem_size);
+	msg->body[5] = cpu_to_le32(sb->current_mem_base);
+	msg->body[6] = cpu_to_le32(0xd4000000 | sb->current_io_size);
+	msg->body[6] = cpu_to_le32(sb->current_io_base);
 
-	rc = i2o_msg_post_wait(c, m, 120);
+	rc = i2o_msg_post_wait(c, msg, 120);
 
 	dma_unmap_single(dev, i2o_systab.phys, i2o_systab.len,
 			 PCI_DMA_TODEVICE);
@@ -952,30 +928,30 @@ static int i2o_parse_hrt(struct i2o_controller *c)
  */
 int i2o_status_get(struct i2o_controller *c)
 {
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 	volatile u8 *status_block;
 	unsigned long timeout;
 
 	status_block = (u8 *) c->status_block.virt;
 	memset(c->status_block.virt, 0, sizeof(i2o_status_block));
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(NINE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_STATUS_GET << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
-	writel(i2o_exec_driver.context, &msg->u.s.icntxt);
-	writel(0, &msg->u.s.tcntxt);	// FIXME: use resonable transaction context
-	writel(0, &msg->body[0]);
-	writel(0, &msg->body[1]);
-	writel(i2o_dma_low(c->status_block.phys), &msg->body[2]);
-	writel(i2o_dma_high(c->status_block.phys), &msg->body[3]);
-	writel(sizeof(i2o_status_block), &msg->body[4]);	/* always 88 bytes */
+	msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_STATUS_GET << 24 | HOST_TID << 12 |
+			ADAPTER_TID);
+	msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
+	msg->u.s.tcntxt = cpu_to_le32(0x00000000);
+	msg->body[0] = cpu_to_le32(0x00000000);
+	msg->body[1] = cpu_to_le32(0x00000000);
+	msg->body[2] = cpu_to_le32(i2o_dma_low(c->status_block.phys));
+	msg->body[3] = cpu_to_le32(i2o_dma_high(c->status_block.phys));
+	msg->body[4] = cpu_to_le32(sizeof(i2o_status_block));	/* always 88 bytes */
 
-	i2o_msg_post(c, m);
+	i2o_msg_post(c, msg);
 
 	/* Wait for a reply */
 	timeout = jiffies + I2O_TIMEOUT_STATUS_GET * HZ;
@@ -1013,20 +989,20 @@ static int i2o_hrt_get(struct i2o_controller *c)
 	struct device *dev = &c->pdev->dev;
 
 	for (i = 0; i < I2O_HRT_GET_TRIES; i++) {
-		struct i2o_message __iomem *msg;
-		u32 m;
+		struct i2o_message *msg;
 
-		m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-		if (m == I2O_QUEUE_EMPTY)
-			return -ETIMEDOUT;
+		msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+		if (IS_ERR(msg))
+			return PTR_ERR(msg);
 
-		writel(SIX_WORD_MSG_SIZE | SGL_OFFSET_4, &msg->u.head[0]);
-		writel(I2O_CMD_HRT_GET << 24 | HOST_TID << 12 | ADAPTER_TID,
-		       &msg->u.head[1]);
-		writel(0xd0000000 | c->hrt.len, &msg->body[0]);
-		writel(c->hrt.phys, &msg->body[1]);
+		msg->u.head[0] = cpu_to_le32(SIX_WORD_MSG_SIZE | SGL_OFFSET_4);
+		msg->u.head[1] =
+		    cpu_to_le32(I2O_CMD_HRT_GET << 24 | HOST_TID << 12 |
+				ADAPTER_TID);
+		msg->body[0] = cpu_to_le32(0xd0000000 | c->hrt.len);
+		msg->body[1] = cpu_to_le32(c->hrt.phys);
 
-		rc = i2o_msg_post_wait_mem(c, m, 20, &c->hrt);
+		rc = i2o_msg_post_wait_mem(c, msg, 20, &c->hrt);
 
 		if (rc < 0) {
 			osm_err("%s: Unable to get HRT (status=%#x)\n", c->name,
@@ -1056,6 +1032,7 @@ static int i2o_hrt_get(struct i2o_controller *c)
  */
 void i2o_iop_free(struct i2o_controller *c)
 {
+	i2o_pool_free(&c->in_msg);
 	kfree(c);
 };
 
@@ -1080,7 +1057,7 @@ static struct class *i2o_controller_class;
  *	i2o_iop_alloc - Allocate and initialize a i2o_controller struct
  *
  *	Allocate the necessary memory for a i2o_controller struct and
- *	initialize the lists.
+ *	initialize the lists and message mempool.
  *
  *	Returns a pointer to the I2O controller or a negative error code on
  *	failure.
@@ -1089,6 +1066,7 @@ struct i2o_controller *i2o_iop_alloc(void)
 {
 	static int unit = 0;	/* 0 and 1 are NULL IOP and Local Host */
 	struct i2o_controller *c;
+	char poolname[32];
 
 	c = kmalloc(sizeof(*c), GFP_KERNEL);
 	if (!c) {
@@ -1098,11 +1076,20 @@ struct i2o_controller *i2o_iop_alloc(void)
 	}
 	memset(c, 0, sizeof(*c));
 
+	c->unit = unit++;
+	sprintf(c->name, "iop%d", c->unit);
+
+	snprintf(poolname, sizeof(poolname), "i2o_%s_msg_inpool", c->name);
+	if (i2o_pool_alloc
+	    (&c->in_msg, poolname, I2O_INBOUND_MSG_FRAME_SIZE * 4,
+	     I2O_MSG_INPOOL_MIN)) {
+		kfree(c);
+		return ERR_PTR(-ENOMEM);
+	};
+
 	INIT_LIST_HEAD(&c->devices);
 	spin_lock_init(&c->lock);
 	init_MUTEX(&c->lct_lock);
-	c->unit = unit++;
-	sprintf(c->name, "iop%d", c->unit);
 
 	device_initialize(&c->device);
 
@@ -1199,28 +1186,27 @@ int i2o_iop_add(struct i2o_controller *c)
  *	is waited for, or expected. If you do not want further notifications,
  *	call the i2o_event_register again with a evt_mask of 0.
  *
- *	Returns 0 on success or -ETIMEDOUT if no message could be fetched for
- *	sending the request.
+ *	Returns 0 on success or negative error code on failure.
  */
 int i2o_event_register(struct i2o_device *dev, struct i2o_driver *drv,
 		       int tcntxt, u32 evt_mask)
 {
 	struct i2o_controller *c = dev->iop;
-	struct i2o_message __iomem *msg;
-	u32 m;
+	struct i2o_message *msg;
 
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
+	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 
-	writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
-	writel(I2O_CMD_UTIL_EVT_REGISTER << 24 | HOST_TID << 12 | dev->lct_data.
-	       tid, &msg->u.head[1]);
-	writel(drv->context, &msg->u.s.icntxt);
-	writel(tcntxt, &msg->u.s.tcntxt);
-	writel(evt_mask, &msg->body[0]);
+	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
+	msg->u.head[1] =
+	    cpu_to_le32(I2O_CMD_UTIL_EVT_REGISTER << 24 | HOST_TID << 12 | dev->
+			lct_data.tid);
+	msg->u.s.icntxt = cpu_to_le32(drv->context);
+	msg->u.s.tcntxt = cpu_to_le32(tcntxt);
+	msg->body[0] = cpu_to_le32(evt_mask);
 
-	i2o_msg_post(c, m);
+	i2o_msg_post(c, msg);
 
 	return 0;
 };
diff --git a/drivers/message/i2o/pci.c b/drivers/message/i2o/pci.c
index ee7075f..329d482 100644
--- a/drivers/message/i2o/pci.c
+++ b/drivers/message/i2o/pci.c
@@ -483,4 +483,5 @@ void __exit i2o_pci_exit(void)
 {
 	pci_unregister_driver(&i2o_pci_driver);
 };
+
 MODULE_DEVICE_TABLE(pci, i2o_pci_ids);
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index d79c8a4..9e359a9 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -30,6 +30,7 @@
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>	/* work_struct */
+#include <linux/mempool.h>
 
 #include <asm/io.h>
 #include <asm/semaphore.h>	/* Needed for MUTEX init macros */
@@ -38,1091 +39,1219 @@
 #define I2O_QUEUE_EMPTY		0xffffffff
 
 /*
- *	Message structures
+ *	Cache strategies
  */
-struct i2o_message {
-	union {
-		struct {
-			u8 version_offset;
-			u8 flags;
-			u16 size;
-			u32 target_tid:12;
-			u32 init_tid:12;
-			u32 function:8;
-			u32 icntxt;	/* initiator context */
-			u32 tcntxt;	/* transaction context */
-		} s;
-		u32 head[4];
-	} u;
-	/* List follows */
-	u32 body[0];
-};
 
-/*
- *	Each I2O device entity has one of these. There is one per device.
+/*	The NULL strategy leaves everything up to the controller. This tends to be a
+ *	pessimal but functional choice.
  */
-struct i2o_device {
-	i2o_lct_entry lct_data;	/* Device LCT information */
-
-	struct i2o_controller *iop;	/* Controlling IOP */
-	struct list_head list;	/* node in IOP devices list */
-
-	struct device device;
-
-	struct semaphore lock;	/* device lock */
-};
+#define CACHE_NULL		0
+/*	Prefetch data when reading. We continually attempt to load the next 32 sectors
+ *	into the controller cache.
+ */
+#define CACHE_PREFETCH		1
+/*	Prefetch data when reading. We sometimes attempt to load the next 32 sectors
+ *	into the controller cache. When an I/O is <= 8K we assume it's probably
+ *	not sequential and don't prefetch (default)
+ */
+#define CACHE_SMARTFETCH	2
+/*	Data is written to the cache and then out on to the disk. The I/O must be
+ *	physically on the medium before the write is acknowledged (default without
+ *	NVRAM)
+ */
+#define CACHE_WRITETHROUGH	17
+/*	Data is written to the cache and then out on to the disk. The controller
+ *	is permitted to write back the cache any way it wants. (default if battery
+ *	backed NVRAM is present). It can be useful to set this for swap regardless of
+ *	battery state.
+ */
+#define CACHE_WRITEBACK		18
+/*	Optimise for under powered controllers, especially on RAID1 and RAID0. We
+ *	write large I/O's directly to disk bypassing the cache to avoid the extra
+ *	memory copy hits. Small writes are writeback cached
+ */
+#define CACHE_SMARTBACK		19
+/*	Optimise for under powered controllers, especially on RAID1 and RAID0. We
+ *	write large I/O's directly to disk bypassing the cache to avoid the extra
+ *	memory copy hits. Small writes are writethrough cached. Suitable for devices
+ *	lacking battery backup
+ */
+#define CACHE_SMARTTHROUGH	20
 
 /*
- *	Event structure provided to the event handling function
+ *	Ioctl structures
  */
-struct i2o_event {
-	struct work_struct work;
-	struct i2o_device *i2o_dev;	/* I2O device pointer from which the
-					   event reply was initiated */
-	u16 size;		/* Size of data in 32-bit words */
-	u32 tcntxt;		/* Transaction context used at
-				   registration */
-	u32 event_indicator;	/* Event indicator from reply */
-	u32 data[0];		/* Event data from reply */
-};
+
+#define 	BLKI2OGRSTRAT	_IOR('2', 1, int)
+#define 	BLKI2OGWSTRAT	_IOR('2', 2, int)
+#define 	BLKI2OSRSTRAT	_IOW('2', 3, int)
+#define 	BLKI2OSWSTRAT	_IOW('2', 4, int)
 
 /*
- *	I2O classes which could be handled by the OSM
+ *	I2O Function codes
  */
-struct i2o_class_id {
-	u16 class_id:12;
-};
 
 /*
- *	I2O driver structure for OSMs
+ *	Executive Class
  */
-struct i2o_driver {
-	char *name;		/* OSM name */
-	int context;		/* Low 8 bits of the transaction info */
-	struct i2o_class_id *classes;	/* I2O classes that this OSM handles */
-
-	/* Message reply handler */
-	int (*reply) (struct i2o_controller *, u32, struct i2o_message *);
-
-	/* Event handler */
-	void (*event) (struct i2o_event *);
-
-	struct workqueue_struct *event_queue;	/* Event queue */
-
-	struct device_driver driver;
-
-	/* notification of changes */
-	void (*notify_controller_add) (struct i2o_controller *);
-	void (*notify_controller_remove) (struct i2o_controller *);
-	void (*notify_device_add) (struct i2o_device *);
-	void (*notify_device_remove) (struct i2o_device *);
-
-	struct semaphore lock;
-};
+#define	I2O_CMD_ADAPTER_ASSIGN		0xB3
+#define	I2O_CMD_ADAPTER_READ		0xB2
+#define	I2O_CMD_ADAPTER_RELEASE		0xB5
+#define	I2O_CMD_BIOS_INFO_SET		0xA5
+#define	I2O_CMD_BOOT_DEVICE_SET		0xA7
+#define	I2O_CMD_CONFIG_VALIDATE		0xBB
+#define	I2O_CMD_CONN_SETUP		0xCA
+#define	I2O_CMD_DDM_DESTROY		0xB1
+#define	I2O_CMD_DDM_ENABLE		0xD5
+#define	I2O_CMD_DDM_QUIESCE		0xC7
+#define	I2O_CMD_DDM_RESET		0xD9
+#define	I2O_CMD_DDM_SUSPEND		0xAF
+#define	I2O_CMD_DEVICE_ASSIGN		0xB7
+#define	I2O_CMD_DEVICE_RELEASE		0xB9
+#define	I2O_CMD_HRT_GET			0xA8
+#define	I2O_CMD_ADAPTER_CLEAR		0xBE
+#define	I2O_CMD_ADAPTER_CONNECT		0xC9
+#define	I2O_CMD_ADAPTER_RESET		0xBD
+#define	I2O_CMD_LCT_NOTIFY		0xA2
+#define	I2O_CMD_OUTBOUND_INIT		0xA1
+#define	I2O_CMD_PATH_ENABLE		0xD3
+#define	I2O_CMD_PATH_QUIESCE		0xC5
+#define	I2O_CMD_PATH_RESET		0xD7
+#define	I2O_CMD_STATIC_MF_CREATE	0xDD
+#define	I2O_CMD_STATIC_MF_RELEASE	0xDF
+#define	I2O_CMD_STATUS_GET		0xA0
+#define	I2O_CMD_SW_DOWNLOAD		0xA9
+#define	I2O_CMD_SW_UPLOAD		0xAB
+#define	I2O_CMD_SW_REMOVE		0xAD
+#define	I2O_CMD_SYS_ENABLE		0xD1
+#define	I2O_CMD_SYS_MODIFY		0xC1
+#define	I2O_CMD_SYS_QUIESCE		0xC3
+#define	I2O_CMD_SYS_TAB_SET		0xA3
 
 /*
- *	Contains DMA mapped address information
+ * Utility Class
  */
-struct i2o_dma {
-	void *virt;
-	dma_addr_t phys;
-	size_t len;
-};
+#define I2O_CMD_UTIL_NOP		0x00
+#define I2O_CMD_UTIL_ABORT		0x01
+#define I2O_CMD_UTIL_CLAIM		0x09
+#define I2O_CMD_UTIL_RELEASE		0x0B
+#define I2O_CMD_UTIL_PARAMS_GET		0x06
+#define I2O_CMD_UTIL_PARAMS_SET		0x05
+#define I2O_CMD_UTIL_EVT_REGISTER	0x13
+#define I2O_CMD_UTIL_EVT_ACK		0x14
+#define I2O_CMD_UTIL_CONFIG_DIALOG	0x10
+#define I2O_CMD_UTIL_DEVICE_RESERVE	0x0D
+#define I2O_CMD_UTIL_DEVICE_RELEASE	0x0F
+#define I2O_CMD_UTIL_LOCK		0x17
+#define I2O_CMD_UTIL_LOCK_RELEASE	0x19
+#define I2O_CMD_UTIL_REPLY_FAULT_NOTIFY	0x15
 
 /*
- *	Contains IO mapped address information
+ * SCSI Host Bus Adapter Class
  */
-struct i2o_io {
-	void __iomem *virt;
-	unsigned long phys;
-	unsigned long len;
-};
+#define I2O_CMD_SCSI_EXEC		0x81
+#define I2O_CMD_SCSI_ABORT		0x83
+#define I2O_CMD_SCSI_BUSRESET		0x27
 
 /*
- *	Context queue entry, used for 32-bit context on 64-bit systems
+ * Bus Adapter Class
  */
-struct i2o_context_list_element {
-	struct list_head list;
-	u32 context;
-	void *ptr;
-	unsigned long timestamp;
-};
+#define I2O_CMD_BUS_ADAPTER_RESET	0x85
+#define I2O_CMD_BUS_RESET		0x87
+#define I2O_CMD_BUS_SCAN		0x89
+#define I2O_CMD_BUS_QUIESCE		0x8b
 
 /*
- * Each I2O controller has one of these objects
+ * Random Block Storage Class
  */
-struct i2o_controller {
-	char name[16];
-	int unit;
-	int type;
+#define I2O_CMD_BLOCK_READ		0x30
+#define I2O_CMD_BLOCK_WRITE		0x31
+#define I2O_CMD_BLOCK_CFLUSH		0x37
+#define I2O_CMD_BLOCK_MLOCK		0x49
+#define I2O_CMD_BLOCK_MUNLOCK		0x4B
+#define I2O_CMD_BLOCK_MMOUNT		0x41
+#define I2O_CMD_BLOCK_MEJECT		0x43
+#define I2O_CMD_BLOCK_POWER		0x70
 
-	struct pci_dev *pdev;	/* PCI device */
+#define I2O_CMD_PRIVATE			0xFF
 
-	unsigned int promise:1;	/* Promise controller */
-	unsigned int adaptec:1;	/* DPT / Adaptec controller */
-	unsigned int raptor:1;	/* split bar */
-	unsigned int no_quiesce:1;	/* dont quiesce before reset */
-	unsigned int short_req:1;	/* use small block sizes */
-	unsigned int limit_sectors:1;	/* limit number of sectors / request */
-	unsigned int pae_support:1;	/* controller has 64-bit SGL support */
+/* Command status values  */
 
-	struct list_head devices;	/* list of I2O devices */
-	struct list_head list;	/* Controller list */
+#define I2O_CMD_IN_PROGRESS	0x01
+#define I2O_CMD_REJECTED	0x02
+#define I2O_CMD_FAILED		0x03
+#define I2O_CMD_COMPLETED	0x04
 
-	void __iomem *in_port;	/* Inbout port address */
-	void __iomem *out_port;	/* Outbound port address */
-	void __iomem *irq_status;	/* Interrupt status register address */
-	void __iomem *irq_mask;	/* Interrupt mask register address */
+/* I2O API function return values */
 
-	/* Dynamic LCT related data */
+#define I2O_RTN_NO_ERROR			0
+#define I2O_RTN_NOT_INIT			1
+#define I2O_RTN_FREE_Q_EMPTY			2
+#define I2O_RTN_TCB_ERROR			3
+#define I2O_RTN_TRANSACTION_ERROR		4
+#define I2O_RTN_ADAPTER_ALREADY_INIT		5
+#define I2O_RTN_MALLOC_ERROR			6
+#define I2O_RTN_ADPTR_NOT_REGISTERED		7
+#define I2O_RTN_MSG_REPLY_TIMEOUT		8
+#define I2O_RTN_NO_STATUS			9
+#define I2O_RTN_NO_FIRM_VER			10
+#define	I2O_RTN_NO_LINK_SPEED			11
 
-	struct i2o_dma status;	/* IOP status block */
+/* Reply message status defines for all messages */
 
-	struct i2o_dma hrt;	/* HW Resource Table */
-	i2o_lct *lct;		/* Logical Config Table */
-	struct i2o_dma dlct;	/* Temp LCT */
-	struct semaphore lct_lock;	/* Lock for LCT updates */
-	struct i2o_dma status_block;	/* IOP status block */
+#define I2O_REPLY_STATUS_SUCCESS                    	0x00
+#define I2O_REPLY_STATUS_ABORT_DIRTY                	0x01
+#define I2O_REPLY_STATUS_ABORT_NO_DATA_TRANSFER     	0x02
+#define	I2O_REPLY_STATUS_ABORT_PARTIAL_TRANSFER		0x03
+#define	I2O_REPLY_STATUS_ERROR_DIRTY			0x04
+#define	I2O_REPLY_STATUS_ERROR_NO_DATA_TRANSFER		0x05
+#define	I2O_REPLY_STATUS_ERROR_PARTIAL_TRANSFER		0x06
+#define	I2O_REPLY_STATUS_PROCESS_ABORT_DIRTY		0x08
+#define	I2O_REPLY_STATUS_PROCESS_ABORT_NO_DATA_TRANSFER	0x09
+#define	I2O_REPLY_STATUS_PROCESS_ABORT_PARTIAL_TRANSFER	0x0A
+#define	I2O_REPLY_STATUS_TRANSACTION_ERROR		0x0B
+#define	I2O_REPLY_STATUS_PROGRESS_REPORT		0x80
 
-	struct i2o_io base;	/* controller messaging unit */
-	struct i2o_io in_queue;	/* inbound message queue Host->IOP */
-	struct i2o_dma out_queue;	/* outbound message queue IOP->Host */
+/* Status codes and Error Information for Parameter functions */
 
-	unsigned int battery:1;	/* Has a battery backup */
-	unsigned int io_alloc:1;	/* An I/O resource was allocated */
-	unsigned int mem_alloc:1;	/* A memory resource was allocated */
+#define I2O_PARAMS_STATUS_SUCCESS		0x00
+#define I2O_PARAMS_STATUS_BAD_KEY_ABORT		0x01
+#define I2O_PARAMS_STATUS_BAD_KEY_CONTINUE   	0x02
+#define I2O_PARAMS_STATUS_BUFFER_FULL		0x03
+#define I2O_PARAMS_STATUS_BUFFER_TOO_SMALL	0x04
+#define I2O_PARAMS_STATUS_FIELD_UNREADABLE	0x05
+#define I2O_PARAMS_STATUS_FIELD_UNWRITEABLE	0x06
+#define I2O_PARAMS_STATUS_INSUFFICIENT_FIELDS	0x07
+#define I2O_PARAMS_STATUS_INVALID_GROUP_ID	0x08
+#define I2O_PARAMS_STATUS_INVALID_OPERATION	0x09
+#define I2O_PARAMS_STATUS_NO_KEY_FIELD		0x0A
+#define I2O_PARAMS_STATUS_NO_SUCH_FIELD		0x0B
+#define I2O_PARAMS_STATUS_NON_DYNAMIC_GROUP	0x0C
+#define I2O_PARAMS_STATUS_OPERATION_ERROR	0x0D
+#define I2O_PARAMS_STATUS_SCALAR_ERROR		0x0E
+#define I2O_PARAMS_STATUS_TABLE_ERROR		0x0F
+#define I2O_PARAMS_STATUS_WRONG_GROUP_TYPE	0x10
 
-	struct resource io_resource;	/* I/O resource allocated to the IOP */
-	struct resource mem_resource;	/* Mem resource allocated to the IOP */
+/* DetailedStatusCode defines for Executive, DDM, Util and Transaction error
+ * messages: Table 3-2 Detailed Status Codes.*/
 
-	struct device device;
-	struct class_device *classdev;	/* I2O controller class device */
-	struct i2o_device *exec;	/* Executive */
-#if BITS_PER_LONG == 64
-	spinlock_t context_list_lock;	/* lock for context_list */
-	atomic_t context_list_counter;	/* needed for unique contexts */
-	struct list_head context_list;	/* list of context id's
-					   and pointers */
-#endif
-	spinlock_t lock;	/* lock for controller
-				   configuration */
+#define I2O_DSC_SUCCESS                        0x0000
+#define I2O_DSC_BAD_KEY                        0x0002
+#define I2O_DSC_TCL_ERROR                      0x0003
+#define I2O_DSC_REPLY_BUFFER_FULL              0x0004
+#define I2O_DSC_NO_SUCH_PAGE                   0x0005
+#define I2O_DSC_INSUFFICIENT_RESOURCE_SOFT     0x0006
+#define I2O_DSC_INSUFFICIENT_RESOURCE_HARD     0x0007
+#define I2O_DSC_CHAIN_BUFFER_TOO_LARGE         0x0009
+#define I2O_DSC_UNSUPPORTED_FUNCTION           0x000A
+#define I2O_DSC_DEVICE_LOCKED                  0x000B
+#define I2O_DSC_DEVICE_RESET                   0x000C
+#define I2O_DSC_INAPPROPRIATE_FUNCTION         0x000D
+#define I2O_DSC_INVALID_INITIATOR_ADDRESS      0x000E
+#define I2O_DSC_INVALID_MESSAGE_FLAGS          0x000F
+#define I2O_DSC_INVALID_OFFSET                 0x0010
+#define I2O_DSC_INVALID_PARAMETER              0x0011
+#define I2O_DSC_INVALID_REQUEST                0x0012
+#define I2O_DSC_INVALID_TARGET_ADDRESS         0x0013
+#define I2O_DSC_MESSAGE_TOO_LARGE              0x0014
+#define I2O_DSC_MESSAGE_TOO_SMALL              0x0015
+#define I2O_DSC_MISSING_PARAMETER              0x0016
+#define I2O_DSC_TIMEOUT                        0x0017
+#define I2O_DSC_UNKNOWN_ERROR                  0x0018
+#define I2O_DSC_UNKNOWN_FUNCTION               0x0019
+#define I2O_DSC_UNSUPPORTED_VERSION            0x001A
+#define I2O_DSC_DEVICE_BUSY                    0x001B
+#define I2O_DSC_DEVICE_NOT_AVAILABLE           0x001C
 
-	void *driver_data[I2O_MAX_DRIVERS];	/* storage for drivers */
-};
+/* DetailedStatusCode defines for Block Storage Operation: Table 6-7 Detailed
+   Status Codes.*/
 
-/*
- * I2O System table entry
- *
- * The system table contains information about all the IOPs in the
- * system.  It is sent to all IOPs so that they can create peer2peer
- * connections between them.
- */
-struct i2o_sys_tbl_entry {
-	u16 org_id;
-	u16 reserved1;
-	u32 iop_id:12;
-	u32 reserved2:20;
-	u16 seg_num:12;
-	u16 i2o_version:4;
-	u8 iop_state;
-	u8 msg_type;
-	u16 frame_size;
-	u16 reserved3;
-	u32 last_changed;
-	u32 iop_capabilities;
-	u32 inbound_low;
-	u32 inbound_high;
-};
+#define I2O_BSA_DSC_SUCCESS               0x0000
+#define I2O_BSA_DSC_MEDIA_ERROR           0x0001
+#define I2O_BSA_DSC_ACCESS_ERROR          0x0002
+#define I2O_BSA_DSC_DEVICE_FAILURE        0x0003
+#define I2O_BSA_DSC_DEVICE_NOT_READY      0x0004
+#define I2O_BSA_DSC_MEDIA_NOT_PRESENT     0x0005
+#define I2O_BSA_DSC_MEDIA_LOCKED          0x0006
+#define I2O_BSA_DSC_MEDIA_FAILURE         0x0007
+#define I2O_BSA_DSC_PROTOCOL_FAILURE      0x0008
+#define I2O_BSA_DSC_BUS_FAILURE           0x0009
+#define I2O_BSA_DSC_ACCESS_VIOLATION      0x000A
+#define I2O_BSA_DSC_WRITE_PROTECTED       0x000B
+#define I2O_BSA_DSC_DEVICE_RESET          0x000C
+#define I2O_BSA_DSC_VOLUME_CHANGED        0x000D
+#define I2O_BSA_DSC_TIMEOUT               0x000E
 
-struct i2o_sys_tbl {
-	u8 num_entries;
-	u8 version;
-	u16 reserved1;
-	u32 change_ind;
-	u32 reserved2;
-	u32 reserved3;
-	struct i2o_sys_tbl_entry iops[0];
-};
+/* FailureStatusCodes, Table 3-3 Message Failure Codes */
 
-extern struct list_head i2o_controllers;
+#define I2O_FSC_TRANSPORT_SERVICE_SUSPENDED             0x81
+#define I2O_FSC_TRANSPORT_SERVICE_TERMINATED            0x82
+#define I2O_FSC_TRANSPORT_CONGESTION                    0x83
+#define I2O_FSC_TRANSPORT_FAILURE                       0x84
+#define I2O_FSC_TRANSPORT_STATE_ERROR                   0x85
+#define I2O_FSC_TRANSPORT_TIME_OUT                      0x86
+#define I2O_FSC_TRANSPORT_ROUTING_FAILURE               0x87
+#define I2O_FSC_TRANSPORT_INVALID_VERSION               0x88
+#define I2O_FSC_TRANSPORT_INVALID_OFFSET                0x89
+#define I2O_FSC_TRANSPORT_INVALID_MSG_FLAGS             0x8A
+#define I2O_FSC_TRANSPORT_FRAME_TOO_SMALL               0x8B
+#define I2O_FSC_TRANSPORT_FRAME_TOO_LARGE               0x8C
+#define I2O_FSC_TRANSPORT_INVALID_TARGET_ID             0x8D
+#define I2O_FSC_TRANSPORT_INVALID_INITIATOR_ID          0x8E
+#define I2O_FSC_TRANSPORT_INVALID_INITIATOR_CONTEXT     0x8F
+#define I2O_FSC_TRANSPORT_UNKNOWN_FAILURE               0xFF
 
-/* Message functions */
-static inline u32 i2o_msg_get(struct i2o_controller *,
-			      struct i2o_message __iomem **);
-extern u32 i2o_msg_get_wait(struct i2o_controller *,
-			    struct i2o_message __iomem **, int);
-static inline void i2o_msg_post(struct i2o_controller *, u32);
-static inline int i2o_msg_post_wait(struct i2o_controller *, u32,
-				    unsigned long);
-extern int i2o_msg_post_wait_mem(struct i2o_controller *, u32, unsigned long,
-				 struct i2o_dma *);
-extern void i2o_msg_nop(struct i2o_controller *, u32);
-static inline void i2o_flush_reply(struct i2o_controller *, u32);
+/* Device Claim Types */
+#define	I2O_CLAIM_PRIMARY					0x01000000
+#define	I2O_CLAIM_MANAGEMENT					0x02000000
+#define	I2O_CLAIM_AUTHORIZED					0x03000000
+#define	I2O_CLAIM_SECONDARY					0x04000000
 
-/* IOP functions */
-extern int i2o_status_get(struct i2o_controller *);
+/* Message header defines for VersionOffset */
+#define I2OVER15	0x0001
+#define I2OVER20	0x0002
 
-extern int i2o_event_register(struct i2o_device *, struct i2o_driver *, int,
-			      u32);
-extern struct i2o_device *i2o_iop_find_device(struct i2o_controller *, u16);
-extern struct i2o_controller *i2o_find_iop(int);
+/* Default is 1.5 */
+#define I2OVERSION	I2OVER15
 
-/* Functions needed for handling 64-bit pointers in 32-bit context */
-#if BITS_PER_LONG == 64
-extern u32 i2o_cntxt_list_add(struct i2o_controller *, void *);
-extern void *i2o_cntxt_list_get(struct i2o_controller *, u32);
-extern u32 i2o_cntxt_list_remove(struct i2o_controller *, void *);
-extern u32 i2o_cntxt_list_get_ptr(struct i2o_controller *, void *);
+#define SGL_OFFSET_0    I2OVERSION
+#define SGL_OFFSET_4    (0x0040 | I2OVERSION)
+#define SGL_OFFSET_5    (0x0050 | I2OVERSION)
+#define SGL_OFFSET_6    (0x0060 | I2OVERSION)
+#define SGL_OFFSET_7    (0x0070 | I2OVERSION)
+#define SGL_OFFSET_8    (0x0080 | I2OVERSION)
+#define SGL_OFFSET_9    (0x0090 | I2OVERSION)
+#define SGL_OFFSET_10   (0x00A0 | I2OVERSION)
+#define SGL_OFFSET_11   (0x00B0 | I2OVERSION)
+#define SGL_OFFSET_12   (0x00C0 | I2OVERSION)
+#define SGL_OFFSET(x)   (((x)<<4) | I2OVERSION)
 
-static inline u32 i2o_ptr_low(void *ptr)
-{
-	return (u32) (u64) ptr;
-};
+/* Transaction Reply Lists (TRL) Control Word structure */
+#define TRL_SINGLE_FIXED_LENGTH		0x00
+#define TRL_SINGLE_VARIABLE_LENGTH	0x40
+#define TRL_MULTIPLE_FIXED_LENGTH	0x80
 
-static inline u32 i2o_ptr_high(void *ptr)
-{
-	return (u32) ((u64) ptr >> 32);
-};
+ /* msg header defines for MsgFlags */
+#define MSG_STATIC	0x0100
+#define MSG_64BIT_CNTXT	0x0200
+#define MSG_MULTI_TRANS	0x1000
+#define MSG_FAIL	0x2000
+#define MSG_FINAL	0x4000
+#define MSG_REPLY	0x8000
 
-static inline u32 i2o_dma_low(dma_addr_t dma_addr)
-{
-	return (u32) (u64) dma_addr;
-};
+ /* minimum size msg */
+#define THREE_WORD_MSG_SIZE	0x00030000
+#define FOUR_WORD_MSG_SIZE	0x00040000
+#define FIVE_WORD_MSG_SIZE	0x00050000
+#define SIX_WORD_MSG_SIZE	0x00060000
+#define SEVEN_WORD_MSG_SIZE	0x00070000
+#define EIGHT_WORD_MSG_SIZE	0x00080000
+#define NINE_WORD_MSG_SIZE	0x00090000
+#define TEN_WORD_MSG_SIZE	0x000A0000
+#define ELEVEN_WORD_MSG_SIZE	0x000B0000
+#define I2O_MESSAGE_SIZE(x)	((x)<<16)
 
-static inline u32 i2o_dma_high(dma_addr_t dma_addr)
-{
-	return (u32) ((u64) dma_addr >> 32);
-};
-#else
-static inline u32 i2o_cntxt_list_add(struct i2o_controller *c, void *ptr)
-{
-	return (u32) ptr;
-};
+/* special TID assignments */
+#define ADAPTER_TID		0
+#define HOST_TID		1
 
-static inline void *i2o_cntxt_list_get(struct i2o_controller *c, u32 context)
-{
-	return (void *)context;
-};
+/* outbound queue defines */
+#define I2O_MAX_OUTBOUND_MSG_FRAMES	128
+#define I2O_OUTBOUND_MSG_FRAME_SIZE	128	/* in 32-bit words */
 
-static inline u32 i2o_cntxt_list_remove(struct i2o_controller *c, void *ptr)
-{
-	return (u32) ptr;
-};
+/* inbound queue definitions */
+#define I2O_MSG_INPOOL_MIN		32
+#define I2O_INBOUND_MSG_FRAME_SIZE	128	/* in 32-bit words */
 
-static inline u32 i2o_cntxt_list_get_ptr(struct i2o_controller *c, void *ptr)
-{
-	return (u32) ptr;
-};
+#define I2O_POST_WAIT_OK	0
+#define I2O_POST_WAIT_TIMEOUT	-ETIMEDOUT
 
-static inline u32 i2o_ptr_low(void *ptr)
-{
-	return (u32) ptr;
-};
+#define I2O_CONTEXT_LIST_MIN_LENGTH	15
+#define I2O_CONTEXT_LIST_USED		0x01
+#define I2O_CONTEXT_LIST_DELETED	0x02
 
-static inline u32 i2o_ptr_high(void *ptr)
-{
-	return 0;
-};
+/* timeouts */
+#define I2O_TIMEOUT_INIT_OUTBOUND_QUEUE	15
+#define I2O_TIMEOUT_MESSAGE_GET		5
+#define I2O_TIMEOUT_RESET		30
+#define I2O_TIMEOUT_STATUS_GET		5
+#define I2O_TIMEOUT_LCT_GET		360
+#define I2O_TIMEOUT_SCSI_SCB_ABORT	240
 
-static inline u32 i2o_dma_low(dma_addr_t dma_addr)
-{
-	return (u32) dma_addr;
-};
+/* retries */
+#define I2O_HRT_GET_TRIES		3
+#define I2O_LCT_GET_TRIES		3
 
-static inline u32 i2o_dma_high(dma_addr_t dma_addr)
-{
-	return 0;
-};
-#endif
+/* defines for max_sectors and max_phys_segments */
+#define I2O_MAX_SECTORS			1024
+#define I2O_MAX_SECTORS_LIMITED		256
+#define I2O_MAX_PHYS_SEGMENTS		MAX_PHYS_SEGMENTS
 
-/**
- *	i2o_sg_tablesize - Calculate the maximum number of elements in a SGL
- *	@c: I2O controller for which the calculation should be done
- *	@body_size: maximum body size used for message in 32-bit words.
- *
- *	Return the maximum number of SG elements in a SG list.
+/*
+ *	Message structures
  */
-static inline u16 i2o_sg_tablesize(struct i2o_controller *c, u16 body_size)
-{
-	i2o_status_block *sb = c->status_block.virt;
-	u16 sg_count =
-	    (sb->inbound_frame_size - sizeof(struct i2o_message) / 4) -
-	    body_size;
-
-	if (c->pae_support) {
-		/*
-		 * for 64-bit a SG attribute element must be added and each
-		 * SG element needs 12 bytes instead of 8.
-		 */
-		sg_count -= 2;
-		sg_count /= 3;
-	} else
-		sg_count /= 2;
-
-	if (c->short_req && (sg_count > 8))
-		sg_count = 8;
+struct i2o_message {
+	union {
+		struct {
+			u8 version_offset;
+			u8 flags;
+			u16 size;
+			u32 target_tid:12;
+			u32 init_tid:12;
+			u32 function:8;
+			u32 icntxt;	/* initiator context */
+			u32 tcntxt;	/* transaction context */
+		} s;
+		u32 head[4];
+	} u;
+	/* List follows */
+	u32 body[0];
+};
 
-	return sg_count;
+/* MFA and I2O message used by mempool */
+struct i2o_msg_mfa {
+	u32 mfa;		/* MFA returned by the controller */
+	struct i2o_message msg;	/* I2O message */
 };
 
-/**
- *	i2o_dma_map_single - Map pointer to controller and fill in I2O message.
- *	@c: I2O controller
- *	@ptr: pointer to the data which should be mapped
- *	@size: size of data in bytes
- *	@direction: DMA_TO_DEVICE / DMA_FROM_DEVICE
- *	@sg_ptr: pointer to the SG list inside the I2O message
- *
- *	This function does all necessary DMA handling and also writes the I2O
- *	SGL elements into the I2O message. For details on DMA handling see also
- *	dma_map_single(). The pointer sg_ptr will only be set to the end of the
- *	SG list if the allocation was successful.
- *
- *	Returns DMA address which must be checked for failures using
- *	dma_mapping_error().
+/*
+ *	Each I2O device entity has one of these. There is one per device.
  */
-static inline dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr,
-					    size_t size,
-					    enum dma_data_direction direction,
-					    u32 __iomem ** sg_ptr)
-{
-	u32 sg_flags;
-	u32 __iomem *mptr = *sg_ptr;
-	dma_addr_t dma_addr;
+struct i2o_device {
+	i2o_lct_entry lct_data;	/* Device LCT information */
 
-	switch (direction) {
-	case DMA_TO_DEVICE:
-		sg_flags = 0xd4000000;
-		break;
-	case DMA_FROM_DEVICE:
-		sg_flags = 0xd0000000;
-		break;
-	default:
-		return 0;
-	}
+	struct i2o_controller *iop;	/* Controlling IOP */
+	struct list_head list;	/* node in IOP devices list */
 
-	dma_addr = dma_map_single(&c->pdev->dev, ptr, size, direction);
-	if (!dma_mapping_error(dma_addr)) {
-#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
-		if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
-			writel(0x7C020002, mptr++);
-			writel(PAGE_SIZE, mptr++);
-		}
-#endif
+	struct device device;
 
-		writel(sg_flags | size, mptr++);
-		writel(i2o_dma_low(dma_addr), mptr++);
-#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
-		if ((sizeof(dma_addr_t) > 4) && c->pae_support)
-			writel(i2o_dma_high(dma_addr), mptr++);
-#endif
-		*sg_ptr = mptr;
-	}
-	return dma_addr;
+	struct semaphore lock;	/* device lock */
 };
 
-/**
- *	i2o_dma_map_sg - Map a SG List to controller and fill in I2O message.
- *	@c: I2O controller
- *	@sg: SG list to be mapped
- *	@sg_count: number of elements in the SG list
- *	@direction: DMA_TO_DEVICE / DMA_FROM_DEVICE
- *	@sg_ptr: pointer to the SG list inside the I2O message
- *
- *	This function does all necessary DMA handling and also writes the I2O
- *	SGL elements into the I2O message. For details on DMA handling see also
- *	dma_map_sg(). The pointer sg_ptr will only be set to the end of the SG
- *	list if the allocation was successful.
- *
- *	Returns 0 on failure or 1 on success.
+/*
+ *	Event structure provided to the event handling function
  */
-static inline int i2o_dma_map_sg(struct i2o_controller *c,
-				 struct scatterlist *sg, int sg_count,
-				 enum dma_data_direction direction,
-				 u32 __iomem ** sg_ptr)
-{
-	u32 sg_flags;
-	u32 __iomem *mptr = *sg_ptr;
-
-	switch (direction) {
-	case DMA_TO_DEVICE:
-		sg_flags = 0x14000000;
-		break;
-	case DMA_FROM_DEVICE:
-		sg_flags = 0x10000000;
-		break;
-	default:
-		return 0;
-	}
-
-	sg_count = dma_map_sg(&c->pdev->dev, sg, sg_count, direction);
-	if (!sg_count)
-		return 0;
-
-#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
-	if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
-		writel(0x7C020002, mptr++);
-		writel(PAGE_SIZE, mptr++);
-	}
-#endif
-
-	while (sg_count-- > 0) {
-		if (!sg_count)
-			sg_flags |= 0xC0000000;
-		writel(sg_flags | sg_dma_len(sg), mptr++);
-		writel(i2o_dma_low(sg_dma_address(sg)), mptr++);
-#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
-		if ((sizeof(dma_addr_t) > 4) && c->pae_support)
-			writel(i2o_dma_high(sg_dma_address(sg)), mptr++);
-#endif
-		sg++;
-	}
-	*sg_ptr = mptr;
+struct i2o_event {
+	struct work_struct work;
+	struct i2o_device *i2o_dev;	/* I2O device pointer from which the
+					   event reply was initiated */
+	u16 size;		/* Size of data in 32-bit words */
+	u32 tcntxt;		/* Transaction context used at
+				   registration */
+	u32 event_indicator;	/* Event indicator from reply */
+	u32 data[0];		/* Event data from reply */
+};
 
-	return 1;
+/*
+ *	I2O classes which could be handled by the OSM
+ */
+struct i2o_class_id {
+	u16 class_id:12;
 };
 
-/**
- *	i2o_dma_alloc - Allocate DMA memory
- *	@dev: struct device pointer to the PCI device of the I2O controller
- *	@addr: i2o_dma struct which should get the DMA buffer
- *	@len: length of the new DMA memory
- *	@gfp_mask: GFP mask
- *
- *	Allocate a coherent DMA memory and write the pointers into addr.
- *
- *	Returns 0 on success or -ENOMEM on failure.
+/*
+ *	I2O driver structure for OSMs
  */
-static inline int i2o_dma_alloc(struct device *dev, struct i2o_dma *addr,
-				size_t len, gfp_t gfp_mask)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-	int dma_64 = 0;
+struct i2o_driver {
+	char *name;		/* OSM name */
+	int context;		/* Low 8 bits of the transaction info */
+	struct i2o_class_id *classes;	/* I2O classes that this OSM handles */
 
-	if ((sizeof(dma_addr_t) > 4) && (pdev->dma_mask == DMA_64BIT_MASK)) {
-		dma_64 = 1;
-		if (pci_set_dma_mask(pdev, DMA_32BIT_MASK))
-			return -ENOMEM;
-	}
+	/* Message reply handler */
+	int (*reply) (struct i2o_controller *, u32, struct i2o_message *);
 
-	addr->virt = dma_alloc_coherent(dev, len, &addr->phys, gfp_mask);
+	/* Event handler */
+	void (*event) (struct i2o_event *);
 
-	if ((sizeof(dma_addr_t) > 4) && dma_64)
-		if (pci_set_dma_mask(pdev, DMA_64BIT_MASK))
-			printk(KERN_WARNING "i2o: unable to set 64-bit DMA");
+	struct workqueue_struct *event_queue;	/* Event queue */
 
-	if (!addr->virt)
-		return -ENOMEM;
+	struct device_driver driver;
 
-	memset(addr->virt, 0, len);
-	addr->len = len;
+	/* notification of changes */
+	void (*notify_controller_add) (struct i2o_controller *);
+	void (*notify_controller_remove) (struct i2o_controller *);
+	void (*notify_device_add) (struct i2o_device *);
+	void (*notify_device_remove) (struct i2o_device *);
 
-	return 0;
+	struct semaphore lock;
 };
 
-/**
- *	i2o_dma_free - Free DMA memory
- *	@dev: struct device pointer to the PCI device of the I2O controller
- *	@addr: i2o_dma struct which contains the DMA buffer
- *
- *	Free a coherent DMA memory and set virtual address of addr to NULL.
+/*
+ *	Contains DMA mapped address information
  */
-static inline void i2o_dma_free(struct device *dev, struct i2o_dma *addr)
-{
-	if (addr->virt) {
-		if (addr->phys)
-			dma_free_coherent(dev, addr->len, addr->virt,
-					  addr->phys);
-		else
-			kfree(addr->virt);
-		addr->virt = NULL;
-	}
+struct i2o_dma {
+	void *virt;
+	dma_addr_t phys;
+	size_t len;
 };
 
-/**
- *	i2o_dma_realloc - Realloc DMA memory
- *	@dev: struct device pointer to the PCI device of the I2O controller
- *	@addr: pointer to a i2o_dma struct DMA buffer
- *	@len: new length of memory
- *	@gfp_mask: GFP mask
- *
- *	If there was something allocated in the addr, free it first. If len > 0
- *	than try to allocate it and write the addresses back to the addr
- *	structure. If len == 0 set the virtual address to NULL.
- *
- *	Returns the 0 on success or negative error code on failure.
+/*
+ *	Contains slab cache and mempool information
  */
-static inline int i2o_dma_realloc(struct device *dev, struct i2o_dma *addr,
-				  size_t len, gfp_t gfp_mask)
-{
-	i2o_dma_free(dev, addr);
-
-	if (len)
-		return i2o_dma_alloc(dev, addr, len, gfp_mask);
-
-	return 0;
+struct i2o_pool {
+	char *name;
+	kmem_cache_t *slab;
+	mempool_t *mempool;
 };
 
-/* I2O driver (OSM) functions */
-extern int i2o_driver_register(struct i2o_driver *);
-extern void i2o_driver_unregister(struct i2o_driver *);
-
-/**
- *	i2o_driver_notify_controller_add - Send notification of added controller
- *					   to a single I2O driver
- *
- *	Send notification of added controller to a single registered driver.
+/*
+ *	Contains IO mapped address information
  */
-static inline void i2o_driver_notify_controller_add(struct i2o_driver *drv,
-						    struct i2o_controller *c)
-{
-	if (drv->notify_controller_add)
-		drv->notify_controller_add(c);
+struct i2o_io {
+	void __iomem *virt;
+	unsigned long phys;
+	unsigned long len;
 };
 
-/**
- *	i2o_driver_notify_controller_remove - Send notification of removed
- *					      controller to a single I2O driver
- *
- *	Send notification of removed controller to a single registered driver.
+/*
+ *	Context queue entry, used for 32-bit context on 64-bit systems
  */
-static inline void i2o_driver_notify_controller_remove(struct i2o_driver *drv,
-						       struct i2o_controller *c)
-{
-	if (drv->notify_controller_remove)
-		drv->notify_controller_remove(c);
+struct i2o_context_list_element {
+	struct list_head list;
+	u32 context;
+	void *ptr;
+	unsigned long timestamp;
 };
 
-/**
- *	i2o_driver_notify_device_add - Send notification of added device to a
- *				       single I2O driver
- *
- *	Send notification of added device to a single registered driver.
+/*
+ * Each I2O controller has one of these objects
  */
-static inline void i2o_driver_notify_device_add(struct i2o_driver *drv,
-						struct i2o_device *i2o_dev)
-{
-	if (drv->notify_device_add)
-		drv->notify_device_add(i2o_dev);
+struct i2o_controller {
+	char name[16];
+	int unit;
+	int type;
+
+	struct pci_dev *pdev;	/* PCI device */
+
+	unsigned int promise:1;	/* Promise controller */
+	unsigned int adaptec:1;	/* DPT / Adaptec controller */
+	unsigned int raptor:1;	/* split bar */
+	unsigned int no_quiesce:1;	/* don't quiesce before reset */
+	unsigned int short_req:1;	/* use small block sizes */
+	unsigned int limit_sectors:1;	/* limit number of sectors / request */
+	unsigned int pae_support:1;	/* controller has 64-bit SGL support */
+
+	struct list_head devices;	/* list of I2O devices */
+	struct list_head list;	/* Controller list */
+
+	void __iomem *in_port;	/* Inbound port address */
+	void __iomem *out_port;	/* Outbound port address */
+	void __iomem *irq_status;	/* Interrupt status register address */
+	void __iomem *irq_mask;	/* Interrupt mask register address */
+
+	struct i2o_dma status;	/* IOP status block */
+
+	struct i2o_dma hrt;	/* HW Resource Table */
+	i2o_lct *lct;		/* Logical Config Table */
+	struct i2o_dma dlct;	/* Temp LCT */
+	struct semaphore lct_lock;	/* Lock for LCT updates */
+	struct i2o_dma status_block;	/* IOP status block */
+
+	struct i2o_io base;	/* controller messaging unit */
+	struct i2o_io in_queue;	/* inbound message queue Host->IOP */
+	struct i2o_dma out_queue;	/* outbound message queue IOP->Host */
+
+	struct i2o_pool in_msg;	/* mempool for inbound messages */
+
+	unsigned int battery:1;	/* Has a battery backup */
+	unsigned int io_alloc:1;	/* An I/O resource was allocated */
+	unsigned int mem_alloc:1;	/* A memory resource was allocated */
+
+	struct resource io_resource;	/* I/O resource allocated to the IOP */
+	struct resource mem_resource;	/* Mem resource allocated to the IOP */
+
+	struct device device;
+	struct class_device *classdev;	/* I2O controller class device */
+	struct i2o_device *exec;	/* Executive */
+#if BITS_PER_LONG == 64
+	spinlock_t context_list_lock;	/* lock for context_list */
+	atomic_t context_list_counter;	/* needed for unique contexts */
+	struct list_head context_list;	/* list of context id's
+					   and pointers */
+#endif
+	spinlock_t lock;	/* lock for controller
+				   configuration */
+
+	void *driver_data[I2O_MAX_DRIVERS];	/* storage for drivers */
 };
 
-/**
- *	i2o_driver_notify_device_remove - Send notification of removed device
- *					  to a single I2O driver
+/*
+ * I2O System table entry
  *
- *	Send notification of removed device to a single registered driver.
+ * The system table contains information about all the IOPs in the
+ * system.  It is sent to all IOPs so that they can create peer2peer
+ * connections between them.
  */
-static inline void i2o_driver_notify_device_remove(struct i2o_driver *drv,
-						   struct i2o_device *i2o_dev)
-{
-	if (drv->notify_device_remove)
-		drv->notify_device_remove(i2o_dev);
+struct i2o_sys_tbl_entry {
+	u16 org_id;
+	u16 reserved1;
+	u32 iop_id:12;
+	u32 reserved2:20;
+	u16 seg_num:12;
+	u16 i2o_version:4;
+	u8 iop_state;
+	u8 msg_type;
+	u16 frame_size;
+	u16 reserved3;
+	u32 last_changed;
+	u32 iop_capabilities;
+	u32 inbound_low;
+	u32 inbound_high;
 };
 
-extern void i2o_driver_notify_controller_add_all(struct i2o_controller *);
-extern void i2o_driver_notify_controller_remove_all(struct i2o_controller *);
-extern void i2o_driver_notify_device_add_all(struct i2o_device *);
-extern void i2o_driver_notify_device_remove_all(struct i2o_device *);
+struct i2o_sys_tbl {
+	u8 num_entries;
+	u8 version;
+	u16 reserved1;
+	u32 change_ind;
+	u32 reserved2;
+	u32 reserved3;
+	struct i2o_sys_tbl_entry iops[0];
+};
 
-/* I2O device functions */
-extern int i2o_device_claim(struct i2o_device *);
-extern int i2o_device_claim_release(struct i2o_device *);
+extern struct list_head i2o_controllers;
 
-/* Exec OSM functions */
-extern int i2o_exec_lct_get(struct i2o_controller *);
+/* Message functions */
+static inline struct i2o_message *i2o_msg_get(struct i2o_controller *);
+extern struct i2o_message *i2o_msg_get_wait(struct i2o_controller *, int);
+static inline void i2o_msg_post(struct i2o_controller *, struct i2o_message *);
+static inline int i2o_msg_post_wait(struct i2o_controller *,
+				    struct i2o_message *, unsigned long);
+extern int i2o_msg_post_wait_mem(struct i2o_controller *, struct i2o_message *,
+				 unsigned long, struct i2o_dma *);
+static inline void i2o_flush_reply(struct i2o_controller *, u32);
 
-/* device / driver / kobject conversion functions */
-#define to_i2o_driver(drv) container_of(drv,struct i2o_driver, driver)
-#define to_i2o_device(dev) container_of(dev, struct i2o_device, device)
-#define to_i2o_controller(dev) container_of(dev, struct i2o_controller, device)
-#define kobj_to_i2o_device(kobj) to_i2o_device(container_of(kobj, struct device, kobj))
+/* IOP functions */
+extern int i2o_status_get(struct i2o_controller *);
 
-/**
- *	i2o_msg_get - obtain an I2O message from the IOP
- *	@c: I2O controller
- *	@msg: pointer to a I2O message pointer
- *
- *	This function tries to get a message slot. If no message slot is
- *	available do not wait until one is availabe (see also i2o_msg_get_wait).
- *
- *	On a success the message is returned and the pointer to the message is
- *	set in msg. The returned message is the physical page frame offset
- *	address from the read port (see the i2o spec). If no message is
- *	available returns I2O_QUEUE_EMPTY and msg is leaved untouched.
- */
-static inline u32 i2o_msg_get(struct i2o_controller *c,
-			      struct i2o_message __iomem ** msg)
-{
-	u32 m = readl(c->in_port);
+extern int i2o_event_register(struct i2o_device *, struct i2o_driver *, int,
+			      u32);
+extern struct i2o_device *i2o_iop_find_device(struct i2o_controller *, u16);
+extern struct i2o_controller *i2o_find_iop(int);
 
-	if (m != I2O_QUEUE_EMPTY)
-		*msg = c->in_queue.virt + m;
+/* Functions needed for handling 64-bit pointers in 32-bit context */
+#if BITS_PER_LONG == 64
+extern u32 i2o_cntxt_list_add(struct i2o_controller *, void *);
+extern void *i2o_cntxt_list_get(struct i2o_controller *, u32);
+extern u32 i2o_cntxt_list_remove(struct i2o_controller *, void *);
+extern u32 i2o_cntxt_list_get_ptr(struct i2o_controller *, void *);
 
-	return m;
+static inline u32 i2o_ptr_low(void *ptr)
+{
+	return (u32) (u64) ptr;
 };
 
-/**
- *	i2o_msg_post - Post I2O message to I2O controller
- *	@c: I2O controller to which the message should be send
- *	@m: the message identifier
- *
- *	Post the message to the I2O controller.
- */
-static inline void i2o_msg_post(struct i2o_controller *c, u32 m)
+static inline u32 i2o_ptr_high(void *ptr)
 {
-	writel(m, c->in_port);
+	return (u32) ((u64) ptr >> 32);
 };
 
-/**
- * 	i2o_msg_post_wait - Post and wait a message and wait until return
- *	@c: controller
- *	@m: message to post
- *	@timeout: time in seconds to wait
- *
- * 	This API allows an OSM to post a message and then be told whether or
- *	not the system received a successful reply. If the message times out
- *	then the value '-ETIMEDOUT' is returned.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static inline int i2o_msg_post_wait(struct i2o_controller *c, u32 m,
-				    unsigned long timeout)
+static inline u32 i2o_dma_low(dma_addr_t dma_addr)
 {
-	return i2o_msg_post_wait_mem(c, m, timeout, NULL);
+	return (u32) (u64) dma_addr;
 };
 
-/**
- *	i2o_flush_reply - Flush reply from I2O controller
- *	@c: I2O controller
- *	@m: the message identifier
- *
- *	The I2O controller must be informed that the reply message is not needed
- *	anymore. If you forget to flush the reply, the message frame can't be
- *	used by the controller anymore and is therefore lost.
- */
-static inline void i2o_flush_reply(struct i2o_controller *c, u32 m)
+static inline u32 i2o_dma_high(dma_addr_t dma_addr)
 {
-	writel(m, c->out_port);
+	return (u32) ((u64) dma_addr >> 32);
+};
+#else
+static inline u32 i2o_cntxt_list_add(struct i2o_controller *c, void *ptr)
+{
+	return (u32) ptr;
 };
 
-/**
- *	i2o_out_to_virt - Turn an I2O message to a virtual address
- *	@c: controller
- *	@m: message engine value
- *
- *	Turn a receive message from an I2O controller bus address into
- *	a Linux virtual address. The shared page frame is a linear block
- *	so we simply have to shift the offset. This function does not
- *	work for sender side messages as they are ioremap objects
- *	provided by the I2O controller.
- */
-static inline struct i2o_message *i2o_msg_out_to_virt(struct i2o_controller *c,
-						      u32 m)
+static inline void *i2o_cntxt_list_get(struct i2o_controller *c, u32 context)
 {
-	BUG_ON(m < c->out_queue.phys
-	       || m >= c->out_queue.phys + c->out_queue.len);
+	return (void *)context;
+};
 
-	return c->out_queue.virt + (m - c->out_queue.phys);
+static inline u32 i2o_cntxt_list_remove(struct i2o_controller *c, void *ptr)
+{
+	return (u32) ptr;
 };
 
-/**
- *	i2o_msg_in_to_virt - Turn an I2O message to a virtual address
- *	@c: controller
- *	@m: message engine value
+static inline u32 i2o_cntxt_list_get_ptr(struct i2o_controller *c, void *ptr)
+{
+	return (u32) ptr;
+};
+
+static inline u32 i2o_ptr_low(void *ptr)
+{
+	return (u32) ptr;
+};
+
+static inline u32 i2o_ptr_high(void *ptr)
+{
+	return 0;
+};
+
+static inline u32 i2o_dma_low(dma_addr_t dma_addr)
+{
+	return (u32) dma_addr;
+};
+
+static inline u32 i2o_dma_high(dma_addr_t dma_addr)
+{
+	return 0;
+};
+#endif
+
+/**
+ *	i2o_sg_tablesize - Calculate the maximum number of elements in a SGL
+ *	@c: I2O controller for which the calculation should be done
+ *	@body_size: maximum body size used for message in 32-bit words.
  *
- *	Turn a send message from an I2O controller bus address into
- *	a Linux virtual address. The shared page frame is a linear block
- *	so we simply have to shift the offset. This function does not
- *	work for receive side messages as they are kmalloc objects
- *	in a different pool.
+ *	Return the maximum number of SG elements in a SG list.
  */
-static inline struct i2o_message __iomem *i2o_msg_in_to_virt(struct
-							     i2o_controller *c,
-							     u32 m)
+static inline u16 i2o_sg_tablesize(struct i2o_controller *c, u16 body_size)
 {
-	return c->in_queue.virt + m;
+	i2o_status_block *sb = c->status_block.virt;
+	u16 sg_count =
+	    (sb->inbound_frame_size - sizeof(struct i2o_message) / 4) -
+	    body_size;
+
+	if (c->pae_support) {
+		/*
+		 * for 64-bit a SG attribute element must be added and each
+		 * SG element needs 12 bytes instead of 8.
+		 */
+		sg_count -= 2;
+		sg_count /= 3;
+	} else
+		sg_count /= 2;
+
+	if (c->short_req && (sg_count > 8))
+		sg_count = 8;
+
+	return sg_count;
 };
 
-/*
- *	Endian handling wrapped into the macro - keeps the core code
- *	cleaner.
+/**
+ *	i2o_dma_map_single - Map pointer to controller and fill in I2O message.
+ *	@c: I2O controller
+ *	@ptr: pointer to the data which should be mapped
+ *	@size: size of data in bytes
+ *	@direction: DMA_TO_DEVICE / DMA_FROM_DEVICE
+ *	@sg_ptr: pointer to the SG list inside the I2O message
+ *
+ *	This function does all necessary DMA handling and also writes the I2O
+ *	SGL elements into the I2O message. For details on DMA handling see also
+ *	dma_map_single(). The pointer sg_ptr will only be set to the end of the
+ *	SG list if the allocation was successful.
+ *
+ *	Returns DMA address which must be checked for failures using
+ *	dma_mapping_error().
  */
+static inline dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr,
+					    size_t size,
+					    enum dma_data_direction direction,
+					    u32 ** sg_ptr)
+{
+	u32 sg_flags;
+	u32 *mptr = *sg_ptr;
+	dma_addr_t dma_addr;
 
-#define i2o_raw_writel(val, mem)	__raw_writel(cpu_to_le32(val), mem)
+	switch (direction) {
+	case DMA_TO_DEVICE:
+		sg_flags = 0xd4000000;
+		break;
+	case DMA_FROM_DEVICE:
+		sg_flags = 0xd0000000;
+		break;
+	default:
+		return 0;
+	}
 
-extern int i2o_parm_field_get(struct i2o_device *, int, int, void *, int);
-extern int i2o_parm_table_get(struct i2o_device *, int, int, int, void *, int,
-			      void *, int);
+	dma_addr = dma_map_single(&c->pdev->dev, ptr, size, direction);
+	if (!dma_mapping_error(dma_addr)) {
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+		if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
+			*mptr++ = cpu_to_le32(0x7C020002);
+			*mptr++ = cpu_to_le32(PAGE_SIZE);
+		}
+#endif
 
-/* debugging and troubleshooting/diagnostic helpers. */
-#define osm_printk(level, format, arg...)  \
-	printk(level "%s: " format, OSM_NAME , ## arg)
+		*mptr++ = cpu_to_le32(sg_flags | size);
+		*mptr++ = cpu_to_le32(i2o_dma_low(dma_addr));
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+		if ((sizeof(dma_addr_t) > 4) && c->pae_support)
+			*mptr++ = cpu_to_le32(i2o_dma_high(dma_addr));
+#endif
+		*sg_ptr = mptr;
+	}
+	return dma_addr;
+};
 
-#ifdef DEBUG
-#define osm_debug(format, arg...) \
-	osm_printk(KERN_DEBUG, format , ## arg)
-#else
-#define osm_debug(format, arg...) \
-        do { } while (0)
+/**
+ *	i2o_dma_map_sg - Map a SG List to controller and fill in I2O message.
+ *	@c: I2O controller
+ *	@sg: SG list to be mapped
+ *	@sg_count: number of elements in the SG list
+ *	@direction: DMA_TO_DEVICE / DMA_FROM_DEVICE
+ *	@sg_ptr: pointer to the SG list inside the I2O message
+ *
+ *	This function does all necessary DMA handling and also writes the I2O
+ *	SGL elements into the I2O message. For details on DMA handling see also
+ *	dma_map_sg(). The pointer sg_ptr will only be set to the end of the SG
+ *	list if the allocation was successful.
+ *
+ *	Returns 0 on failure or 1 on success.
+ */
+static inline int i2o_dma_map_sg(struct i2o_controller *c,
+				 struct scatterlist *sg, int sg_count,
+				 enum dma_data_direction direction,
+				 u32 ** sg_ptr)
+{
+	u32 sg_flags;
+	u32 *mptr = *sg_ptr;
+
+	switch (direction) {
+	case DMA_TO_DEVICE:
+		sg_flags = 0x14000000;
+		break;
+	case DMA_FROM_DEVICE:
+		sg_flags = 0x10000000;
+		break;
+	default:
+		return 0;
+	}
+
+	sg_count = dma_map_sg(&c->pdev->dev, sg, sg_count, direction);
+	if (!sg_count)
+		return 0;
+
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+	if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
+		*mptr++ = cpu_to_le32(0x7C020002);
+		*mptr++ = cpu_to_le32(PAGE_SIZE);
+	}
 #endif
 
-#define osm_err(format, arg...)		\
-	osm_printk(KERN_ERR, format , ## arg)
-#define osm_info(format, arg...)		\
-	osm_printk(KERN_INFO, format , ## arg)
-#define osm_warn(format, arg...)		\
-	osm_printk(KERN_WARNING, format , ## arg)
+	while (sg_count-- > 0) {
+		if (!sg_count)
+			sg_flags |= 0xC0000000;
+		*mptr++ = cpu_to_le32(sg_flags | sg_dma_len(sg));
+		*mptr++ = cpu_to_le32(i2o_dma_low(sg_dma_address(sg)));
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+		if ((sizeof(dma_addr_t) > 4) && c->pae_support)
+			*mptr++ = cpu_to_le32(i2o_dma_high(sg_dma_address(sg)));
+#endif
+		sg++;
+	}
+	*sg_ptr = mptr;
 
-/* debugging functions */
-extern void i2o_report_status(const char *, const char *, struct i2o_message *);
-extern void i2o_dump_message(struct i2o_message *);
-extern void i2o_dump_hrt(struct i2o_controller *c);
-extern void i2o_debug_state(struct i2o_controller *c);
+	return 1;
+};
 
-/*
- *	Cache strategies
+/**
+ *	i2o_dma_alloc - Allocate DMA memory
+ *	@dev: struct device pointer to the PCI device of the I2O controller
+ *	@addr: i2o_dma struct which should get the DMA buffer
+ *	@len: length of the new DMA memory
+ *	@gfp_mask: GFP mask
+ *
+ *	Allocate a coherent DMA memory and write the pointers into addr.
+ *
+ *	Returns 0 on success or -ENOMEM on failure.
  */
+static inline int i2o_dma_alloc(struct device *dev, struct i2o_dma *addr,
+				size_t len, gfp_t gfp_mask)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	int dma_64 = 0;
 
-/*	The NULL strategy leaves everything up to the controller. This tends to be a
- *	pessimal but functional choice.
- */
-#define CACHE_NULL		0
-/*	Prefetch data when reading. We continually attempt to load the next 32 sectors
- *	into the controller cache.
- */
-#define CACHE_PREFETCH		1
-/*	Prefetch data when reading. We sometimes attempt to load the next 32 sectors
- *	into the controller cache. When an I/O is less <= 8K we assume its probably
- *	not sequential and don't prefetch (default)
- */
-#define CACHE_SMARTFETCH	2
-/*	Data is written to the cache and then out on to the disk. The I/O must be
- *	physically on the medium before the write is acknowledged (default without
- *	NVRAM)
- */
-#define CACHE_WRITETHROUGH	17
-/*	Data is written to the cache and then out on to the disk. The controller
- *	is permitted to write back the cache any way it wants. (default if battery
- *	backed NVRAM is present). It can be useful to set this for swap regardless of
- *	battery state.
- */
-#define CACHE_WRITEBACK		18
-/*	Optimise for under powered controllers, especially on RAID1 and RAID0. We
- *	write large I/O's directly to disk bypassing the cache to avoid the extra
- *	memory copy hits. Small writes are writeback cached
- */
-#define CACHE_SMARTBACK		19
-/*	Optimise for under powered controllers, especially on RAID1 and RAID0. We
- *	write large I/O's directly to disk bypassing the cache to avoid the extra
- *	memory copy hits. Small writes are writethrough cached. Suitable for devices
- *	lacking battery backup
- */
-#define CACHE_SMARTTHROUGH	20
+	if ((sizeof(dma_addr_t) > 4) && (pdev->dma_mask == DMA_64BIT_MASK)) {
+		dma_64 = 1;
+		if (pci_set_dma_mask(pdev, DMA_32BIT_MASK))
+			return -ENOMEM;
+	}
 
-/*
- *	Ioctl structures
- */
+	addr->virt = dma_alloc_coherent(dev, len, &addr->phys, gfp_mask);
 
-#define 	BLKI2OGRSTRAT	_IOR('2', 1, int)
-#define 	BLKI2OGWSTRAT	_IOR('2', 2, int)
-#define 	BLKI2OSRSTRAT	_IOW('2', 3, int)
-#define 	BLKI2OSWSTRAT	_IOW('2', 4, int)
+	if ((sizeof(dma_addr_t) > 4) && dma_64)
+		if (pci_set_dma_mask(pdev, DMA_64BIT_MASK))
+			printk(KERN_WARNING "i2o: unable to set 64-bit DMA");
 
-/*
- *	I2O Function codes
- */
+	if (!addr->virt)
+		return -ENOMEM;
 
-/*
- *	Executive Class
- */
-#define	I2O_CMD_ADAPTER_ASSIGN		0xB3
-#define	I2O_CMD_ADAPTER_READ		0xB2
-#define	I2O_CMD_ADAPTER_RELEASE		0xB5
-#define	I2O_CMD_BIOS_INFO_SET		0xA5
-#define	I2O_CMD_BOOT_DEVICE_SET		0xA7
-#define	I2O_CMD_CONFIG_VALIDATE		0xBB
-#define	I2O_CMD_CONN_SETUP		0xCA
-#define	I2O_CMD_DDM_DESTROY		0xB1
-#define	I2O_CMD_DDM_ENABLE		0xD5
-#define	I2O_CMD_DDM_QUIESCE		0xC7
-#define	I2O_CMD_DDM_RESET		0xD9
-#define	I2O_CMD_DDM_SUSPEND		0xAF
-#define	I2O_CMD_DEVICE_ASSIGN		0xB7
-#define	I2O_CMD_DEVICE_RELEASE		0xB9
-#define	I2O_CMD_HRT_GET			0xA8
-#define	I2O_CMD_ADAPTER_CLEAR		0xBE
-#define	I2O_CMD_ADAPTER_CONNECT		0xC9
-#define	I2O_CMD_ADAPTER_RESET		0xBD
-#define	I2O_CMD_LCT_NOTIFY		0xA2
-#define	I2O_CMD_OUTBOUND_INIT		0xA1
-#define	I2O_CMD_PATH_ENABLE		0xD3
-#define	I2O_CMD_PATH_QUIESCE		0xC5
-#define	I2O_CMD_PATH_RESET		0xD7
-#define	I2O_CMD_STATIC_MF_CREATE	0xDD
-#define	I2O_CMD_STATIC_MF_RELEASE	0xDF
-#define	I2O_CMD_STATUS_GET		0xA0
-#define	I2O_CMD_SW_DOWNLOAD		0xA9
-#define	I2O_CMD_SW_UPLOAD		0xAB
-#define	I2O_CMD_SW_REMOVE		0xAD
-#define	I2O_CMD_SYS_ENABLE		0xD1
-#define	I2O_CMD_SYS_MODIFY		0xC1
-#define	I2O_CMD_SYS_QUIESCE		0xC3
-#define	I2O_CMD_SYS_TAB_SET		0xA3
+	memset(addr->virt, 0, len);
+	addr->len = len;
 
-/*
- * Utility Class
+	return 0;
+};
+
+/**
+ *	i2o_dma_free - Free DMA memory
+ *	@dev: struct device pointer to the PCI device of the I2O controller
+ *	@addr: i2o_dma struct which contains the DMA buffer
+ *
+ *	Free a coherent DMA memory and set virtual address of addr to NULL.
  */
-#define I2O_CMD_UTIL_NOP		0x00
-#define I2O_CMD_UTIL_ABORT		0x01
-#define I2O_CMD_UTIL_CLAIM		0x09
-#define I2O_CMD_UTIL_RELEASE		0x0B
-#define I2O_CMD_UTIL_PARAMS_GET		0x06
-#define I2O_CMD_UTIL_PARAMS_SET		0x05
-#define I2O_CMD_UTIL_EVT_REGISTER	0x13
-#define I2O_CMD_UTIL_EVT_ACK		0x14
-#define I2O_CMD_UTIL_CONFIG_DIALOG	0x10
-#define I2O_CMD_UTIL_DEVICE_RESERVE	0x0D
-#define I2O_CMD_UTIL_DEVICE_RELEASE	0x0F
-#define I2O_CMD_UTIL_LOCK		0x17
-#define I2O_CMD_UTIL_LOCK_RELEASE	0x19
-#define I2O_CMD_UTIL_REPLY_FAULT_NOTIFY	0x15
+static inline void i2o_dma_free(struct device *dev, struct i2o_dma *addr)
+{
+	if (addr->virt) {
+		if (addr->phys)
+			dma_free_coherent(dev, addr->len, addr->virt,
+					  addr->phys);
+		else
+			kfree(addr->virt);
+		addr->virt = NULL;
+	}
+};
 
-/*
- * SCSI Host Bus Adapter Class
+/**
+ *	i2o_dma_realloc - Realloc DMA memory
+ *	@dev: struct device pointer to the PCI device of the I2O controller
+ *	@addr: pointer to a i2o_dma struct DMA buffer
+ *	@len: new length of memory
+ *	@gfp_mask: GFP mask
+ *
+ *	If there was something allocated in the addr, free it first. If len > 0
+ *	then try to allocate it and write the addresses back to the addr
+ *	structure. If len == 0 set the virtual address to NULL.
+ *
+ *	Returns 0 on success or negative error code on failure.
  */
-#define I2O_CMD_SCSI_EXEC		0x81
-#define I2O_CMD_SCSI_ABORT		0x83
-#define I2O_CMD_SCSI_BUSRESET		0x27
+static inline int i2o_dma_realloc(struct device *dev, struct i2o_dma *addr,
+				  size_t len, gfp_t gfp_mask)
+{
+	i2o_dma_free(dev, addr);
+
+	if (len)
+		return i2o_dma_alloc(dev, addr, len, gfp_mask);
+
+	return 0;
+};
 
 /*
- * Bus Adapter Class
+ *	i2o_pool_alloc - Allocate a slab cache and mempool
+ *	@pool: pointer to struct i2o_pool to write data into.
+ *	@name: name which is used to identify cache
+ *	@size: size of each object
+ *	@min_nr: minimum number of objects
+ *
+ *	First allocates a slab cache with name and size. Then allocates a
+ *	mempool which uses the slab cache for allocation and freeing.
+ *
+ *	Returns 0 on success or negative error code on failure.
  */
-#define I2O_CMD_BUS_ADAPTER_RESET	0x85
-#define I2O_CMD_BUS_RESET		0x87
-#define I2O_CMD_BUS_SCAN		0x89
-#define I2O_CMD_BUS_QUIESCE		0x8b
+static inline int i2o_pool_alloc(struct i2o_pool *pool, const char *name,
+				 size_t size, int min_nr)
+{
+	pool->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
+	if (!pool->name)
+		goto exit;
+	strcpy(pool->name, name);
+
+	pool->slab =
+	    kmem_cache_create(pool->name, size, 0, SLAB_HWCACHE_ALIGN, NULL,
+			      NULL);
+	if (!pool->slab)
+		goto free_name;
+
+	pool->mempool =
+	    mempool_create(min_nr, mempool_alloc_slab, mempool_free_slab,
+			   pool->slab);
+	if (!pool->mempool)
+		goto free_slab;
+
+	return 0;
+
+      free_slab:
+	kmem_cache_destroy(pool->slab);
+
+      free_name:
+	kfree(pool->name);
+
+      exit:
+	return -ENOMEM;
+};
 
 /*
- * Random Block Storage Class
+ *	i2o_pool_free - Free slab cache and mempool again
+ *	@pool: pointer to struct i2o_pool which should be freed
+ *
+ *	Note that you have to return all objects to the mempool again before
+ *	calling i2o_pool_free().
  */
-#define I2O_CMD_BLOCK_READ		0x30
-#define I2O_CMD_BLOCK_WRITE		0x31
-#define I2O_CMD_BLOCK_CFLUSH		0x37
-#define I2O_CMD_BLOCK_MLOCK		0x49
-#define I2O_CMD_BLOCK_MUNLOCK		0x4B
-#define I2O_CMD_BLOCK_MMOUNT		0x41
-#define I2O_CMD_BLOCK_MEJECT		0x43
-#define I2O_CMD_BLOCK_POWER		0x70
-
-#define I2O_CMD_PRIVATE			0xFF
+static inline void i2o_pool_free(struct i2o_pool *pool)
+{
+	mempool_destroy(pool->mempool);
+	kmem_cache_destroy(pool->slab);
+	kfree(pool->name);
+};
 
-/* Command status values  */
+/* I2O driver (OSM) functions */
+extern int i2o_driver_register(struct i2o_driver *);
+extern void i2o_driver_unregister(struct i2o_driver *);
 
-#define I2O_CMD_IN_PROGRESS	0x01
-#define I2O_CMD_REJECTED	0x02
-#define I2O_CMD_FAILED		0x03
-#define I2O_CMD_COMPLETED	0x04
+/**
+ *	i2o_driver_notify_controller_add - Send notification of added controller
+ *					   to a single I2O driver
+ *
+ *	Send notification of added controller to a single registered driver.
+ */
+static inline void i2o_driver_notify_controller_add(struct i2o_driver *drv,
+						    struct i2o_controller *c)
+{
+	if (drv->notify_controller_add)
+		drv->notify_controller_add(c);
+};
 
-/* I2O API function return values */
+/**
+ *	i2o_driver_notify_controller_remove - Send notification of removed
+ *					      controller to a single I2O driver
+ *
+ *	Send notification of removed controller to a single registered driver.
+ */
+static inline void i2o_driver_notify_controller_remove(struct i2o_driver *drv,
+						       struct i2o_controller *c)
+{
+	if (drv->notify_controller_remove)
+		drv->notify_controller_remove(c);
+};
 
-#define I2O_RTN_NO_ERROR			0
-#define I2O_RTN_NOT_INIT			1
-#define I2O_RTN_FREE_Q_EMPTY			2
-#define I2O_RTN_TCB_ERROR			3
-#define I2O_RTN_TRANSACTION_ERROR		4
-#define I2O_RTN_ADAPTER_ALREADY_INIT		5
-#define I2O_RTN_MALLOC_ERROR			6
-#define I2O_RTN_ADPTR_NOT_REGISTERED		7
-#define I2O_RTN_MSG_REPLY_TIMEOUT		8
-#define I2O_RTN_NO_STATUS			9
-#define I2O_RTN_NO_FIRM_VER			10
-#define	I2O_RTN_NO_LINK_SPEED			11
+/**
+ *	i2o_driver_notify_device_add - Send notification of added device to a
+ *				       single I2O driver
+ *
+ *	Send notification of added device to a single registered driver.
+ */
+static inline void i2o_driver_notify_device_add(struct i2o_driver *drv,
+						struct i2o_device *i2o_dev)
+{
+	if (drv->notify_device_add)
+		drv->notify_device_add(i2o_dev);
+};
 
-/* Reply message status defines for all messages */
+/**
+ *	i2o_driver_notify_device_remove - Send notification of removed device
+ *					  to a single I2O driver
+ *
+ *	Send notification of removed device to a single registered driver.
+ */
+static inline void i2o_driver_notify_device_remove(struct i2o_driver *drv,
+						   struct i2o_device *i2o_dev)
+{
+	if (drv->notify_device_remove)
+		drv->notify_device_remove(i2o_dev);
+};
 
-#define I2O_REPLY_STATUS_SUCCESS                    	0x00
-#define I2O_REPLY_STATUS_ABORT_DIRTY                	0x01
-#define I2O_REPLY_STATUS_ABORT_NO_DATA_TRANSFER     	0x02
-#define	I2O_REPLY_STATUS_ABORT_PARTIAL_TRANSFER		0x03
-#define	I2O_REPLY_STATUS_ERROR_DIRTY			0x04
-#define	I2O_REPLY_STATUS_ERROR_NO_DATA_TRANSFER		0x05
-#define	I2O_REPLY_STATUS_ERROR_PARTIAL_TRANSFER		0x06
-#define	I2O_REPLY_STATUS_PROCESS_ABORT_DIRTY		0x08
-#define	I2O_REPLY_STATUS_PROCESS_ABORT_NO_DATA_TRANSFER	0x09
-#define	I2O_REPLY_STATUS_PROCESS_ABORT_PARTIAL_TRANSFER	0x0A
-#define	I2O_REPLY_STATUS_TRANSACTION_ERROR		0x0B
-#define	I2O_REPLY_STATUS_PROGRESS_REPORT		0x80
+extern void i2o_driver_notify_controller_add_all(struct i2o_controller *);
+extern void i2o_driver_notify_controller_remove_all(struct i2o_controller *);
+extern void i2o_driver_notify_device_add_all(struct i2o_device *);
+extern void i2o_driver_notify_device_remove_all(struct i2o_device *);
 
-/* Status codes and Error Information for Parameter functions */
+/* I2O device functions */
+extern int i2o_device_claim(struct i2o_device *);
+extern int i2o_device_claim_release(struct i2o_device *);
 
-#define I2O_PARAMS_STATUS_SUCCESS		0x00
-#define I2O_PARAMS_STATUS_BAD_KEY_ABORT		0x01
-#define I2O_PARAMS_STATUS_BAD_KEY_CONTINUE   	0x02
-#define I2O_PARAMS_STATUS_BUFFER_FULL		0x03
-#define I2O_PARAMS_STATUS_BUFFER_TOO_SMALL	0x04
-#define I2O_PARAMS_STATUS_FIELD_UNREADABLE	0x05
-#define I2O_PARAMS_STATUS_FIELD_UNWRITEABLE	0x06
-#define I2O_PARAMS_STATUS_INSUFFICIENT_FIELDS	0x07
-#define I2O_PARAMS_STATUS_INVALID_GROUP_ID	0x08
-#define I2O_PARAMS_STATUS_INVALID_OPERATION	0x09
-#define I2O_PARAMS_STATUS_NO_KEY_FIELD		0x0A
-#define I2O_PARAMS_STATUS_NO_SUCH_FIELD		0x0B
-#define I2O_PARAMS_STATUS_NON_DYNAMIC_GROUP	0x0C
-#define I2O_PARAMS_STATUS_OPERATION_ERROR	0x0D
-#define I2O_PARAMS_STATUS_SCALAR_ERROR		0x0E
-#define I2O_PARAMS_STATUS_TABLE_ERROR		0x0F
-#define I2O_PARAMS_STATUS_WRONG_GROUP_TYPE	0x10
+/* Exec OSM functions */
+extern int i2o_exec_lct_get(struct i2o_controller *);
 
-/* DetailedStatusCode defines for Executive, DDM, Util and Transaction error
- * messages: Table 3-2 Detailed Status Codes.*/
+/* device / driver / kobject conversion functions */
+#define to_i2o_driver(drv) container_of(drv,struct i2o_driver, driver)
+#define to_i2o_device(dev) container_of(dev, struct i2o_device, device)
+#define to_i2o_controller(dev) container_of(dev, struct i2o_controller, device)
+#define kobj_to_i2o_device(kobj) to_i2o_device(container_of(kobj, struct device, kobj))
 
-#define I2O_DSC_SUCCESS                        0x0000
-#define I2O_DSC_BAD_KEY                        0x0002
-#define I2O_DSC_TCL_ERROR                      0x0003
-#define I2O_DSC_REPLY_BUFFER_FULL              0x0004
-#define I2O_DSC_NO_SUCH_PAGE                   0x0005
-#define I2O_DSC_INSUFFICIENT_RESOURCE_SOFT     0x0006
-#define I2O_DSC_INSUFFICIENT_RESOURCE_HARD     0x0007
-#define I2O_DSC_CHAIN_BUFFER_TOO_LARGE         0x0009
-#define I2O_DSC_UNSUPPORTED_FUNCTION           0x000A
-#define I2O_DSC_DEVICE_LOCKED                  0x000B
-#define I2O_DSC_DEVICE_RESET                   0x000C
-#define I2O_DSC_INAPPROPRIATE_FUNCTION         0x000D
-#define I2O_DSC_INVALID_INITIATOR_ADDRESS      0x000E
-#define I2O_DSC_INVALID_MESSAGE_FLAGS          0x000F
-#define I2O_DSC_INVALID_OFFSET                 0x0010
-#define I2O_DSC_INVALID_PARAMETER              0x0011
-#define I2O_DSC_INVALID_REQUEST                0x0012
-#define I2O_DSC_INVALID_TARGET_ADDRESS         0x0013
-#define I2O_DSC_MESSAGE_TOO_LARGE              0x0014
-#define I2O_DSC_MESSAGE_TOO_SMALL              0x0015
-#define I2O_DSC_MISSING_PARAMETER              0x0016
-#define I2O_DSC_TIMEOUT                        0x0017
-#define I2O_DSC_UNKNOWN_ERROR                  0x0018
-#define I2O_DSC_UNKNOWN_FUNCTION               0x0019
-#define I2O_DSC_UNSUPPORTED_VERSION            0x001A
-#define I2O_DSC_DEVICE_BUSY                    0x001B
-#define I2O_DSC_DEVICE_NOT_AVAILABLE           0x001C
+/**
+ *	i2o_msg_out_to_virt - Turn an I2O message to a virtual address
+ *	@c: controller
+ *	@m: message engine value
+ *
+ *	Turn a receive message from an I2O controller bus address into
+ *	a Linux virtual address. The shared page frame is a linear block
+ *	so we simply have to shift the offset. This function does not
+ *	work for sender side messages as they are ioremap objects
+ *	provided by the I2O controller.
+ */
+static inline struct i2o_message *i2o_msg_out_to_virt(struct i2o_controller *c,
+						      u32 m)
+{
+	BUG_ON(m < c->out_queue.phys
+	       || m >= c->out_queue.phys + c->out_queue.len);
 
-/* DetailedStatusCode defines for Block Storage Operation: Table 6-7 Detailed
-   Status Codes.*/
+	return c->out_queue.virt + (m - c->out_queue.phys);
+};
 
-#define I2O_BSA_DSC_SUCCESS               0x0000
-#define I2O_BSA_DSC_MEDIA_ERROR           0x0001
-#define I2O_BSA_DSC_ACCESS_ERROR          0x0002
-#define I2O_BSA_DSC_DEVICE_FAILURE        0x0003
-#define I2O_BSA_DSC_DEVICE_NOT_READY      0x0004
-#define I2O_BSA_DSC_MEDIA_NOT_PRESENT     0x0005
-#define I2O_BSA_DSC_MEDIA_LOCKED          0x0006
-#define I2O_BSA_DSC_MEDIA_FAILURE         0x0007
-#define I2O_BSA_DSC_PROTOCOL_FAILURE      0x0008
-#define I2O_BSA_DSC_BUS_FAILURE           0x0009
-#define I2O_BSA_DSC_ACCESS_VIOLATION      0x000A
-#define I2O_BSA_DSC_WRITE_PROTECTED       0x000B
-#define I2O_BSA_DSC_DEVICE_RESET          0x000C
-#define I2O_BSA_DSC_VOLUME_CHANGED        0x000D
-#define I2O_BSA_DSC_TIMEOUT               0x000E
+/**
+ *	i2o_msg_in_to_virt - Turn an I2O message to a virtual address
+ *	@c: controller
+ *	@m: message engine value
+ *
+ *	Turn a send message from an I2O controller bus address into
+ *	a Linux virtual address. The shared page frame is a linear block
+ *	so we simply have to shift the offset. This function does not
+ *	work for receive side messages as they are kmalloc objects
+ *	in a different pool.
+ */
+static inline struct i2o_message __iomem *i2o_msg_in_to_virt(struct
+							     i2o_controller *c,
+							     u32 m)
+{
+	return c->in_queue.virt + m;
+};
 
-/* FailureStatusCodes, Table 3-3 Message Failure Codes */
+/**
+ *	i2o_msg_get - obtain an I2O message from the IOP
+ *	@c: I2O controller
+ *
+ *	This function tries to get a message frame. If no message frame is
+ *	available do not wait until one is available (see also i2o_msg_get_wait).
+ *	The returned pointer to the message frame is not in I/O memory, it is
+ *	allocated from a mempool. But because a MFA is allocated from the
+ *	controller too it is guaranteed that i2o_msg_post() will never fail.
+ *
+ *	On a success a pointer to the message frame is returned. If the message
+ *	queue is empty -EBUSY is returned and if no memory is available -ENOMEM
+ *	is returned.
+ */
+static inline struct i2o_message *i2o_msg_get(struct i2o_controller *c)
+{
+	struct i2o_msg_mfa *mmsg = mempool_alloc(c->in_msg.mempool, GFP_ATOMIC);
+	if (!mmsg)
+		return ERR_PTR(-ENOMEM);
+
+	mmsg->mfa = readl(c->in_port);
+	if (mmsg->mfa == I2O_QUEUE_EMPTY) {
+		mempool_free(mmsg, c->in_msg.mempool);
+		return ERR_PTR(-EBUSY);
+	}
 
-#define I2O_FSC_TRANSPORT_SERVICE_SUSPENDED             0x81
-#define I2O_FSC_TRANSPORT_SERVICE_TERMINATED            0x82
-#define I2O_FSC_TRANSPORT_CONGESTION                    0x83
-#define I2O_FSC_TRANSPORT_FAILURE                       0x84
-#define I2O_FSC_TRANSPORT_STATE_ERROR                   0x85
-#define I2O_FSC_TRANSPORT_TIME_OUT                      0x86
-#define I2O_FSC_TRANSPORT_ROUTING_FAILURE               0x87
-#define I2O_FSC_TRANSPORT_INVALID_VERSION               0x88
-#define I2O_FSC_TRANSPORT_INVALID_OFFSET                0x89
-#define I2O_FSC_TRANSPORT_INVALID_MSG_FLAGS             0x8A
-#define I2O_FSC_TRANSPORT_FRAME_TOO_SMALL               0x8B
-#define I2O_FSC_TRANSPORT_FRAME_TOO_LARGE               0x8C
-#define I2O_FSC_TRANSPORT_INVALID_TARGET_ID             0x8D
-#define I2O_FSC_TRANSPORT_INVALID_INITIATOR_ID          0x8E
-#define I2O_FSC_TRANSPORT_INVALID_INITIATOR_CONTEXT     0x8F
-#define I2O_FSC_TRANSPORT_UNKNOWN_FAILURE               0xFF
+	return &mmsg->msg;
+};
 
-/* Device Claim Types */
-#define	I2O_CLAIM_PRIMARY					0x01000000
-#define	I2O_CLAIM_MANAGEMENT					0x02000000
-#define	I2O_CLAIM_AUTHORIZED					0x03000000
-#define	I2O_CLAIM_SECONDARY					0x04000000
+/**
+ *	i2o_msg_post - Post I2O message to I2O controller
+ *	@c: I2O controller to which the message should be send
+ *	@msg: message returned by i2o_msg_get()
+ *
+ *	Post the message to the I2O controller and return immediately.
+ */
+static inline void i2o_msg_post(struct i2o_controller *c,
+				struct i2o_message *msg)
+{
+	struct i2o_msg_mfa *mmsg;
 
-/* Message header defines for VersionOffset */
-#define I2OVER15	0x0001
-#define I2OVER20	0x0002
+	mmsg = container_of(msg, struct i2o_msg_mfa, msg);
+	memcpy_toio(i2o_msg_in_to_virt(c, mmsg->mfa), msg,
+		    (le32_to_cpu(msg->u.head[0]) >> 16) << 2);
+	writel(mmsg->mfa, c->in_port);
+	mempool_free(mmsg, c->in_msg.mempool);
+};
 
-/* Default is 1.5 */
-#define I2OVERSION	I2OVER15
+/**
+ * 	i2o_msg_post_wait - Post and wait a message and wait until return
+ *	@c: controller
+ *	@m: message to post
+ *	@timeout: time in seconds to wait
+ *
+ * 	This API allows an OSM to post a message and then be told whether or
+ *	not the system received a successful reply. If the message times out
+ *	then the value '-ETIMEDOUT' is returned.
+ *
+ *	Returns 0 on success or negative error code on failure.
+ */
+static inline int i2o_msg_post_wait(struct i2o_controller *c,
+				    struct i2o_message *msg,
+				    unsigned long timeout)
+{
+	return i2o_msg_post_wait_mem(c, msg, timeout, NULL);
+};
 
-#define SGL_OFFSET_0    I2OVERSION
-#define SGL_OFFSET_4    (0x0040 | I2OVERSION)
-#define SGL_OFFSET_5    (0x0050 | I2OVERSION)
-#define SGL_OFFSET_6    (0x0060 | I2OVERSION)
-#define SGL_OFFSET_7    (0x0070 | I2OVERSION)
-#define SGL_OFFSET_8    (0x0080 | I2OVERSION)
-#define SGL_OFFSET_9    (0x0090 | I2OVERSION)
-#define SGL_OFFSET_10   (0x00A0 | I2OVERSION)
-#define SGL_OFFSET_11   (0x00B0 | I2OVERSION)
-#define SGL_OFFSET_12   (0x00C0 | I2OVERSION)
-#define SGL_OFFSET(x)   (((x)<<4) | I2OVERSION)
+/**
+ *	i2o_msg_nop_mfa - Returns a fetched MFA back to the controller
+ *	@c: I2O controller from which the MFA was fetched
+ *	@mfa: MFA which should be returned
+ *
+ *	This function must be used for preserved messages, because i2o_msg_nop()
+ *	also returns the allocated memory back to the msg_pool mempool.
+ */
+static inline void i2o_msg_nop_mfa(struct i2o_controller *c, u32 mfa)
+{
+	struct i2o_message __iomem *msg;
+	u32 nop[3] = {
+		THREE_WORD_MSG_SIZE | SGL_OFFSET_0,
+		I2O_CMD_UTIL_NOP << 24 | HOST_TID << 12 | ADAPTER_TID,
+		0x00000000
+	};
+
+	msg = i2o_msg_in_to_virt(c, mfa);
+	memcpy_toio(msg, nop, sizeof(nop));
+	writel(mfa, c->in_port);
+};
 
-/* Transaction Reply Lists (TRL) Control Word structure */
-#define TRL_SINGLE_FIXED_LENGTH		0x00
-#define TRL_SINGLE_VARIABLE_LENGTH	0x40
-#define TRL_MULTIPLE_FIXED_LENGTH	0x80
+/**
+ *	i2o_msg_nop - Returns a message which is not used
+ *	@c: I2O controller from which the message was created
+ *	@msg: message which should be returned
+ *
+ *	If you fetch a message via i2o_msg_get, and can't use it, you must
+ *	return the message with this function. Otherwise the MFA is lost as well
+ *	as the allocated memory from the mempool.
+ */
+static inline void i2o_msg_nop(struct i2o_controller *c,
+			       struct i2o_message *msg)
+{
+	struct i2o_msg_mfa *mmsg;
+	mmsg = container_of(msg, struct i2o_msg_mfa, msg);
 
- /* msg header defines for MsgFlags */
-#define MSG_STATIC	0x0100
-#define MSG_64BIT_CNTXT	0x0200
-#define MSG_MULTI_TRANS	0x1000
-#define MSG_FAIL	0x2000
-#define MSG_FINAL	0x4000
-#define MSG_REPLY	0x8000
+	i2o_msg_nop_mfa(c, mmsg->mfa);
+	mempool_free(mmsg, c->in_msg.mempool);
+};
 
- /* minimum size msg */
-#define THREE_WORD_MSG_SIZE	0x00030000
-#define FOUR_WORD_MSG_SIZE	0x00040000
-#define FIVE_WORD_MSG_SIZE	0x00050000
-#define SIX_WORD_MSG_SIZE	0x00060000
-#define SEVEN_WORD_MSG_SIZE	0x00070000
-#define EIGHT_WORD_MSG_SIZE	0x00080000
-#define NINE_WORD_MSG_SIZE	0x00090000
-#define TEN_WORD_MSG_SIZE	0x000A0000
-#define ELEVEN_WORD_MSG_SIZE	0x000B0000
-#define I2O_MESSAGE_SIZE(x)	((x)<<16)
+/**
+ *	i2o_flush_reply - Flush reply from I2O controller
+ *	@c: I2O controller
+ *	@m: the message identifier
+ *
+ *	The I2O controller must be informed that the reply message is not needed
+ *	anymore. If you forget to flush the reply, the message frame can't be
+ *	used by the controller anymore and is therefore lost.
+ */
+static inline void i2o_flush_reply(struct i2o_controller *c, u32 m)
+{
+	writel(m, c->out_port);
+};
 
-/* special TID assignments */
-#define ADAPTER_TID		0
-#define HOST_TID		1
+/*
+ *	Endian handling wrapped into the macro - keeps the core code
+ *	cleaner.
+ */
 
-/* outbound queue defines */
-#define I2O_MAX_OUTBOUND_MSG_FRAMES	128
-#define I2O_OUTBOUND_MSG_FRAME_SIZE	128	/* in 32-bit words */
+#define i2o_raw_writel(val, mem)	__raw_writel(cpu_to_le32(val), mem)
 
-#define I2O_POST_WAIT_OK	0
-#define I2O_POST_WAIT_TIMEOUT	-ETIMEDOUT
+extern int i2o_parm_field_get(struct i2o_device *, int, int, void *, int);
+extern int i2o_parm_table_get(struct i2o_device *, int, int, int, void *, int,
+			      void *, int);
 
-#define I2O_CONTEXT_LIST_MIN_LENGTH	15
-#define I2O_CONTEXT_LIST_USED		0x01
-#define I2O_CONTEXT_LIST_DELETED	0x02
+/* debugging and troubleshooting/diagnostic helpers. */
+#define osm_printk(level, format, arg...)  \
+	printk(level "%s: " format, OSM_NAME , ## arg)
 
-/* timeouts */
-#define I2O_TIMEOUT_INIT_OUTBOUND_QUEUE	15
-#define I2O_TIMEOUT_MESSAGE_GET		5
-#define I2O_TIMEOUT_RESET		30
-#define I2O_TIMEOUT_STATUS_GET		5
-#define I2O_TIMEOUT_LCT_GET		360
-#define I2O_TIMEOUT_SCSI_SCB_ABORT	240
+#ifdef DEBUG
+#define osm_debug(format, arg...) \
+	osm_printk(KERN_DEBUG, format , ## arg)
+#else
+#define osm_debug(format, arg...) \
+        do { } while (0)
+#endif
 
-/* retries */
-#define I2O_HRT_GET_TRIES		3
-#define I2O_LCT_GET_TRIES		3
+#define osm_err(format, arg...)		\
+	osm_printk(KERN_ERR, format , ## arg)
+#define osm_info(format, arg...)		\
+	osm_printk(KERN_INFO, format , ## arg)
+#define osm_warn(format, arg...)		\
+	osm_printk(KERN_WARNING, format , ## arg)
 
-/* defines for max_sectors and max_phys_segments */
-#define I2O_MAX_SECTORS			1024
-#define I2O_MAX_SECTORS_LIMITED		256
-#define I2O_MAX_PHYS_SEGMENTS		MAX_PHYS_SEGMENTS
+/* debugging functions */
+extern void i2o_report_status(const char *, const char *, struct i2o_message *);
+extern void i2o_dump_message(struct i2o_message *);
+extern void i2o_dump_hrt(struct i2o_controller *c);
+extern void i2o_debug_state(struct i2o_controller *c);
 
 #endif				/* __KERNEL__ */
 #endif				/* _I2O_H */
-- 
cgit v1.1


From 793fd15d9fafe5b1c71e50d3c041f1463895dbde Mon Sep 17 00:00:00 2001
From: Markus Lidel <Markus.Lidel@shadowconnect.com>
Date: Fri, 6 Jan 2006 00:19:30 -0800
Subject: [PATCH] I2O: SPARC fixes

Fix a lot of BE <-> LE bugs which prevent it from working on SPARC.

Signed-off-by: Markus Lidel <Markus.Lidel@shadowconnect.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/message/i2o/Kconfig     |  12 ++++
 drivers/message/i2o/device.c    | 130 +++++++++++++++++++---------------------
 drivers/message/i2o/exec-osm.c  |  15 +++--
 drivers/message/i2o/i2o_block.c |  20 +++----
 drivers/message/i2o/i2o_scsi.c  |  35 ++++++-----
 5 files changed, 111 insertions(+), 101 deletions(-)

diff --git a/drivers/message/i2o/Kconfig b/drivers/message/i2o/Kconfig
index 43a942a..fef6771 100644
--- a/drivers/message/i2o/Kconfig
+++ b/drivers/message/i2o/Kconfig
@@ -24,6 +24,18 @@ config I2O
 
 	  If unsure, say N.
 
+config I2O_LCT_NOTIFY_ON_CHANGES
+	bool "Enable LCT notification"
+	depends on I2O
+	default y
+	---help---
+	  Only say N here if you have a I2O controller from SUN. The SUN
+	  firmware doesn't support LCT notification on changes. If this option
+	  is enabled on such a controller the driver will hang up in a endless
+	  loop. On all other controllers say Y.
+
+	  If unsure, say Y.
+
 config I2O_EXT_ADAPTEC
 	bool "Enable Adaptec extensions"
 	depends on I2O
diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c
index 002ae0e..1db2621 100644
--- a/drivers/message/i2o/device.c
+++ b/drivers/message/i2o/device.c
@@ -341,56 +341,83 @@ int i2o_device_parse_lct(struct i2o_controller *c)
 {
 	struct i2o_device *dev, *tmp;
 	i2o_lct *lct;
-	int i;
-	int max;
+	u32 *dlct = c->dlct.virt;
+	int max = 0, i = 0;
+	u16 table_size;
+	u32 buf;
 
 	down(&c->lct_lock);
 
 	kfree(c->lct);
 
-	lct = c->dlct.virt;
+	buf = le32_to_cpu(*dlct++);
+	table_size = buf & 0xffff;
 
-	c->lct = kmalloc(lct->table_size * 4, GFP_KERNEL);
-	if (!c->lct) {
+	lct = c->lct = kmalloc(table_size * 4, GFP_KERNEL);
+	if (!lct) {
 		up(&c->lct_lock);
 		return -ENOMEM;
 	}
 
-	if (lct->table_size * 4 > c->dlct.len) {
-		memcpy(c->lct, c->dlct.virt, c->dlct.len);
-		up(&c->lct_lock);
-		return -EAGAIN;
-	}
-
-	memcpy(c->lct, c->dlct.virt, lct->table_size * 4);
+	lct->lct_ver = buf >> 28;
+	lct->boot_tid = buf >> 16 & 0xfff;
+	lct->table_size = table_size;
+	lct->change_ind = le32_to_cpu(*dlct++);
+	lct->iop_flags = le32_to_cpu(*dlct++);
 
-	lct = c->lct;
-
-	max = (lct->table_size - 3) / 9;
+	table_size -= 3;
 
 	pr_debug("%s: LCT has %d entries (LCT size: %d)\n", c->name, max,
 		 lct->table_size);
 
-	/* remove devices, which are not in the LCT anymore */
-	list_for_each_entry_safe(dev, tmp, &c->devices, list) {
+	while (table_size > 0) {
+		i2o_lct_entry *entry = &lct->lct_entry[max];
 		int found = 0;
 
-		for (i = 0; i < max; i++) {
-			if (lct->lct_entry[i].tid == dev->lct_data.tid) {
+		buf = le32_to_cpu(*dlct++);
+		entry->entry_size = buf & 0xffff;
+		entry->tid = buf >> 16 & 0xfff;
+
+		entry->change_ind = le32_to_cpu(*dlct++);
+		entry->device_flags = le32_to_cpu(*dlct++);
+
+		buf = le32_to_cpu(*dlct++);
+		entry->class_id = buf & 0xfff;
+		entry->version = buf >> 12 & 0xf;
+		entry->vendor_id = buf >> 16;
+
+		entry->sub_class = le32_to_cpu(*dlct++);
+
+		buf = le32_to_cpu(*dlct++);
+		entry->user_tid = buf & 0xfff;
+		entry->parent_tid = buf >> 12 & 0xfff;
+		entry->bios_info = buf >> 24;
+
+		memcpy(&entry->identity_tag, dlct, 8);
+		dlct += 2;
+
+		entry->event_capabilities = le32_to_cpu(*dlct++);
+
+		/* add new devices, which are new in the LCT */
+		list_for_each_entry_safe(dev, tmp, &c->devices, list) {
+			if (entry->tid == dev->lct_data.tid) {
 				found = 1;
 				break;
 			}
 		}
 
 		if (!found)
-			i2o_device_remove(dev);
+			i2o_device_add(c, entry);
+
+		table_size -= 9;
+		max++;
 	}
 
-	/* add new devices, which are new in the LCT */
-	for (i = 0; i < max; i++) {
+	/* remove devices, which are not in the LCT anymore */
+	list_for_each_entry_safe(dev, tmp, &c->devices, list) {
 		int found = 0;
 
-		list_for_each_entry_safe(dev, tmp, &c->devices, list) {
+		for (i = 0; i < max; i++) {
 			if (lct->lct_entry[i].tid == dev->lct_data.tid) {
 				found = 1;
 				break;
@@ -398,8 +425,9 @@ int i2o_device_parse_lct(struct i2o_controller *c)
 		}
 
 		if (!found)
-			i2o_device_add(c, &lct->lct_entry[i]);
+			i2o_device_remove(dev);
 	}
+
 	up(&c->lct_lock);
 
 	return 0;
@@ -422,9 +450,6 @@ int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist,
 		   int oplen, void *reslist, int reslen)
 {
 	struct i2o_message *msg;
-	u32 *res32 = (u32 *) reslist;
-	u32 *restmp = (u32 *) reslist;
-	int len = 0;
 	int i = 0;
 	int rc;
 	struct i2o_dma res;
@@ -448,7 +473,6 @@ int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist,
 	msg->body[i++] = cpu_to_le32(0x00000000);
 	msg->body[i++] = cpu_to_le32(0x4C000000 | oplen);	/* OperationList */
 	memcpy(&msg->body[i], oplist, oplen);
-
 	i += (oplen / 4 + (oplen % 4 ? 1 : 0));
 	msg->body[i++] = cpu_to_le32(0xD0000000 | res.len);	/* ResultList */
 	msg->body[i++] = cpu_to_le32(res.phys);
@@ -466,36 +490,7 @@ int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist,
 	memcpy(reslist, res.virt, res.len);
 	i2o_dma_free(dev, &res);
 
-	/* Query failed */
-	if (rc)
-		return rc;
-	/*
-	 * Calculate number of bytes of Result LIST
-	 * We need to loop through each Result BLOCK and grab the length
-	 */
-	restmp = res32 + 1;
-	len = 1;
-	for (i = 0; i < (res32[0] & 0X0000FFFF); i++) {
-		if (restmp[0] & 0x00FF0000) {	/* BlockStatus != SUCCESS */
-			printk(KERN_WARNING
-			       "%s - Error:\n  ErrorInfoSize = 0x%02x, "
-			       "BlockStatus = 0x%02x, BlockSize = 0x%04x\n",
-			       (cmd ==
-				I2O_CMD_UTIL_PARAMS_SET) ? "PARAMS_SET" :
-			       "PARAMS_GET", res32[1] >> 24,
-			       (res32[1] >> 16) & 0xFF, res32[1] & 0xFFFF);
-
-			/*
-			 *      If this is the only request,than we return an error
-			 */
-			if ((res32[0] & 0x0000FFFF) == 1) {
-				return -((res32[1] >> 16) & 0xFF);	/* -BlockStatus */
-			}
-		}
-		len += restmp[0] & 0x0000FFFF;	/* Length of res BLOCK */
-		restmp += restmp[0] & 0x0000FFFF;	/* Skip to next BLOCK */
-	}
-	return (len << 2);	/* bytes used by result list */
+	return rc;
 }
 
 /*
@@ -504,28 +499,25 @@ int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist,
 int i2o_parm_field_get(struct i2o_device *i2o_dev, int group, int field,
 		       void *buf, int buflen)
 {
-	u16 opblk[] = { 1, 0, I2O_PARAMS_FIELD_GET, group, 1, field };
+	u32 opblk[] = { cpu_to_le32(0x00000001),
+		cpu_to_le32((u16) group << 16 | I2O_PARAMS_FIELD_GET),
+		cpu_to_le32((s16) field << 16 | 0x00000001)
+	};
 	u8 *resblk;		/* 8 bytes for header */
-	int size;
-
-	if (field == -1)	/* whole group */
-		opblk[4] = -1;
+	int rc;
 
 	resblk = kmalloc(buflen + 8, GFP_KERNEL | GFP_ATOMIC);
 	if (!resblk)
 		return -ENOMEM;
 
-	size = i2o_parm_issue(i2o_dev, I2O_CMD_UTIL_PARAMS_GET, opblk,
-			      sizeof(opblk), resblk, buflen + 8);
+	rc = i2o_parm_issue(i2o_dev, I2O_CMD_UTIL_PARAMS_GET, opblk,
+			    sizeof(opblk), resblk, buflen + 8);
 
 	memcpy(buf, resblk + 8, buflen);	/* cut off header */
 
 	kfree(resblk);
 
-	if (size > buflen)
-		return buflen;
-
-	return size;
+	return rc;
 }
 
 /*
diff --git a/drivers/message/i2o/exec-osm.c b/drivers/message/i2o/exec-osm.c
index 71a0933..d24548f 100644
--- a/drivers/message/i2o/exec-osm.c
+++ b/drivers/message/i2o/exec-osm.c
@@ -77,7 +77,7 @@ static struct i2o_exec_wait *i2o_exec_wait_alloc(void)
 
 	wait = kmalloc(sizeof(*wait), GFP_KERNEL);
 	if (!wait)
-		return ERR_PTR(-ENOMEM);
+		return NULL;
 
 	memset(wait, 0, sizeof(*wait));
 
@@ -271,8 +271,8 @@ static ssize_t i2o_exec_show_vendor_id(struct device *d,
 	struct i2o_device *dev = to_i2o_device(d);
 	u16 id;
 
-	if (i2o_parm_field_get(dev, 0x0000, 0, &id, 2)) {
-		sprintf(buf, "0x%04x", id);
+	if (!i2o_parm_field_get(dev, 0x0000, 0, &id, 2)) {
+		sprintf(buf, "0x%04x", le16_to_cpu(id));
 		return strlen(buf) + 1;
 	}
 
@@ -293,8 +293,8 @@ static ssize_t i2o_exec_show_product_id(struct device *d,
 	struct i2o_device *dev = to_i2o_device(d);
 	u16 id;
 
-	if (i2o_parm_field_get(dev, 0x0000, 1, &id, 2)) {
-		sprintf(buf, "0x%04x", id);
+	if (!i2o_parm_field_get(dev, 0x0000, 1, &id, 2)) {
+		sprintf(buf, "0x%04x", le16_to_cpu(id));
 		return strlen(buf) + 1;
 	}
 
@@ -364,7 +364,9 @@ static void i2o_exec_lct_modified(struct i2o_controller *c)
 	if (i2o_device_parse_lct(c) != -EAGAIN)
 		change_ind = c->lct->change_ind + 1;
 
+#ifdef CONFIG_I2O_LCT_NOTIFY_ON_CHANGES
 	i2o_exec_lct_notify(c, change_ind);
+#endif
 };
 
 /**
@@ -512,7 +514,8 @@ static int i2o_exec_lct_notify(struct i2o_controller *c, u32 change_ind)
 
 	dev = &c->pdev->dev;
 
-	if (i2o_dma_realloc(dev, &c->dlct, sb->expected_lct_size, GFP_KERNEL))
+	if (i2o_dma_realloc
+	    (dev, &c->dlct, le32_to_cpu(sb->expected_lct_size), GFP_KERNEL))
 		return -ENOMEM;
 
 	msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 2bd15c7..ed2df54 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -1050,8 +1050,8 @@ static int i2o_block_probe(struct device *dev)
 	int rc;
 	u64 size;
 	u32 blocksize;
-	u32 flags, status;
 	u16 body_size = 4;
+	u16 power;
 	unsigned short max_sectors;
 
 #ifdef CONFIG_I2O_EXT_ADAPTEC
@@ -1109,22 +1109,20 @@ static int i2o_block_probe(struct device *dev)
 	 *      Ask for the current media data. If that isn't supported
 	 *      then we ask for the device capacity data
 	 */
-	if (i2o_parm_field_get(i2o_dev, 0x0004, 1, &blocksize, 4) ||
-	    i2o_parm_field_get(i2o_dev, 0x0000, 3, &blocksize, 4)) {
-		blk_queue_hardsect_size(queue, blocksize);
+	if (!i2o_parm_field_get(i2o_dev, 0x0004, 1, &blocksize, 4) ||
+	    !i2o_parm_field_get(i2o_dev, 0x0000, 3, &blocksize, 4)) {
+		blk_queue_hardsect_size(queue, le32_to_cpu(blocksize));
 	} else
 		osm_warn("unable to get blocksize of %s\n", gd->disk_name);
 
-	if (i2o_parm_field_get(i2o_dev, 0x0004, 0, &size, 8) ||
-	    i2o_parm_field_get(i2o_dev, 0x0000, 4, &size, 8)) {
-		set_capacity(gd, size >> KERNEL_SECTOR_SHIFT);
+	if (!i2o_parm_field_get(i2o_dev, 0x0004, 0, &size, 8) ||
+	    !i2o_parm_field_get(i2o_dev, 0x0000, 4, &size, 8)) {
+		set_capacity(gd, le64_to_cpu(size) >> KERNEL_SECTOR_SHIFT);
 	} else
 		osm_warn("could not get size of %s\n", gd->disk_name);
 
-	if (!i2o_parm_field_get(i2o_dev, 0x0000, 2, &i2o_blk_dev->power, 2))
-		i2o_blk_dev->power = 0;
-	i2o_parm_field_get(i2o_dev, 0x0000, 5, &flags, 4);
-	i2o_parm_field_get(i2o_dev, 0x0000, 6, &status, 4);
+	if (!i2o_parm_field_get(i2o_dev, 0x0000, 2, &power, 2))
+		i2o_blk_dev->power = power;
 
 	i2o_event_register(i2o_dev, &i2o_block_driver, 0, 0xffffffff);
 
diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c
index 7a784fd..24061df 100644
--- a/drivers/message/i2o/i2o_scsi.c
+++ b/drivers/message/i2o/i2o_scsi.c
@@ -113,7 +113,7 @@ static struct i2o_scsi_host *i2o_scsi_host_alloc(struct i2o_controller *c)
 
 	list_for_each_entry(i2o_dev, &c->devices, list)
 	    if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER) {
-		if (i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1)
+		if (!i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1)
 		    && (type == 0x01))	/* SCSI bus */
 			max_channel++;
 	}
@@ -146,7 +146,7 @@ static struct i2o_scsi_host *i2o_scsi_host_alloc(struct i2o_controller *c)
 	i = 0;
 	list_for_each_entry(i2o_dev, &c->devices, list)
 	    if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER) {
-		if (i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1)
+		if (!i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1)
 		    && (type == 0x01))	/* only SCSI bus */
 			i2o_shost->channel[i++] = i2o_dev;
 
@@ -238,13 +238,15 @@ static int i2o_scsi_probe(struct device *dev)
 			u8 type;
 			struct i2o_device *d = i2o_shost->channel[0];
 
-			if (i2o_parm_field_get(d, 0x0000, 0, &type, 1)
+			if (!i2o_parm_field_get(d, 0x0000, 0, &type, 1)
 			    && (type == 0x01))	/* SCSI bus */
-				if (i2o_parm_field_get(d, 0x0200, 4, &id, 4)) {
+				if (!i2o_parm_field_get(d, 0x0200, 4, &id, 4)) {
 					channel = 0;
 					if (i2o_dev->lct_data.class_id ==
 					    I2O_CLASS_RANDOM_BLOCK_STORAGE)
-						lun = i2o_shost->lun++;
+						lun =
+						    cpu_to_le64(i2o_shost->
+								lun++);
 					else
 						lun = 0;
 				}
@@ -253,10 +255,10 @@ static int i2o_scsi_probe(struct device *dev)
 		break;
 
 	case I2O_CLASS_SCSI_PERIPHERAL:
-		if (i2o_parm_field_get(i2o_dev, 0x0000, 3, &id, 4) < 0)
+		if (i2o_parm_field_get(i2o_dev, 0x0000, 3, &id, 4))
 			return -EFAULT;
 
-		if (i2o_parm_field_get(i2o_dev, 0x0000, 4, &lun, 8) < 0)
+		if (i2o_parm_field_get(i2o_dev, 0x0000, 4, &lun, 8))
 			return -EFAULT;
 
 		parent = i2o_iop_find_device(c, i2o_dev->lct_data.parent_tid);
@@ -281,20 +283,22 @@ static int i2o_scsi_probe(struct device *dev)
 		return -EFAULT;
 	}
 
-	if (id >= scsi_host->max_id) {
-		osm_warn("SCSI device id (%d) >= max_id of I2O host (%d)", id,
-			 scsi_host->max_id);
+	if (le32_to_cpu(id) >= scsi_host->max_id) {
+		osm_warn("SCSI device id (%d) >= max_id of I2O host (%d)",
+			 le32_to_cpu(id), scsi_host->max_id);
 		return -EFAULT;
 	}
 
-	if (lun >= scsi_host->max_lun) {
-		osm_warn("SCSI device id (%d) >= max_lun of I2O host (%d)",
-			 (unsigned int)lun, scsi_host->max_lun);
+	if (le64_to_cpu(lun) >= scsi_host->max_lun) {
+		osm_warn("SCSI device lun (%lu) >= max_lun of I2O host (%d)",
+			 (long unsigned int)le64_to_cpu(lun),
+			 scsi_host->max_lun);
 		return -EFAULT;
 	}
 
 	scsi_dev =
-	    __scsi_add_device(i2o_shost->scsi_host, channel, id, lun, i2o_dev);
+	    __scsi_add_device(i2o_shost->scsi_host, channel, le32_to_cpu(id),
+			      le64_to_cpu(lun), i2o_dev);
 
 	if (IS_ERR(scsi_dev)) {
 		osm_warn("can not add SCSI device %03x\n",
@@ -306,7 +310,8 @@ static int i2o_scsi_probe(struct device *dev)
 			  "scsi");
 
 	osm_info("device added (TID: %03x) channel: %d, id: %d, lun: %d\n",
-		 i2o_dev->lct_data.tid, channel, id, (unsigned int)lun);
+		 i2o_dev->lct_data.tid, channel, le32_to_cpu(id),
+		 (unsigned int)le64_to_cpu(lun));
 
 	return 0;
 };
-- 
cgit v1.1


From 24791bd48f643194d806654b587251b0f92233e8 Mon Sep 17 00:00:00 2001
From: Markus Lidel <Markus.Lidel@shadowconnect.com>
Date: Fri, 6 Jan 2006 00:19:31 -0800
Subject: [PATCH] I2O: Remove wrong I2O device class

Removed the wrong I2O device class, which was only needed to add sysfs attributes.

Signed-off-by: Markus Lidel <Markus.Lidel@shadowconnect.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/message/i2o/core.h   |   2 +
 drivers/message/i2o/device.c | 144 ++++++++++++++++++-------------------------
 drivers/message/i2o/driver.c |   2 -
 drivers/message/i2o/iop.c    |  34 ++--------
 include/linux/i2o.h          |   1 -
 5 files changed, 68 insertions(+), 115 deletions(-)

diff --git a/drivers/message/i2o/core.h b/drivers/message/i2o/core.h
index 9eefedb..9aa9b91 100644
--- a/drivers/message/i2o/core.h
+++ b/drivers/message/i2o/core.h
@@ -33,6 +33,8 @@ extern int __init i2o_pci_init(void);
 extern void __exit i2o_pci_exit(void);
 
 /* device */
+extern struct device_attribute i2o_device_attrs[];
+
 extern void i2o_device_remove(struct i2o_device *);
 extern int i2o_device_parse_lct(struct i2o_controller *);
 
diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c
index 1db2621..a5e260b 100644
--- a/drivers/message/i2o/device.c
+++ b/drivers/message/i2o/device.c
@@ -142,8 +142,9 @@ static void i2o_device_release(struct device *dev)
 
 
 /**
- *	i2o_device_class_show_class_id - Displays class id of I2O device
- *	@cd: class device of which the class id should be displayed
+ *	i2o_device_show_class_id - Displays class id of I2O device
+ *	@dev: device of which the class id should be displayed
+ *	@attr: pointer to device attribute
  *	@buf: buffer into which the class id should be printed
  *
  *	Returns the number of bytes which are printed into the buffer.
@@ -159,15 +160,15 @@ static ssize_t i2o_device_show_class_id(struct device *dev,
 }
 
 /**
- *	i2o_device_class_show_tid - Displays TID of I2O device
- *	@cd: class device of which the TID should be displayed
- *	@buf: buffer into which the class id should be printed
+ *	i2o_device_show_tid - Displays TID of I2O device
+ *	@dev: device of which the TID should be displayed
+ *	@attr: pointer to device attribute
+ *	@buf: buffer into which the TID should be printed
  *
  *	Returns the number of bytes which are printed into the buffer.
  */
 static ssize_t i2o_device_show_tid(struct device *dev,
-				   struct device_attribute *attr,
-				   char *buf)
+				   struct device_attribute *attr, char *buf)
 {
 	struct i2o_device *i2o_dev = to_i2o_device(dev);
 
@@ -209,66 +210,6 @@ static struct i2o_device *i2o_device_alloc(void)
 }
 
 /**
- *	i2o_setup_sysfs_links - Adds attributes to the I2O device
- *	@cd: I2O class device which is added to the I2O device class
- *
- *	This function get called when a I2O device is added to the class. It
- *	creates the attributes for each device and creates user/parent symlink
- *	if necessary.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static void i2o_setup_sysfs_links(struct i2o_device *i2o_dev)
-{
-	struct i2o_controller *c = i2o_dev->iop;
-	struct i2o_device *tmp;
-
-	/* create user entries for this device */
-	tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.user_tid);
-	if (tmp && tmp != i2o_dev)
-		sysfs_create_link(&i2o_dev->device.kobj,
-				  &tmp->device.kobj, "user");
-
-	/* create user entries refering to this device */
-	list_for_each_entry(tmp, &c->devices, list)
-		if (tmp->lct_data.user_tid == i2o_dev->lct_data.tid &&
-		    tmp != i2o_dev)
-			sysfs_create_link(&tmp->device.kobj,
-					  &i2o_dev->device.kobj, "user");
-
-	/* create parent entries for this device */
-	tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.parent_tid);
-	if (tmp && tmp != i2o_dev)
-		sysfs_create_link(&i2o_dev->device.kobj,
-				  &tmp->device.kobj, "parent");
-
-	/* create parent entries refering to this device */
-	list_for_each_entry(tmp, &c->devices, list)
-		if (tmp->lct_data.parent_tid == i2o_dev->lct_data.tid &&
-		    tmp != i2o_dev)
-		sysfs_create_link(&tmp->device.kobj,
-				  &i2o_dev->device.kobj, "parent");
-}
-
-static void i2o_remove_sysfs_links(struct i2o_device *i2o_dev)
-{
-	struct i2o_controller *c = i2o_dev->iop;
-	struct i2o_device *tmp;
-
-	sysfs_remove_link(&i2o_dev->device.kobj, "parent");
-	sysfs_remove_link(&i2o_dev->device.kobj, "user");
-
-	list_for_each_entry(tmp, &c->devices, list) {
-		if (tmp->lct_data.parent_tid == i2o_dev->lct_data.tid)
-			sysfs_remove_link(&tmp->device.kobj, "parent");
-		if (tmp->lct_data.user_tid == i2o_dev->lct_data.tid)
-			sysfs_remove_link(&tmp->device.kobj, "user");
-	}
-}
-
-
-
-/**
  *	i2o_device_add - allocate a new I2O device and add it to the IOP
  *	@iop: I2O controller where the device is on
  *	@entry: LCT entry of the I2O device
@@ -282,33 +223,57 @@ static void i2o_remove_sysfs_links(struct i2o_device *i2o_dev)
 static struct i2o_device *i2o_device_add(struct i2o_controller *c,
 					 i2o_lct_entry * entry)
 {
-	struct i2o_device *dev;
+	struct i2o_device *i2o_dev, *tmp;
 
-	dev = i2o_device_alloc();
-	if (IS_ERR(dev)) {
+	i2o_dev = i2o_device_alloc();
+	if (IS_ERR(i2o_dev)) {
 		printk(KERN_ERR "i2o: unable to allocate i2o device\n");
-		return dev;
+		return i2o_dev;
 	}
 
-	dev->lct_data = *entry;
-	dev->iop = c;
+	i2o_dev->lct_data = *entry;
 
-	snprintf(dev->device.bus_id, BUS_ID_SIZE, "%d:%03x", c->unit,
-		 dev->lct_data.tid);
+	snprintf(i2o_dev->device.bus_id, BUS_ID_SIZE, "%d:%03x", c->unit,
+		 i2o_dev->lct_data.tid);
 
-	dev->device.parent = &c->device;
+	i2o_dev->iop = c;
+	i2o_dev->device.parent = &c->device;
 
-	device_register(&dev->device);
+	device_register(&i2o_dev->device);
 
-	list_add_tail(&dev->list, &c->devices);
+	list_add_tail(&i2o_dev->list, &c->devices);
 
-	i2o_setup_sysfs_links(dev);
+	/* create user entries for this device */
+	tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.user_tid);
+	if (tmp && (tmp != i2o_dev))
+		sysfs_create_link(&i2o_dev->device.kobj, &tmp->device.kobj,
+				  "user");
 
-	i2o_driver_notify_device_add_all(dev);
+	/* create user entries refering to this device */
+	list_for_each_entry(tmp, &c->devices, list)
+		if ((tmp->lct_data.user_tid == i2o_dev->lct_data.tid)
+		    && (tmp != i2o_dev))
+		    sysfs_create_link(&tmp->device.kobj,
+				      &i2o_dev->device.kobj, "user");
 
-	pr_debug("i2o: device %s added\n", dev->device.bus_id);
+	/* create parent entries for this device */
+	tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.parent_tid);
+	if (tmp && (tmp != i2o_dev))
+		sysfs_create_link(&i2o_dev->device.kobj, &tmp->device.kobj,
+				  "parent");
 
-	return dev;
+	/* create parent entries refering to this device */
+	list_for_each_entry(tmp, &c->devices, list)
+		if ((tmp->lct_data.parent_tid == i2o_dev->lct_data.tid)
+		    && (tmp != i2o_dev))
+			sysfs_create_link(&tmp->device.kobj,
+					  &i2o_dev->device.kobj, "parent");
+
+	i2o_driver_notify_device_add_all(i2o_dev);
+
+	pr_debug("i2o: device %s added\n", i2o_dev->device.bus_id);
+
+	return i2o_dev;
 }
 
 /**
@@ -321,9 +286,22 @@ static struct i2o_device *i2o_device_add(struct i2o_controller *c,
  */
 void i2o_device_remove(struct i2o_device *i2o_dev)
 {
+	struct i2o_device *tmp;
+	struct i2o_controller *c = i2o_dev->iop;
+
 	i2o_driver_notify_device_remove_all(i2o_dev);
-	i2o_remove_sysfs_links(i2o_dev);
+
+	sysfs_remove_link(&i2o_dev->device.kobj, "parent");
+	sysfs_remove_link(&i2o_dev->device.kobj, "user");
+
+	list_for_each_entry(tmp, &c->devices, list) {
+		if (tmp->lct_data.parent_tid == i2o_dev->lct_data.tid)
+			sysfs_remove_link(&tmp->device.kobj, "parent");
+		if (tmp->lct_data.user_tid == i2o_dev->lct_data.tid)
+			sysfs_remove_link(&tmp->device.kobj, "user");
+	}
 	list_del(&i2o_dev->list);
+
 	device_unregister(&i2o_dev->device);
 }
 
diff --git a/drivers/message/i2o/driver.c b/drivers/message/i2o/driver.c
index 0fb9c4e..25292b3 100644
--- a/drivers/message/i2o/driver.c
+++ b/drivers/message/i2o/driver.c
@@ -61,8 +61,6 @@ static int i2o_bus_match(struct device *dev, struct device_driver *drv)
 };
 
 /* I2O bus type */
-extern struct device_attribute i2o_device_attrs[];
-
 struct bus_type i2o_bus_type = {
 	.name = "i2o",
 	.match = i2o_bus_match,
diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c
index f86abb4..7411a05 100644
--- a/drivers/message/i2o/iop.c
+++ b/drivers/message/i2o/iop.c
@@ -806,7 +806,6 @@ void i2o_iop_remove(struct i2o_controller *c)
 	list_for_each_entry_safe(dev, tmp, &c->devices, list)
 	    i2o_device_remove(dev);
 
-	class_device_unregister(c->classdev);
 	device_del(&c->device);
 
 	/* Ask the IOP to switch to RESET state */
@@ -1050,9 +1049,6 @@ static void i2o_iop_release(struct device *dev)
 	i2o_iop_free(c);
 };
 
-/* I2O controller class */
-static struct class *i2o_controller_class;
-
 /**
  *	i2o_iop_alloc - Allocate and initialize a i2o_controller struct
  *
@@ -1124,36 +1120,29 @@ int i2o_iop_add(struct i2o_controller *c)
 		goto iop_reset;
 	}
 
-	c->classdev = class_device_create(i2o_controller_class, NULL, MKDEV(0,0),
-			&c->device, "iop%d", c->unit);
-	if (IS_ERR(c->classdev)) {
-		osm_err("%s: could not add controller class\n", c->name);
-		goto device_del;
-	}
-
 	osm_info("%s: Activating I2O controller...\n", c->name);
 	osm_info("%s: This may take a few minutes if there are many devices\n",
 		 c->name);
 
 	if ((rc = i2o_iop_activate(c))) {
 		osm_err("%s: could not activate controller\n", c->name);
-		goto class_del;
+		goto device_del;
 	}
 
 	osm_debug("%s: building sys table...\n", c->name);
 
 	if ((rc = i2o_systab_build()))
-		goto class_del;
+		goto device_del;
 
 	osm_debug("%s: online controller...\n", c->name);
 
 	if ((rc = i2o_iop_online(c)))
-		goto class_del;
+		goto device_del;
 
 	osm_debug("%s: getting LCT...\n", c->name);
 
 	if ((rc = i2o_exec_lct_get(c)))
-		goto class_del;
+		goto device_del;
 
 	list_add(&c->list, &i2o_controllers);
 
@@ -1163,9 +1152,6 @@ int i2o_iop_add(struct i2o_controller *c)
 
 	return 0;
 
-      class_del:
-	class_device_unregister(c->classdev);
-
       device_del:
 	device_del(&c->device);
 
@@ -1225,14 +1211,8 @@ static int __init i2o_iop_init(void)
 
 	printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n");
 
-	i2o_controller_class = class_create(THIS_MODULE, "i2o_controller");
-	if (IS_ERR(i2o_controller_class)) {
-		osm_err("can't register class i2o_controller\n");
-		goto exit;
-	}
-
 	if ((rc = i2o_driver_init()))
-		goto class_exit;
+		goto exit;
 
 	if ((rc = i2o_exec_init()))
 		goto driver_exit;
@@ -1248,9 +1228,6 @@ static int __init i2o_iop_init(void)
       driver_exit:
 	i2o_driver_exit();
 
-      class_exit:
-	class_destroy(i2o_controller_class);
-
       exit:
 	return rc;
 }
@@ -1265,7 +1242,6 @@ static void __exit i2o_iop_exit(void)
 	i2o_pci_exit();
 	i2o_exec_exit();
 	i2o_driver_exit();
-	class_destroy(i2o_controller_class);
 };
 
 module_init(i2o_iop_init);
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index 9e359a9..4c18b77 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -561,7 +561,6 @@ struct i2o_controller {
 	struct resource mem_resource;	/* Mem resource allocated to the IOP */
 
 	struct device device;
-	struct class_device *classdev;	/* I2O controller class device */
 	struct i2o_device *exec;	/* Executive */
 #if BITS_PER_LONG == 64
 	spinlock_t context_list_lock;	/* lock for context_list */
-- 
cgit v1.1


From dcceafe25a5f47cf69e5b46b4da6f15186ec8386 Mon Sep 17 00:00:00 2001
From: Markus Lidel <Markus.Lidel@shadowconnect.com>
Date: Fri, 6 Jan 2006 00:19:32 -0800
Subject: [PATCH] I2O: Bugfixes

- Removed __GFP_ZERO from some kmalloc() calls and replaced it with
  memset(), because it didn't work properly.

- Fixed the returned message frame in i2o_cfg_passthru(), which caused
  raidutils to display a wrong error message when a disk was missing.

- Fixed the size of the printk() format specifier for the LUN in i2o_scsi.c.

- Fixed get_device() and put_device() in probing of the I2O controller.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/message/i2o/driver.c     |  5 +++--
 drivers/message/i2o/i2o_config.c | 29 ++++++++++++++---------------
 drivers/message/i2o/i2o_scsi.c   |  4 ++--
 drivers/message/i2o/pci.c        |  6 +-----
 include/linux/i2o.h              |  2 +-
 5 files changed, 21 insertions(+), 25 deletions(-)

diff --git a/drivers/message/i2o/driver.c b/drivers/message/i2o/driver.c
index 25292b3..9c631c8 100644
--- a/drivers/message/i2o/driver.c
+++ b/drivers/message/i2o/driver.c
@@ -217,14 +217,15 @@ int i2o_driver_dispatch(struct i2o_controller *c, u32 m)
 		/* cut of header from message size (in 32-bit words) */
 		size = (le32_to_cpu(msg->u.head[0]) >> 16) - 5;
 
-		evt = kmalloc(size * 4 + sizeof(*evt), GFP_ATOMIC | __GFP_ZERO);
+		evt = kmalloc(size * 4 + sizeof(*evt), GFP_ATOMIC);
 		if (!evt)
 			return -ENOMEM;
+		memset(evt, 0, size * 4 + sizeof(*evt));
 
 		evt->size = size;
 		evt->tcntxt = le32_to_cpu(msg->u.s.tcntxt);
 		evt->event_indicator = le32_to_cpu(msg->body[0]);
-		memcpy(&evt->tcntxt, &msg->u.s.tcntxt, size * 4);
+		memcpy(&evt->data, &msg->body[1], size * 4);
 
 		list_for_each_entry_safe(dev, tmp, &c->devices, list)
 		    if (dev->lct_data.tid == tid) {
diff --git a/drivers/message/i2o/i2o_config.c b/drivers/message/i2o/i2o_config.c
index 4fe73d6..286fef3 100644
--- a/drivers/message/i2o/i2o_config.c
+++ b/drivers/message/i2o/i2o_config.c
@@ -36,12 +36,12 @@
 
 #include <asm/uaccess.h>
 
-#include "core.h"
-
 #define SG_TABLESIZE		30
 
-static int i2o_cfg_ioctl(struct inode *inode, struct file *fp, unsigned int cmd,
-			 unsigned long arg);
+extern int i2o_parm_issue(struct i2o_device *, int, void *, int, void *, int);
+
+static int i2o_cfg_ioctl(struct inode *, struct file *, unsigned int,
+			 unsigned long);
 
 static spinlock_t i2o_config_lock;
 
@@ -593,9 +593,6 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 
 	sg_offset = (msg->u.head[0] >> 4) & 0x0f;
 
-	msg->u.s.icntxt = cpu_to_le32(i2o_config_driver.context);
-	msg->u.s.tcntxt = cpu_to_le32(i2o_cntxt_list_add(c, reply));
-
 	memset(sg_list, 0, sizeof(sg_list[0]) * SG_TABLESIZE);
 	if (sg_offset) {
 		struct sg_simple_element *sg;
@@ -629,7 +626,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 				goto cleanup;
 			}
 			sg_size = sg[i].flag_count & 0xffffff;
-			p = &(sg_list[sg_index++]);
+			p = &(sg_list[sg_index]);
 			/* Allocate memory for the transfer */
 			if (i2o_dma_alloc
 			    (&c->pdev->dev, p, sg_size,
@@ -640,6 +637,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 				rcode = -ENOMEM;
 				goto sg_list_cleanup;
 			}
+			sg_index++;
 			/* Copy in the user's SG buffer if necessary */
 			if (sg[i].
 			    flag_count & 0x04000000 /*I2O_SGL_FLAGS_DIR */ ) {
@@ -661,8 +659,10 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 	}
 
 	rcode = i2o_msg_post_wait(c, msg, 60);
-	if (rcode)
+	if (rcode) {
+		reply[4] = ((u32) rcode) << 24;
 		goto sg_list_cleanup;
+	}
 
 	if (sg_offset) {
 		u32 msg[I2O_OUTBOUND_MSG_FRAME_SIZE];
@@ -712,6 +712,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 		}
 	}
 
+      sg_list_cleanup:
 	/* Copy back the reply to user space */
 	if (reply_size) {
 		// we wrote our own values for context - now restore the user supplied ones
@@ -729,7 +730,6 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 		}
 	}
 
-      sg_list_cleanup:
 	for (i = 0; i < sg_index; i++)
 		i2o_dma_free(&c->pdev->dev, &sg_list[i]);
 
@@ -827,9 +827,6 @@ static int i2o_cfg_passthru(unsigned long arg)
 
 	sg_offset = (msg->u.head[0] >> 4) & 0x0f;
 
-	msg->u.s.icntxt = cpu_to_le32(i2o_config_driver.context);
-	msg->u.s.tcntxt = cpu_to_le32(i2o_cntxt_list_add(c, reply));
-
 	memset(sg_list, 0, sizeof(sg_list[0]) * SG_TABLESIZE);
 	if (sg_offset) {
 		struct sg_simple_element *sg;
@@ -892,8 +889,10 @@ static int i2o_cfg_passthru(unsigned long arg)
 	}
 
 	rcode = i2o_msg_post_wait(c, msg, 60);
-	if (rcode)
+	if (rcode) {
+		reply[4] = ((u32) rcode) << 24;
 		goto sg_list_cleanup;
+	}
 
 	if (sg_offset) {
 		u32 msg[128];
@@ -943,6 +942,7 @@ static int i2o_cfg_passthru(unsigned long arg)
 		}
 	}
 
+      sg_list_cleanup:
 	/* Copy back the reply to user space */
 	if (reply_size) {
 		// we wrote our own values for context - now restore the user supplied ones
@@ -959,7 +959,6 @@ static int i2o_cfg_passthru(unsigned long arg)
 		}
 	}
 
-      sg_list_cleanup:
 	for (i = 0; i < sg_index; i++)
 		kfree(sg_list[i]);
 
diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c
index 24061df..76b9516 100644
--- a/drivers/message/i2o/i2o_scsi.c
+++ b/drivers/message/i2o/i2o_scsi.c
@@ -309,9 +309,9 @@ static int i2o_scsi_probe(struct device *dev)
 	sysfs_create_link(&i2o_dev->device.kobj, &scsi_dev->sdev_gendev.kobj,
 			  "scsi");
 
-	osm_info("device added (TID: %03x) channel: %d, id: %d, lun: %d\n",
+	osm_info("device added (TID: %03x) channel: %d, id: %d, lun: %ld\n",
 		 i2o_dev->lct_data.tid, channel, le32_to_cpu(id),
-		 (unsigned int)le64_to_cpu(lun));
+		 (long unsigned int)le64_to_cpu(lun));
 
 	return 0;
 };
diff --git a/drivers/message/i2o/pci.c b/drivers/message/i2o/pci.c
index 329d482..c5b656c 100644
--- a/drivers/message/i2o/pci.c
+++ b/drivers/message/i2o/pci.c
@@ -339,7 +339,7 @@ static int __devinit i2o_pci_probe(struct pci_dev *pdev,
 		       pci_name(pdev));
 
 	c->pdev = pdev;
-	c->device.parent = get_device(&pdev->dev);
+	c->device.parent = &pdev->dev;
 
 	/* Cards that fall apart if you hit them with large I/O loads... */
 	if (pdev->vendor == PCI_VENDOR_ID_NCR && pdev->device == 0x0630) {
@@ -410,8 +410,6 @@ static int __devinit i2o_pci_probe(struct pci_dev *pdev,
 	if ((rc = i2o_iop_add(c)))
 		goto uninstall;
 
-	get_device(&c->device);
-
 	if (i960)
 		pci_write_config_word(i960, 0x42, 0x03ff);
 
@@ -424,7 +422,6 @@ static int __devinit i2o_pci_probe(struct pci_dev *pdev,
 	i2o_pci_free(c);
 
       free_controller:
-	put_device(c->device.parent);
 	i2o_iop_free(c);
 
       disable:
@@ -454,7 +451,6 @@ static void __devexit i2o_pci_remove(struct pci_dev *pdev)
 
 	printk(KERN_INFO "%s: Controller removed.\n", c->name);
 
-	put_device(c->device.parent);
 	put_device(&c->device);
 };
 
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index 4c18b77..9ba8067 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -384,7 +384,7 @@
 
 /* defines for max_sectors and max_phys_segments */
 #define I2O_MAX_SECTORS			1024
-#define I2O_MAX_SECTORS_LIMITED		256
+#define I2O_MAX_SECTORS_LIMITED		128
 #define I2O_MAX_PHYS_SEGMENTS		MAX_PHYS_SEGMENTS
 
 /*
-- 
cgit v1.1


From 2e1973a3cd0b9fe31469be62df3583bdc5a34f51 Mon Sep 17 00:00:00 2001
From: Markus Lidel <Markus.Lidel@shadowconnect.com>
Date: Fri, 6 Jan 2006 00:19:32 -0800
Subject: [PATCH] I2O: Beautifying

Fix some typos and do some minor code beautification.

Signed-off-by: Markus Lidel <Markus.Lidel@shadowconnect.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/message/i2o/bus-osm.c    |  2 +-
 drivers/message/i2o/config-osm.c |  2 +-
 drivers/message/i2o/core.h       |  7 ++-----
 drivers/message/i2o/device.c     |  9 +++++----
 drivers/message/i2o/driver.c     |  2 +-
 drivers/message/i2o/i2o_block.c  |  8 +++++---
 drivers/message/i2o/i2o_proc.c   |  2 +-
 drivers/message/i2o/i2o_scsi.c   |  2 +-
 drivers/message/i2o/iop.c        | 12 +++---------
 9 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/drivers/message/i2o/bus-osm.c b/drivers/message/i2o/bus-osm.c
index ce039d3..ac06f10 100644
--- a/drivers/message/i2o/bus-osm.c
+++ b/drivers/message/i2o/bus-osm.c
@@ -17,7 +17,7 @@
 #include <linux/i2o.h>
 
 #define OSM_NAME	"bus-osm"
-#define OSM_VERSION	"$Rev$"
+#define OSM_VERSION	"1.317"
 #define OSM_DESCRIPTION	"I2O Bus Adapter OSM"
 
 static struct i2o_driver i2o_bus_driver;
diff --git a/drivers/message/i2o/config-osm.c b/drivers/message/i2o/config-osm.c
index 10432f6..b613890 100644
--- a/drivers/message/i2o/config-osm.c
+++ b/drivers/message/i2o/config-osm.c
@@ -22,7 +22,7 @@
 #include <asm/uaccess.h>
 
 #define OSM_NAME	"config-osm"
-#define OSM_VERSION	"1.248"
+#define OSM_VERSION	"1.317"
 #define OSM_DESCRIPTION	"I2O Configuration OSM"
 
 /* access mode user rw */
diff --git a/drivers/message/i2o/core.h b/drivers/message/i2o/core.h
index 9aa9b91..edab686 100644
--- a/drivers/message/i2o/core.h
+++ b/drivers/message/i2o/core.h
@@ -14,8 +14,6 @@
  */
 
 /* Exec-OSM */
-extern struct bus_type i2o_bus_type;
-
 extern struct i2o_driver i2o_exec_driver;
 extern int i2o_exec_lct_get(struct i2o_controller *);
 
@@ -23,6 +21,8 @@ extern int __init i2o_exec_init(void);
 extern void __exit i2o_exec_exit(void);
 
 /* driver */
+extern struct bus_type i2o_bus_type;
+
 extern int i2o_driver_dispatch(struct i2o_controller *, u32);
 
 extern int __init i2o_driver_init(void);
@@ -45,9 +45,6 @@ extern void i2o_iop_free(struct i2o_controller *);
 extern int i2o_iop_add(struct i2o_controller *);
 extern void i2o_iop_remove(struct i2o_controller *);
 
-/* config */
-extern int i2o_parm_issue(struct i2o_device *, int, void *, int, void *, int);
-
 /* control registers relative to c->base */
 #define I2O_IRQ_STATUS	0x30
 #define I2O_IRQ_MASK	0x34
diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c
index a5e260b..773b0a4 100644
--- a/drivers/message/i2o/device.c
+++ b/drivers/message/i2o/device.c
@@ -176,6 +176,7 @@ static ssize_t i2o_device_show_tid(struct device *dev,
 	return strlen(buf) + 1;
 }
 
+/* I2O device attributes */
 struct device_attribute i2o_device_attrs[] = {
 	__ATTR(class_id, S_IRUGO, i2o_device_show_class_id, NULL),
 	__ATTR(tid, S_IRUGO, i2o_device_show_tid, NULL),
@@ -505,12 +506,12 @@ int i2o_parm_field_get(struct i2o_device *i2o_dev, int group, int field,
  *		else return specific fields
  *			ibuf contains fieldindexes
  *
- * 	if oper == I2O_PARAMS_LIST_GET, get from specific rows
- * 		if fieldcount == -1 return all fields
+ *	if oper == I2O_PARAMS_LIST_GET, get from specific rows
+ *		if fieldcount == -1 return all fields
  *			ibuf contains rowcount, keyvalues
- * 		else return specific fields
+ *		else return specific fields
  *			fieldcount is # of fieldindexes
- *  			ibuf contains fieldindexes, rowcount, keyvalues
+ *			ibuf contains fieldindexes, rowcount, keyvalues
  *
  *	You could also use directly function i2o_issue_params().
  */
diff --git a/drivers/message/i2o/driver.c b/drivers/message/i2o/driver.c
index 9c631c8..45f4119 100644
--- a/drivers/message/i2o/driver.c
+++ b/drivers/message/i2o/driver.c
@@ -64,7 +64,7 @@ static int i2o_bus_match(struct device *dev, struct device_driver *drv)
 struct bus_type i2o_bus_type = {
 	.name = "i2o",
 	.match = i2o_bus_match,
-	.dev_attrs = i2o_device_attrs,
+	.dev_attrs = i2o_device_attrs
 };
 
 /**
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index ed2df54..3e865b7 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -59,10 +59,12 @@
 #include <linux/blkdev.h>
 #include <linux/hdreg.h>
 
+#include <scsi/scsi.h>
+
 #include "i2o_block.h"
 
 #define OSM_NAME	"block-osm"
-#define OSM_VERSION	"1.287"
+#define OSM_VERSION	"1.316"
 #define OSM_DESCRIPTION	"I2O Block Device OSM"
 
 static struct i2o_driver i2o_block_driver;
@@ -845,10 +847,10 @@ static int i2o_block_transfer(struct request *req)
 		 * RETURN_SENSE_DATA_IN_REPLY_MESSAGE_FRAME
 		 */
 		if (rq_data_dir(req) == READ) {
-			cmd[0] = 0x28;
+			cmd[0] = READ_10;
 			scsi_flags = 0x60a0000a;
 		} else {
-			cmd[0] = 0x2A;
+			cmd[0] = WRITE_10;
 			scsi_flags = 0xa0a0000a;
 		}
 
diff --git a/drivers/message/i2o/i2o_proc.c b/drivers/message/i2o/i2o_proc.c
index d559a17..2a0c42b 100644
--- a/drivers/message/i2o/i2o_proc.c
+++ b/drivers/message/i2o/i2o_proc.c
@@ -28,7 +28,7 @@
  */
 
 #define OSM_NAME	"proc-osm"
-#define OSM_VERSION	"1.145"
+#define OSM_VERSION	"1.316"
 #define OSM_DESCRIPTION	"I2O ProcFS OSM"
 
 #define I2O_MAX_MODULES 4
diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c
index 76b9516..f9e5a23 100644
--- a/drivers/message/i2o/i2o_scsi.c
+++ b/drivers/message/i2o/i2o_scsi.c
@@ -70,7 +70,7 @@
 #include <scsi/sg_request.h>
 
 #define OSM_NAME	"scsi-osm"
-#define OSM_VERSION	"1.282"
+#define OSM_VERSION	"1.316"
 #define OSM_DESCRIPTION	"I2O SCSI Peripheral OSM"
 
 static struct i2o_driver i2o_scsi_driver;
diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c
index 7411a05..0e46518 100644
--- a/drivers/message/i2o/iop.c
+++ b/drivers/message/i2o/iop.c
@@ -32,7 +32,7 @@
 #include "core.h"
 
 #define OSM_NAME	"i2o"
-#define OSM_VERSION	"1.288"
+#define OSM_VERSION	"1.316"
 #define OSM_DESCRIPTION	"I2O subsystem"
 
 /* global I2O controller list */
@@ -730,10 +730,6 @@ static int i2o_iop_systab_set(struct i2o_controller *c)
 	 * Provide three SGL-elements:
 	 * System table (SysTab), Private memory space declaration and
 	 * Private i/o space declaration
-	 *
-	 * FIXME: is this still true?
-	 * Nasty one here. We can't use dma_alloc_coherent to send the
-	 * same table to everyone. We have to go remap it for them all
 	 */
 
 	msg->body[0] = cpu_to_le32(c->unit + 2);
@@ -756,8 +752,6 @@ static int i2o_iop_systab_set(struct i2o_controller *c)
 	else
 		osm_debug("%s: SysTab set.\n", c->name);
 
-	i2o_status_get(c);	// Entered READY state
-
 	return rc;
 }
 
@@ -767,7 +761,7 @@ static int i2o_iop_systab_set(struct i2o_controller *c)
  *
  *	Send the system table and enable the I2O controller.
  *
- *	Returns 0 on success or negativer error code on failure.
+ *	Returns 0 on success or negative error code on failure.
  */
 static int i2o_iop_online(struct i2o_controller *c)
 {
@@ -977,7 +971,7 @@ int i2o_status_get(struct i2o_controller *c)
  *	The HRT contains information about possible hidden devices but is
  *	mostly useless to us.
  *
- *	Returns 0 on success or negativer error code on failure.
+ *	Returns 0 on success or negative error code on failure.
  */
 static int i2o_hrt_get(struct i2o_controller *c)
 {
-- 
cgit v1.1


From f6ed39a6e1a88240eec629a3da17c3a47ada3b89 Mon Sep 17 00:00:00 2001
From: Markus Lidel <Markus.Lidel@shadowconnect.com>
Date: Fri, 6 Jan 2006 00:19:33 -0800
Subject: [PATCH] I2O: Optimizing

- make i2o_iop_free() static inline (from Adrian Bunk)

- changed kmalloc() + memset(0) into kzalloc()

Signed-off-by: Markus Lidel <Markus.Lidel@shadowconnect.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/message/i2o/config-osm.c |  2 +-
 drivers/message/i2o/core.h       | 11 ++++++++++-
 drivers/message/i2o/device.c     |  4 +---
 drivers/message/i2o/driver.c     |  7 ++-----
 drivers/message/i2o/exec-osm.c   |  4 +---
 drivers/message/i2o/i2o_block.c  |  5 ++---
 drivers/message/i2o/i2o_config.c |  6 ++----
 drivers/message/i2o/iop.c        | 18 +++---------------
 8 files changed, 22 insertions(+), 35 deletions(-)

diff --git a/drivers/message/i2o/config-osm.c b/drivers/message/i2o/config-osm.c
index b613890..3bba7aa 100644
--- a/drivers/message/i2o/config-osm.c
+++ b/drivers/message/i2o/config-osm.c
@@ -22,7 +22,7 @@
 #include <asm/uaccess.h>
 
 #define OSM_NAME	"config-osm"
-#define OSM_VERSION	"1.317"
+#define OSM_VERSION	"1.323"
 #define OSM_DESCRIPTION	"I2O Configuration OSM"
 
 /* access mode user rw */
diff --git a/drivers/message/i2o/core.h b/drivers/message/i2o/core.h
index edab686..9062856 100644
--- a/drivers/message/i2o/core.h
+++ b/drivers/message/i2o/core.h
@@ -40,7 +40,16 @@ extern int i2o_device_parse_lct(struct i2o_controller *);
 
 /* IOP */
 extern struct i2o_controller *i2o_iop_alloc(void);
-extern void i2o_iop_free(struct i2o_controller *);
+
+/**
+ *	i2o_iop_free - Free the i2o_controller struct
+ *	@c: I2O controller to free
+ */
+static inline void i2o_iop_free(struct i2o_controller *c)
+{
+	i2o_pool_free(&c->in_msg);
+	kfree(c);
+}
 
 extern int i2o_iop_add(struct i2o_controller *);
 extern void i2o_iop_remove(struct i2o_controller *);
diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c
index 773b0a4..34976b2 100644
--- a/drivers/message/i2o/device.c
+++ b/drivers/message/i2o/device.c
@@ -195,12 +195,10 @@ static struct i2o_device *i2o_device_alloc(void)
 {
 	struct i2o_device *dev;
 
-	dev = kmalloc(sizeof(*dev), GFP_KERNEL);
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
 	if (!dev)
 		return ERR_PTR(-ENOMEM);
 
-	memset(dev, 0, sizeof(*dev));
-
 	INIT_LIST_HEAD(&dev->list);
 	init_MUTEX(&dev->lock);
 
diff --git a/drivers/message/i2o/driver.c b/drivers/message/i2o/driver.c
index 45f4119..6413022 100644
--- a/drivers/message/i2o/driver.c
+++ b/drivers/message/i2o/driver.c
@@ -217,10 +217,9 @@ int i2o_driver_dispatch(struct i2o_controller *c, u32 m)
 		/* cut of header from message size (in 32-bit words) */
 		size = (le32_to_cpu(msg->u.head[0]) >> 16) - 5;
 
-		evt = kmalloc(size * 4 + sizeof(*evt), GFP_ATOMIC);
+		evt = kzalloc(size * 4 + sizeof(*evt), GFP_ATOMIC);
 		if (!evt)
 			return -ENOMEM;
-		memset(evt, 0, size * 4 + sizeof(*evt));
 
 		evt->size = size;
 		evt->tcntxt = le32_to_cpu(msg->u.s.tcntxt);
@@ -348,12 +347,10 @@ int __init i2o_driver_init(void)
 	osm_info("max drivers = %d\n", i2o_max_drivers);
 
 	i2o_drivers =
-	    kmalloc(i2o_max_drivers * sizeof(*i2o_drivers), GFP_KERNEL);
+	    kzalloc(i2o_max_drivers * sizeof(*i2o_drivers), GFP_KERNEL);
 	if (!i2o_drivers)
 		return -ENOMEM;
 
-	memset(i2o_drivers, 0, i2o_max_drivers * sizeof(*i2o_drivers));
-
 	rc = bus_register(&i2o_bus_type);
 
 	if (rc < 0)
diff --git a/drivers/message/i2o/exec-osm.c b/drivers/message/i2o/exec-osm.c
index d24548f..d418ad3 100644
--- a/drivers/message/i2o/exec-osm.c
+++ b/drivers/message/i2o/exec-osm.c
@@ -75,12 +75,10 @@ static struct i2o_exec_wait *i2o_exec_wait_alloc(void)
 {
 	struct i2o_exec_wait *wait;
 
-	wait = kmalloc(sizeof(*wait), GFP_KERNEL);
+	wait = kzalloc(sizeof(*wait), GFP_KERNEL);
 	if (!wait)
 		return NULL;
 
-	memset(wait, 0, sizeof(*wait));
-
 	INIT_LIST_HEAD(&wait->list);
 
 	return wait;
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 3e865b7..c5807d6 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -64,7 +64,7 @@
 #include "i2o_block.h"
 
 #define OSM_NAME	"block-osm"
-#define OSM_VERSION	"1.316"
+#define OSM_VERSION	"1.325"
 #define OSM_DESCRIPTION	"I2O Block Device OSM"
 
 static struct i2o_driver i2o_block_driver;
@@ -981,13 +981,12 @@ static struct i2o_block_device *i2o_block_device_alloc(void)
 	struct request_queue *queue;
 	int rc;
 
-	dev = kmalloc(sizeof(*dev), GFP_KERNEL);
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
 	if (!dev) {
 		osm_err("Insufficient memory to allocate I2O Block disk.\n");
 		rc = -ENOMEM;
 		goto exit;
 	}
-	memset(dev, 0, sizeof(*dev));
 
 	INIT_LIST_HEAD(&dev->open_queue);
 	spin_lock_init(&dev->lock);
diff --git a/drivers/message/i2o/i2o_config.c b/drivers/message/i2o/i2o_config.c
index 286fef3..89daf67 100644
--- a/drivers/message/i2o/i2o_config.c
+++ b/drivers/message/i2o/i2o_config.c
@@ -583,13 +583,12 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
 	reply_size >>= 16;
 	reply_size <<= 2;
 
-	reply = kmalloc(reply_size, GFP_KERNEL);
+	reply = kzalloc(reply_size, GFP_KERNEL);
 	if (!reply) {
 		printk(KERN_WARNING "%s: Could not allocate reply buffer\n",
 		       c->name);
 		return -ENOMEM;
 	}
-	memset(reply, 0, reply_size);
 
 	sg_offset = (msg->u.head[0] >> 4) & 0x0f;
 
@@ -817,13 +816,12 @@ static int i2o_cfg_passthru(unsigned long arg)
 	reply_size >>= 16;
 	reply_size <<= 2;
 
-	reply = kmalloc(reply_size, GFP_KERNEL);
+	reply = kzalloc(reply_size, GFP_KERNEL);
 	if (!reply) {
 		printk(KERN_WARNING "%s: Could not allocate reply buffer\n",
 		       c->name);
 		return -ENOMEM;
 	}
-	memset(reply, 0, reply_size);
 
 	sg_offset = (msg->u.head[0] >> 4) & 0x0f;
 
diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c
index 0e46518..4921674 100644
--- a/drivers/message/i2o/iop.c
+++ b/drivers/message/i2o/iop.c
@@ -32,7 +32,7 @@
 #include "core.h"
 
 #define OSM_NAME	"i2o"
-#define OSM_VERSION	"1.316"
+#define OSM_VERSION	"1.325"
 #define OSM_DESCRIPTION	"I2O subsystem"
 
 /* global I2O controller list */
@@ -838,12 +838,11 @@ static int i2o_systab_build(void)
 	i2o_systab.len = sizeof(struct i2o_sys_tbl) + num_controllers *
 	    sizeof(struct i2o_sys_tbl_entry);
 
-	systab = i2o_systab.virt = kmalloc(i2o_systab.len, GFP_KERNEL);
+	systab = i2o_systab.virt = kzalloc(i2o_systab.len, GFP_KERNEL);
 	if (!systab) {
 		osm_err("unable to allocate memory for System Table\n");
 		return -ENOMEM;
 	}
-	memset(systab, 0, i2o_systab.len);
 
 	systab->version = I2OVERSION;
 	systab->change_ind = change_ind + 1;
@@ -1020,16 +1019,6 @@ static int i2o_hrt_get(struct i2o_controller *c)
 }
 
 /**
- *	i2o_iop_free - Free the i2o_controller struct
- *	@c: I2O controller to free
- */
-void i2o_iop_free(struct i2o_controller *c)
-{
-	i2o_pool_free(&c->in_msg);
-	kfree(c);
-};
-
-/**
  *	i2o_iop_release - release the memory for a I2O controller
  *	@dev: I2O controller which should be released
  *
@@ -1058,13 +1047,12 @@ struct i2o_controller *i2o_iop_alloc(void)
 	struct i2o_controller *c;
 	char poolname[32];
 
-	c = kmalloc(sizeof(*c), GFP_KERNEL);
+	c = kzalloc(sizeof(*c), GFP_KERNEL);
 	if (!c) {
 		osm_err("i2o: Insufficient memory to allocate a I2O controller."
 			"\n");
 		return ERR_PTR(-ENOMEM);
 	}
-	memset(c, 0, sizeof(*c));
 
 	c->unit = unit++;
 	sprintf(c->name, "iop%d", c->unit);
-- 
cgit v1.1


From 524e3b623a9228efbdb70484b5214f27a1ca985d Mon Sep 17 00:00:00 2001
From: Markus Lidel <Markus.Lidel@shadowconnect.com>
Date: Fri, 6 Jan 2006 00:19:34 -0800
Subject: [PATCH] I2O: Lindent run

Signed-off-by: Markus Lidel <Markus.Lidel@shadowconnect.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/message/i2o/device.c   | 21 +++++++++------------
 drivers/message/i2o/exec-osm.c |  2 +-
 drivers/message/i2o/i2o_lan.h  | 38 +++++++++++++++++++-------------------
 3 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c
index 34976b2..ee18305 100644
--- a/drivers/message/i2o/device.c
+++ b/drivers/message/i2o/device.c
@@ -43,7 +43,7 @@ static inline int i2o_device_issue_claim(struct i2o_device *dev, u32 cmd,
 
 	msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
 	msg->u.head[1] =
-		cpu_to_le32(cmd << 24 | HOST_TID << 12 | dev->lct_data.tid);
+	    cpu_to_le32(cmd << 24 | HOST_TID << 12 | dev->lct_data.tid);
 	msg->body[0] = cpu_to_le32(type);
 
 	return i2o_msg_post_wait(dev->iop, msg, 60);
@@ -123,7 +123,6 @@ int i2o_device_claim_release(struct i2o_device *dev)
 	return rc;
 }
 
-
 /**
  *	i2o_device_release - release the memory for a I2O device
  *	@dev: I2O device which should be released
@@ -140,7 +139,6 @@ static void i2o_device_release(struct device *dev)
 	kfree(i2o_dev);
 }
 
-
 /**
  *	i2o_device_show_class_id - Displays class id of I2O device
  *	@dev: device of which the class id should be displayed
@@ -250,10 +248,10 @@ static struct i2o_device *i2o_device_add(struct i2o_controller *c,
 
 	/* create user entries refering to this device */
 	list_for_each_entry(tmp, &c->devices, list)
-		if ((tmp->lct_data.user_tid == i2o_dev->lct_data.tid)
-		    && (tmp != i2o_dev))
-		    sysfs_create_link(&tmp->device.kobj,
-				      &i2o_dev->device.kobj, "user");
+	    if ((tmp->lct_data.user_tid == i2o_dev->lct_data.tid)
+		&& (tmp != i2o_dev))
+		sysfs_create_link(&tmp->device.kobj,
+				  &i2o_dev->device.kobj, "user");
 
 	/* create parent entries for this device */
 	tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.parent_tid);
@@ -263,10 +261,10 @@ static struct i2o_device *i2o_device_add(struct i2o_controller *c,
 
 	/* create parent entries refering to this device */
 	list_for_each_entry(tmp, &c->devices, list)
-		if ((tmp->lct_data.parent_tid == i2o_dev->lct_data.tid)
-		    && (tmp != i2o_dev))
-			sysfs_create_link(&tmp->device.kobj,
-					  &i2o_dev->device.kobj, "parent");
+	    if ((tmp->lct_data.parent_tid == i2o_dev->lct_data.tid)
+		&& (tmp != i2o_dev))
+		sysfs_create_link(&tmp->device.kobj,
+				  &i2o_dev->device.kobj, "parent");
 
 	i2o_driver_notify_device_add_all(i2o_dev);
 
@@ -410,7 +408,6 @@ int i2o_device_parse_lct(struct i2o_controller *c)
 	return 0;
 }
 
-
 /*
  *	Run time support routines
  */
diff --git a/drivers/message/i2o/exec-osm.c b/drivers/message/i2o/exec-osm.c
index d418ad3..9bb9859 100644
--- a/drivers/message/i2o/exec-osm.c
+++ b/drivers/message/i2o/exec-osm.c
@@ -33,7 +33,7 @@
 #include <linux/workqueue.h>
 #include <linux/string.h>
 #include <linux/slab.h>
-#include <linux/sched.h>   /* wait_event_interruptible_timeout() needs this */
+#include <linux/sched.h>	/* wait_event_interruptible_timeout() needs this */
 #include <asm/param.h>		/* HZ */
 #include "core.h"
 
diff --git a/drivers/message/i2o/i2o_lan.h b/drivers/message/i2o/i2o_lan.h
index 561d633..6502b81 100644
--- a/drivers/message/i2o/i2o_lan.h
+++ b/drivers/message/i2o/i2o_lan.h
@@ -103,14 +103,14 @@
 #define I2O_LAN_DSC_SUSPENDED			0x11
 
 struct i2o_packet_info {
-	u32 offset : 24;
-	u32 flags  : 8;
-	u32 len    : 24;
-	u32 status : 8;
+	u32 offset:24;
+	u32 flags:8;
+	u32 len:24;
+	u32 status:8;
 };
 
 struct i2o_bucket_descriptor {
-	u32 context; 			/* FIXME: 64bit support */
+	u32 context;		/* FIXME: 64bit support */
 	struct i2o_packet_info packet_info[1];
 };
 
@@ -127,14 +127,14 @@ struct i2o_lan_local {
 	u8 unit;
 	struct i2o_device *i2o_dev;
 
-	struct fddi_statistics stats;   /* see also struct net_device_stats */
-	unsigned short (*type_trans)(struct sk_buff *, struct net_device *);
-	atomic_t buckets_out;  		/* nbr of unused buckets on DDM */
-	atomic_t tx_out;		/* outstanding TXes */
-	u8 tx_count;  			/* packets in one TX message frame */
-	u16 tx_max_out;	   		/* DDM's Tx queue len */
-	u8 sgl_max;			/* max SGLs in one message frame */
-	u32 m;				/* IOP address of the batch msg frame */
+	struct fddi_statistics stats;	/* see also struct net_device_stats */
+	unsigned short (*type_trans) (struct sk_buff *, struct net_device *);
+	atomic_t buckets_out;	/* nbr of unused buckets on DDM */
+	atomic_t tx_out;	/* outstanding TXes */
+	u8 tx_count;		/* packets in one TX message frame */
+	u16 tx_max_out;		/* DDM's Tx queue len */
+	u8 sgl_max;		/* max SGLs in one message frame */
+	u32 m;			/* IOP address of the batch msg frame */
 
 	struct work_struct i2o_batch_send_task;
 	int send_active;
@@ -144,16 +144,16 @@ struct i2o_lan_local {
 
 	spinlock_t tx_lock;
 
-	u32 max_size_mc_table;		/* max number of multicast addresses */
+	u32 max_size_mc_table;	/* max number of multicast addresses */
 
 	/* LAN OSM configurable parameters are here: */
 
-	u16 max_buckets_out;		/* max nbr of buckets to send to DDM */
-	u16 bucket_thresh;		/* send more when this many used */
+	u16 max_buckets_out;	/* max nbr of buckets to send to DDM */
+	u16 bucket_thresh;	/* send more when this many used */
 	u16 rx_copybreak;
 
-	u8  tx_batch_mode;		/* Set when using batch mode sends */
-	u32 i2o_event_mask;		/* To turn on interesting event flags */
+	u8 tx_batch_mode;	/* Set when using batch mode sends */
+	u32 i2o_event_mask;	/* To turn on interesting event flags */
 };
 
-#endif /* _I2O_LAN_H */
+#endif				/* _I2O_LAN_H */
-- 
cgit v1.1


From 0aa7c6990e7de06131cdc14ef4abfcab017c24a0 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 6 Jan 2006 00:19:34 -0800
Subject: [PATCH] fuse: clean up fuse_lookup()

Simplify fuse_lookup() and related functions.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dir.c | 75 ++++++++++++++++++-----------------------------------------
 1 file changed, 23 insertions(+), 52 deletions(-)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 51f5da6..0d1438a 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -13,7 +13,6 @@
 #include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/namei.h>
-#include <linux/mount.h>
 
 static inline unsigned long time_to_jiffies(unsigned long sec,
 					    unsigned long nsec)
@@ -22,6 +21,13 @@ static inline unsigned long time_to_jiffies(unsigned long sec,
 	return jiffies + timespec_to_jiffies(&ts);
 }
 
+static void fuse_change_timeout(struct dentry *entry, struct fuse_entry_out *o)
+{
+	struct fuse_inode *fi = get_fuse_inode(entry->d_inode);
+	entry->d_time = time_to_jiffies(o->entry_valid, o->entry_valid_nsec);
+	fi->i_time = time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
+}
+
 static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
 			     struct dentry *entry,
 			     struct fuse_entry_out *outarg)
@@ -66,10 +72,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 			return 0;
 
 		fuse_change_attributes(inode, &outarg.attr);
-		entry->d_time = time_to_jiffies(outarg.entry_valid,
-						outarg.entry_valid_nsec);
-		fi->i_time = time_to_jiffies(outarg.attr_valid,
-					     outarg.attr_valid_nsec);
+		fuse_change_timeout(entry, &outarg);
 	}
 	return 1;
 }
@@ -96,8 +99,8 @@ static struct dentry_operations fuse_dentry_operations = {
 	.d_revalidate	= fuse_dentry_revalidate,
 };
 
-static int fuse_lookup_iget(struct inode *dir, struct dentry *entry,
-			    struct inode **inodep)
+static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
+				  struct nameidata *nd)
 {
 	int err;
 	struct fuse_entry_out outarg;
@@ -106,11 +109,11 @@ static int fuse_lookup_iget(struct inode *dir, struct dentry *entry,
 	struct fuse_req *req;
 
 	if (entry->d_name.len > FUSE_NAME_MAX)
-		return -ENAMETOOLONG;
+		return ERR_PTR(-ENAMETOOLONG);
 
 	req = fuse_get_request(fc);
 	if (!req)
-		return -EINTR;
+		return ERR_PTR(-EINTR);
 
 	fuse_lookup_init(req, dir, entry, &outarg);
 	request_send(fc, req);
@@ -122,24 +125,22 @@ static int fuse_lookup_iget(struct inode *dir, struct dentry *entry,
 				  &outarg.attr);
 		if (!inode) {
 			fuse_send_forget(fc, req, outarg.nodeid, 1);
-			return -ENOMEM;
+			return ERR_PTR(-ENOMEM);
 		}
 	}
 	fuse_put_request(fc, req);
 	if (err && err != -ENOENT)
-		return err;
+		return ERR_PTR(err);
 
-	if (inode) {
-		struct fuse_inode *fi = get_fuse_inode(inode);
-		entry->d_time =	time_to_jiffies(outarg.entry_valid,
-						outarg.entry_valid_nsec);
-		fi->i_time = time_to_jiffies(outarg.attr_valid,
-					     outarg.attr_valid_nsec);
+	if (inode && dir_alias(inode)) {
+		iput(inode);
+		return ERR_PTR(-EIO);
 	}
-
+	d_add(entry, inode);
 	entry->d_op = &fuse_dentry_operations;
-	*inodep = inode;
-	return 0;
+	if (inode)
+		fuse_change_timeout(entry, &outarg);
+	return NULL;
 }
 
 void fuse_invalidate_attr(struct inode *inode)
@@ -163,7 +164,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	struct fuse_open_in inarg;
 	struct fuse_open_out outopen;
 	struct fuse_entry_out outentry;
-	struct fuse_inode *fi;
 	struct fuse_file *ff;
 	struct file *file;
 	int flags = nd->intent.open.flags - 1;
@@ -224,13 +224,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 		goto out_put_request;
 	}
 	fuse_put_request(fc, req);
-	entry->d_time =	time_to_jiffies(outentry.entry_valid,
-					outentry.entry_valid_nsec);
-	fi = get_fuse_inode(inode);
-	fi->i_time = time_to_jiffies(outentry.attr_valid,
-				     outentry.attr_valid_nsec);
-
 	d_instantiate(entry, inode);
+	fuse_change_timeout(entry, &outentry);
 	file = lookup_instantiate_filp(nd, entry, generic_file_open);
 	if (IS_ERR(file)) {
 		ff->fh = outopen.fh;
@@ -254,7 +249,6 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
 {
 	struct fuse_entry_out outarg;
 	struct inode *inode;
-	struct fuse_inode *fi;
 	int err;
 
 	req->in.h.nodeid = get_node_id(dir);
@@ -286,14 +280,8 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
 		return -EIO;
 	}
 
-	entry->d_time = time_to_jiffies(outarg.entry_valid,
-					outarg.entry_valid_nsec);
-
-	fi = get_fuse_inode(inode);
-	fi->i_time = time_to_jiffies(outarg.attr_valid,
-				     outarg.attr_valid_nsec);
-
 	d_instantiate(entry, inode);
+	fuse_change_timeout(entry, &outarg);
 	fuse_invalidate_attr(dir);
 	return 0;
 }
@@ -883,23 +871,6 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
 	return err;
 }
 
-static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
-				  struct nameidata *nd)
-{
-	struct inode *inode;
-	int err;
-
-	err = fuse_lookup_iget(dir, entry, &inode);
-	if (err)
-		return ERR_PTR(err);
-	if (inode && dir_alias(inode)) {
-		iput(inode);
-		return ERR_PTR(-EIO);
-	}
-	d_add(entry, inode);
-	return NULL;
-}
-
 static int fuse_setxattr(struct dentry *entry, const char *name,
 			 const void *value, size_t size, int flags)
 {
-- 
cgit v1.1


From 4633a22e7added835fd1d4b072dbcc4474aa3017 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 6 Jan 2006 00:19:36 -0800
Subject: [PATCH] fuse: clean up page offset calculation

Use page_offset() instead of doing page offset calculation by hand.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/file.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 2ca8614..18aafa6 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -272,7 +272,6 @@ static int fuse_readpage(struct file *file, struct page *page)
 {
 	struct inode *inode = page->mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	loff_t pos = (loff_t) page->index << PAGE_CACHE_SHIFT;
 	struct fuse_req *req = fuse_get_request(fc);
 	int err = -EINTR;
 	if (!req)
@@ -281,7 +280,7 @@ static int fuse_readpage(struct file *file, struct page *page)
 	req->out.page_zeroing = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
-	fuse_send_read(req, file, inode, pos, PAGE_CACHE_SIZE);
+	fuse_send_read(req, file, inode, page_offset(page), PAGE_CACHE_SIZE);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (!err)
@@ -295,7 +294,7 @@ static int fuse_readpage(struct file *file, struct page *page)
 static int fuse_send_readpages(struct fuse_req *req, struct file *file,
 			       struct inode *inode)
 {
-	loff_t pos = (loff_t) req->pages[0]->index << PAGE_CACHE_SHIFT;
+	loff_t pos = page_offset(req->pages[0]);
 	size_t count = req->num_pages << PAGE_CACHE_SHIFT;
 	unsigned i;
 	req->out.page_zeroing = 1;
@@ -402,7 +401,7 @@ static int fuse_commit_write(struct file *file, struct page *page,
 	unsigned count = to - offset;
 	struct inode *inode = page->mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + offset;
+	loff_t pos = page_offset(page) + offset;
 	struct fuse_req *req = fuse_get_request(fc);
 	if (!req)
 		return -EINTR;
-- 
cgit v1.1


From 45714d65618407bce1fd0271bc58303ce14b0785 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 6 Jan 2006 00:19:36 -0800
Subject: [PATCH] fuse: bump interface version

Change interface version to 7.4.

The following changes will need backward compatibility support, so store the
minor version returned by userspace.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dev.c        | 2 ++
 fs/fuse/fuse_i.h     | 3 +++
 include/linux/fuse.h | 2 +-
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8f873e6..e5bc3f8 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -178,6 +178,8 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 		if (req->misc.init_in_out.major != FUSE_KERNEL_VERSION)
 			fc->conn_error = 1;
 
+		fc->minor = req->misc.init_in_out.minor;
+
 		/* After INIT reply is received other requests can go
 		   out.  So do (FUSE_MAX_OUTSTANDING - 1) number of
 		   up()s on outstanding_sem.  The last up() is done in
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 0ea5301..2d4835e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -272,6 +272,9 @@ struct fuse_conn {
 	/** Is create not implemented by fs? */
 	unsigned no_create : 1;
 
+	/** Negotiated minor version */
+	unsigned minor;
+
 	/** Backing dev info */
 	struct backing_dev_info bdi;
 };
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index b76b558..3c85f1a 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -14,7 +14,7 @@
 #define FUSE_KERNEL_VERSION 7
 
 /** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 3
+#define FUSE_KERNEL_MINOR_VERSION 4
 
 /** The node ID of the root inode */
 #define FUSE_ROOT_ID 1
-- 
cgit v1.1


From de5f12025572ef8fcffa4be5453061725acfb754 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 6 Jan 2006 00:19:37 -0800
Subject: [PATCH] fuse: add frsize to statfs reply

Add 'frsize' member to the statfs reply.

I'm not sure if sending f_fsid will ever be needed, but just in case, leave
some space at the end of the structure so that less compatibility handling
will be required later.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/inode.c      | 5 ++++-
 include/linux/fuse.h | 5 +++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e69a546..3b928a0 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -218,6 +218,7 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr
 {
 	stbuf->f_type    = FUSE_SUPER_MAGIC;
 	stbuf->f_bsize   = attr->bsize;
+	stbuf->f_frsize  = attr->frsize;
 	stbuf->f_blocks  = attr->blocks;
 	stbuf->f_bfree   = attr->bfree;
 	stbuf->f_bavail  = attr->bavail;
@@ -238,10 +239,12 @@ static int fuse_statfs(struct super_block *sb, struct kstatfs *buf)
 	if (!req)
 		return -EINTR;
 
+	memset(&outarg, 0, sizeof(outarg));
 	req->in.numargs = 0;
 	req->in.h.opcode = FUSE_STATFS;
 	req->out.numargs = 1;
-	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].size =
+		fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
 	req->out.args[0].value = &outarg;
 	request_send(fc, req);
 	err = req->out.h.error;
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 3c85f1a..9d5177c 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -53,6 +53,9 @@ struct fuse_kstatfs {
 	__u64	ffree;
 	__u32	bsize;
 	__u32	namelen;
+	__u32	frsize;
+	__u32	padding;
+	__u32	spare[6];
 };
 
 #define FATTR_MODE	(1 << 0)
@@ -213,6 +216,8 @@ struct fuse_write_out {
 	__u32	padding;
 };
 
+#define FUSE_COMPAT_STATFS_SIZE 48
+
 struct fuse_statfs_out {
 	struct fuse_kstatfs st;
 };
-- 
cgit v1.1


From 8cbdf1e6f6876b37d2a0d96fd15ea9f90f7d51c1 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 6 Jan 2006 00:19:38 -0800
Subject: [PATCH] fuse: support caching negative dentries

Add support for caching negative dentries.

Up till now, ->d_revalidate() always forced a new lookup on these.  Now let
the lookup method return a zero node ID (not used for anything else) meaning a
negative entry, but with a positive cache timeout.  The old way of signaling
negative entry (replying ENOENT) still works.

Userspace should check the ABI minor version to see whether sending a zero ID
is allowed by the kernel or not.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dir.c | 64 +++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 43 insertions(+), 21 deletions(-)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0d1438a..4c127f2 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -23,9 +23,26 @@ static inline unsigned long time_to_jiffies(unsigned long sec,
 
 static void fuse_change_timeout(struct dentry *entry, struct fuse_entry_out *o)
 {
-	struct fuse_inode *fi = get_fuse_inode(entry->d_inode);
 	entry->d_time = time_to_jiffies(o->entry_valid, o->entry_valid_nsec);
-	fi->i_time = time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
+	if (entry->d_inode)
+		get_fuse_inode(entry->d_inode)->i_time =
+			time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
+}
+
+void fuse_invalidate_attr(struct inode *inode)
+{
+	get_fuse_inode(inode)->i_time = jiffies - 1;
+}
+
+static void fuse_invalidate_entry_cache(struct dentry *entry)
+{
+	entry->d_time = jiffies - 1;
+}
+
+static void fuse_invalidate_entry(struct dentry *entry)
+{
+	d_invalidate(entry);
+	fuse_invalidate_entry_cache(entry);
 }
 
 static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
@@ -45,15 +62,22 @@ static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
 
 static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 {
-	if (!entry->d_inode || is_bad_inode(entry->d_inode))
+	struct inode *inode = entry->d_inode;
+
+	if (inode && is_bad_inode(inode))
 		return 0;
 	else if (time_after(jiffies, entry->d_time)) {
 		int err;
 		struct fuse_entry_out outarg;
-		struct inode *inode = entry->d_inode;
-		struct fuse_inode *fi = get_fuse_inode(inode);
-		struct fuse_conn *fc = get_fuse_conn(inode);
-		struct fuse_req *req = fuse_get_request(fc);
+		struct fuse_conn *fc;
+		struct fuse_req *req;
+
+		fuse_invalidate_entry_cache(entry);
+		if (!inode)
+			return 0;
+
+		fc = get_fuse_conn(inode);
+		req = fuse_get_request(fc);
 		if (!req)
 			return 0;
 
@@ -61,6 +85,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 		request_send(fc, req);
 		err = req->out.h.error;
 		if (!err) {
+			struct fuse_inode *fi = get_fuse_inode(inode);
 			if (outarg.nodeid != get_node_id(inode)) {
 				fuse_send_forget(fc, req, outarg.nodeid, 1);
 				return 0;
@@ -118,9 +143,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 	fuse_lookup_init(req, dir, entry, &outarg);
 	request_send(fc, req);
 	err = req->out.h.error;
-	if (!err && invalid_nodeid(outarg.nodeid))
+	if (!err && outarg.nodeid && invalid_nodeid(outarg.nodeid))
 		err = -EIO;
-	if (!err) {
+	if (!err && outarg.nodeid) {
 		inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
 				  &outarg.attr);
 		if (!inode) {
@@ -138,22 +163,13 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 	}
 	d_add(entry, inode);
 	entry->d_op = &fuse_dentry_operations;
-	if (inode)
+	if (!err)
 		fuse_change_timeout(entry, &outarg);
+	else
+		fuse_invalidate_entry_cache(entry);
 	return NULL;
 }
 
-void fuse_invalidate_attr(struct inode *inode)
-{
-	get_fuse_inode(inode)->i_time = jiffies - 1;
-}
-
-static void fuse_invalidate_entry(struct dentry *entry)
-{
-	d_invalidate(entry);
-	entry->d_time = jiffies - 1;
-}
-
 static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 			    struct nameidata *nd)
 {
@@ -387,6 +403,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
 		inode->i_nlink = 0;
 		fuse_invalidate_attr(inode);
 		fuse_invalidate_attr(dir);
+		fuse_invalidate_entry_cache(entry);
 	} else if (err == -EINTR)
 		fuse_invalidate_entry(entry);
 	return err;
@@ -412,6 +429,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
 	if (!err) {
 		entry->d_inode->i_nlink = 0;
 		fuse_invalidate_attr(dir);
+		fuse_invalidate_entry_cache(entry);
 	} else if (err == -EINTR)
 		fuse_invalidate_entry(entry);
 	return err;
@@ -447,6 +465,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
 		fuse_invalidate_attr(olddir);
 		if (olddir != newdir)
 			fuse_invalidate_attr(newdir);
+
+		/* newent will end up negative */
+		if (newent->d_inode)
+			fuse_invalidate_entry_cache(newent);
 	} else if (err == -EINTR) {
 		/* If request was interrupted, DEITY only knows if the
 		   rename actually took place.  If the invalidation
-- 
cgit v1.1


From 6f9f11806af8ad3a107714a3ece56c1c4fafd047 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 6 Jan 2006 00:19:39 -0800
Subject: [PATCH] fuse: add code documentation

Document some not-so-trivial functions.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dir.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 90 insertions(+), 9 deletions(-)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 4c127f2..fead7f4 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -14,6 +14,15 @@
 #include <linux/sched.h>
 #include <linux/namei.h>
 
+/*
+ * FUSE caches dentries and attributes with separate timeout.  The
+ * time in jiffies until the dentry/attributes are valid is stored in
+ * dentry->d_time and fuse_inode->i_time respectively.
+ */
+
+/*
+ * Calculate the time in jiffies until a dentry/attributes are valid
+ */
 static inline unsigned long time_to_jiffies(unsigned long sec,
 					    unsigned long nsec)
 {
@@ -21,6 +30,10 @@ static inline unsigned long time_to_jiffies(unsigned long sec,
 	return jiffies + timespec_to_jiffies(&ts);
 }
 
+/*
+ * Set dentry and possibly attribute timeouts from the lookup/mk*
+ * replies
+ */
 static void fuse_change_timeout(struct dentry *entry, struct fuse_entry_out *o)
 {
 	entry->d_time = time_to_jiffies(o->entry_valid, o->entry_valid_nsec);
@@ -29,16 +42,32 @@ static void fuse_change_timeout(struct dentry *entry, struct fuse_entry_out *o)
 			time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
 }
 
+/*
+ * Mark the attributes as stale, so that at the next call to
+ * ->getattr() they will be fetched from userspace
+ */
 void fuse_invalidate_attr(struct inode *inode)
 {
 	get_fuse_inode(inode)->i_time = jiffies - 1;
 }
 
+/*
+ * Just mark the entry as stale, so that a next attempt to look it up
+ * will result in a new lookup call to userspace
+ *
+ * This is called when a dentry is about to become negative and the
+ * timeout is unknown (unlink, rmdir, rename and in some cases
+ * lookup)
+ */
 static void fuse_invalidate_entry_cache(struct dentry *entry)
 {
 	entry->d_time = jiffies - 1;
 }
 
+/*
+ * Same as fuse_invalidate_entry_cache(), but also try to remove the
+ * dentry from the hash
+ */
 static void fuse_invalidate_entry(struct dentry *entry)
 {
 	d_invalidate(entry);
@@ -60,6 +89,15 @@ static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
 	req->out.args[0].value = outarg;
 }
 
+/*
+ * Check whether the dentry is still valid
+ *
+ * If the entry validity timeout has expired and the dentry is
+ * positive, try to redo the lookup.  If the lookup results in a
+ * different inode, then let the VFS invalidate the dentry and redo
+ * the lookup once more.  If the lookup results in the same inode,
+ * then refresh the attributes, timeouts and mark the dentry valid.
+ */
 static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 {
 	struct inode *inode = entry->d_inode;
@@ -72,6 +110,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 		struct fuse_conn *fc;
 		struct fuse_req *req;
 
+		/* Doesn't hurt to "reset" the validity timeout */
 		fuse_invalidate_entry_cache(entry);
 		if (!inode)
 			return 0;
@@ -102,10 +141,13 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 	return 1;
 }
 
+/*
+ * Check if there's already a hashed alias of this directory inode.
+ * If yes, then lookup and mkdir must not create a new alias.
+ */
 static int dir_alias(struct inode *inode)
 {
 	if (S_ISDIR(inode->i_mode)) {
-		/* Don't allow creating an alias to a directory  */
 		struct dentry *alias = d_find_alias(inode);
 		if (alias) {
 			dput(alias);
@@ -170,6 +212,12 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 	return NULL;
 }
 
+/*
+ * Atomic create+open operation
+ *
+ * If the filesystem doesn't support this, then fall back to separate
+ * 'mknod' + 'open' requests.
+ */
 static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 			    struct nameidata *nd)
 {
@@ -236,6 +284,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	if (!inode) {
 		flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
 		ff->fh = outopen.fh;
+		/* Special release, with inode = NULL, this will
+		   trigger a 'forget' request when the release is
+		   complete */
 		fuse_send_release(fc, ff, outentry.nodeid, NULL, flags, 0);
 		goto out_put_request;
 	}
@@ -259,6 +310,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	return err;
 }
 
+/*
+ * Code shared between mknod, mkdir, symlink and link
+ */
 static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
 			    struct inode *dir, struct dentry *entry,
 			    int mode)
@@ -576,6 +630,15 @@ static int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task)
 	return 0;
 }
 
+/*
+ * Check whether the inode attributes are still valid
+ *
+ * If the attribute validity timeout has expired, then fetch the fresh
+ * attributes with a 'getattr' request
+ *
+ * I'm not sure why cached attributes are never returned for the root
+ * inode, this is probably being too cautious.
+ */
 static int fuse_revalidate(struct dentry *entry)
 {
 	struct inode *inode = entry->d_inode;
@@ -623,6 +686,19 @@ static int fuse_access(struct inode *inode, int mask)
 	return err;
 }
 
+/*
+ * Check permission.  The two basic access models of FUSE are:
+ *
+ * 1) Local access checking ('default_permissions' mount option) based
+ * on file mode.  This is the plain old disk filesystem permission
+ * modell.
+ *
+ * 2) "Remote" access checking, where server is responsible for
+ * checking permission in each inode operation.  An exception to this
+ * is if ->permission() was invoked from sys_access() in which case an
+ * access request is sent.  Execute permission is still checked
+ * locally based on file mode.
+ */
 static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
@@ -641,14 +717,10 @@ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
 				err = generic_permission(inode, mask, NULL);
 		}
 
-		/* FIXME: Need some mechanism to revoke permissions:
-		   currently if the filesystem suddenly changes the
-		   file mode, we will not be informed about it, and
-		   continue to allow access to the file/directory.
-
-		   This is actually not so grave, since the user can
-		   simply keep access to the file/directory anyway by
-		   keeping it open... */
+		/* Note: the opposite of the above test does not
+		   exist.  So if permissions are revoked this won't be
+		   noticed immediately, only after the attribute
+		   timeout has expired */
 
 		return err;
 	} else {
@@ -816,6 +888,15 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
 	}
 }
 
+/*
+ * Set attributes, and at the same time refresh them.
+ *
+ * Truncation is slightly complicated, because the 'truncate' request
+ * may fail, in which case we don't want to touch the mapping.
+ * vmtruncate() doesn't allow for this case.  So do the rlimit
+ * checking by hand and call vmtruncate() only after the file has
+ * actually been truncated.
+ */
 static int fuse_setattr(struct dentry *entry, struct iattr *attr)
 {
 	struct inode *inode = entry->d_inode;
-- 
cgit v1.1


From 248d86e87d12da19eee602075f05a49a5215288b Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 6 Jan 2006 00:19:39 -0800
Subject: [PATCH] fuse: fail file operations on bad inode

Make file operations on a bad inode fail.  This just makes things a
bit more consistent.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dir.c  |  7 ++++++-
 fs/fuse/file.c | 35 +++++++++++++++++++++++++++++++----
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index fead7f4..9a6075d 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -773,7 +773,12 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
 	struct page *page;
 	struct inode *inode = file->f_dentry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req = fuse_get_request(fc);
+	struct fuse_req *req;
+
+	if (is_bad_inode(inode))
+		return -EIO;
+
+	req = fuse_get_request(fc);
 	if (!req)
 		return -EINTR;
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 18aafa6..c989f0e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -163,6 +163,9 @@ static int fuse_flush(struct file *file)
 	struct fuse_flush_in inarg;
 	int err;
 
+	if (is_bad_inode(inode))
+		return -EIO;
+
 	if (fc->no_flush)
 		return 0;
 
@@ -199,6 +202,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
 	struct fuse_fsync_in inarg;
 	int err;
 
+	if (is_bad_inode(inode))
+		return -EIO;
+
 	if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
 		return 0;
 
@@ -272,8 +278,15 @@ static int fuse_readpage(struct file *file, struct page *page)
 {
 	struct inode *inode = page->mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req = fuse_get_request(fc);
-	int err = -EINTR;
+	struct fuse_req *req;
+	int err;
+
+	err = -EIO;
+	if (is_bad_inode(inode))
+		goto out;
+
+	err = -EINTR;
+	req = fuse_get_request(fc);
 	if (!req)
 		goto out;
 
@@ -344,6 +357,10 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_readpages_data data;
 	int err;
+
+	if (is_bad_inode(inode))
+		return -EIO;
+
 	data.file = file;
 	data.inode = inode;
 	data.req = fuse_get_request(fc);
@@ -402,7 +419,12 @@ static int fuse_commit_write(struct file *file, struct page *page,
 	struct inode *inode = page->mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	loff_t pos = page_offset(page) + offset;
-	struct fuse_req *req = fuse_get_request(fc);
+	struct fuse_req *req;
+
+	if (is_bad_inode(inode))
+		return -EIO;
+
+	req = fuse_get_request(fc);
 	if (!req)
 		return -EINTR;
 
@@ -474,7 +496,12 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 	size_t nmax = write ? fc->max_write : fc->max_read;
 	loff_t pos = *ppos;
 	ssize_t res = 0;
-	struct fuse_req *req = fuse_get_request(fc);
+	struct fuse_req *req;
+
+	if (is_bad_inode(inode))
+		return -EIO;
+
+	req = fuse_get_request(fc);
 	if (!req)
 		return -EINTR;
 
-- 
cgit v1.1


From 1d3d752b471d2a3a1d5e4fe177e5e7d52abb4e4c Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 6 Jan 2006 00:19:40 -0800
Subject: [PATCH] fuse: clean up request size limit checking

Change the way a too large request is handled.  Until now in this case the
device read returned -EINVAL and the operation returned -EIO.

Make it more flexible by not returning -EINVAL from the read, but restarting
it instead.

Also remove the fixed limit on setxattr data and let the filesystem provide as
large a read buffer as it needs to handle the extended attribute data.

The symbolic link length is already checked by VFS to be less than PATH_MAX,
so the extra check against FUSE_SYMLINK_MAX is not needed.

The check in fuse_create_open() against FUSE_NAME_MAX is not needed, since the
dentry has already been looked up, and hence the name already checked.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dev.c        | 26 ++++++++++++++++----------
 fs/fuse/dir.c        | 14 +-------------
 fs/fuse/fuse_i.h     |  9 ++++++---
 fs/fuse/inode.c      |  2 +-
 include/linux/fuse.h |  8 ++------
 5 files changed, 26 insertions(+), 33 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e5bc3f8..1afdffd 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -617,6 +617,7 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 	struct fuse_copy_state cs;
 	unsigned reqsize;
 
+ restart:
 	spin_lock(&fuse_lock);
 	fc = file->private_data;
 	err = -EPERM;
@@ -632,20 +633,25 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 
 	req = list_entry(fc->pending.next, struct fuse_req, list);
 	list_del_init(&req->list);
-	spin_unlock(&fuse_lock);
 
 	in = &req->in;
-	reqsize = req->in.h.len;
-	fuse_copy_init(&cs, 1, req, iov, nr_segs);
-	err = -EINVAL;
-	if (iov_length(iov, nr_segs) >= reqsize) {
-		err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
-		if (!err)
-			err = fuse_copy_args(&cs, in->numargs, in->argpages,
-					     (struct fuse_arg *) in->args, 0);
+	reqsize = in->h.len;
+	/* If request is too large, reply with an error and restart the read */
+	if (iov_length(iov, nr_segs) < reqsize) {
+		req->out.h.error = -EIO;
+		/* SETXATTR is special, since it may contain too large data */
+		if (in->h.opcode == FUSE_SETXATTR)
+			req->out.h.error = -E2BIG;
+		request_end(fc, req);
+		goto restart;
 	}
+	spin_unlock(&fuse_lock);
+	fuse_copy_init(&cs, 1, req, iov, nr_segs);
+	err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
+	if (!err)
+		err = fuse_copy_args(&cs, in->numargs, in->argpages,
+				     (struct fuse_arg *) in->args, 0);
 	fuse_copy_finish(&cs);
-
 	spin_lock(&fuse_lock);
 	req->locked = 0;
 	if (!err && req->interrupted)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 9a6075d..f156392 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -236,10 +236,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	if (fc->no_create)
 		goto out;
 
-	err = -ENAMETOOLONG;
-	if (entry->d_name.len > FUSE_NAME_MAX)
-		goto out;
-
 	err = -EINTR;
 	req = fuse_get_request(fc);
 	if (!req)
@@ -413,12 +409,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
 {
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	unsigned len = strlen(link) + 1;
-	struct fuse_req *req;
-
-	if (len > FUSE_SYMLINK_MAX)
-		return -ENAMETOOLONG;
-
-	req = fuse_get_request(fc);
+	struct fuse_req *req = fuse_get_request(fc);
 	if (!req)
 		return -EINTR;
 
@@ -988,9 +979,6 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
 	struct fuse_setxattr_in inarg;
 	int err;
 
-	if (size > FUSE_XATTR_SIZE_MAX)
-		return -E2BIG;
-
 	if (fc->no_setxattr)
 		return -EOPNOTSUPP;
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 2d4835e..17fd368 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -21,6 +21,12 @@
 /** If more requests are outstanding, then the operation will block */
 #define FUSE_MAX_OUTSTANDING 10
 
+/** Maximum size of data in a write request */
+#define FUSE_MAX_WRITE 4096
+
+/** It could be as large as PATH_MAX, but would that have any uses? */
+#define FUSE_NAME_MAX 1024
+
 /** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
     module will check permissions based on the file mode.  Otherwise no
     permission checking is done in the kernel */
@@ -108,9 +114,6 @@ struct fuse_out {
 	struct fuse_arg args[3];
 };
 
-struct fuse_req;
-struct fuse_conn;
-
 /**
  * A request to the client
  */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 3b928a0..3580b9e 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -485,7 +485,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	fc->max_read = d.max_read;
 	if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages)
 		fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE;
-	fc->max_write = FUSE_MAX_IN / 2;
+	fc->max_write = FUSE_MAX_WRITE;
 
 	err = -ENOMEM;
 	root = get_root_inode(sb, d.rootmode);
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 9d5177c..8f64cc2 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -108,12 +108,8 @@ enum fuse_opcode {
 	FUSE_CREATE        = 35
 };
 
-/* Conservative buffer size for the client */
-#define FUSE_MAX_IN 8192
-
-#define FUSE_NAME_MAX 1024
-#define FUSE_SYMLINK_MAX 4096
-#define FUSE_XATTR_SIZE_MAX 4096
+/* The read buffer is required to be at least 8k, but may be much larger */
+#define FUSE_MIN_READ_BUFFER 8192
 
 struct fuse_entry_out {
 	__u64	nodeid;		/* Inode ID */
-- 
cgit v1.1


From 3ec870d524c9150add120475c8ddcfa50574f98e Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 6 Jan 2006 00:19:41 -0800
Subject: [PATCH] fuse: make maximum write data configurable

Make the maximum size of write data configurable by the filesystem.  The
previous fixed 4096 limit only worked on architectures where the page size is
less than or equal to this.  This change makes writing work on other
architectures too, and also lets the filesystem receive bigger write requests
in direct_io mode.

Normal writes which go through the page cache are still limited to a page
sized chunk per request.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dev.c        | 48 ++++++++++++++++++++++++++++++------------------
 fs/fuse/fuse_i.h     |  6 ++----
 fs/fuse/inode.c      |  1 -
 include/linux/fuse.h | 11 +++++++++--
 4 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 1afdffd..e08ab47 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -148,6 +148,26 @@ void fuse_release_background(struct fuse_req *req)
 	spin_unlock(&fuse_lock);
 }
 
+static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
+{
+	int i;
+	struct fuse_init_out *arg = &req->misc.init_out;
+
+	if (arg->major != FUSE_KERNEL_VERSION)
+		fc->conn_error = 1;
+	else {
+		fc->minor = arg->minor;
+		fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
+	}
+
+	/* After INIT reply is received other requests can go
+	   out.  So do (FUSE_MAX_OUTSTANDING - 1) number of
+	   up()s on outstanding_sem.  The last up() is done in
+	   fuse_putback_request() */
+	for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
+		up(&fc->outstanding_sem);
+}
+
 /*
  * This function is called when a request is finished.  Either a reply
  * has arrived or it was interrupted (and not yet sent) or some error
@@ -172,21 +192,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 		up_read(&fc->sbput_sem);
 	}
 	wake_up(&req->waitq);
-	if (req->in.h.opcode == FUSE_INIT) {
-		int i;
-
-		if (req->misc.init_in_out.major != FUSE_KERNEL_VERSION)
-			fc->conn_error = 1;
-
-		fc->minor = req->misc.init_in_out.minor;
-
-		/* After INIT reply is received other requests can go
-		   out.  So do (FUSE_MAX_OUTSTANDING - 1) number of
-		   up()s on outstanding_sem.  The last up() is done in
-		   fuse_putback_request() */
-		for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
-			up(&fc->outstanding_sem);
-	} else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) {
+	if (req->in.h.opcode == FUSE_INIT)
+		process_init_reply(fc, req);
+	else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) {
 		/* Special case for failed iget in CREATE */
 		u64 nodeid = req->in.h.nodeid;
 		__fuse_get_request(req);
@@ -359,7 +367,7 @@ void fuse_send_init(struct fuse_conn *fc)
 	/* This is called from fuse_read_super() so there's guaranteed
 	   to be a request available */
 	struct fuse_req *req = do_get_request(fc);
-	struct fuse_init_in_out *arg = &req->misc.init_in_out;
+	struct fuse_init_in *arg = &req->misc.init_in;
 	arg->major = FUSE_KERNEL_VERSION;
 	arg->minor = FUSE_KERNEL_MINOR_VERSION;
 	req->in.h.opcode = FUSE_INIT;
@@ -367,8 +375,12 @@ void fuse_send_init(struct fuse_conn *fc)
 	req->in.args[0].size = sizeof(*arg);
 	req->in.args[0].value = arg;
 	req->out.numargs = 1;
-	req->out.args[0].size = sizeof(*arg);
-	req->out.args[0].value = arg;
+	/* Variable length arguement used for backward compatibility
+	   with interface version < 7.5.  Rest of init_out is zeroed
+	   by do_get_request(), so a short reply is not a problem */
+	req->out.argvar = 1;
+	req->out.args[0].size = sizeof(struct fuse_init_out);
+	req->out.args[0].value = &req->misc.init_out;
 	request_send_background(fc, req);
 }
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 17fd368..74c8d09 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -21,9 +21,6 @@
 /** If more requests are outstanding, then the operation will block */
 #define FUSE_MAX_OUTSTANDING 10
 
-/** Maximum size of data in a write request */
-#define FUSE_MAX_WRITE 4096
-
 /** It could be as large as PATH_MAX, but would that have any uses? */
 #define FUSE_NAME_MAX 1024
 
@@ -162,7 +159,8 @@ struct fuse_req {
 	union {
 		struct fuse_forget_in forget_in;
 		struct fuse_release_in release_in;
-		struct fuse_init_in_out init_in_out;
+		struct fuse_init_in init_in;
+		struct fuse_init_out init_out;
 	} misc;
 
 	/** page vector */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 3580b9e..e454186 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -485,7 +485,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	fc->max_read = d.max_read;
 	if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages)
 		fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE;
-	fc->max_write = FUSE_MAX_WRITE;
 
 	err = -ENOMEM;
 	root = get_root_inode(sb, d.rootmode);
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 8f64cc2..528959c 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -14,7 +14,7 @@
 #define FUSE_KERNEL_VERSION 7
 
 /** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 4
+#define FUSE_KERNEL_MINOR_VERSION 5
 
 /** The node ID of the root inode */
 #define FUSE_ROOT_ID 1
@@ -244,11 +244,18 @@ struct fuse_access_in {
 	__u32	padding;
 };
 
-struct fuse_init_in_out {
+struct fuse_init_in {
 	__u32	major;
 	__u32	minor;
 };
 
+struct fuse_init_out {
+	__u32	major;
+	__u32	minor;
+	__u32	unused[3];
+	__u32	max_write;
+};
+
 struct fuse_in_header {
 	__u32	len;
 	__u32	opcode;
-- 
cgit v1.1


From 6ad84acab972f4dfc78e6fdb04c419f82c497d29 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 6 Jan 2006 00:19:42 -0800
Subject: [PATCH] fuse: ensure progress in read and write

In direct_io mode, send at least one page per request.  Previously it was
possible that requests with zero data were sent, and hence the read/write
didn't make any progress, resulting in an infinite (though interruptible)
loop.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/file.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c989f0e..05deddd 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -475,7 +475,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
 
 	nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
 	npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	npages = min(npages, FUSE_MAX_PAGES_PER_REQ);
+	npages = min(max(npages, 1), FUSE_MAX_PAGES_PER_REQ);
 	down_read(&current->mm->mmap_sem);
 	npages = get_user_pages(current, current->mm, user_addr, npages, write,
 				0, req->pages, NULL);
@@ -506,7 +506,6 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 		return -EINTR;
 
 	while (count) {
-		size_t tmp;
 		size_t nres;
 		size_t nbytes = min(count, nmax);
 		int err = fuse_get_user_pages(req, buf, nbytes, !write);
@@ -514,8 +513,8 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 			res = err;
 			break;
 		}
-		tmp = (req->num_pages << PAGE_SHIFT) - req->page_offset;
-		nbytes = min(nbytes, tmp);
+		nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
+		nbytes = min(count, nbytes);
 		if (write)
 			nres = fuse_send_write(req, file, inode, pos, nbytes);
 		else
-- 
cgit v1.1


From 39ee059affaf57a152c64cd3a0adc3f48f02ed71 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 6 Jan 2006 00:19:43 -0800
Subject: [PATCH] fuse: check file type in lookup

Previously invalid types were quietly changed to regular files, but at
revalidation the inode was changed to bad.  This was rather inconsistent
behavior.

Now check if the type is valid on initial lookup, and return -EIO if not.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dir.c   | 27 ++++++++++++++++++++-------
 fs/fuse/inode.c |  8 ++------
 2 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index f156392..417bcee 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -166,6 +166,12 @@ static struct dentry_operations fuse_dentry_operations = {
 	.d_revalidate	= fuse_dentry_revalidate,
 };
 
+static inline int valid_mode(int m)
+{
+	return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) ||
+		S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
+}
+
 static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 				  struct nameidata *nd)
 {
@@ -185,7 +191,8 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 	fuse_lookup_init(req, dir, entry, &outarg);
 	request_send(fc, req);
 	err = req->out.h.error;
-	if (!err && outarg.nodeid && invalid_nodeid(outarg.nodeid))
+	if (!err && ((outarg.nodeid && invalid_nodeid(outarg.nodeid)) ||
+		     !valid_mode(outarg.attr.mode)))
 		err = -EIO;
 	if (!err && outarg.nodeid) {
 		inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
@@ -328,10 +335,13 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
 		fuse_put_request(fc, req);
 		return err;
 	}
-	if (invalid_nodeid(outarg.nodeid)) {
-		fuse_put_request(fc, req);
-		return -EIO;
-	}
+	err = -EIO;
+	if (invalid_nodeid(outarg.nodeid))
+		goto out_put_request;
+
+	if ((outarg.attr.mode ^ mode) & S_IFMT)
+		goto out_put_request;
+
 	inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
 			  &outarg.attr);
 	if (!inode) {
@@ -340,8 +350,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
 	}
 	fuse_put_request(fc, req);
 
-	/* Don't allow userspace to do really stupid things... */
-	if (((inode->i_mode ^ mode) & S_IFMT) || dir_alias(inode)) {
+	if (dir_alias(inode)) {
 		iput(inode);
 		return -EIO;
 	}
@@ -350,6 +359,10 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
 	fuse_change_timeout(entry, &outarg);
 	fuse_invalidate_attr(dir);
 	return 0;
+
+ out_put_request:
+	fuse_put_request(fc, req);
+	return err;
 }
 
 static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e454186..04c80cc 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -135,12 +135,8 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
 		fuse_init_common(inode);
 		init_special_inode(inode, inode->i_mode,
 				   new_decode_dev(attr->rdev));
-	} else {
-		/* Don't let user create weird files */
-		inode->i_mode = S_IFREG;
-		fuse_init_common(inode);
-		fuse_init_file_inode(inode);
-	}
+	} else
+		BUG();
 }
 
 static int fuse_inode_eq(struct inode *inode, void *_nodeidp)
-- 
cgit v1.1


From c660629059abbbd0eb56e12f9bb4494f01800bbc Mon Sep 17 00:00:00 2001
From: Marko Kohtala <marko.kohtala@gmail.com>
Date: Fri, 6 Jan 2006 00:19:43 -0800
Subject: [PATCH] parport: buffer overflow fix

Fix a potential buffer overflow in case the device ID did not end in a
semicolon.  The old code also might fail to negotiate back to
IEEE1284_MODE_COMPAT in case of failure.  parport_device_id did not return
what Documentation/parport-lowlevel.txt said, so I changed it to match it.

Determining device ID length is overly complicated, but Tim Waugh recalled on
linux-parport seeing some buggy device that might need it.

Signed-off-by: Marko Kohtala <marko.kohtala@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/parport/probe.c | 193 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 130 insertions(+), 63 deletions(-)

diff --git a/drivers/parport/probe.c b/drivers/parport/probe.c
index 4b48b31..5c29e82 100644
--- a/drivers/parport/probe.c
+++ b/drivers/parport/probe.c
@@ -128,8 +128,131 @@ static void parse_data(struct parport *port, int device, char *str)
 	kfree(txt);
 }
 
+/* Read up to count-1 bytes of device id. Terminate buffer with
+ * '\0'. Buffer begins with two Device ID length bytes as given by
+ * device. */
+static ssize_t parport_read_device_id (struct parport *port, char *buffer,
+				       size_t count)
+{
+	unsigned char length[2];
+	unsigned lelen, belen;
+	size_t idlens[4];
+	unsigned numidlens;
+	unsigned current_idlen;
+	ssize_t retval;
+	size_t len;
+
+	/* First two bytes are MSB,LSB of inclusive length. */
+	retval = parport_read (port, length, 2);
+
+	if (retval < 0)
+		return retval;
+	if (retval != 2)
+		return -EIO;
+
+	if (count < 2)
+		return 0;
+	memcpy(buffer, length, 2);
+	len = 2;
+
+	/* Some devices wrongly send LE length, and some send it two
+	 * bytes short. Construct a sorted array of lengths to try. */
+	belen = (length[0] << 8) + length[1];
+	lelen = (length[1] << 8) + length[0];
+	idlens[0] = min(belen, lelen);
+	idlens[1] = idlens[0]+2;
+	if (belen != lelen) {
+		int off = 2;
+		/* Don't try lenghts of 0x100 and 0x200 as 1 and 2 */
+		if (idlens[0] <= 2)
+			off = 0;
+		idlens[off] = max(belen, lelen);
+		idlens[off+1] = idlens[off]+2;
+		numidlens = off+2;
+	}
+	else {
+		/* Some devices don't truly implement Device ID, but
+		 * just return constant nibble forever. This catches
+		 * also those cases. */
+		if (idlens[0] == 0 || idlens[0] > 0xFFF) {
+			printk (KERN_DEBUG "%s: reported broken Device ID"
+				" length of %#zX bytes\n",
+				port->name, idlens[0]);
+			return -EIO;
+		}
+		numidlens = 2;
+	}
+
+	/* Try to respect the given ID length despite all the bugs in
+	 * the ID length. Read according to shortest possible ID
+	 * first. */
+	for (current_idlen = 0; current_idlen < numidlens; ++current_idlen) {
+		size_t idlen = idlens[current_idlen];
+		if (idlen+1 >= count)
+			break;
+
+		retval = parport_read (port, buffer+len, idlen-len);
+
+		if (retval < 0)
+			return retval;
+		len += retval;
+
+		if (port->physport->ieee1284.phase != IEEE1284_PH_HBUSY_DAVAIL) {
+			if (belen != len) {
+				printk (KERN_DEBUG "%s: Device ID was %d bytes"
+					" while device told it would be %d"
+					" bytes\n",
+					port->name, len, belen);
+			}
+			goto done;
+		}
+
+		/* This might end reading the Device ID too
+		 * soon. Hopefully the needed fields were already in
+		 * the first 256 bytes or so that we must have read so
+		 * far. */
+		if (buffer[len-1] == ';') {
+ 			printk (KERN_DEBUG "%s: Device ID reading stopped"
+				" before device told data not available. "
+				"Current idlen %d of %d, len bytes %02X %02X\n",
+				port->name, current_idlen, numidlens,
+				length[0], length[1]);
+			goto done;
+		}
+	}
+	if (current_idlen < numidlens) {
+		/* Buffer not large enough, read to end of buffer. */
+		size_t idlen, len2;
+		if (len+1 < count) {
+			retval = parport_read (port, buffer+len, count-len-1);
+			if (retval < 0)
+				return retval;
+			len += retval;
+		}
+		/* Read the whole ID since some devices would not
+		 * otherwise give back the Device ID from beginning
+		 * next time when asked. */
+		idlen = idlens[current_idlen];
+		len2 = len;
+		while(len2 < idlen && retval > 0) {
+			char tmp[4];
+			retval = parport_read (port, tmp,
+					       min(sizeof tmp, idlen-len2));
+			if (retval < 0)
+				return retval;
+			len2 += retval;
+		}
+	}
+	/* In addition, there are broken devices out there that don't
+	   even finish off with a semi-colon. We do not need to care
+	   about those at this time. */
+ done:
+	buffer[len] = '\0';
+	return len;
+}
+
 /* Get Std 1284 Device ID. */
-ssize_t parport_device_id (int devnum, char *buffer, size_t len)
+ssize_t parport_device_id (int devnum, char *buffer, size_t count)
 {
 	ssize_t retval = -ENXIO;
 	struct pardevice *dev = parport_open (devnum, "Device ID probe",
@@ -139,76 +262,20 @@ ssize_t parport_device_id (int devnum, char *buffer, size_t len)
 
 	parport_claim_or_block (dev);
 
-	/* Negotiate to compatibility mode, and then to device ID mode.
-	 * (This is in case we are already in device ID mode.) */
+	/* Negotiate to compatibility mode, and then to device ID
+	 * mode. (This so that we start form beginning of device ID if
+	 * already in device ID mode.) */
 	parport_negotiate (dev->port, IEEE1284_MODE_COMPAT);
 	retval = parport_negotiate (dev->port,
 				    IEEE1284_MODE_NIBBLE | IEEE1284_DEVICEID);
 
 	if (!retval) {
-		int idlen;
-		unsigned char length[2];
-
-		/* First two bytes are MSB,LSB of inclusive length. */
-		retval = parport_read (dev->port, length, 2);
-
-		if (retval != 2) goto end_id;
-
-		idlen = (length[0] << 8) + length[1] - 2;
-		/*
-		 * Check if the caller-allocated buffer is large enough
-		 * otherwise bail out or there will be an at least off by one.
-		 */
-		if (idlen + 1 < len)
-			len = idlen;
-		else {
-			retval = -EINVAL;
-			goto out;
-		}
-		retval = parport_read (dev->port, buffer, len);
-
-		if (retval != len)
-			printk (KERN_DEBUG "%s: only read %Zd of %Zd ID bytes\n",
-				dev->port->name, retval,
-				len);
-
-		/* Some printer manufacturers mistakenly believe that
-                   the length field is supposed to be _exclusive_.
-		   In addition, there are broken devices out there
-                   that don't even finish off with a semi-colon. */
-		if (buffer[len - 1] != ';') {
-			ssize_t diff;
-			diff = parport_read (dev->port, buffer + len, 2);
-			retval += diff;
-
-			if (diff)
-				printk (KERN_DEBUG
-					"%s: device reported incorrect "
-					"length field (%d, should be %Zd)\n",
-					dev->port->name, idlen, retval);
-			else {
-				/* One semi-colon short of a device ID. */
-				buffer[len++] = ';';
-				printk (KERN_DEBUG "%s: faking semi-colon\n",
-					dev->port->name);
-
-				/* If we get here, I don't think we
-                                   need to worry about the possible
-                                   standard violation of having read
-                                   more than we were told to.  The
-                                   device is non-compliant anyhow. */
-			}
-		}
-
-	end_id:
-		buffer[len] = '\0';
+		retval = parport_read_device_id (dev->port, buffer, count);
 		parport_negotiate (dev->port, IEEE1284_MODE_COMPAT);
+		if (retval > 2)
+			parse_data (dev->port, dev->daisy, buffer+2);
 	}
 
-	if (retval > 2)
-		parse_data (dev->port, dev->daisy, buffer);
-
-out:
 	parport_release (dev);
 	parport_close (dev);
 	return retval;
-- 
cgit v1.1


From 742ec650e9b63ea61891455bb6f76bac37025c78 Mon Sep 17 00:00:00 2001
From: Marko Kohtala <marko.kohtala@gmail.com>
Date: Fri, 6 Jan 2006 00:19:44 -0800
Subject: [PATCH] parport: phase fixes

The code did not move the parport interface properly into the
IEEE1284_PH_REV_IDLE phase at end of data due to comparing bytes with nibbles.
The internal phase IEEE1284_PH_HBUSY_DNA became unused, so remove it.

Signed-off-by: Marko Kohtala <marko.kohtala@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/media/video/cpia_pp.c  | 30 +++++++++-----------
 drivers/parport/ieee1284_ops.c | 62 ++++++++++++++++++++----------------------
 include/linux/parport.h        |  1 -
 3 files changed, 42 insertions(+), 51 deletions(-)

diff --git a/drivers/media/video/cpia_pp.c b/drivers/media/video/cpia_pp.c
index ddf184f..6861d40 100644
--- a/drivers/media/video/cpia_pp.c
+++ b/drivers/media/video/cpia_pp.c
@@ -170,16 +170,9 @@ static size_t cpia_read_nibble (struct parport *port,
 		/* Does the error line indicate end of data? */
 		if (((i /*& 1*/) == 0) &&
 		    (parport_read_status(port) & PARPORT_STATUS_ERROR)) {
-			port->physport->ieee1284.phase = IEEE1284_PH_HBUSY_DNA;
-				DBG("%s: No more nibble data (%d bytes)\n",
-				port->name, i/2);
-
-			/* Go to reverse idle phase. */
-			parport_frob_control (port,
-					      PARPORT_CONTROL_AUTOFD,
-					      PARPORT_CONTROL_AUTOFD);
-			port->physport->ieee1284.phase = IEEE1284_PH_REV_IDLE;
-			break;
+			DBG("%s: No more nibble data (%d bytes)\n",
+			    port->name, i/2);
+			goto end_of_data;
 		}
 
 		/* Event 7: Set nAutoFd low. */
@@ -227,18 +220,21 @@ static size_t cpia_read_nibble (struct parport *port,
 			byte = nibble;
 	}
 
-	i /= 2; /* i is now in bytes */
-
 	if (i == len) {
 		/* Read the last nibble without checking data avail. */
-		port = port->physport;
-		if (parport_read_status (port) & PARPORT_STATUS_ERROR)
-			port->ieee1284.phase = IEEE1284_PH_HBUSY_DNA;
+		if (parport_read_status (port) & PARPORT_STATUS_ERROR) {
+		end_of_data:
+			/* Go to reverse idle phase. */
+			parport_frob_control (port,
+					      PARPORT_CONTROL_AUTOFD,
+					      PARPORT_CONTROL_AUTOFD);
+			port->physport->ieee1284.phase = IEEE1284_PH_REV_IDLE;
+		}
 		else
-			port->ieee1284.phase = IEEE1284_PH_HBUSY_DAVAIL;
+			port->physport->ieee1284.phase = IEEE1284_PH_HBUSY_DAVAIL;
 	}
 
-	return i;
+	return i/2;
 }
 
 /* CPiA nonstandard "Nibble Stream" mode (2 nibbles per cycle, instead of 1)
diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c
index ce1e2aa..d6c7765 100644
--- a/drivers/parport/ieee1284_ops.c
+++ b/drivers/parport/ieee1284_ops.c
@@ -165,17 +165,7 @@ size_t parport_ieee1284_read_nibble (struct parport *port,
 		/* Does the error line indicate end of data? */
 		if (((i & 1) == 0) &&
 		    (parport_read_status(port) & PARPORT_STATUS_ERROR)) {
-			port->physport->ieee1284.phase = IEEE1284_PH_HBUSY_DNA;
-			DPRINTK (KERN_DEBUG
-				"%s: No more nibble data (%d bytes)\n",
-				port->name, i/2);
-
-			/* Go to reverse idle phase. */
-			parport_frob_control (port,
-					      PARPORT_CONTROL_AUTOFD,
-					      PARPORT_CONTROL_AUTOFD);
-			port->physport->ieee1284.phase = IEEE1284_PH_REV_IDLE;
-			break;
+			goto end_of_data;
 		}
 
 		/* Event 7: Set nAutoFd low. */
@@ -225,18 +215,25 @@ size_t parport_ieee1284_read_nibble (struct parport *port,
 			byte = nibble;
 	}
 
-	i /= 2; /* i is now in bytes */
-
 	if (i == len) {
 		/* Read the last nibble without checking data avail. */
-		port = port->physport;
-		if (parport_read_status (port) & PARPORT_STATUS_ERROR)
-			port->ieee1284.phase = IEEE1284_PH_HBUSY_DNA;
+		if (parport_read_status (port) & PARPORT_STATUS_ERROR) {
+		end_of_data:
+			DPRINTK (KERN_DEBUG
+				"%s: No more nibble data (%d bytes)\n",
+				port->name, i/2);
+
+			/* Go to reverse idle phase. */
+			parport_frob_control (port,
+					      PARPORT_CONTROL_AUTOFD,
+					      PARPORT_CONTROL_AUTOFD);
+			port->physport->ieee1284.phase = IEEE1284_PH_REV_IDLE;
+		}
 		else
-			port->ieee1284.phase = IEEE1284_PH_HBUSY_DAVAIL;
+			port->physport->ieee1284.phase = IEEE1284_PH_HBUSY_DAVAIL;
 	}
 
-	return i;
+	return i/2;
 #endif /* IEEE1284 support */
 }
 
@@ -256,17 +253,7 @@ size_t parport_ieee1284_read_byte (struct parport *port,
 
 		/* Data available? */
 		if (parport_read_status (port) & PARPORT_STATUS_ERROR) {
-			port->physport->ieee1284.phase = IEEE1284_PH_HBUSY_DNA;
-			DPRINTK (KERN_DEBUG
-				 "%s: No more byte data (%Zd bytes)\n",
-				 port->name, count);
-
-			/* Go to reverse idle phase. */
-			parport_frob_control (port,
-					      PARPORT_CONTROL_AUTOFD,
-					      PARPORT_CONTROL_AUTOFD);
-			port->physport->ieee1284.phase = IEEE1284_PH_REV_IDLE;
-			break;
+			goto end_of_data;
 		}
 
 		/* Event 14: Place data bus in high impedance state. */
@@ -318,11 +305,20 @@ size_t parport_ieee1284_read_byte (struct parport *port,
 
 	if (count == len) {
 		/* Read the last byte without checking data avail. */
-		port = port->physport;
-		if (parport_read_status (port) & PARPORT_STATUS_ERROR)
-			port->ieee1284.phase = IEEE1284_PH_HBUSY_DNA;
+		if (parport_read_status (port) & PARPORT_STATUS_ERROR) {
+		end_of_data:
+			DPRINTK (KERN_DEBUG
+				 "%s: No more byte data (%Zd bytes)\n",
+				 port->name, count);
+
+			/* Go to reverse idle phase. */
+			parport_frob_control (port,
+					      PARPORT_CONTROL_AUTOFD,
+					      PARPORT_CONTROL_AUTOFD);
+			port->physport->ieee1284.phase = IEEE1284_PH_REV_IDLE;
+		}
 		else
-			port->ieee1284.phase = IEEE1284_PH_HBUSY_DAVAIL;
+			port->physport->ieee1284.phase = IEEE1284_PH_HBUSY_DAVAIL;
 	}
 
 	return count;
diff --git a/include/linux/parport.h b/include/linux/parport.h
index d2a4d9e..f7ff0b0 100644
--- a/include/linux/parport.h
+++ b/include/linux/parport.h
@@ -242,7 +242,6 @@ enum ieee1284_phase {
 	IEEE1284_PH_FWD_IDLE,
 	IEEE1284_PH_TERMINATE,
 	IEEE1284_PH_NEGOTIATION,
-	IEEE1284_PH_HBUSY_DNA,
 	IEEE1284_PH_REV_IDLE,
 	IEEE1284_PH_HBUSY_DAVAIL,
 	IEEE1284_PH_REV_DATA,
-- 
cgit v1.1


From 310c8c324f988625a2880deab67607bf4e5aeb8a Mon Sep 17 00:00:00 2001
From: Marko Kohtala <marko.kohtala@gmail.com>
Date: Fri, 6 Jan 2006 00:19:45 -0800
Subject: [PATCH] parport: daisy chain end detection fix

Daisy chain end detection failed at least with older daisy chain devices that
do not implement the last device signal.

Signed-off-by: Marko Kohtala <marko.kohtala@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/parport/daisy.c | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/drivers/parport/daisy.c b/drivers/parport/daisy.c
index 075c7eb..6915114 100644
--- a/drivers/parport/daisy.c
+++ b/drivers/parport/daisy.c
@@ -436,7 +436,7 @@ static int select_port (struct parport *port)
 
 static int assign_addrs (struct parport *port)
 {
-	unsigned char s, last_dev;
+	unsigned char s;
 	unsigned char daisy;
 	int thisdev = numdevs;
 	int detected;
@@ -472,10 +472,13 @@ static int assign_addrs (struct parport *port)
 	}
 
 	parport_write_data (port, 0x78); udelay (2);
-	last_dev = 0; /* We've just been speaking to a device, so we
-			 know there must be at least _one_ out there. */
+	s = parport_read_status (port);
 
-	for (daisy = 0; daisy < 4; daisy++) {
+	for (daisy = 0;
+	     (s & (PARPORT_STATUS_PAPEROUT|PARPORT_STATUS_SELECT))
+		     == (PARPORT_STATUS_PAPEROUT|PARPORT_STATUS_SELECT)
+		     && daisy < 4;
+	     ++daisy) {
 		parport_write_data (port, daisy);
 		udelay (2);
 		parport_frob_control (port,
@@ -485,14 +488,18 @@ static int assign_addrs (struct parport *port)
 		parport_frob_control (port, PARPORT_CONTROL_STROBE, 0);
 		udelay (1);
 
-		if (last_dev)
-			/* No more devices. */
-			break;
+		add_dev (numdevs++, port, daisy);
 
-		last_dev = !(parport_read_status (port)
-			     & PARPORT_STATUS_BUSY);
+		/* See if this device thought it was the last in the
+		 * chain. */
+		if (!(s & PARPORT_STATUS_BUSY))
+			break;
 
-		add_dev (numdevs++, port, daisy);
+		/* We are seeing pass through status now. We see
+		   last_dev from next device or if last_dev does not
+		   work status lines from some non-daisy chain
+		   device. */
+		s = parport_read_status (port);
 	}
 
 	parport_write_data (port, 0xff); udelay (2);
-- 
cgit v1.1


From c29a75ed0d94fae64b59345ea96e52424ae9c6a2 Mon Sep 17 00:00:00 2001
From: Marko Kohtala <marko.kohtala@gmail.com>
Date: Fri, 6 Jan 2006 00:19:46 -0800
Subject: [PATCH] parport: daisy chain device id reading fix

Device ID reading from daisy chain devices failed because the daisy
device could not be opened.

Signed-off-by: Marko Kohtala <marko.kohtala@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/parport/daisy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/parport/daisy.c b/drivers/parport/daisy.c
index 6915114..37dc179 100644
--- a/drivers/parport/daisy.c
+++ b/drivers/parport/daisy.c
@@ -252,7 +252,7 @@ struct pardevice *parport_open (int devnum, const char *name,
 		selected = port->daisy;
 		parport_release (dev);
 
-		if (selected != port->daisy) {
+		if (selected != daisy) {
 			/* No corresponding device. */
 			parport_unregister_device (dev);
 			return NULL;
-- 
cgit v1.1


From 7c9cc3be1094b267a2da2e0016cbd6ced663da6d Mon Sep 17 00:00:00 2001
From: Marko Kohtala <marko.kohtala@gmail.com>
Date: Fri, 6 Jan 2006 00:19:46 -0800
Subject: [PATCH] parport: parport_daisy_select return value fix

parport_daisy_select returned the wrong status, and the status was read at
the wrong time during daisy command execution.

Signed-off-by: Marko Kohtala <marko.kohtala@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/parport/daisy.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/parport/daisy.c b/drivers/parport/daisy.c
index 37dc179..9109a40 100644
--- a/drivers/parport/daisy.c
+++ b/drivers/parport/daisy.c
@@ -344,9 +344,9 @@ static int cpp_daisy (struct parport *port, int cmd)
 			      PARPORT_CONTROL_STROBE,
 			      PARPORT_CONTROL_STROBE);
 	udelay (1);
+	s = parport_read_status (port);
 	parport_frob_control (port, PARPORT_CONTROL_STROBE, 0);
 	udelay (1);
-	s = parport_read_status (port);
 	parport_write_data (port, 0xff); udelay (2);
 
 	return s;
@@ -395,15 +395,15 @@ int parport_daisy_select (struct parport *port, int daisy, int mode)
 		case IEEE1284_MODE_EPP:
 		case IEEE1284_MODE_EPPSL:
 		case IEEE1284_MODE_EPPSWE:
-			return (cpp_daisy (port, 0x20 + daisy) &
-				PARPORT_STATUS_ERROR);
+			return !(cpp_daisy (port, 0x20 + daisy) &
+				 PARPORT_STATUS_ERROR);
 
 		// For these modes we should switch to ECP mode:
 		case IEEE1284_MODE_ECP:
 		case IEEE1284_MODE_ECPRLE:
 		case IEEE1284_MODE_ECPSWE: 
-			return (cpp_daisy (port, 0xd0 + daisy) &
-				PARPORT_STATUS_ERROR);
+			return !(cpp_daisy (port, 0xd0 + daisy) &
+				 PARPORT_STATUS_ERROR);
 
 		// Nothing was told for BECP in Daisy chain specification.
 		// May be it's wise to use ECP?
@@ -413,8 +413,8 @@ int parport_daisy_select (struct parport *port, int daisy, int mode)
 		case IEEE1284_MODE_BYTE:
 		case IEEE1284_MODE_COMPAT:
 		default:
-			return (cpp_daisy (port, 0xe0 + daisy) &
-				PARPORT_STATUS_ERROR);
+			return !(cpp_daisy (port, 0xe0 + daisy) &
+				 PARPORT_STATUS_ERROR);
 	}
 }
 
-- 
cgit v1.1


From b44d3bdd6fcf6233b381bf5bd0893ed235f497a9 Mon Sep 17 00:00:00 2001
From: Marko Kohtala <marko.kohtala@gmail.com>
Date: Fri, 6 Jan 2006 00:19:47 -0800
Subject: [PATCH] parport: use complete slab buffer

Use the complete slab buffer that is allocated by kmalloc.

Signed-off-by: Marko Kohtala <marko.kohtala@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/parport/daisy.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/parport/daisy.c b/drivers/parport/daisy.c
index 9109a40..9ee6732 100644
--- a/drivers/parport/daisy.c
+++ b/drivers/parport/daisy.c
@@ -144,9 +144,9 @@ again:
 	add_dev (numdevs++, port, -1);
 
 	/* Find out the legacy device's IEEE 1284 device ID. */
-	deviceid = kmalloc (1000, GFP_KERNEL);
+	deviceid = kmalloc (1024, GFP_KERNEL);
 	if (deviceid) {
-		if (parport_device_id (numdevs - 1, deviceid, 1000) > 2)
+		if (parport_device_id (numdevs - 1, deviceid, 1024) > 2)
 			detected++;
 
 		kfree (deviceid);
@@ -508,11 +508,11 @@ static int assign_addrs (struct parport *port)
 		 detected);
 
 	/* Ask the new devices to introduce themselves. */
-	deviceid = kmalloc (1000, GFP_KERNEL);
+	deviceid = kmalloc (1024, GFP_KERNEL);
 	if (!deviceid) return 0;
 
 	for (daisy = 0; thisdev < numdevs; thisdev++, daisy++)
-		parport_device_id (thisdev, deviceid, 1000);
+		parport_device_id (thisdev, deviceid, 1024);
 
 	kfree (deviceid);
 	return detected;
-- 
cgit v1.1


From a6767b7cc674ee39635db75ed2f6f65ed0012239 Mon Sep 17 00:00:00 2001
From: Marko Kohtala <marko.kohtala@gmail.com>
Date: Fri, 6 Jan 2006 00:19:48 -0800
Subject: [PATCH] parport: constification

Trivial "const" additions to places in parport that truly are const.

Signed-off-by: Marko Kohtala <marko.kohtala@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/parport/parport_pc.c | 30 ++++++++++++++++++------------
 drivers/parport/probe.c      |  6 +++---
 2 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c
index c6493ad..18e85cc 100644
--- a/drivers/parport/parport_pc.c
+++ b/drivers/parport/parport_pc.c
@@ -1169,7 +1169,7 @@ dump_parport_state ("fwd idle", port);
 
 /* GCC is not inlining extern inline function later overwriten to non-inline,
    so we use outlined_ variants here.  */
-static struct parport_operations parport_pc_ops =
+static const struct parport_operations parport_pc_ops =
 {
 	.write_data	= parport_pc_write_data,
 	.read_data	= parport_pc_read_data,
@@ -1211,10 +1211,11 @@ static struct parport_operations parport_pc_ops =
 static void __devinit show_parconfig_smsc37c669(int io, int key)
 {
 	int cr1,cr4,cra,cr23,cr26,cr27,i=0;
-	static const char *modes[]={ "SPP and Bidirectional (PS/2)",	
-				     "EPP and SPP",
-				     "ECP",
-				     "ECP and EPP" };
+	static const char *const modes[]={
+		"SPP and Bidirectional (PS/2)",
+		"EPP and SPP",
+		"ECP",
+		"ECP and EPP" };
 
 	outb(key,io);
 	outb(key,io);
@@ -1288,7 +1289,7 @@ static void __devinit show_parconfig_smsc37c669(int io, int key)
 static void __devinit show_parconfig_winbond(int io, int key)
 {
 	int cr30,cr60,cr61,cr70,cr74,crf0,i=0;
-	static const char *modes[] = {
+	static const char *const modes[] = {
 		"Standard (SPP) and Bidirectional(PS/2)", /* 0 */
 		"EPP-1.9 and SPP",
 		"ECP",
@@ -1297,7 +1298,9 @@ static void __devinit show_parconfig_winbond(int io, int key)
 		"EPP-1.7 and SPP",		/* 5 */
 		"undefined!",
 		"ECP and EPP-1.7" };
-	static char *irqtypes[] = { "pulsed low, high-Z", "follows nACK" };
+	static char *const irqtypes[] = {
+		"pulsed low, high-Z",
+		"follows nACK" };
 		
 	/* The registers are called compatible-PnP because the
            register layout is modelled after ISA-PnP, the access
@@ -2396,7 +2399,8 @@ EXPORT_SYMBOL (parport_pc_unregister_port);
 
 /* ITE support maintained by Rich Liu <richliu@poorman.org> */
 static int __devinit sio_ite_8872_probe (struct pci_dev *pdev, int autoirq,
-					 int autodma, struct parport_pc_via_data *via)
+					 int autodma,
+					 const struct parport_pc_via_data *via)
 {
 	short inta_addr[6] = { 0x2A0, 0x2C0, 0x220, 0x240, 0x1E0 };
 	struct resource *base_res;
@@ -2524,7 +2528,8 @@ static struct parport_pc_via_data via_8231_data __devinitdata = {
 };
 
 static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq,
-					 int autodma, struct parport_pc_via_data *via)
+				    int autodma,
+				    const struct parport_pc_via_data *via)
 {
 	u8 tmp, tmp2, siofunc;
 	u8 ppcontrol = 0;
@@ -2694,8 +2699,9 @@ enum parport_pc_sio_types {
 
 /* each element directly indexed from enum list, above */
 static struct parport_pc_superio {
-	int (*probe) (struct pci_dev *pdev, int autoirq, int autodma, struct parport_pc_via_data *via);
-	struct parport_pc_via_data *via;
+	int (*probe) (struct pci_dev *pdev, int autoirq, int autodma,
+		      const struct parport_pc_via_data *via);
+	const struct parport_pc_via_data *via;
 } parport_pc_superio_info[] __devinitdata = {
 	{ sio_via_probe, &via_686a_data, },
 	{ sio_via_probe, &via_8231_data, },
@@ -2828,7 +2834,7 @@ static struct parport_pc_pci {
 	/* netmos_9815 */               { 2, { { 0, -1 }, { 2, -1 }, } }, /* untested */
 };
 
-static struct pci_device_id parport_pc_pci_tbl[] = {
+static const struct pci_device_id parport_pc_pci_tbl[] = {
 	/* Super-IO onboard chips */
 	{ 0x1106, 0x0686, PCI_ANY_ID, PCI_ANY_ID, 0, 0, sio_via_686a },
 	{ 0x1106, 0x8231, PCI_ANY_ID, PCI_ANY_ID, 0, 0, sio_via_8231 },
diff --git a/drivers/parport/probe.c b/drivers/parport/probe.c
index 5c29e82..b62aee8 100644
--- a/drivers/parport/probe.c
+++ b/drivers/parport/probe.c
@@ -11,9 +11,9 @@
 #include <linux/string.h>
 #include <asm/uaccess.h>
 
-static struct {
-	char *token;
-	char *descr;
+static const struct {
+	const char *token;
+	const char *descr;
 } classes[] = {
 	{ "",            "Legacy device" },
 	{ "PRINTER",     "Printer" },
-- 
cgit v1.1


From 110bee75d2e03d3b4bcc74743dee5a21fe7b43bd Mon Sep 17 00:00:00 2001
From: Marko Kohtala <marko.kohtala@gmail.com>
Date: Fri, 6 Jan 2006 00:19:49 -0800
Subject: [PATCH] parport: DEBUG_PARPORT build fix

Add missing "struct" keyword preventing compilation with DEBUG_PARPORT
defined.  Also add some "const".

Signed-off-by: Marko Kohtala <marko.kohtala@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/parport_pc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/parport_pc.h b/include/linux/parport_pc.h
index c6f7624..7e62b34 100644
--- a/include/linux/parport_pc.h
+++ b/include/linux/parport_pc.h
@@ -85,7 +85,7 @@ extern __inline__ void dump_parport_state (char *str, struct parport *p)
 	unsigned char ecr = inb (ECONTROL (p));
 	unsigned char dcr = inb (CONTROL (p));
 	unsigned char dsr = inb (STATUS (p));
-	static char *ecr_modes[] = {"SPP", "PS2", "PPFIFO", "ECP", "xXx", "yYy", "TST", "CFG"};
+	static const char *const ecr_modes[] = {"SPP", "PS2", "PPFIFO", "ECP", "xXx", "yYy", "TST", "CFG"};
 	const struct parport_pc_private *priv = p->physport->private_data;
 	int i;
 
-- 
cgit v1.1


From 6a19b41b35bf45fc27a46dccf26005b3f44c1aa1 Mon Sep 17 00:00:00 2001
From: Marko Kohtala <marko.kohtala@gmail.com>
Date: Fri, 6 Jan 2006 00:19:49 -0800
Subject: [PATCH] parport: Kconfig dependency fixes

Make drivers that directly use PC parport hardware depend on PARPORT_PC rather
than on the hardware-independent PARPORT.

Signed-off-by: Marko Kohtala <marko.kohtala@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/Kconfig        | 2 +-
 drivers/block/paride/Kconfig | 5 +++--
 drivers/scsi/Kconfig         | 8 ++++----
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index c4b9d2a..139cbba 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -117,7 +117,7 @@ config BLK_DEV_XD
 
 config PARIDE
 	tristate "Parallel port IDE device support"
-	depends on PARPORT
+	depends on PARPORT_PC
 	---help---
 	  There are many external CD-ROM and disk devices that connect through
 	  your computer's parallel port. Most of them are actually IDE devices
diff --git a/drivers/block/paride/Kconfig b/drivers/block/paride/Kconfig
index 17ff405..c0d2854 100644
--- a/drivers/block/paride/Kconfig
+++ b/drivers/block/paride/Kconfig
@@ -4,11 +4,12 @@
 # PARIDE doesn't need PARPORT, but if PARPORT is configured as a module,
 # PARIDE must also be a module.  The bogus CONFIG_PARIDE_PARPORT option
 # controls the choices given to the user ...
+# PARIDE only supports PC style parports. Tough for USB or other parports...
 config PARIDE_PARPORT
 	tristate
 	depends on PARIDE!=n
-	default m if PARPORT=m
-	default y if PARPORT!=m
+	default m if PARPORT_PC=m
+	default y if PARPORT_PC!=m
 
 comment "Parallel IDE high-level drivers"
 	depends on PARIDE
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 9e8254f..3c606cf 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -914,7 +914,7 @@ config SCSI_INIA100
 
 config SCSI_PPA
 	tristate "IOMEGA parallel port (ppa - older drives)"
-	depends on SCSI && PARPORT
+	depends on SCSI && PARPORT_PC
 	---help---
 	  This driver supports older versions of IOMEGA's parallel port ZIP
 	  drive (a 100 MB removable media device).
@@ -941,7 +941,7 @@ config SCSI_PPA
 
 config SCSI_IMM
 	tristate "IOMEGA parallel port (imm - newer drives)"
-	depends on SCSI && PARPORT
+	depends on SCSI && PARPORT_PC
 	---help---
 	  This driver supports newer versions of IOMEGA's parallel port ZIP
 	  drive (a 100 MB removable media device).
@@ -968,7 +968,7 @@ config SCSI_IMM
 
 config SCSI_IZIP_EPP16
 	bool "ppa/imm option - Use slow (but safe) EPP-16"
-	depends on PARPORT && (SCSI_PPA || SCSI_IMM)
+	depends on SCSI_PPA || SCSI_IMM
 	---help---
 	  EPP (Enhanced Parallel Port) is a standard for parallel ports which
 	  allows them to act as expansion buses that can handle up to 64
@@ -983,7 +983,7 @@ config SCSI_IZIP_EPP16
 
 config SCSI_IZIP_SLOW_CTR
 	bool "ppa/imm option - Assume slow parport control register"
-	depends on PARPORT && (SCSI_PPA || SCSI_IMM)
+	depends on SCSI_PPA || SCSI_IMM
 	help
 	  Some parallel ports are known to have excessive delays between
 	  changing the parallel port control register and good data being
-- 
cgit v1.1


From 6a85081d1c3ab7935c3ade8f4b2700a860d6fb2e Mon Sep 17 00:00:00 2001
From: Marko Kohtala <marko.kohtala@gmail.com>
Date: Fri, 6 Jan 2006 00:19:51 -0800
Subject: [PATCH] parport: include fixes

Small cleanup: drop includes that were meant for an older implementation.

Signed-off-by: Marko Kohtala <marko.kohtala@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/net/plip.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/plip.c b/drivers/net/plip.c
index 1bd22cd..87ee327 100644
--- a/drivers/net/plip.c
+++ b/drivers/net/plip.c
@@ -98,7 +98,6 @@ static const char version[] = "NET3 PLIP version 2.4-parport gniibe@mri.co.jp\n"
 #include <linux/in.h>
 #include <linux/errno.h>
 #include <linux/delay.h>
-#include <linux/lp.h>
 #include <linux/init.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
@@ -106,7 +105,6 @@ static const char version[] = "NET3 PLIP version 2.4-parport gniibe@mri.co.jp\n"
 #include <linux/skbuff.h>
 #include <linux/if_plip.h>
 #include <linux/workqueue.h>
-#include <linux/ioport.h>
 #include <linux/spinlock.h>
 #include <linux/parport.h>
 #include <linux/bitops.h>
-- 
cgit v1.1


From 94b82095d0f5d6a72a0c619f54645727ebf66642 Mon Sep 17 00:00:00 2001
From: Marko Kohtala <marko.kohtala@gmail.com>
Date: Fri, 6 Jan 2006 00:19:51 -0800
Subject: [PATCH] parport: export parport_get_port()

Help the external ppSCSI driver by exporting parport_get_port() to match the
already exported parport_put_port().

Signed-off-by: Marko Kohtala <marko.kohtala@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/parport/share.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/parport/share.c b/drivers/parport/share.c
index 9cb3ab1..ea62bed 100644
--- a/drivers/parport/share.c
+++ b/drivers/parport/share.c
@@ -1002,6 +1002,7 @@ EXPORT_SYMBOL(parport_register_driver);
 EXPORT_SYMBOL(parport_unregister_driver);
 EXPORT_SYMBOL(parport_register_device);
 EXPORT_SYMBOL(parport_unregister_device);
+EXPORT_SYMBOL(parport_get_port);
 EXPORT_SYMBOL(parport_put_port);
 EXPORT_SYMBOL(parport_find_number);
 EXPORT_SYMBOL(parport_find_base);
-- 
cgit v1.1


From a1b9168d83962fbb05859c1ecaa57fd4f53cf38e Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Fri, 6 Jan 2006 00:19:52 -0800
Subject: [PATCH] simplify PARPORT_PC_PCMCIA dependencies

Unless I miss something, this should be the simplest way to express the
intended dependencies.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/parport/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/parport/Kconfig b/drivers/parport/Kconfig
index 725a141..b824156 100644
--- a/drivers/parport/Kconfig
+++ b/drivers/parport/Kconfig
@@ -77,7 +77,7 @@ config PARPORT_PC_SUPERIO
 
 config PARPORT_PC_PCMCIA
 	tristate "Support for PCMCIA management for PC-style ports"
-	depends on PARPORT!=n && (PCMCIA!=n && PARPORT_PC=m && PARPORT_PC || PARPORT_PC=y && PCMCIA)
+	depends on PCMCIA && PARPORT_PC
 	help
 	  Say Y here if you need PCMCIA support for your PC-style parallel
 	  ports. If unsure, say N.
-- 
cgit v1.1


From 81684ee645e15601ec935461d9069a3086179c06 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Fri, 6 Jan 2006 00:19:53 -0800
Subject: [PATCH] include/linux/parport_pc.h: "extern inline" -> "static
 inline"

"extern inline" doesn't make much sense.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/parport_pc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/parport_pc.h b/include/linux/parport_pc.h
index 7e62b34..1cc0f6b 100644
--- a/include/linux/parport_pc.h
+++ b/include/linux/parport_pc.h
@@ -79,7 +79,7 @@ static __inline__ unsigned char parport_pc_read_data(struct parport *p)
 }
 
 #ifdef DEBUG_PARPORT
-extern __inline__ void dump_parport_state (char *str, struct parport *p)
+static inline void dump_parport_state (char *str, struct parport *p)
 {
 	/* here's hoping that reading these ports won't side-effect anything underneath */
 	unsigned char ecr = inb (ECONTROL (p));
-- 
cgit v1.1


From 066bb8d03b6e52e4844d37145573d6a2bedaa339 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@us.ibm.com>
Date: Fri, 6 Jan 2006 00:19:53 -0800
Subject: [PATCH] fix remaining list_for_each_safe_rcu in -mm (take 2)

I missed a use of list_for_each_safe_rcu() in the -mm tree.  Here is an updated
patch to fix it.  This time tested on a machine that actually uses IPMI...
(Thanks to Serge Hallyn for spotting this.)

Signed-off-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Corey Minyard <minyard@acm.org>
Cc: Matt Domsch <Matt_Domsch@dell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/ipmi/ipmi_msghandler.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index 1f56b4c..561430e 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -787,7 +787,6 @@ int ipmi_destroy_user(ipmi_user_t user)
 	int              i;
 	unsigned long    flags;
 	struct cmd_rcvr  *rcvr;
-	struct list_head *entry1, *entry2;
 	struct cmd_rcvr  *rcvrs = NULL;
 
 	user->valid = 1;
@@ -812,8 +811,7 @@ int ipmi_destroy_user(ipmi_user_t user)
 	 * synchronize_rcu()) then free everything in that list.
 	 */
 	down(&intf->cmd_rcvrs_lock);
-	list_for_each_safe_rcu(entry1, entry2, &intf->cmd_rcvrs) {
-		rcvr = list_entry(entry1, struct cmd_rcvr, link);
+	list_for_each_entry_rcu(rcvr, &intf->cmd_rcvrs, link) {
 		if (rcvr->user == user) {
 			list_del_rcu(&rcvr->link);
 			rcvr->next = rcvrs;
-- 
cgit v1.1


From 6fe2e70bbed3995d930f39452fb6ce3be7dc47dc Mon Sep 17 00:00:00 2001
From: Jayachandran C <c.jayachandran@gmail.com>
Date: Fri, 6 Jan 2006 00:19:54 -0800
Subject: [PATCH] kernel/module.c: removed dead code

This patch fixes an issue reported by Coverity in kernel/module.c

Error reported: Cannot reach this line of code "else return ptr;"

Patch description:
  This is the error path, so 'err' will be negative and the else case
  is not required; this patch removes it.

Signed-off-by: Jayachandran C. <c.jayachandran@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/module.c b/kernel/module.c
index 2ea929d..4b06bba 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1854,8 +1854,7 @@ static struct module *load_module(void __user *umod,
 	kfree(args);
  free_hdr:
 	vfree(hdr);
-	if (err < 0) return ERR_PTR(err);
-	else return ptr;
+	return ERR_PTR(err);
 
  truncated:
 	printk(KERN_ERR "Module len %lu truncated\n", len);
-- 
cgit v1.1


From f93ea411b73594f7d144855fd34278bcf34a9afc Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 6 Jan 2006 00:19:55 -0800
Subject: [PATCH] jbd: split checkpoint lists

Split the checkpoint list of the transaction into two lists.  In the first
list we keep the buffers that need to be submitted for IO.  In the second
list are kept buffers that were already submitted and we just have to wait
for the IO to complete.  This should simplify the handling of checkpoint
lists a bit and may eventually also bring a performance gain.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jbd/checkpoint.c | 418 ++++++++++++++++++++++++++++++----------------------
 include/linux/jbd.h |   8 +-
 2 files changed, 248 insertions(+), 178 deletions(-)

diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 014a51f..cb3cef5 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -24,29 +24,75 @@
 #include <linux/slab.h>
 
 /*
- * Unlink a buffer from a transaction. 
+ * Unlink a buffer from a transaction checkpoint list.
  *
  * Called with j_list_lock held.
  */
 
-static inline void __buffer_unlink(struct journal_head *jh)
+static void __buffer_unlink_first(struct journal_head *jh)
 {
 	transaction_t *transaction;
 
 	transaction = jh->b_cp_transaction;
-	jh->b_cp_transaction = NULL;
 
 	jh->b_cpnext->b_cpprev = jh->b_cpprev;
 	jh->b_cpprev->b_cpnext = jh->b_cpnext;
-	if (transaction->t_checkpoint_list == jh)
+	if (transaction->t_checkpoint_list == jh) {
 		transaction->t_checkpoint_list = jh->b_cpnext;
-	if (transaction->t_checkpoint_list == jh)
-		transaction->t_checkpoint_list = NULL;
+		if (transaction->t_checkpoint_list == jh)
+			transaction->t_checkpoint_list = NULL;
+	}
+}
+
+/*
+ * Unlink a buffer from a transaction checkpoint(io) list.
+ *
+ * Called with j_list_lock held.
+ */
+
+static inline void __buffer_unlink(struct journal_head *jh)
+{
+	transaction_t *transaction;
+
+	transaction = jh->b_cp_transaction;
+
+	__buffer_unlink_first(jh);
+	if (transaction->t_checkpoint_io_list == jh) {
+		transaction->t_checkpoint_io_list = jh->b_cpnext;
+		if (transaction->t_checkpoint_io_list == jh)
+			transaction->t_checkpoint_io_list = NULL;
+	}
+}
+
+/*
+ * Move a buffer from the checkpoint list to the checkpoint io list
+ *
+ * Called with j_list_lock held
+ */
+
+static inline void __buffer_relink_io(struct journal_head *jh)
+{
+	transaction_t *transaction;
+
+	transaction = jh->b_cp_transaction;
+	__buffer_unlink_first(jh);
+
+	if (!transaction->t_checkpoint_io_list) {
+		jh->b_cpnext = jh->b_cpprev = jh;
+	} else {
+		jh->b_cpnext = transaction->t_checkpoint_io_list;
+		jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
+		jh->b_cpprev->b_cpnext = jh;
+		jh->b_cpnext->b_cpprev = jh;
+	}
+	transaction->t_checkpoint_io_list = jh;
 }
 
 /*
  * Try to release a checkpointed buffer from its transaction.
- * Returns 1 if we released it.
+ * Returns 1 if we released it and 2 if we also released the
+ * whole transaction.
+ *
  * Requires j_list_lock
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
@@ -57,12 +103,11 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 
 	if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
-		__journal_remove_checkpoint(jh);
+		ret = __journal_remove_checkpoint(jh) + 1;
 		jbd_unlock_bh_state(bh);
 		journal_remove_journal_head(bh);
 		BUFFER_TRACE(bh, "release");
 		__brelse(bh);
-		ret = 1;
 	} else {
 		jbd_unlock_bh_state(bh);
 	}
@@ -117,83 +162,53 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
 }
 
 /*
- * Clean up a transaction's checkpoint list.  
- *
- * We wait for any pending IO to complete and make sure any clean
- * buffers are removed from the transaction. 
- *
- * Return 1 if we performed any actions which might have destroyed the
- * checkpoint.  (journal_remove_checkpoint() deletes the transaction when
- * the last checkpoint buffer is cleansed)
+ * Clean up transaction's list of buffers submitted for io.
+ * We wait for any pending IO to complete and remove any clean
+ * buffers. Note that we take the buffers in the opposite ordering
+ * from the one in which they were submitted for IO.
  *
  * Called with j_list_lock held.
  */
-static int __cleanup_transaction(journal_t *journal, transaction_t *transaction)
+
+static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
 {
-	struct journal_head *jh, *next_jh, *last_jh;
+	struct journal_head *jh;
 	struct buffer_head *bh;
-	int ret = 0;
-
-	assert_spin_locked(&journal->j_list_lock);
-	jh = transaction->t_checkpoint_list;
-	if (!jh)
-		return 0;
-
-	last_jh = jh->b_cpprev;
-	next_jh = jh;
-	do {
-		jh = next_jh;
+	tid_t this_tid;
+	int released = 0;
+
+	this_tid = transaction->t_tid;
+restart:
+	/* Didn't somebody clean up the transaction in the meanwhile */
+	if (journal->j_checkpoint_transactions != transaction ||
+		transaction->t_tid != this_tid)
+		return;
+	while (!released && transaction->t_checkpoint_io_list) {
+		jh = transaction->t_checkpoint_io_list;
 		bh = jh2bh(jh);
+		if (!jbd_trylock_bh_state(bh)) {
+			jbd_sync_bh(journal, bh);
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+		}
 		if (buffer_locked(bh)) {
 			atomic_inc(&bh->b_count);
 			spin_unlock(&journal->j_list_lock);
+			jbd_unlock_bh_state(bh);
 			wait_on_buffer(bh);
 			/* the journal_head may have gone by now */
 			BUFFER_TRACE(bh, "brelse");
 			__brelse(bh);
-			goto out_return_1;
-		}
-
-		/*
-		 * This is foul
-		 */
-		if (!jbd_trylock_bh_state(bh)) {
-			jbd_sync_bh(journal, bh);
-			goto out_return_1;
+			spin_lock(&journal->j_list_lock);
+			goto restart;
 		}
-
-		if (jh->b_transaction != NULL) {
-			transaction_t *t = jh->b_transaction;
-			tid_t tid = t->t_tid;
-
-			spin_unlock(&journal->j_list_lock);
-			jbd_unlock_bh_state(bh);
-			log_start_commit(journal, tid);
-			log_wait_commit(journal, tid);
-			goto out_return_1;
-		}
-
 		/*
-		 * AKPM: I think the buffer_jbddirty test is redundant - it
-		 * shouldn't have NULL b_transaction?
+		 * Now in whatever state the buffer currently is, we know that
+		 * it has been written out and so we can drop it from the list
 		 */
-		next_jh = jh->b_cpnext;
-		if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) {
-			BUFFER_TRACE(bh, "remove from checkpoint");
-			__journal_remove_checkpoint(jh);
-			jbd_unlock_bh_state(bh);
-			journal_remove_journal_head(bh);
-			__brelse(bh);
-			ret = 1;
-		} else {
-			jbd_unlock_bh_state(bh);
-		}
-	} while (jh != last_jh);
-
-	return ret;
-out_return_1:
-	spin_lock(&journal->j_list_lock);
-	return 1;
+		released = __journal_remove_checkpoint(jh);
+		jbd_unlock_bh_state(bh);
+	}
 }
 
 #define NR_BATCH	64
@@ -203,9 +218,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
 {
 	int i;
 
-	spin_unlock(&journal->j_list_lock);
 	ll_rw_block(SWRITE, *batch_count, bhs);
-	spin_lock(&journal->j_list_lock);
 	for (i = 0; i < *batch_count; i++) {
 		struct buffer_head *bh = bhs[i];
 		clear_buffer_jwrite(bh);
@@ -221,19 +234,46 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
  * Return 1 if something happened which requires us to abort the current
  * scan of the checkpoint list.  
  *
- * Called with j_list_lock held.
+ * Called with j_list_lock held and drops it if 1 is returned
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
-static int __flush_buffer(journal_t *journal, struct journal_head *jh,
-			struct buffer_head **bhs, int *batch_count,
-			int *drop_count)
+static int __process_buffer(journal_t *journal, struct journal_head *jh,
+			struct buffer_head **bhs, int *batch_count)
 {
 	struct buffer_head *bh = jh2bh(jh);
 	int ret = 0;
 
-	if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) {
-		J_ASSERT_JH(jh, jh->b_transaction == NULL);
+	if (buffer_locked(bh)) {
+		get_bh(bh);
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		wait_on_buffer(bh);
+		/* the journal_head may have gone by now */
+		BUFFER_TRACE(bh, "brelse");
+		put_bh(bh);
+		ret = 1;
+	}
+	else if (jh->b_transaction != NULL) {
+		transaction_t *t = jh->b_transaction;
+		tid_t tid = t->t_tid;
 
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		log_start_commit(journal, tid);
+		log_wait_commit(journal, tid);
+		ret = 1;
+	}
+	else if (!buffer_dirty(bh)) {
+		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
+		BUFFER_TRACE(bh, "remove from checkpoint");
+		__journal_remove_checkpoint(jh);
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		journal_remove_journal_head(bh);
+		put_bh(bh);
+		ret = 1;
+	}
+	else {
 		/*
 		 * Important: we are about to write the buffer, and
 		 * possibly block, while still holding the journal lock.
@@ -246,45 +286,30 @@ static int __flush_buffer(journal_t *journal, struct journal_head *jh,
 		J_ASSERT_BH(bh, !buffer_jwrite(bh));
 		set_buffer_jwrite(bh);
 		bhs[*batch_count] = bh;
+		__buffer_relink_io(jh);
 		jbd_unlock_bh_state(bh);
 		(*batch_count)++;
 		if (*batch_count == NR_BATCH) {
+			spin_unlock(&journal->j_list_lock);
 			__flush_batch(journal, bhs, batch_count);
 			ret = 1;
 		}
-	} else {
-		int last_buffer = 0;
-		if (jh->b_cpnext == jh) {
-			/* We may be about to drop the transaction.  Tell the
-			 * caller that the lists have changed.
-			 */
-			last_buffer = 1;
-		}
-		if (__try_to_free_cp_buf(jh)) {
-			(*drop_count)++;
-			ret = last_buffer;
-		}
 	}
 	return ret;
 }
 
 /*
- * Perform an actual checkpoint.  We don't write out only enough to
- * satisfy the current blocked requests: rather we submit a reasonably
- * sized chunk of the outstanding data to disk at once for
- * efficiency.  __log_wait_for_space() will retry if we didn't free enough.
+ * Perform an actual checkpoint. We take the first transaction on the
+ * list of transactions to be checkpointed and send all its buffers
+ * to disk. We submit larger chunks of data at once.
  * 
- * However, we _do_ take into account the amount requested so that once
- * the IO has been queued, we can return as soon as enough of it has
- * completed to disk.  
- *
  * The journal should be locked before calling this function.
  */
 int log_do_checkpoint(journal_t *journal)
 {
+	transaction_t *transaction;
+	tid_t this_tid;
 	int result;
-	int batch_count = 0;
-	struct buffer_head *bhs[NR_BATCH];
 
 	jbd_debug(1, "Start checkpoint\n");
 
@@ -299,79 +324,70 @@ int log_do_checkpoint(journal_t *journal)
 		return result;
 
 	/*
-	 * OK, we need to start writing disk blocks.  Try to free up a
-	 * quarter of the log in a single checkpoint if we can.
+	 * OK, we need to start writing disk blocks.  Take one transaction
+	 * and write it.
 	 */
+	spin_lock(&journal->j_list_lock);
+	if (!journal->j_checkpoint_transactions)
+		goto out;
+	transaction = journal->j_checkpoint_transactions;
+	this_tid = transaction->t_tid;
+restart:
 	/*
-	 * AKPM: check this code.  I had a feeling a while back that it
-	 * degenerates into a busy loop at unmount time.
+	 * If someone cleaned up this transaction while we slept, we're
+	 * done (maybe it's a new transaction, but it fell at the same
+	 * address).
 	 */
-	spin_lock(&journal->j_list_lock);
-	while (journal->j_checkpoint_transactions) {
-		transaction_t *transaction;
-		struct journal_head *jh, *last_jh, *next_jh;
-		int drop_count = 0;
-		int cleanup_ret, retry = 0;
-		tid_t this_tid;
-
-		transaction = journal->j_checkpoint_transactions;
-		this_tid = transaction->t_tid;
-		jh = transaction->t_checkpoint_list;
-		last_jh = jh->b_cpprev;
-		next_jh = jh;
-		do {
+ 	if (journal->j_checkpoint_transactions == transaction ||
+			transaction->t_tid == this_tid) {
+		int batch_count = 0;
+		struct buffer_head *bhs[NR_BATCH];
+		struct journal_head *jh;
+		int retry = 0;
+
+		while (!retry && transaction->t_checkpoint_list) {
 			struct buffer_head *bh;
 
-			jh = next_jh;
-			next_jh = jh->b_cpnext;
+			jh = transaction->t_checkpoint_list;
 			bh = jh2bh(jh);
 			if (!jbd_trylock_bh_state(bh)) {
 				jbd_sync_bh(journal, bh);
-				spin_lock(&journal->j_list_lock);
 				retry = 1;
 				break;
 			}
-			retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count);
-			if (cond_resched_lock(&journal->j_list_lock)) {
+			retry = __process_buffer(journal, jh, bhs,
+						&batch_count);
+			if (!retry &&
+			    lock_need_resched(&journal->j_list_lock)) {
+				spin_unlock(&journal->j_list_lock);
 				retry = 1;
 				break;
 			}
-		} while (jh != last_jh && !retry);
+		}
 
 		if (batch_count) {
+			if (!retry) {
+				spin_unlock(&journal->j_list_lock);
+				retry = 1;
+			}
 			__flush_batch(journal, bhs, &batch_count);
-			retry = 1;
 		}
 
+		if (retry) {
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+		}
 		/*
-		 * If someone cleaned up this transaction while we slept, we're
-		 * done
-		 */
-		if (journal->j_checkpoint_transactions != transaction)
-			break;
-		if (retry)
-			continue;
-		/*
-		 * Maybe it's a new transaction, but it fell at the same
-		 * address
-		 */
-		if (transaction->t_tid != this_tid)
-			continue;
-		/*
-		 * We have walked the whole transaction list without
-		 * finding anything to write to disk.  We had better be
-		 * able to make some progress or we are in trouble. 
+		 * Now we have cleaned up the first transaction's checkpoint
+		 * list.  Let's clean up the second one.
 		 */
-		cleanup_ret = __cleanup_transaction(journal, transaction);
-		J_ASSERT(drop_count != 0 || cleanup_ret != 0);
-		if (journal->j_checkpoint_transactions != transaction)
-			break;
+		__wait_cp_io(journal, transaction);
 	}
+out:
 	spin_unlock(&journal->j_list_lock);
 	result = cleanup_journal_tail(journal);
 	if (result < 0)
 		return result;
-
 	return 0;
 }
 
@@ -456,52 +472,91 @@ int cleanup_journal_tail(journal_t *journal)
 /* Checkpoint list management */
 
 /*
+ * journal_clean_one_cp_list
+ *
+ * Find all the written-back checkpoint buffers in the given list and release them.
+ *
+ * Called with the journal locked.
+ * Called with j_list_lock held.
+ * Returns number of buffers reaped (for debug)
+ */
+
+static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
+{
+	struct journal_head *last_jh;
+	struct journal_head *next_jh = jh;
+	int ret, freed = 0;
+
+	*released = 0;
+	if (!jh)
+		return 0;
+
+ 	last_jh = jh->b_cpprev;
+	do {
+		jh = next_jh;
+		next_jh = jh->b_cpnext;
+		/* Use trylock because of the ranking */
+		if (jbd_trylock_bh_state(jh2bh(jh))) {
+			ret = __try_to_free_cp_buf(jh);
+			if (ret) {
+				freed++;
+				if (ret == 2) {
+					*released = 1;
+					return freed;
+				}
+			}
+		}
+		/*
+		 * This function only frees up some memory if possible so we
+		 * dont have an obligation to finish processing. Bail out if
+		 * preemption requested:
+		 */
+		if (need_resched())
+			return freed;
+	} while (jh != last_jh);
+
+	return freed;
+}
+
+/*
  * journal_clean_checkpoint_list
  *
  * Find all the written-back checkpoint buffers in the journal and release them.
  *
  * Called with the journal locked.
  * Called with j_list_lock held.
- * Returns number of bufers reaped (for debug)
+ * Returns number of buffers reaped (for debug)
  */
 
 int __journal_clean_checkpoint_list(journal_t *journal)
 {
 	transaction_t *transaction, *last_transaction, *next_transaction;
-	int ret = 0;
+	int ret = 0, released;
 
 	transaction = journal->j_checkpoint_transactions;
-	if (transaction == 0)
+	if (!transaction)
 		goto out;
 
 	last_transaction = transaction->t_cpprev;
 	next_transaction = transaction;
 	do {
-		struct journal_head *jh;
-
 		transaction = next_transaction;
 		next_transaction = transaction->t_cpnext;
-		jh = transaction->t_checkpoint_list;
-		if (jh) {
-			struct journal_head *last_jh = jh->b_cpprev;
-			struct journal_head *next_jh = jh;
-
-			do {
-				jh = next_jh;
-				next_jh = jh->b_cpnext;
-				/* Use trylock because of the ranknig */
-				if (jbd_trylock_bh_state(jh2bh(jh)))
-					ret += __try_to_free_cp_buf(jh);
-				/*
-				 * This function only frees up some memory
-				 * if possible so we dont have an obligation
-				 * to finish processing. Bail out if preemption
-				 * requested:
-				 */
-				if (need_resched())
-					goto out;
-			} while (jh != last_jh);
-		}
+		ret += journal_clean_one_cp_list(transaction->
+				t_checkpoint_list, &released);
+		if (need_resched())
+			goto out;
+		if (released)
+			continue;
+		/*
+		 * It is essential that we are as careful as in the case of
+		 * t_checkpoint_list with removing the buffer from the list as
+		 * we can possibly see not yet submitted buffers on io_list
+		 */
+		ret += journal_clean_one_cp_list(transaction->
+				t_checkpoint_io_list, &released);
+		if (need_resched())
+			goto out;
 	} while (transaction != last_transaction);
 out:
 	return ret;
@@ -516,18 +571,22 @@ out:
  * buffer updates committed in that transaction have safely been stored
  * elsewhere on disk.  To achieve this, all of the buffers in a
  * transaction need to be maintained on the transaction's checkpoint
- * list until they have been rewritten, at which point this function is
+ * lists until they have been rewritten, at which point this function is
  * called to remove the buffer from the existing transaction's
- * checkpoint list.  
+ * checkpoint lists.
+ *
+ * The function returns 1 if it frees the transaction, 0 otherwise.
  *
  * This function is called with the journal locked.
  * This function is called with j_list_lock held.
+ * This function is called with jbd_lock_bh_state(jh2bh(jh))
  */
 
-void __journal_remove_checkpoint(struct journal_head *jh)
+int __journal_remove_checkpoint(struct journal_head *jh)
 {
 	transaction_t *transaction;
 	journal_t *journal;
+	int ret = 0;
 
 	JBUFFER_TRACE(jh, "entry");
 
@@ -538,8 +597,10 @@ void __journal_remove_checkpoint(struct journal_head *jh)
 	journal = transaction->t_journal;
 
 	__buffer_unlink(jh);
+	jh->b_cp_transaction = NULL;
 
-	if (transaction->t_checkpoint_list != NULL)
+	if (transaction->t_checkpoint_list != NULL ||
+	    transaction->t_checkpoint_io_list != NULL)
 		goto out;
 	JBUFFER_TRACE(jh, "transaction has no more buffers");
 
@@ -565,8 +626,10 @@ void __journal_remove_checkpoint(struct journal_head *jh)
 	/* Just in case anybody was waiting for more transactions to be
            checkpointed... */
 	wake_up(&journal->j_wait_logspace);
+	ret = 1;
 out:
 	JBUFFER_TRACE(jh, "exit");
+	return ret;
 }
 
 /*
@@ -628,6 +691,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
 	J_ASSERT(transaction->t_shadow_list == NULL);
 	J_ASSERT(transaction->t_log_list == NULL);
 	J_ASSERT(transaction->t_checkpoint_list == NULL);
+	J_ASSERT(transaction->t_checkpoint_io_list == NULL);
 	J_ASSERT(transaction->t_updates == 0);
 	J_ASSERT(journal->j_committing_transaction != transaction);
 	J_ASSERT(journal->j_running_transaction != transaction);
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index dcde7ad..558cb4c 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -498,6 +498,12 @@ struct transaction_s
 	struct journal_head	*t_checkpoint_list;
 
 	/*
+	 * Doubly-linked circular list of all buffers submitted for IO while
+	 * checkpointing. [j_list_lock]
+	 */
+	struct journal_head	*t_checkpoint_io_list;
+
+	/*
 	 * Doubly-linked circular list of temporary buffers currently undergoing
 	 * IO in the log [j_list_lock]
 	 */
@@ -843,7 +849,7 @@ extern void journal_commit_transaction(journal_t *);
 
 /* Checkpoint list management */
 int __journal_clean_checkpoint_list(journal_t *journal);
-void __journal_remove_checkpoint(struct journal_head *);
+int __journal_remove_checkpoint(struct journal_head *);
 void __journal_insert_checkpoint(struct journal_head *, transaction_t *);
 
 /* Buffer IO */
-- 
cgit v1.1


From 93fbf1a5de8afde08988dda3735669099dee84d0 Mon Sep 17 00:00:00 2001
From: Olaf Kirch <okir@suse.de>
Date: Fri, 6 Jan 2006 00:19:56 -0800
Subject: [PATCH] Keep nfsd from exiting when seeing recv() errors

I submitted this one previously - svc_tcp_recvfrom currently returns
any errors to the caller, including ECONNRESET and the like.

This is something svc_recv isn't able to deal with:

	len = svsk->sk_recvfrom(rqstp);
	[...]
	if (len == 0 || len == -EAGAIN) {
		[...]
		return -EAGAIN;
	}

	[...]
	return len;

The nfsd main loop will exit when it sees an error code other than
EAGAIN.

The following patch fixes this problem.

svc_recv is not equipped to deal with error codes other than EAGAIN,
and will propagate anything else (such as ECONNRESET) up to nfsd,
causing it to exit.

Signed-off-by: Olaf Kirch <okir@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 net/sunrpc/svcsock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index d68eba4..e67613e 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1026,7 +1026,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
 	} else {
 		printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
 					svsk->sk_server->sv_name, -len);
-		svc_sock_received(svsk);
+		goto err_delete;
 	}
 
 	return len;
-- 
cgit v1.1


From a334de28665b14f0a33df82699fa9a78cfeedf31 Mon Sep 17 00:00:00 2001
From: David Shaw <dshaw@jabberwocky.com>
Date: Fri, 6 Jan 2006 00:19:58 -0800
Subject: [PATCH] knfsd: check error status from vfs_getattr and i_op->fsync

Both vfs_getattr and i_op->fsync return error statuses which nfsd was
largely ignoring.  This was noticed when exporting directories using fuse.

This patch cleans up most of the offences, which involves moving the call
to vfs_getattr out of the xdr encoding routines (where it is too late to
report an error) into the main NFS procedure handling routines.

There is still a call to vfs_getattr (related to the ACL code) where the
status is ignored, and calls to nfsd_sync_dir don't check the return status
either.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs3proc.c        | 11 +++++++++--
 fs/nfsd/nfs3xdr.c         | 47 ++++++++++++++++++++++++----------------------
 fs/nfsd/nfsxdr.c          | 48 +++++++++++++++++++++++------------------------
 fs/nfsd/vfs.c             | 20 +++++++++++++-------
 include/linux/nfsd/xdr.h  |  3 +++
 include/linux/nfsd/xdr3.h |  1 +
 6 files changed, 75 insertions(+), 55 deletions(-)

diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 041380f..6d2dfed 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -56,13 +56,20 @@ static int
 nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle  *argp,
 					   struct nfsd3_attrstat *resp)
 {
-	int	nfserr;
+	int	err, nfserr;
 
 	dprintk("nfsd: GETATTR(3)  %s\n",
-				SVCFH_fmt(&argp->fh));
+		SVCFH_fmt(&argp->fh));
 
 	fh_copy(&resp->fh, &argp->fh);
 	nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
+	if (nfserr)
+		RETURN_STATUS(nfserr);
+
+	err = vfs_getattr(resp->fh.fh_export->ex_mnt,
+			  resp->fh.fh_dentry, &resp->stat);
+	nfserr = nfserrno(err);
+
 	RETURN_STATUS(nfserr);
 }
 
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 9147b85..243d94b 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -154,37 +154,34 @@ decode_sattr3(u32 *p, struct iattr *iap)
 }
 
 static inline u32 *
-encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
+	      struct kstat *stat)
 {
-	struct vfsmount *mnt = fhp->fh_export->ex_mnt;
 	struct dentry	*dentry = fhp->fh_dentry;
-	struct kstat stat;
 	struct timespec time;
 
-	vfs_getattr(mnt, dentry, &stat);
-
-	*p++ = htonl(nfs3_ftypes[(stat.mode & S_IFMT) >> 12]);
-	*p++ = htonl((u32) stat.mode);
-	*p++ = htonl((u32) stat.nlink);
-	*p++ = htonl((u32) nfsd_ruid(rqstp, stat.uid));
-	*p++ = htonl((u32) nfsd_rgid(rqstp, stat.gid));
-	if (S_ISLNK(stat.mode) && stat.size > NFS3_MAXPATHLEN) {
+	*p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]);
+	*p++ = htonl((u32) stat->mode);
+	*p++ = htonl((u32) stat->nlink);
+	*p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid));
+	*p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid));
+	if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) {
 		p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN);
 	} else {
-		p = xdr_encode_hyper(p, (u64) stat.size);
+		p = xdr_encode_hyper(p, (u64) stat->size);
 	}
-	p = xdr_encode_hyper(p, ((u64)stat.blocks) << 9);
-	*p++ = htonl((u32) MAJOR(stat.rdev));
-	*p++ = htonl((u32) MINOR(stat.rdev));
+	p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9);
+	*p++ = htonl((u32) MAJOR(stat->rdev));
+	*p++ = htonl((u32) MINOR(stat->rdev));
 	if (is_fsid(fhp, rqstp->rq_reffh))
 		p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid);
 	else
-		p = xdr_encode_hyper(p, (u64) huge_encode_dev(stat.dev));
-	p = xdr_encode_hyper(p, (u64) stat.ino);
-	p = encode_time3(p, &stat.atime);
+		p = xdr_encode_hyper(p, (u64) huge_encode_dev(stat->dev));
+	p = xdr_encode_hyper(p, (u64) stat->ino);
+	p = encode_time3(p, &stat->atime);
 	lease_get_mtime(dentry->d_inode, &time); 
 	p = encode_time3(p, &time);
-	p = encode_time3(p, &stat.ctime);
+	p = encode_time3(p, &stat->ctime);
 
 	return p;
 }
@@ -232,8 +229,14 @@ encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 {
 	struct dentry *dentry = fhp->fh_dentry;
 	if (dentry && dentry->d_inode != NULL) {
-		*p++ = xdr_one;		/* attributes follow */
-		return encode_fattr3(rqstp, p, fhp);
+	        int err;
+		struct kstat stat;
+
+		err = vfs_getattr(fhp->fh_export->ex_mnt, dentry, &stat);
+		if (!err) {
+			*p++ = xdr_one;		/* attributes follow */
+			return encode_fattr3(rqstp, p, fhp, &stat);
+		}
 	}
 	*p++ = xdr_zero;
 	return p;
@@ -616,7 +619,7 @@ nfs3svc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
 					struct nfsd3_attrstat *resp)
 {
 	if (resp->status == 0)
-		p = encode_fattr3(rqstp, p, &resp->fh);
+		p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat);
 	return xdr_ressize_check(rqstp, p);
 }
 
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index b45999f..aa7bb41 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -152,46 +152,44 @@ decode_sattr(u32 *p, struct iattr *iap)
 }
 
 static inline u32 *
-encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
+	     struct kstat *stat)
 {
-	struct vfsmount *mnt = fhp->fh_export->ex_mnt;
 	struct dentry	*dentry = fhp->fh_dentry;
-	struct kstat stat;
 	int type;
 	struct timespec time;
 
-	vfs_getattr(mnt, dentry, &stat);
-	type = (stat.mode & S_IFMT);
+	type = (stat->mode & S_IFMT);
 
 	*p++ = htonl(nfs_ftypes[type >> 12]);
-	*p++ = htonl((u32) stat.mode);
-	*p++ = htonl((u32) stat.nlink);
-	*p++ = htonl((u32) nfsd_ruid(rqstp, stat.uid));
-	*p++ = htonl((u32) nfsd_rgid(rqstp, stat.gid));
+	*p++ = htonl((u32) stat->mode);
+	*p++ = htonl((u32) stat->nlink);
+	*p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid));
+	*p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid));
 
-	if (S_ISLNK(type) && stat.size > NFS_MAXPATHLEN) {
+	if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) {
 		*p++ = htonl(NFS_MAXPATHLEN);
 	} else {
-		*p++ = htonl((u32) stat.size);
+		*p++ = htonl((u32) stat->size);
 	}
-	*p++ = htonl((u32) stat.blksize);
+	*p++ = htonl((u32) stat->blksize);
 	if (S_ISCHR(type) || S_ISBLK(type))
-		*p++ = htonl(new_encode_dev(stat.rdev));
+		*p++ = htonl(new_encode_dev(stat->rdev));
 	else
 		*p++ = htonl(0xffffffff);
-	*p++ = htonl((u32) stat.blocks);
+	*p++ = htonl((u32) stat->blocks);
 	if (is_fsid(fhp, rqstp->rq_reffh))
 		*p++ = htonl((u32) fhp->fh_export->ex_fsid);
 	else
-		*p++ = htonl(new_encode_dev(stat.dev));
-	*p++ = htonl((u32) stat.ino);
-	*p++ = htonl((u32) stat.atime.tv_sec);
-	*p++ = htonl(stat.atime.tv_nsec ? stat.atime.tv_nsec / 1000 : 0);
+		*p++ = htonl(new_encode_dev(stat->dev));
+	*p++ = htonl((u32) stat->ino);
+	*p++ = htonl((u32) stat->atime.tv_sec);
+	*p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0);
 	lease_get_mtime(dentry->d_inode, &time); 
 	*p++ = htonl((u32) time.tv_sec);
 	*p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0); 
-	*p++ = htonl((u32) stat.ctime.tv_sec);
-	*p++ = htonl(stat.ctime.tv_nsec ? stat.ctime.tv_nsec / 1000 : 0);
+	*p++ = htonl((u32) stat->ctime.tv_sec);
+	*p++ = htonl(stat->ctime.tv_nsec ? stat->ctime.tv_nsec / 1000 : 0);
 
 	return p;
 }
@@ -199,7 +197,9 @@ encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 /* Helper function for NFSv2 ACL code */
 u32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 {
-	return encode_fattr(rqstp, p, fhp);
+	struct kstat stat;
+	vfs_getattr(fhp->fh_export->ex_mnt, fhp->fh_dentry, &stat);
+	return encode_fattr(rqstp, p, fhp, &stat);
 }
 
 /*
@@ -394,7 +394,7 @@ int
 nfssvc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
 					struct nfsd_attrstat *resp)
 {
-	p = encode_fattr(rqstp, p, &resp->fh);
+	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
 	return xdr_ressize_check(rqstp, p);
 }
 
@@ -403,7 +403,7 @@ nfssvc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
 					struct nfsd_diropres *resp)
 {
 	p = encode_fh(p, &resp->fh);
-	p = encode_fattr(rqstp, p, &resp->fh);
+	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
 	return xdr_ressize_check(rqstp, p);
 }
 
@@ -428,7 +428,7 @@ int
 nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p,
 					struct nfsd_readres *resp)
 {
-	p = encode_fattr(rqstp, p, &resp->fh);
+	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
 	*p++ = htonl(resp->count);
 	xdr_ressize_check(rqstp, p);
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index af7c3c3..f83ab4c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -717,27 +717,33 @@ nfsd_close(struct file *filp)
  * As this calls fsync (not fdatasync) there is no need for a write_inode
  * after it.
  */
-static inline void nfsd_dosync(struct file *filp, struct dentry *dp,
-			       struct file_operations *fop)
+static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
+			      struct file_operations *fop)
 {
 	struct inode *inode = dp->d_inode;
 	int (*fsync) (struct file *, struct dentry *, int);
+	int err = nfs_ok;
 
 	filemap_fdatawrite(inode->i_mapping);
 	if (fop && (fsync = fop->fsync))
-		fsync(filp, dp, 0);
+		err=fsync(filp, dp, 0);
 	filemap_fdatawait(inode->i_mapping);
+
+	return nfserrno(err);
 }
 	
 
-static void
+static int
 nfsd_sync(struct file *filp)
 {
+        int err;
 	struct inode *inode = filp->f_dentry->d_inode;
 	dprintk("nfsd: sync file %s\n", filp->f_dentry->d_name.name);
 	down(&inode->i_sem);
-	nfsd_dosync(filp, filp->f_dentry, filp->f_op);
+	err=nfsd_dosync(filp, filp->f_dentry, filp->f_op);
 	up(&inode->i_sem);
+
+	return err;
 }
 
 void
@@ -962,7 +968,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 
 			if (inode->i_state & I_DIRTY) {
 				dprintk("nfsd: write sync %d\n", current->pid);
-				nfsd_sync(file);
+				err=nfsd_sync(file);
 			}
 #if 0
 			wake_up(&inode->i_wait);
@@ -1066,7 +1072,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		return err;
 	if (EX_ISSYNC(fhp->fh_export)) {
 		if (file->f_op && file->f_op->fsync) {
-			nfsd_sync(file);
+			err = nfsd_sync(file);
 		} else {
 			err = nfserr_notsupp;
 		}
diff --git a/include/linux/nfsd/xdr.h b/include/linux/nfsd/xdr.h
index 130d4f5..3f4f714 100644
--- a/include/linux/nfsd/xdr.h
+++ b/include/linux/nfsd/xdr.h
@@ -88,10 +88,12 @@ struct nfsd_readdirargs {
 
 struct nfsd_attrstat {
 	struct svc_fh		fh;
+	struct kstat		stat;
 };
 
 struct nfsd_diropres  {
 	struct svc_fh		fh;
+	struct kstat		stat;
 };
 
 struct nfsd_readlinkres {
@@ -101,6 +103,7 @@ struct nfsd_readlinkres {
 struct nfsd_readres {
 	struct svc_fh		fh;
 	unsigned long		count;
+	struct kstat		stat;
 };
 
 struct nfsd_readdirres {
diff --git a/include/linux/nfsd/xdr3.h b/include/linux/nfsd/xdr3.h
index 3c2a71b..a432274 100644
--- a/include/linux/nfsd/xdr3.h
+++ b/include/linux/nfsd/xdr3.h
@@ -126,6 +126,7 @@ struct nfsd3_setaclargs {
 struct nfsd3_attrstat {
 	__u32			status;
 	struct svc_fh		fh;
+	struct kstat            stat;
 };
 
 /* LOOKUP, CREATE, MKDIR, SYMLINK, MKNOD */
-- 
cgit v1.1


From 9f708e40fe040e79f6c393a282f0701c9f8dc174 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:19:59 -0800
Subject: [PATCH] knfsd: reduce stack consumption

A typical nfsd call trace is
 nfsd -> svc_process -> nfsd_dispatch -> nfsd3_proc_write ->
   nfsd_write ->nfsd_vfs_write -> vfs_writev

These add up to over 300 bytes on the stack.
Looking at each of these, I see that nfsd_write (which includes
 nfsd_vfs_write) contributes 0x8c to stack usage itself!!

It turns out this is because it puts a 'struct iattr' on the stack so
it can kill suid if needed.  The following patch saves about 50 bytes
off the stack in this call path.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/vfs.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index f83ab4c..df4019f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -880,6 +880,16 @@ out:
 	return err;
 }
 
+static void kill_suid(struct dentry *dentry)
+{
+	struct iattr	ia;
+	ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID;
+
+	down(&dentry->d_inode->i_sem);
+	notify_change(dentry, &ia);
+	up(&dentry->d_inode->i_sem);
+}
+
 static inline int
 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 				loff_t offset, struct kvec *vec, int vlen,
@@ -933,14 +943,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	}
 
 	/* clear setuid/setgid flag after write */
-	if (err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID))) {
-		struct iattr	ia;
-		ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID;
-
-		down(&inode->i_sem);
-		notify_change(dentry, &ia);
-		up(&inode->i_sem);
-	}
+	if (err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID)))
+		kill_suid(dentry);
 
 	if (err >= 0 && stable) {
 		static ino_t	last_ino;
-- 
cgit v1.1


From 637842cfdbe2b981f7088f7633e630570f58efaf Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Fri, 6 Jan 2006 00:20:00 -0800
Subject: [PATCH] device-mapper: add dm_find_md

Abstract dm_find_md() from dm_get_mdptr() to allow use elsewhere.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 930b9fc..27cd234 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -902,10 +902,9 @@ int dm_create_with_minor(unsigned int minor, struct mapped_device **result)
 	return create_aux(minor, 1, result);
 }
 
-void *dm_get_mdptr(dev_t dev)
+static struct mapped_device *dm_find_md(dev_t dev)
 {
 	struct mapped_device *md;
-	void *mdptr = NULL;
 	unsigned minor = MINOR(dev);
 
 	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
@@ -914,12 +913,22 @@ void *dm_get_mdptr(dev_t dev)
 	down(&_minor_lock);
 
 	md = idr_find(&_minor_idr, minor);
-
-	if (md && (dm_disk(md)->first_minor == minor))
-		mdptr = md->interface_ptr;
+	if (!md || (dm_disk(md)->first_minor != minor))
+		md = NULL;
 
 	up(&_minor_lock);
 
+	return md;
+}
+
+void *dm_get_mdptr(dev_t dev)
+{
+	struct mapped_device *md;
+	void *mdptr = NULL;
+
+	md = dm_find_md(dev);
+	if (md)
+		mdptr = md->interface_ptr;
 	return mdptr;
 }
 
-- 
cgit v1.1


From d229a9589ff3b988d3f999cdcfa350f97a372673 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Fri, 6 Jan 2006 00:20:01 -0800
Subject: [PATCH] device-mapper: add dm_get_md

Add dm_get_md() to get a mapped device given its dev_t.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm.c | 10 ++++++++++
 drivers/md/dm.h |  1 +
 2 files changed, 11 insertions(+)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 27cd234..9e8c1ed 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -921,6 +921,16 @@ static struct mapped_device *dm_find_md(dev_t dev)
 	return md;
 }
 
+struct mapped_device *dm_get_md(dev_t dev)
+{
+	struct mapped_device *md = dm_find_md(dev);
+
+	if (md)
+		dm_get(md);
+
+	return md;
+}
+
 void *dm_get_mdptr(dev_t dev)
 {
 	struct mapped_device *md;
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index e38c3fc..ab078a2 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -58,6 +58,7 @@ int dm_create(struct mapped_device **md);
 int dm_create_with_minor(unsigned int minor, struct mapped_device **md);
 void dm_set_mdptr(struct mapped_device *md, void *ptr);
 void *dm_get_mdptr(dev_t dev);
+struct mapped_device *dm_get_md(dev_t dev);
 
 /*
  * Reference counting for md.
-- 
cgit v1.1


From 81f1777a55e8c631b61e5fa5980fb7a2004287af Mon Sep 17 00:00:00 2001
From: "goggin, edward" <egoggin@emc.com>
Date: Fri, 6 Jan 2006 00:20:01 -0800
Subject: [PATCH] device-mapper ioctl: event on rename

After changing the name of a mapped device, trigger a dm event.  (For
userspace multipath tools.)

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm-ioctl.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 07d44e1..3e327db 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -270,6 +270,7 @@ static int dm_hash_rename(const char *old, const char *new)
 {
 	char *new_name, *old_name;
 	struct hash_cell *hc;
+	struct dm_table *table;
 
 	/*
 	 * duplicate new.
@@ -317,6 +318,15 @@ static int dm_hash_rename(const char *old, const char *new)
 	/* rename the device node in devfs */
 	register_with_devfs(hc);
 
+	/*
+	 * Wake up any dm event waiters.
+	 */
+	table = dm_get_table(hc->md);
+	if (table) {
+		dm_table_event(table);
+		dm_table_put(table);
+	}
+
 	up_write(&_hash_lock);
 	kfree(old_name);
 	return 0;
-- 
cgit v1.1


From 2d38fe204461dc542bb38f2b01a9cd115b367b36 Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Fri, 6 Jan 2006 00:20:02 -0800
Subject: [PATCH] device-mapper snapshot: metadata reading separation

Move snapshot metadata reading into a separate function, to prepare for
changing the place it gets called from.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm-snap.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index ab54f99..4b9dd8f 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -371,6 +371,20 @@ static inline ulong round_up(ulong n, ulong size)
 	return (n + size) & ~size;
 }
 
+static void read_snapshot_metadata(struct dm_snapshot *s)
+{
+	if (s->have_metadata)
+		return;
+
+	if (s->store.read_metadata(&s->store)) {
+		down_write(&s->lock);
+		s->valid = 0;
+		up_write(&s->lock);
+	}
+
+	s->have_metadata = 1;
+}
+
 /*
  * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
  */
@@ -848,16 +862,7 @@ static void snapshot_resume(struct dm_target *ti)
 {
 	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
 
-	if (s->have_metadata)
-		return;
-
-	if (s->store.read_metadata(&s->store)) {
-		down_write(&s->lock);
-		s->valid = 0;
-		up_write(&s->lock);
-	}
-
-	s->have_metadata = 1;
+	read_snapshot_metadata(s);
 }
 
 static int snapshot_status(struct dm_target *ti, status_type_t type,
-- 
cgit v1.1


From e6c276159c812ab25f47b7c7b683a5c97c442dd5 Mon Sep 17 00:00:00 2001
From: Andrew Stribblehill <a.d.stribblehill@durham.ac.uk>
Date: Fri, 6 Jan 2006 00:20:03 -0800
Subject: [PATCH] device-mapper: remove unused definition

This patch removes an unused #define.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm-io.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/md/dm-io.h b/drivers/md/dm-io.h
index 1a77f32..f9035bf 100644
--- a/drivers/md/dm-io.h
+++ b/drivers/md/dm-io.h
@@ -9,9 +9,6 @@
 
 #include "dm.h"
 
-/* FIXME make this configurable */
-#define DM_MAX_IO_REGIONS 8
-
 struct io_region {
 	struct block_device *bdev;
 	sector_t sector;
-- 
cgit v1.1


From 2d5fe68987341a59a3fd97c71695efcabb0c6fd5 Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Fri, 6 Jan 2006 00:20:04 -0800
Subject: [PATCH] device-mapper: scanf sector format change

Use %llu not %Lu in sscanf/printf format strings.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index ab078a2..95a0cfb 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -28,7 +28,7 @@
  * in types.h.
  */
 #ifdef CONFIG_LBD
-#define SECTOR_FORMAT "%Lu"
+#define SECTOR_FORMAT "%llu"
 #else
 #define SECTOR_FORMAT "%lu"
 #endif
-- 
cgit v1.1


From a1a190807074bd6ad8771e00b00752771ae586cb Mon Sep 17 00:00:00 2001
From: Jonathan E Brassow <jbrassow@redhat.com>
Date: Fri, 6 Jan 2006 00:20:05 -0800
Subject: [PATCH] device-mapper raid1: add default mirror

This patch introduces a new field to the mirror_set (default_mirror) to store
the default mirror.

(A subsequent patch will allow us to change the default mirror in the event of
a failure.)

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm-raid1.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 6b0fc16..6cfa8d4 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -562,6 +562,8 @@ struct mirror_set {
 	region_t nr_regions;
 	int in_sync;
 
+	struct mirror *default_mirror;	/* Default mirror */
+
 	unsigned int nr_mirrors;
 	struct mirror mirror[0];
 };
@@ -611,7 +613,7 @@ static int recover(struct mirror_set *ms, struct region *reg)
 	unsigned long flags = 0;
 
 	/* fill in the source */
-	m = ms->mirror + DEFAULT_MIRROR;
+	m = ms->default_mirror;
 	from.bdev = m->dev->bdev;
 	from.sector = m->offset + region_to_sector(reg->rh, reg->key);
 	if (reg->key == (ms->nr_regions - 1)) {
@@ -627,7 +629,7 @@ static int recover(struct mirror_set *ms, struct region *reg)
 
 	/* fill in the destinations */
 	for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
-		if (i == DEFAULT_MIRROR)
+		if (&ms->mirror[i] == ms->default_mirror)
 			continue;
 
 		m = ms->mirror + i;
@@ -682,7 +684,7 @@ static void do_recovery(struct mirror_set *ms)
 static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
 {
 	/* FIXME: add read balancing */
-	return ms->mirror + DEFAULT_MIRROR;
+	return ms->default_mirror;
 }
 
 /*
@@ -709,7 +711,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
 		if (rh_in_sync(&ms->rh, region, 0))
 			m = choose_mirror(ms, bio->bi_sector);
 		else
-			m = ms->mirror + DEFAULT_MIRROR;
+			m = ms->default_mirror;
 
 		map_bio(ms, m, bio);
 		generic_make_request(bio);
@@ -833,7 +835,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 		rh_delay(&ms->rh, bio);
 
 	while ((bio = bio_list_pop(&nosync))) {
-		map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio);
+		map_bio(ms, ms->default_mirror, bio);
 		generic_make_request(bio);
 	}
 }
@@ -900,6 +902,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	ms->nr_mirrors = nr_mirrors;
 	ms->nr_regions = dm_sector_div_up(ti->len, region_size);
 	ms->in_sync = 0;
+	ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
 
 	if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
 		ti->error = "dm-mirror: Error creating dirty region hash";
-- 
cgit v1.1


From e39e2e95eb8bd536b61654e8fda1516d0a6a3cd1 Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Fri, 6 Jan 2006 00:20:05 -0800
Subject: [PATCH] device-mapper: rename frozen_bdev

Rename frozen_bdev to suspended_bdev and move the bdget outside lockfs.  (This
prepares for making lockfs optional.)

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm.c | 51 +++++++++++++++++++++++++--------------------------
 1 file changed, 25 insertions(+), 26 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 9e8c1ed..fc335e0 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -97,7 +97,7 @@ struct mapped_device {
 	 * freeze/thaw support require holding onto a super block
 	 */
 	struct super_block *frozen_sb;
-	struct block_device *frozen_bdev;
+	struct block_device *suspended_bdev;
 };
 
 #define MIN_IOS 256
@@ -836,9 +836,9 @@ static void __set_size(struct mapped_device *md, sector_t size)
 {
 	set_capacity(md->disk, size);
 
-	down(&md->frozen_bdev->bd_inode->i_sem);
-	i_size_write(md->frozen_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
-	up(&md->frozen_bdev->bd_inode->i_sem);
+	down(&md->suspended_bdev->bd_inode->i_sem);
+	i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
+	up(&md->suspended_bdev->bd_inode->i_sem);
 }
 
 static int __bind(struct mapped_device *md, struct dm_table *t)
@@ -1010,43 +1010,27 @@ out:
  */
 static int lock_fs(struct mapped_device *md)
 {
-	int r = -ENOMEM;
-
-	md->frozen_bdev = bdget_disk(md->disk, 0);
-	if (!md->frozen_bdev) {
-		DMWARN("bdget failed in lock_fs");
-		goto out;
-	}
+	int r;
 
 	WARN_ON(md->frozen_sb);
 
-	md->frozen_sb = freeze_bdev(md->frozen_bdev);
+	md->frozen_sb = freeze_bdev(md->suspended_bdev);
 	if (IS_ERR(md->frozen_sb)) {
 		r = PTR_ERR(md->frozen_sb);
-		goto out_bdput;
+		md->frozen_sb = NULL;
+		return r;
 	}
 
 	/* don't bdput right now, we don't want the bdev
-	 * to go away while it is locked.  We'll bdput
-	 * in unlock_fs
+	 * to go away while it is locked.
 	 */
 	return 0;
-
-out_bdput:
-	bdput(md->frozen_bdev);
-	md->frozen_sb = NULL;
-	md->frozen_bdev = NULL;
-out:
-	return r;
 }
 
 static void unlock_fs(struct mapped_device *md)
 {
-	thaw_bdev(md->frozen_bdev, md->frozen_sb);
-	bdput(md->frozen_bdev);
-
+	thaw_bdev(md->suspended_bdev, md->frozen_sb);
 	md->frozen_sb = NULL;
-	md->frozen_bdev = NULL;
 }
 
 /*
@@ -1072,6 +1056,13 @@ int dm_suspend(struct mapped_device *md)
 	/* This does not get reverted if there's an error later. */
 	dm_table_presuspend_targets(map);
 
+	md->suspended_bdev = bdget_disk(md->disk, 0);
+	if (!md->suspended_bdev) {
+		DMWARN("bdget failed in dm_suspend");
+		r = -ENOMEM;
+		goto out;
+	}
+
 	/* Flush I/O to the device. */
 	r = lock_fs(md);
 	if (r)
@@ -1124,6 +1115,11 @@ int dm_suspend(struct mapped_device *md)
 	r = 0;
 
 out:
+	if (r && md->suspended_bdev) {
+		bdput(md->suspended_bdev);
+		md->suspended_bdev = NULL;
+	}
+
 	dm_table_put(map);
 	up(&md->suspend_lock);
 	return r;
@@ -1154,6 +1150,9 @@ int dm_resume(struct mapped_device *md)
 
 	unlock_fs(md);
 
+	bdput(md->suspended_bdev);
+	md->suspended_bdev = NULL;
+
 	clear_bit(DMF_SUSPENDED, &md->flags);
 
 	dm_table_unplug_all(map);
-- 
cgit v1.1


From aa8d7c2fbe619d8c0837296d2eaf4c14cebac198 Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Fri, 6 Jan 2006 00:20:06 -0800
Subject: [PATCH] device-mapper: make lock_fs optional

Devices only need syncing when creating snapshots, so make this optional when
suspending a device.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm-ioctl.c |  4 ++--
 drivers/md/dm.c       | 17 +++++++++++++----
 drivers/md/dm.h       |  2 +-
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 3e327db..dbc07af 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -700,7 +700,7 @@ static int do_suspend(struct dm_ioctl *param)
 		return -ENXIO;
 
 	if (!dm_suspended(md))
-		r = dm_suspend(md);
+		r = dm_suspend(md, 1);
 
 	if (!r)
 		r = __dev_status(md, param);
@@ -738,7 +738,7 @@ static int do_resume(struct dm_ioctl *param)
 	if (new_map) {
 		/* Suspend if it isn't already suspended */
 		if (!dm_suspended(md))
-			dm_suspend(md);
+			dm_suspend(md, 1);
 
 		r = dm_swap_table(md, new_map);
 		if (r) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index fc335e0..0e48151 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -55,6 +55,7 @@ union map_info *dm_get_mapinfo(struct bio *bio)
  */
 #define DMF_BLOCK_IO 0
 #define DMF_SUSPENDED 1
+#define DMF_FROZEN 2
 
 struct mapped_device {
 	struct rw_semaphore io_lock;
@@ -1021,6 +1022,8 @@ static int lock_fs(struct mapped_device *md)
 		return r;
 	}
 
+	set_bit(DMF_FROZEN, &md->flags);
+
 	/* don't bdput right now, we don't want the bdev
 	 * to go away while it is locked.
 	 */
@@ -1029,8 +1032,12 @@ static int lock_fs(struct mapped_device *md)
 
 static void unlock_fs(struct mapped_device *md)
 {
+	if (!test_bit(DMF_FROZEN, &md->flags))
+		return;
+
 	thaw_bdev(md->suspended_bdev, md->frozen_sb);
 	md->frozen_sb = NULL;
+	clear_bit(DMF_FROZEN, &md->flags);
 }
 
 /*
@@ -1040,7 +1047,7 @@ static void unlock_fs(struct mapped_device *md)
  * dm_bind_table, dm_suspend must be called to flush any in
  * flight bios and ensure that any further io gets deferred.
  */
-int dm_suspend(struct mapped_device *md)
+int dm_suspend(struct mapped_device *md, int do_lockfs)
 {
 	struct dm_table *map = NULL;
 	DECLARE_WAITQUEUE(wait, current);
@@ -1064,9 +1071,11 @@ int dm_suspend(struct mapped_device *md)
 	}
 
 	/* Flush I/O to the device. */
-	r = lock_fs(md);
-	if (r)
-		goto out;
+	if (do_lockfs) {
+		r = lock_fs(md);
+		if (r)
+			goto out;
+	}
 
 	/*
 	 * First we set the BLOCK_IO flag so no more ios will be mapped.
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 95a0cfb..4eaf075 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -69,7 +69,7 @@ void dm_put(struct mapped_device *md);
 /*
  * A device can still be used while suspended, but I/O is deferred.
  */
-int dm_suspend(struct mapped_device *md);
+int dm_suspend(struct mapped_device *md, int with_lockfs);
 int dm_resume(struct mapped_device *md);
 
 /*
-- 
cgit v1.1


From 6da487dcc0c6f4c827779687a20016efeffc4d60 Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Fri, 6 Jan 2006 00:20:07 -0800
Subject: [PATCH] device-mapper ioctl: add skip lock_fs flag

Add ioctl DM_SKIP_LOCKFS_FLAG for userspace to request that lock_fs is
bypassed when suspending a device.

There's no change to the behaviour of existing code that doesn't know about
the new flag.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm-ioctl.c    | 11 +++++++++--
 include/linux/dm-ioctl.h | 11 ++++++++---
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index dbc07af..561bda5 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -693,14 +693,18 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
 static int do_suspend(struct dm_ioctl *param)
 {
 	int r = 0;
+	int do_lockfs = 1;
 	struct mapped_device *md;
 
 	md = find_device(param);
 	if (!md)
 		return -ENXIO;
 
+	if (param->flags & DM_SKIP_LOCKFS_FLAG)
+		do_lockfs = 0;
+
 	if (!dm_suspended(md))
-		r = dm_suspend(md, 1);
+		r = dm_suspend(md, do_lockfs);
 
 	if (!r)
 		r = __dev_status(md, param);
@@ -712,6 +716,7 @@ static int do_suspend(struct dm_ioctl *param)
 static int do_resume(struct dm_ioctl *param)
 {
 	int r = 0;
+	int do_lockfs = 1;
 	struct hash_cell *hc;
 	struct mapped_device *md;
 	struct dm_table *new_map;
@@ -737,8 +742,10 @@ static int do_resume(struct dm_ioctl *param)
 	/* Do we need to load a new map ? */
 	if (new_map) {
 		/* Suspend if it isn't already suspended */
+		if (param->flags & DM_SKIP_LOCKFS_FLAG)
+			do_lockfs = 0;
 		if (!dm_suspended(md))
-			dm_suspend(md, 1);
+			dm_suspend(md, do_lockfs);
 
 		r = dm_swap_table(md, new_map);
 		if (r) {
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h
index f5eb6b6..fa75ba0 100644
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -272,9 +272,9 @@ typedef char ioctl_struct[308];
 #define DM_TARGET_MSG	 _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	4
+#define DM_VERSION_MINOR	5
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2005-01-12)"
+#define DM_VERSION_EXTRA	"-ioctl (2005-10-04)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
@@ -301,8 +301,13 @@ typedef char ioctl_struct[308];
 #define DM_BUFFER_FULL_FLAG	(1 << 8) /* Out */
 
 /*
- * Set this to improve performance when you aren't going to use open_count
+ * Set this to improve performance when you aren't going to use open_count.
  */
 #define DM_SKIP_BDGET_FLAG	(1 << 9) /* In */
 
+/*
+ * Set this to avoid attempting to freeze any filesystem when suspending.
+ */
+#define DM_SKIP_LOCKFS_FLAG	(1 << 10) /* In */
+
 #endif				/* _LINUX_DM_IOCTL_H */
-- 
cgit v1.1


From 0b56306e56784d0513e1193d58c05a6bd97bd1a9 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Fri, 6 Jan 2006 00:20:08 -0800
Subject: [PATCH] drivers/md/kcopyd.c: #if 0 kcopyd_cancel()

This patch #if 0's the not yet implemented global function kcopyd_cancel().

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/kcopyd.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c
index eb70364..ca99979 100644
--- a/drivers/md/kcopyd.c
+++ b/drivers/md/kcopyd.c
@@ -561,11 +561,13 @@ int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
  * Cancels a kcopyd job, eg. someone might be deactivating a
  * mirror.
  */
+#if 0
 int kcopyd_cancel(struct kcopyd_job *job, int block)
 {
 	/* FIXME: finish */
 	return -1;
 }
+#endif  /*  0  */
 
 /*-----------------------------------------------------------------
  * Unit setup
@@ -684,4 +686,3 @@ void kcopyd_client_destroy(struct kcopyd_client *kc)
 EXPORT_SYMBOL(kcopyd_client_create);
 EXPORT_SYMBOL(kcopyd_client_destroy);
 EXPORT_SYMBOL(kcopyd_copy);
-EXPORT_SYMBOL(kcopyd_cancel);
-- 
cgit v1.1


From 9d3520a339d62f942085e9888f66905eb8b350bd Mon Sep 17 00:00:00 2001
From: Stefan Rompf <stefan@loplof.de>
Date: Fri, 6 Jan 2006 00:20:08 -0800
Subject: [PATCH] dm-crypt: zero key before freeing it

Zap the memory before freeing it so we don't leave crypto information
around in memory.

Signed-off-by: Stefan Rompf <stefan@loplof.de>
Acked-by: Clemens Fruhwirth <clemens@endorphin.org>
Acked-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm-crypt.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index cf66310..a601a42 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -690,6 +690,8 @@ bad3:
 bad2:
 	crypto_free_tfm(tfm);
 bad1:
+	/* Must zero key material before freeing */
+	memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
 	kfree(cc);
 	return -EINVAL;
 }
@@ -706,6 +708,9 @@ static void crypt_dtr(struct dm_target *ti)
 		cc->iv_gen_ops->dtr(cc);
 	crypto_free_tfm(cc->tfm);
 	dm_put_device(ti, cc->dev);
+
+	/* Must zero key material before freeing */
+	memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
 	kfree(cc);
 }
 
-- 
cgit v1.1


From ac81b2ee45eb811fdb0aa1cfb71d468d944d00ce Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@us.ibm.com>
Date: Fri, 6 Jan 2006 00:20:11 -0800
Subject: [PATCH] make dm-mirror not issue invalid resync requests

I've been attempting to set up a (Host)RAID mirror with dm_mirror on
2.6.14.3, and I've been having a strange little problem.  The configuration
in question is a set of 9GB SCSI disks that have 17942584 sectors.  I set
up the dm_mirror table as such:

0 17942528 mirror core 2 2048 nosync 2 8:48 0 8:64 0

If I'm not mistaken, this sets up a 9GB RAID1 mirror with 1MB stripes
across both SCSI disks.  The sector count of the dm device is less than the
size of the disks, so we shouldn't fall off the end.  However, I always get
the messages like this in dmesg when I set up the dm table:

attempt to access beyond end of device
sdd: rw=0, want=17958656, limit=17942584

Clearly, something is trying to read sectors past the end of the drive.  I
traced it down to the __rh_recovery_prepare function in dm-raid1.c, which
gets called when we're putting the mirror set together.  This function
calls the dirty region log's get_resync_work function to see if there's any
resync that needs to be done, and queues up any areas that are out of sync.
 The log's get_resync_work function is actually a pointer to the
core_get_resync_work function in dm-log.c.

The core_get_resync_work function queries a bitset lc->sync_bits to find
out if there are any regions that are out of date (i.e.  the bit is 0),
which is where the problem occurs.  If every bit in lc->sync_bits is 1
(which is the case when we've just configured a new RAID1 with the nosync
option), the find_next_zero_bit does NOT return the size parameter
(lc->region_count in this case), it returns the size parameter rounded up
to the nearest multiple of 32!  I don't know if this is intentional, but
i386 and x86_64 both exhibit this behavior.

In any case, the statement "if (*region == lc->region_count)" looks like
it's supposed to catch the case where there are no regions to resync and
return 0.  Since find_next_zero_bit apparently has a habit of returning
a value that's larger than lc->region_count, the enclosed patch changes
the equality test to a greater-than test so that we don't try to resync
areas outside of the RAID1 region.  Seeing as the HostRAID metadata
lives just past the end of the RAID1 data, mucking around in that area
is not a good idea.

I suppose another way to fix this would be to amend find_next_zero_bit so
that it doesn't return values larger than "size", but I don't know if
there's a reason for the current behavior.

Signed-Off-By: Darrick J. Wong <djwong@us.ibm.com>
Acked-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm-log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index a76349c..efe4adf 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -573,7 +573,7 @@ static int core_get_resync_work(struct dirty_log *log, region_t *region)
 					     lc->sync_search);
 		lc->sync_search = *region + 1;
 
-		if (*region == lc->region_count)
+		if (*region >= lc->region_count)
 			return 0;
 
 	} while (log_test_bit(lc->recovering_bits, *region));
-- 
cgit v1.1


From 17999be4aa408e7ff3b9d32c735649676567a3cd Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:12 -0800
Subject: [PATCH] md: improve raid1 "IO Barrier" concept

raid1 needs to put up a barrier to new requests while it does resync or other
background recovery.  The code for this is currently open-coded, slightly
obscured by its use of two waitqueues, and not documented.

This patch gathers all the related code into 4 functions, and includes a
comment which (hopefully) explains what is happening.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid1.c         | 167 ++++++++++++++++++++++++---------------------
 include/linux/raid/raid1.h |   4 +-
 2 files changed, 91 insertions(+), 80 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 229d7b2..f520414 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -51,6 +51,8 @@ static mdk_personality_t raid1_personality;
 
 static void unplug_slaves(mddev_t *mddev);
 
+static void allow_barrier(conf_t *conf);
+static void lower_barrier(conf_t *conf);
 
 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
@@ -160,20 +162,13 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
 
 static inline void free_r1bio(r1bio_t *r1_bio)
 {
-	unsigned long flags;
-
 	conf_t *conf = mddev_to_conf(r1_bio->mddev);
 
 	/*
 	 * Wake up any possible resync thread that waits for the device
 	 * to go idle.
 	 */
-	spin_lock_irqsave(&conf->resync_lock, flags);
-	if (!--conf->nr_pending) {
-		wake_up(&conf->wait_idle);
-		wake_up(&conf->wait_resume);
-	}
-	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	allow_barrier(conf);
 
 	put_all_bios(conf, r1_bio);
 	mempool_free(r1_bio, conf->r1bio_pool);
@@ -182,22 +177,10 @@ static inline void free_r1bio(r1bio_t *r1_bio)
 static inline void put_buf(r1bio_t *r1_bio)
 {
 	conf_t *conf = mddev_to_conf(r1_bio->mddev);
-	unsigned long flags;
 
 	mempool_free(r1_bio, conf->r1buf_pool);
 
-	spin_lock_irqsave(&conf->resync_lock, flags);
-	if (!conf->barrier)
-		BUG();
-	--conf->barrier;
-	wake_up(&conf->wait_resume);
-	wake_up(&conf->wait_idle);
-
-	if (!--conf->nr_pending) {
-		wake_up(&conf->wait_idle);
-		wake_up(&conf->wait_resume);
-	}
-	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	lower_barrier(conf);
 }
 
 static void reschedule_retry(r1bio_t *r1_bio)
@@ -210,6 +193,7 @@ static void reschedule_retry(r1bio_t *r1_bio)
 	list_add(&r1_bio->retry_list, &conf->retry_list);
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 
+	wake_up(&conf->wait_barrier);
 	md_wakeup_thread(mddev->thread);
 }
 
@@ -593,30 +577,83 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
 	return ret;
 }
 
-/*
- * Throttle resync depth, so that we can both get proper overlapping of
- * requests, but are still able to handle normal requests quickly.
+/* Barriers....
+ * Sometimes we need to suspend IO while we do something else,
+ * either some resync/recovery, or reconfigure the array.
+ * To do this we raise a 'barrier'.
+ * The 'barrier' is a counter that can be raised multiple times
+ * to count how many activities are happening which preclude
+ * normal IO.
+ * We can only raise the barrier if there is no pending IO.
+ * i.e. if nr_pending == 0.
+ * We choose only to raise the barrier if no-one is waiting for the
+ * barrier to go down.  This means that as soon as an IO request
+ * is ready, no other operations which require a barrier will start
+ * until the IO request has had a chance.
+ *
+ * So: regular IO calls 'wait_barrier'.  When that returns there
+ *    is no backgroup IO happening,  It must arrange to call
+ *    allow_barrier when it has finished its IO.
+ * backgroup IO calls must call raise_barrier.  Once that returns
+ *    there is no normal IO happeing.  It must arrange to call
+ *    lower_barrier when the particular background IO completes.
  */
 #define RESYNC_DEPTH 32
 
-static void device_barrier(conf_t *conf, sector_t sect)
+static void raise_barrier(conf_t *conf)
 {
 	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume),
-			    conf->resync_lock, raid1_unplug(conf->mddev->queue));
-	
-	if (!conf->barrier++) {
-		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
-				    conf->resync_lock, raid1_unplug(conf->mddev->queue));
-		if (conf->nr_pending)
-			BUG();
+
+	/* Wait until no block IO is waiting */
+	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+			    conf->resync_lock,
+			    raid1_unplug(conf->mddev->queue));
+
+	/* block any new IO from starting */
+	conf->barrier++;
+
+	/* No wait for all pending IO to complete */
+	wait_event_lock_irq(conf->wait_barrier,
+			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
+			    conf->resync_lock,
+			    raid1_unplug(conf->mddev->queue));
+
+	spin_unlock_irq(&conf->resync_lock);
+}
+
+static void lower_barrier(conf_t *conf)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&conf->resync_lock, flags);
+	conf->barrier--;
+	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	wake_up(&conf->wait_barrier);
+}
+
+static void wait_barrier(conf_t *conf)
+{
+	spin_lock_irq(&conf->resync_lock);
+	if (conf->barrier) {
+		conf->nr_waiting++;
+		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
+				    conf->resync_lock,
+				    raid1_unplug(conf->mddev->queue));
+		conf->nr_waiting--;
 	}
-	wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock, raid1_unplug(conf->mddev->queue));
-	conf->next_resync = sect;
+	conf->nr_pending++;
 	spin_unlock_irq(&conf->resync_lock);
 }
 
+static void allow_barrier(conf_t *conf)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&conf->resync_lock, flags);
+	conf->nr_pending--;
+	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	wake_up(&conf->wait_barrier);
+}
+
+
 /* duplicate the data pages for behind I/O */
 static struct page **alloc_behind_pages(struct bio *bio)
 {
@@ -678,10 +715,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	 */
 	md_write_start(mddev, bio); /* wait on superblock update early */
 
-	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
-	conf->nr_pending++;
-	spin_unlock_irq(&conf->resync_lock);
+	wait_barrier(conf);
 
 	disk_stat_inc(mddev->gendisk, ios[rw]);
 	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
@@ -909,13 +943,8 @@ static void print_conf(conf_t *conf)
 
 static void close_sync(conf_t *conf)
 {
-	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_resume, !conf->barrier,
-			    conf->resync_lock, 	raid1_unplug(conf->mddev->queue));
-	spin_unlock_irq(&conf->resync_lock);
-
-	if (conf->barrier) BUG();
-	if (waitqueue_active(&conf->wait_idle)) BUG();
+	wait_barrier(conf);
+	allow_barrier(conf);
 
 	mempool_destroy(conf->r1buf_pool);
 	conf->r1buf_pool = NULL;
@@ -1317,12 +1346,16 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		return sync_blocks;
 	}
 	/*
-	 * If there is non-resync activity waiting for us then
-	 * put in a delay to throttle resync.
+	 * If there is non-resync activity waiting for a turn,
+	 * and resync is going fast enough,
+	 * then let it though before starting on this new sync request.
 	 */
-	if (!go_faster && waitqueue_active(&conf->wait_resume))
+	if (!go_faster && conf->nr_waiting)
 		msleep_interruptible(1000);
-	device_barrier(conf, sector_nr + RESYNC_SECTORS);
+
+	raise_barrier(conf);
+
+	conf->next_resync = sector_nr;
 
 	/*
 	 * If reconstructing, and >1 working disc,
@@ -1355,10 +1388,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 
 	r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
 
-	spin_lock_irq(&conf->resync_lock);
-	conf->nr_pending++;
-	spin_unlock_irq(&conf->resync_lock);
-
 	r1_bio->mddev = mddev;
 	r1_bio->sector = sector_nr;
 	r1_bio->state = 0;
@@ -1542,8 +1571,7 @@ static int run(mddev_t *mddev)
 		mddev->recovery_cp = MaxSector;
 
 	spin_lock_init(&conf->resync_lock);
-	init_waitqueue_head(&conf->wait_idle);
-	init_waitqueue_head(&conf->wait_resume);
+	init_waitqueue_head(&conf->wait_barrier);
 
 	bio_list_init(&conf->pending_bio_list);
 	bio_list_init(&conf->flushing_bio_list);
@@ -1714,11 +1742,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
 	}
 	memset(newmirrors, 0, sizeof(struct mirror_info)*raid_disks);
 
-	spin_lock_irq(&conf->resync_lock);
-	conf->barrier++;
-	wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
-			    conf->resync_lock, raid1_unplug(mddev->queue));
-	spin_unlock_irq(&conf->resync_lock);
+	raise_barrier(conf);
 
 	/* ok, everything is stopped */
 	oldpool = conf->r1bio_pool;
@@ -1738,12 +1762,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
 	conf->raid_disks = mddev->raid_disks = raid_disks;
 
 	conf->last_used = 0; /* just make sure it is in-range */
-	spin_lock_irq(&conf->resync_lock);
-	conf->barrier--;
-	spin_unlock_irq(&conf->resync_lock);
-	wake_up(&conf->wait_resume);
-	wake_up(&conf->wait_idle);
-
+	lower_barrier(conf);
 
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
@@ -1758,18 +1777,10 @@ static void raid1_quiesce(mddev_t *mddev, int state)
 
 	switch(state) {
 	case 1:
-		spin_lock_irq(&conf->resync_lock);
-		conf->barrier++;
-		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
-				    conf->resync_lock, raid1_unplug(mddev->queue));
-		spin_unlock_irq(&conf->resync_lock);
+		raise_barrier(conf);
 		break;
 	case 0:
-		spin_lock_irq(&conf->resync_lock);
-		conf->barrier--;
-		spin_unlock_irq(&conf->resync_lock);
-		wake_up(&conf->wait_resume);
-		wake_up(&conf->wait_idle);
+		lower_barrier(conf);
 		break;
 	}
 	if (mddev->thread) {
diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h
index 292b98f..c556742 100644
--- a/include/linux/raid/raid1.h
+++ b/include/linux/raid/raid1.h
@@ -45,6 +45,7 @@ struct r1_private_data_s {
 
 	spinlock_t		resync_lock;
 	int			nr_pending;
+	int			nr_waiting;
 	int			barrier;
 	sector_t		next_resync;
 	int			fullsync;  /* set to 1 if a full sync is needed,
@@ -52,8 +53,7 @@ struct r1_private_data_s {
 					    * Cleared when a sync completes.
 					    */
 
-	wait_queue_head_t	wait_idle;
-	wait_queue_head_t	wait_resume;
+	wait_queue_head_t	wait_barrier;
 
 	struct pool_info	*poolinfo;
 
-- 
cgit v1.1


From 0a27ec96b6fb1abf867e36d7b0b681d67588767a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:13 -0800
Subject: [PATCH] md: improve raid10 "IO Barrier" concept

raid10 needs to put up a barrier to new requests while it does resync or other
background recovery.  The code for this is currently open-coded, slightly
obscured by its use of two waitqueues, and not documented.

This patch gathers all the related code into 4 functions, and includes a
comment which (hopefully) explains what is happening.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid10.c         | 135 ++++++++++++++++++++++++++------------------
 include/linux/raid/raid10.h |   4 +-
 2 files changed, 81 insertions(+), 58 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 713dc9c..50bd7b1 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -47,6 +47,9 @@
 
 static void unplug_slaves(mddev_t *mddev);
 
+static void allow_barrier(conf_t *conf);
+static void lower_barrier(conf_t *conf);
+
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	conf_t *conf = data;
@@ -175,20 +178,13 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
 
 static inline void free_r10bio(r10bio_t *r10_bio)
 {
-	unsigned long flags;
-
 	conf_t *conf = mddev_to_conf(r10_bio->mddev);
 
 	/*
 	 * Wake up any possible resync thread that waits for the device
 	 * to go idle.
 	 */
-	spin_lock_irqsave(&conf->resync_lock, flags);
-	if (!--conf->nr_pending) {
-		wake_up(&conf->wait_idle);
-		wake_up(&conf->wait_resume);
-	}
-	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	allow_barrier(conf);
 
 	put_all_bios(conf, r10_bio);
 	mempool_free(r10_bio, conf->r10bio_pool);
@@ -197,22 +193,10 @@ static inline void free_r10bio(r10bio_t *r10_bio)
 static inline void put_buf(r10bio_t *r10_bio)
 {
 	conf_t *conf = mddev_to_conf(r10_bio->mddev);
-	unsigned long flags;
 
 	mempool_free(r10_bio, conf->r10buf_pool);
 
-	spin_lock_irqsave(&conf->resync_lock, flags);
-	if (!conf->barrier)
-		BUG();
-	--conf->barrier;
-	wake_up(&conf->wait_resume);
-	wake_up(&conf->wait_idle);
-
-	if (!--conf->nr_pending) {
-		wake_up(&conf->wait_idle);
-		wake_up(&conf->wait_resume);
-	}
-	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	lower_barrier(conf);
 }
 
 static void reschedule_retry(r10bio_t *r10_bio)
@@ -640,30 +624,82 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
 	return ret;
 }
 
-/*
- * Throttle resync depth, so that we can both get proper overlapping of
- * requests, but are still able to handle normal requests quickly.
+/* Barriers....
+ * Sometimes we need to suspend IO while we do something else,
+ * either some resync/recovery, or reconfigure the array.
+ * To do this we raise a 'barrier'.
+ * The 'barrier' is a counter that can be raised multiple times
+ * to count how many activities are happening which preclude
+ * normal IO.
+ * We can only raise the barrier if there is no pending IO.
+ * i.e. if nr_pending == 0.
+ * We choose only to raise the barrier if no-one is waiting for the
+ * barrier to go down.  This means that as soon as an IO request
+ * is ready, no other operations which require a barrier will start
+ * until the IO request has had a chance.
+ *
+ * So: regular IO calls 'wait_barrier'.  When that returns there
+ *    is no backgroup IO happening,  It must arrange to call
+ *    allow_barrier when it has finished its IO.
+ * backgroup IO calls must call raise_barrier.  Once that returns
+ *    there is no normal IO happeing.  It must arrange to call
+ *    lower_barrier when the particular background IO completes.
  */
 #define RESYNC_DEPTH 32
 
-static void device_barrier(conf_t *conf, sector_t sect)
+static void raise_barrier(conf_t *conf)
 {
 	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume),
-			    conf->resync_lock, unplug_slaves(conf->mddev));
-
-	if (!conf->barrier++) {
-		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
-				    conf->resync_lock, unplug_slaves(conf->mddev));
-		if (conf->nr_pending)
-			BUG();
+
+	/* Wait until no block IO is waiting */
+	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+			    conf->resync_lock,
+			    raid10_unplug(conf->mddev->queue));
+
+	/* block any new IO from starting */
+	conf->barrier++;
+
+	/* No wait for all pending IO to complete */
+	wait_event_lock_irq(conf->wait_barrier,
+			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
+			    conf->resync_lock,
+			    raid10_unplug(conf->mddev->queue));
+
+	spin_unlock_irq(&conf->resync_lock);
+}
+
+static void lower_barrier(conf_t *conf)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&conf->resync_lock, flags);
+	conf->barrier--;
+	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	wake_up(&conf->wait_barrier);
+}
+
+static void wait_barrier(conf_t *conf)
+{
+	spin_lock_irq(&conf->resync_lock);
+	if (conf->barrier) {
+		conf->nr_waiting++;
+		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
+				    conf->resync_lock,
+				    raid10_unplug(conf->mddev->queue));
+		conf->nr_waiting--;
 	}
-	wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock, unplug_slaves(conf->mddev));
-	conf->next_resync = sect;
+	conf->nr_pending++;
 	spin_unlock_irq(&conf->resync_lock);
 }
 
+static void allow_barrier(conf_t *conf)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&conf->resync_lock, flags);
+	conf->nr_pending--;
+	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	wake_up(&conf->wait_barrier);
+}
+
 static int make_request(request_queue_t *q, struct bio * bio)
 {
 	mddev_t *mddev = q->queuedata;
@@ -719,10 +755,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
 	 */
-	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
-	conf->nr_pending++;
-	spin_unlock_irq(&conf->resync_lock);
+	wait_barrier(conf);
 
 	disk_stat_inc(mddev->gendisk, ios[rw]);
 	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
@@ -897,13 +930,8 @@ static void print_conf(conf_t *conf)
 
 static void close_sync(conf_t *conf)
 {
-	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_resume, !conf->barrier,
-			    conf->resync_lock, 	unplug_slaves(conf->mddev));
-	spin_unlock_irq(&conf->resync_lock);
-
-	if (conf->barrier) BUG();
-	if (waitqueue_active(&conf->wait_idle)) BUG();
+	wait_barrier(conf);
+	allow_barrier(conf);
 
 	mempool_destroy(conf->r10buf_pool);
 	conf->r10buf_pool = NULL;
@@ -1395,9 +1423,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	 * If there is non-resync activity waiting for us then
 	 * put in a delay to throttle resync.
 	 */
-	if (!go_faster && waitqueue_active(&conf->wait_resume))
+	if (!go_faster && conf->nr_waiting)
 		msleep_interruptible(1000);
-	device_barrier(conf, sector_nr + RESYNC_SECTORS);
+	raise_barrier(conf);
+	conf->next_resync = sector_nr;
 
 	/* Again, very different code for resync and recovery.
 	 * Both must result in an r10bio with a list of bios that
@@ -1427,7 +1456,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 
 				r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
 				spin_lock_irq(&conf->resync_lock);
-				conf->nr_pending++;
 				if (rb2) conf->barrier++;
 				spin_unlock_irq(&conf->resync_lock);
 				atomic_set(&r10_bio->remaining, 0);
@@ -1500,10 +1528,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		int count = 0;
 		r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
 
-		spin_lock_irq(&conf->resync_lock);
-		conf->nr_pending++;
-		spin_unlock_irq(&conf->resync_lock);
-
 		r10_bio->mddev = mddev;
 		atomic_set(&r10_bio->remaining, 0);
 
@@ -1713,8 +1737,7 @@ static int run(mddev_t *mddev)
 	INIT_LIST_HEAD(&conf->retry_list);
 
 	spin_lock_init(&conf->resync_lock);
-	init_waitqueue_head(&conf->wait_idle);
-	init_waitqueue_head(&conf->wait_resume);
+	init_waitqueue_head(&conf->wait_barrier);
 
 	/* need to check that every block has at least one working mirror */
 	if (!enough(conf)) {
diff --git a/include/linux/raid/raid10.h b/include/linux/raid/raid10.h
index 6070878..08317b7 100644
--- a/include/linux/raid/raid10.h
+++ b/include/linux/raid/raid10.h
@@ -39,11 +39,11 @@ struct r10_private_data_s {
 
 	spinlock_t		resync_lock;
 	int nr_pending;
+	int nr_waiting;
 	int barrier;
 	sector_t		next_resync;
 
-	wait_queue_head_t	wait_idle;
-	wait_queue_head_t	wait_resume;
+	wait_queue_head_t	wait_barrier;
 
 	mempool_t *r10bio_pool;
 	mempool_t *r10buf_pool;
-- 
cgit v1.1


From 14f8d26b8ea3413b28f2cac208c9a93600fe3a80 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:14 -0800
Subject: [PATCH] md: small cleanups for raid5

Resync code:
  A test that isn't needed,
  a 'compute_block' that makes more sense
    elsewhere (And then doesn't need a test),
  a couple of BUG_ONs to confirm the change makes sense.

Printks:
  A few were missing KERN_*

Also fix a typo in a comment..

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid5.c | 41 +++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index fafc4bc..334ff7a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -417,7 +417,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
 		set_bit(R5_UPTODATE, &sh->dev[i].flags);
 #endif
 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
-			printk("R5: read error corrected!!\n");
+			printk(KERN_INFO "raid5: read error corrected!!\n");
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
 		}
@@ -428,13 +428,14 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
 		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
 		atomic_inc(&conf->disks[i].rdev->read_errors);
 		if (conf->mddev->degraded)
-			printk("R5: read error not correctable.\n");
+			printk(KERN_WARNING "raid5: read error not correctable.\n");
 		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
 			/* Oh, no!!! */
-			printk("R5: read error NOT corrected!!\n");
+			printk(KERN_WARNING "raid5: read error NOT corrected!!\n");
 		else if (atomic_read(&conf->disks[i].rdev->read_errors)
 			 > conf->max_nr_stripes)
-			printk("raid5: Too many read errors, failing device.\n");
+			printk(KERN_WARNING
+			       "raid5: Too many read errors, failing device.\n");
 		else
 			retry = 1;
 		if (retry)
@@ -604,7 +605,7 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
 			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
 			break;
 		default:
-			printk("raid5: unsupported algorithm %d\n",
+			printk(KERN_ERR "raid5: unsupported algorithm %d\n",
 				conf->algorithm);
 	}
 
@@ -645,7 +646,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 			i -= (sh->pd_idx + 1);
 			break;
 		default:
-			printk("raid5: unsupported algorithm %d\n",
+			printk(KERN_ERR "raid5: unsupported algorithm %d\n",
 				conf->algorithm);
 	}
 
@@ -654,7 +655,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 
 	check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
 	if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
-		printk("compute_blocknr: map not correct\n");
+		printk(KERN_ERR "compute_blocknr: map not correct\n");
 		return 0;
 	}
 	return r_sector;
@@ -737,7 +738,7 @@ static void compute_block(struct stripe_head *sh, int dd_idx)
 		if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
 			ptr[count++] = p;
 		else
-			printk("compute_block() %d, stripe %llu, %d"
+			printk(KERN_ERR "compute_block() %d, stripe %llu, %d"
 				" not present\n", dd_idx,
 				(unsigned long long)sh->sector, i);
 
@@ -1005,7 +1006,7 @@ static void handle_stripe(struct stripe_head *sh)
 		if (dev->written) written++;
 		rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */
 		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
-			/* The ReadError flag wil just be confusing now */
+			/* The ReadError flag will just be confusing now */
 			clear_bit(R5_ReadError, &dev->flags);
 			clear_bit(R5_ReWrite, &dev->flags);
 		}
@@ -1288,7 +1289,7 @@ static void handle_stripe(struct stripe_head *sh)
 	 * is available
 	 */
 	if (syncing && locked == 0 &&
-	    !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
+	    !test_bit(STRIPE_INSYNC, &sh->state)) {
 		set_bit(STRIPE_HANDLE, &sh->state);
 		if (failed == 0) {
 			char *pagea;
@@ -1306,21 +1307,20 @@ static void handle_stripe(struct stripe_head *sh)
 				if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
 					/* don't try to repair!! */
 					set_bit(STRIPE_INSYNC, &sh->state);
+				else {
+					compute_block(sh, sh->pd_idx);
+					uptodate++;
+				}
 			}
 		}
 		if (!test_bit(STRIPE_INSYNC, &sh->state)) {
+			/* either failed parity check, or recovery is happening */
 			if (failed==0)
 				failed_num = sh->pd_idx;
-			/* should be able to compute the missing block and write it to spare */
-			if (!test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)) {
-				if (uptodate+1 != disks)
-					BUG();
-				compute_block(sh, failed_num);
-				uptodate++;
-			}
-			if (uptodate != disks)
-				BUG();
 			dev = &sh->dev[failed_num];
+			BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
+			BUG_ON(uptodate != disks);
+
 			set_bit(R5_LOCKED, &dev->flags);
 			set_bit(R5_Wantwrite, &dev->flags);
 			clear_bit(STRIPE_DEGRADED, &sh->state);
@@ -1822,7 +1822,8 @@ static int run(mddev_t *mddev)
 	struct list_head *tmp;
 
 	if (mddev->level != 5 && mddev->level != 4) {
-		printk("raid5: %s: raid level not set to 4/5 (%d)\n", mdname(mddev), mddev->level);
+		printk(KERN_ERR "raid5: %s: raid level not set to 4/5 (%d)\n",
+		       mdname(mddev), mddev->level);
 		return -EIO;
 	}
 
-- 
cgit v1.1


From 6ff8d8ec06690f4011a6c3ad9e0759b9094f0601 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:15 -0800
Subject: [PATCH] md: allow dirty raid[456] arrays to be started at boot

See patch to md.txt for more details

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt      | 24 ++++++++++++++++++++++++
 drivers/md/md.c           |  4 ++++
 drivers/md/raid5.c        | 15 +++++++++++----
 drivers/md/raid6main.c    | 13 +++++++++----
 include/linux/raid/md_k.h |  1 +
 5 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index 23e6cce..1dd0fb6 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -51,6 +51,30 @@ superblock can be autodetected and run at boot time.
 The kernel parameter "raid=partitionable" (or "raid=part") means
 that all auto-detected arrays are assembled as partitionable.
 
+Boot time assembly of degraded/dirty arrays
+-------------------------------------------
+
+If a raid5 or raid6 array is both dirty and degraded, it could have
+undetectable data corruption.  This is because the fact that it is
+'dirty' means that the parity cannot be trusted, and the fact that it
+is degraded means that some datablocks are missing and cannot reliably
+be reconstructed (due to no parity).
+
+For this reason, md will normally refuse to start such an array.  This
+requires the sysadmin to take action to explicitly start the array
+despite possible corruption.  This is normally done with
+   mdadm --assemble --force ....
+
+This option is not really available if the array has the root
+filesystem on it.  In order to support booting from such an
+array, md supports a module parameter "start_dirty_degraded" which,
+when set to 1, bypasses the checks and allows dirty degraded
+arrays to be started.
+
+So, to boot with a root filesystem of a dirty degraded raid[56], use
+
+   md-mod.start_dirty_degraded=1
+
 
 Superblock formats
 ------------------
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8175a2a..b4fb724 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1937,6 +1937,7 @@ static void md_safemode_timeout(unsigned long data)
 	md_wakeup_thread(mddev->thread);
 }
 
+static int start_dirty_degraded;
 
 static int do_md_run(mddev_t * mddev)
 {
@@ -2048,6 +2049,7 @@ static int do_md_run(mddev_t * mddev)
 	mddev->recovery = 0;
 	mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
 	mddev->barriers_work = 1;
+	mddev->ok_start_degraded = start_dirty_degraded;
 
 	if (start_readonly)
 		mddev->ro = 2; /* read-only, but switch on first write */
@@ -4509,6 +4511,8 @@ static int set_ro(const char *val, struct kernel_param *kp)
 }
 
 module_param_call(start_ro, set_ro, get_ro, NULL, 0600);
+module_param(start_dirty_degraded, int, 0644);
+
 
 EXPORT_SYMBOL(register_md_personality);
 EXPORT_SYMBOL(unregister_md_personality);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 334ff7a..53a0f2c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1904,10 +1904,17 @@ static int run(mddev_t *mddev)
 
 	if (mddev->degraded == 1 &&
 	    mddev->recovery_cp != MaxSector) {
-		printk(KERN_ERR 
-			"raid5: cannot start dirty degraded array for %s\n",
-			mdname(mddev));
-		goto abort;
+		if (mddev->ok_start_degraded)
+			printk(KERN_WARNING
+			       "raid5: starting dirty degraded array: %s"
+			       "- data corruption possible.\n",
+			       mdname(mddev));
+		else {
+			printk(KERN_ERR
+			       "raid5: cannot start dirty degraded array for %s\n",
+			       mdname(mddev));
+			goto abort;
+		}
 	}
 
 	{
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 0000d16..9ac6dcd 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -1929,13 +1929,18 @@ static int run(mddev_t *mddev)
 		goto abort;
 	}
 
-#if 0				/* FIX: For now */
 	if (mddev->degraded > 0 &&
 	    mddev->recovery_cp != MaxSector) {
-		printk(KERN_ERR "raid6: cannot start dirty degraded array for %s\n", mdname(mddev));
-		goto abort;
+		if (mddev->ok_start_degraded)
+			printk(KERN_WARNING "raid6: starting dirty degraded array:%s"
+			       "- data corruption possible.\n",
+			       mdname(mddev));
+		else {
+			printk(KERN_ERR "raid6: cannot start dirty degraded array"
+			       " for %s\n", mdname(mddev));
+			goto abort;
+		}
 	}
-#endif
 
 	{
 		mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6");
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 46629a2..1dd587b 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -183,6 +183,7 @@ struct mddev_s
 	sector_t			resync_mismatches; /* count of sectors where
 							    * parity/replica mismatch found
 							    */
+	int				ok_start_degraded;
 	/* recovery/resync flags 
 	 * NEEDED:   we might need to start a resync/recover
 	 * RUNNING:  a thread is running, or about to be started
-- 
cgit v1.1


From b15c2e57f0f5bf596a19e9c5571e5b07cdfc7363 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:16 -0800
Subject: [PATCH] md: move bitmap_create to after md array has been initialised

This is important because bitmap_create uses
  mddev->resync_max_sectors
and that doesn't have a valid value until after the array
has been initialised (with pers->run()).
[It doesn't make a difference for current personalities that
 support bitmaps, but will make a difference for raid10]

This has the added advantage of meaning we can move the thread->timeout
manipulation inside the bitmap.c code instead of sprinkling identical code
throughout all personalities.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/bitmap.c    |  4 ++++
 drivers/md/md.c        | 16 +++++++++-------
 drivers/md/raid1.c     |  8 --------
 drivers/md/raid5.c     | 11 +----------
 drivers/md/raid6main.c | 11 +----------
 5 files changed, 15 insertions(+), 35 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 252d55d..b65c36d 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1530,6 +1530,8 @@ void bitmap_destroy(mddev_t *mddev)
 		return;
 
 	mddev->bitmap = NULL; /* disconnect from the md device */
+	if (mddev->thread)
+		mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
 
 	bitmap_free(bitmap);
 }
@@ -1636,6 +1638,8 @@ int bitmap_create(mddev_t *mddev)
 
 	if (IS_ERR(bitmap->writeback_daemon))
 		return PTR_ERR(bitmap->writeback_daemon);
+	mddev->thread->timeout = bitmap->daemon_sleep * HZ;
+
 	return bitmap_update_sb(bitmap);
 
  error:
diff --git a/drivers/md/md.c b/drivers/md/md.c
index b4fb724..ee199d4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2054,13 +2054,15 @@ static int do_md_run(mddev_t * mddev)
 	if (start_readonly)
 		mddev->ro = 2; /* read-only, but switch on first write */
 
-	/* before we start the array running, initialise the bitmap */
-	err = bitmap_create(mddev);
-	if (err)
-		printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
-			mdname(mddev), err);
-	else
-		err = mddev->pers->run(mddev);
+	err = mddev->pers->run(mddev);
+	if (!err && mddev->pers->sync_request) {
+		err = bitmap_create(mddev);
+		if (err) {
+			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
+			       mdname(mddev), err);
+			mddev->pers->stop(mddev);
+		}
+	}
 	if (err) {
 		printk(KERN_ERR "md: pers->run() failed ...\n");
 		module_put(mddev->pers->owner);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f520414..c618015 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1611,7 +1611,6 @@ static int run(mddev_t *mddev)
 		       mdname(mddev));
 		goto out_free_conf;
 	}
-	if (mddev->bitmap) mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
 
 	printk(KERN_INFO 
 		"raid1: raid set %s active with %d out of %d mirrors\n",
@@ -1783,13 +1782,6 @@ static void raid1_quiesce(mddev_t *mddev, int state)
 		lower_barrier(conf);
 		break;
 	}
-	if (mddev->thread) {
-		if (mddev->bitmap)
-			mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
-		else
-			mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
-		md_wakeup_thread(mddev->thread);
-	}
 }
 
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 53a0f2c..0d016a8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1964,9 +1964,6 @@ static int run(mddev_t *mddev)
 	/* Ok, everything is just fine now */
 	sysfs_create_group(&mddev->kobj, &raid5_attrs_group);
 
-	if (mddev->bitmap)
-		mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
-
 	mddev->queue->unplug_fn = raid5_unplug_device;
 	mddev->queue->issue_flush_fn = raid5_issue_flush;
 
@@ -2200,14 +2197,8 @@ static void raid5_quiesce(mddev_t *mddev, int state)
 		spin_unlock_irq(&conf->device_lock);
 		break;
 	}
-	if (mddev->thread) {
-		if (mddev->bitmap)
-			mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
-		else
-			mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
-		md_wakeup_thread(mddev->thread);
-	}
 }
+
 static mdk_personality_t raid5_personality=
 {
 	.name		= "raid5",
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 9ac6dcd..304455d 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -1990,9 +1990,6 @@ static int run(mddev_t *mddev)
 	/* Ok, everything is just fine now */
 	mddev->array_size =  mddev->size * (mddev->raid_disks - 2);
 
-	if (mddev->bitmap)
-		mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
-
 	mddev->queue->unplug_fn = raid6_unplug_device;
 	mddev->queue->issue_flush_fn = raid6_issue_flush;
 	return 0;
@@ -2228,14 +2225,8 @@ static void raid6_quiesce(mddev_t *mddev, int state)
 		spin_unlock_irq(&conf->device_lock);
 		break;
 	}
-	if (mddev->thread) {
-		if (mddev->bitmap)
-			mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
-		else
-			mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
-		md_wakeup_thread(mddev->thread);
-	}
 }
+
 static mdk_personality_t raid6_personality=
 {
 	.name		= "raid6",
-- 
cgit v1.1


From 6cce3b23f6f8e974c00af7a9b88f1d413ba368a8 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:16 -0800
Subject: [PATCH] md: write intent bitmap support for raid10

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c             |  10 ++-
 drivers/md/raid10.c         | 178 ++++++++++++++++++++++++++++++++++++++------
 include/linux/raid/raid10.h |   9 ++-
 3 files changed, 171 insertions(+), 26 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index ee199d4..64e7da3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -714,9 +714,10 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 
 		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
 		    mddev->bitmap_file == NULL) {
-			if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) {
+			if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
+			    && mddev->level != 10) {
 				/* FIXME use a better test */
-				printk(KERN_WARNING "md: bitmaps only support for raid1\n");
+				printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
 				return -EINVAL;
 			}
 			mddev->bitmap_offset = mddev->default_bitmap_offset;
@@ -1037,8 +1038,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 
 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
 		    mddev->bitmap_file == NULL ) {
-			if (mddev->level != 1) {
-				printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
+			if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
+			    && mddev->level != 10) {
+				printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
 				return -EINVAL;
 			}
 			mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 50bd7b1..8f58a44 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -18,7 +18,9 @@
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#include "dm-bio-list.h"
 #include <linux/raid/raid10.h>
+#include <linux/raid/bitmap.h>
 
 /*
  * RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -306,9 +308,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
-	if (!uptodate)
+	if (!uptodate) {
 		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
-	else
+		/* an I/O failed, we can't clear the bitmap */
+		set_bit(R10BIO_Degraded, &r10_bio->state);
+	} else
 		/*
 		 * Set R10BIO_Uptodate in our master bio, so that
 		 * we will return a good error code for to the higher
@@ -328,6 +332,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
 	 * already.
 	 */
 	if (atomic_dec_and_test(&r10_bio->remaining)) {
+		/* clear the bitmap if all writes complete successfully */
+		bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
+				r10_bio->sectors,
+				!test_bit(R10BIO_Degraded, &r10_bio->state),
+				0);
 		md_write_end(r10_bio->mddev);
 		raid_end_bio_io(r10_bio);
 	}
@@ -486,8 +495,9 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 	rcu_read_lock();
 	/*
 	 * Check if we can balance. We can balance on the whole
-	 * device if no resync is going on, or below the resync window.
-	 * We take the first readable disk when above the resync window.
+	 * device if no resync is going on (recovery is ok), or below
+	 * the resync window. We take the first readable disk when
+	 * above the resync window.
 	 */
 	if (conf->mddev->recovery_cp < MaxSector
 	    && (this_sector + sectors >= conf->next_resync)) {
@@ -591,7 +601,10 @@ static void unplug_slaves(mddev_t *mddev)
 
 static void raid10_unplug(request_queue_t *q)
 {
+	mddev_t *mddev = q->queuedata;
+
 	unplug_slaves(q->queuedata);
+	md_wakeup_thread(mddev->thread);
 }
 
 static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
@@ -647,12 +660,13 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
  */
 #define RESYNC_DEPTH 32
 
-static void raise_barrier(conf_t *conf)
+static void raise_barrier(conf_t *conf, int force)
 {
+	BUG_ON(force && !conf->barrier);
 	spin_lock_irq(&conf->resync_lock);
 
-	/* Wait until no block IO is waiting */
-	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+	/* Wait until no block IO is waiting (unless 'force') */
+	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
 			    conf->resync_lock,
 			    raid10_unplug(conf->mddev->queue));
 
@@ -710,6 +724,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	int i;
 	int chunk_sects = conf->chunk_mask + 1;
 	const int rw = bio_data_dir(bio);
+	struct bio_list bl;
+	unsigned long flags;
 
 	if (unlikely(bio_barrier(bio))) {
 		bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
@@ -767,6 +783,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
 
 	r10_bio->mddev = mddev;
 	r10_bio->sector = bio->bi_sector;
+	r10_bio->state = 0;
 
 	if (rw == READ) {
 		/*
@@ -811,13 +828,16 @@ static int make_request(request_queue_t *q, struct bio * bio)
 		    !test_bit(Faulty, &rdev->flags)) {
 			atomic_inc(&rdev->nr_pending);
 			r10_bio->devs[i].bio = bio;
-		} else
+		} else {
 			r10_bio->devs[i].bio = NULL;
+			set_bit(R10BIO_Degraded, &r10_bio->state);
+		}
 	}
 	rcu_read_unlock();
 
-	atomic_set(&r10_bio->remaining, 1);
+	atomic_set(&r10_bio->remaining, 0);
 
+	bio_list_init(&bl);
 	for (i = 0; i < conf->copies; i++) {
 		struct bio *mbio;
 		int d = r10_bio->devs[i].devnum;
@@ -835,13 +855,14 @@ static int make_request(request_queue_t *q, struct bio * bio)
 		mbio->bi_private = r10_bio;
 
 		atomic_inc(&r10_bio->remaining);
-		generic_make_request(mbio);
+		bio_list_add(&bl, mbio);
 	}
 
-	if (atomic_dec_and_test(&r10_bio->remaining)) {
-		md_write_end(mddev);
-		raid_end_bio_io(r10_bio);
-	}
+	bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
+	spin_lock_irqsave(&conf->device_lock, flags);
+	bio_list_merge(&conf->pending_bio_list, &bl);
+	blk_plug_device(mddev->queue);
+	spin_unlock_irqrestore(&conf->device_lock, flags);
 
 	return 0;
 }
@@ -999,7 +1020,12 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	if (!enough(conf))
 		return 0;
 
-	for (mirror=0; mirror < mddev->raid_disks; mirror++)
+	if (rdev->saved_raid_disk >= 0 &&
+	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
+		mirror = rdev->saved_raid_disk;
+	else
+		mirror = 0;
+	for ( ; mirror < mddev->raid_disks; mirror++)
 		if ( !(p=conf->mirrors+mirror)->rdev) {
 
 			blk_queue_stack_limits(mddev->queue,
@@ -1015,6 +1041,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
 			found = 1;
+			if (rdev->saved_raid_disk != mirror)
+				conf->fullsync = 1;
 			rcu_assign_pointer(p->rdev, rdev);
 			break;
 		}
@@ -1282,6 +1310,26 @@ static void raid10d(mddev_t *mddev)
 	for (;;) {
 		char b[BDEVNAME_SIZE];
 		spin_lock_irqsave(&conf->device_lock, flags);
+
+		if (conf->pending_bio_list.head) {
+			bio = bio_list_get(&conf->pending_bio_list);
+			blk_remove_plug(mddev->queue);
+			spin_unlock_irqrestore(&conf->device_lock, flags);
+			/* flush any pending bitmap writes to disk before proceeding w/ I/O */
+			if (bitmap_unplug(mddev->bitmap) != 0)
+				printk("%s: bitmap file write failed!\n", mdname(mddev));
+
+			while (bio) { /* submit pending writes */
+				struct bio *next = bio->bi_next;
+				bio->bi_next = NULL;
+				generic_make_request(bio);
+				bio = next;
+			}
+			unplug = 1;
+
+			continue;
+		}
+
 		if (list_empty(head))
 			break;
 		r10_bio = list_entry(head->prev, r10bio_t, retry_list);
@@ -1388,6 +1436,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	sector_t max_sector, nr_sectors;
 	int disk;
 	int i;
+	int max_sync;
+	int sync_blocks;
 
 	sector_t sectors_skipped = 0;
 	int chunks_skipped = 0;
@@ -1401,6 +1451,29 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
 		max_sector = mddev->resync_max_sectors;
 	if (sector_nr >= max_sector) {
+		/* If we aborted, we need to abort the
+		 * sync on the 'current' bitmap chunks (there can
+		 * be several when recovering multiple devices).
+		 * as we may have started syncing it but not finished.
+		 * We can find the current address in
+		 * mddev->curr_resync, but for recovery,
+		 * we need to convert that to several
+		 * virtual addresses.
+		 */
+		if (mddev->curr_resync < max_sector) { /* aborted */
+			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+				bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
+						&sync_blocks, 1);
+			else for (i=0; i<conf->raid_disks; i++) {
+				sector_t sect =
+					raid10_find_virt(conf, mddev->curr_resync, i);
+				bitmap_end_sync(mddev->bitmap, sect,
+						&sync_blocks, 1);
+			}
+		} else /* completed sync */
+			conf->fullsync = 0;
+
+		bitmap_close_sync(mddev->bitmap);
 		close_sync(conf);
 		*skipped = 1;
 		return sectors_skipped;
@@ -1425,8 +1498,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	 */
 	if (!go_faster && conf->nr_waiting)
 		msleep_interruptible(1000);
-	raise_barrier(conf);
-	conf->next_resync = sector_nr;
 
 	/* Again, very different code for resync and recovery.
 	 * Both must result in an r10bio with a list of bios that
@@ -1443,6 +1514,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	 * end_sync_write if we will want to write.
 	 */
 
+	max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
 	if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 		/* recovery... the complicated one */
 		int i, j, k;
@@ -1451,13 +1523,29 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		for (i=0 ; i<conf->raid_disks; i++)
 			if (conf->mirrors[i].rdev &&
 			    !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
+				int still_degraded = 0;
 				/* want to reconstruct this device */
 				r10bio_t *rb2 = r10_bio;
+				sector_t sect = raid10_find_virt(conf, sector_nr, i);
+				int must_sync;
+				/* Unless we are doing a full sync, we only need
+				 * to recover the block if it is set in the bitmap
+				 */
+				must_sync = bitmap_start_sync(mddev->bitmap, sect,
+							      &sync_blocks, 1);
+				if (sync_blocks < max_sync)
+					max_sync = sync_blocks;
+				if (!must_sync &&
+				    !conf->fullsync) {
+					/* yep, skip the sync_blocks here, but don't assume
+					 * that there will never be anything to do here
+					 */
+					chunks_skipped = -1;
+					continue;
+				}
 
 				r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
-				spin_lock_irq(&conf->resync_lock);
-				if (rb2) conf->barrier++;
-				spin_unlock_irq(&conf->resync_lock);
+				raise_barrier(conf, rb2 != NULL);
 				atomic_set(&r10_bio->remaining, 0);
 
 				r10_bio->master_bio = (struct bio*)rb2;
@@ -1465,8 +1553,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 					atomic_inc(&rb2->remaining);
 				r10_bio->mddev = mddev;
 				set_bit(R10BIO_IsRecover, &r10_bio->state);
-				r10_bio->sector = raid10_find_virt(conf, sector_nr, i);
+				r10_bio->sector = sect;
+
 				raid10_find_phys(conf, r10_bio);
+				/* Need to check if this section will still be
+				 * degraded
+				 */
+				for (j=0; j<conf->copies;j++) {
+					int d = r10_bio->devs[j].devnum;
+					if (conf->mirrors[d].rdev == NULL ||
+					    test_bit(Faulty, &conf->mirrors[d].rdev->flags))
+						still_degraded = 1;
+				}
+				must_sync = bitmap_start_sync(mddev->bitmap, sect,
+							      &sync_blocks, still_degraded);
+
 				for (j=0; j<conf->copies;j++) {
 					int d = r10_bio->devs[j].devnum;
 					if (conf->mirrors[d].rdev &&
@@ -1526,10 +1627,22 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	} else {
 		/* resync. Schedule a read for every block at this virt offset */
 		int count = 0;
+
+		if (!bitmap_start_sync(mddev->bitmap, sector_nr,
+				       &sync_blocks, mddev->degraded) &&
+		    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+			/* We can skip this block */
+			*skipped = 1;
+			return sync_blocks + sectors_skipped;
+		}
+		if (sync_blocks < max_sync)
+			max_sync = sync_blocks;
 		r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
 
 		r10_bio->mddev = mddev;
 		atomic_set(&r10_bio->remaining, 0);
+		raise_barrier(conf, 0);
+		conf->next_resync = sector_nr;
 
 		r10_bio->master_bio = NULL;
 		r10_bio->sector = sector_nr;
@@ -1582,6 +1695,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	}
 
 	nr_sectors = 0;
+	if (sector_nr + max_sync < max_sector)
+		max_sector = sector_nr + max_sync;
 	do {
 		struct page *page;
 		int len = PAGE_SIZE;
@@ -1821,6 +1936,26 @@ static int stop(mddev_t *mddev)
 	return 0;
 }
 
+static void raid10_quiesce(mddev_t *mddev, int state)
+{
+	conf_t *conf = mddev_to_conf(mddev);
+
+	switch(state) {
+	case 1:
+		raise_barrier(conf, 0);
+		break;
+	case 0:
+		lower_barrier(conf);
+		break;
+	}
+	if (mddev->thread) {
+		if (mddev->bitmap)
+			mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+		else
+			mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+		md_wakeup_thread(mddev->thread);
+	}
+}
 
 static mdk_personality_t raid10_personality =
 {
@@ -1835,6 +1970,7 @@ static mdk_personality_t raid10_personality =
 	.hot_remove_disk= raid10_remove_disk,
 	.spare_active	= raid10_spare_active,
 	.sync_request	= sync_request,
+	.quiesce	= raid10_quiesce,
 };
 
 static int __init raid_init(void)
diff --git a/include/linux/raid/raid10.h b/include/linux/raid/raid10.h
index 08317b7..b660cbf 100644
--- a/include/linux/raid/raid10.h
+++ b/include/linux/raid/raid10.h
@@ -35,13 +35,19 @@ struct r10_private_data_s {
 	sector_t chunk_mask;
 
 	struct list_head	retry_list;
-	/* for use when syncing mirrors: */
+	/* queue pending writes and submit them on unplug */
+	struct bio_list		pending_bio_list;
+
 
 	spinlock_t		resync_lock;
 	int nr_pending;
 	int nr_waiting;
 	int barrier;
 	sector_t		next_resync;
+	int			fullsync;  /* set to 1 if a full sync is needed,
+					    * (fresh device added).
+					    * Cleared when a sync completes.
+					    */
 
 	wait_queue_head_t	wait_barrier;
 
@@ -100,4 +106,5 @@ struct r10bio_s {
 #define	R10BIO_Uptodate	0
 #define	R10BIO_IsSync	1
 #define	R10BIO_IsRecover 2
+#define	R10BIO_Degraded 3
 #endif
-- 
cgit v1.1


From ca65b73bd9c301d243df93780f7b26579e6c9204 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:17 -0800
Subject: [PATCH] md: fix raid6 resync check/repair code

raid6 currently does not check the P/Q syndromes when doing a resync, it just
calculates the correct value and writes it.  Doing the check can reduce writes
(often to 0) for a resync, and it is needed to properly implement the

  echo check > sync_action

operation.

This patch implements the appropriate checks and tidies up some related code.

It also allows raid6 user-requested resync to bypass the intent bitmap.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid6main.c     | 182 ++++++++++++++++++++++++++-------------------
 include/linux/raid/raid5.h |   2 +
 2 files changed, 108 insertions(+), 76 deletions(-)

diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 304455d..52e8796 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -805,7 +805,7 @@ static void compute_parity(struct stripe_head *sh, int method)
 }
 
 /* Compute one missing block */
-static void compute_block_1(struct stripe_head *sh, int dd_idx)
+static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
 {
 	raid6_conf_t *conf = sh->raid_conf;
 	int i, count, disks = conf->raid_disks;
@@ -821,7 +821,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx)
 		compute_parity(sh, UPDATE_PARITY);
 	} else {
 		ptr[0] = page_address(sh->dev[dd_idx].page);
-		memset(ptr[0], 0, STRIPE_SIZE);
+		if (!nozero) memset(ptr[0], 0, STRIPE_SIZE);
 		count = 1;
 		for (i = disks ; i--; ) {
 			if (i == dd_idx || i == qd_idx)
@@ -838,7 +838,8 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx)
 		}
 		if (count != 1)
 			xor_block(count, STRIPE_SIZE, ptr);
-		set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
+		if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
+		else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
 	}
 }
 
@@ -871,7 +872,7 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
 			return;
 		} else {
 			/* We're missing D+Q; recompute D from P */
-			compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1);
+			compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
 			compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */
 			return;
 		}
@@ -982,6 +983,12 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 }
 
 
+static int page_is_zero(struct page *p)
+{
+	char *a = page_address(p);
+	return ((*(u32*)a) == 0 &&
+		memcmp(a, a+4, STRIPE_SIZE-4)==0);
+}
 /*
  * handle_stripe - do things to a stripe.
  *
@@ -1000,7 +1007,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
  *
  */
 
-static void handle_stripe(struct stripe_head *sh)
+static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 {
 	raid6_conf_t *conf = sh->raid_conf;
 	int disks = conf->raid_disks;
@@ -1228,7 +1235,7 @@ static void handle_stripe(struct stripe_head *sh)
 				if (uptodate == disks-1) {
 					PRINTK("Computing stripe %llu block %d\n",
 					       (unsigned long long)sh->sector, i);
-					compute_block_1(sh, i);
+					compute_block_1(sh, i, 0);
 					uptodate++;
 				} else if ( uptodate == disks-2 && failed >= 2 ) {
 					/* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
@@ -1323,7 +1330,7 @@ static void handle_stripe(struct stripe_head *sh)
 				/* We have failed blocks and need to compute them */
 				switch ( failed ) {
 				case 0:	BUG();
-				case 1: compute_block_1(sh, failed_num[0]); break;
+				case 1: compute_block_1(sh, failed_num[0], 0); break;
 				case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
 				default: BUG();	/* This request should have been failed? */
 				}
@@ -1338,12 +1345,10 @@ static void handle_stripe(struct stripe_head *sh)
 					       (unsigned long long)sh->sector, i);
 					locked++;
 					set_bit(R5_Wantwrite, &sh->dev[i].flags);
-#if 0 /**** FIX: I don't understand the logic here... ****/
-					if (!test_bit(R5_Insync, &sh->dev[i].flags)
-					    || ((i==pd_idx || i==qd_idx) && failed == 0)) /* FIX? */
-						set_bit(STRIPE_INSYNC, &sh->state);
-#endif
 				}
+			/* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
+			set_bit(STRIPE_INSYNC, &sh->state);
+
 			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
 				atomic_dec(&conf->preread_active_stripes);
 				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
@@ -1356,79 +1361,97 @@ static void handle_stripe(struct stripe_head *sh)
 	 * Any reads will already have been scheduled, so we just see if enough data
 	 * is available
 	 */
-	if (syncing && locked == 0 &&
-	    !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 2) {
-		set_bit(STRIPE_HANDLE, &sh->state);
-#if 0 /* RAID-6: Don't support CHECK PARITY yet */
-		if (failed == 0) {
-			char *pagea;
-			if (uptodate != disks)
-				BUG();
-			compute_parity(sh, CHECK_PARITY);
-			uptodate--;
-			pagea = page_address(sh->dev[pd_idx].page);
-			if ((*(u32*)pagea) == 0 &&
-			    !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
-				/* parity is correct (on disc, not in buffer any more) */
-				set_bit(STRIPE_INSYNC, &sh->state);
-			}
-		}
-#endif
-		if (!test_bit(STRIPE_INSYNC, &sh->state)) {
-			int failed_needupdate[2];
-			struct r5dev *adev, *bdev;
-
-			if ( failed < 1 )
-				failed_num[0] = pd_idx;
-			if ( failed < 2 )
-				failed_num[1] = (failed_num[0] == qd_idx) ? pd_idx : qd_idx;
+	if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) {
+		int update_p = 0, update_q = 0;
+		struct r5dev *dev;
 
-			failed_needupdate[0] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[0]].flags);
-			failed_needupdate[1] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[1]].flags);
+		set_bit(STRIPE_HANDLE, &sh->state);
 
-			PRINTK("sync: failed=%d num=%d,%d fnu=%u%u\n",
-			       failed, failed_num[0], failed_num[1], failed_needupdate[0], failed_needupdate[1]);
+		BUG_ON(failed>2);
+		BUG_ON(uptodate < disks);
+		/* Want to check and possibly repair P and Q.
+		 * However there could be one 'failed' device, in which
+		 * case we can only check one of them, possibly using the
+		 * other to generate missing data
+		 */
 
-#if 0  /* RAID-6: This code seems to require that CHECK_PARITY destroys the uptodateness of the parity */
-			/* should be able to compute the missing block(s) and write to spare */
-			if ( failed_needupdate[0] ^ failed_needupdate[1] ) {
-				if (uptodate+1 != disks)
-					BUG();
-				compute_block_1(sh, failed_needupdate[0] ? failed_num[0] : failed_num[1]);
-				uptodate++;
-			} else if ( failed_needupdate[0] & failed_needupdate[1] ) {
-				if (uptodate+2 != disks)
-					BUG();
-				compute_block_2(sh, failed_num[0], failed_num[1]);
-				uptodate += 2;
+		/* If !tmp_page, we cannot do the calculations,
+		 * but as we have set STRIPE_HANDLE, we will soon be called
+		 * by raid6d with a tmp_page - just wait until then.
+		 */
+		if (tmp_page) {
+			if (failed == q_failed) {
+				/* The only possible failed device holds 'Q', so it makes
+				 * sense to check P (If anything else were failed, we would
+				 * have used P to recreate it).
+				 */
+				compute_block_1(sh, pd_idx, 1);
+				if (!page_is_zero(sh->dev[pd_idx].page)) {
+					compute_block_1(sh,pd_idx,0);
+					update_p = 1;
+				}
+			}
+			if (!q_failed && failed < 2) {
+				/* q is not failed, and we didn't use it to generate
+				 * anything, so it makes sense to check it
+				 */
+				memcpy(page_address(tmp_page),
+				       page_address(sh->dev[qd_idx].page),
+				       STRIPE_SIZE);
+				compute_parity(sh, UPDATE_PARITY);
+				if (memcmp(page_address(tmp_page),
+					   page_address(sh->dev[qd_idx].page),
+					   STRIPE_SIZE)!= 0) {
+					clear_bit(STRIPE_INSYNC, &sh->state);
+					update_q = 1;
+				}
+			}
+			if (update_p || update_q) {
+				conf->mddev->resync_mismatches += STRIPE_SECTORS;
+				if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+					/* don't try to repair!! */
+					update_p = update_q = 0;
 			}
-#else
-			compute_block_2(sh, failed_num[0], failed_num[1]);
-			uptodate += failed_needupdate[0] + failed_needupdate[1];
-#endif
 
-			if (uptodate != disks)
-				BUG();
+			/* now write out any block on a failed drive,
+			 * or P or Q if they need it
+			 */
 
-			PRINTK("Marking for sync stripe %llu blocks %d,%d\n",
-			       (unsigned long long)sh->sector, failed_num[0], failed_num[1]);
+			if (failed == 2) {
+				dev = &sh->dev[failed_num[1]];
+				locked++;
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantwrite, &dev->flags);
+				set_bit(R5_Syncio, &dev->flags);
+			}
+			if (failed >= 1) {
+				dev = &sh->dev[failed_num[0]];
+				locked++;
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantwrite, &dev->flags);
+				set_bit(R5_Syncio, &dev->flags);
+			}
 
-			/**** FIX: Should we really do both of these unconditionally? ****/
-			adev = &sh->dev[failed_num[0]];
-			locked += !test_bit(R5_LOCKED, &adev->flags);
-			set_bit(R5_LOCKED, &adev->flags);
-			set_bit(R5_Wantwrite, &adev->flags);
-			bdev = &sh->dev[failed_num[1]];
-			locked += !test_bit(R5_LOCKED, &bdev->flags);
-			set_bit(R5_LOCKED, &bdev->flags);
+			if (update_p) {
+				dev = &sh->dev[pd_idx];
+				locked ++;
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantwrite, &dev->flags);
+				set_bit(R5_Syncio, &dev->flags);
+			}
+			if (update_q) {
+				dev = &sh->dev[qd_idx];
+				locked++;
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantwrite, &dev->flags);
+				set_bit(R5_Syncio, &dev->flags);
+			}
 			clear_bit(STRIPE_DEGRADED, &sh->state);
-			set_bit(R5_Wantwrite, &bdev->flags);
 
 			set_bit(STRIPE_INSYNC, &sh->state);
-			set_bit(R5_Syncio, &adev->flags);
-			set_bit(R5_Syncio, &bdev->flags);
 		}
 	}
+
 	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
 		clear_bit(STRIPE_SYNCING, &sh->state);
@@ -1664,7 +1687,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
 			}
 			finish_wait(&conf->wait_for_overlap, &w);
 			raid6_plug_device(conf);
-			handle_stripe(sh);
+			handle_stripe(sh, NULL);
 			release_stripe(sh);
 		} else {
 			/* cannot get stripe for read-ahead, just give-up */
@@ -1728,6 +1751,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		return rv;
 	}
 	if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
 	    !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
 		/* we can skip this block, and probably more */
 		sync_blocks /= STRIPE_SECTORS;
@@ -1765,7 +1789,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	clear_bit(STRIPE_INSYNC, &sh->state);
 	spin_unlock(&sh->lock);
 
-	handle_stripe(sh);
+	handle_stripe(sh, NULL);
 	release_stripe(sh);
 
 	return STRIPE_SECTORS;
@@ -1821,7 +1845,7 @@ static void raid6d (mddev_t *mddev)
 		spin_unlock_irq(&conf->device_lock);
 
 		handled++;
-		handle_stripe(sh);
+		handle_stripe(sh, conf->spare_page);
 		release_stripe(sh);
 
 		spin_lock_irq(&conf->device_lock);
@@ -1860,6 +1884,10 @@ static int run(mddev_t *mddev)
 		goto abort;
 	memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
 
+	conf->spare_page = alloc_page(GFP_KERNEL);
+	if (!conf->spare_page)
+		goto abort;
+
 	spin_lock_init(&conf->device_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
@@ -1996,6 +2024,8 @@ static int run(mddev_t *mddev)
 abort:
 	if (conf) {
 		print_raid6_conf(conf);
+		if (conf->spare_page)
+			page_cache_release(conf->spare_page);
 		if (conf->stripe_hashtbl)
 			free_pages((unsigned long) conf->stripe_hashtbl,
 							HASH_PAGES_ORDER);
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index f025ba6..e9c1c0d 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -228,6 +228,8 @@ struct raid5_private_data {
 					    * Cleared when a sync completes.
 					    */
 
+	struct page 		*spare_page; /* Used when checking P/Q in raid6 */
+
 	/*
 	 * Free stripes pool
 	 */
-- 
cgit v1.1


From d69762e98456b71167865db9e33e732a28dd36ab Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:18 -0800
Subject: [PATCH] md: improve handling of read errors with raid6

This is a simple port of matching functionality across from raid5.  If we get a
read error, we don't kick the drive straight away, but try to over-write with
good data first.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid6main.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 66 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 52e8796..7a51553 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -367,8 +367,8 @@ static void shrink_stripes(raid6_conf_t *conf)
 	conf->slab_cache = NULL;
 }
 
-static int raid6_end_read_request (struct bio * bi, unsigned int bytes_done,
-				   int error)
+static int raid6_end_read_request(struct bio * bi, unsigned int bytes_done,
+				  int error)
 {
  	struct stripe_head *sh = bi->bi_private;
 	raid6_conf_t *conf = sh->raid_conf;
@@ -420,9 +420,35 @@ static int raid6_end_read_request (struct bio * bi, unsigned int bytes_done,
 #else
 		set_bit(R5_UPTODATE, &sh->dev[i].flags);
 #endif
+		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
+			printk(KERN_INFO "raid6: read error corrected!!\n");
+			clear_bit(R5_ReadError, &sh->dev[i].flags);
+			clear_bit(R5_ReWrite, &sh->dev[i].flags);
+		}
+		if (atomic_read(&conf->disks[i].rdev->read_errors))
+			atomic_set(&conf->disks[i].rdev->read_errors, 0);
 	} else {
-		md_error(conf->mddev, conf->disks[i].rdev);
+		int retry = 0;
 		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
+		atomic_inc(&conf->disks[i].rdev->read_errors);
+		if (conf->mddev->degraded)
+			printk(KERN_WARNING "raid6: read error not correctable.\n");
+		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
+			/* Oh, no!!! */
+			printk(KERN_WARNING "raid6: read error NOT corrected!!\n");
+		else if (atomic_read(&conf->disks[i].rdev->read_errors)
+			 > conf->max_nr_stripes)
+			printk(KERN_WARNING
+			       "raid6: Too many read errors, failing device.\n");
+		else
+			retry = 1;
+		if (retry)
+			set_bit(R5_ReadError, &sh->dev[i].flags);
+		else {
+			clear_bit(R5_ReadError, &sh->dev[i].flags);
+			clear_bit(R5_ReWrite, &sh->dev[i].flags);
+			md_error(conf->mddev, conf->disks[i].rdev);
+		}
 	}
 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
 #if 0
@@ -1079,6 +1105,12 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 		if (dev->written) written++;
 		rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */
 		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
+			/* The ReadError flag will just be confusing now */
+			clear_bit(R5_ReadError, &dev->flags);
+			clear_bit(R5_ReWrite, &dev->flags);
+		}
+		if (!rdev || !test_bit(In_sync, &rdev->flags)
+		    || test_bit(R5_ReadError, &dev->flags)) {
 			if ( failed < 2 )
 				failed_num[failed] = i;
 			failed++;
@@ -1095,6 +1127,14 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 	if (failed > 2 && to_read+to_write+written) {
 		for (i=disks; i--; ) {
 			int bitmap_end = 0;
+
+			if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
+				mdk_rdev_t *rdev = conf->disks[i].rdev;
+				if (rdev && test_bit(In_sync, &rdev->flags))
+					/* multiple read failures in one stripe */
+					md_error(conf->mddev, rdev);
+			}
+
 			spin_lock_irq(&conf->device_lock);
 			/* fail all writes first */
 			bi = sh->dev[i].towrite;
@@ -1130,7 +1170,8 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 			}
 
 			/* fail any reads if this device is non-operational */
-			if (!test_bit(R5_Insync, &sh->dev[i].flags)) {
+			if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
+			    test_bit(R5_ReadError, &sh->dev[i].flags)) {
 				bi = sh->dev[i].toread;
 				sh->dev[i].toread = NULL;
 				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
@@ -1457,6 +1498,27 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 		clear_bit(STRIPE_SYNCING, &sh->state);
 	}
 
+	/* If the failed drives are just a ReadError, then we might need
+	 * to progress the repair/check process
+	 */
+	if (failed <= 2 && ! conf->mddev->ro)
+		for (i=0; i<failed;i++) {
+			dev = &sh->dev[failed_num[i]];
+			if (test_bit(R5_ReadError, &dev->flags)
+			    && !test_bit(R5_LOCKED, &dev->flags)
+			    && test_bit(R5_UPTODATE, &dev->flags)
+				) {
+				if (!test_bit(R5_ReWrite, &dev->flags)) {
+					set_bit(R5_Wantwrite, &dev->flags);
+					set_bit(R5_ReWrite, &dev->flags);
+					set_bit(R5_LOCKED, &dev->flags);
+				} else {
+					/* let's read it back */
+					set_bit(R5_Wantread, &dev->flags);
+					set_bit(R5_LOCKED, &dev->flags);
+				}
+			}
+		}
 	spin_unlock(&sh->lock);
 
 	while ((bi=return_bi)) {
-- 
cgit v1.1


From ddaf22abaa831763e75775e6d4c7693504237997 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:19 -0800
Subject: [PATCH] md: attempt to auto-correct read errors in raid1

On a read-error we suspend the array, then synchronously read the block from
other devices until we find one where we can read it.  Then we try writing the
good data back everywhere and make sure it works.  If any write or subsequent
read fails, only then do we fail the device out of the array.

To be able to suspend the array, we need to also keep track of how many
requests are queued for handling by raid1d.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c            |   1 +
 drivers/md/raid1.c         | 115 +++++++++++++++++++++++++++++++++++++++++----
 include/linux/raid/raid1.h |   3 ++
 3 files changed, 109 insertions(+), 10 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 64e7da3..1364a1c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -461,6 +461,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 	bio_put(bio);
 	return ret;
 }
+EXPORT_SYMBOL(sync_page_io);
 
 static int read_disk_sb(mdk_rdev_t * rdev, int size)
 {
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c618015..b3856db 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -191,6 +191,7 @@ static void reschedule_retry(r1bio_t *r1_bio)
 
 	spin_lock_irqsave(&conf->device_lock, flags);
 	list_add(&r1_bio->retry_list, &conf->retry_list);
+	conf->nr_queued ++;
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 
 	wake_up(&conf->wait_barrier);
@@ -245,9 +246,9 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
-	if (!uptodate)
-		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
-	else
+	update_head_pos(mirror, r1_bio);
+
+	if (uptodate || conf->working_disks <= 1) {
 		/*
 		 * Set R1BIO_Uptodate in our master bio, so that
 		 * we will return a good error code for to the higher
@@ -259,14 +260,8 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
 		 */
 		set_bit(R1BIO_Uptodate, &r1_bio->state);
 
-	update_head_pos(mirror, r1_bio);
-
-	/*
-	 * we have only one bio on the read side
-	 */
-	if (uptodate)
 		raid_end_bio_io(r1_bio);
-	else {
+	} else {
 		/*
 		 * oops, read error:
 		 */
@@ -653,6 +648,32 @@ static void allow_barrier(conf_t *conf)
 	wake_up(&conf->wait_barrier);
 }
 
+static void freeze_array(conf_t *conf)
+{
+	/* stop syncio and normal IO and wait for everything to
+	 * go quiet.
+	 * We increment barrier and nr_waiting, and then
+	 * wait until barrier+nr_pending match nr_queued+2
+	 */
+	spin_lock_irq(&conf->resync_lock);
+	conf->barrier++;
+	conf->nr_waiting++;
+	wait_event_lock_irq(conf->wait_barrier,
+			    conf->barrier+conf->nr_pending == conf->nr_queued+2,
+			    conf->resync_lock,
+			    raid1_unplug(conf->mddev->queue));
+	spin_unlock_irq(&conf->resync_lock);
+}
+static void unfreeze_array(conf_t *conf)
+{
+	/* reverse the effect of the freeze */
+	spin_lock_irq(&conf->resync_lock);
+	conf->barrier--;
+	conf->nr_waiting--;
+	wake_up(&conf->wait_barrier);
+	spin_unlock_irq(&conf->resync_lock);
+}
+
 
 /* duplicate the data pages for behind I/O */
 static struct page **alloc_behind_pages(struct bio *bio)
@@ -1196,6 +1217,7 @@ static void raid1d(mddev_t *mddev)
 			break;
 		r1_bio = list_entry(head->prev, r1bio_t, retry_list);
 		list_del(head->prev);
+		conf->nr_queued--;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 
 		mddev = r1_bio->mddev;
@@ -1235,6 +1257,74 @@ static void raid1d(mddev_t *mddev)
 				}
 		} else {
 			int disk;
+
+			/* we got a read error. Maybe the drive is bad.  Maybe just
+			 * the block and we can fix it.
+			 * We freeze all other IO, and try reading the block from
+			 * other devices.  When we find one, we re-write it
+			 * and check that this fixes the read error.
+			 * This is all done synchronously while the array is
+			 * frozen
+			 */
+			sector_t sect = r1_bio->sector;
+			int sectors = r1_bio->sectors;
+			freeze_array(conf);
+			while(sectors) {
+				int s = sectors;
+				int d = r1_bio->read_disk;
+				int success = 0;
+
+				if (s > (PAGE_SIZE>>9))
+					s = PAGE_SIZE >> 9;
+
+				do {
+					rdev = conf->mirrors[d].rdev;
+					if (rdev &&
+					    test_bit(In_sync, &rdev->flags) &&
+					    sync_page_io(rdev->bdev,
+							 sect + rdev->data_offset,
+							 s<<9,
+							 conf->tmppage, READ))
+						success = 1;
+					else {
+						d++;
+						if (d == conf->raid_disks)
+							d = 0;
+					}
+				} while (!success && d != r1_bio->read_disk);
+
+				if (success) {
+					/* write it back and re-read */
+					while (d != r1_bio->read_disk) {
+						if (d==0)
+							d = conf->raid_disks;
+						d--;
+						rdev = conf->mirrors[d].rdev;
+						if (rdev &&
+						    test_bit(In_sync, &rdev->flags)) {
+							if (sync_page_io(rdev->bdev,
+									 sect + rdev->data_offset,
+									 s<<9, conf->tmppage, WRITE) == 0 ||
+							    sync_page_io(rdev->bdev,
+									 sect + rdev->data_offset,
+									 s<<9, conf->tmppage, READ) == 0) {
+								/* Well, this device is dead */
+								md_error(mddev, rdev);
+							}
+						}
+					}
+				} else {
+					/* Cannot read from anywhere -- bye bye array */
+					md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+					break;
+				}
+				sectors -= s;
+				sect += s;
+			}
+
+
+			unfreeze_array(conf);
+
 			bio = r1_bio->bios[r1_bio->read_disk];
 			if ((disk=read_balance(conf, r1_bio)) == -1) {
 				printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
@@ -1529,6 +1619,10 @@ static int run(mddev_t *mddev)
 
 	memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks);
 
+	conf->tmppage = alloc_page(GFP_KERNEL);
+	if (!conf->tmppage)
+		goto out_no_mem;
+
 	conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
 	if (!conf->poolinfo)
 		goto out_no_mem;
@@ -1635,6 +1729,7 @@ out_free_conf:
 		if (conf->r1bio_pool)
 			mempool_destroy(conf->r1bio_pool);
 		kfree(conf->mirrors);
+		__free_page(conf->tmppage);
 		kfree(conf->poolinfo);
 		kfree(conf);
 		mddev->private = NULL;
diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h
index c556742..cbe4238 100644
--- a/include/linux/raid/raid1.h
+++ b/include/linux/raid/raid1.h
@@ -46,6 +46,7 @@ struct r1_private_data_s {
 	spinlock_t		resync_lock;
 	int			nr_pending;
 	int			nr_waiting;
+	int			nr_queued;
 	int			barrier;
 	sector_t		next_resync;
 	int			fullsync;  /* set to 1 if a full sync is needed,
@@ -57,6 +58,8 @@ struct r1_private_data_s {
 
 	struct pool_info	*poolinfo;
 
+	struct page		*tmppage;
+
 	mempool_t *r1bio_pool;
 	mempool_t *r1buf_pool;
 };
-- 
cgit v1.1


From 3e198f7826f830604f3aa7c20359a773e70cbeaa Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:21 -0800
Subject: [PATCH] md: tidyup some issues with raid1 resync and prepare for
 catching read errors

We are dereferencing ->rdev without an rcu lock!

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid1.c | 110 +++++++++++++++++++++++++++--------------------------
 1 file changed, 56 insertions(+), 54 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b3856db..ea1f1eb 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -177,6 +177,13 @@ static inline void free_r1bio(r1bio_t *r1_bio)
 static inline void put_buf(r1bio_t *r1_bio)
 {
 	conf_t *conf = mddev_to_conf(r1_bio->mddev);
+	int i;
+
+	for (i=0; i<conf->raid_disks; i++) {
+		struct bio *bio = r1_bio->bios[i];
+		if (bio->bi_end_io)
+			rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
+	}
 
 	mempool_free(r1_bio, conf->r1buf_pool);
 
@@ -1085,7 +1092,6 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
 			 conf->mirrors[r1_bio->read_disk].rdev);
 	} else
 		set_bit(R1BIO_Uptodate, &r1_bio->state);
-	rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev);
 	reschedule_retry(r1_bio);
 	return 0;
 }
@@ -1116,7 +1122,6 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
 		md_done_sync(mddev, r1_bio->sectors, uptodate);
 		put_buf(r1_bio);
 	}
-	rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
 	return 0;
 }
 
@@ -1153,10 +1158,14 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 	atomic_set(&r1_bio->remaining, 1);
 	for (i = 0; i < disks ; i++) {
 		wbio = r1_bio->bios[i];
-		if (wbio->bi_end_io != end_sync_write)
+		if (wbio->bi_end_io == NULL ||
+		    (wbio->bi_end_io == end_sync_read &&
+		     (i == r1_bio->read_disk ||
+		      !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
 			continue;
 
-		atomic_inc(&conf->mirrors[i].rdev->nr_pending);
+		wbio->bi_rw = WRITE;
+		wbio->bi_end_io = end_sync_write;
 		atomic_inc(&r1_bio->remaining);
 		md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
 
@@ -1388,14 +1397,13 @@ static int init_resync(conf_t *conf)
 static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 {
 	conf_t *conf = mddev_to_conf(mddev);
-	mirror_info_t *mirror;
 	r1bio_t *r1_bio;
 	struct bio *bio;
 	sector_t max_sector, nr_sectors;
-	int disk;
+	int disk = -1;
 	int i;
-	int wonly;
-	int write_targets = 0;
+	int wonly = -1;
+	int write_targets = 0, read_targets = 0;
 	int sync_blocks;
 	int still_degraded = 0;
 
@@ -1447,44 +1455,24 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 
 	conf->next_resync = sector_nr;
 
+	r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
+	rcu_read_lock();
 	/*
-	 * If reconstructing, and >1 working disc,
-	 * could dedicate one to rebuild and others to
-	 * service read requests ..
+	 * If we get a correctably read error during resync or recovery,
+	 * we might want to read from a different device.  So we
+	 * flag all drives that could conceivably be read from for READ,
+	 * and any others (which will be non-In_sync devices) for WRITE.
+	 * If a read fails, we try reading from something else for which READ
+	 * is OK.
 	 */
-	disk = conf->last_used;
-	/* make sure disk is operational */
-	wonly = disk;
-	while (conf->mirrors[disk].rdev == NULL ||
-	       !test_bit(In_sync, &conf->mirrors[disk].rdev->flags) ||
-	       test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags)
-		) {
-		if (conf->mirrors[disk].rdev  &&
-		    test_bit(In_sync, &conf->mirrors[disk].rdev->flags))
-			wonly = disk;
-		if (disk <= 0)
-			disk = conf->raid_disks;
-		disk--;
-		if (disk == conf->last_used) {
-			disk = wonly;
-			break;
-		}
-	}
-	conf->last_used = disk;
-	atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
-
-
-	mirror = conf->mirrors + disk;
-
-	r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
 
 	r1_bio->mddev = mddev;
 	r1_bio->sector = sector_nr;
 	r1_bio->state = 0;
 	set_bit(R1BIO_IsSync, &r1_bio->state);
-	r1_bio->read_disk = disk;
 
 	for (i=0; i < conf->raid_disks; i++) {
+		mdk_rdev_t *rdev;
 		bio = r1_bio->bios[i];
 
 		/* take from bio_init */
@@ -1499,35 +1487,49 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		bio->bi_end_io = NULL;
 		bio->bi_private = NULL;
 
-		if (i == disk) {
-			bio->bi_rw = READ;
-			bio->bi_end_io = end_sync_read;
-		} else if (conf->mirrors[i].rdev == NULL ||
-			   test_bit(Faulty, &conf->mirrors[i].rdev->flags)) {
+		rdev = rcu_dereference(conf->mirrors[i].rdev);
+		if (rdev == NULL ||
+			   test_bit(Faulty, &rdev->flags)) {
 			still_degraded = 1;
 			continue;
-		} else if (!test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
-			   sector_nr + RESYNC_SECTORS > mddev->recovery_cp   ||
-			   test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+		} else if (!test_bit(In_sync, &rdev->flags)) {
 			bio->bi_rw = WRITE;
 			bio->bi_end_io = end_sync_write;
 			write_targets ++;
-		} else
-			/* no need to read or write here */
-			continue;
-		bio->bi_sector = sector_nr + conf->mirrors[i].rdev->data_offset;
-		bio->bi_bdev = conf->mirrors[i].rdev->bdev;
+		} else {
+			/* may need to read from here */
+			bio->bi_rw = READ;
+			bio->bi_end_io = end_sync_read;
+			if (test_bit(WriteMostly, &rdev->flags)) {
+				if (wonly < 0)
+					wonly = i;
+			} else {
+				if (disk < 0)
+					disk = i;
+			}
+			read_targets++;
+		}
+		atomic_inc(&rdev->nr_pending);
+		bio->bi_sector = sector_nr + rdev->data_offset;
+		bio->bi_bdev = rdev->bdev;
 		bio->bi_private = r1_bio;
 	}
+	rcu_read_unlock();
+	if (disk < 0)
+		disk = wonly;
+	r1_bio->read_disk = disk;
 
-	if (write_targets == 0) {
+	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
+		/* extra read targets are also write targets */
+		write_targets += read_targets-1;
+
+	if (write_targets == 0 || read_targets == 0) {
 		/* There is nowhere to write, so all non-sync
 		 * drives must be failed - so we are finished
 		 */
 		sector_t rv = max_sector - sector_nr;
 		*skipped = 1;
 		put_buf(r1_bio);
-		rdev_dec_pending(conf->mirrors[disk].rdev, mddev);
 		return rv;
 	}
 
@@ -1578,10 +1580,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		sync_blocks -= (len>>9);
 	} while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
  bio_full:
-	bio = r1_bio->bios[disk];
+	bio = r1_bio->bios[r1_bio->read_disk];
 	r1_bio->sectors = nr_sectors;
 
-	md_sync_acct(mirror->rdev->bdev, nr_sectors);
+	md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev, nr_sectors);
 
 	generic_make_request(bio);
 
-- 
cgit v1.1


From 69382e85371c232df71524137a806b9c210ec021 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:22 -0800
Subject: [PATCH] md: better handling for read error in raid1 during resync

Handling of read errors during resync is separate from handling of read errors
during normal IO in raid1.  A previous patch added support for read errors
during normal IO.  This one adds support for read errors during resync or
recovery.

The key differences are that we don't need to freeze the array, because the
normal handling of resync means that this part of the array will be idle
except for resync, and the read/overwrite/re-read is needed in a separate
piece of code.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid1.c | 99 ++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 78 insertions(+), 21 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ea1f1eb..14a8fe0 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1072,9 +1072,7 @@ abort:
 
 static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
 {
-	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
-	conf_t *conf = mddev_to_conf(r1_bio->mddev);
 
 	if (bio->bi_size)
 		return 1;
@@ -1087,10 +1085,7 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
 	 * or re-read if the read failed.
 	 * We don't do much here, just schedule handling by raid1d
 	 */
-	if (!uptodate) {
-		md_error(r1_bio->mddev,
-			 conf->mirrors[r1_bio->read_disk].rdev);
-	} else
+	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
 		set_bit(R1BIO_Uptodate, &r1_bio->state);
 	reschedule_retry(r1_bio);
 	return 0;
@@ -1134,27 +1129,89 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 
 	bio = r1_bio->bios[r1_bio->read_disk];
 
-/*
-	if (r1_bio->sector == 0) printk("First sync write startss\n");
-*/
+
 	/*
 	 * schedule writes
 	 */
 	if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
-		/*
-		 * There is no point trying a read-for-reconstruct as
-		 * reconstruct is about to be aborted
+		/* ouch - failed to read all of that.
+		 * Try some synchronous reads of other devices to get
+		 * good data, much like with normal read errors.  Only
+		 * read into the pages we already have so that we don't
+		 * need to re-issue the read request.
+		 * We don't need to freeze the array, because being in an
+		 * active sync request, there is no normal IO, and
+		 * no overlapping syncs.
 		 */
-		char b[BDEVNAME_SIZE];
-		printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error"
-			" for block %llu\n",
-			bdevname(bio->bi_bdev,b), 
-			(unsigned long long)r1_bio->sector);
-		md_done_sync(mddev, r1_bio->sectors, 0);
-		put_buf(r1_bio);
-		return;
+		sector_t sect = r1_bio->sector;
+		int sectors = r1_bio->sectors;
+		int idx = 0;
+
+		while(sectors) {
+			int s = sectors;
+			int d = r1_bio->read_disk;
+			int success = 0;
+			mdk_rdev_t *rdev;
+
+			if (s > (PAGE_SIZE>>9))
+				s = PAGE_SIZE >> 9;
+			do {
+				if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
+					rdev = conf->mirrors[d].rdev;
+					if (sync_page_io(rdev->bdev,
+							 sect + rdev->data_offset,
+							 s<<9,
+							 bio->bi_io_vec[idx].bv_page,
+							 READ)) {
+						success = 1;
+						break;
+					}
+				}
+				d++;
+				if (d == conf->raid_disks)
+					d = 0;
+			} while (!success && d != r1_bio->read_disk);
+
+			if (success) {
+				/* write it back and re-read */
+				set_bit(R1BIO_Uptodate, &r1_bio->state);
+				while (d != r1_bio->read_disk) {
+					if (d == 0)
+						d = conf->raid_disks;
+					d--;
+					if (r1_bio->bios[d]->bi_end_io != end_sync_read)
+						continue;
+					rdev = conf->mirrors[d].rdev;
+					if (sync_page_io(rdev->bdev,
+							 sect + rdev->data_offset,
+							 s<<9,
+							 bio->bi_io_vec[idx].bv_page,
+							 WRITE) == 0 ||
+					    sync_page_io(rdev->bdev,
+							 sect + rdev->data_offset,
+							 s<<9,
+							 bio->bi_io_vec[idx].bv_page,
+							 READ) == 0) {
+						md_error(mddev, rdev);
+					}
+				}
+			} else {
+				char b[BDEVNAME_SIZE];
+				/* Cannot read from anywhere, array is toast */
+				md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+				printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error"
+				       " for block %llu\n",
+				       bdevname(bio->bi_bdev,b),
+				       (unsigned long long)r1_bio->sector);
+				md_done_sync(mddev, r1_bio->sectors, 0);
+				put_buf(r1_bio);
+				return;
+			}
+			sectors -= s;
+			sect += s;
+			idx ++;
+		}
 	}
-
 	atomic_set(&r1_bio->remaining, 1);
 	for (i = 0; i < disks ; i++) {
 		wbio = r1_bio->bios[i];
-- 
cgit v1.1


From cf30a473a02901fe4db37abc0b0fa26dd5ba3f72 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:23 -0800
Subject: [PATCH] md: handle errors when read-only

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid1.c         | 18 +++++++++++-------
 include/linux/raid/raid1.h |  7 +++++++
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 14a8fe0..a8bc93d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -154,7 +154,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
 
 	for (i = 0; i < conf->raid_disks; i++) {
 		struct bio **bio = r1_bio->bios + i;
-		if (*bio)
+		if (*bio && *bio != IO_BLOCKED)
 			bio_put(*bio);
 		*bio = NULL;
 	}
@@ -419,11 +419,13 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 		new_disk = 0;
 
 		for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
+		     r1_bio->bios[new_disk] == IO_BLOCKED ||
 		     !rdev || !test_bit(In_sync, &rdev->flags)
 			     || test_bit(WriteMostly, &rdev->flags);
 		     rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) {
 
-			if (rdev && test_bit(In_sync, &rdev->flags))
+			if (rdev && test_bit(In_sync, &rdev->flags) &&
+				r1_bio->bios[new_disk] != IO_BLOCKED)
 				wonly_disk = new_disk;
 
 			if (new_disk == conf->raid_disks - 1) {
@@ -437,11 +439,13 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 
 	/* make sure the disk is operational */
 	for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
+	     r1_bio->bios[new_disk] == IO_BLOCKED ||
 	     !rdev || !test_bit(In_sync, &rdev->flags) ||
 		     test_bit(WriteMostly, &rdev->flags);
 	     rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) {
 
-		if (rdev && test_bit(In_sync, &rdev->flags))
+		if (rdev && test_bit(In_sync, &rdev->flags) &&
+		    r1_bio->bios[new_disk] != IO_BLOCKED)
 			wonly_disk = new_disk;
 
 		if (new_disk <= 0)
@@ -478,7 +482,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 
-		if (!rdev ||
+		if (!rdev || r1_bio->bios[disk] == IO_BLOCKED ||
 		    !test_bit(In_sync, &rdev->flags) ||
 		    test_bit(WriteMostly, &rdev->flags))
 			continue;
@@ -1335,7 +1339,7 @@ static void raid1d(mddev_t *mddev)
 			sector_t sect = r1_bio->sector;
 			int sectors = r1_bio->sectors;
 			freeze_array(conf);
-			while(sectors) {
+			if (mddev->ro == 0) while(sectors) {
 				int s = sectors;
 				int d = r1_bio->read_disk;
 				int success = 0;
@@ -1388,7 +1392,6 @@ static void raid1d(mddev_t *mddev)
 				sect += s;
 			}
 
-
 			unfreeze_array(conf);
 
 			bio = r1_bio->bios[r1_bio->read_disk];
@@ -1399,7 +1402,8 @@ static void raid1d(mddev_t *mddev)
 				       (unsigned long long)r1_bio->sector);
 				raid_end_bio_io(r1_bio);
 			} else {
-				r1_bio->bios[r1_bio->read_disk] = NULL;
+				r1_bio->bios[r1_bio->read_disk] =
+					mddev->ro ? IO_BLOCKED : NULL;
 				r1_bio->read_disk = disk;
 				bio_put(bio);
 				bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h
index cbe4238..9d5494a 100644
--- a/include/linux/raid/raid1.h
+++ b/include/linux/raid/raid1.h
@@ -109,6 +109,13 @@ struct r1bio_s {
 	/* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/
 };
 
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error.  To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio*)1)
+
 /* bits for r1bio.state */
 #define	R1BIO_Uptodate	0
 #define	R1BIO_IsSync	1
-- 
cgit v1.1


From 9910f16af35419a5382fa7850eecc220103036fa Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:24 -0800
Subject: [PATCH] md: fix up some rdev rcu locking in raid5/6

There is this "FIXME" comment with a typo in it!!  It has been annoying me for
days, so I just had to remove it.

conf->disks[i].rdev should only be accessed if
  - we know we hold a reference or
  - the mddev->reconfig_sem is down or
  - we have a rcu_readlock

handle_stripe was referencing rdev in three places without any of these.  For
the first two, get an rcu_readlock.  For the last, the same access
(md_sync_acct call) is made a little later after the rdev has been claimed
under an rcu_readlock, if R5_Syncio is set.  So just use that access...
However R5_Syncio isn't really needed as the 'syncing' variable contains the
same information.  So use that instead.

Issues, comment, and fix are identical in raid5 and raid6.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid5.c         | 16 ++++++++--------
 drivers/md/raid6main.c     | 19 ++++++++-----------
 include/linux/raid/raid5.h |  1 -
 3 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0d016a8..0222ba1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -961,11 +961,11 @@ static void handle_stripe(struct stripe_head *sh)
 	syncing = test_bit(STRIPE_SYNCING, &sh->state);
 	/* Now to look around and see what can be done */
 
+	rcu_read_lock();
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
 		dev = &sh->dev[i];
 		clear_bit(R5_Insync, &dev->flags);
-		clear_bit(R5_Syncio, &dev->flags);
 
 		PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
 			i, dev->flags, dev->toread, dev->towrite, dev->written);
@@ -1004,7 +1004,7 @@ static void handle_stripe(struct stripe_head *sh)
 				non_overwrite++;
 		}
 		if (dev->written) written++;
-		rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */
+		rdev = rcu_dereference(conf->disks[i].rdev);
 		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
 			/* The ReadError flag will just be confusing now */
 			clear_bit(R5_ReadError, &dev->flags);
@@ -1017,6 +1017,7 @@ static void handle_stripe(struct stripe_head *sh)
 		} else
 			set_bit(R5_Insync, &dev->flags);
 	}
+	rcu_read_unlock();
 	PRINTK("locked=%d uptodate=%d to_read=%d"
 		" to_write=%d failed=%d failed_num=%d\n",
 		locked, uptodate, to_read, to_write, failed, failed_num);
@@ -1028,10 +1029,13 @@ static void handle_stripe(struct stripe_head *sh)
 			int bitmap_end = 0;
 
 			if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
-				mdk_rdev_t *rdev = conf->disks[i].rdev;
+				mdk_rdev_t *rdev;
+				rcu_read_lock();
+				rdev = rcu_dereference(conf->disks[i].rdev);
 				if (rdev && test_bit(In_sync, &rdev->flags))
 					/* multiple read failures in one stripe */
 					md_error(conf->mddev, rdev);
+				rcu_read_unlock();
 			}
 
 			spin_lock_irq(&conf->device_lock);
@@ -1180,9 +1184,6 @@ static void handle_stripe(struct stripe_head *sh)
 					locked++;
 					PRINTK("Reading block %d (sync=%d)\n", 
 						i, syncing);
-					if (syncing)
-						md_sync_acct(conf->disks[i].rdev->bdev,
-							     STRIPE_SECTORS);
 				}
 			}
 		}
@@ -1326,7 +1327,6 @@ static void handle_stripe(struct stripe_head *sh)
 			clear_bit(STRIPE_DEGRADED, &sh->state);
 			locked++;
 			set_bit(STRIPE_INSYNC, &sh->state);
-			set_bit(R5_Syncio, &dev->flags);
 		}
 	}
 	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
@@ -1392,7 +1392,7 @@ static void handle_stripe(struct stripe_head *sh)
 		rcu_read_unlock();
  
 		if (rdev) {
-			if (test_bit(R5_Syncio, &sh->dev[i].flags))
+			if (syncing)
 				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
 
 			bi->bi_bdev = rdev->bdev;
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 7a51553..b5b7a8d 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -1060,11 +1060,11 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 	syncing = test_bit(STRIPE_SYNCING, &sh->state);
 	/* Now to look around and see what can be done */
 
+	rcu_read_lock();
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
 		dev = &sh->dev[i];
 		clear_bit(R5_Insync, &dev->flags);
-		clear_bit(R5_Syncio, &dev->flags);
 
 		PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
 			i, dev->flags, dev->toread, dev->towrite, dev->written);
@@ -1103,7 +1103,7 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 				non_overwrite++;
 		}
 		if (dev->written) written++;
-		rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */
+		rdev = rcu_dereference(conf->disks[i].rdev);
 		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
 			/* The ReadError flag will just be confusing now */
 			clear_bit(R5_ReadError, &dev->flags);
@@ -1117,6 +1117,7 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 		} else
 			set_bit(R5_Insync, &dev->flags);
 	}
+	rcu_read_unlock();
 	PRINTK("locked=%d uptodate=%d to_read=%d"
 	       " to_write=%d failed=%d failed_num=%d,%d\n",
 	       locked, uptodate, to_read, to_write, failed,
@@ -1129,10 +1130,13 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 			int bitmap_end = 0;
 
 			if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
-				mdk_rdev_t *rdev = conf->disks[i].rdev;
+				mdk_rdev_t *rdev;
+				rcu_read_lock();
+				rdev = rcu_dereference(conf->disks[i].rdev);
 				if (rdev && test_bit(In_sync, &rdev->flags))
 					/* multiple read failures in one stripe */
 					md_error(conf->mddev, rdev);
+				rcu_read_unlock();
 			}
 
 			spin_lock_irq(&conf->device_lock);
@@ -1307,9 +1311,6 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 					locked++;
 					PRINTK("Reading block %d (sync=%d)\n",
 						i, syncing);
-					if (syncing)
-						md_sync_acct(conf->disks[i].rdev->bdev,
-							     STRIPE_SECTORS);
 				}
 			}
 		}
@@ -1463,14 +1464,12 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 				locked++;
 				set_bit(R5_LOCKED, &dev->flags);
 				set_bit(R5_Wantwrite, &dev->flags);
-				set_bit(R5_Syncio, &dev->flags);
 			}
 			if (failed >= 1) {
 				dev = &sh->dev[failed_num[0]];
 				locked++;
 				set_bit(R5_LOCKED, &dev->flags);
 				set_bit(R5_Wantwrite, &dev->flags);
-				set_bit(R5_Syncio, &dev->flags);
 			}
 
 			if (update_p) {
@@ -1478,14 +1477,12 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 				locked ++;
 				set_bit(R5_LOCKED, &dev->flags);
 				set_bit(R5_Wantwrite, &dev->flags);
-				set_bit(R5_Syncio, &dev->flags);
 			}
 			if (update_q) {
 				dev = &sh->dev[qd_idx];
 				locked++;
 				set_bit(R5_LOCKED, &dev->flags);
 				set_bit(R5_Wantwrite, &dev->flags);
-				set_bit(R5_Syncio, &dev->flags);
 			}
 			clear_bit(STRIPE_DEGRADED, &sh->state);
 
@@ -1557,7 +1554,7 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 		rcu_read_unlock();
 
 		if (rdev) {
-			if (test_bit(R5_Syncio, &sh->dev[i].flags))
+			if (syncing)
 				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
 
 			bi->bi_bdev = rdev->bdev;
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index e9c1c0d..28fcd75 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -152,7 +152,6 @@ struct stripe_head {
 #define	R5_Insync	3	/* rdev && rdev->in_sync at start */
 #define	R5_Wantread	4	/* want to schedule a read */
 #define	R5_Wantwrite	5
-#define	R5_Syncio	6	/* this io need to be accounted as resync io */
 #define	R5_Overlap	7	/* There is a pending overlapping request on this block */
 #define	R5_ReadError	8	/* seen a read error here recently */
 #define	R5_ReWrite	9	/* have tried to over-write the readerror */
-- 
cgit v1.1


From 18f08819f42b647783e4f6ea99141623881bf182 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:25 -0800
Subject: [PATCH] md: support check-without-repair of raid10 arrays

Also keep count of the number of errors found.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid10.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8f58a44..1fa70c3 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1206,6 +1206,10 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
 				break;
 		if (j == vcnt)
 			continue;
+		mddev->resync_mismatches += r10_bio->sectors;
+		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+			/* Don't fix anything. */
+			continue;
 		/* Ok, we need to write this bio
 		 * First we need to fixup bv_offset, bv_len and
 		 * bi_vecs, as the read request might have corrupted these
-- 
cgit v1.1


From d11c171e636cfd2df818cf3411d88222c2f4fcef Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:26 -0800
Subject: [PATCH] md: allow raid1 to check consistency

When performing a user-requested 'check' or 'repair', we read all readable
devices, and compare the contents.  We only write to blocks which had read
errors, or blocks with content that differs from the first good device found.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid1.c | 156 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 128 insertions(+), 28 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a8bc93d..7fbb608 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -106,15 +106,30 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 	}
 	/*
 	 * Allocate RESYNC_PAGES data pages and attach them to
-	 * the first bio;
+	 * the first bio.
+	 * If this is a user-requested check/repair, allocate
+	 * RESYNC_PAGES for each bio.
 	 */
-	bio = r1_bio->bios[0];
-	for (i = 0; i < RESYNC_PAGES; i++) {
-		page = alloc_page(gfp_flags);
-		if (unlikely(!page))
-			goto out_free_pages;
-
-		bio->bi_io_vec[i].bv_page = page;
+	if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
+		j = pi->raid_disks;
+	else
+		j = 1;
+	while(j--) {
+		bio = r1_bio->bios[j];
+		for (i = 0; i < RESYNC_PAGES; i++) {
+			page = alloc_page(gfp_flags);
+			if (unlikely(!page))
+				goto out_free_pages;
+
+			bio->bi_io_vec[i].bv_page = page;
+		}
+	}
+	/* If not user-requested, copy the page pointers to all bios */
+	if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
+		for (i=0; i<RESYNC_PAGES ; i++)
+			for (j=1; j<pi->raid_disks; j++)
+				r1_bio->bios[j]->bi_io_vec[i].bv_page =
+					r1_bio->bios[0]->bi_io_vec[i].bv_page;
 	}
 
 	r1_bio->master_bio = NULL;
@@ -122,8 +137,10 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 	return r1_bio;
 
 out_free_pages:
-	for ( ; i > 0 ; i--)
-		__free_page(bio->bi_io_vec[i-1].bv_page);
+	for (i=0; i < RESYNC_PAGES ; i++)
+		for (j=0 ; j < pi->raid_disks; j++)
+			__free_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
+	j = -1;
 out_free_bio:
 	while ( ++j < pi->raid_disks )
 		bio_put(r1_bio->bios[j]);
@@ -134,14 +151,16 @@ out_free_bio:
 static void r1buf_pool_free(void *__r1_bio, void *data)
 {
 	struct pool_info *pi = data;
-	int i;
+	int i,j;
 	r1bio_t *r1bio = __r1_bio;
-	struct bio *bio = r1bio->bios[0];
 
-	for (i = 0; i < RESYNC_PAGES; i++) {
-		__free_page(bio->bi_io_vec[i].bv_page);
-		bio->bi_io_vec[i].bv_page = NULL;
-	}
+	for (i = 0; i < RESYNC_PAGES; i++)
+		for (j = pi->raid_disks; j-- ;) {
+			if (j == 0 ||
+			    r1bio->bios[j]->bi_io_vec[i].bv_page !=
+			    r1bio->bios[0]->bi_io_vec[i].bv_page)
+				__free_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
+		}
 	for (i=0 ; i < pi->raid_disks; i++)
 		bio_put(r1bio->bios[i]);
 
@@ -1077,13 +1096,16 @@ abort:
 static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
 {
 	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
+	int i;
 
 	if (bio->bi_size)
 		return 1;
 
-	if (r1_bio->bios[r1_bio->read_disk] != bio)
-		BUG();
-	update_head_pos(r1_bio->read_disk, r1_bio);
+	for (i=r1_bio->mddev->raid_disks; i--; )
+		if (r1_bio->bios[i] == bio)
+			break;
+	BUG_ON(i < 0);
+	update_head_pos(i, r1_bio);
 	/*
 	 * we have read a block, now it needs to be re-written,
 	 * or re-read if the read failed.
@@ -1091,7 +1113,9 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
 	 */
 	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
 		set_bit(R1BIO_Uptodate, &r1_bio->state);
-	reschedule_retry(r1_bio);
+
+	if (atomic_dec_and_test(&r1_bio->remaining))
+		reschedule_retry(r1_bio);
 	return 0;
 }
 
@@ -1134,9 +1158,65 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 	bio = r1_bio->bios[r1_bio->read_disk];
 
 
-	/*
-	 * schedule writes
-	 */
+	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+		/* We have read all readable devices.  If we haven't
+		 * got the block, then there is no hope left.
+		 * If we have, then we want to do a comparison
+		 * and skip the write if everything is the same.
+		 * If any blocks failed to read, then we need to
+		 * attempt an over-write
+		 */
+		int primary;
+		if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+			for (i=0; i<mddev->raid_disks; i++)
+				if (r1_bio->bios[i]->bi_end_io == end_sync_read)
+					md_error(mddev, conf->mirrors[i].rdev);
+
+			md_done_sync(mddev, r1_bio->sectors, 1);
+			put_buf(r1_bio);
+			return;
+		}
+		for (primary=0; primary<mddev->raid_disks; primary++)
+			if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
+			    test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
+				r1_bio->bios[primary]->bi_end_io = NULL;
+				break;
+			}
+		r1_bio->read_disk = primary;
+		for (i=0; i<mddev->raid_disks; i++)
+			if (r1_bio->bios[i]->bi_end_io == end_sync_read &&
+			    test_bit(BIO_UPTODATE, &r1_bio->bios[i]->bi_flags)) {
+				int j;
+				int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
+				struct bio *pbio = r1_bio->bios[primary];
+				struct bio *sbio = r1_bio->bios[i];
+				for (j = vcnt; j-- ; )
+					if (memcmp(page_address(pbio->bi_io_vec[j].bv_page),
+						   page_address(sbio->bi_io_vec[j].bv_page),
+						   PAGE_SIZE))
+						break;
+				if (j >= 0)
+					mddev->resync_mismatches += r1_bio->sectors;
+				if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+					sbio->bi_end_io = NULL;
+				else {
+					/* fixup the bio for reuse */
+					sbio->bi_vcnt = vcnt;
+					sbio->bi_size = r1_bio->sectors << 9;
+					sbio->bi_idx = 0;
+					sbio->bi_phys_segments = 0;
+					sbio->bi_hw_segments = 0;
+					sbio->bi_hw_front_size = 0;
+					sbio->bi_hw_back_size = 0;
+					sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
+					sbio->bi_flags |= 1 << BIO_UPTODATE;
+					sbio->bi_next = NULL;
+					sbio->bi_sector = r1_bio->sector +
+						conf->mirrors[i].rdev->data_offset;
+					sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+				}
+			}
+	}
 	if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
 		/* ouch - failed to read all of that.
 		 * Try some synchronous reads of other devices to get
@@ -1216,6 +1296,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 			idx ++;
 		}
 	}
+
+	/*
+	 * schedule writes
+	 */
 	atomic_set(&r1_bio->remaining, 1);
 	for (i = 0; i < disks ; i++) {
 		wbio = r1_bio->bios[i];
@@ -1618,10 +1702,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		for (i=0 ; i < conf->raid_disks; i++) {
 			bio = r1_bio->bios[i];
 			if (bio->bi_end_io) {
-				page = r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page;
+				page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
 				if (bio_add_page(bio, page, len, 0) == 0) {
 					/* stop here */
-					r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page = page;
+					bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
 					while (i > 0) {
 						i--;
 						bio = r1_bio->bios[i];
@@ -1641,12 +1725,28 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		sync_blocks -= (len>>9);
 	} while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
  bio_full:
-	bio = r1_bio->bios[r1_bio->read_disk];
 	r1_bio->sectors = nr_sectors;
 
-	md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev, nr_sectors);
+	/* For a user-requested sync, we read all readable devices and do a
+	 * compare
+	 */
+	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+		atomic_set(&r1_bio->remaining, read_targets);
+		for (i=0; i<conf->raid_disks; i++) {
+			bio = r1_bio->bios[i];
+			if (bio->bi_end_io == end_sync_read) {
+				md_sync_acct(conf->mirrors[i].rdev->bdev, nr_sectors);
+				generic_make_request(bio);
+			}
+		}
+	} else {
+		atomic_set(&r1_bio->remaining, 1);
+		bio = r1_bio->bios[r1_bio->read_disk];
+		md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev,
+			     nr_sectors);
+		generic_make_request(bio);
 
-	generic_make_request(bio);
+	}
 
 	return nr_sectors;
 }
-- 
cgit v1.1


From 220946c9018de74b952446e3a4dff1bfd4cbf310 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:27 -0800
Subject: [PATCH] md: make sure read error on last working drive of raid1
 actually returns failure

We are inadvertently setting the R1BIO_Uptodate bit on read errors when we
decide not to try correcting (because there are no other working devices).
This means that the read error is reported to the client as success.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid1.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7fbb608..6c10f28 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -284,7 +284,8 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
 		 * user-side. So if something waits for IO, then it will
 		 * wait for the 'master' bio.
 		 */
-		set_bit(R1BIO_Uptodate, &r1_bio->state);
+		if (uptodate)
+			set_bit(R1BIO_Uptodate, &r1_bio->state);
 
 		raid_end_bio_io(r1_bio);
 	} else {
-- 
cgit v1.1


From 4443ae10ca15d07922ceda622f03db8865fa3d13 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:28 -0800
Subject: [PATCH] md: auto-correct correctable read errors in raid10

Largely just a cross-port from raid1.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid10.c         | 127 ++++++++++++++++++++++++++++++++++++++------
 include/linux/raid/raid10.h |   2 +
 2 files changed, 114 insertions(+), 15 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1fa70c3..64bb4dd 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -209,6 +209,7 @@ static void reschedule_retry(r10bio_t *r10_bio)
 
 	spin_lock_irqsave(&conf->device_lock, flags);
 	list_add(&r10_bio->retry_list, &conf->retry_list);
+	conf->nr_queued ++;
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 
 	md_wakeup_thread(mddev->thread);
@@ -254,9 +255,9 @@ static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
-	if (!uptodate)
-		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
-	else
+	update_head_pos(slot, r10_bio);
+
+	if (uptodate) {
 		/*
 		 * Set R10BIO_Uptodate in our master bio, so that
 		 * we will return a good error code to the higher
@@ -267,15 +268,8 @@ static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int
 		 * wait for the 'master' bio.
 		 */
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
-
-	update_head_pos(slot, r10_bio);
-
-	/*
-	 * we have only one bio on the read side
-	 */
-	if (uptodate)
 		raid_end_bio_io(r10_bio);
-	else {
+	} else {
 		/*
 		 * oops, read error:
 		 */
@@ -714,6 +708,33 @@ static void allow_barrier(conf_t *conf)
 	wake_up(&conf->wait_barrier);
 }
 
+static void freeze_array(conf_t *conf)
+{
+	/* stop syncio and normal IO and wait for everything to
+	 * go quiet.
+	 * We increment barrier and nr_waiting, and then
+	 * wait until barrier+nr_pending match nr_queued+2
+	 */
+	spin_lock_irq(&conf->resync_lock);
+	conf->barrier++;
+	conf->nr_waiting++;
+	wait_event_lock_irq(conf->wait_barrier,
+			    conf->barrier+conf->nr_pending == conf->nr_queued+2,
+			    conf->resync_lock,
+			    raid10_unplug(conf->mddev->queue));
+	spin_unlock_irq(&conf->resync_lock);
+}
+
+static void unfreeze_array(conf_t *conf)
+{
+	/* reverse the effect of the freeze */
+	spin_lock_irq(&conf->resync_lock);
+	conf->barrier--;
+	conf->nr_waiting--;
+	wake_up(&conf->wait_barrier);
+	spin_unlock_irq(&conf->resync_lock);
+}
+
 static int make_request(request_queue_t *q, struct bio * bio)
 {
 	mddev_t *mddev = q->queuedata;
@@ -1338,6 +1359,7 @@ static void raid10d(mddev_t *mddev)
 			break;
 		r10_bio = list_entry(head->prev, r10bio_t, retry_list);
 		list_del(head->prev);
+		conf->nr_queued--;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 
 		mddev = r10_bio->mddev;
@@ -1350,6 +1372,78 @@ static void raid10d(mddev_t *mddev)
 			unplug = 1;
 		} else {
 			int mirror;
+			/* we got a read error. Maybe the drive is bad.  Maybe just
+			 * the block and we can fix it.
+			 * We freeze all other IO, and try reading the block from
+			 * other devices.  When we find one, we re-write
+			 * and check that this fixes the read error.
+			 * This is all done synchronously while the array is
+			 * frozen.
+			 */
+			int sect = 0; /* Offset from r10_bio->sector */
+			int sectors = r10_bio->sectors;
+			freeze_array(conf);
+			if (mddev->ro == 0) while(sectors) {
+				int s = sectors;
+				int sl = r10_bio->read_slot;
+				int success = 0;
+
+				if (s > (PAGE_SIZE>>9))
+					s = PAGE_SIZE >> 9;
+
+				do {
+					int d = r10_bio->devs[sl].devnum;
+					rdev = conf->mirrors[d].rdev;
+					if (rdev &&
+					    test_bit(In_sync, &rdev->flags) &&
+					    sync_page_io(rdev->bdev,
+							 r10_bio->devs[sl].addr +
+							 sect + rdev->data_offset,
+							 s<<9,
+							 conf->tmppage, READ))
+						success = 1;
+					else {
+						sl++;
+						if (sl == conf->copies)
+							sl = 0;
+					}
+				} while (!success && sl != r10_bio->read_slot);
+
+				if (success) {
+					/* write it back and re-read */
+					while (sl != r10_bio->read_slot) {
+						int d;
+						if (sl==0)
+							sl = conf->copies;
+						sl--;
+						d = r10_bio->devs[sl].devnum;
+						rdev = conf->mirrors[d].rdev;
+						if (rdev &&
+						    test_bit(In_sync, &rdev->flags)) {
+							if (sync_page_io(rdev->bdev,
+									 r10_bio->devs[sl].addr +
+									 sect + rdev->data_offset,
+									 s<<9, conf->tmppage, WRITE) == 0 ||
+							    sync_page_io(rdev->bdev,
+									 r10_bio->devs[sl].addr +
+									 sect + rdev->data_offset,
+									 s<<9, conf->tmppage, READ) == 0) {
+								/* Well, this device is dead */
+								md_error(mddev, rdev);
+							}
+						}
+					}
+				} else {
+					/* Cannot read from anywhere -- bye bye array */
+					md_error(mddev, conf->mirrors[r10_bio->devs[r10_bio->read_slot].devnum].rdev);
+					break;
+				}
+				sectors -= s;
+				sect += s;
+			}
+
+			unfreeze_array(conf);
+
 			bio = r10_bio->devs[r10_bio->read_slot].bio;
 			r10_bio->devs[r10_bio->read_slot].bio = NULL;
 			bio_put(bio);
@@ -1793,22 +1887,24 @@ static int run(mddev_t *mddev)
 	 * bookkeeping area. [whatever we allocate in run(),
 	 * should be freed in stop()]
 	 */
-	conf = kmalloc(sizeof(conf_t), GFP_KERNEL);
+	conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
 	mddev->private = conf;
 	if (!conf) {
 		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
 			mdname(mddev));
 		goto out;
 	}
-	memset(conf, 0, sizeof(*conf));
-	conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks,
+	conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
 				 GFP_KERNEL);
 	if (!conf->mirrors) {
 		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
 		       mdname(mddev));
 		goto out_free_conf;
 	}
-	memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks);
+
+	conf->tmppage = alloc_page(GFP_KERNEL);
+	if (!conf->tmppage)
+		goto out_free_conf;
 
 	conf->near_copies = nc;
 	conf->far_copies = fc;
@@ -1918,6 +2014,7 @@ static int run(mddev_t *mddev)
 out_free_conf:
 	if (conf->r10bio_pool)
 		mempool_destroy(conf->r10bio_pool);
+	put_page(conf->tmppage);
 	kfree(conf->mirrors);
 	kfree(conf);
 	mddev->private = NULL;
diff --git a/include/linux/raid/raid10.h b/include/linux/raid/raid10.h
index b660cbf..dfa5283 100644
--- a/include/linux/raid/raid10.h
+++ b/include/linux/raid/raid10.h
@@ -42,6 +42,7 @@ struct r10_private_data_s {
 	spinlock_t		resync_lock;
 	int nr_pending;
 	int nr_waiting;
+	int nr_queued;
 	int barrier;
 	sector_t		next_resync;
 	int			fullsync;  /* set to 1 if a full sync is needed,
@@ -53,6 +54,7 @@ struct r10_private_data_s {
 
 	mempool_t *r10bio_pool;
 	mempool_t *r10buf_pool;
+	struct page		*tmppage;
 };
 
 typedef struct r10_private_data_s conf_t;
-- 
cgit v1.1


From 0eb3ff12aa8a12538ef681dc83f4361636a0699f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:29 -0800
Subject: [PATCH] md: raid10 read-error handling - resync and read-only

Add in correct read-error handling for resync and read-only situations.

When read-only, we don't over-write, so we need to mark the failed drive in
the r10_bio so we don't re-try it.  During resync, we always read all blocks,
so if there is a read error, we simply over-write it with the good block that
we found (assuming we found one).

Note that the recovery case still isn't handled in an interesting way.  There
is nothing useful to do for the 2-copies case.  If there are 3 or more copies,
then we could try reading from one of the non-missing copies, but this is a
bit complicated and very rarely would be used, so I'm leaving it for now.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid10.c         | 56 ++++++++++++++++++++++++++++-----------------
 include/linux/raid/raid10.h |  7 ++++++
 2 files changed, 42 insertions(+), 21 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 64bb4dd..3f8df2e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -172,7 +172,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
 
 	for (i = 0; i < conf->copies; i++) {
 		struct bio **bio = & r10_bio->devs[i].bio;
-		if (*bio)
+		if (*bio && *bio != IO_BLOCKED)
 			bio_put(*bio);
 		*bio = NULL;
 	}
@@ -500,6 +500,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 		disk = r10_bio->devs[slot].devnum;
 
 		while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
+		       r10_bio->devs[slot].bio == IO_BLOCKED ||
 		       !test_bit(In_sync, &rdev->flags)) {
 			slot++;
 			if (slot == conf->copies) {
@@ -517,6 +518,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 	slot = 0;
 	disk = r10_bio->devs[slot].devnum;
 	while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
+	       r10_bio->devs[slot].bio == IO_BLOCKED ||
 	       !test_bit(In_sync, &rdev->flags)) {
 		slot ++;
 		if (slot == conf->copies) {
@@ -537,6 +539,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 
 
 		if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
+		    r10_bio->devs[nslot].bio == IO_BLOCKED ||
 		    !test_bit(In_sync, &rdev->flags))
 			continue;
 
@@ -1104,7 +1107,6 @@ abort:
 
 static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
 {
-	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
 	conf_t *conf = mddev_to_conf(r10_bio->mddev);
 	int i,d;
@@ -1119,7 +1121,10 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
 		BUG();
 	update_head_pos(i, r10_bio);
 	d = r10_bio->devs[i].devnum;
-	if (!uptodate)
+
+	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+		set_bit(R10BIO_Uptodate, &r10_bio->state);
+	else if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
 		md_error(r10_bio->mddev,
 			 conf->mirrors[d].rdev);
 
@@ -1209,25 +1214,30 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
 	fbio = r10_bio->devs[i].bio;
 
 	/* now find blocks with errors */
-	for (i=first+1 ; i < conf->copies ; i++) {
-		int vcnt, j, d;
+	for (i=0 ; i < conf->copies ; i++) {
+		int  j, d;
+		int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
 
-		if (!test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
-			continue;
-		/* We know that the bi_io_vec layout is the same for
-		 * both 'first' and 'i', so we just compare them.
-		 * All vec entries are PAGE_SIZE;
-		 */
 		tbio = r10_bio->devs[i].bio;
-		vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
-		for (j = 0; j < vcnt; j++)
-			if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
-				   page_address(tbio->bi_io_vec[j].bv_page),
-				   PAGE_SIZE))
-				break;
-		if (j == vcnt)
+
+		if (tbio->bi_end_io != end_sync_read)
+			continue;
+		if (i == first)
 			continue;
-		mddev->resync_mismatches += r10_bio->sectors;
+		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
+			/* We know that the bi_io_vec layout is the same for
+			 * both 'first' and 'i', so we just compare them.
+			 * All vec entries are PAGE_SIZE;
+			 */
+			for (j = 0; j < vcnt; j++)
+				if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
+					   page_address(tbio->bi_io_vec[j].bv_page),
+					   PAGE_SIZE))
+					break;
+			if (j == vcnt)
+				continue;
+			mddev->resync_mismatches += r10_bio->sectors;
+		}
 		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
 			/* Don't fix anything. */
 			continue;
@@ -1308,7 +1318,10 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
 
 	atomic_inc(&conf->mirrors[d].rdev->nr_pending);
 	md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
-	generic_make_request(wbio);
+	if (test_bit(R10BIO_Uptodate, &r10_bio->state))
+		generic_make_request(wbio);
+	else
+		bio_endio(wbio, wbio->bi_size, -EIO);
 }
 
 
@@ -1445,7 +1458,8 @@ static void raid10d(mddev_t *mddev)
 			unfreeze_array(conf);
 
 			bio = r10_bio->devs[r10_bio->read_slot].bio;
-			r10_bio->devs[r10_bio->read_slot].bio = NULL;
+			r10_bio->devs[r10_bio->read_slot].bio =
+				mddev->ro ? IO_BLOCKED : NULL;
 			bio_put(bio);
 			mirror = read_balance(conf, r10_bio);
 			if (mirror == -1) {
diff --git a/include/linux/raid/raid10.h b/include/linux/raid/raid10.h
index dfa5283..b110329 100644
--- a/include/linux/raid/raid10.h
+++ b/include/linux/raid/raid10.h
@@ -104,6 +104,13 @@ struct r10bio_s {
 	} devs[0];
 };
 
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error.  To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'devs[].bio' pointer.
+ */
+#define IO_BLOCKED ((struct bio*)1)
+
 /* bits for r10bio.state */
 #define	R10BIO_Uptodate	0
 #define	R10BIO_IsSync	1
-- 
cgit v1.1


From d7603b7e3a7f802c67f9190b2387d4d5d111ec14 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:30 -0800
Subject: [PATCH] md: make /proc/mdstat pollable

With this patch it is possible to poll /proc/mdstat to detect arrays appearing
or disappearing, to detect failures, recovery starting, recovery completing,
and devices being added and removed.

It is similar to the poll-ability of /proc/mounts, though different in that:

We always report that the file is readable (because face it, it is, even if
only for EOF).

We report POLLPRI when there is a change so that select() can detect
it as an exceptional event.  Not only are these exceptional events, but
that is the mechanism that the current 'mdadm' uses to watch for events
(it also polls after a timeout).  We also report POLLERR, as /proc/mounts
does.

Finally, we only reset the per-file event counter when the start of the file
is read, rather than when poll() returns an event.  This is more robust as it
means that an fd will continue to report activity to poll/select until the
program clearly responds to that activity.

md_new_event takes an 'mddev' which isn't currently used, but it will be soon.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 76 insertions(+), 5 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1364a1c..6101879 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -42,6 +42,7 @@
 #include <linux/devfs_fs_kernel.h>
 #include <linux/buffer_head.h> /* for invalidate_bdev */
 #include <linux/suspend.h>
+#include <linux/poll.h>
 
 #include <linux/init.h>
 
@@ -134,6 +135,24 @@ static struct block_device_operations md_fops;
 static int start_readonly;
 
 /*
+ * We have a system wide 'event count' that is incremented
+ * on any 'interesting' event, and readers of /proc/mdstat
+ * can use 'poll' or 'select' to find out when the event
+ * count increases.
+ *
+ * Events are:
+ *  start array, stop array, error, add device, remove device,
+ *  start build, activate spare
+ */
+DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
+static atomic_t md_event_count;
+void md_new_event(mddev_t *mddev)
+{
+	atomic_inc(&md_event_count);
+	wake_up(&md_event_waiters);
+}
+
+/*
  * Enables to iterate over all existing md arrays
  * all_mddevs_lock protects this list.
  */
@@ -2111,6 +2130,7 @@ static int do_md_run(mddev_t * mddev)
 	mddev->queue->make_request_fn = mddev->pers->make_request;
 
 	mddev->changed = 1;
+	md_new_event(mddev);
 	return 0;
 }
 
@@ -2238,6 +2258,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
 		printk(KERN_INFO "md: %s switched to read-only mode.\n",
 			mdname(mddev));
 	err = 0;
+	md_new_event(mddev);
 out:
 	return err;
 }
@@ -2712,6 +2733,7 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev)
 
 	kick_rdev_from_array(rdev);
 	md_update_sb(mddev);
+	md_new_event(mddev);
 
 	return 0;
 busy:
@@ -2802,7 +2824,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 	 */
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
-
+	md_new_event(mddev);
 	return 0;
 
 abort_unbind_export:
@@ -3531,6 +3553,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
+	md_new_event(mddev);
 }
 
 /* seq_file implementation /proc/mdstat */
@@ -3671,12 +3694,17 @@ static void md_seq_stop(struct seq_file *seq, void *v)
 		mddev_put(mddev);
 }
 
+struct mdstat_info {
+	int event;
+};
+
 static int md_seq_show(struct seq_file *seq, void *v)
 {
 	mddev_t *mddev = v;
 	sector_t size;
 	struct list_head *tmp2;
 	mdk_rdev_t *rdev;
+	struct mdstat_info *mi = seq->private;
 	int i;
 	struct bitmap *bitmap;
 
@@ -3689,6 +3717,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
 
 		spin_unlock(&pers_lock);
 		seq_printf(seq, "\n");
+		mi->event = atomic_read(&md_event_count);
 		return 0;
 	}
 	if (v == (void*)2) {
@@ -3797,16 +3826,52 @@ static struct seq_operations md_seq_ops = {
 static int md_seq_open(struct inode *inode, struct file *file)
 {
 	int error;
+	struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
+	if (mi == NULL)
+		return -ENOMEM;
 
 	error = seq_open(file, &md_seq_ops);
+	if (error)
+		kfree(mi);
+	else {
+		struct seq_file *p = file->private_data;
+		p->private = mi;
+		mi->event = atomic_read(&md_event_count);
+	}
 	return error;
 }
 
+static int md_seq_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+	struct mdstat_info *mi = m->private;
+	m->private = NULL;
+	kfree(mi);
+	return seq_release(inode, file);
+}
+
+static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
+{
+	struct seq_file *m = filp->private_data;
+	struct mdstat_info *mi = m->private;
+	int mask;
+
+	poll_wait(filp, &md_event_waiters, wait);
+
+	/* always allow read */
+	mask = POLLIN | POLLRDNORM;
+
+	if (mi->event != atomic_read(&md_event_count))
+		mask |= POLLERR | POLLPRI;
+	return mask;
+}
+
 static struct file_operations md_seq_fops = {
 	.open           = md_seq_open,
 	.read           = seq_read,
 	.llseek         = seq_lseek,
-	.release	= seq_release,
+	.release	= md_seq_release,
+	.poll		= mdstat_poll,
 };
 
 int register_md_personality(int pnum, mdk_personality_t *p)
@@ -4076,7 +4141,11 @@ static void md_do_sync(mddev_t *mddev)
 
 		j += sectors;
 		if (j>1) mddev->curr_resync = j;
-
+		if (last_check == 0)
+			/* this is the earliest that rebuild will be
+			 * visible in /proc/mdstat
+			 */
+			md_new_event(mddev);
 
 		if (last_check + window > io_sectors || j == max_sectors)
 			continue;
@@ -4262,6 +4331,7 @@ void md_check_recovery(mddev_t *mddev)
 			mddev->recovery = 0;
 			/* flag recovery needed just to double check */
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			md_new_event(mddev);
 			goto unlock;
 		}
 		/* Clear some bits that don't mean anything, but
@@ -4299,6 +4369,7 @@ void md_check_recovery(mddev_t *mddev)
 						sprintf(nm, "rd%d", rdev->raid_disk);
 						sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
 						spares++;
+						md_new_event(mddev);
 					} else
 						break;
 				}
@@ -4331,9 +4402,9 @@ void md_check_recovery(mddev_t *mddev)
 					mdname(mddev));
 				/* leave the spares where they are, it shouldn't hurt */
 				mddev->recovery = 0;
-			} else {
+			} else
 				md_wakeup_thread(mddev->sync_thread);
-			}
+			md_new_event(mddev);
 		}
 	unlock:
 		mddev_unlock(mddev);
-- 
cgit v1.1


From 2d1f3b5d1b2cd11a162eb29645df749ec0036413 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:31 -0800
Subject: [PATCH] md: clean up 'page' related names in md

Substitute:

  page_cache_get -> get_page
  page_cache_release -> put_page
  PAGE_CACHE_SHIFT -> PAGE_SHIFT
  PAGE_CACHE_SIZE -> PAGE_SIZE
  PAGE_CACHE_MASK -> PAGE_MASK
  __free_page -> put_page

because we aren't using the page cache, we are just using pages.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/bitmap.c    | 44 ++++++++++++++++++++++----------------------
 drivers/md/md.c        |  2 +-
 drivers/md/raid0.c     |  2 +-
 drivers/md/raid1.c     | 10 +++++-----
 drivers/md/raid10.c    |  8 ++++----
 drivers/md/raid5.c     |  4 ++--
 drivers/md/raid6main.c |  6 +++---
 7 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index b65c36d..fc05d12 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -341,7 +341,7 @@ static int write_page(struct bitmap *bitmap, struct page *page, int wait)
 		/* add to list to be waited for by daemon */
 		struct page_list *item = mempool_alloc(bitmap->write_pool, GFP_NOIO);
 		item->page = page;
-		page_cache_get(page);
+		get_page(page);
 		spin_lock(&bitmap->write_lock);
 		list_add(&item->list, &bitmap->complete_pages);
 		spin_unlock(&bitmap->write_lock);
@@ -357,10 +357,10 @@ static struct page *read_page(struct file *file, unsigned long index,
 	struct inode *inode = file->f_mapping->host;
 	struct page *page = NULL;
 	loff_t isize = i_size_read(inode);
-	unsigned long end_index = isize >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = isize >> PAGE_SHIFT;
 
-	PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_CACHE_SIZE,
-			(unsigned long long)index << PAGE_CACHE_SHIFT);
+	PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_SIZE,
+			(unsigned long long)index << PAGE_SHIFT);
 
 	page = read_cache_page(inode->i_mapping, index,
 			(filler_t *)inode->i_mapping->a_ops->readpage, file);
@@ -368,7 +368,7 @@ static struct page *read_page(struct file *file, unsigned long index,
 		goto out;
 	wait_on_page_locked(page);
 	if (!PageUptodate(page) || PageError(page)) {
-		page_cache_release(page);
+		put_page(page);
 		page = ERR_PTR(-EIO);
 		goto out;
 	}
@@ -376,14 +376,14 @@ static struct page *read_page(struct file *file, unsigned long index,
 	if (index > end_index) /* we have read beyond EOF */
 		*bytes_read = 0;
 	else if (index == end_index) /* possible short read */
-		*bytes_read = isize & ~PAGE_CACHE_MASK;
+		*bytes_read = isize & ~PAGE_MASK;
 	else
-		*bytes_read = PAGE_CACHE_SIZE; /* got a full page */
+		*bytes_read = PAGE_SIZE; /* got a full page */
 out:
 	if (IS_ERR(page))
 		printk(KERN_ALERT "md: bitmap read error: (%dB @ %Lu): %ld\n",
-			(int)PAGE_CACHE_SIZE,
-			(unsigned long long)index << PAGE_CACHE_SHIFT,
+			(int)PAGE_SIZE,
+			(unsigned long long)index << PAGE_SHIFT,
 			PTR_ERR(page));
 	return page;
 }
@@ -558,7 +558,7 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
 		spin_unlock_irqrestore(&bitmap->lock, flags);
 		return;
 	}
-	page_cache_get(bitmap->sb_page);
+	get_page(bitmap->sb_page);
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 	sb = (bitmap_super_t *)kmap(bitmap->sb_page);
 	switch (op) {
@@ -569,7 +569,7 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
 		default: BUG();
 	}
 	kunmap(bitmap->sb_page);
-	page_cache_release(bitmap->sb_page);
+	put_page(bitmap->sb_page);
 }
 
 /*
@@ -622,12 +622,12 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
 
 	while (pages--)
 		if (map[pages]->index != 0) /* 0 is sb_page, release it below */
-			page_cache_release(map[pages]);
+			put_page(map[pages]);
 	kfree(map);
 	kfree(attr);
 
 	if (sb_page)
-		page_cache_release(sb_page);
+		put_page(sb_page);
 }
 
 static void bitmap_stop_daemon(struct bitmap *bitmap);
@@ -654,7 +654,7 @@ static void drain_write_queues(struct bitmap *bitmap)
 
 	while ((item = dequeue_page(bitmap))) {
 		/* don't bother to wait */
-		page_cache_release(item->page);
+		put_page(item->page);
 		mempool_free(item, bitmap->write_pool);
 	}
 
@@ -763,7 +763,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 
 	/* make sure the page stays cached until it gets written out */
 	if (! (get_page_attr(bitmap, page) & BITMAP_PAGE_DIRTY))
-		page_cache_get(page);
+		get_page(page);
 
  	/* set the bit */
 	kaddr = kmap_atomic(page, KM_USER0);
@@ -938,7 +938,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 				if (ret) {
 					kunmap(page);
 					/* release, page not in filemap yet */
-					page_cache_release(page);
+					put_page(page);
 					goto out;
 				}
 			}
@@ -1043,7 +1043,7 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 			/* skip this page unless it's marked as needing cleaning */
 			if (!((attr=get_page_attr(bitmap, page)) & BITMAP_PAGE_CLEAN)) {
 				if (attr & BITMAP_PAGE_NEEDWRITE) {
-					page_cache_get(page);
+					get_page(page);
 					clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
 				}
 				spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -1057,13 +1057,13 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 					default:
 						bitmap_file_kick(bitmap);
 					}
-					page_cache_release(page);
+					put_page(page);
 				}
 				continue;
 			}
 
 			/* grab the new page, sync and release the old */
-			page_cache_get(page);
+			get_page(page);
 			if (lastpage != NULL) {
 				if (get_page_attr(bitmap, lastpage) & BITMAP_PAGE_NEEDWRITE) {
 					clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
@@ -1078,7 +1078,7 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 					spin_unlock_irqrestore(&bitmap->lock, flags);
 				}
 				kunmap(lastpage);
-				page_cache_release(lastpage);
+				put_page(lastpage);
 				if (err)
 					bitmap_file_kick(bitmap);
 			} else
@@ -1133,7 +1133,7 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 			spin_unlock_irqrestore(&bitmap->lock, flags);
 		}
 
-		page_cache_release(lastpage);
+		put_page(lastpage);
 	}
 
 	return err;
@@ -1184,7 +1184,7 @@ static void bitmap_writeback_daemon(mddev_t *mddev)
 		PRINTK("finished page writeback: %p\n", page);
 
 		err = PageError(page);
-		page_cache_release(page);
+		put_page(page);
 		if (err) {
 			printk(KERN_WARNING "%s: bitmap file writeback "
 			       "failed (page %lu): %d\n",
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 6101879..c3ac67c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -339,7 +339,7 @@ static int alloc_disk_sb(mdk_rdev_t * rdev)
 static void free_disk_sb(mdk_rdev_t * rdev)
 {
 	if (rdev->sb_page) {
-		page_cache_release(rdev->sb_page);
+		put_page(rdev->sb_page);
 		rdev->sb_loaded = 0;
 		rdev->sb_page = NULL;
 		rdev->sb_offset = 0;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index fece327..a2c2e18 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -361,7 +361,7 @@ static int raid0_run (mddev_t *mddev)
 	 * chunksize should be used in that case.
 	 */
 	{
-		int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE;
+		int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE;
 		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
 			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
 	}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 6c10f28..bbe0b81 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -139,7 +139,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 out_free_pages:
 	for (i=0; i < RESYNC_PAGES ; i++)
 		for (j=0 ; j < pi->raid_disks; j++)
-			__free_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
+			put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
 	j = -1;
 out_free_bio:
 	while ( ++j < pi->raid_disks )
@@ -159,7 +159,7 @@ static void r1buf_pool_free(void *__r1_bio, void *data)
 			if (j == 0 ||
 			    r1bio->bios[j]->bi_io_vec[i].bv_page !=
 			    r1bio->bios[0]->bi_io_vec[i].bv_page)
-				__free_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
+				put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
 		}
 	for (i=0 ; i < pi->raid_disks; i++)
 		bio_put(r1bio->bios[i]);
@@ -384,7 +384,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
 			/* free extra copy of the data pages */
 			int i = bio->bi_vcnt;
 			while (i--)
-				__free_page(bio->bi_io_vec[i].bv_page);
+				put_page(bio->bi_io_vec[i].bv_page);
 		}
 		/* clear the bitmap if all writes complete successfully */
 		bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
@@ -733,7 +733,7 @@ static struct page **alloc_behind_pages(struct bio *bio)
 do_sync_io:
 	if (pages)
 		for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
-			__free_page(pages[i]);
+			put_page(pages[i]);
 	kfree(pages);
 	PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
 	return NULL;
@@ -1893,7 +1893,7 @@ out_free_conf:
 		if (conf->r1bio_pool)
 			mempool_destroy(conf->r1bio_pool);
 		kfree(conf->mirrors);
-		__free_page(conf->tmppage);
+		put_page(conf->tmppage);
 		kfree(conf->poolinfo);
 		kfree(conf);
 		mddev->private = NULL;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3f8df2e..ce729d6 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -134,10 +134,10 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 
 out_free_pages:
 	for ( ; i > 0 ; i--)
-		__free_page(bio->bi_io_vec[i-1].bv_page);
+		put_page(bio->bi_io_vec[i-1].bv_page);
 	while (j--)
 		for (i = 0; i < RESYNC_PAGES ; i++)
-			__free_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
+			put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
 	j = -1;
 out_free_bio:
 	while ( ++j < nalloc )
@@ -157,7 +157,7 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
 		struct bio *bio = r10bio->devs[j].bio;
 		if (bio) {
 			for (i = 0; i < RESYNC_PAGES; i++) {
-				__free_page(bio->bi_io_vec[i].bv_page);
+				put_page(bio->bi_io_vec[i].bv_page);
 				bio->bi_io_vec[i].bv_page = NULL;
 			}
 			bio_put(bio);
@@ -2015,7 +2015,7 @@ static int run(mddev_t *mddev)
 	 * maybe...
 	 */
 	{
-		int stripe = conf->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE;
+		int stripe = conf->raid_disks * mddev->chunk_size / PAGE_SIZE;
 		stripe /= conf->near_copies;
 		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
 			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0222ba1..ec5186f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -167,7 +167,7 @@ static void shrink_buffers(struct stripe_head *sh, int num)
 		if (!p)
 			continue;
 		sh->dev[i].page = NULL;
-		page_cache_release(p);
+		put_page(p);
 	}
 }
 
@@ -1956,7 +1956,7 @@ static int run(mddev_t *mddev)
 	 */
 	{
 		int stripe = (mddev->raid_disks-1) * mddev->chunk_size
-			/ PAGE_CACHE_SIZE;
+			/ PAGE_SIZE;
 		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
 			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
 	}
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index b5b7a8d..4062fc1 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -186,7 +186,7 @@ static void shrink_buffers(struct stripe_head *sh, int num)
 		if (!p)
 			continue;
 		sh->dev[i].page = NULL;
-		page_cache_release(p);
+		put_page(p);
 	}
 }
 
@@ -2069,7 +2069,7 @@ static int run(mddev_t *mddev)
 	 */
 	{
 		int stripe = (mddev->raid_disks-2) * mddev->chunk_size
-			/ PAGE_CACHE_SIZE;
+			/ PAGE_SIZE;
 		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
 			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
 	}
@@ -2084,7 +2084,7 @@ abort:
 	if (conf) {
 		print_raid6_conf(conf);
 		if (conf->spare_page)
-			page_cache_release(conf->spare_page);
+			put_page(conf->spare_page);
 		if (conf->stripe_hashtbl)
 			free_pages((unsigned long) conf->stripe_hashtbl,
 							HASH_PAGES_ORDER);
-- 
cgit v1.1


From 9ffae0cf3ea02f75d163922accfd3e592d87adde Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:32 -0800
Subject: [PATCH] md: convert md to use kzalloc throughout

Replace multiple kmalloc/memset pairs with kzalloc calls.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/bitmap.c    | 11 +++--------
 drivers/md/linear.c    |  3 +--
 drivers/md/md.c        | 10 +++-------
 drivers/md/multipath.c | 10 +++-------
 drivers/md/raid0.c     |  9 ++-------
 drivers/md/raid1.c     | 20 ++++++--------------
 drivers/md/raid10.c    |  6 ++----
 drivers/md/raid5.c     |  8 ++++----
 8 files changed, 24 insertions(+), 53 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index fc05d12..c3faa6a 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -887,12 +887,10 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 	if (!bitmap->filemap)
 		goto out;
 
-	bitmap->filemap_attr = kmalloc(sizeof(long) * num_pages, GFP_KERNEL);
+	bitmap->filemap_attr = kzalloc(sizeof(long) * num_pages, GFP_KERNEL);
 	if (!bitmap->filemap_attr)
 		goto out;
 
-	memset(bitmap->filemap_attr, 0, sizeof(long) * num_pages);
-
 	oldindex = ~0L;
 
 	for (i = 0; i < chunks; i++) {
@@ -1557,12 +1555,10 @@ int bitmap_create(mddev_t *mddev)
 
 	BUG_ON(file && mddev->bitmap_offset);
 
-	bitmap = kmalloc(sizeof(*bitmap), GFP_KERNEL);
+	bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
 	if (!bitmap)
 		return -ENOMEM;
 
-	memset(bitmap, 0, sizeof(*bitmap));
-
 	spin_lock_init(&bitmap->lock);
 	bitmap->mddev = mddev;
 
@@ -1603,12 +1599,11 @@ int bitmap_create(mddev_t *mddev)
 #ifdef INJECT_FATAL_FAULT_1
 	bitmap->bp = NULL;
 #else
-	bitmap->bp = kmalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL);
+	bitmap->bp = kzalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL);
 #endif
 	err = -ENOMEM;
 	if (!bitmap->bp)
 		goto error;
-	memset(bitmap->bp, 0, pages * sizeof(*bitmap->bp));
 
 	bitmap->flags |= BITMAP_ACTIVE;
 
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 946efef..f46c98d 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -121,11 +121,10 @@ static int linear_run (mddev_t *mddev)
 	sector_t curr_offset;
 	struct list_head *tmp;
 
-	conf = kmalloc (sizeof (*conf) + mddev->raid_disks*sizeof(dev_info_t),
+	conf = kzalloc (sizeof (*conf) + mddev->raid_disks*sizeof(dev_info_t),
 			GFP_KERNEL);
 	if (!conf)
 		goto out;
-	memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t));
 	mddev->private = conf;
 
 	cnt = 0;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c3ac67c..8c378b6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -228,12 +228,10 @@ static mddev_t * mddev_find(dev_t unit)
 	}
 	spin_unlock(&all_mddevs_lock);
 
-	new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL);
+	new = kzalloc(sizeof(*new), GFP_KERNEL);
 	if (!new)
 		return NULL;
 
-	memset(new, 0, sizeof(*new));
-
 	new->unit = unit;
 	if (MAJOR(unit) == MD_MAJOR)
 		new->md_minor = MINOR(unit);
@@ -1620,12 +1618,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
 	mdk_rdev_t *rdev;
 	sector_t size;
 
-	rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
+	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
 	if (!rdev) {
 		printk(KERN_ERR "md: could not alloc mem for new device!\n");
 		return ERR_PTR(-ENOMEM);
 	}
-	memset(rdev, 0, sizeof(*rdev));
 
 	if ((err = alloc_disk_sb(rdev)))
 		goto abort_free;
@@ -3505,11 +3502,10 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
 {
 	mdk_thread_t *thread;
 
-	thread = kmalloc(sizeof(mdk_thread_t), GFP_KERNEL);
+	thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
 	if (!thread)
 		return NULL;
 
-	memset(thread, 0, sizeof(mdk_thread_t));
 	init_waitqueue_head(&thread->wqueue);
 
 	thread->run = run;
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 145cdc5..97a56aa 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -41,9 +41,7 @@ static mdk_personality_t multipath_personality;
 static void *mp_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct multipath_bh *mpb;
-	mpb = kmalloc(sizeof(*mpb), gfp_flags);
-	if (mpb) 
-		memset(mpb, 0, sizeof(*mpb));
+	mpb = kzalloc(sizeof(*mpb), gfp_flags);
 	return mpb;
 }
 
@@ -444,7 +442,7 @@ static int multipath_run (mddev_t *mddev)
 	 * should be freed in multipath_stop()]
 	 */
 
-	conf = kmalloc(sizeof(multipath_conf_t), GFP_KERNEL);
+	conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL);
 	mddev->private = conf;
 	if (!conf) {
 		printk(KERN_ERR 
@@ -452,9 +450,8 @@ static int multipath_run (mddev_t *mddev)
 			mdname(mddev));
 		goto out;
 	}
-	memset(conf, 0, sizeof(*conf));
 
-	conf->multipaths = kmalloc(sizeof(struct multipath_info)*mddev->raid_disks,
+	conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks,
 				   GFP_KERNEL);
 	if (!conf->multipaths) {
 		printk(KERN_ERR 
@@ -462,7 +459,6 @@ static int multipath_run (mddev_t *mddev)
 			mdname(mddev));
 		goto out_free_conf;
 	}
-	memset(conf->multipaths, 0, sizeof(struct multipath_info)*mddev->raid_disks);
 
 	conf->working_disks = 0;
 	ITERATE_RDEV(mddev,rdev,tmp) {
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index a2c2e18..b4eaa67 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -113,21 +113,16 @@ static int create_strip_zones (mddev_t *mddev)
 	}
 	printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);
 
-	conf->strip_zone = kmalloc(sizeof(struct strip_zone)*
+	conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
 				conf->nr_strip_zones, GFP_KERNEL);
 	if (!conf->strip_zone)
 		return 1;
-	conf->devlist = kmalloc(sizeof(mdk_rdev_t*)*
+	conf->devlist = kzalloc(sizeof(mdk_rdev_t*)*
 				conf->nr_strip_zones*mddev->raid_disks,
 				GFP_KERNEL);
 	if (!conf->devlist)
 		return 1;
 
-	memset(conf->strip_zone, 0,sizeof(struct strip_zone)*
-				   conf->nr_strip_zones);
-	memset(conf->devlist, 0,
-	       sizeof(mdk_rdev_t*) * conf->nr_strip_zones * mddev->raid_disks);
-
 	/* The first zone must contain all devices, so here we check that
 	 * there is a proper alignment of slots to devices and find them all
 	 */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index bbe0b81..c42ef1c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -61,10 +61,8 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
 	int size = offsetof(r1bio_t, bios[pi->raid_disks]);
 
 	/* allocate a r1bio with room for raid_disks entries in the bios array */
-	r1_bio = kmalloc(size, gfp_flags);
-	if (r1_bio)
-		memset(r1_bio, 0, size);
-	else
+	r1_bio = kzalloc(size, gfp_flags);
+	if (!r1_bio)
 		unplug_slaves(pi->mddev);
 
 	return r1_bio;
@@ -711,13 +709,11 @@ static struct page **alloc_behind_pages(struct bio *bio)
 {
 	int i;
 	struct bio_vec *bvec;
-	struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *),
+	struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *),
 					GFP_NOIO);
 	if (unlikely(!pages))
 		goto do_sync_io;
 
-	memset(pages, 0, bio->bi_vcnt * sizeof(struct page *));
-
 	bio_for_each_segment(bvec, bio, i) {
 		pages[i] = alloc_page(GFP_NOIO);
 		if (unlikely(!pages[i]))
@@ -1770,19 +1766,16 @@ static int run(mddev_t *mddev)
 	 * bookkeeping area. [whatever we allocate in run(),
 	 * should be freed in stop()]
 	 */
-	conf = kmalloc(sizeof(conf_t), GFP_KERNEL);
+	conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
 	mddev->private = conf;
 	if (!conf)
 		goto out_no_mem;
 
-	memset(conf, 0, sizeof(*conf));
-	conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks, 
+	conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
 				 GFP_KERNEL);
 	if (!conf->mirrors)
 		goto out_no_mem;
 
-	memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks);
-
 	conf->tmppage = alloc_page(GFP_KERNEL);
 	if (!conf->tmppage)
 		goto out_no_mem;
@@ -1992,13 +1985,12 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
 		kfree(newpoolinfo);
 		return -ENOMEM;
 	}
-	newmirrors = kmalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
+	newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
 	if (!newmirrors) {
 		kfree(newpoolinfo);
 		mempool_destroy(newpool);
 		return -ENOMEM;
 	}
-	memset(newmirrors, 0, sizeof(struct mirror_info)*raid_disks);
 
 	raise_barrier(conf);
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ce729d6..254b50e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -59,10 +59,8 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 	int size = offsetof(struct r10bio_s, devs[conf->copies]);
 
 	/* allocate a r10bio with room for raid_disks entries in the bios array */
-	r10_bio = kmalloc(size, gfp_flags);
-	if (r10_bio)
-		memset(r10_bio, 0, size);
-	else
+	r10_bio = kzalloc(size, gfp_flags);
+	if (!r10_bio)
 		unplug_slaves(conf->mddev);
 
 	return r10_bio;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ec5186f..9fc5048 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1827,12 +1827,12 @@ static int run(mddev_t *mddev)
 		return -EIO;
 	}
 
-	mddev->private = kmalloc (sizeof (raid5_conf_t)
-				  + mddev->raid_disks * sizeof(struct disk_info),
-				  GFP_KERNEL);
+	mddev->private = kzalloc(sizeof (raid5_conf_t)
+				 + mddev->raid_disks * sizeof(struct disk_info),
+				 GFP_KERNEL);
 	if ((conf = mddev->private) == NULL)
 		goto abort;
-	memset (conf, 0, sizeof (*conf) + mddev->raid_disks * sizeof(struct disk_info) );
+
 	conf->mddev = mddev;
 
 	if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
-- 
cgit v1.1


From fccddba060f2b4916a30aa27acc3d03b01bb981e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:33 -0800
Subject: [PATCH] md: tidy up raid5/6 hash table code

- replace open-coded hash chain with hlist macros

- Fix hash-table size at one page - it is already quite generous, so there
  will never be a need to use multiple pages, so no need for __get_free_pages

No functional change.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid5.c         | 40 ++++++++++++++--------------------------
 drivers/md/raid6main.c     | 46 +++++++++++++++++-----------------------------
 include/linux/raid/raid5.h |  4 ++--
 3 files changed, 33 insertions(+), 57 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9fc5048..6e4db95 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -35,12 +35,10 @@
 #define STRIPE_SHIFT		(PAGE_SHIFT - 9)
 #define STRIPE_SECTORS		(STRIPE_SIZE>>9)
 #define	IO_THRESHOLD		1
-#define HASH_PAGES		1
-#define HASH_PAGES_ORDER	0
-#define NR_HASH			(HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
+#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
 #define HASH_MASK		(NR_HASH - 1)
 
-#define stripe_hash(conf, sect)	((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK])
+#define stripe_hash(conf, sect)	(&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
 
 /* bio's attached to a stripe+device for I/O are linked together in bi_sector
  * order without overlap.  There may be several bio's per stripe+device, and
@@ -113,29 +111,21 @@ static void release_stripe(struct stripe_head *sh)
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 }
 
-static void remove_hash(struct stripe_head *sh)
+static inline void remove_hash(struct stripe_head *sh)
 {
 	PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);
 
-	if (sh->hash_pprev) {
-		if (sh->hash_next)
-			sh->hash_next->hash_pprev = sh->hash_pprev;
-		*sh->hash_pprev = sh->hash_next;
-		sh->hash_pprev = NULL;
-	}
+	hlist_del_init(&sh->hash);
 }
 
-static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
+static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
 {
-	struct stripe_head **shp = &stripe_hash(conf, sh->sector);
+	struct hlist_head *hp = stripe_hash(conf, sh->sector);
 
 	PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);
 
 	CHECK_DEVLOCK();
-	if ((sh->hash_next = *shp) != NULL)
-		(*shp)->hash_pprev = &sh->hash_next;
-	*shp = sh;
-	sh->hash_pprev = shp;
+	hlist_add_head(&sh->hash, hp);
 }
 
 
@@ -228,10 +218,11 @@ static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_i
 static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
 {
 	struct stripe_head *sh;
+	struct hlist_node *hn;
 
 	CHECK_DEVLOCK();
 	PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
-	for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
+	hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
 		if (sh->sector == sector)
 			return sh;
 	PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
@@ -1835,9 +1826,8 @@ static int run(mddev_t *mddev)
 
 	conf->mddev = mddev;
 
-	if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
+	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
 		goto abort;
-	memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
 
 	spin_lock_init(&conf->device_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
@@ -1972,9 +1962,7 @@ static int run(mddev_t *mddev)
 abort:
 	if (conf) {
 		print_raid5_conf(conf);
-		if (conf->stripe_hashtbl)
-			free_pages((unsigned long) conf->stripe_hashtbl,
-							HASH_PAGES_ORDER);
+		kfree(conf->stripe_hashtbl);
 		kfree(conf);
 	}
 	mddev->private = NULL;
@@ -1991,7 +1979,7 @@ static int stop(mddev_t *mddev)
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
 	shrink_stripes(conf);
-	free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
+	kfree(conf->stripe_hashtbl);
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
 	kfree(conf);
@@ -2019,12 +2007,12 @@ static void print_sh (struct stripe_head *sh)
 static void printall (raid5_conf_t *conf)
 {
 	struct stripe_head *sh;
+	struct hlist_node *hn;
 	int i;
 
 	spin_lock_irq(&conf->device_lock);
 	for (i = 0; i < NR_HASH; i++) {
-		sh = conf->stripe_hashtbl[i];
-		for (; sh; sh = sh->hash_next) {
+		hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
 			if (sh->raid_conf != conf)
 				continue;
 			print_sh(sh);
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 4062fc1..79b5244 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -40,12 +40,10 @@
 #define STRIPE_SHIFT		(PAGE_SHIFT - 9)
 #define STRIPE_SECTORS		(STRIPE_SIZE>>9)
 #define	IO_THRESHOLD		1
-#define HASH_PAGES		1
-#define HASH_PAGES_ORDER	0
-#define NR_HASH			(HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
+#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
 #define HASH_MASK		(NR_HASH - 1)
 
-#define stripe_hash(conf, sect)	((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK])
+#define stripe_hash(conf, sect)	(&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
 
 /* bio's attached to a stripe+device for I/O are linked together in bi_sector
  * order without overlap.  There may be several bio's per stripe+device, and
@@ -132,29 +130,21 @@ static void release_stripe(struct stripe_head *sh)
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 }
 
-static void remove_hash(struct stripe_head *sh)
+static inline void remove_hash(struct stripe_head *sh)
 {
 	PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);
 
-	if (sh->hash_pprev) {
-		if (sh->hash_next)
-			sh->hash_next->hash_pprev = sh->hash_pprev;
-		*sh->hash_pprev = sh->hash_next;
-		sh->hash_pprev = NULL;
-	}
+	hlist_del_init(&sh->hash);
 }
 
-static __inline__ void insert_hash(raid6_conf_t *conf, struct stripe_head *sh)
+static inline void insert_hash(raid6_conf_t *conf, struct stripe_head *sh)
 {
-	struct stripe_head **shp = &stripe_hash(conf, sh->sector);
+	struct hlist_head *hp = stripe_hash(conf, sh->sector);
 
 	PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);
 
 	CHECK_DEVLOCK();
-	if ((sh->hash_next = *shp) != NULL)
-		(*shp)->hash_pprev = &sh->hash_next;
-	*shp = sh;
-	sh->hash_pprev = shp;
+	hlist_add_head(&sh->hash, hp);
 }
 
 
@@ -247,10 +237,11 @@ static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_i
 static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector)
 {
 	struct stripe_head *sh;
+	struct hlist_node *hn;
 
 	CHECK_DEVLOCK();
 	PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
-	for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
+	hlist_for_each_entry (sh, hn,  stripe_hash(conf, sector), hash)
 		if (sh->sector == sector)
 			return sh;
 	PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
@@ -1931,17 +1922,15 @@ static int run(mddev_t *mddev)
 		return -EIO;
 	}
 
-	mddev->private = kmalloc (sizeof (raid6_conf_t)
-				  + mddev->raid_disks * sizeof(struct disk_info),
-				  GFP_KERNEL);
+	mddev->private = kzalloc(sizeof (raid6_conf_t)
+				 + mddev->raid_disks * sizeof(struct disk_info),
+				 GFP_KERNEL);
 	if ((conf = mddev->private) == NULL)
 		goto abort;
-	memset (conf, 0, sizeof (*conf) + mddev->raid_disks * sizeof(struct disk_info) );
 	conf->mddev = mddev;
 
-	if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
+	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
 		goto abort;
-	memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
 
 	conf->spare_page = alloc_page(GFP_KERNEL);
 	if (!conf->spare_page)
@@ -2085,9 +2074,7 @@ abort:
 		print_raid6_conf(conf);
 		if (conf->spare_page)
 			put_page(conf->spare_page);
-		if (conf->stripe_hashtbl)
-			free_pages((unsigned long) conf->stripe_hashtbl,
-							HASH_PAGES_ORDER);
+		kfree(conf->stripe_hashtbl);
 		kfree(conf);
 	}
 	mddev->private = NULL;
@@ -2104,7 +2091,7 @@ static int stop (mddev_t *mddev)
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
 	shrink_stripes(conf);
-	free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
+	kfree(conf->stripe_hashtbl);
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	kfree(conf);
 	mddev->private = NULL;
@@ -2131,12 +2118,13 @@ static void print_sh (struct seq_file *seq, struct stripe_head *sh)
 static void printall (struct seq_file *seq, raid6_conf_t *conf)
 {
 	struct stripe_head *sh;
+	struct hlist_node *hn;
 	int i;
 
 	spin_lock_irq(&conf->device_lock);
 	for (i = 0; i < NR_HASH; i++) {
 		sh = conf->stripe_hashtbl[i];
-		for (; sh; sh = sh->hash_next) {
+		hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
 			if (sh->raid_conf != conf)
 				continue;
 			print_sh(seq, sh);
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 28fcd75..394da82 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -126,7 +126,7 @@
  */
 
 struct stripe_head {
-	struct stripe_head	*hash_next, **hash_pprev; /* hash pointers */
+	struct hlist_node	hash;
 	struct list_head	lru;			/* inactive_list or handle_list */
 	struct raid5_private_data	*raid_conf;
 	sector_t		sector;			/* sector of this row */
@@ -204,7 +204,7 @@ struct disk_info {
 };
 
 struct raid5_private_data {
-	struct stripe_head	**stripe_hashtbl;
+	struct hlist_head	*stripe_hashtbl;
 	mddev_t			*mddev;
 	struct disk_info	*spare;
 	int			chunk_size, level, algorithm;
-- 
cgit v1.1


From ea03aff93b4117c2ddfb91b34b44a44e19df24cd Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:34 -0800
Subject: [PATCH] md: convert various kmap calls to kmap_atomic

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/bitmap.c | 44 +++++++++++++++++++++-----------------------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index c3faa6a..519b1bf 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -406,11 +406,11 @@ int bitmap_update_sb(struct bitmap *bitmap)
 		return 0;
 	}
 	spin_unlock_irqrestore(&bitmap->lock, flags);
-	sb = (bitmap_super_t *)kmap(bitmap->sb_page);
+	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
 	sb->events = cpu_to_le64(bitmap->mddev->events);
 	if (!bitmap->mddev->degraded)
 		sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
-	kunmap(bitmap->sb_page);
+	kunmap_atomic(sb, KM_USER0);
 	return write_page(bitmap, bitmap->sb_page, 1);
 }
 
@@ -421,7 +421,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
 
 	if (!bitmap || !bitmap->sb_page)
 		return;
-	sb = (bitmap_super_t *)kmap(bitmap->sb_page);
+	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
 	printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
 	printk(KERN_DEBUG "         magic: %08x\n", le32_to_cpu(sb->magic));
 	printk(KERN_DEBUG "       version: %d\n", le32_to_cpu(sb->version));
@@ -440,7 +440,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
 	printk(KERN_DEBUG "     sync size: %llu KB\n",
 			(unsigned long long)le64_to_cpu(sb->sync_size)/2);
 	printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind));
-	kunmap(bitmap->sb_page);
+	kunmap_atomic(sb, KM_USER0);
 }
 
 /* read the superblock from the bitmap file and initialize some bitmap fields */
@@ -466,7 +466,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 		return err;
 	}
 
-	sb = (bitmap_super_t *)kmap(bitmap->sb_page);
+	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
 
 	if (bytes_read < sizeof(*sb)) { /* short read */
 		printk(KERN_INFO "%s: bitmap file superblock truncated\n",
@@ -535,7 +535,7 @@ success:
 		bitmap->events_cleared = bitmap->mddev->events;
 	err = 0;
 out:
-	kunmap(bitmap->sb_page);
+	kunmap_atomic(sb, KM_USER0);
 	if (err)
 		bitmap_print_sb(bitmap);
 	return err;
@@ -560,7 +560,7 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
 	}
 	get_page(bitmap->sb_page);
 	spin_unlock_irqrestore(&bitmap->lock, flags);
-	sb = (bitmap_super_t *)kmap(bitmap->sb_page);
+	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
 	switch (op) {
 		case MASK_SET: sb->state |= bits;
 				break;
@@ -568,7 +568,7 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
 				break;
 		default: BUG();
 	}
-	kunmap(bitmap->sb_page);
+	kunmap_atomic(sb, KM_USER0);
 	put_page(bitmap->sb_page);
 }
 
@@ -854,6 +854,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 	unsigned long bytes, offset, dummy;
 	int outofdate;
 	int ret = -ENOSPC;
+	void *paddr;
 
 	chunks = bitmap->chunks;
 	file = bitmap->file;
@@ -899,8 +900,6 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 		bit = file_page_offset(i);
 		if (index != oldindex) { /* this is a new page, read it in */
 			/* unmap the old page, we're done with it */
-			if (oldpage != NULL)
-				kunmap(oldpage);
 			if (index == 0) {
 				/*
 				 * if we're here then the superblock page
@@ -923,18 +922,18 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 
 			oldindex = index;
 			oldpage = page;
-			kmap(page);
 
 			if (outofdate) {
 				/*
 				 * if bitmap is out of date, dirty the
 			 	 * whole page and write it out
 				 */
-				memset(page_address(page) + offset, 0xff,
+				paddr = kmap_atomic(page, KM_USER0);
+				memset(paddr + offset, 0xff,
 				       PAGE_SIZE - offset);
+				kunmap_atomic(paddr, KM_USER0);
 				ret = write_page(bitmap, page, 1);
 				if (ret) {
-					kunmap(page);
 					/* release, page not in filemap yet */
 					put_page(page);
 					goto out;
@@ -943,10 +942,12 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 
 			bitmap->filemap[bitmap->file_pages++] = page;
 		}
+		paddr = kmap_atomic(page, KM_USER0);
 		if (bitmap->flags & BITMAP_HOSTENDIAN)
-			b = test_bit(bit, page_address(page));
+			b = test_bit(bit, paddr);
 		else
-			b = ext2_test_bit(bit, page_address(page));
+			b = ext2_test_bit(bit, paddr);
+		kunmap_atomic(paddr, KM_USER0);
 		if (b) {
 			/* if the disk bit is set, set the memory bit */
 			bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap),
@@ -961,9 +962,6 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 	ret = 0;
 	bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET);
 
-	if (page) /* unmap the last page */
-		kunmap(page);
-
 	if (bit_cnt) { /* Kick recovery if any bits were set */
 		set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
 		md_wakeup_thread(bitmap->mddev->thread);
@@ -1019,6 +1017,7 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 	int err = 0;
 	int blocks;
 	int attr;
+	void *paddr;
 
 	if (bitmap == NULL)
 		return 0;
@@ -1075,14 +1074,12 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 					set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 					spin_unlock_irqrestore(&bitmap->lock, flags);
 				}
-				kunmap(lastpage);
 				put_page(lastpage);
 				if (err)
 					bitmap_file_kick(bitmap);
 			} else
 				spin_unlock_irqrestore(&bitmap->lock, flags);
 			lastpage = page;
-			kmap(page);
 /*
 			printk("bitmap clean at page %lu\n", j);
 */
@@ -1105,10 +1102,12 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 						  -1);
 
 				/* clear the bit */
+				paddr = kmap_atomic(page, KM_USER0);
 				if (bitmap->flags & BITMAP_HOSTENDIAN)
-					clear_bit(file_page_offset(j), page_address(page));
+					clear_bit(file_page_offset(j), paddr);
 				else
-					ext2_clear_bit(file_page_offset(j), page_address(page));
+					ext2_clear_bit(file_page_offset(j), paddr);
+				kunmap_atomic(paddr, KM_USER0);
 			}
 		}
 		spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -1116,7 +1115,6 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 
 	/* now sync the final page */
 	if (lastpage != NULL) {
-		kunmap(lastpage);
 		spin_lock_irqsave(&bitmap->lock, flags);
 		if (get_page_attr(bitmap, lastpage) &BITMAP_PAGE_NEEDWRITE) {
 			clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
-- 
cgit v1.1


From a8745db232dd86f145bff6946e78f839acab511b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:34 -0800
Subject: [PATCH] md: convert recently exported symbol to GPL

...because that seems to be the preferred practice these days.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8c378b6..a6a066f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -478,7 +478,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 	bio_put(bio);
 	return ret;
 }
-EXPORT_SYMBOL(sync_page_io);
+EXPORT_SYMBOL_GPL(sync_page_io);
 
 static int read_disk_sb(mdk_rdev_t * rdev, int size)
 {
-- 
cgit v1.1


From a24a8dd858e0ba50f06a9fd8f61fe8c4fe7a8d8e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:35 -0800
Subject: [PATCH] md: break out of a loop that doesn't need to run to
 completion

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid10.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 254b50e..253322a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1672,8 +1672,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 				for (j=0; j<conf->copies;j++) {
 					int d = r10_bio->devs[j].devnum;
 					if (conf->mirrors[d].rdev == NULL ||
-					    test_bit(Faulty, &conf->mirrors[d].rdev->flags))
+					    test_bit(Faulty, &conf->mirrors[d].rdev->flags)) {
 						still_degraded = 1;
+						break;
+					}
 				}
 				must_sync = bitmap_start_sync(mddev->bitmap, sect,
 							      &sync_blocks, still_degraded);
-- 
cgit v1.1


From 2604b703b6b3db80e3c75ce472a54dfd0b7bf9f4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:36 -0800
Subject: [PATCH] md: remove personality numbering from md

md supports multiple different RAID levels, each being implemented by a
'personality' (which is often in a separate module).

These personalities have fairly artificial 'numbers'.  The numbers
are used to:
 1- provide an index into an array where the various personalities
    are recorded
 2- identify the module (via an alias) which implements a particular
    personality.

Neither of these uses really justifies the existence of personality numbers.
The array can be replaced by a linked list which is searched (array lookup
only happens very rarely).  Module identification can be done using an alias
based on level rather than 'personality' number.

The current 'raid5' module supports two levels (4 and 5) but only one
personality.  This slight awkwardness (which was handled in the mapping from
level to personality) can be better handled by allowing raid5 to register 2
personalities.

With this change in place, the core md module does not need to have an
exhaustive list of all possible personalities, so other personalities can be
added independently.

This patch also moves the check for chunksize being non-zero into the ->run
routines for the personalities that need it, rather than having it in core-md.
 This has a side effect of allowing 'faulty' and 'linear' not to have a
chunk-size set.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/faulty.c       |  8 +++--
 drivers/md/linear.c       | 10 +++---
 drivers/md/md.c           | 79 +++++++++++++++++------------------------------
 drivers/md/multipath.c    | 11 +++----
 drivers/md/raid0.c        | 14 ++++++---
 drivers/md/raid1.c        |  9 +++---
 drivers/md/raid10.c       | 16 +++++-----
 drivers/md/raid5.c        | 34 +++++++++++++++++---
 drivers/md/raid6main.c    | 10 +++---
 include/linux/raid/md.h   |  4 +--
 include/linux/raid/md_k.h | 63 ++++++-------------------------------
 init/do_mounts_md.c       | 22 ++++++-------
 12 files changed, 125 insertions(+), 155 deletions(-)

diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 0248f8e..f12e830 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -316,9 +316,10 @@ static int stop(mddev_t *mddev)
 	return 0;
 }
 
-static mdk_personality_t faulty_personality =
+static struct mdk_personality faulty_personality =
 {
 	.name		= "faulty",
+	.level		= LEVEL_FAULTY,
 	.owner		= THIS_MODULE,
 	.make_request	= make_request,
 	.run		= run,
@@ -329,15 +330,16 @@ static mdk_personality_t faulty_personality =
 
 static int __init raid_init(void)
 {
-	return register_md_personality(FAULTY, &faulty_personality);
+	return register_md_personality(&faulty_personality);
 }
 
 static void raid_exit(void)
 {
-	unregister_md_personality(FAULTY);
+	unregister_md_personality(&faulty_personality);
 }
 
 module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-10"); /* faulty */
+MODULE_ALIAS("md-level--5");
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index f46c98d..79dee81 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -351,9 +351,10 @@ static void linear_status (struct seq_file *seq, mddev_t *mddev)
 }
 
 
-static mdk_personality_t linear_personality=
+static struct mdk_personality linear_personality =
 {
 	.name		= "linear",
+	.level		= LEVEL_LINEAR,
 	.owner		= THIS_MODULE,
 	.make_request	= linear_make_request,
 	.run		= linear_run,
@@ -363,16 +364,17 @@ static mdk_personality_t linear_personality=
 
 static int __init linear_init (void)
 {
-	return register_md_personality (LINEAR, &linear_personality);
+	return register_md_personality (&linear_personality);
 }
 
 static void linear_exit (void)
 {
-	unregister_md_personality (LINEAR);
+	unregister_md_personality (&linear_personality);
 }
 
 
 module_init(linear_init);
 module_exit(linear_exit);
 MODULE_LICENSE("GPL");
-MODULE_ALIAS("md-personality-1"); /* LINEAR */
+MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated */
+MODULE_ALIAS("md-level--1");
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a6a066f..07f180f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -68,7 +68,7 @@
 static void autostart_arrays (int part);
 #endif
 
-static mdk_personality_t *pers[MAX_PERSONALITY];
+static LIST_HEAD(pers_list);
 static DEFINE_SPINLOCK(pers_lock);
 
 /*
@@ -303,6 +303,15 @@ static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
 	return NULL;
 }
 
+static struct mdk_personality *find_pers(int level)
+{
+	struct mdk_personality *pers;
+	list_for_each_entry(pers, &pers_list, list)
+		if (pers->level == level)
+			return pers;
+	return NULL;
+}
+
 static inline sector_t calc_dev_sboffset(struct block_device *bdev)
 {
 	sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
@@ -1744,7 +1753,7 @@ static void analyze_sbs(mddev_t * mddev)
 static ssize_t
 level_show(mddev_t *mddev, char *page)
 {
-	mdk_personality_t *p = mddev->pers;
+	struct mdk_personality *p = mddev->pers;
 	if (p == NULL && mddev->raid_disks == 0)
 		return 0;
 	if (mddev->level >= 0)
@@ -1960,11 +1969,12 @@ static int start_dirty_degraded;
 
 static int do_md_run(mddev_t * mddev)
 {
-	int pnum, err;
+	int err;
 	int chunk_size;
 	struct list_head *tmp;
 	mdk_rdev_t *rdev;
 	struct gendisk *disk;
+	struct mdk_personality *pers;
 	char b[BDEVNAME_SIZE];
 
 	if (list_empty(&mddev->disks))
@@ -1981,20 +1991,8 @@ static int do_md_run(mddev_t * mddev)
 		analyze_sbs(mddev);
 
 	chunk_size = mddev->chunk_size;
-	pnum = level_to_pers(mddev->level);
 
-	if ((pnum != MULTIPATH) && (pnum != RAID1)) {
-		if (!chunk_size) {
-			/*
-			 * 'default chunksize' in the old md code used to
-			 * be PAGE_SIZE, baaad.
-			 * we abort here to be on the safe side. We don't
-			 * want to continue the bad practice.
-			 */
-			printk(KERN_ERR 
-				"no chunksize specified, see 'man raidtab'\n");
-			return -EINVAL;
-		}
+	if (chunk_size) {
 		if (chunk_size > MAX_CHUNK_SIZE) {
 			printk(KERN_ERR "too big chunk_size: %d > %d\n",
 				chunk_size, MAX_CHUNK_SIZE);
@@ -2030,10 +2028,7 @@ static int do_md_run(mddev_t * mddev)
 	}
 
 #ifdef CONFIG_KMOD
-	if (!pers[pnum])
-	{
-		request_module("md-personality-%d", pnum);
-	}
+	request_module("md-level-%d", mddev->level);
 #endif
 
 	/*
@@ -2055,14 +2050,14 @@ static int do_md_run(mddev_t * mddev)
 		return -ENOMEM;
 
 	spin_lock(&pers_lock);
-	if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) {
+	pers = find_pers(mddev->level);
+	if (!pers || !try_module_get(pers->owner)) {
 		spin_unlock(&pers_lock);
-		printk(KERN_WARNING "md: personality %d is not loaded!\n",
-		       pnum);
+		printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
+		       mddev->level);
 		return -EINVAL;
 	}
-
-	mddev->pers = pers[pnum];
+	mddev->pers = pers;
 	spin_unlock(&pers_lock);
 
 	mddev->recovery = 0;
@@ -3701,15 +3696,14 @@ static int md_seq_show(struct seq_file *seq, void *v)
 	struct list_head *tmp2;
 	mdk_rdev_t *rdev;
 	struct mdstat_info *mi = seq->private;
-	int i;
 	struct bitmap *bitmap;
 
 	if (v == (void*)1) {
+		struct mdk_personality *pers;
 		seq_printf(seq, "Personalities : ");
 		spin_lock(&pers_lock);
-		for (i = 0; i < MAX_PERSONALITY; i++)
-			if (pers[i])
-				seq_printf(seq, "[%s] ", pers[i]->name);
+		list_for_each_entry(pers, &pers_list, list)
+			seq_printf(seq, "[%s] ", pers->name);
 
 		spin_unlock(&pers_lock);
 		seq_printf(seq, "\n");
@@ -3870,35 +3864,20 @@ static struct file_operations md_seq_fops = {
 	.poll		= mdstat_poll,
 };
 
-int register_md_personality(int pnum, mdk_personality_t *p)
+int register_md_personality(struct mdk_personality *p)
 {
-	if (pnum >= MAX_PERSONALITY) {
-		printk(KERN_ERR
-		       "md: tried to install personality %s as nr %d, but max is %lu\n",
-		       p->name, pnum, MAX_PERSONALITY-1);
-		return -EINVAL;
-	}
-
 	spin_lock(&pers_lock);
-	if (pers[pnum]) {
-		spin_unlock(&pers_lock);
-		return -EBUSY;
-	}
-
-	pers[pnum] = p;
-	printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
+	list_add_tail(&p->list, &pers_list);
+	printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
 	spin_unlock(&pers_lock);
 	return 0;
 }
 
-int unregister_md_personality(int pnum)
+int unregister_md_personality(struct mdk_personality *p)
 {
-	if (pnum >= MAX_PERSONALITY)
-		return -EINVAL;
-
-	printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
+	printk(KERN_INFO "md: %s personality unregistered\n", p->name);
 	spin_lock(&pers_lock);
-	pers[pnum] = NULL;
+	list_del_init(&p->list);
 	spin_unlock(&pers_lock);
 	return 0;
 }
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 97a56aa..d4d838e 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -35,9 +35,6 @@
 #define	NR_RESERVED_BUFS	32
 
 
-static mdk_personality_t multipath_personality;
-
-
 static void *mp_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct multipath_bh *mpb;
@@ -553,9 +550,10 @@ static int multipath_stop (mddev_t *mddev)
 	return 0;
 }
 
-static mdk_personality_t multipath_personality=
+static struct mdk_personality multipath_personality =
 {
 	.name		= "multipath",
+	.level		= LEVEL_MULTIPATH,
 	.owner		= THIS_MODULE,
 	.make_request	= multipath_make_request,
 	.run		= multipath_run,
@@ -568,15 +566,16 @@ static mdk_personality_t multipath_personality=
 
 static int __init multipath_init (void)
 {
-	return register_md_personality (MULTIPATH, &multipath_personality);
+	return register_md_personality (&multipath_personality);
 }
 
 static void __exit multipath_exit (void)
 {
-	unregister_md_personality (MULTIPATH);
+	unregister_md_personality (&multipath_personality);
 }
 
 module_init(multipath_init);
 module_exit(multipath_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
+MODULE_ALIAS("md-level--4");
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index b4eaa67..7fb69e2 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -275,7 +275,11 @@ static int raid0_run (mddev_t *mddev)
 	mdk_rdev_t *rdev;
 	struct list_head *tmp;
 
-	printk("%s: setting max_sectors to %d, segment boundary to %d\n",
+	if (mddev->chunk_size == 0) {
+		printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
+		return -EINVAL;
+	}
+	printk(KERN_INFO "%s: setting max_sectors to %d, segment boundary to %d\n",
 	       mdname(mddev),
 	       mddev->chunk_size >> 9,
 	       (mddev->chunk_size>>1)-1);
@@ -507,9 +511,10 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev)
 	return;
 }
 
-static mdk_personality_t raid0_personality=
+static struct mdk_personality raid0_personality=
 {
 	.name		= "raid0",
+	.level		= 0,
 	.owner		= THIS_MODULE,
 	.make_request	= raid0_make_request,
 	.run		= raid0_run,
@@ -519,15 +524,16 @@ static mdk_personality_t raid0_personality=
 
 static int __init raid0_init (void)
 {
-	return register_md_personality (RAID0, &raid0_personality);
+	return register_md_personality (&raid0_personality);
 }
 
 static void raid0_exit (void)
 {
-	unregister_md_personality (RAID0);
+	unregister_md_personality (&raid0_personality);
 }
 
 module_init(raid0_init);
 module_exit(raid0_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-2"); /* RAID0 */
+MODULE_ALIAS("md-level-0");
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c42ef1c..6e0f59e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -47,7 +47,6 @@
  */
 #define	NR_RAID1_BIOS 256
 
-static mdk_personality_t raid1_personality;
 
 static void unplug_slaves(mddev_t *mddev);
 
@@ -2036,9 +2035,10 @@ static void raid1_quiesce(mddev_t *mddev, int state)
 }
 
 
-static mdk_personality_t raid1_personality =
+static struct mdk_personality raid1_personality =
 {
 	.name		= "raid1",
+	.level		= 1,
 	.owner		= THIS_MODULE,
 	.make_request	= make_request,
 	.run		= run,
@@ -2056,15 +2056,16 @@ static mdk_personality_t raid1_personality =
 
 static int __init raid_init(void)
 {
-	return register_md_personality(RAID1, &raid1_personality);
+	return register_md_personality(&raid1_personality);
 }
 
 static void raid_exit(void)
 {
-	unregister_md_personality(RAID1);
+	unregister_md_personality(&raid1_personality);
 }
 
 module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-3"); /* RAID1 */
+MODULE_ALIAS("md-level-1");
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 253322a..f23d52c 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1883,11 +1883,11 @@ static int run(mddev_t *mddev)
 	int nc, fc;
 	sector_t stride, size;
 
-	if (mddev->level != 10) {
-		printk(KERN_ERR "raid10: %s: raid level not set correctly... (%d)\n",
-		       mdname(mddev), mddev->level);
-		goto out;
+	if (mddev->chunk_size == 0) {
+		printk(KERN_ERR "md/raid10: non-zero chunk size required.\n");
+		return -EINVAL;
 	}
+
 	nc = mddev->layout & 255;
 	fc = (mddev->layout >> 8) & 255;
 	if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
@@ -2072,9 +2072,10 @@ static void raid10_quiesce(mddev_t *mddev, int state)
 	}
 }
 
-static mdk_personality_t raid10_personality =
+static struct mdk_personality raid10_personality =
 {
 	.name		= "raid10",
+	.level		= 10,
 	.owner		= THIS_MODULE,
 	.make_request	= make_request,
 	.run		= run,
@@ -2090,15 +2091,16 @@ static mdk_personality_t raid10_personality =
 
 static int __init raid_init(void)
 {
-	return register_md_personality(RAID10, &raid10_personality);
+	return register_md_personality(&raid10_personality);
 }
 
 static void raid_exit(void)
 {
-	unregister_md_personality(RAID10);
+	unregister_md_personality(&raid10_personality);
 }
 
 module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-9"); /* RAID10 */
+MODULE_ALIAS("md-level-10");
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6e4db95..b0cfd3c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2187,9 +2187,10 @@ static void raid5_quiesce(mddev_t *mddev, int state)
 	}
 }
 
-static mdk_personality_t raid5_personality=
+static struct mdk_personality raid5_personality =
 {
 	.name		= "raid5",
+	.level		= 5,
 	.owner		= THIS_MODULE,
 	.make_request	= make_request,
 	.run		= run,
@@ -2204,17 +2205,40 @@ static mdk_personality_t raid5_personality=
 	.quiesce	= raid5_quiesce,
 };
 
-static int __init raid5_init (void)
+static struct mdk_personality raid4_personality =
 {
-	return register_md_personality (RAID5, &raid5_personality);
+	.name		= "raid4",
+	.level		= 4,
+	.owner		= THIS_MODULE,
+	.make_request	= make_request,
+	.run		= run,
+	.stop		= stop,
+	.status		= status,
+	.error_handler	= error,
+	.hot_add_disk	= raid5_add_disk,
+	.hot_remove_disk= raid5_remove_disk,
+	.spare_active	= raid5_spare_active,
+	.sync_request	= sync_request,
+	.resize		= raid5_resize,
+	.quiesce	= raid5_quiesce,
+};
+
+static int __init raid5_init(void)
+{
+	register_md_personality(&raid5_personality);
+	register_md_personality(&raid4_personality);
+	return 0;
 }
 
-static void raid5_exit (void)
+static void raid5_exit(void)
 {
-	unregister_md_personality (RAID5);
+	unregister_md_personality(&raid5_personality);
+	unregister_md_personality(&raid4_personality);
 }
 
 module_init(raid5_init);
 module_exit(raid5_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-4"); /* RAID5 */
+MODULE_ALIAS("md-level-5");
+MODULE_ALIAS("md-level-4");
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 79b5244..950e5fa 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -2304,9 +2304,10 @@ static void raid6_quiesce(mddev_t *mddev, int state)
 	}
 }
 
-static mdk_personality_t raid6_personality=
+static struct mdk_personality raid6_personality =
 {
 	.name		= "raid6",
+	.level		= 6,
 	.owner		= THIS_MODULE,
 	.make_request	= make_request,
 	.run		= run,
@@ -2321,7 +2322,7 @@ static mdk_personality_t raid6_personality=
 	.quiesce	= raid6_quiesce,
 };
 
-static int __init raid6_init (void)
+static int __init raid6_init(void)
 {
 	int e;
 
@@ -2329,15 +2330,16 @@ static int __init raid6_init (void)
 	if ( e )
 		return e;
 
-	return register_md_personality (RAID6, &raid6_personality);
+	return register_md_personality(&raid6_personality);
 }
 
 static void raid6_exit (void)
 {
-	unregister_md_personality (RAID6);
+	unregister_md_personality(&raid6_personality);
 }
 
 module_init(raid6_init);
 module_exit(raid6_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-8"); /* RAID6 */
+MODULE_ALIAS("md-level-6");
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index 13e7c4b..b6e0bca 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -71,8 +71,8 @@
  */
 #define MD_PATCHLEVEL_VERSION           3
 
-extern int register_md_personality (int p_num, mdk_personality_t *p);
-extern int unregister_md_personality (int p_num);
+extern int register_md_personality (struct mdk_personality *p);
+extern int unregister_md_personality (struct mdk_personality *p);
 extern mdk_thread_t * md_register_thread (void (*run) (mddev_t *mddev),
 				mddev_t *mddev, const char *name);
 extern void md_unregister_thread (mdk_thread_t *thread);
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 1dd587b..e559fb7 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -18,62 +18,19 @@
 /* and dm-bio-list.h is not under include/linux because.... ??? */
 #include "../../../drivers/md/dm-bio-list.h"
 
-#define MD_RESERVED       0UL
-#define LINEAR            1UL
-#define RAID0             2UL
-#define RAID1             3UL
-#define RAID5             4UL
-#define TRANSLUCENT       5UL
-#define HSM               6UL
-#define MULTIPATH         7UL
-#define RAID6		  8UL
-#define	RAID10		  9UL
-#define FAULTY		  10UL
-#define MAX_PERSONALITY   11UL
-
 #define	LEVEL_MULTIPATH		(-4)
 #define	LEVEL_LINEAR		(-1)
 #define	LEVEL_FAULTY		(-5)
 
+/* we need a value for 'no level specified' and 0
+ * means 'raid0', so we need something else.  This is
+ * for internal use only
+ */
+#define	LEVEL_NONE		(-1000000)
+
 #define MaxSector (~(sector_t)0)
 #define MD_THREAD_NAME_MAX 14
 
-static inline int pers_to_level (int pers)
-{
-	switch (pers) {
-		case FAULTY:		return LEVEL_FAULTY;
-		case MULTIPATH:		return LEVEL_MULTIPATH;
-		case HSM:		return -3;
-		case TRANSLUCENT:	return -2;
-		case LINEAR:		return LEVEL_LINEAR;
-		case RAID0:		return 0;
-		case RAID1:		return 1;
-		case RAID5:		return 5;
-		case RAID6:		return 6;
-		case RAID10:		return 10;
-	}
-	BUG();
-	return MD_RESERVED;
-}
-
-static inline int level_to_pers (int level)
-{
-	switch (level) {
-		case LEVEL_FAULTY: return FAULTY;
-		case LEVEL_MULTIPATH: return MULTIPATH;
-		case -3: return HSM;
-		case -2: return TRANSLUCENT;
-		case LEVEL_LINEAR: return LINEAR;
-		case 0: return RAID0;
-		case 1: return RAID1;
-		case 4:
-		case 5: return RAID5;
-		case 6: return RAID6;
-		case 10: return RAID10;
-	}
-	return MD_RESERVED;
-}
-
 typedef struct mddev_s mddev_t;
 typedef struct mdk_rdev_s mdk_rdev_t;
 
@@ -140,12 +97,10 @@ struct mdk_rdev_s
 					 */
 };
 
-typedef struct mdk_personality_s mdk_personality_t;
-
 struct mddev_s
 {
 	void				*private;
-	mdk_personality_t		*pers;
+	struct mdk_personality		*pers;
 	dev_t				unit;
 	int				md_minor;
 	struct list_head 		disks;
@@ -266,9 +221,11 @@ static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sect
         atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
 }
 
-struct mdk_personality_s
+struct mdk_personality
 {
 	char *name;
+	int level;
+	struct list_head list;
 	struct module *owner;
 	int (*make_request)(request_queue_t *q, struct bio *bio);
 	int (*run)(mddev_t *mddev);
diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c
index 3fbc355..f6f3680 100644
--- a/init/do_mounts_md.c
+++ b/init/do_mounts_md.c
@@ -17,7 +17,7 @@ static int __initdata raid_noautodetect, raid_autopart;
 static struct {
 	int minor;
 	int partitioned;
-	int pers;
+	int level;
 	int chunk;
 	char *device_names;
 } md_setup_args[MAX_MD_DEVS] __initdata;
@@ -47,7 +47,7 @@ extern int mdp_major;
  */
 static int __init md_setup(char *str)
 {
-	int minor, level, factor, fault, pers, partitioned = 0;
+	int minor, level, factor, fault, partitioned = 0;
 	char *pername = "";
 	char *str1;
 	int ent;
@@ -78,7 +78,7 @@ static int __init md_setup(char *str)
 	}
 	if (ent >= md_setup_ents)
 		md_setup_ents++;
-	switch (get_option(&str, &level)) {	/* RAID Personality */
+	switch (get_option(&str, &level)) {	/* RAID level */
 	case 2: /* could be 0 or -1.. */
 		if (level == 0 || level == LEVEL_LINEAR) {
 			if (get_option(&str, &factor) != 2 ||	/* Chunk Size */
@@ -86,16 +86,12 @@ static int __init md_setup(char *str)
 				printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
 				return 0;
 			}
-			md_setup_args[ent].pers = level;
+			md_setup_args[ent].level = level;
 			md_setup_args[ent].chunk = 1 << (factor+12);
-			if (level ==  LEVEL_LINEAR) {
-				pers = LINEAR;
+			if (level ==  LEVEL_LINEAR)
 				pername = "linear";
-			} else {
-				pers = RAID0;
+			else
 				pername = "raid0";
-			}
-			md_setup_args[ent].pers = pers;
 			break;
 		}
 		/* FALL THROUGH */
@@ -103,7 +99,7 @@ static int __init md_setup(char *str)
 		str = str1;
 		/* FALL THROUGH */
 	case 0:
-		md_setup_args[ent].pers = 0;
+		md_setup_args[ent].level = LEVEL_NONE;
 		pername="super-block";
 	}
 
@@ -190,10 +186,10 @@ static void __init md_setup_drive(void)
 			continue;
 		}
 
-		if (md_setup_args[ent].pers) {
+		if (md_setup_args[ent].level != LEVEL_NONE) {
 			/* non-persistent */
 			mdu_array_info_t ainfo;
-			ainfo.level = pers_to_level(md_setup_args[ent].pers);
+			ainfo.level = md_setup_args[ent].level;
 			ainfo.size = 0;
 			ainfo.nr_disks =0;
 			ainfo.raid_disks =0;
-- 
cgit v1.1


From 097426f689f179747f3cd6b4749eb2a6b605702d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:37 -0800
Subject: [PATCH] md: fix possible problem in raid1/raid10 error overwriting

The code to overwrite/reread for addressing read errors in raid1/raid10
currently assumes that the read will not alter the buffer which could be used
to write to the next device.  This is not a safe assumption to make.

So we split the loops into an overwrite loop and a separate re-read loop, so
that the writing is complete before reading is attempted.

Cc: Paul Clements <paul.clements@steeleye.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid1.c  | 38 ++++++++++++++++++++++++++++++--------
 drivers/md/raid10.c | 22 ++++++++++++++++++----
 2 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 6e0f59e..39c10a6 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1253,6 +1253,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 			} while (!success && d != r1_bio->read_disk);
 
 			if (success) {
+				int start = d;
 				/* write it back and re-read */
 				set_bit(R1BIO_Uptodate, &r1_bio->state);
 				while (d != r1_bio->read_disk) {
@@ -1266,14 +1267,23 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 							 sect + rdev->data_offset,
 							 s<<9,
 							 bio->bi_io_vec[idx].bv_page,
-							 WRITE) == 0 ||
-					    sync_page_io(rdev->bdev,
+							 WRITE) == 0)
+						md_error(mddev, rdev);
+				}
+				d = start;
+				while (d != r1_bio->read_disk) {
+					if (d == 0)
+						d = conf->raid_disks;
+					d--;
+					if (r1_bio->bios[d]->bi_end_io != end_sync_read)
+						continue;
+					rdev = conf->mirrors[d].rdev;
+					if (sync_page_io(rdev->bdev,
 							 sect + rdev->data_offset,
 							 s<<9,
 							 bio->bi_io_vec[idx].bv_page,
-							 READ) == 0) {
+							 READ) == 0)
 						md_error(mddev, rdev);
-					}
 				}
 			} else {
 				char b[BDEVNAME_SIZE];
@@ -1445,6 +1455,7 @@ static void raid1d(mddev_t *mddev)
 
 				if (success) {
 					/* write it back and re-read */
+					int start = d;
 					while (d != r1_bio->read_disk) {
 						if (d==0)
 							d = conf->raid_disks;
@@ -1454,13 +1465,24 @@ static void raid1d(mddev_t *mddev)
 						    test_bit(In_sync, &rdev->flags)) {
 							if (sync_page_io(rdev->bdev,
 									 sect + rdev->data_offset,
-									 s<<9, conf->tmppage, WRITE) == 0 ||
-							    sync_page_io(rdev->bdev,
+									 s<<9, conf->tmppage, WRITE) == 0)
+								/* Well, this device is dead */
+								md_error(mddev, rdev);
+						}
+					}
+					d = start;
+					while (d != r1_bio->read_disk) {
+						if (d==0)
+							d = conf->raid_disks;
+						d--;
+						rdev = conf->mirrors[d].rdev;
+						if (rdev &&
+						    test_bit(In_sync, &rdev->flags)) {
+							if (sync_page_io(rdev->bdev,
 									 sect + rdev->data_offset,
-									 s<<9, conf->tmppage, READ) == 0) {
+									 s<<9, conf->tmppage, READ) == 0)
 								/* Well, this device is dead */
 								md_error(mddev, rdev);
-							}
 						}
 					}
 				} else {
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f23d52c..9647ebb 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1421,6 +1421,7 @@ static void raid10d(mddev_t *mddev)
 				} while (!success && sl != r10_bio->read_slot);
 
 				if (success) {
+					int start = sl;
 					/* write it back and re-read */
 					while (sl != r10_bio->read_slot) {
 						int d;
@@ -1434,14 +1435,27 @@ static void raid10d(mddev_t *mddev)
 							if (sync_page_io(rdev->bdev,
 									 r10_bio->devs[sl].addr +
 									 sect + rdev->data_offset,
-									 s<<9, conf->tmppage, WRITE) == 0 ||
-							    sync_page_io(rdev->bdev,
+									 s<<9, conf->tmppage, WRITE) == 0)
+								/* Well, this device is dead */
+								md_error(mddev, rdev);
+						}
+					}
+					sl = start;
+					while (sl != r10_bio->read_slot) {
+						int d;
+						if (sl==0)
+							sl = conf->copies;
+						sl--;
+						d = r10_bio->devs[sl].devnum;
+						rdev = conf->mirrors[d].rdev;
+						if (rdev &&
+						    test_bit(In_sync, &rdev->flags)) {
+							if (sync_page_io(rdev->bdev,
 									 r10_bio->devs[sl].addr +
 									 sect + rdev->data_offset,
-									 s<<9, conf->tmppage, READ) == 0) {
+									 s<<9, conf->tmppage, READ) == 0)
 								/* Well, this device is dead */
 								md_error(mddev, rdev);
-							}
 						}
 					}
 				} else {
-- 
cgit v1.1


From 7dd5d34c6c2da04e8cd0732e08b33cd3359e4bae Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:39 -0800
Subject: [PATCH] md: remove inappropriate limits in md/bitmap configuration.

The kernel should not be imposing these policy limits: The time between
bitmap updates should certainly be allowed to be more than 15 seconds, and
if someone wants a bitmap chunk size in excess of 4MB, the kernel isn't the
place to stop them.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/bitmap.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 519b1bf..abe415f 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -485,12 +485,12 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
 		 le32_to_cpu(sb->version) > BITMAP_MAJOR_HI)
 		reason = "unrecognized superblock version";
-	else if (chunksize < 512 || chunksize > (1024 * 1024 * 4))
-		reason = "bitmap chunksize out of range (512B - 4MB)";
+	else if (chunksize < PAGE_SIZE)
+		reason = "bitmap chunksize too small";
 	else if ((1 << ffz(~chunksize)) != chunksize)
 		reason = "bitmap chunksize not a power of 2";
-	else if (daemon_sleep < 1 || daemon_sleep > 15)
-		reason = "daemon sleep period out of range (1-15s)";
+	else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ)
+		reason = "daemon sleep period out of range";
 	else if (write_behind > COUNTER_MAX)
 		reason = "write-behind limit out of range (0 - 16383)";
 	if (reason) {
-- 
cgit v1.1


From 1345b1d8adbdeceb1c871d9a4af5e2a700b341c6 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:40 -0800
Subject: [PATCH] md: define and use safe_put_page for md

md sometimes calls put_page on NULL pointers (treating it like kfree).  This is
not safe, so define and use a 'safe_put_page' which checks for NULL.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/bitmap.c       | 3 +--
 drivers/md/raid1.c        | 8 ++++----
 drivers/md/raid10.c       | 8 ++++----
 drivers/md/raid6main.c    | 3 +--
 include/linux/raid/md_k.h | 5 +++++
 5 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index abe415f..ee4a342 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -626,8 +626,7 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
 	kfree(map);
 	kfree(attr);
 
-	if (sb_page)
-		put_page(sb_page);
+	safe_put_page(sb_page);
 }
 
 static void bitmap_stop_daemon(struct bitmap *bitmap);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 39c10a6..feea4ee 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -136,7 +136,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 out_free_pages:
 	for (i=0; i < RESYNC_PAGES ; i++)
 		for (j=0 ; j < pi->raid_disks; j++)
-			put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
+			safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
 	j = -1;
 out_free_bio:
 	while ( ++j < pi->raid_disks )
@@ -156,7 +156,7 @@ static void r1buf_pool_free(void *__r1_bio, void *data)
 			if (j == 0 ||
 			    r1bio->bios[j]->bi_io_vec[i].bv_page !=
 			    r1bio->bios[0]->bi_io_vec[i].bv_page)
-				put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
+				safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
 		}
 	for (i=0 ; i < pi->raid_disks; i++)
 		bio_put(r1bio->bios[i]);
@@ -381,7 +381,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
 			/* free extra copy of the data pages */
 			int i = bio->bi_vcnt;
 			while (i--)
-				put_page(bio->bi_io_vec[i].bv_page);
+				safe_put_page(bio->bi_io_vec[i].bv_page);
 		}
 		/* clear the bitmap if all writes complete successfully */
 		bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
@@ -1907,7 +1907,7 @@ out_free_conf:
 		if (conf->r1bio_pool)
 			mempool_destroy(conf->r1bio_pool);
 		kfree(conf->mirrors);
-		put_page(conf->tmppage);
+		safe_put_page(conf->tmppage);
 		kfree(conf->poolinfo);
 		kfree(conf);
 		mddev->private = NULL;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 9647ebb..fb95200 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -132,10 +132,10 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 
 out_free_pages:
 	for ( ; i > 0 ; i--)
-		put_page(bio->bi_io_vec[i-1].bv_page);
+		safe_put_page(bio->bi_io_vec[i-1].bv_page);
 	while (j--)
 		for (i = 0; i < RESYNC_PAGES ; i++)
-			put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
+			safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
 	j = -1;
 out_free_bio:
 	while ( ++j < nalloc )
@@ -155,7 +155,7 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
 		struct bio *bio = r10bio->devs[j].bio;
 		if (bio) {
 			for (i = 0; i < RESYNC_PAGES; i++) {
-				put_page(bio->bi_io_vec[i].bv_page);
+				safe_put_page(bio->bi_io_vec[i].bv_page);
 				bio->bi_io_vec[i].bv_page = NULL;
 			}
 			bio_put(bio);
@@ -2042,7 +2042,7 @@ static int run(mddev_t *mddev)
 out_free_conf:
 	if (conf->r10bio_pool)
 		mempool_destroy(conf->r10bio_pool);
-	put_page(conf->tmppage);
+	safe_put_page(conf->tmppage);
 	kfree(conf->mirrors);
 	kfree(conf);
 	mddev->private = NULL;
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 950e5fa..06b32bd 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -2072,8 +2072,7 @@ static int run(mddev_t *mddev)
 abort:
 	if (conf) {
 		print_raid6_conf(conf);
-		if (conf->spare_page)
-			put_page(conf->spare_page);
+		safe_put_page(conf->spare_page);
 		kfree(conf->stripe_hashtbl);
 		kfree(conf);
 	}
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index e559fb7..12b3203 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -324,5 +324,10 @@ do {									\
 	__wait_event_lock_irq(wq, condition, lock, cmd);		\
 } while (0)
 
+static inline void safe_put_page(struct page *p)
+{
+	if (p) put_page(p);
+}
+
 #endif
 
-- 
cgit v1.1


From bce74dac082787375e76d2b33726b94c9701fabc Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:41 -0800
Subject: [PATCH] md: helper function to match commands written to sysfs files

Commands written to sysfs files may, or may not, be \n terminated.  We want to
accept either case.  For this we use cmd_match.

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 07f180f..c161385 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1525,6 +1525,26 @@ repeat:
 
 }
 
+/* Words written to sysfs files may, or may not, be \n terminated.
+ * We want to accept either case. For this we use cmd_match.
+ */
+static int cmd_match(const char *cmd, const char *str)
+{
+	/* See if cmd, written into a sysfs file, matches
+	 * str.  They must either be the same, or cmd can
+	 * have one trailing newline.  Returns 1 on a match, 0 otherwise.
+	 */
+	while (*cmd && *str && *cmd == *str) {
+		cmd++;
+		str++;
+	}
+	if (*cmd == '\n')
+		cmd++;
+	if (*str || *cmd)
+		return 0;
+	return 1;
+}
+
 struct rdev_sysfs_entry {
 	struct attribute attr;
 	ssize_t (*show)(mdk_rdev_t *, char *);
@@ -1799,7 +1819,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
 	if (!mddev->pers || !mddev->pers->sync_request)
 		return -EINVAL;
 
-	if (strcmp(page, "idle")==0 || strcmp(page, "idle\n")==0) {
+	if (cmd_match(page, "idle")) {
 		if (mddev->sync_thread) {
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 			md_unregister_thread(mddev->sync_thread);
@@ -1812,13 +1832,12 @@ action_store(mddev_t *mddev, const char *page, size_t len)
 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
 	    test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
 		return -EBUSY;
-	if (strcmp(page, "resync")==0 || strcmp(page, "resync\n")==0 ||
-	    strcmp(page, "recover")==0 || strcmp(page, "recover\n")==0)
+	if (cmd_match(page, "resync") || cmd_match(page, "recover"))
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	else {
-		if (strcmp(page, "check")==0 || strcmp(page, "check\n")==0)
+		if (cmd_match(page, "check"))
 			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
-		else if (strcmp(page, "repair")!=0 && strcmp(page, "repair\n")!=0)
+		else if (!cmd_match(page, "repair"))
 			return -EINVAL;
 		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
 		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
-- 
cgit v1.1


From f188593ee7af8c71755d2df269a7a5f62c4b695e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:42 -0800
Subject: [PATCH] md: fix typo in comment

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid10.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index fb95200..8e6f6df 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -712,7 +712,7 @@ static void allow_barrier(conf_t *conf)
 static void freeze_array(conf_t *conf)
 {
 	/* stop syncio and normal IO and wait for everything to
-	 * go quite.
+	 * go quiet.
 	 * We increment barrier and nr_waiting, and then
 	 * wait until barrier+nr_pending match nr_queued+2
 	 */
-- 
cgit v1.1


From 2989ddbd6e1d9638a188311b896362c4bf7b7c25 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:43 -0800
Subject: [PATCH] md: make a couple of names in md.c static

.. because they aren't used outside md.c

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c           | 4 ++--
 include/linux/raid/md_k.h | 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index c161385..3cf0893 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -144,7 +144,7 @@ static int start_readonly;
  *  start array, stop array, error, add device, remove device,
  *  start build, activate spare
  */
-DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
+static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
 static atomic_t md_event_count;
 void md_new_event(mddev_t *mddev)
 {
@@ -279,7 +279,7 @@ static inline void mddev_unlock(mddev_t * mddev)
 	md_wakeup_thread(mddev->thread);
 }
 
-mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
+static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
 {
 	mdk_rdev_t * rdev;
 	struct list_head *tmp;
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 12b3203..0fb5af6 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -263,8 +263,6 @@ static inline char * mdname (mddev_t * mddev)
 	return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
 }
 
-extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr);
-
 /*
  * iterates through some rdev ringlist. It's safe to remove the
  * current 'rdev'. Dont touch 'tmp' though.
-- 
cgit v1.1


From 07dbd37727049c0d56e313e5afcf8db07a9c3e37 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Fri, 6 Jan 2006 00:20:44 -0800
Subject: [PATCH] drivers/md/md.c: make md_new_event() static

Make the needlessly global function md_new_event() static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3cf0893..241d514 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -146,7 +146,7 @@ static int start_readonly;
  */
 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
 static atomic_t md_event_count;
-void md_new_event(mddev_t *mddev)
+static void md_new_event(mddev_t *mddev)
 {
 	atomic_inc(&md_event_count);
 	wake_up(&md_event_waiters);
-- 
cgit v1.1


From c708443c004f2310abdd7f1c353daa372b37f7a2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:45 -0800
Subject: [PATCH] md: make sure bitmap updates are visible through filesystem

When we update a page_cache page in the kernel, we need to flush_dcache_page or
userspace might not see the change.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/bitmap.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index ee4a342..76a189c 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -315,6 +315,8 @@ static int write_page(struct bitmap *bitmap, struct page *page, int wait)
 	if (bitmap->file == NULL)
 		return write_sb_page(bitmap->mddev, bitmap->offset, page, wait);
 
+	flush_dcache_page(page); /* make sure visible to anyone reading the file */
+
 	if (wait)
 		lock_page(page);
 	else {
-- 
cgit v1.1


From 03c902e17f40cfed63cd2494616f35fc9c58571b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:46 -0800
Subject: [PATCH] md: fix rdev->pending counts in raid1

When we do a user-requested check/repair, we lose count of the outstanding
requests...

Also make sure that when anything is written to md/sync_action, the
RECOVERY_NEEDED flag is set and the thread is woken up so any changes take
effect.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c    | 11 ++++-------
 drivers/md/raid1.c | 10 ++++++----
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 241d514..0b3081a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1826,13 +1826,10 @@ action_store(mddev_t *mddev, const char *page, size_t len)
 			mddev->sync_thread = NULL;
 			mddev->recovery = 0;
 		}
-		return len;
-	}
-
-	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
-	    test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
+	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
 		return -EBUSY;
-	if (cmd_match(page, "resync") || cmd_match(page, "recover"))
+	else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	else {
 		if (cmd_match(page, "check"))
@@ -1841,8 +1838,8 @@ action_store(mddev_t *mddev, const char *page, size_t len)
 			return -EINVAL;
 		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
 		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
-		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
 	return len;
 }
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index feea4ee..7d4465f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -527,7 +527,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 			/* cannot risk returning a device that failed
 			 * before we inc'ed nr_pending
 			 */
-			atomic_dec(&rdev->nr_pending);
+			rdev_dec_pending(rdev, conf->mddev);
 			goto retry;
 		}
 		conf->next_seq_sect = this_sector + sectors;
@@ -830,7 +830,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
 		    !test_bit(Faulty, &rdev->flags)) {
 			atomic_inc(&rdev->nr_pending);
 			if (test_bit(Faulty, &rdev->flags)) {
-				atomic_dec(&rdev->nr_pending);
+				rdev_dec_pending(rdev, mddev);
 				r1_bio->bios[i] = NULL;
 			} else
 				r1_bio->bios[i] = bio;
@@ -1176,6 +1176,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 			if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
 			    test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
 				r1_bio->bios[primary]->bi_end_io = NULL;
+				rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
 				break;
 			}
 		r1_bio->read_disk = primary;
@@ -1193,9 +1194,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 						break;
 				if (j >= 0)
 					mddev->resync_mismatches += r1_bio->sectors;
-				if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+				if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
 					sbio->bi_end_io = NULL;
-				else {
+					rdev_dec_pending(conf->mirrors[i].rdev, mddev);
+				} else {
 					/* fixup the bio for reuse */
 					sbio->bi_vcnt = vcnt;
 					sbio->bi_size = r1_bio->sectors << 9;
-- 
cgit v1.1


From 3b34380ae8c5df6debd85183c7fa1ac05f79b7d2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:47 -0800
Subject: [PATCH] md: allow chunk_size to be settable through sysfs

... only before array is started of course.

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt |  8 ++++++++
 drivers/md/md.c      | 26 ++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index 1dd0fb6..9710138 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -166,6 +166,14 @@ All md devices contain:
      will be empty.  If an array is being resized (not currently
      possible) this will contain the larger of the old and new sizes.
 
+  chunk_size
+     This is the size in bytes for 'chunks' and is only relevant to
+     raid levels that involve striping (0,4,5,6,10). The address space
+     of the array is conceptually divided into chunks and consecutive
+     chunks are striped onto neighbouring devices.
+     The size should be atleast PAGE_SIZE (4k) and should be a power
+     of 2.  This can only be set while assembling an array
+
 As component devices are added to an md array, they appear in the 'md'
 directory as new directories named
       dev-XXX
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0b3081a..9e57e97 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1795,6 +1795,31 @@ raid_disks_show(mddev_t *mddev, char *page)
 static struct md_sysfs_entry md_raid_disks = __ATTR_RO(raid_disks);
 
 static ssize_t
+chunk_size_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%d\n", mddev->chunk_size);
+}
+
+static ssize_t
+chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	/* can only set chunk_size if array is not yet active */
+	char *e;
+	unsigned long n = simple_strtoul(buf, &e, 10);
+
+	if (mddev->pers)
+		return -EBUSY;
+	if (!*buf || (*e && *e != '\n'))
+		return -EINVAL;
+
+	mddev->chunk_size = n;
+	return len;
+}
+static struct md_sysfs_entry md_chunk_size =
+__ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
+
+
+static ssize_t
 action_show(mddev_t *mddev, char *page)
 {
 	char *type = "idle";
@@ -1861,6 +1886,7 @@ md_mismatches = __ATTR_RO(mismatch_cnt);
 static struct attribute *md_default_attrs[] = {
 	&md_level.attr,
 	&md_raid_disks.attr,
+	&md_chunk_size.attr,
 	NULL,
 };
 
-- 
cgit v1.1


From a35b0d695d44410eb1734c9abb632725a3138628 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:49 -0800
Subject: [PATCH] md: allow md array component size to be accessed and set via
 sysfs

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt |   9 ++++
 drivers/md/md.c      | 131 ++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 98 insertions(+), 42 deletions(-)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index 9710138..0a2e10a 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -174,6 +174,15 @@ All md devices contain:
      The size should be atleast PAGE_SIZE (4k) and should be a power
      of 2.  This can only be set while assembling an array
 
+  component_size
+     For arrays with data redundancy (i.e. not raid0, linear, faulty,
+     multipath), all components must be the same size - or at least
+     there must be a size that they all provide space for.  This is a key
+     part of the geometry of the array.  It is measured in sectors
+     and can be read from here.  Writing to this value may resize
+     the array if the personality supports it (raid1, raid5, raid6),
+     and if the component drives are large enough.
+
 As component devices are added to an md array, they appear in the 'md'
 directory as new directories named
       dev-XXX
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9e57e97..d568ab4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1820,6 +1820,44 @@ __ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
 
 
 static ssize_t
+size_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
+}
+
+static int update_size(mddev_t *mddev, unsigned long size);
+
+static ssize_t
+size_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	/* If array is inactive, we can reduce the component size, but
+	 * not increase it (except from 0).
+	 * If array is active, we can try an on-line resize
+	 */
+	char *e;
+	int err = 0;
+	unsigned long long size = simple_strtoull(buf, &e, 10);
+	if (!*buf || *buf == '\n' ||
+	    (*e && *e != '\n'))
+		return -EINVAL;
+
+	if (mddev->pers) {
+		err = update_size(mddev, size);
+		md_update_sb(mddev);
+	} else {
+		if (mddev->size == 0 ||
+		    mddev->size > size)
+			mddev->size = size;
+		else
+			err = -ENOSPC;
+	}
+	return err ? err : len;
+}
+
+static struct md_sysfs_entry md_size =
+__ATTR(component_size, 0644, size_show, size_store);
+
+static ssize_t
 action_show(mddev_t *mddev, char *page)
 {
 	char *type = "idle";
@@ -1887,6 +1925,7 @@ static struct attribute *md_default_attrs[] = {
 	&md_level.attr,
 	&md_raid_disks.attr,
 	&md_chunk_size.attr,
+	&md_size.attr,
 	NULL,
 };
 
@@ -3005,6 +3044,54 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 	return 0;
 }
 
+static int update_size(mddev_t *mddev, unsigned long size)
+{
+	mdk_rdev_t * rdev;
+	int rv;
+	struct list_head *tmp;
+
+	if (mddev->pers->resize == NULL)
+		return -EINVAL;
+	/* The "size" is the amount of each device that is used.
+	 * This can only make sense for arrays with redundancy.
+	 * linear and raid0 always use whatever space is available
+	 * We can only consider changing the size if no resync
+	 * or reconstruction is happening, and if the new size
+	 * is acceptable. It must fit before the sb_offset or,
+	 * if that is <data_offset, it must fit before the
+	 * size of each device.
+	 * If size is zero, we find the largest size that fits.
+	 */
+	if (mddev->sync_thread)
+		return -EBUSY;
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		sector_t avail;
+		int fit = (size == 0);
+		if (rdev->sb_offset > rdev->data_offset)
+			avail = (rdev->sb_offset*2) - rdev->data_offset;
+		else
+			avail = get_capacity(rdev->bdev->bd_disk)
+				- rdev->data_offset;
+		if (fit && (size == 0 || size > avail/2))
+			size = avail/2;
+		if (avail < ((sector_t)size << 1))
+			return -ENOSPC;
+	}
+	rv = mddev->pers->resize(mddev, (sector_t)size *2);
+	if (!rv) {
+		struct block_device *bdev;
+
+		bdev = bdget_disk(mddev->gendisk, 0);
+		if (bdev) {
+			down(&bdev->bd_inode->i_sem);
+			i_size_write(bdev->bd_inode, mddev->array_size << 10);
+			up(&bdev->bd_inode->i_sem);
+			bdput(bdev);
+		}
+	}
+	return rv;
+}
+
 /*
  * update_array_info is used to change the configuration of an
  * on-line array.
@@ -3053,49 +3140,9 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 		else
 			return mddev->pers->reconfig(mddev, info->layout, -1);
 	}
-	if (mddev->size != info->size) {
-		mdk_rdev_t * rdev;
-		struct list_head *tmp;
-		if (mddev->pers->resize == NULL)
-			return -EINVAL;
-		/* The "size" is the amount of each device that is used.
-		 * This can only make sense for arrays with redundancy.
-		 * linear and raid0 always use whatever space is available
-		 * We can only consider changing the size if no resync
-		 * or reconstruction is happening, and if the new size
-		 * is acceptable. It must fit before the sb_offset or,
-		 * if that is <data_offset, it must fit before the
-		 * size of each device.
-		 * If size is zero, we find the largest size that fits.
-		 */
-		if (mddev->sync_thread)
-			return -EBUSY;
-		ITERATE_RDEV(mddev,rdev,tmp) {
-			sector_t avail;
-			int fit = (info->size == 0);
-			if (rdev->sb_offset > rdev->data_offset)
-				avail = (rdev->sb_offset*2) - rdev->data_offset;
-			else
-				avail = get_capacity(rdev->bdev->bd_disk)
-					- rdev->data_offset;
-			if (fit && (info->size == 0 || info->size > avail/2))
-				info->size = avail/2;
-			if (avail < ((sector_t)info->size << 1))
-				return -ENOSPC;
-		}
-		rv = mddev->pers->resize(mddev, (sector_t)info->size *2);
-		if (!rv) {
-			struct block_device *bdev;
+	if (mddev->size != info->size)
+		rv = update_size(mddev, info->size);
 
-			bdev = bdget_disk(mddev->gendisk, 0);
-			if (bdev) {
-				down(&bdev->bd_inode->i_sem);
-				i_size_write(bdev->bd_inode, mddev->array_size << 10);
-				up(&bdev->bd_inode->i_sem);
-				bdput(bdev);
-			}
-		}
-	}
 	if (mddev->raid_disks    != info->raid_disks) {
 		/* change the number of raid disks */
 		if (mddev->pers->reshape == NULL)
-- 
cgit v1.1


From 8bb93aaca2062cd54cc2c58c76ee8409cae209a7 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:50 -0800
Subject: [PATCH] md: expose md metadata format in sysfs

Allow it to be set to a particular version, or 'none'.

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt |  6 ++++++
 drivers/md/md.c      | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index 0a2e10a..c5512af 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -183,6 +183,12 @@ All md devices contain:
      the array if the personality supports it (raid1, raid5, raid6),
      and if the component drives are large enough.
 
+  metadata_version
+     This indicates the format that is being used to record metadata
+     about the array.  It can be 0.90 (traditional format), 1.0, 1.1,
+     1.2 (newer format in varying locations) or "none" indicating that
+     the kernel isn't managing metadata at all.
+
 As component devices are added to an md array, they appear in the 'md'
 directory as new directories named
       dev-XXX
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d568ab4..ecc0166 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1857,6 +1857,54 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
 static struct md_sysfs_entry md_size =
 __ATTR(component_size, 0644, size_show, size_store);
 
+
+/* Metadata version.
+ * This is either 'none' for arrays with externally managed metadata,
+ * or N.M for internally known formats
+ */
+static ssize_t
+metadata_show(mddev_t *mddev, char *page)
+{
+	if (mddev->persistent)
+		return sprintf(page, "%d.%d\n",
+			       mddev->major_version, mddev->minor_version);
+	else
+		return sprintf(page, "none\n");
+}
+
+static ssize_t
+metadata_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	int major, minor;
+	char *e;
+	if (!list_empty(&mddev->disks))
+		return -EBUSY;
+
+	if (cmd_match(buf, "none")) {
+		mddev->persistent = 0;
+		mddev->major_version = 0;
+		mddev->minor_version = 90;
+		return len;
+	}
+	major = simple_strtoul(buf, &e, 10);
+	if (e==buf || *e != '.')
+		return -EINVAL;
+	buf = e+1;
+	minor = simple_strtoul(buf, &e, 10);
+	if (e==buf || *e != '\n')
+		return -EINVAL;
+	if (major >= sizeof(super_types)/sizeof(super_types[0]) ||
+	    super_types[major].name == NULL)
+		return -ENOENT;
+	mddev->major_version = major;
+	mddev->minor_version = minor;
+	mddev->persistent = 1;
+	return len;
+}
+
+static struct md_sysfs_entry md_metadata =
+__ATTR(metadata_version, 0644, metadata_show, metadata_store);
+
 static ssize_t
 action_show(mddev_t *mddev, char *page)
 {
@@ -1926,6 +1974,7 @@ static struct attribute *md_default_attrs[] = {
 	&md_raid_disks.attr,
 	&md_chunk_size.attr,
 	&md_size.attr,
+	&md_metadata.attr,
 	NULL,
 };
 
-- 
cgit v1.1


From d9d166c2a9d5d01af34396793950aa695883eed4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:51 -0800
Subject: [PATCH] md: allow array level to be set textually via sysfs

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt      |  8 +++++++
 drivers/md/faulty.c       |  1 +
 drivers/md/linear.c       |  3 ++-
 drivers/md/md.c           | 61 +++++++++++++++++++++++++++++++++++++----------
 drivers/md/multipath.c    |  1 +
 drivers/md/raid0.c        |  1 +
 drivers/md/raid1.c        |  1 +
 drivers/md/raid10.c       |  1 +
 drivers/md/raid5.c        |  2 ++
 drivers/md/raid6main.c    |  1 +
 include/linux/raid/md_k.h |  1 +
 11 files changed, 67 insertions(+), 14 deletions(-)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index c5512af..fd43fd2 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -189,6 +189,14 @@ All md devices contain:
      1.2 (newer format in varying locations) or "none" indicating that
      the kernel isn't managing metadata at all.
 
+  level
+     The raid 'level' for this array.  The name will often (but not
+     always) be the same as the name of the module that implements the
+     level.  To be auto-loaded the module must have an alias
+        md-$LEVEL  e.g. md-raid5
+     This can be written only while the array is being assembled, not
+     after it is started.
+
 As component devices are added to an md array, they appear in the 'md'
 directory as new directories named
       dev-XXX
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index f12e830..a7a5ab5 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -342,4 +342,5 @@ module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-10"); /* faulty */
+MODULE_ALIAS("md-faulty");
 MODULE_ALIAS("md-level--5");
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 79dee81..7775854 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -376,5 +376,6 @@ static void linear_exit (void)
 module_init(linear_init);
 module_exit(linear_exit);
 MODULE_LICENSE("GPL");
-MODULE_ALIAS("md-personality-1"); /* LINEAR - degrecated*/
+MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
+MODULE_ALIAS("md-linear");
 MODULE_ALIAS("md-level--1");
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ecc0166..594d8c3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -303,12 +303,15 @@ static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
 	return NULL;
 }
 
-static struct mdk_personality *find_pers(int level)
+static struct mdk_personality *find_pers(int level, char *clevel)
 {
 	struct mdk_personality *pers;
-	list_for_each_entry(pers, &pers_list, list)
-		if (pers->level == level)
+	list_for_each_entry(pers, &pers_list, list) {
+		if (level != LEVEL_NONE && pers->level == level)
 			return pers;
+		if (strcmp(pers->name, clevel)==0)
+			return pers;
+	}
 	return NULL;
 }
 
@@ -715,6 +718,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->ctime = sb->ctime;
 		mddev->utime = sb->utime;
 		mddev->level = sb->level;
+		mddev->clevel[0] = 0;
 		mddev->layout = sb->layout;
 		mddev->raid_disks = sb->raid_disks;
 		mddev->size = sb->size;
@@ -1051,6 +1055,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
 		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
 		mddev->level = le32_to_cpu(sb->level);
+		mddev->clevel[0] = 0;
 		mddev->layout = le32_to_cpu(sb->layout);
 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
 		mddev->size = le64_to_cpu(sb->size)/2;
@@ -1774,15 +1779,36 @@ static ssize_t
 level_show(mddev_t *mddev, char *page)
 {
 	struct mdk_personality *p = mddev->pers;
-	if (p == NULL && mddev->raid_disks == 0)
-		return 0;
-	if (mddev->level >= 0)
-		return sprintf(page, "raid%d\n", mddev->level);
-	else
+	if (p)
 		return sprintf(page, "%s\n", p->name);
+	else if (mddev->clevel[0])
+		return sprintf(page, "%s\n", mddev->clevel);
+	else if (mddev->level != LEVEL_NONE)
+		return sprintf(page, "%d\n", mddev->level);
+	else
+		return 0;
+}
+
+static ssize_t
+level_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	int rv = len;
+	if (mddev->pers)
+		return -EBUSY;
+	if (len == 0)
+		return 0;
+	if (len >= sizeof(mddev->clevel))
+		return -ENOSPC;
+	strncpy(mddev->clevel, buf, len);
+	if (mddev->clevel[len-1] == '\n')
+		len--;
+	mddev->clevel[len] = 0;
+	mddev->level = LEVEL_NONE;
+	return rv;
 }
 
-static struct md_sysfs_entry md_level = __ATTR_RO(level);
+static struct md_sysfs_entry md_level =
+__ATTR(level, 0644, level_show, level_store);
 
 static ssize_t
 raid_disks_show(mddev_t *mddev, char *page)
@@ -2158,7 +2184,10 @@ static int do_md_run(mddev_t * mddev)
 	}
 
 #ifdef CONFIG_KMOD
-	request_module("md-level-%d", mddev->level);
+	if (mddev->level != LEVEL_NONE)
+		request_module("md-level-%d", mddev->level);
+	else if (mddev->clevel[0])
+		request_module("md-%s", mddev->clevel);
 #endif
 
 	/*
@@ -2180,15 +2209,21 @@ static int do_md_run(mddev_t * mddev)
 		return -ENOMEM;
 
 	spin_lock(&pers_lock);
-	pers = find_pers(mddev->level);
+	pers = find_pers(mddev->level, mddev->clevel);
 	if (!pers || !try_module_get(pers->owner)) {
 		spin_unlock(&pers_lock);
-		printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
-		       mddev->level);
+		if (mddev->level != LEVEL_NONE)
+			printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
+			       mddev->level);
+		else
+			printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
+			       mddev->clevel);
 		return -EINVAL;
 	}
 	mddev->pers = pers;
 	spin_unlock(&pers_lock);
+	mddev->level = pers->level;
+	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
 
 	mddev->recovery = 0;
 	mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index d4d838e..e6aa309 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -578,4 +578,5 @@ module_init(multipath_init);
 module_exit(multipath_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
+MODULE_ALIAS("md-multipath");
 MODULE_ALIAS("md-level--4");
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 7fb69e2..abbca15 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -536,4 +536,5 @@ module_init(raid0_init);
 module_exit(raid0_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-2"); /* RAID0 */
+MODULE_ALIAS("md-raid0");
 MODULE_ALIAS("md-level-0");
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7d4465f..181c961 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2092,4 +2092,5 @@ module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-3"); /* RAID1 */
+MODULE_ALIAS("md-raid1");
 MODULE_ALIAS("md-level-1");
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8e6f6df..201dc71 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2117,4 +2117,5 @@ module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-9"); /* RAID10 */
+MODULE_ALIAS("md-raid10");
 MODULE_ALIAS("md-level-10");
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b0cfd3c..9cc844f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2240,5 +2240,7 @@ module_init(raid5_init);
 module_exit(raid5_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-4"); /* RAID5 */
+MODULE_ALIAS("md-raid5");
+MODULE_ALIAS("md-raid4");
 MODULE_ALIAS("md-level-5");
 MODULE_ALIAS("md-level-4");
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 06b32bd..84dd875 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -2341,4 +2341,5 @@ module_init(raid6_init);
 module_exit(raid6_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-8"); /* RAID6 */
+MODULE_ALIAS("md-raid6");
 MODULE_ALIAS("md-level-6");
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 0fb5af6..6864631 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -119,6 +119,7 @@ struct mddev_s
 	int				chunk_size;
 	time_t				ctime, utime;
 	int				level, layout;
+	char				clevel[16];
 	int				raid_disks;
 	int				max_disks;
 	sector_t			size; /* used size of component devices */
-- 
cgit v1.1


From 4dbcdc751cb25ffca3a8374cbc5ab6de961cc545 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:52 -0800
Subject: [PATCH] md: count corrected read errors per drive

Store this total in superblock (as appropriate), and make it available to
userspace via sysfs.

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt      | 11 +++++++++++
 drivers/md/md.c           | 27 ++++++++++++++++++++++++++-
 drivers/md/raid1.c        |  2 ++
 drivers/md/raid10.c       | 11 ++++++++---
 drivers/md/raid5.c        |  3 +++
 drivers/md/raid6main.c    |  3 +++
 include/linux/raid/md_k.h |  4 ++++
 7 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index fd43fd2..a3eadf8 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -222,6 +222,17 @@ Each directory contains:
 			 of being recoverred to
 	This list make grow in future.
 
+      errors
+	An approximate count of read errors that have been detected on
+	this device but have not caused the device to be evicted from
+	the array (either because they were corrected or because they
+	happened while the array was read-only).  When using version-1
+	metadata, this value persists across restarts of the array.
+
+	This value can be written while assembling an array thus
+	providing an ongoing count for arrays with metadata managed by
+	userspace.
+
 
 An active md device will also contain and entry for each active device
 in the array.  These are named
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 594d8c3..32a4e23 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1000,6 +1000,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	}
 	rdev->preferred_minor = 0xffff;
 	rdev->data_offset = le64_to_cpu(sb->data_offset);
+	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
 
 	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
 	bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
@@ -1139,6 +1140,8 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	else
 		sb->resync_offset = cpu_to_le64(0);
 
+	sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors);
+
 	if (mddev->bitmap && mddev->bitmap_file == NULL) {
 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
 		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
@@ -1592,9 +1595,30 @@ super_show(mdk_rdev_t *rdev, char *page)
 }
 static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);
 
+static ssize_t
+errors_show(mdk_rdev_t *rdev, char *page)
+{
+	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
+}
+
+static ssize_t
+errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+	char *e;
+	unsigned long n = simple_strtoul(buf, &e, 10);
+	if (*buf && (*e == 0 || *e == '\n')) {
+		atomic_set(&rdev->corrected_errors, n);
+		return len;
+	}
+	return -EINVAL;
+}
+static struct rdev_sysfs_entry rdev_errors =
+__ATTR(errors, 0644, errors_show, errors_store);
+
 static struct attribute *rdev_default_attrs[] = {
 	&rdev_state.attr,
 	&rdev_super.attr,
+	&rdev_errors.attr,
 	NULL,
 };
 static ssize_t
@@ -1674,6 +1698,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
 	rdev->data_offset = 0;
 	atomic_set(&rdev->nr_pending, 0);
 	atomic_set(&rdev->read_errors, 0);
+	atomic_set(&rdev->corrected_errors, 0);
 
 	size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
 	if (!size) {
@@ -4729,7 +4754,7 @@ static int set_ro(const char *val, struct kernel_param *kp)
 	int num = simple_strtoul(val, &e, 10);
 	if (*val && (*e == '\0' || *e == '\n')) {
 		start_readonly = num;
-		return 0;;
+		return 0;
 	}
 	return -EINVAL;
 }
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 181c961..a06ff91 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1265,6 +1265,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 					if (r1_bio->bios[d]->bi_end_io != end_sync_read)
 						continue;
 					rdev = conf->mirrors[d].rdev;
+					atomic_add(s, &rdev->corrected_errors);
 					if (sync_page_io(rdev->bdev,
 							 sect + rdev->data_offset,
 							 s<<9,
@@ -1463,6 +1464,7 @@ static void raid1d(mddev_t *mddev)
 							d = conf->raid_disks;
 						d--;
 						rdev = conf->mirrors[d].rdev;
+						atomic_add(s, &rdev->corrected_errors);
 						if (rdev &&
 						    test_bit(In_sync, &rdev->flags)) {
 							if (sync_page_io(rdev->bdev,
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 201dc71..9e658e5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1122,9 +1122,13 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
 
 	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
-	else if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
-		md_error(r10_bio->mddev,
-			 conf->mirrors[d].rdev);
+	else {
+		atomic_add(r10_bio->sectors,
+			   &conf->mirrors[d].rdev->corrected_errors);
+		if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
+			md_error(r10_bio->mddev,
+				 conf->mirrors[d].rdev);
+	}
 
 	/* for reconstruct, we always reschedule after a read.
 	 * for resync, only after all reads
@@ -1430,6 +1434,7 @@ static void raid10d(mddev_t *mddev)
 						sl--;
 						d = r10_bio->devs[sl].devnum;
 						rdev = conf->mirrors[d].rdev;
+						atomic_add(s, &rdev->corrected_errors);
 						if (rdev &&
 						    test_bit(In_sync, &rdev->flags)) {
 							if (sync_page_io(rdev->bdev,
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9cc844f..54f4a98 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1400,6 +1400,9 @@ static void handle_stripe(struct stripe_head *sh)
 			bi->bi_io_vec[0].bv_offset = 0;
 			bi->bi_size = STRIPE_SIZE;
 			bi->bi_next = NULL;
+			if (rw == WRITE &&
+			    test_bit(R5_ReWrite, &sh->dev[i].flags))
+				atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
 			generic_make_request(bi);
 		} else {
 			if (rw == 1)
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 84dd875..8c823d6 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -1562,6 +1562,9 @@ static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 			bi->bi_io_vec[0].bv_offset = 0;
 			bi->bi_size = STRIPE_SIZE;
 			bi->bi_next = NULL;
+			if (rw == WRITE &&
+			    test_bit(R5_ReWrite, &sh->dev[i].flags))
+				atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
 			generic_make_request(bi);
 		} else {
 			if (rw == 1)
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 6864631..68b929c 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -95,6 +95,10 @@ struct mdk_rdev_s
 	atomic_t	read_errors;	/* number of consecutive read errors that
 					 * we have tried to ignore.
 					 */
+	atomic_t	corrected_errors; /* number of corrected read errors,
+					   * for reporting to userspace and storing
+					   * in superblock.
+					   */
 };
 
 struct mddev_s
-- 
cgit v1.1


From da943b9912df063322d37b1a1f285460531d481d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:54 -0800
Subject: [PATCH] md: allow md/raid_disks to be settable

If array is active, try to reshape, else just set the value.

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt |  3 +++
 drivers/md/md.c      | 74 ++++++++++++++++++++++++++++++++++++----------------
 2 files changed, 54 insertions(+), 23 deletions(-)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index a3eadf8..69f742d 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -165,6 +165,9 @@ All md devices contain:
      in a fully functional array.  If this is not yet known, the file
      will be empty.  If an array is being resized (not currently
      possible) this will contain the larger of the old and new sizes.
+     Some raid levels (RAID1) allow this value to be set while the
+     array is active.  This will reconfigure the array.   Otherwise
+     it can only be set while assembling an array.
 
   chunk_size
      This is the size if bytes for 'chunks' and is only relevant to
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 32a4e23..86e9f2e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1843,7 +1843,27 @@ raid_disks_show(mddev_t *mddev, char *page)
 	return sprintf(page, "%d\n", mddev->raid_disks);
 }
 
-static struct md_sysfs_entry md_raid_disks = __ATTR_RO(raid_disks);
+static int update_raid_disks(mddev_t *mddev, int raid_disks);
+
+static ssize_t
+raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	/* can only set raid_disks if array is not yet active */
+	char *e;
+	int rv = 0;
+	unsigned long n = simple_strtoul(buf, &e, 10);
+
+	if (!*buf || (*e && *e != '\n'))
+		return -EINVAL;
+
+	if (mddev->pers)
+		rv = update_raid_disks(mddev, n);
+	else
+		mddev->raid_disks = n;
+	return rv ? rv : len;
+}
+static struct md_sysfs_entry md_raid_disks =
+__ATTR(raid_disks, 0644, raid_disks_show, raid_disks_store);
 
 static ssize_t
 chunk_size_show(mddev_t *mddev, char *page)
@@ -3201,6 +3221,33 @@ static int update_size(mddev_t *mddev, unsigned long size)
 	return rv;
 }
 
+static int update_raid_disks(mddev_t *mddev, int raid_disks)
+{
+	int rv;
+	/* change the number of raid disks */
+	if (mddev->pers->reshape == NULL)
+		return -EINVAL;
+	if (raid_disks <= 0 ||
+	    raid_disks >= mddev->max_disks)
+		return -EINVAL;
+	if (mddev->sync_thread)
+		return -EBUSY;
+	rv = mddev->pers->reshape(mddev, raid_disks);
+	if (!rv) {
+		struct block_device *bdev;
+
+		bdev = bdget_disk(mddev->gendisk, 0);
+		if (bdev) {
+			down(&bdev->bd_inode->i_sem);
+			i_size_write(bdev->bd_inode, mddev->array_size << 10);
+			up(&bdev->bd_inode->i_sem);
+			bdput(bdev);
+		}
+	}
+	return rv;
+}
+
+
 /*
  * update_array_info is used to change the configuration of an
  * on-line array.
@@ -3252,28 +3299,9 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 	if (mddev->size != info->size)
 		rv = update_size(mddev, info->size);
 
-	if (mddev->raid_disks    != info->raid_disks) {
-		/* change the number of raid disks */
-		if (mddev->pers->reshape == NULL)
-			return -EINVAL;
-		if (info->raid_disks <= 0 ||
-		    info->raid_disks >= mddev->max_disks)
-			return -EINVAL;
-		if (mddev->sync_thread)
-			return -EBUSY;
-		rv = mddev->pers->reshape(mddev, info->raid_disks);
-		if (!rv) {
-			struct block_device *bdev;
-
-			bdev = bdget_disk(mddev->gendisk, 0);
-			if (bdev) {
-				down(&bdev->bd_inode->i_sem);
-				i_size_write(bdev->bd_inode, mddev->array_size << 10);
-				up(&bdev->bd_inode->i_sem);
-				bdput(bdev);
-			}
-		}
-	}
+	if (mddev->raid_disks    != info->raid_disks)
+		rv = update_raid_disks(mddev, info->raid_disks);
+
 	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
 		if (mddev->pers->quiesce == NULL)
 			return -EINVAL;
-- 
cgit v1.1


From 2bf071bf50580380a8c3afe5eef8152a66be96c7 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:55 -0800
Subject: [PATCH] md: keep better track of dev/array size when assembling md
 arrays

Move the checks - that dev size is never less than array size - into
bind_rdev_to_array to make sure it always happens properly (there is one place
where currently it doesn't).

Also reject any superblock which claims an array size larger than the device
in question can hold.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c | 41 +++++++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 86e9f2e..27a9871 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -695,6 +695,10 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 	}
 	rdev->size = calc_dev_size(rdev, sb->chunk_size);
 
+	if (rdev->size < sb->size && sb->level > 1)
+		/* "this cannot possibly happen" ... */
+		ret = -EINVAL;
+
  abort:
 	return ret;
 }
@@ -1039,6 +1043,9 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	rdev->size = le64_to_cpu(sb->data_size)/2;
 	if (le32_to_cpu(sb->chunksize))
 		rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
+
+	if (le32_to_cpu(sb->size) > rdev->size*2)
+		return -EINVAL;
 	return 0;
 }
 
@@ -1224,6 +1231,14 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 		MD_BUG();
 		return -EINVAL;
 	}
+	/* make sure rdev->size exceeds mddev->size */
+	if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
+		if (mddev->pers)
+			/* Cannot change size, so fail */
+			return -ENOSPC;
+		else
+			mddev->size = rdev->size;
+	}
 	same_pdev = match_dev_unit(mddev, rdev);
 	if (same_pdev)
 		printk(KERN_WARNING
@@ -2898,12 +2913,6 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
 			set_bit(WriteMostly, &rdev->flags);
 
-		err = bind_rdev_to_array(rdev, mddev);
-		if (err) {
-			export_rdev(rdev);
-			return err;
-		}
-
 		if (!mddev->persistent) {
 			printk(KERN_INFO "md: nonpersistent superblock ...\n");
 			rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
@@ -2911,8 +2920,11 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 			rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
 		rdev->size = calc_dev_size(rdev, mddev->chunk_size);
 
-		if (!mddev->size || (mddev->size > rdev->size))
-			mddev->size = rdev->size;
+		err = bind_rdev_to_array(rdev, mddev);
+		if (err) {
+			export_rdev(rdev);
+			return err;
+		}
 	}
 
 	return 0;
@@ -2984,15 +2996,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 	size = calc_dev_size(rdev, mddev->chunk_size);
 	rdev->size = size;
 
-	if (size < mddev->size) {
-		printk(KERN_WARNING 
-			"%s: disk size %llu blocks < array size %llu\n",
-			mdname(mddev), (unsigned long long)size,
-			(unsigned long long)mddev->size);
-		err = -ENOSPC;
-		goto abort_export;
-	}
-
 	if (test_bit(Faulty, &rdev->flags)) {
 		printk(KERN_WARNING 
 			"md: can not hot-add faulty %s disk to %s!\n",
@@ -3002,7 +3005,9 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 	}
 	clear_bit(In_sync, &rdev->flags);
 	rdev->desc_nr = -1;
-	bind_rdev_to_array(rdev, mddev);
+	err = bind_rdev_to_array(rdev, mddev);
+	if (err)
+		goto abort_export;
 
 	/*
 	 * The rest should better be atomic, we can have disk failures
-- 
cgit v1.1


From 014236d2b8ec6faea2a6134ab8e019d84d67b524 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:55 -0800
Subject: [PATCH] md: expose device slot information via sysfs

Thus the role that a device has in an array can be viewed and set.

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt |  8 ++++++++
 drivers/md/md.c      | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index 69f742d..d525fff 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -236,6 +236,14 @@ Each directory contains:
 	providing an ongoing count for arrays with metadata managed by
 	userspace.
 
+      slot
+        This gives the role that the device has in the array.  It will
+	either be 'none' if the device is not active in the array
+        (i.e. is a spare or has failed) or an integer less than the
+	'raid_disks' number for the array indicating which position
+	it currently fills.  This can only be set while assembling an
+	array.  A device for which this is set is assumed to be working.
+
 
 An active md device will also contain and entry for each active device
 in the array.  These are named
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 27a9871..a816956 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1630,10 +1630,45 @@ errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 static struct rdev_sysfs_entry rdev_errors =
 __ATTR(errors, 0644, errors_show, errors_store);
 
+static ssize_t
+slot_show(mdk_rdev_t *rdev, char *page)
+{
+	if (rdev->raid_disk < 0)
+		return sprintf(page, "none\n");
+	else
+		return sprintf(page, "%d\n", rdev->raid_disk);
+}
+
+static ssize_t
+slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+	char *e;
+	int slot = simple_strtoul(buf, &e, 10);
+	if (strncmp(buf, "none", 4)==0)
+		slot = -1;
+	else if (e==buf || (*e && *e!= '\n'))
+		return -EINVAL;
+	if (rdev->mddev->pers)
+		/* Cannot set slot in active array (yet) */
+		return -EBUSY;
+	if (slot >= rdev->mddev->raid_disks)
+		return -ENOSPC;
+	rdev->raid_disk = slot;
+	/* assume it is working */
+	rdev->flags = 0;
+	set_bit(In_sync, &rdev->flags);
+	return len;
+}
+
+
+static struct rdev_sysfs_entry rdev_slot =
+__ATTR(slot, 0644, slot_show, slot_store);
+
 static struct attribute *rdev_default_attrs[] = {
 	&rdev_state.attr,
 	&rdev_super.attr,
 	&rdev_errors.attr,
+	&rdev_slot.attr,
 	NULL,
 };
 static ssize_t
-- 
cgit v1.1


From 93c8cad03f02dbd1532a5413bdced25f000d5728 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:20:56 -0800
Subject: [PATCH] md: export rdev->data_offset via sysfs

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt |  6 ++++++
 drivers/md/md.c      | 23 +++++++++++++++++++++++
 2 files changed, 29 insertions(+)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index d525fff..866a1a8 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -244,6 +244,12 @@ Each directory contains:
 	it currently fills.  This can only be set while assembling an
 	array.  A device for which this is set is assumed to be working.
 
+      offset
+        This gives the location in the device (in sectors from the
+        start) where data from the array will be stored.  Any part of
+        the device before this offset us not touched, unless it is
+        used for storing metadata (Formats 1.1 and 1.2).
+
 
 An active md device will also contain and entry for each active device
 in the array.  These are named
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a816956..742a82a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1664,11 +1664,34 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 static struct rdev_sysfs_entry rdev_slot =
 __ATTR(slot, 0644, slot_show, slot_store);
 
+static ssize_t
+offset_show(mdk_rdev_t *rdev, char *page)
+{
+	return sprintf(page, "%llu\n", rdev->data_offset);
+}
+
+static ssize_t
+offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+	char *e;
+	unsigned long long offset = simple_strtoull(buf, &e, 10);
+	if (e==buf || (*e && *e != '\n'))
+		return -EINVAL;
+	if (rdev->mddev->pers)
+		return -EBUSY;
+	rdev->data_offset = offset;
+	return len;
+}
+
+static struct rdev_sysfs_entry rdev_offset =
+__ATTR(offset, 0644, offset_show, offset_store);
+
 static struct attribute *rdev_default_attrs[] = {
 	&rdev_state.attr,
 	&rdev_super.attr,
 	&rdev_errors.attr,
 	&rdev_slot.attr,
+	&rdev_offset.attr,
 	NULL,
 };
 static ssize_t
-- 
cgit v1.1


From 6961ece46c7d02de1bb83914900608e39633787d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 6 Jan 2006 00:20:59 -0800
Subject: [PATCH] md-export-rdev-data_offset-via-sysfs-fix

drivers/md/md.c: In function `offset_show':
drivers/md/md.c:1670: warning: long long unsigned int format, different type arg (arg 3)

Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 742a82a..27db100 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1667,7 +1667,7 @@ __ATTR(slot, 0644, slot_show, slot_store);
 static ssize_t
 offset_show(mdk_rdev_t *rdev, char *page)
 {
-	return sprintf(page, "%llu\n", rdev->data_offset);
+	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
 }
 
 static ssize_t
-- 
cgit v1.1


From 83303b613d00718b07ec0a4dee7c99aa66629d96 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:21:06 -0800
Subject: [PATCH] md: allow available size of component devices to be set via
 sysfs

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt |  7 +++++++
 drivers/md/md.c      | 25 +++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index 866a1a8..7b3d471 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -250,6 +250,13 @@ Each directory contains:
         the device before this offset us not touched, unless it is
         used for storing metadata (Formats 1.1 and 1.2).
 
+      size
+        The amount of the device, after the offset, that can be used
+        for storage of data.  This will normally be the same as the
+	component_size.  This can be written while assembling an
+        array.  If a value less than the current component_size is
+        written, component_size will be reduced to this value.
+
 
 An active md device will also contain and entry for each active device
 in the array.  These are named
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 27db100..40ac7fb 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1686,12 +1686,37 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 static struct rdev_sysfs_entry rdev_offset =
 __ATTR(offset, 0644, offset_show, offset_store);
 
+static ssize_t
+rdev_size_show(mdk_rdev_t *rdev, char *page)
+{
+	return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
+}
+
+static ssize_t
+rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+	char *e;
+	unsigned long long size = simple_strtoull(buf, &e, 10);
+	if (e==buf || (*e && *e != '\n'))
+		return -EINVAL;
+	if (rdev->mddev->pers)
+		return -EBUSY;
+	rdev->size = size;
+	if (size < rdev->mddev->size || rdev->mddev->size == 0)
+		rdev->mddev->size = size;
+	return len;
+}
+
+static struct rdev_sysfs_entry rdev_size =
+__ATTR(size, 0644, rdev_size_show, rdev_size_store);
+
 static struct attribute *rdev_default_attrs[] = {
 	&rdev_state.attr,
 	&rdev_super.attr,
 	&rdev_errors.attr,
 	&rdev_slot.attr,
 	&rdev_offset.attr,
+	&rdev_size.attr,
 	NULL,
 };
 static ssize_t
-- 
cgit v1.1


From 6d7ff7380b2e28c2807da3bf9fa614d91d15bacf Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:21:16 -0800
Subject: [PATCH] md: support adding new devices to md arrays via sysfs

Writing major:minor to md/new_dev will bind that device to the array.

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt |  8 +++++++
 drivers/md/md.c      | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index 7b3d471..b8d172b 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -200,6 +200,14 @@ All md devices contain:
      This can be written only while the array is being assembled, not
      after it is started.
 
+   new_dev
+     This file can be written but not read.  The value written should
+     be a block device number as major:minor.  e.g. 8:0
+     This will cause that device to be attached to the array, if it is
+     available.  It will then appear at md/dev-XXX (depending on the
+     name of the device) and further configuration is then possible.
+
+
 As component devices are added to an md array, they appear in the 'md'
 directory as new directories named
       dev-XXX
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 40ac7fb..825e235 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1987,6 +1987,65 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
 static struct md_sysfs_entry md_chunk_size =
 __ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
 
+static ssize_t
+null_show(mddev_t *mddev, char *page)
+{
+	return -EINVAL;
+}
+
+static ssize_t
+new_dev_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	/* buf must be %d:%d\n? giving major and minor numbers */
+	/* The new device is added to the array.
+	 * If the array has a persistent superblock, we read the
+	 * superblock to initialise info and check validity.
+	 * Otherwise, only checking done is that in bind_rdev_to_array,
+	 * which mainly checks size.
+	 */
+	char *e;
+	int major = simple_strtoul(buf, &e, 10);
+	int minor;
+	dev_t dev;
+	mdk_rdev_t *rdev;
+	int err;
+
+	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
+		return -EINVAL;
+	minor = simple_strtoul(e+1, &e, 10);
+	if (*e && *e != '\n')
+		return -EINVAL;
+	dev = MKDEV(major, minor);
+	if (major != MAJOR(dev) ||
+	    minor != MINOR(dev))
+		return -EOVERFLOW;
+
+
+	if (mddev->persistent) {
+		rdev = md_import_device(dev, mddev->major_version,
+					mddev->minor_version);
+		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
+			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
+						       mdk_rdev_t, same_set);
+			err = super_types[mddev->major_version]
+				.load_super(rdev, rdev0, mddev->minor_version);
+			if (err < 0)
+				goto out;
+		}
+	} else
+		rdev = md_import_device(dev, -1, -1);
+
+	if (IS_ERR(rdev))
+		return PTR_ERR(rdev);
+	err = bind_rdev_to_array(rdev, mddev);
+ out:
+	if (err)
+		export_rdev(rdev);
+	return err ? err : len;
+}
+
+static struct md_sysfs_entry md_new_device =
+__ATTR(new_dev, 0200, null_show, new_dev_store);
 
 static ssize_t
 size_show(mddev_t *mddev, char *page)
@@ -2144,6 +2203,7 @@ static struct attribute *md_default_attrs[] = {
 	&md_chunk_size.attr,
 	&md_size.attr,
 	&md_metadata.attr,
+	&md_new_device.attr,
 	NULL,
 };
 
-- 
cgit v1.1


From 88202a0c84e1951d6630d1d557d4801a8cc5b5ef Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Jan 2006 00:21:36 -0800
Subject: [PATCH] md: allow sync-speed to be controlled per-device

Also export current (average) speed and status in sysfs.

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/md.txt      |  22 ++++++++++
 drivers/md/md.c           | 110 +++++++++++++++++++++++++++++++++++++++++++---
 include/linux/raid/md_k.h |   4 ++
 3 files changed, 131 insertions(+), 5 deletions(-)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index b8d172b..03a13c4 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -207,6 +207,28 @@ All md devices contain:
      available.  It will then appear at md/dev-XXX (depending on the
      name of the device) and further configuration is then possible.
 
+   sync_speed_min
+   sync_speed_max
+     These are similar to /proc/sys/dev/raid/speed_limit_{min,max}
+     however they only apply to the particular array.
+     If no value has been written to these, or if the word 'system'
+     is written, then the system-wide value is used.  If a value,
+     in kibibytes-per-second is written, then it is used.
+     When the files are read, they show the currently active value
+     followed by "(local)" or "(system)" depending on whether it is
+     a locally set or system-wide value.
+
+   sync_completed
+     This shows the number of sectors that have been completed of
+     whatever the current sync_action is, followed by the number of
+     sectors in total that could need to be processed.  The two
+     numbers are separated by a '/'  thus effectively showing one
+     value, a fraction of the process that is complete.
+
+   sync_speed
+     This shows the current actual speed, in K/sec, of the current
+     sync_action.  It is averaged over the last 30 seconds.
+
 
 As component devices are added to an md array, they appear in the 'md'
 directory as new directories named
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 825e235..1b76fb2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -81,10 +81,22 @@ static DEFINE_SPINLOCK(pers_lock);
  * idle IO detection.
  *
  * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+ * or /sys/block/mdX/md/sync_speed_{min,max}
  */
 
 static int sysctl_speed_limit_min = 1000;
 static int sysctl_speed_limit_max = 200000;
+static inline int speed_min(mddev_t *mddev)
+{
+	return mddev->sync_speed_min ?
+		mddev->sync_speed_min : sysctl_speed_limit_min;
+}
+
+static inline int speed_max(mddev_t *mddev)
+{
+	return mddev->sync_speed_max ?
+		mddev->sync_speed_max : sysctl_speed_limit_max;
+}
 
 static struct ctl_table_header *raid_table_header;
 
@@ -2197,6 +2209,90 @@ md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
 static struct md_sysfs_entry
 md_mismatches = __ATTR_RO(mismatch_cnt);
 
+static ssize_t
+sync_min_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%d (%s)\n", speed_min(mddev),
+		       mddev->sync_speed_min ? "local": "system");
+}
+
+static ssize_t
+sync_min_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	int min;
+	char *e;
+	if (strncmp(buf, "system", 6)==0) {
+		mddev->sync_speed_min = 0;
+		return len;
+	}
+	min = simple_strtoul(buf, &e, 10);
+	if (buf == e || (*e && *e != '\n') || min <= 0)
+		return -EINVAL;
+	mddev->sync_speed_min = min;
+	return len;
+}
+
+static struct md_sysfs_entry md_sync_min =
+__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
+
+static ssize_t
+sync_max_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%d (%s)\n", speed_max(mddev),
+		       mddev->sync_speed_max ? "local": "system");
+}
+
+static ssize_t
+sync_max_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	int max;
+	char *e;
+	if (strncmp(buf, "system", 6)==0) {
+		mddev->sync_speed_max = 0;
+		return len;
+	}
+	max = simple_strtoul(buf, &e, 10);
+	if (buf == e || (*e && *e != '\n') || max <= 0)
+		return -EINVAL;
+	mddev->sync_speed_max = max;
+	return len;
+}
+
+static struct md_sysfs_entry md_sync_max =
+__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
+
+
+static ssize_t
+sync_speed_show(mddev_t *mddev, char *page)
+{
+	unsigned long resync, dt, db;
+	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
+	dt = ((jiffies - mddev->resync_mark) / HZ);
+	if (!dt) dt++;
+	db = resync - (mddev->resync_mark_cnt);
+	return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
+}
+
+static struct md_sysfs_entry
+md_sync_speed = __ATTR_RO(sync_speed);
+
+static ssize_t
+sync_completed_show(mddev_t *mddev, char *page)
+{
+	unsigned long max_blocks, resync;
+
+	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+		max_blocks = mddev->resync_max_sectors;
+	else
+		max_blocks = mddev->size << 1;
+
+	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
+	return sprintf(page, "%lu / %lu\n", resync, max_blocks);
+}
+
+static struct md_sysfs_entry
+md_sync_completed = __ATTR_RO(sync_completed);
+
 static struct attribute *md_default_attrs[] = {
 	&md_level.attr,
 	&md_raid_disks.attr,
@@ -2210,6 +2306,10 @@ static struct attribute *md_default_attrs[] = {
 static struct attribute *md_redundancy_attrs[] = {
 	&md_scan_mode.attr,
 	&md_mismatches.attr,
+	&md_sync_min.attr,
+	&md_sync_max.attr,
+	&md_sync_speed.attr,
+	&md_sync_completed.attr,
 	NULL,
 };
 static struct attribute_group md_redundancy_group = {
@@ -4433,10 +4533,10 @@ static void md_do_sync(mddev_t *mddev)
 
 	printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
 	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
-		" %d KB/sec/disc.\n", sysctl_speed_limit_min);
+		" %d KB/sec/disc.\n", speed_min(mddev));
 	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
 	       "(but not more than %d KB/sec) for reconstruction.\n",
-	       sysctl_speed_limit_max);
+	       speed_max(mddev));
 
 	is_mddev_idle(mddev); /* this also initializes IO event counters */
 	/* we don't use the checkpoint if there's a bitmap */
@@ -4477,7 +4577,7 @@ static void md_do_sync(mddev_t *mddev)
 
 		skipped = 0;
 		sectors = mddev->pers->sync_request(mddev, j, &skipped,
-					    currspeed < sysctl_speed_limit_min);
+					    currspeed < speed_min(mddev));
 		if (sectors == 0) {
 			set_bit(MD_RECOVERY_ERR, &mddev->recovery);
 			goto out;
@@ -4542,8 +4642,8 @@ static void md_do_sync(mddev_t *mddev)
 		currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
 			/((jiffies-mddev->resync_mark)/HZ +1) +1;
 
-		if (currspeed > sysctl_speed_limit_min) {
-			if ((currspeed > sysctl_speed_limit_max) ||
+		if (currspeed > speed_min(mddev)) {
+			if ((currspeed > speed_max(mddev)) ||
 					!is_mddev_idle(mddev)) {
 				msleep(500);
 				goto repeat;
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 68b929c..617b950 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -143,6 +143,10 @@ struct mddev_s
 	sector_t			resync_mismatches; /* count of sectors where
 							    * parity/replica mismatch found
 							    */
+	/* if zero, use the system-wide default */
+	int				sync_speed_min;
+	int				sync_speed_max;
+
 	int				ok_start_degraded;
 	/* recovery/resync flags 
 	 * NEEDED:   we might need to start a resync/recover
-- 
cgit v1.1


From 9b847548663ef1039dd49f0eb4463d001e596bc3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Fri, 6 Jan 2006 09:28:07 +0100
Subject: [PATCH] Suspend support for libata

This patch adds suspend support to libata, and ata_piix in particular.
Most low level drivers should just need to add the 4 hooks to work. As
I can only test ata_piix, I didn't enable it for any other drivers
though.

Suspend support is the single most important feature on a notebook, and
most new notebooks have sata drives. It's quite embarrassing that we
_still_ do not support this. Right now, it's perfectly possible to
suspend the drive in mid-transfer.

Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/scsi/ata_piix.c    |   4 ++
 drivers/scsi/libata-core.c | 114 +++++++++++++++++++++++++++++++++++++++++++++
 drivers/scsi/libata-scsi.c |  16 +++++++
 drivers/scsi/scsi_sysfs.c  |  31 ++++++++++++
 include/linux/ata.h        |   2 +
 include/linux/libata.h     |   8 ++++
 include/scsi/scsi_host.h   |   6 +++
 7 files changed, 181 insertions(+)

diff --git a/drivers/scsi/ata_piix.c b/drivers/scsi/ata_piix.c
index 0ea2787..f796303 100644
--- a/drivers/scsi/ata_piix.c
+++ b/drivers/scsi/ata_piix.c
@@ -166,6 +166,8 @@ static struct pci_driver piix_pci_driver = {
 	.id_table		= piix_pci_tbl,
 	.probe			= piix_init_one,
 	.remove			= ata_pci_remove_one,
+	.suspend		= ata_pci_device_suspend,
+	.resume			= ata_pci_device_resume,
 };
 
 static struct scsi_host_template piix_sht = {
@@ -186,6 +188,8 @@ static struct scsi_host_template piix_sht = {
 	.slave_configure	= ata_scsi_slave_config,
 	.bios_param		= ata_std_bios_param,
 	.ordered_flush		= 1,
+	.resume			= ata_scsi_device_resume,
+	.suspend		= ata_scsi_device_suspend,
 };
 
 static const struct ata_port_operations piix_pata_ops = {
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 9ea1025..9c66d40 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -4154,6 +4154,96 @@ err_out:
  *	Inherited from caller.
  */
 
+/*
+ * Execute a 'simple' command, that only consists of the opcode 'cmd' itself,
+ * without filling any other registers
+ */
+static int ata_do_simple_cmd(struct ata_port *ap, struct ata_device *dev,
+			     u8 cmd)
+{
+	struct ata_taskfile tf;
+	int err;
+
+	ata_tf_init(ap, &tf, dev->devno);
+
+	tf.command = cmd;
+	tf.flags |= ATA_TFLAG_DEVICE;
+	tf.protocol = ATA_PROT_NODATA;
+
+	err = ata_exec_internal(ap, dev, &tf, DMA_NONE, NULL, 0);
+	if (err)
+		printk(KERN_ERR "%s: ata command failed: %d\n",
+				__FUNCTION__, err);
+
+	return err;
+}
+
+static int ata_flush_cache(struct ata_port *ap, struct ata_device *dev)
+{
+	u8 cmd;
+
+	if (!ata_try_flush_cache(dev))
+		return 0;
+
+	if (ata_id_has_flush_ext(dev->id))
+		cmd = ATA_CMD_FLUSH_EXT;
+	else
+		cmd = ATA_CMD_FLUSH;
+
+	return ata_do_simple_cmd(ap, dev, cmd);
+}
+
+static int ata_standby_drive(struct ata_port *ap, struct ata_device *dev)
+{
+	return ata_do_simple_cmd(ap, dev, ATA_CMD_STANDBYNOW1);
+}
+
+static int ata_start_drive(struct ata_port *ap, struct ata_device *dev)
+{
+	return ata_do_simple_cmd(ap, dev, ATA_CMD_IDLEIMMEDIATE);
+}
+
+/**
+ *	ata_device_resume - wake up a previously suspended device
+ *
+ *	Kick the drive back into action, by sending it an idle immediate
+ *	command and making sure its transfer mode matches between drive
+ *	and host.
+ *
+ */
+int ata_device_resume(struct ata_port *ap, struct ata_device *dev)
+{
+	if (ap->flags & ATA_FLAG_SUSPENDED) {
+		ap->flags &= ~ATA_FLAG_SUSPENDED;
+		ata_set_mode(ap);
+	}
+	if (!ata_dev_present(dev))
+		return 0;
+	if (dev->class == ATA_DEV_ATA)
+		ata_start_drive(ap, dev);
+
+	return 0;
+}
+
+/**
+ *	ata_device_suspend - prepare a device for suspend
+ *
+ *	Flush the cache on the drive, if appropriate, then issue a
+ *	standbynow command.
+ *
+ */
+int ata_device_suspend(struct ata_port *ap, struct ata_device *dev)
+{
+	if (!ata_dev_present(dev))
+		return 0;
+	if (dev->class == ATA_DEV_ATA)
+		ata_flush_cache(ap, dev);
+
+	ata_standby_drive(ap, dev);
+	ap->flags |= ATA_FLAG_SUSPENDED;
+	return 0;
+}
+
 int ata_port_start (struct ata_port *ap)
 {
 	struct device *dev = ap->host_set->dev;
@@ -4902,6 +4992,23 @@ int pci_test_config_bits(struct pci_dev *pdev, const struct pci_bits *bits)
 
 	return (tmp == bits->val) ? 1 : 0;
 }
+
+int ata_pci_device_suspend(struct pci_dev *pdev, pm_message_t state)
+{
+	pci_save_state(pdev);
+	pci_disable_device(pdev);
+	pci_set_power_state(pdev, PCI_D3hot);
+	return 0;
+}
+
+int ata_pci_device_resume(struct pci_dev *pdev)
+{
+	pci_set_power_state(pdev, PCI_D0);
+	pci_restore_state(pdev);
+	pci_enable_device(pdev);
+	pci_set_master(pdev);
+	return 0;
+}
 #endif /* CONFIG_PCI */
 
 
@@ -5005,4 +5112,11 @@ EXPORT_SYMBOL_GPL(ata_pci_host_stop);
 EXPORT_SYMBOL_GPL(ata_pci_init_native_mode);
 EXPORT_SYMBOL_GPL(ata_pci_init_one);
 EXPORT_SYMBOL_GPL(ata_pci_remove_one);
+EXPORT_SYMBOL_GPL(ata_pci_device_suspend);
+EXPORT_SYMBOL_GPL(ata_pci_device_resume);
 #endif /* CONFIG_PCI */
+
+EXPORT_SYMBOL_GPL(ata_device_suspend);
+EXPORT_SYMBOL_GPL(ata_device_resume);
+EXPORT_SYMBOL_GPL(ata_scsi_device_suspend);
+EXPORT_SYMBOL_GPL(ata_scsi_device_resume);
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index e0439be..c1ebede 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -396,6 +396,22 @@ void ata_dump_status(unsigned id, struct ata_taskfile *tf)
 	}
 }
 
+int ata_scsi_device_resume(struct scsi_device *sdev)
+{
+	struct ata_port *ap = (struct ata_port *) &sdev->host->hostdata[0];
+	struct ata_device *dev = &ap->device[sdev->id];
+
+	return ata_device_resume(ap, dev);
+}
+
+int ata_scsi_device_suspend(struct scsi_device *sdev)
+{
+	struct ata_port *ap = (struct ata_port *) &sdev->host->hostdata[0];
+	struct ata_device *dev = &ap->device[sdev->id];
+
+	return ata_device_suspend(ap, dev);
+}
+
 /**
  *	ata_to_sense_error - convert ATA error to SCSI error
  *	@id: ATA device number
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 15842b1..ea7f3a4 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -263,9 +263,40 @@ static int scsi_bus_match(struct device *dev, struct device_driver *gendrv)
 	return (sdp->inq_periph_qual == SCSI_INQ_PQ_CON)? 1: 0;
 }
 
+static int scsi_bus_suspend(struct device * dev, pm_message_t state)
+{
+	struct scsi_device *sdev = to_scsi_device(dev);
+	struct scsi_host_template *sht = sdev->host->hostt;
+	int err;
+
+	err = scsi_device_quiesce(sdev);
+	if (err)
+		return err;
+
+	if (sht->suspend)
+		err = sht->suspend(sdev);
+
+	return err;
+}
+
+static int scsi_bus_resume(struct device * dev)
+{
+	struct scsi_device *sdev = to_scsi_device(dev);
+	struct scsi_host_template *sht = sdev->host->hostt;
+	int err = 0;
+
+	if (sht->resume)
+		err = sht->resume(sdev);
+
+	scsi_device_resume(sdev);
+	return err;
+}
+
 struct bus_type scsi_bus_type = {
         .name		= "scsi",
         .match		= scsi_bus_match,
+	.suspend	= scsi_bus_suspend,
+	.resume		= scsi_bus_resume,
 };
 
 int scsi_sysfs_register(void)
diff --git a/include/linux/ata.h b/include/linux/ata.h
index d2873b7..3eb80c3 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -141,6 +141,8 @@ enum {
 	ATA_CMD_PACKET		= 0xA0,
 	ATA_CMD_VERIFY		= 0x40,
 	ATA_CMD_VERIFY_EXT	= 0x42,
+ 	ATA_CMD_STANDBYNOW1	= 0xE0,
+ 	ATA_CMD_IDLEIMMEDIATE	= 0xE1,
 	ATA_CMD_INIT_DEV_PARAMS	= 0x91,
 
 	/* SETFEATURES stuff */
diff --git a/include/linux/libata.h b/include/linux/libata.h
index e828e17..cdab75c 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -124,6 +124,8 @@ enum {
 	ATA_FLAG_DEBUGMSG	= (1 << 10),
 	ATA_FLAG_NO_ATAPI	= (1 << 11), /* No ATAPI support */
 
+	ATA_FLAG_SUSPENDED	= (1 << 12), /* port is suspended */
+
 	ATA_QCFLAG_ACTIVE	= (1 << 1), /* cmd not yet ack'd to scsi lyer */
 	ATA_QCFLAG_SG		= (1 << 3), /* have s/g table? */
 	ATA_QCFLAG_SINGLE	= (1 << 4), /* no s/g, just a single buffer */
@@ -436,6 +438,8 @@ extern void ata_std_ports(struct ata_ioports *ioaddr);
 extern int ata_pci_init_one (struct pci_dev *pdev, struct ata_port_info **port_info,
 			     unsigned int n_ports);
 extern void ata_pci_remove_one (struct pci_dev *pdev);
+extern int ata_pci_device_suspend(struct pci_dev *pdev, pm_message_t state);
+extern int ata_pci_device_resume(struct pci_dev *pdev);
 #endif /* CONFIG_PCI */
 extern int ata_device_add(const struct ata_probe_ent *ent);
 extern void ata_host_set_remove(struct ata_host_set *host_set);
@@ -445,6 +449,10 @@ extern int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmn
 extern int ata_scsi_error(struct Scsi_Host *host);
 extern int ata_scsi_release(struct Scsi_Host *host);
 extern unsigned int ata_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc);
+extern int ata_scsi_device_resume(struct scsi_device *);
+extern int ata_scsi_device_suspend(struct scsi_device *);
+extern int ata_device_resume(struct ata_port *, struct ata_device *);
+extern int ata_device_suspend(struct ata_port *, struct ata_device *);
 extern int ata_ratelimit(void);
 
 /*
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 6cbb198..6297885 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -296,6 +296,12 @@ struct scsi_host_template {
 	int (*proc_info)(struct Scsi_Host *, char *, char **, off_t, int, int);
 
 	/*
+	 * suspend support
+	 */
+	int (*resume)(struct scsi_device *);
+	int (*suspend)(struct scsi_device *);
+
+	/*
 	 * Name of proc directory
 	 */
 	char *proc_name;
-- 
cgit v1.1


From 7ed40918a386afc2e14a6d3da563ea6d13686c25 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Fri, 6 Jan 2006 08:43:16 -0800
Subject: x86: remove bogus 'pci=usepirqmask' suggestion when no irq is defined

This was harmless, but for the case of a device that had no irq
pre-defined we would incorrectly suggest that "usepirqmask" might make a
difference.  It never would, and the message was just confusing people.

Reported in the dmesg of Etienne Lorrain.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/pci/irq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/i386/pci/irq.c b/arch/i386/pci/irq.c
index 19e6f48..ee8e016 100644
--- a/arch/i386/pci/irq.c
+++ b/arch/i386/pci/irq.c
@@ -846,7 +846,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
 	 * reported by the device if possible.
 	 */
 	newirq = dev->irq;
-	if (!((1 << newirq) & mask)) {
+	if (newirq && !((1 << newirq) & mask)) {
 		if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
 		else printk(KERN_WARNING "PCI: IRQ %i for device %s doesn't match PIRQ mask - try pci=usepirqmask\n", newirq, pci_name(dev));
 	}
-- 
cgit v1.1