52 files changed, 1639 insertions, 2101 deletions
diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile
index e3ca863..7c3ac0c 100644
--- a/src/mesa/drivers/dri/i965/Makefile
+++ b/src/mesa/drivers/dri/i965/Makefile
@@ -81,7 +81,6 @@ DRIVER_SOURCES = \
 	brw_wm_emit.c \
 	brw_wm_fp.c \
 	brw_wm_iz.c \
-	brw_wm_glsl.c \
 	brw_wm_pass0.c \
 	brw_wm_pass1.c \
 	brw_wm_pass2.c \
diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
index d3a1233..d286c9d 100644
--- a/src/mesa/drivers/dri/i965/brw_cc.c
+++ b/src/mesa/drivers/dri/i965/brw_cc.c
@@ -239,7 +239,7 @@ static void upload_blend_constant_color(struct brw_context *brw)
    struct brw_blend_constant_color bcc;
 
    memset(&bcc, 0, sizeof(bcc));
-   bcc.header.opcode = CMD_BLEND_CONSTANT_COLOR;
+   bcc.header.opcode = _3DSTATE_BLEND_CONSTANT_COLOR;
    bcc.header.length = sizeof(bcc)/4-2;
    bcc.blend_constant_color[0] = ctx->Color.BlendColor[0];
    bcc.blend_constant_color[1] = ctx->Color.BlendColor[1];
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index cb0a8b9..8fc322f 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -122,9 +122,6 @@ GLboolean brwCreateContext( int api,
 	 (i == MESA_SHADER_FRAGMENT);
       ctx->ShaderCompilerOptions[i].EmitNoIndirectTemp =
 	 (i == MESA_SHADER_FRAGMENT);
-
-      if (intel->gen == 6)
-	 ctx->ShaderCompilerOptions[i].EmitNoIfs = (i == MESA_SHADER_VERTEX);
    }
 
    ctx->Const.VertexProgram.MaxNativeInstructions = (16 * 1024);
@@ -154,6 +151,13 @@ GLboolean brwCreateContext( int api,
       MIN2(ctx->Const.FragmentProgram.MaxNativeParameters,
 	   ctx->Const.FragmentProgram.MaxEnvParams);
 
+   /* Gen6 converts quads to polygons at the beginning of the 3D pipeline,
+      but we're not sure how the vertex order is handled, which affects
+      the provoking vertex decision. Always use the last vertex convention
+      for quad primitives, which works as expected for now. */
+   if (intel->gen == 6)
+       ctx->Const.QuadsFollowProvokingVertexConvention = GL_FALSE;
+
    if (intel->is_g4x || intel->gen >= 5) {
       brw->CMD_VF_STATISTICS = CMD_VF_STATISTICS_GM45;
       brw->CMD_PIPELINE_SELECT = CMD_PIPELINE_SELECT_GM45;
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 3353395..7069724 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -171,7 +171,6 @@ struct brw_vertex_program {
 struct brw_fragment_program {
    struct gl_fragment_program program;
    GLuint id;  /**< serial no. to identify frag progs, never re-used */
-   GLboolean isGLSL;  /**< really, any IF/LOOP/CONT/BREAK instructions */
 
    /** for debugging, which texture units are referenced */
    GLbitfield tex_units_used;
@@ -211,6 +210,7 @@ struct brw_wm_prog_data {
    GLuint nr_params;       /**< number of float params/constants */
    GLuint nr_pull_params;
    GLboolean error;
+   int dispatch_width;
 
    /* Pointer to tracked values (only valid once
     * _mesa_load_state_parameters has been called at runtime).
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index 7b823eb..877b22f 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -242,21 +242,13 @@ static void prepare_constant_buffer(struct brw_context *brw)
       GLuint offset = brw->curbe.vs_start * 16;
       GLuint nr = brw->vs.prog_data->nr_params / 4;
 
-      if (vp->use_const_buffer) {
-	 /* Load the subset of push constants that will get used when
-	  * we also have a pull constant buffer.
-	  */
-	 for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) {
-	    if (brw->vs.constant_map[i] != -1) {
-	       assert(brw->vs.constant_map[i] <= nr);
-	       memcpy(buf + offset + brw->vs.constant_map[i] * 4,
-		      vp->program.Base.Parameters->ParameterValues[i],
-		      4 * sizeof(float));
-	    }
-	 }
-      } else {
-	 for (i = 0; i < nr; i++) {
-	    memcpy(buf + offset + i * 4,
+      /* Load the subset of push constants that will get used when
+       * we also have a pull constant buffer.
+       */
+      for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) {
+	 if (brw->vs.constant_map[i] != -1) {
+	    assert(brw->vs.constant_map[i] <= nr);
+	    memcpy(buf + offset + brw->vs.constant_map[i] * 4,
 		   vp->program.Base.Parameters->ParameterValues[i],
 		   4 * sizeof(float));
 	 }
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 239586a..2f7dcc2 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -35,28 +35,6 @@
 
 /* 3D state:
  */
-#define _3DOP_3DSTATE_PIPELINED       0x0
-#define _3DOP_3DSTATE_NONPIPELINED    0x1
-#define _3DOP_3DCONTROL               0x2
-#define _3DOP_3DPRIMITIVE             0x3
-
-#define _3DSTATE_PIPELINED_POINTERS       0x00
-#define _3DSTATE_BINDING_TABLE_POINTERS   0x01
-#define _3DSTATE_VERTEX_BUFFERS           0x08
-#define _3DSTATE_VERTEX_ELEMENTS          0x09
-#define _3DSTATE_INDEX_BUFFER             0x0A
-#define _3DSTATE_VF_STATISTICS            0x0B
-#define _3DSTATE_DRAWING_RECTANGLE            0x00
-#define _3DSTATE_CONSTANT_COLOR               0x01
-#define _3DSTATE_SAMPLER_PALETTE_LOAD         0x02
-#define _3DSTATE_CHROMA_KEY                   0x04
-#define _3DSTATE_DEPTH_BUFFER                 0x05
-#define _3DSTATE_POLY_STIPPLE_OFFSET          0x06
-#define _3DSTATE_POLY_STIPPLE_PATTERN         0x07
-#define _3DSTATE_LINE_STIPPLE                 0x08
-#define _3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP    0x09
-#define _3DCONTROL    0x00
-
 #define PIPE_CONTROL_NOWRITE          0x00
 #define PIPE_CONTROL_WRITEIMMEDIATE   0x01
 #define PIPE_CONTROL_WRITEDEPTH       0x02
@@ -389,6 +367,7 @@
 #define BRW_SURFACEFORMAT_R8_SSCALED                     0x149
 #define BRW_SURFACEFORMAT_R8_USCALED                     0x14A
 #define BRW_SURFACEFORMAT_L8_UNORM_SRGB                  0x14C
+#define BRW_SURFACEFORMAT_DXT1_RGB_SRGB                  0x180
 #define BRW_SURFACEFORMAT_R1_UINT                        0x181 
 #define BRW_SURFACEFORMAT_YCRCB_NORMAL                   0x182 
 #define BRW_SURFACEFORMAT_YCRCB_SWAPUVY                  0x183 
@@ -462,6 +441,13 @@
 #define BRW_COMPRESSION_2NDHALF       1
 #define BRW_COMPRESSION_COMPRESSED    2
 
+#define GEN6_COMPRESSION_1Q		0
+#define GEN6_COMPRESSION_2Q		1
+#define GEN6_COMPRESSION_3Q		2
+#define GEN6_COMPRESSION_4Q		3
+#define GEN6_COMPRESSION_1H		0
+#define GEN6_COMPRESSION_2H		2
+
 #define BRW_CONDITIONAL_NONE  0
 #define BRW_CONDITIONAL_Z     1
 #define BRW_CONDITIONAL_NZ    2
@@ -837,7 +823,7 @@
 # define GEN6_BINDING_TABLE_MODIFY_GS	(1 << 9)
 # define GEN6_BINDING_TABLE_MODIFY_PS	(1 << 12)
 
-#define CMD_3D_SAMPLER_STATE_POINTERS			0x7802 /* SNB+ */
+#define _3DSTATE_SAMPLER_STATE_POINTERS		0x7802 /* GEN6+ */
 # define PS_SAMPLER_STATE_CHANGE				(1 << 12)
 # define GS_SAMPLER_STATE_CHANGE				(1 << 9)
 # define VS_SAMPLER_STATE_CHANGE				(1 << 8)
@@ -878,27 +864,29 @@
 #define CMD_INDEX_BUFFER              0x780a
 #define CMD_VF_STATISTICS_965          0x780b
 #define CMD_VF_STATISTICS_GM45        0x680b
-#define CMD_3D_CC_STATE_POINTERS      0x780e /* GEN6+ */
+#define _3DSTATE_CC_STATE_POINTERS		0x780e /* GEN6+ */
 
-#define CMD_URB					0x7805 /* GEN6+ */
+#define _3DSTATE_URB				0x7805 /* GEN6+ */
 # define GEN6_URB_VS_SIZE_SHIFT				16
 # define GEN6_URB_VS_ENTRIES_SHIFT			0
 # define GEN6_URB_GS_ENTRIES_SHIFT			8
 # define GEN6_URB_GS_SIZE_SHIFT				0
 
-#define CMD_VIEWPORT_STATE_POINTERS			0x780d /* GEN6+ */
+#define _3DSTATE_VIEWPORT_STATE_POINTERS	0x780d /* GEN6+ */
 # define GEN6_CC_VIEWPORT_MODIFY			(1 << 12)
 # define GEN6_SF_VIEWPORT_MODIFY			(1 << 11)
 # define GEN6_CLIP_VIEWPORT_MODIFY			(1 << 10)
 
-#define CMD_3D_SCISSOR_STATE_POINTERS		0x780f /* GEN6+ */
+#define _3DSTATE_SCISSOR_STATE_POINTERS		0x780f /* GEN6+ */
 
-#define CMD_3D_VS_STATE		      0x7810 /* GEN6+ */
+#define _3DSTATE_VS				0x7810 /* GEN6+ */
 /* DW2 */
 # define GEN6_VS_SPF_MODE				(1 << 31)
 # define GEN6_VS_VECTOR_MASK_ENABLE			(1 << 30)
 # define GEN6_VS_SAMPLER_COUNT_SHIFT			27
 # define GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT	18
+# define GEN6_VS_FLOATING_POINT_MODE_IEEE_754		(0 << 16)
+# define GEN6_VS_FLOATING_POINT_MODE_ALT		(1 << 16)
 /* DW4 */
 # define GEN6_VS_DISPATCH_START_GRF_SHIFT		20
 # define GEN6_VS_URB_READ_LENGTH_SHIFT			11
@@ -909,7 +897,7 @@
 # define GEN6_VS_CACHE_DISABLE				(1 << 1)
 # define GEN6_VS_ENABLE					(1 << 0)
 
-#define CMD_3D_GS_STATE		      0x7811 /* GEN6+ */
+#define _3DSTATE_GS		      		0x7811 /* GEN6+ */
 /* DW2 */
 # define GEN6_GS_SPF_MODE				(1 << 31)
 # define GEN6_GS_VECTOR_MASK_ENABLE			(1 << 30)
@@ -927,7 +915,7 @@
 /* DW6 */
 # define GEN6_GS_ENABLE					(1 << 15)
 
-#define CMD_3D_CLIP_STATE		      0x7812 /* GEN6+ */
+#define _3DSTATE_CLIP				0x7812 /* GEN6+ */
 /* DW1 */
 # define GEN6_CLIP_STATISTICS_ENABLE			(1 << 10)
 /**
@@ -957,7 +945,7 @@
 # define GEN6_CLIP_MAX_POINT_WIDTH_SHIFT		6
 # define GEN6_CLIP_FORCE_ZERO_RTAINDEX			(1 << 5)
 
-#define CMD_3D_SF_STATE				0x7813 /* GEN6+ */
+#define _3DSTATE_SF				0x7813 /* GEN6+ */
 /* DW1 */
 # define GEN6_SF_NUM_OUTPUTS_SHIFT			22
 # define GEN6_SF_SWIZZLE_ENABLE				(1 << 21)
@@ -1022,18 +1010,27 @@
 # define ATTRIBUTE_0_CONST_SOURCE_SHIFT			9
 # define ATTRIBUTE_0_SWIZZLE_SHIFT			6
 # define ATTRIBUTE_0_SOURCE_SHIFT			0
+
+# define ATTRIBUTE_SWIZZLE_INPUTATTR                    0
+# define ATTRIBUTE_SWIZZLE_INPUTATTR_FACING             1
+# define ATTRIBUTE_SWIZZLE_INPUTATTR_W                  2
+# define ATTRIBUTE_SWIZZLE_INPUTATTR_FACING_W           3
+# define ATTRIBUTE_SWIZZLE_SHIFT                        6
+
 /* DW16: Point sprite texture coordinate enables */
 /* DW17: Constant interpolation enables */
 /* DW18: attr 0-7 wrap shortest enables */
 /* DW19: attr 8-16 wrap shortest enables */
 
-#define CMD_3D_WM_STATE		      0x7814 /* GEN6+ */
+#define _3DSTATE_WM				0x7814 /* GEN6+ */
 /* DW1: kernel pointer */
 /* DW2 */
 # define GEN6_WM_SPF_MODE				(1 << 31)
 # define GEN6_WM_VECTOR_MASK_ENABLE			(1 << 30)
 # define GEN6_WM_SAMPLER_COUNT_SHIFT			27
 # define GEN6_WM_BINDING_TABLE_ENTRY_COUNT_SHIFT	18
+# define GEN6_WM_FLOATING_POINT_MODE_IEEE_754		(0 << 16)
+# define GEN6_WM_FLOATING_POINT_MODE_ALT		(1 << 16)
 /* DW3: scratch space */
 /* DW4 */
 # define GEN6_WM_STATISTICS_ENABLE			(1 << 31)
@@ -1088,34 +1085,34 @@
 /* DW7: kernel 1 pointer */
 /* DW8: kernel 2 pointer */
 
-#define CMD_3D_CONSTANT_VS_STATE	      0x7815 /* GEN6+ */
-#define CMD_3D_CONSTANT_GS_STATE	      0x7816 /* GEN6+ */
-#define CMD_3D_CONSTANT_PS_STATE	      0x7817 /* GEN6+ */
+#define _3DSTATE_CONSTANT_VS		      0x7815 /* GEN6+ */
+#define _3DSTATE_CONSTANT_GS		      0x7816 /* GEN6+ */
+#define _3DSTATE_CONSTANT_PS		      0x7817 /* GEN6+ */
 # define GEN6_CONSTANT_BUFFER_3_ENABLE			(1 << 15)
 # define GEN6_CONSTANT_BUFFER_2_ENABLE			(1 << 14)
 # define GEN6_CONSTANT_BUFFER_1_ENABLE			(1 << 13)
 # define GEN6_CONSTANT_BUFFER_0_ENABLE			(1 << 12)
 
-#define CMD_3D_SAMPLE_MASK			0x7818 /* GEN6+ */
+#define _3DSTATE_SAMPLE_MASK			0x7818 /* GEN6+ */
 
-#define CMD_DRAW_RECT                 0x7900
-#define CMD_BLEND_CONSTANT_COLOR      0x7901
-#define CMD_CHROMA_KEY                0x7904
-#define CMD_DEPTH_BUFFER              0x7905
-#define CMD_POLY_STIPPLE_OFFSET       0x7906
-#define CMD_POLY_STIPPLE_PATTERN      0x7907
-#define CMD_LINE_STIPPLE_PATTERN      0x7908
-#define CMD_GLOBAL_DEPTH_OFFSET_CLAMP 0x7909
-#define CMD_AA_LINE_PARAMETERS        0x790a
+#define _3DSTATE_DRAWING_RECTANGLE		0x7900
+#define _3DSTATE_BLEND_CONSTANT_COLOR		0x7901
+#define _3DSTATE_CHROMA_KEY			0x7904
+#define _3DSTATE_DEPTH_BUFFER			0x7905
+#define _3DSTATE_POLY_STIPPLE_OFFSET		0x7906
+#define _3DSTATE_POLY_STIPPLE_PATTERN		0x7907
+#define _3DSTATE_LINE_STIPPLE_PATTERN		0x7908
+#define _3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP	0x7909
+#define _3DSTATE_AA_LINE_PARAMETERS		0x790a /* G45+ */
 
-#define CMD_GS_SVB_INDEX			0x790b /* CTG+ */
+#define _3DSTATE_GS_SVB_INDEX			0x790b /* CTG+ */
 /* DW1 */
 # define SVB_INDEX_SHIFT				29
 # define SVB_LOAD_INTERNAL_VERTEX_COUNT			(1 << 0) /* SNB+ */
 /* DW2: SVB index */
 /* DW3: SVB maximum index */
 
-#define CMD_3D_MULTISAMPLE			0x790d /* SNB+ */
+#define _3DSTATE_MULTISAMPLE			0x790d /* GEN6+ */
 /* DW1 */
 # define MS_PIXEL_LOCATION_CENTER			(0 << 4)
 # define MS_PIXEL_LOCATION_UPPER_LEFT			(1 << 4)
@@ -1123,7 +1120,10 @@
 # define MS_NUMSAMPLES_4				(2 << 1)
 # define MS_NUMSAMPLES_8				(3 << 1)
 
-#define CMD_3D_CLEAR_PARAMS			0x7910 /* ILK+ */
+#define _3DSTATE_STENCIL_BUFFER			0x790e /* ILK, SNB */
+#define _3DSTATE_HIER_DEPTH_BUFFER		0x790f /* ILK, SNB */
+
+#define _3DSTATE_CLEAR_PARAMS			0x7910 /* ILK+ */
 # define DEPTH_CLEAR_VALID				(1 << 15)
 /* DW1: depth clear value */
 
diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c
index 962c041..111cb99 100644
--- a/src/mesa/drivers/dri/i965/brw_disasm.c
+++ b/src/mesa/drivers/dri/i965/brw_disasm.c
@@ -899,7 +899,8 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen)
 	err |= dest (file, inst);
     } else if (gen >= 6 && (inst->header.opcode == BRW_OPCODE_IF ||
 			    inst->header.opcode == BRW_OPCODE_ELSE ||
-			    inst->header.opcode == BRW_OPCODE_ENDIF)) {
+			    inst->header.opcode == BRW_OPCODE_ENDIF ||
+			    inst->header.opcode == BRW_OPCODE_WHILE)) {
        format (file, " %d", inst->bits1.branch_gen6.jump_count);
     }
 
@@ -972,7 +973,7 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen)
 			inst->bits3.dp_render_cache.send_commit_msg,
 			inst->bits3.dp_render_cache.msg_length,
 			inst->bits3.dp_render_cache.response_length);
-	    } else if (gen >= 5) {
+	    } else if (gen >= 5 /* FINISHME: || is_g4x */) {
 		format (file, " (%d, %d, %d)",
 			inst->bits3.dp_read_gen5.binding_table_index,
 			inst->bits3.dp_read_gen5.msg_control,
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index a1f403c..7eb16b7 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -159,7 +159,7 @@ static void brw_emit_prim(struct brw_context *brw,
    }
    if (prim_packet.verts_per_instance) {
       intel_batchbuffer_data( brw->intel.batch, &prim_packet,
-			      sizeof(prim_packet));
+			      sizeof(prim_packet), false);
    }
    if (intel->always_flush_cache) {
       intel_batchbuffer_emit_mi_flush(intel->batch);
@@ -351,7 +351,8 @@ static GLboolean brw_try_draw_prims( struct gl_context *ctx,
        * an upper bound of how much we might emit in a single
        * brw_try_draw_prims().
        */
-      intel_batchbuffer_require_space(intel->batch, intel->batch->size / 4);
+      intel_batchbuffer_require_space(intel->batch, intel->batch->size / 4,
+				      false);
 
       hw_prim = brw_set_prim(brw, &prim[i]);
 
diff --git a/src/mesa/drivers/dri/i965/brw_eu.c b/src/mesa/drivers/dri/i965/brw_eu.c
index 2ff39e8..3b5c4c0 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.c
+++ b/src/mesa/drivers/dri/i965/brw_eu.c
@@ -72,7 +72,37 @@ void brw_set_access_mode( struct brw_compile *p, GLuint access_mode )
 
 void brw_set_compression_control( struct brw_compile *p, GLboolean compression_control )
 {
-   p->current->header.compression_control = compression_control;
+   p->compressed = (compression_control == BRW_COMPRESSION_COMPRESSED);
+
+   if (p->brw->intel.gen >= 6) {
+      /* Since we don't use the 32-wide support in gen6, we translate
+       * the pre-gen6 compression control here.
+       */
+      switch (compression_control) {
+      case BRW_COMPRESSION_NONE:
+	 /* This is the "use the first set of bits of dmask/vmask/arf
+	  * according to execsize" option.
+	  */
+	 p->current->header.compression_control = GEN6_COMPRESSION_1Q;
+	 break;
+      case BRW_COMPRESSION_2NDHALF:
+	 /* For 8-wide, this is "use the second set of 8 bits." */
+	 p->current->header.compression_control = GEN6_COMPRESSION_2Q;
+	 break;
+      case BRW_COMPRESSION_COMPRESSED:
+	 /* For 16-wide instruction compression, use the first set of 16 bits
+	  * since we don't do 32-wide dispatch.
+	  */
+	 p->current->header.compression_control = GEN6_COMPRESSION_1H;
+	 break;
+      default:
+	 assert(!"not reached");
+	 p->current->header.compression_control = GEN6_COMPRESSION_1H;
+	 break;
+      }
+   } else {
+      p->current->header.compression_control = compression_control;
+   }
 }
 
 void brw_set_mask_control( struct brw_compile *p, GLuint value )
@@ -95,6 +125,7 @@ void brw_push_insn_state( struct brw_compile *p )
 {
    assert(p->current != &p->stack[BRW_EU_MAX_INSN_STACK-1]);
    memcpy(p->current+1, p->current, sizeof(struct brw_instruction));
+   p->compressed_stack[p->current - p->stack] = p->compressed;
    p->current++;   
 }
 
@@ -102,6 +133,7 @@ void brw_pop_insn_state( struct brw_compile *p )
 {
    assert(p->current != p->stack);
    p->current--;
+   p->compressed = p->compressed_stack[p->current - p->stack];
 }
 
 
@@ -112,6 +144,7 @@ void brw_init_compile( struct brw_context *brw, struct brw_compile *p )
    p->brw = brw;
    p->nr_insn = 0;
    p->current = p->stack;
+   p->compressed = false;
    memset(p->current, 0, sizeof(p->current[0]));
 
    /* Some defaults?
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index b4538e6..119ffc7 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -33,6 +33,7 @@
 #ifndef BRW_EU_H
 #define BRW_EU_H
 
+#include <stdbool.h>
 #include "brw_structs.h"
 #include "brw_defines.h"
 #include "program/prog_instruction.h"
@@ -106,10 +107,12 @@ struct brw_compile {
    /* Allow clients to push/pop instruction state:
     */
    struct brw_instruction stack[BRW_EU_MAX_INSN_STACK];
+   bool compressed_stack[BRW_EU_MAX_INSN_STACK];
    struct brw_instruction *current;
 
    GLuint flag_value;
    GLboolean single_program_flow;
+   bool compressed;
    struct brw_context *brw;
 
    struct brw_glsl_label *first_label;  /**< linked list of labels */
@@ -858,7 +861,8 @@ void brw_fb_WRITE(struct brw_compile *p,
 		   GLuint binding_table_index,
 		   GLuint msg_length,
 		   GLuint response_length,
-		   GLboolean eot);
+		   GLboolean eot,
+		   GLboolean header_present);
 
 void brw_SAMPLE(struct brw_compile *p,
 		struct brw_reg dest,
@@ -954,6 +958,8 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
 	       struct brw_instruction *patch_insn);
 
 struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count);
+struct brw_instruction *brw_CONT_gen6(struct brw_compile *p,
+				      struct brw_instruction *do_insn);
 struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count);
 /* Forward jumps:
  */
@@ -1009,6 +1015,7 @@ void brw_math_invert( struct brw_compile *p,
 void brw_set_src1( struct brw_instruction *insn,
                           struct brw_reg reg );
 
+void brw_set_uip_jip(struct brw_compile *p);
 
 /* brw_optimize.c */
 void brw_optimize(struct brw_compile *p);
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 9cb941d..88131c4 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -41,19 +41,20 @@
  * Internal helper for constructing instructions
  */
 
-static void guess_execution_size( struct brw_instruction *insn,
-				  struct brw_reg reg )
+static void guess_execution_size(struct brw_compile *p,
+				 struct brw_instruction *insn,
+				 struct brw_reg reg)
 {
-   if (reg.width == BRW_WIDTH_8 && 
-       insn->header.compression_control == BRW_COMPRESSION_COMPRESSED) 
+   if (reg.width == BRW_WIDTH_8 && p->compressed)
       insn->header.execution_size = BRW_EXECUTE_16;
    else
       insn->header.execution_size = reg.width;	/* note - definitions are compatible */
 }
 
 
-static void brw_set_dest( struct brw_instruction *insn,
-			  struct brw_reg dest )
+static void brw_set_dest(struct brw_compile *p,
+			 struct brw_instruction *insn,
+			 struct brw_reg dest)
 {
    if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
        dest.file != BRW_MESSAGE_REGISTER_FILE)
@@ -100,7 +101,7 @@ static void brw_set_dest( struct brw_instruction *insn,
    /* NEW: Set the execution size based on dest.width and
     * insn->compression_control:
     */
-   guess_execution_size(insn, dest);
+   guess_execution_size(p, insn, dest);
 }
 
 extern int reg_type_size[];
@@ -535,6 +536,16 @@ brw_set_dp_read_message(struct brw_context *brw,
        insn->bits3.dp_read_gen5.end_of_thread = 0;
        insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
        insn->bits2.send_gen5.end_of_thread = 0;
+   } else if (intel->is_g4x) {
+       insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
+       insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
+       insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
+       insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
+       insn->bits3.dp_read_g4x.response_length = response_length;  /*16:19*/
+       insn->bits3.dp_read_g4x.msg_length = msg_length;  /*20:23*/
+       insn->bits3.dp_read_g4x.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
+       insn->bits3.dp_read_g4x.pad1 = 0;
+       insn->bits3.dp_read_g4x.end_of_thread = 0;
    } else {
        insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
        insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
@@ -629,7 +640,7 @@ static struct brw_instruction *brw_alu1( struct brw_compile *p,
 					 struct brw_reg src )
 {
    struct brw_instruction *insn = next_insn(p, opcode);
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src);   
    return insn;
 }
@@ -641,7 +652,7 @@ static struct brw_instruction *brw_alu2(struct brw_compile *p,
 					struct brw_reg src1 )
 {
    struct brw_instruction *insn = next_insn(p, opcode);   
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, src1);
    return insn;
@@ -680,7 +691,7 @@ void brw_##OP(struct brw_compile *p,					      \
 {									      \
    struct brw_instruction *rnd, *add;					      \
    rnd = next_insn(p, BRW_OPCODE_##OP);					      \
-   brw_set_dest(rnd, dest);						      \
+   brw_set_dest(p, rnd, dest);						      \
    brw_set_src0(rnd, src);						      \
    rnd->header.destreg__conditionalmod = 0x7; /* turn on round-increments */  \
 									      \
@@ -779,7 +790,7 @@ struct brw_instruction *brw_MUL(struct brw_compile *p,
 void brw_NOP(struct brw_compile *p)
 {
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);   
-   brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
    brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
    brw_set_src1(insn, brw_imm_ud(0x0));
 }
@@ -840,11 +851,11 @@ struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size)
    /* Override the defaults for this instruction:
     */
    if (intel->gen < 6) {
-      brw_set_dest(insn, brw_ip_reg());
+      brw_set_dest(p, insn, brw_ip_reg());
       brw_set_src0(insn, brw_ip_reg());
       brw_set_src1(insn, brw_imm_d(0x0));
    } else {
-      brw_set_dest(insn, brw_imm_w(0));
+      brw_set_dest(p, insn, brw_imm_w(0));
       insn->bits1.branch_gen6.jump_count = 0;
       brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
@@ -870,7 +881,7 @@ brw_IF_gen6(struct brw_compile *p, uint32_t conditional,
 
    insn = next_insn(p, BRW_OPCODE_IF);
 
-   brw_set_dest(insn, brw_imm_w(0));
+   brw_set_dest(p, insn, brw_imm_w(0));
    insn->header.execution_size = BRW_EXECUTE_8;
    insn->bits1.branch_gen6.jump_count = 0;
    brw_set_src0(insn, src0);
@@ -905,11 +916,11 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p,
    }
 
    if (intel->gen < 6) {
-      brw_set_dest(insn, brw_ip_reg());
+      brw_set_dest(p, insn, brw_ip_reg());
       brw_set_src0(insn, brw_ip_reg());
       brw_set_src1(insn, brw_imm_d(0x0));
    } else {
-      brw_set_dest(insn, brw_imm_w(0));
+      brw_set_dest(p, insn, brw_imm_w(0));
       insn->bits1.branch_gen6.jump_count = 0;
       brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
@@ -965,11 +976,11 @@ void brw_ENDIF(struct brw_compile *p,
       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_ENDIF);
 
       if (intel->gen < 6) {
-	 brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+	 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
 	 brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
 	 brw_set_src1(insn, brw_imm_d(0x0));
       } else {
-	 brw_set_dest(insn, brw_imm_w(0));
+	 brw_set_dest(p, insn, brw_imm_w(0));
 	 brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
 	 brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       }
@@ -1029,16 +1040,44 @@ void brw_ENDIF(struct brw_compile *p,
 
 struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
+
    insn = next_insn(p, BRW_OPCODE_BREAK);
-   brw_set_dest(insn, brw_ip_reg());
+   if (intel->gen >= 6) {
+      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(insn, brw_imm_d(0x0));
+   } else {
+      brw_set_dest(p, insn, brw_ip_reg());
+      brw_set_src0(insn, brw_ip_reg());
+      brw_set_src1(insn, brw_imm_d(0x0));
+      insn->bits3.if_else.pad0 = 0;
+      insn->bits3.if_else.pop_count = pop_count;
+   }
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.execution_size = BRW_EXECUTE_8;
+
+   return insn;
+}
+
+struct brw_instruction *brw_CONT_gen6(struct brw_compile *p,
+				      struct brw_instruction *do_insn)
+{
+   struct brw_instruction *insn;
+   int br = 2;
+
+   insn = next_insn(p, BRW_OPCODE_CONTINUE);
+   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   brw_set_dest(p, insn, brw_ip_reg());
    brw_set_src0(insn, brw_ip_reg());
    brw_set_src1(insn, brw_imm_d(0x0));
+
+   insn->bits3.break_cont.uip = br * (do_insn - insn);
+
    insn->header.compression_control = BRW_COMPRESSION_NONE;
    insn->header.execution_size = BRW_EXECUTE_8;
-   /* insn->header.mask_control = BRW_MASK_DISABLE; */
-   insn->bits3.if_else.pad0 = 0;
-   insn->bits3.if_else.pop_count = pop_count;
    return insn;
 }
 
@@ -1046,7 +1085,7 @@ struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
 {
    struct brw_instruction *insn;
    insn = next_insn(p, BRW_OPCODE_CONTINUE);
-   brw_set_dest(insn, brw_ip_reg());
+   brw_set_dest(p, insn, brw_ip_reg());
    brw_set_src0(insn, brw_ip_reg());
    brw_set_src1(insn, brw_imm_d(0x0));
    insn->header.compression_control = BRW_COMPRESSION_NONE;
@@ -1058,17 +1097,33 @@ struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
 }
 
 /* DO/WHILE loop:
+ *
+ * The DO/WHILE is just an unterminated loop -- break or continue are
+ * used for control within the loop.  We have a few ways they can be
+ * done.
+ *
+ * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
+ * jip and no DO instruction.
+ *
+ * For non-uniform control flow pre-gen6, there's a DO instruction to
+ * push the mask, and a WHILE to jump back, and BREAK to get out and
+ * pop the mask.
+ *
+ * For gen6, there's no more mask stack, so no need for DO.  WHILE
+ * just points back to the first instruction of the loop.
  */
 struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
 {
-   if (p->single_program_flow) {
+   struct intel_context *intel = &p->brw->intel;
+
+   if (intel->gen >= 6 || p->single_program_flow) {
       return &p->store[p->nr_insn];
    } else {
       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
 
       /* Override the defaults for this instruction:
        */
-      brw_set_dest(insn, brw_null_reg());
+      brw_set_dest(p, insn, brw_null_reg());
       brw_set_src0(insn, brw_null_reg());
       brw_set_src1(insn, brw_null_reg());
 
@@ -1094,34 +1149,42 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
    if (intel->gen >= 5)
       br = 2;
 
-   if (p->single_program_flow)
-      insn = next_insn(p, BRW_OPCODE_ADD);
-   else
+   if (intel->gen >= 6) {
       insn = next_insn(p, BRW_OPCODE_WHILE);
 
-   brw_set_dest(insn, brw_ip_reg());
-   brw_set_src0(insn, brw_ip_reg());
-   brw_set_src1(insn, brw_imm_d(0x0));
+      brw_set_dest(p, insn, brw_imm_w(0));
+      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
+      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
 
-   insn->header.compression_control = BRW_COMPRESSION_NONE;
+      insn->header.execution_size = do_insn->header.execution_size;
+      assert(insn->header.execution_size == BRW_EXECUTE_8);
+   } else {
+      if (p->single_program_flow) {
+	 insn = next_insn(p, BRW_OPCODE_ADD);
 
-   if (p->single_program_flow) {
-      insn->header.execution_size = BRW_EXECUTE_1;
+	 brw_set_dest(p, insn, brw_ip_reg());
+	 brw_set_src0(insn, brw_ip_reg());
+	 brw_set_src1(insn, brw_imm_d((do_insn - insn) * 16));
+	 insn->header.execution_size = BRW_EXECUTE_1;
+      } else {
+	 insn = next_insn(p, BRW_OPCODE_WHILE);
 
-      insn->bits3.d = (do_insn - insn) * 16;
-   } else {
-      insn->header.execution_size = do_insn->header.execution_size;
+	 assert(do_insn->header.opcode == BRW_OPCODE_DO);
 
-      assert(do_insn->header.opcode == BRW_OPCODE_DO);
-      insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
-      insn->bits3.if_else.pop_count = 0;
-      insn->bits3.if_else.pad0 = 0;
-   }
+	 brw_set_dest(p, insn, brw_ip_reg());
+	 brw_set_src0(insn, brw_ip_reg());
+	 brw_set_src1(insn, brw_imm_d(0));
 
-/*    insn->header.mask_control = BRW_MASK_ENABLE; */
+	 insn->header.execution_size = do_insn->header.execution_size;
+	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
+	 insn->bits3.if_else.pop_count = 0;
+	 insn->bits3.if_else.pad0 = 0;
+      }
+   }
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   p->current->header.predicate_control = BRW_PREDICATE_NONE;
 
-   /* insn->header.mask_control = BRW_MASK_DISABLE; */
-   p->current->header.predicate_control = BRW_PREDICATE_NONE;   
    return insn;
 }
 
@@ -1159,7 +1222,7 @@ void brw_CMP(struct brw_compile *p,
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
 
    insn->header.destreg__conditionalmod = conditional;
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, src1);
 
@@ -1184,7 +1247,7 @@ void brw_WAIT (struct brw_compile *p)
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
    struct brw_reg src = brw_notification_1_reg();
 
-   brw_set_dest(insn, src);
+   brw_set_dest(p, insn, src);
    brw_set_src0(insn, src);
    brw_set_src1(insn, brw_null_reg());
    insn->header.execution_size = 0; /* must */
@@ -1219,6 +1282,10 @@ void brw_math( struct brw_compile *p,
       assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
       assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
 
+      /* Source modifiers are ignored for extended math instructions. */
+      assert(!src.negate);
+      assert(!src.abs);
+
       if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
 	  function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
 	 assert(src.type == BRW_REGISTER_TYPE_F);
@@ -1228,8 +1295,9 @@ void brw_math( struct brw_compile *p,
        * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
        */
       insn->header.destreg__conditionalmod = function;
+      insn->header.saturate = saturate;
 
-      brw_set_dest(insn, dest);
+      brw_set_dest(p, insn, dest);
       brw_set_src0(insn, src);
       brw_set_src1(insn, brw_null_reg());
    } else {
@@ -1242,7 +1310,7 @@ void brw_math( struct brw_compile *p,
       insn->header.predicate_control = 0;
       insn->header.destreg__conditionalmod = msg_reg_nr;
 
-      brw_set_dest(insn, dest);
+      brw_set_dest(p, insn, dest);
       brw_set_src0(insn, src);
       brw_set_math_message(p->brw,
 			   insn,
@@ -1284,12 +1352,18 @@ void brw_math2(struct brw_compile *p,
       assert(src1.type == BRW_REGISTER_TYPE_F);
    }
 
+   /* Source modifiers are ignored for extended math instructions. */
+   assert(!src0.negate);
+   assert(!src0.abs);
+   assert(!src1.negate);
+   assert(!src1.abs);
+
    /* Math is the same ISA format as other opcodes, except that CondModifier
     * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
     */
    insn->header.destreg__conditionalmod = function;
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, src1);
 }
@@ -1318,8 +1392,13 @@ void brw_math_16( struct brw_compile *p,
        * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
        */
       insn->header.destreg__conditionalmod = function;
+      insn->header.saturate = saturate;
 
-      brw_set_dest(insn, dest);
+      /* Source modifiers are ignored for extended math instructions. */
+      assert(!src.negate);
+      assert(!src.abs);
+
+      brw_set_dest(p, insn, dest);
       brw_set_src0(insn, src);
       brw_set_src1(insn, brw_null_reg());
       return;
@@ -1334,7 +1413,7 @@ void brw_math_16( struct brw_compile *p,
    insn = next_insn(p, BRW_OPCODE_SEND);
    insn->header.destreg__conditionalmod = msg_reg_nr;
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src);
    brw_set_math_message(p->brw,
 			insn, 
@@ -1351,7 +1430,7 @@ void brw_math_16( struct brw_compile *p,
    insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
    insn->header.destreg__conditionalmod = msg_reg_nr+1;
 
-   brw_set_dest(insn, offset(dest,1));
+   brw_set_dest(p, insn, offset(dest,1));
    brw_set_src0(insn, src);
    brw_set_math_message(p->brw, 
 			insn, 
@@ -1446,7 +1525,7 @@ void brw_oword_block_write_scratch(struct brw_compile *p,
 	 send_commit_msg = 1;
       }
 
-      brw_set_dest(insn, dest);
+      brw_set_dest(p, insn, dest);
       brw_set_src0(insn, brw_null_reg());
 
       brw_set_dp_write_message(p->brw,
@@ -1516,7 +1595,7 @@ brw_oword_block_read_scratch(struct brw_compile *p,
       insn->header.compression_control = BRW_COMPRESSION_NONE;
       insn->header.destreg__conditionalmod = mrf.nr;
 
-      brw_set_dest(insn, dest);	/* UW? */
+      brw_set_dest(p, insn, dest);	/* UW? */
       brw_set_src0(insn, brw_null_reg());
 
       brw_set_dp_read_message(p->brw,
@@ -1569,7 +1648,7 @@ void brw_oword_block_read(struct brw_compile *p,
    /* cast dest to a uword[8] vector */
    dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    if (intel->gen >= 6) {
       brw_set_src0(insn, mrf);
    } else {
@@ -1614,7 +1693,7 @@ void brw_dword_scattered_read(struct brw_compile *p,
    /* cast dest to a uword[8] vector */
    dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, brw_null_reg());
 
    brw_set_dp_read_message(p->brw,
@@ -1639,29 +1718,22 @@ void brw_dp_READ_4_vs(struct brw_compile *p,
                       GLuint location,
                       GLuint bind_table_index)
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
    GLuint msg_reg_nr = 1;
-   struct brw_reg b;
 
-   /*
-   printf("vs const read msg, location %u, msg_reg_nr %d\n",
-          location, msg_reg_nr);
-   */
+   if (intel->gen >= 6)
+      location /= 16;
 
    /* Setup MRF[1] with location/offset into const buffer */
    brw_push_insn_state(p);
+   brw_set_access_mode(p, BRW_ALIGN_1);
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_set_mask_control(p, BRW_MASK_DISABLE);
    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-
-   /* XXX I think we're setting all the dwords of MRF[1] to 'location'.
-    * when the docs say only dword[2] should be set.  Hmmm.  But it works.
-    */
-   b = brw_message_reg(msg_reg_nr);
-   b = retype(b, BRW_REGISTER_TYPE_UD);
-   /*b = get_element_ud(b, 2);*/
-   brw_MOV(p, b, brw_imm_ud(location));
-
+   brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
+		     BRW_REGISTER_TYPE_UD),
+	   brw_imm_ud(location));
    brw_pop_insn_state(p);
 
    insn = next_insn(p, BRW_OPCODE_SEND);
@@ -1671,8 +1743,12 @@ void brw_dp_READ_4_vs(struct brw_compile *p,
    insn->header.destreg__conditionalmod = msg_reg_nr;
    insn->header.mask_control = BRW_MASK_DISABLE;
 
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, brw_null_reg());
+   brw_set_dest(p, insn, dest);
+   if (intel->gen >= 6) {
+      brw_set_src0(insn, brw_message_reg(msg_reg_nr));
+   } else {
+      brw_set_src0(insn, brw_null_reg());
+   }
 
    brw_set_dp_read_message(p->brw,
 			   insn,
@@ -1699,6 +1775,7 @@ void brw_dp_READ_4_vs_relative(struct brw_compile *p,
 
    /* Setup MRF[1] with offset into const buffer */
    brw_push_insn_state(p);
+   brw_set_access_mode(p, BRW_ALIGN_1);
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_set_mask_control(p, BRW_MASK_DISABLE);
    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
@@ -1706,7 +1783,7 @@ void brw_dp_READ_4_vs_relative(struct brw_compile *p,
    /* M1.0 is block offset 0, M1.4 is block offset 1, all other
     * fields ignored.
     */
-   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD),
+   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
 	   addr_reg, brw_imm_d(offset));
    brw_pop_insn_state(p);
 
@@ -1717,7 +1794,7 @@ void brw_dp_READ_4_vs_relative(struct brw_compile *p,
    insn->header.destreg__conditionalmod = 0;
    insn->header.mask_control = BRW_MASK_DISABLE;
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, brw_vec8_grf(0, 0));
 
    if (intel->gen == 6)
@@ -1747,12 +1824,12 @@ void brw_fb_WRITE(struct brw_compile *p,
                   GLuint binding_table_index,
                   GLuint msg_length,
                   GLuint response_length,
-                  GLboolean eot)
+                  GLboolean eot,
+                  GLboolean header_present)
 {
    struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
    GLuint msg_control, msg_type;
-   GLboolean header_present = GL_TRUE;
 
    if (intel->gen >= 6 && binding_table_index == 0) {
       insn = next_insn(p, BRW_OPCODE_SENDC);
@@ -1764,9 +1841,6 @@ void brw_fb_WRITE(struct brw_compile *p,
    insn->header.compression_control = BRW_COMPRESSION_NONE;
 
    if (intel->gen >= 6) {
-      if (msg_length == 4)
-	 header_present = GL_FALSE;
-
        /* headerless version, just submit color payload */
        src0 = brw_message_reg(msg_reg_nr);
 
@@ -1782,7 +1856,7 @@ void brw_fb_WRITE(struct brw_compile *p,
    else
       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src0);
    brw_set_dp_write_message(p->brw,
 			    insn,
@@ -1860,7 +1934,7 @@ void brw_SAMPLE(struct brw_compile *p,
 
 	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
 
-	 guess_execution_size(p->current, dest);
+	 guess_execution_size(p, p->current, dest);
 	 if (p->current->header.execution_size == BRW_EXECUTE_16)
 	    dispatch_16 = GL_TRUE;
 
@@ -1871,7 +1945,8 @@ void brw_SAMPLE(struct brw_compile *p,
 	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 	 brw_set_mask_control(p, BRW_MASK_DISABLE);
 
-	 brw_MOV(p, m1, brw_vec8_grf(0,0));	 
+	 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
+		 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
   	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12)); 
 
 	 brw_pop_insn_state(p);
@@ -1895,12 +1970,15 @@ void brw_SAMPLE(struct brw_compile *p,
        * and the first message register index comes from src0.
        */
       if (intel->gen >= 6) {
-	  brw_push_insn_state(p);
-	  brw_set_mask_control( p, BRW_MASK_DISABLE );
-	  /* m1 contains header? */
-	  brw_MOV(p, brw_message_reg(msg_reg_nr), src0);
-	  brw_pop_insn_state(p);
-	  src0 = brw_message_reg(msg_reg_nr);
+	 if (src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
+	     src0.nr != BRW_ARF_NULL) {
+	    brw_push_insn_state(p);
+	    brw_set_mask_control( p, BRW_MASK_DISABLE );
+	    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+	    brw_MOV(p, retype(brw_message_reg(msg_reg_nr), src0.type), src0);
+	    brw_pop_insn_state(p);
+	 }
+	 src0 = brw_message_reg(msg_reg_nr);
       }
 
       insn = next_insn(p, BRW_OPCODE_SEND);
@@ -1909,7 +1987,7 @@ void brw_SAMPLE(struct brw_compile *p,
       if (intel->gen < 6)
 	  insn->header.destreg__conditionalmod = msg_reg_nr;
 
-      brw_set_dest(insn, dest);
+      brw_set_dest(p, insn, dest);
       brw_set_src0(insn, src0);
       brw_set_sampler_message(p->brw, insn,
 			      binding_table_index,
@@ -1929,7 +2007,8 @@ void brw_SAMPLE(struct brw_compile *p,
        */
       brw_push_insn_state(p);
       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-      brw_MOV(p, reg, reg);	      
+      brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
+	      retype(reg, BRW_REGISTER_TYPE_UD));
       brw_pop_insn_state(p);
    }
 
@@ -1961,7 +2040,8 @@ void brw_urb_WRITE(struct brw_compile *p,
    if (intel->gen >= 6) {
       brw_push_insn_state(p);
       brw_set_mask_control( p, BRW_MASK_DISABLE );
-      brw_MOV(p, brw_message_reg(msg_reg_nr), src0);
+      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
+	      retype(src0, BRW_REGISTER_TYPE_UD));
       brw_pop_insn_state(p);
       src0 = brw_message_reg(msg_reg_nr);
    }
@@ -1970,7 +2050,7 @@ void brw_urb_WRITE(struct brw_compile *p,
 
    assert(msg_length < BRW_MAX_MRF);
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, brw_imm_d(0));
 
@@ -1989,6 +2069,80 @@ void brw_urb_WRITE(struct brw_compile *p,
 		       swizzle);
 }
 
+/* Return the index of the first ENDIF, ELSE or WHILE found after the
+ * instruction at 'start'.  Asserts if the program ends without one.
+ */
+static int
+brw_find_next_block_end(struct brw_compile *p, int start)
+{
+   int idx = start;
+
+   while (++idx < p->nr_insn) {
+      const unsigned opcode = p->store[idx].header.opcode;
+      if (opcode == BRW_OPCODE_ENDIF ||
+	  opcode == BRW_OPCODE_ELSE ||
+	  opcode == BRW_OPCODE_WHILE)
+	 return idx;
+   }
+   assert(!"not reached");
+   return start + 1;
+}
+
+/* Return the index of the WHILE closing the loop that contains 'start'.
+ * There is no DO instruction on gen6, so a WHILE belongs to our loop when
+ * it jumps back to or before 'start'.  The target may equal 'start' when
+ * the BREAK is the loop's first instruction, hence <= rather than <.
+ * Asserts if no such WHILE follows 'start'.
+ */
+static int
+brw_find_loop_end(struct brw_compile *p, int start)
+{
+   int ip;
+   int br = 2;
+
+   for (ip = start + 1; ip < p->nr_insn; ip++) {
+      struct brw_instruction *insn = &p->store[ip];
+      if (insn->header.opcode == BRW_OPCODE_WHILE &&
+	  ip + insn->bits1.branch_gen6.jump_count / br <= start)
+	 return ip;
+   }
+   assert(!"not reached");
+   return start + 1;
+}
+
+/* After program generation, go back and patch the jump offsets that need
+ * forward targets: JIP and UIP of BREAK, JIP of CONTINUE.  No-op before gen6.
+ */
+void
+brw_set_uip_jip(struct brw_compile *p)
+{
+   struct intel_context *intel = &p->brw->intel;
+   int ip;
+   int br = 2; /* multiplier applied to instruction-index distances */
+
+   if (intel->gen < 6)
+      return;
+
+   for (ip = 0; ip < p->nr_insn; ip++) {
+      struct brw_instruction *insn = &p->store[ip];
+
+      switch (insn->header.opcode) {
+      case BRW_OPCODE_BREAK:
+	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
+	 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip + 1);
+	 break;
+      case BRW_OPCODE_CONTINUE:
+	 /* UIP is expected to be set already at CONTINUE emit time, since
+	  * that's when the loop start is known; only JIP is filled in here.
+	  */
+	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
+	 assert(insn->bits3.break_cont.uip != 0);
+	 assert(insn->bits3.break_cont.jip != 0);
+	 break;
+      }
+   }
+}
+
 void brw_ff_sync(struct brw_compile *p,
 		   struct brw_reg dest,
 		   GLuint msg_reg_nr,
@@ -2013,7 +2167,7 @@ void brw_ff_sync(struct brw_compile *p,
    }
 
    insn = next_insn(p, BRW_OPCODE_SEND);
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, brw_imm_d(0));
 
diff --git a/src/mesa/drivers/dri/i965/brw_fallback.c b/src/mesa/drivers/dri/i965/brw_fallback.c
index 6796fb2..d0b0c22 100644
--- a/src/mesa/drivers/dri/i965/brw_fallback.c
+++ b/src/mesa/drivers/dri/i965/brw_fallback.c
@@ -36,8 +36,6 @@
 #include "swrast/swrast.h"
 #include "tnl/tnl.h"
 #include "brw_context.h"
-#include "intel_fbo.h"
-#include "intel_regions.h"
 
 #define FILE_DEBUG_FLAG DEBUG_FALLBACKS
 
@@ -63,49 +61,14 @@ static GLboolean do_check_fallback(struct brw_context *brw)
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
       struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i];
       if (texUnit->_ReallyEnabled) {
-	 struct intel_texture_object *intelObj = intel_texture_object(texUnit->_Current);
-	 struct gl_texture_image *texImage = intelObj->base.Image[0][intelObj->firstLevel];
+	 struct gl_texture_object *tex_obj = texUnit->_Current;
+	 struct gl_texture_image *texImage = tex_obj->Image[0][tex_obj->BaseLevel];
 	 if (texImage->Border) {
 	    DBG("FALLBACK: texture border\n");
 	    return GL_TRUE;
 	 }
       }
    }
-   
-   /* _NEW_STENCIL 
-    */
-   if (ctx->Stencil._Enabled &&
-       (ctx->DrawBuffer->Name == 0 && !brw->intel.hw_stencil)) {
-      DBG("FALLBACK: stencil\n");
-      return GL_TRUE;
-   }
-
-   /* _NEW_BUFFERS */
-   if (!brw->has_surface_tile_offset) {
-      for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
-	 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
-	 struct intel_renderbuffer *irb = intel_renderbuffer(rb);
-
-	 /* The original gen4 hardware couldn't set up WM surfaces pointing
-	  * at an offset within a tile, which can happen when rendering to
-	  * anything but the base level of a texture or the +X face/0 depth.
-	  * This was fixed with the 4 Series hardware.
-	  *
-	  * For these original chips, you would have to make the depth and
-	  * color destination surfaces include information on the texture
-	  * type, LOD, face, and various limits to use them as a destination.
-	  * I would have done this, but there's also a nasty requirement that
-	  * the depth and the color surfaces all be of the same LOD, which
-	  * may be a worse requirement than this alignment.  (Also, we may
-	  * want to just demote the texture to untiled, instead).
-	  */
-	 if (irb->region && irb->region->tiling != I915_TILING_NONE &&
-	     (irb->region->draw_offset & 4095)) {
-	    DBG("FALLBACK: non-tile-aligned destination for tiled FBO\n");
-	    return GL_TRUE;
-	 }
-      }
-   }
 
    return GL_FALSE;
 }
@@ -117,7 +80,7 @@ static void check_fallback(struct brw_context *brw)
 
 const struct brw_tracked_state brw_check_fallback = {
    .dirty = {
-      .mesa = _NEW_BUFFERS | _NEW_RENDERMODE | _NEW_TEXTURE | _NEW_STENCIL,
+      .mesa = _NEW_RENDERMODE | _NEW_TEXTURE | _NEW_STENCIL,
       .brw  = 0,
       .cache = 0
    },
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index edb02fa..a35687d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -48,6 +48,7 @@ extern "C" {
 #include "../glsl/ir_optimization.h"
 #include "../glsl/ir_print_visitor.h"
 
+#define MAX_INSTRUCTION (1 << 30)
 static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);
 
 struct gl_shader *
@@ -89,6 +90,9 @@ brw_compile_shader(struct gl_context *ctx, struct gl_shader *shader)
 GLboolean
 brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
 {
+   struct brw_context *brw = brw_context(ctx);
+   struct intel_context *intel = &brw->intel;
+
    struct brw_shader *shader =
       (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
    if (shader != NULL) {
@@ -107,7 +111,15 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
 			 SUB_TO_ADD_NEG |
 			 EXP_TO_EXP2 |
 			 LOG_TO_LOG2);
+
+      /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
+       * if-statements need to be flattened.
+       */
+      if (intel->gen < 6)
+	 lower_if_to_cond_assign(shader->ir, 16);
+
       do_lower_texture_projection(shader->ir);
+      do_vec_index_to_cond_assign(shader->ir);
       brw_do_cubemap_normalize(shader->ir);
 
       do {
@@ -474,8 +486,13 @@ fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
    wpos.reg_offset++;
 
    /* gl_FragCoord.z */
-   emit(fs_inst(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
-		interp_reg(FRAG_ATTRIB_WPOS, 2)));
+   if (intel->gen >= 6) {
+      emit(fs_inst(BRW_OPCODE_MOV, wpos,
+		   fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
+   } else {
+      emit(fs_inst(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
+		   interp_reg(FRAG_ATTRIB_WPOS, 2)));
+   }
    wpos.reg_offset++;
 
    /* gl_FragCoord.w: Already set up in emit_interpolation */
@@ -518,25 +535,40 @@ fs_visitor::emit_general_interpolation(ir_variable *ir)
 	    continue;
 	 }
 
-	 for (unsigned int c = 0; c < type->vector_elements; c++) {
-	    struct brw_reg interp = interp_reg(location, c);
-	    emit(fs_inst(FS_OPCODE_LINTERP,
-			 attr,
-			 this->delta_x,
-			 this->delta_y,
-			 fs_reg(interp)));
-	    attr.reg_offset++;
-	 }
-
-	 if (intel->gen < 6) {
-	    attr.reg_offset -= type->vector_elements;
+	 if (c->key.flat_shade && (location == FRAG_ATTRIB_COL0 ||
+				   location == FRAG_ATTRIB_COL1)) {
+	    /* Constant interpolation (flat shading) case. The SF has
+	     * handed us defined values in only the constant offset
+	     * field of the setup reg.
+	     */
 	    for (unsigned int c = 0; c < type->vector_elements; c++) {
-	       emit(fs_inst(BRW_OPCODE_MUL,
-			    attr,
+	       struct brw_reg interp = interp_reg(location, c);
+	       interp = suboffset(interp, 3);
+	       emit(fs_inst(FS_OPCODE_CINTERP, attr, fs_reg(interp)));
+	       attr.reg_offset++;
+	    }
+	 } else {
+	    /* Perspective interpolation case. */
+	    for (unsigned int c = 0; c < type->vector_elements; c++) {
+	       struct brw_reg interp = interp_reg(location, c);
+	       emit(fs_inst(FS_OPCODE_LINTERP,
 			    attr,
-			    this->pixel_w));
+			    this->delta_x,
+			    this->delta_y,
+			    fs_reg(interp)));
 	       attr.reg_offset++;
 	    }
+
+	    if (intel->gen < 6) {
+	       attr.reg_offset -= type->vector_elements;
+	       for (unsigned int c = 0; c < type->vector_elements; c++) {
+		  emit(fs_inst(BRW_OPCODE_MUL,
+			       attr,
+			       attr,
+			       this->pixel_w));
+		  attr.reg_offset++;
+	       }
+	    }
 	 }
 	 location++;
       }
@@ -600,8 +632,13 @@ fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
     * might be able to do better by doing execsize = 1 math and then
     * expanding that result out, but we would need to be careful with
     * masking.
+    *
+    * The hardware ignores source modifiers (negate and abs) on math
+    * instructions, so we also move to a temp to set those up.
     */
-   if (intel->gen >= 6 && src.file == UNIFORM) {
+   if (intel->gen >= 6 && (src.file == UNIFORM ||
+			   src.abs ||
+			   src.negate)) {
       fs_reg expanded = fs_reg(this, glsl_type::float_type);
       emit(fs_inst(BRW_OPCODE_MOV, expanded, src));
       src = expanded;
@@ -765,6 +802,30 @@ fs_visitor::try_emit_saturate(ir_expression *ir)
    return true;
 }
 
+/* Translate a GLSL IR comparison operation into the BRW_CONDITIONAL_*
+ * modifier that tests it.  Non-comparison operations trip the assert. */
+static uint32_t
+brw_conditional_for_comparison(unsigned int op)
+{
+   if (op == ir_binop_less)
+      return BRW_CONDITIONAL_L;
+   if (op == ir_binop_greater)
+      return BRW_CONDITIONAL_G;
+   if (op == ir_binop_lequal)
+      return BRW_CONDITIONAL_LE;
+   if (op == ir_binop_gequal)
+      return BRW_CONDITIONAL_GE;
+   /* all_equal / any_nequal are the same as equal / nequal for scalars */
+   if (op == ir_binop_equal || op == ir_binop_all_equal)
+      return BRW_CONDITIONAL_Z;
+   if (op == ir_binop_nequal || op == ir_binop_any_nequal)
+      return BRW_CONDITIONAL_NZ;
+
+   /* Fall back to NZ when asserts are compiled out. */
+   assert(!"not reached: bad operation for comparison");
+   return BRW_CONDITIONAL_NZ;
+}
+
 void
 fs_visitor::visit(ir_expression *ir)
 {
@@ -814,6 +875,7 @@ fs_visitor::visit(ir_expression *ir)
       break;
    case ir_unop_abs:
       op[0].abs = true;
+      op[0].negate = false;
       this->result = op[0];
       break;
    case ir_unop_sign:
@@ -880,35 +942,20 @@ fs_visitor::visit(ir_expression *ir)
       break;
 
    case ir_binop_less:
-      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
-      inst->conditional_mod = BRW_CONDITIONAL_L;
-      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
-      break;
    case ir_binop_greater:
-      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
-      inst->conditional_mod = BRW_CONDITIONAL_G;
-      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
-      break;
    case ir_binop_lequal:
-      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
-      inst->conditional_mod = BRW_CONDITIONAL_LE;
-      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
-      break;
    case ir_binop_gequal:
-      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
-      inst->conditional_mod = BRW_CONDITIONAL_GE;
-      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
-      break;
    case ir_binop_equal:
-   case ir_binop_all_equal: /* same as nequal for scalars */
-      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
-      inst->conditional_mod = BRW_CONDITIONAL_Z;
-      emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
-      break;
+   case ir_binop_all_equal:
    case ir_binop_nequal:
-   case ir_binop_any_nequal: /* same as nequal for scalars */
-      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
-      inst->conditional_mod = BRW_CONDITIONAL_NZ;
+   case ir_binop_any_nequal:
+      temp = this->result;
+      /* original gen4 does implicit conversion before comparison. */
+      if (intel->gen < 5)
+	 temp.type = op[0].type;
+
+      inst = emit(fs_inst(BRW_OPCODE_CMP, temp, op[0], op[1]));
+      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
       emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
       break;
 
@@ -933,6 +980,10 @@ fs_visitor::visit(ir_expression *ir)
       assert(!"not reached: should be handled by lower_noise");
       break;
 
+   case ir_quadop_vector:
+      assert(!"not reached: should be handled by lower_quadop_vector");
+      break;
+
    case ir_unop_sqrt:
       emit_math(FS_OPCODE_SQRT, this->result, op[0]);
       break;
@@ -949,7 +1000,12 @@ fs_visitor::visit(ir_expression *ir)
       break;
    case ir_unop_f2b:
    case ir_unop_i2b:
-      inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], fs_reg(0.0f)));
+      temp = this->result;
+      /* original gen4 does implicit conversion before comparison. */
+      if (intel->gen < 5)
+	 temp.type = op[0].type;
+
+      inst = emit(fs_inst(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f)));
       inst->conditional_mod = BRW_CONDITIONAL_NZ;
       inst = emit(fs_inst(BRW_OPCODE_AND, this->result,
 			  this->result, fs_reg(1)));
@@ -1423,28 +1479,70 @@ fs_visitor::visit(ir_discard *ir)
 void
 fs_visitor::visit(ir_constant *ir)
 {
-   fs_reg reg(this, ir->type);
-   this->result = reg;
+   /* Set this->result to reg at the bottom of the function because some code
+    * paths will cause this visitor to be applied to other fields.  This will
+    * cause the value stored in this->result to be modified.
+    *
+    * Make reg constant so that it doesn't get accidentally modified along the
+    * way.  Yes, I actually had this problem. :(
+    */
+   const fs_reg reg(this, ir->type);
+   fs_reg dst_reg = reg;
 
-   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
-      switch (ir->type->base_type) {
-      case GLSL_TYPE_FLOAT:
-	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.f[i])));
-	 break;
-      case GLSL_TYPE_UINT:
-	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.u[i])));
-	 break;
-      case GLSL_TYPE_INT:
-	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.i[i])));
-	 break;
-      case GLSL_TYPE_BOOL:
-	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg((int)ir->value.b[i])));
-	 break;
-      default:
-	 assert(!"Non-float/uint/int/bool constant");
+   if (ir->type->is_array()) {
+      const unsigned size = type_size(ir->type->fields.array);
+
+      for (unsigned i = 0; i < ir->type->length; i++) {
+	 ir->array_elements[i]->accept(this);
+	 fs_reg src_reg = this->result;
+
+	 dst_reg.type = src_reg.type;
+	 for (unsigned j = 0; j < size; j++) {
+	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, src_reg));
+	    src_reg.reg_offset++;
+	    dst_reg.reg_offset++;
+	 }
+      }
+   } else if (ir->type->is_record()) {
+      foreach_list(node, &ir->components) {
+	 ir_instruction *const field = (ir_instruction *) node;
+	 const unsigned size = type_size(field->type);
+
+	 field->accept(this);
+	 fs_reg src_reg = this->result;
+
+	 dst_reg.type = src_reg.type;
+	 for (unsigned j = 0; j < size; j++) {
+	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, src_reg));
+	    src_reg.reg_offset++;
+	    dst_reg.reg_offset++;
+	 }
+      }
+   } else {
+      const unsigned size = type_size(ir->type);
+
+      for (unsigned i = 0; i < size; i++) {
+	 switch (ir->type->base_type) {
+	 case GLSL_TYPE_FLOAT:
+	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i])));
+	    break;
+	 case GLSL_TYPE_UINT:
+	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i])));
+	    break;
+	 case GLSL_TYPE_INT:
+	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i])));
+	    break;
+	 case GLSL_TYPE_BOOL:
+	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i])));
+	    break;
+	 default:
+	    assert(!"Non-float/uint/int/bool constant");
+	 }
+	 dst_reg.reg_offset++;
       }
-      reg.reg_offset++;
    }
+
+   this->result = reg;
 }
 
 void
@@ -1490,7 +1588,7 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
 	    inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d,
 				op[0], fs_reg(0.0f)));
 	 } else {
-	    inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_d, op[0]));
+	    inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_f, op[0]));
 	 }
 	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
 	 break;
@@ -1505,31 +1603,18 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
 	 break;
 
       case ir_binop_greater:
-	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1]));
-	 inst->conditional_mod = BRW_CONDITIONAL_G;
-	 break;
       case ir_binop_gequal:
-	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1]));
-	 inst->conditional_mod = BRW_CONDITIONAL_GE;
-	 break;
       case ir_binop_less:
-	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1]));
-	 inst->conditional_mod = BRW_CONDITIONAL_L;
-	 break;
       case ir_binop_lequal:
-	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1]));
-	 inst->conditional_mod = BRW_CONDITIONAL_LE;
-	 break;
       case ir_binop_equal:
       case ir_binop_all_equal:
-	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1]));
-	 inst->conditional_mod = BRW_CONDITIONAL_Z;
-	 break;
       case ir_binop_nequal:
       case ir_binop_any_nequal:
-	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1]));
-	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]));
+	 inst->conditional_mod =
+	    brw_conditional_for_comparison(expr->operation);
 	 break;
+
       default:
 	 assert(!"not reached");
 	 this->fail = true;
@@ -1574,7 +1659,7 @@ fs_visitor::emit_if_gen6(ir_if *ir)
 
       switch (expr->operation) {
       case ir_unop_logic_not:
-	 inst = emit(fs_inst(BRW_OPCODE_IF, temp, op[0], fs_reg(1)));
+	 inst = emit(fs_inst(BRW_OPCODE_IF, temp, op[0], fs_reg(0)));
 	 inst->conditional_mod = BRW_CONDITIONAL_Z;
 	 return;
 
@@ -1608,30 +1693,16 @@ fs_visitor::emit_if_gen6(ir_if *ir)
 	 return;
 
       case ir_binop_greater:
-	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
-	 inst->conditional_mod = BRW_CONDITIONAL_G;
-	 return;
       case ir_binop_gequal:
-	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
-	 inst->conditional_mod = BRW_CONDITIONAL_GE;
-	 return;
       case ir_binop_less:
-	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
-	 inst->conditional_mod = BRW_CONDITIONAL_L;
-	 return;
       case ir_binop_lequal:
-	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
-	 inst->conditional_mod = BRW_CONDITIONAL_LE;
-	 return;
       case ir_binop_equal:
       case ir_binop_all_equal:
-	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
-	 inst->conditional_mod = BRW_CONDITIONAL_Z;
-	 return;
       case ir_binop_nequal:
       case ir_binop_any_nequal:
 	 inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
-	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
+	 inst->conditional_mod =
+	    brw_conditional_for_comparison(expr->operation);
 	 return;
       default:
 	 assert(!"not reached");
@@ -1713,32 +1784,9 @@ fs_visitor::visit(ir_loop *ir)
       this->base_ir = ir->to;
       ir->to->accept(this);
 
-      fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d,
+      fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_cmp,
 				   counter, this->result));
-      switch (ir->cmp) {
-      case ir_binop_equal:
-	 inst->conditional_mod = BRW_CONDITIONAL_Z;
-	 break;
-      case ir_binop_nequal:
-	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
-	 break;
-      case ir_binop_gequal:
-	 inst->conditional_mod = BRW_CONDITIONAL_GE;
-	 break;
-      case ir_binop_lequal:
-	 inst->conditional_mod = BRW_CONDITIONAL_LE;
-	 break;
-      case ir_binop_greater:
-	 inst->conditional_mod = BRW_CONDITIONAL_G;
-	 break;
-      case ir_binop_less:
-	 inst->conditional_mod = BRW_CONDITIONAL_L;
-	 break;
-      default:
-	 assert(!"not reached: unknown loop condition");
-	 this->fail = true;
-	 break;
-      }
+      inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);
 
       inst = emit(fs_inst(BRW_OPCODE_BREAK));
       inst->predicated = true;
@@ -1951,7 +1999,7 @@ fs_visitor::emit_interpolation_setup_gen6()
    emit(fs_inst(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y));
 
    this->current_annotation = "compute 1/pos.w";
-   this->wpos_w = fs_reg(brw_vec8_grf(c->key.source_w_reg, 0));
+   this->wpos_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
    this->pixel_w = fs_reg(this, glsl_type::float_type);
    emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
 
@@ -1979,17 +2027,17 @@ fs_visitor::emit_fb_writes()
       nr += 2;
    }
 
-   if (c->key.aa_dest_stencil_reg) {
+   if (c->aa_dest_stencil_reg) {
       emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
-		   fs_reg(brw_vec8_grf(c->key.aa_dest_stencil_reg, 0))));
+		   fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0))));
    }
 
    /* Reserve space for color. It'll be filled in per MRT below. */
    int color_mrf = nr;
    nr += 4;
 
-   if (c->key.source_depth_to_render_target) {
-      if (c->key.computes_depth) {
+   if (c->source_depth_to_render_target) {
+      if (c->computes_depth) {
 	 /* Hand over gl_FragDepth. */
 	 assert(this->frag_depth);
 	 fs_reg depth = *(variable_storage(this->frag_depth));
@@ -1998,20 +2046,22 @@ fs_visitor::emit_fb_writes()
       } else {
 	 /* Pass through the payload depth. */
 	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
-		      fs_reg(brw_vec8_grf(c->key.source_depth_reg, 0))));
+		      fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
       }
    }
 
-   if (c->key.dest_depth_reg) {
+   if (c->dest_depth_reg) {
       emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
-		   fs_reg(brw_vec8_grf(c->key.dest_depth_reg, 0))));
+		   fs_reg(brw_vec8_grf(c->dest_depth_reg, 0))));
    }
 
    fs_reg color = reg_undef;
    if (this->frag_color)
       color = *(variable_storage(this->frag_color));
-   else if (this->frag_data)
+   else if (this->frag_data) {
       color = *(variable_storage(this->frag_data));
+      color.type = BRW_REGISTER_TYPE_F;
+   }
 
    for (int target = 0; target < c->key.nr_color_regions; target++) {
       this->current_annotation = talloc_asprintf(this->mem_ctx,
@@ -2105,7 +2155,8 @@ fs_visitor::generate_fb_write(fs_inst *inst)
 		inst->target,
 		inst->mlen,
 		0,
-		eot);
+		eot,
+		inst->header_present);
 }
 
 void
@@ -2452,7 +2503,7 @@ fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
 void
 fs_visitor::assign_curb_setup()
 {
-   c->prog_data.first_curbe_grf = c->key.nr_payload_regs;
+   c->prog_data.first_curbe_grf = c->nr_payload_regs;
    c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
 
    /* Map the offsets in the UNIFORM file to fixed HW regs. */
@@ -2522,12 +2573,15 @@ fs_visitor::assign_urb_setup()
    foreach_iter(exec_list_iterator, iter, this->instructions) {
       fs_inst *inst = (fs_inst *)iter.get();
 
-      if (inst->opcode != FS_OPCODE_LINTERP)
-	 continue;
-
-      assert(inst->src[2].file == FIXED_HW_REG);
+      if (inst->opcode == FS_OPCODE_LINTERP) {
+	 assert(inst->src[2].file == FIXED_HW_REG);
+	 inst->src[2].fixed_hw_reg.nr += urb_start;
+      }
 
-      inst->src[2].fixed_hw_reg.nr += urb_start;
+      if (inst->opcode == FS_OPCODE_CINTERP) {
+	 assert(inst->src[0].file == FIXED_HW_REG);
+	 inst->src[0].fixed_hw_reg.nr += urb_start;
+      }
    }
 
    this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
@@ -2618,6 +2672,7 @@ fs_visitor::split_virtual_grfs()
 	 }
       }
    }
+   this->live_intervals_valid = false;
 }
 
 /**
@@ -2692,8 +2747,11 @@ fs_visitor::calculate_live_intervals()
    int loop_start = 0;
    int bb_header_ip = 0;
 
+   if (this->live_intervals_valid)
+      return;
+
    for (int i = 0; i < num_vars; i++) {
-      def[i] = 1 << 30;
+      def[i] = MAX_INSTRUCTION;
       use[i] = -1;
    }
 
@@ -2771,6 +2829,8 @@ fs_visitor::calculate_live_intervals()
    talloc_free(this->virtual_grf_use);
    this->virtual_grf_def = def;
    this->virtual_grf_use = use;
+
+   this->live_intervals_valid = true;
 }
 
 /**
@@ -2786,6 +2846,8 @@ fs_visitor::propagate_constants()
 {
    bool progress = false;
 
+   calculate_live_intervals();
+
    foreach_iter(exec_list_iterator, iter, this->instructions) {
       fs_inst *inst = (fs_inst *)iter.get();
 
@@ -2843,6 +2905,7 @@ fs_visitor::propagate_constants()
 		  /* Fit this constant in by commuting the operands */
 		  scan_inst->src[0] = scan_inst->src[1];
 		  scan_inst->src[1] = inst->src[0];
+		  progress = true;
 	       }
 	       break;
 	    case BRW_OPCODE_CMP:
@@ -2863,6 +2926,9 @@ fs_visitor::propagate_constants()
       }
    }
 
+   if (progress)
+       this->live_intervals_valid = false;
+
    return progress;
 }
 /**
@@ -2877,6 +2943,8 @@ fs_visitor::dead_code_eliminate()
    bool progress = false;
    int pc = 0;
 
+   calculate_live_intervals();
+
    foreach_iter(exec_list_iterator, iter, this->instructions) {
       fs_inst *inst = (fs_inst *)iter.get();
 
@@ -2888,6 +2956,9 @@ fs_visitor::dead_code_eliminate()
       pc++;
    }
 
+   if (progress)
+      live_intervals_valid = false;
+
    return progress;
 }
 
@@ -2895,10 +2966,35 @@ bool
 fs_visitor::register_coalesce()
 {
    bool progress = false;
+   int if_depth = 0;
+   int loop_depth = 0;
 
    foreach_iter(exec_list_iterator, iter, this->instructions) {
       fs_inst *inst = (fs_inst *)iter.get();
 
+      /* Make sure that we dominate the instructions we're going to
+       * scan for interfering with our coalescing, or we won't have
+       * scanned enough to see if anything interferes with our
+       * coalescing.  We don't dominate the following instructions if
+       * we're in a loop or an if block.
+       */
+      switch (inst->opcode) {
+      case BRW_OPCODE_DO:
+	 loop_depth++;
+	 break;
+      case BRW_OPCODE_WHILE:
+	 loop_depth--;
+	 break;
+      case BRW_OPCODE_IF:
+	 if_depth++;
+	 break;
+      case BRW_OPCODE_ENDIF:
+	 if_depth--;
+	 break;
+      }
+      if (loop_depth || if_depth)
+	 continue;
+
       if (inst->opcode != BRW_OPCODE_MOV ||
 	  inst->predicated ||
 	  inst->saturate ||
@@ -2916,14 +3012,6 @@ fs_visitor::register_coalesce()
       for (; scan_iter.has_next(); scan_iter.next()) {
 	 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
 
-	 if (scan_inst->opcode == BRW_OPCODE_DO ||
-	     scan_inst->opcode == BRW_OPCODE_WHILE ||
-	     scan_inst->opcode == BRW_OPCODE_ENDIF) {
-	    interfered = true;
-	    iter = scan_iter;
-	    break;
-	 }
-
 	 if (scan_inst->dst.file == GRF) {
 	    if (scan_inst->dst.reg == inst->dst.reg &&
 		(scan_inst->dst.reg_offset == inst->dst.reg_offset ||
@@ -2943,10 +3031,6 @@ fs_visitor::register_coalesce()
 	 continue;
       }
 
-      /* Update live interval so we don't have to recalculate. */
-      this->virtual_grf_use[inst->src[0].reg] = MAX2(virtual_grf_use[inst->src[0].reg],
-						     virtual_grf_use[inst->dst.reg]);
-
       /* Rewrite the later usage to point at the source of the move to
        * be removed.
        */
@@ -2971,6 +3055,9 @@ fs_visitor::register_coalesce()
       progress = true;
    }
 
+   if (progress)
+      live_intervals_valid = false;
+
    return progress;
 }
 
@@ -2981,6 +3068,8 @@ fs_visitor::compute_to_mrf()
    bool progress = false;
    int next_ip = 0;
 
+   calculate_live_intervals();
+
    foreach_iter(exec_list_iterator, iter, this->instructions) {
       fs_inst *inst = (fs_inst *)iter.get();
 
@@ -3184,15 +3273,16 @@ fs_visitor::virtual_grf_interferes(int a, int b)
    int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
    int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
 
-   /* For dead code, just check if the def interferes with the other range. */
-   if (this->virtual_grf_use[a] == -1) {
-      return (this->virtual_grf_def[a] >= this->virtual_grf_def[b] &&
-	      this->virtual_grf_def[a] < this->virtual_grf_use[b]);
-   }
-   if (this->virtual_grf_use[b] == -1) {
-      return (this->virtual_grf_def[b] >= this->virtual_grf_def[a] &&
-	      this->virtual_grf_def[b] < this->virtual_grf_use[a]);
-   }
+   /* We can't handle dead register writes here, without iterating
+    * over the whole instruction stream to find every single dead
+    * write to that register to compare to the live interval of the
+    * other register.  Just assert that dead_code_eliminate() has been
+    * called.
+    */
+   assert((this->virtual_grf_use[a] != -1 ||
+	   this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
+	  (this->virtual_grf_use[b] != -1 ||
+	   this->virtual_grf_def[b] == MAX_INSTRUCTION));
 
    return start < end;
 }
@@ -3227,6 +3317,7 @@ static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
 	 break;
       default:
 	 assert(!"not reached");
+	 brw_reg = brw_null_reg();
 	 break;
       }
       break;
@@ -3241,6 +3332,10 @@ static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
       assert(!"not reached");
       brw_reg = brw_null_reg();
       break;
+   default:
+      assert(!"not reached");
+      brw_reg = brw_null_reg();
+      break;
    }
    if (reg->abs)
       brw_reg = brw_abs(brw_reg);
@@ -3373,10 +3468,6 @@ fs_visitor::generate_code()
 	 break;
 
       case BRW_OPCODE_DO:
-	 /* FINISHME: We need to write the loop instruction support still. */
-	 if (intel->gen >= 6)
-	    this->fail = true;
-
 	 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
 	 if_depth_in_loop[loop_stack_depth] = 0;
 	 break;
@@ -3386,7 +3477,11 @@ fs_visitor::generate_code()
 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 	 break;
       case BRW_OPCODE_CONTINUE:
-	 brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
+	 /* Gen6+ emits CONT relative to the enclosing DO; older gens pass the IF depth. */
+	 if (intel->gen >= 6)
+	    brw_CONT_gen6(p, loop_stack[loop_stack_depth - 1]);
+	 else
+	    brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 	 break;
 
@@ -3400,16 +3495,18 @@ fs_visitor::generate_code()
 	 assert(loop_stack_depth > 0);
 	 loop_stack_depth--;
 	 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
-	 /* patch all the BREAK/CONT instructions from last BGNLOOP */
-	 while (inst0 > loop_stack[loop_stack_depth]) {
-	    inst0--;
-	    if (inst0->header.opcode == BRW_OPCODE_BREAK &&
-		inst0->bits3.if_else.jump_count == 0) {
-	       inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
+	 if (intel->gen < 6) {
+	    /* patch all the BREAK/CONT instructions from last BGNLOOP */
+	    while (inst0 > loop_stack[loop_stack_depth]) {
+	       inst0--;
+	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
+		   inst0->bits3.if_else.jump_count == 0) {
+		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
 	    }
-	    else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
-		     inst0->bits3.if_else.jump_count == 0) {
-	       inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
+	       else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
+			inst0->bits3.if_else.jump_count == 0) {
+		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
+	       }
 	    }
 	 }
       }
@@ -3425,6 +3522,9 @@ fs_visitor::generate_code()
       case FS_OPCODE_COS:
 	 generate_math(inst, dst, src);
 	 break;
+      case FS_OPCODE_CINTERP:
+	 brw_MOV(p, dst, src[0]);
+	 break;
       case FS_OPCODE_LINTERP:
 	 generate_linterp(inst, dst, src);
 	 break;
@@ -3486,6 +3586,26 @@ fs_visitor::generate_code()
 
       last_native_inst = p->nr_insn;
    }
+
+   brw_set_uip_jip(p);
+
+   /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
+    * emit issues, it doesn't get the jump distances into the output,
+    * which is often something we want to debug.  So this is here in
+    * case you're doing that.
+    */
+   if (0) {
+      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
+	 for (unsigned int i = 0; i < p->nr_insn; i++) {
+	    printf("0x%08x 0x%08x 0x%08x 0x%08x ",
+		   ((uint32_t *)&p->store[i])[3],
+		   ((uint32_t *)&p->store[i])[2],
+		   ((uint32_t *)&p->store[i])[1],
+		   ((uint32_t *)&p->store[i])[0]);
+	    brw_disasm(stdout, &p->store[i], intel->gen);
+	 }
+      }
+   }
 }
 
 GLboolean
@@ -3553,7 +3673,6 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
 
 	 progress = v.remove_duplicate_mrf_writes() || progress;
 
-	 v.calculate_live_intervals();
 	 progress = v.propagate_constants() || progress;
 	 progress = v.register_coalesce() || progress;
 	 progress = v.compute_to_mrf() || progress;
@@ -3566,7 +3685,6 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
 	 for (int i = 1; i < virtual_grf_count; i++) {
 	    v.spill_reg(i);
 	 }
-	 v.calculate_live_intervals();
       }
 
       if (0)
@@ -3575,8 +3693,6 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
 	 while (!v.assign_regs()) {
 	    if (v.fail)
 	       break;
-
-	    v.calculate_live_intervals();
 	 }
       }
    }
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index de7b153..82d96f6 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -68,6 +68,7 @@ enum fs_opcodes {
    FS_OPCODE_COS,
    FS_OPCODE_DDX,
    FS_OPCODE_DDY,
+   FS_OPCODE_CINTERP,
    FS_OPCODE_LINTERP,
    FS_OPCODE_TEX,
    FS_OPCODE_TXB,
@@ -348,6 +349,23 @@ public:
 					  hash_table_pointer_hash,
 					  hash_table_pointer_compare);
 
+      /* There's a question that appears to be left open in the spec:
+       * How do implicit dst conversions interact with the CMP
+       * instruction or conditional mods?  On gen6, the instruction:
+       *
+       * CMP null<d> src0<f> src1<f>
+       *
+       * will do src1 - src0 and compare that result as if it was an
+       * integer.  On gen4, it will do src1 - src0 as float, convert
+       * the result to int, and compare as int.  In between, it
+       * appears that it does src1 - src0 and does the compare in the
+       * execution type so dst type doesn't matter.
+       */
+      if (this->intel->gen > 4)
+	 this->reg_null_cmp = reg_null_d;
+      else
+	 this->reg_null_cmp = reg_null_f;
+
       this->frag_color = NULL;
       this->frag_data = NULL;
       this->frag_depth = NULL;
@@ -361,6 +379,7 @@ public:
       this->virtual_grf_array_size = 0;
       this->virtual_grf_def = NULL;
       this->virtual_grf_use = NULL;
+      this->live_intervals_valid = false;
 
       this->kill_emitted = false;
    }
@@ -462,6 +481,7 @@ public:
    int virtual_grf_array_size;
    int *virtual_grf_def;
    int *virtual_grf_use;
+   bool live_intervals_valid;
 
    struct hash_table *variable_ht;
    ir_variable *frag_color, *frag_data, *frag_depth;
@@ -485,6 +505,7 @@ public:
    fs_reg pixel_w;
    fs_reg delta_x;
    fs_reg delta_y;
+   fs_reg reg_null_cmp;
 
    int grf_used;
 };
diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
index 3b7b03a..20bfa4c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
@@ -205,6 +205,8 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
    case ir_unop_round_even:
    case ir_unop_sin:
    case ir_unop_cos:
+   case ir_unop_sin_reduced:
+   case ir_unop_cos_reduced:
    case ir_unop_dFdx:
    case ir_unop_dFdy:
       for (i = 0; i < vector_elements; i++) {
@@ -328,6 +330,9 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
    case ir_unop_noise:
       assert(!"noise should have been broken down to function call");
       break;
+   case ir_quadop_vector:
+      assert(!"should have been lowered");
+      break;
    }
 
    ir->remove();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index bbb210c..078a349 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -94,6 +94,8 @@ fs_visitor::assign_regs()
    int class_count = 0;
    int aligned_pair_class = -1;
 
+   calculate_live_intervals();
+
    /* Set up the register classes.
     *
     * The base registers store a scalar value.  For texture samples,
@@ -416,4 +418,6 @@ fs_visitor::spill_reg(int spill_reg)
 	 }
       }
    }
+
+   this->live_intervals_valid = false;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index b0c76f4..70c451d 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -96,6 +96,9 @@ static void compile_gs_prog( struct brw_context *brw,
       brw_gs_quad_strip( &c, key );
       break;
    case GL_LINE_LOOP:
+      /* Gen6: LINELOOP is converted to LINESTRIP at the beginning of the 3D pipeline */
+      if (intel->gen == 6)
+          return;
       brw_gs_lines( &c );
       break;
    case GL_LINES:
@@ -166,6 +169,9 @@ static void populate_key( struct brw_context *brw,
 			  struct brw_gs_prog_key *key )
 {
    struct gl_context *ctx = &brw->intel.ctx;
+   struct intel_context *intel = &brw->intel;
+   int prim_gs_always;
+
    memset(key, 0, sizeof(*key));
 
    /* CACHE_NEW_VS_PROG */
@@ -185,10 +191,14 @@ static void populate_key( struct brw_context *brw,
       key->pv_first = GL_TRUE;
    }
 
-   key->need_gs_prog = (key->hint_gs_always ||
-			brw->primitive == GL_QUADS ||
+   if (intel->gen == 6)
+       prim_gs_always = 0;
+   else
+       prim_gs_always = brw->primitive == GL_QUADS ||
 			brw->primitive == GL_QUAD_STRIP ||
-			brw->primitive == GL_LINE_LOOP);
+			brw->primitive == GL_LINE_LOOP;
+
+   key->need_gs_prog = (key->hint_gs_always || prim_gs_always);
 }
 
 /* Calculate interpolants for triangle and line rasterization.
@@ -205,8 +215,10 @@ static void prepare_gs_prog(struct brw_context *brw)
       brw->gs.prog_active = key.need_gs_prog;
    }
 
+   drm_intel_bo_unreference(brw->gs.prog_bo);
+   brw->gs.prog_bo = NULL;
+
    if (brw->gs.prog_active) {
-      drm_intel_bo_unreference(brw->gs.prog_bo);
       brw->gs.prog_bo = brw_search_cache(&brw->cache, BRW_GS_PROG,
 					 &key, sizeof(key),
 					 NULL, 0,
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index a91b052..79afe19 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -214,7 +214,7 @@ static void emit_depthbuffer(struct brw_context *brw)
 
    if (region == NULL) {
       BEGIN_BATCH(len);
-      OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
+      OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (len - 2));
       OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) |
 		(BRW_SURFACE_NULL << 29));
       OUT_BATCH(0);
@@ -251,7 +251,7 @@ static void emit_depthbuffer(struct brw_context *brw)
 	 assert(region->tiling != I915_TILING_NONE);
 
       BEGIN_BATCH(len);
-      OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
+      OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (len - 2));
       OUT_BATCH(((region->pitch * region->cpp) - 1) |
 		(format << 18) |
 		(BRW_TILEWALK_YMAJOR << 26) |
@@ -277,7 +277,7 @@ static void emit_depthbuffer(struct brw_context *brw)
    /* Initialize it for safety. */
    if (intel->gen >= 6) {
       BEGIN_BATCH(2);
-      OUT_BATCH(CMD_3D_CLEAR_PARAMS << 16 | (2 - 2));
+      OUT_BATCH(_3DSTATE_CLEAR_PARAMS << 16 | (2 - 2));
       OUT_BATCH(0);
       ADVANCE_BATCH();
    }
@@ -309,7 +309,7 @@ static void upload_polygon_stipple(struct brw_context *brw)
       return;
 
    memset(&bps, 0, sizeof(bps));
-   bps.header.opcode = CMD_POLY_STIPPLE_PATTERN;
+   bps.header.opcode = _3DSTATE_POLY_STIPPLE_PATTERN;
    bps.header.length = sizeof(bps)/4-2;
 
    /* Polygon stipple is provided in OpenGL order, i.e. bottom
@@ -354,7 +354,7 @@ static void upload_polygon_stipple_offset(struct brw_context *brw)
       return;
 
    memset(&bpso, 0, sizeof(bpso));
-   bpso.header.opcode = CMD_POLY_STIPPLE_OFFSET;
+   bpso.header.opcode = _3DSTATE_POLY_STIPPLE_OFFSET;
    bpso.header.length = sizeof(bpso)/4-2;
 
    /* If we're drawing to a system window (ctx->DrawBuffer->Name == 0),
@@ -401,7 +401,7 @@ static void upload_aa_line_parameters(struct brw_context *brw)
 
    /* use legacy aa line coverage computation */
    memset(&balp, 0, sizeof(balp));
-   balp.header.opcode = CMD_AA_LINE_PARAMETERS;
+   balp.header.opcode = _3DSTATE_AA_LINE_PARAMETERS;
    balp.header.length = sizeof(balp) / 4 - 2;
    
    BRW_CACHED_BATCH_STRUCT(brw, &balp);
@@ -431,7 +431,7 @@ static void upload_line_stipple(struct brw_context *brw)
       return;
 
    memset(&bls, 0, sizeof(bls));
-   bls.header.opcode = CMD_LINE_STIPPLE_PATTERN;
+   bls.header.opcode = _3DSTATE_LINE_STIPPLE_PATTERN;
    bls.header.length = sizeof(bls)/4 - 2;
 
    bls.bits0.pattern = ctx->Line.StipplePattern;
@@ -481,7 +481,7 @@ static void upload_invarient_state( struct brw_context *brw )
 
       /* Disable depth offset clamping. 
        */
-      gdo.header.opcode = CMD_GLOBAL_DEPTH_OFFSET_CLAMP;
+      gdo.header.opcode = _3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP;
       gdo.header.length = sizeof(gdo)/4 - 2;
       gdo.depth_offset_clamp = 0.0;
 
@@ -492,20 +492,20 @@ static void upload_invarient_state( struct brw_context *brw )
       int i;
 
       BEGIN_BATCH(3);
-      OUT_BATCH(CMD_3D_MULTISAMPLE << 16 | (3 - 2));
+      OUT_BATCH(_3DSTATE_MULTISAMPLE << 16 | (3 - 2));
       OUT_BATCH(MS_PIXEL_LOCATION_CENTER |
 		MS_NUMSAMPLES_1);
       OUT_BATCH(0); /* positions for 4/8-sample */
       ADVANCE_BATCH();
 
       BEGIN_BATCH(2);
-      OUT_BATCH(CMD_3D_SAMPLE_MASK << 16 | (2 - 2));
+      OUT_BATCH(_3DSTATE_SAMPLE_MASK << 16 | (2 - 2));
       OUT_BATCH(1);
       ADVANCE_BATCH();
 
       for (i = 0; i < 4; i++) {
 	 BEGIN_BATCH(4);
-	 OUT_BATCH(CMD_GS_SVB_INDEX << 16 | (4 - 2));
+	 OUT_BATCH(_3DSTATE_GS_SVB_INDEX << 16 | (4 - 2));
 	 OUT_BATCH(i << SVB_INDEX_SHIFT);
 	 OUT_BATCH(0);
 	 OUT_BATCH(0xffffffff);
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 1367d81..94efa79 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -142,7 +142,6 @@ static GLboolean brwProgramStringNotify( struct gl_context *ctx,
       if (newFP == curFP)
 	 brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
       newFP->id = brw->program_id++;      
-      newFP->isGLSL = brw_wm_is_glsl(fprog);
 
       /* Don't reject fragment shaders for their Mesa IR state when we're
        * using the new FS backend.
diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
index f28f286..656aad6 100644
--- a/src/mesa/drivers/dri/i965/brw_queryobj.c
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -232,6 +232,12 @@ brw_prepare_query_begin(struct brw_context *brw)
       brw->query.bo = NULL;
 
       brw->query.bo = drm_intel_bo_alloc(intel->bufmgr, "query", 4096, 1);
+
+      /* clear target buffer */
+      drm_intel_bo_map(brw->query.bo, GL_TRUE);
+      memset((char *)brw->query.bo->virtual, 0, 4096);
+      drm_intel_bo_unmap(brw->query.bo);
+
       brw->query.index = 0;
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 3beed16..4bb93e7 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -164,7 +164,8 @@ void brw_destroy_caches( struct brw_context *brw );
 /***********************************************************************
  * brw_state_batch.c
  */
-#define BRW_BATCH_STRUCT(brw, s) intel_batchbuffer_data( brw->intel.batch, (s), sizeof(*(s)))
+#define BRW_BATCH_STRUCT(brw, s) intel_batchbuffer_data(brw->intel.batch, (s), \
+							sizeof(*(s)), false)
 #define BRW_CACHED_BATCH_STRUCT(brw, s) brw_cached_batch_struct( brw, (s), sizeof(*(s)) )
 
 GLboolean brw_cached_batch_struct( struct brw_context *brw,
diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c
index be3989e..a21af13 100644
--- a/src/mesa/drivers/dri/i965/brw_state_batch.c
+++ b/src/mesa/drivers/dri/i965/brw_state_batch.c
@@ -48,7 +48,7 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
    struct header *newheader = (struct header *)data;
 
    if (brw->emit_state_always) {
-      intel_batchbuffer_data(brw->intel.batch, data, sz);
+      intel_batchbuffer_data(brw->intel.batch, data, sz, false);
       return GL_TRUE;
    }
 
@@ -75,7 +75,7 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
 
  emit:
    memcpy(item->header, newheader, sz);
-   intel_batchbuffer_data(brw->intel.batch, data, sz);
+   intel_batchbuffer_data(brw->intel.batch, data, sz, false);
    return GL_TRUE;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index 58ff528..7045888 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -58,8 +58,6 @@
 
 #include "main/imports.h"
 #include "brw_state.h"
-#include "intel_batchbuffer.h"
-#include "brw_wm.h"
 
 #define FILE_DEBUG_FLAG DEBUG_STATE
 
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 338f387..eba4411 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -129,7 +129,7 @@ const struct brw_tracked_state *gen6_atoms[] =
 
    &brw_vs_constants, /* Before vs_surfaces and constant_buffer */
    &brw_wm_constants, /* Before wm_surfaces and constant_buffer */
-   &gen6_wm_constants, /* Before wm_surfaces and constant_buffer */
+   &gen6_wm_constants, /* Before wm_state */
 
    &brw_vs_surfaces,		/* must do before unit */
    &brw_wm_constant_surface,	/* must do before wm surfaces/bind bo */
diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h
index 8ce9af9..6687a89 100644
--- a/src/mesa/drivers/dri/i965/brw_structs.h
+++ b/src/mesa/drivers/dri/i965/brw_structs.h
@@ -1017,7 +1017,14 @@ struct brw_wm_unit_state
       GLuint enable_32_pix:1; 
       GLuint enable_con_32_pix:1;
       GLuint enable_con_64_pix:1;
-      GLuint pad0:5;
+      GLuint pad0:1;
+
+      /* These next four bits are for Ironlake+ */
+      GLuint fast_span_coverage_enable:1;
+      GLuint depth_buffer_clear:1;
+      GLuint depth_buffer_resolve_enable:1;
+      GLuint hierarchical_depth_buffer_resolve_enable:1;
+
       GLuint legacy_global_depth_bias:1; 
       GLuint line_stipple:1; 
       GLuint depth_offset:1; 
@@ -1064,6 +1071,15 @@ struct brw_sampler_default_color {
    GLfloat color[4];
 };
 
+struct gen5_sampler_default_color {
+   uint8_t ub[4];
+   float f[4];
+   uint16_t hf[4];
+   uint16_t us[4];
+   int16_t s[4];
+   uint8_t b[4];
+};
+
 struct brw_sampler_state
 {
    
@@ -1169,7 +1185,12 @@ struct brw_surface_state
       GLuint cube_neg_y:1; 
       GLuint cube_pos_x:1; 
       GLuint cube_neg_x:1; 
-      GLuint pad:4;
+      GLuint pad:2;
+      /* Required on gen6 for surfaces accessed through render cache messages.
+       */
+      GLuint render_cache_read_write:1;
+      /* Ironlake and newer: instead of replicating one of the texels */
+      GLuint cube_corner_average:1;
       GLuint mipmap_layout_mode:1; 
       GLuint vert_line_stride_ofs:1; 
       GLuint vert_line_stride:1; 
@@ -1539,6 +1560,21 @@ struct brw_instruction
 	 GLuint  pad0:12;
       } if_else;
 
+      struct
+      {
+	 /* Signed jump distance to the ip to jump to if all channels
+	  * are disabled after the break or continue.  It should point
+	  * to the end of the innermost control flow block, as that's
+	  * where some channel could get re-enabled.
+	  */
+	 int jip:16;
+
+	 /* Signed jump distance to the location to resume execution
+	  * of this channel if it's enabled for the break or continue.
+	  */
+	 int uip:16;
+      } break_cont;
+
       struct {
 	 GLuint function:4;
 	 GLuint int_type:1;
@@ -1636,6 +1672,18 @@ struct brw_instruction
 
       struct {
 	 GLuint binding_table_index:8;
+	 GLuint msg_control:3;
+	 GLuint msg_type:3;
+	 GLuint target_cache:2;
+	 GLuint response_length:4;
+	 GLuint msg_length:4;
+	 GLuint msg_target:4;
+	 GLuint pad1:3;
+	 GLuint end_of_thread:1;
+      } dp_read_g4x;
+
+      struct {
+	 GLuint binding_table_index:8;
 	 GLuint msg_control:3;  
 	 GLuint msg_type:3;  
 	 GLuint target_cache:2;    
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 4a41c7a..6ae75d2 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -99,8 +99,8 @@ static void do_vs_prog( struct brw_context *brw,
    (void) ctx;
 
    aux_size = sizeof(c.prog_data);
-   if (c.vp->use_const_buffer)
-      aux_size += c.vp->program.Base.Parameters->NumParameters;
+   /* constant_map */
+   aux_size += c.vp->program.Base.Parameters->NumParameters;
 
    drm_intel_bo_unreference(brw->vs.prog_bo);
    brw->vs.prog_bo = brw_upload_cache_with_auxdata(&brw->cache, BRW_VS_PROG,
@@ -130,6 +130,7 @@ static void brw_upload_vs_prog(struct brw_context *brw)
    key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled);
    key.copy_edgeflag = (ctx->Polygon.FrontMode != GL_FILL ||
 			ctx->Polygon.BackMode != GL_FILL);
+   key.two_side_color = (ctx->Light.Enabled && ctx->Light.Model.TwoSide);
 
    /* _NEW_POINT */
    if (ctx->Point.PointSprite) {
@@ -157,7 +158,7 @@ static void brw_upload_vs_prog(struct brw_context *brw)
  */
 const struct brw_tracked_state brw_vs_prog = {
    .dirty = {
-      .mesa  = _NEW_TRANSFORM | _NEW_POLYGON | _NEW_POINT,
+      .mesa  = _NEW_TRANSFORM | _NEW_POLYGON | _NEW_POINT | _NEW_LIGHT,
       .brw   = BRW_NEW_VERTEX_PROGRAM,
       .cache = 0
    },
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 9338a6b..0b88cc1 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -44,6 +44,7 @@ struct brw_vs_prog_key {
    GLuint nr_userclip:4;
    GLuint copy_edgeflag:1;
    GLuint point_coord_replace:8;
+   GLuint two_side_color: 1;
 };
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index a13c3ca..0411ce0 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -140,9 +140,13 @@ clear_current_const(struct brw_vs_compile *c)
 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 {
    struct intel_context *intel = &c->func.brw->intel;
-   GLuint i, reg = 0, mrf;
+   GLuint i, reg = 0, mrf, j;
    int attributes_in_vue;
    int first_reladdr_output;
+   int max_constant;
+   int constant = 0;
+   int vert_result_reoder[VERT_RESULT_MAX];
+   int bfc = 0;
 
    /* Determine whether to use a real constant buffer or use a block
     * of GRF registers for constants.  The later is faster but only
@@ -181,62 +185,81 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 
    }
 
-   /* Vertex program parameters from curbe:
+   /* Assign some (probably all) of the vertex program constants to
+    * the push constant buffer/CURBE.
+    *
+    * There's an obvious limit to the number of push constants equal to
+    * the number of registers available, and that number is smaller
+    * than the minimum maximum number of vertex program parameters, so
+    * support for pull constants is required if we overflow.
+    * Additionally, on gen6 the number of push constants is even
+    * lower.
+    *
+    * When there's relative addressing, we don't know what range of
+    * Mesa IR registers can be accessed.  And generally, when relative
+    * addressing is used we also have too many constants to load them
+    * all as push constants.  So, we'll just support relative
+    * addressing out of the pull constant buffers, and try to load as
+    * many statically-accessed constants into the push constant buffer
+    * as we can.
     */
-   if (c->vp->use_const_buffer) {
-      int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
-      int constant = 0;
-
-      /* We've got more constants than we can load with the push
-       * mechanism.  This is often correlated with reladdr loads where
-       * we should probably be using a pull mechanism anyway to avoid
-       * excessive reading.  However, the pull mechanism is slow in
-       * general.  So, we try to allocate as many non-reladdr-loaded
-       * constants through the push buffer as we can before giving up.
-       */
-      memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
-      for (i = 0;
-	   i < c->vp->program.Base.NumInstructions && constant < max_constant;
-	   i++) {
-	 struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
-	 int arg;
-
-	 for (arg = 0; arg < 3 && constant < max_constant; arg++) {
-	    if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
-		 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
-		 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
-		 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
-		 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
-		inst->SrcReg[arg].RelAddr)
-	       continue;
-
-	    if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
-	       c->constant_map[inst->SrcReg[arg].Index] = constant++;
-	    }
+   if (intel->gen >= 6) {
+      /* We can only load 32 regs of push constants. */
+      max_constant = 32 * 2 - c->key.nr_userclip;
+   } else {
+      max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
+   }
+
+   /* constant_map maps from ParameterValues[] index to index in the
+    * push constant buffer, or -1 if it's only in the pull constant
+    * buffer.
+    */
+   memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
+   for (i = 0;
+	i < c->vp->program.Base.NumInstructions && constant < max_constant;
+	i++) {
+      struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
+      int arg;
+
+      for (arg = 0; arg < 3 && constant < max_constant; arg++) {
+	 if (inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
+	     inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
+	     inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
+	     inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
+	     inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) {
+	    continue;
+	 }
+
+	 if (inst->SrcReg[arg].RelAddr) {
+	    c->vp->use_const_buffer = GL_TRUE;
+	    continue;
 	 }
-      }
 
-      for (i = 0; i < constant; i++) {
-         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
-							      (i%2) * 4),
-						 0, 4, 1);
+	 if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
+	    c->constant_map[inst->SrcReg[arg].Index] = constant++;
+	 }
       }
-      reg += (constant + 1) / 2;
-      c->prog_data.curb_read_length = reg - 1;
-      /* XXX 0 causes a bug elsewhere... */
-      c->prog_data.nr_params = MAX2(constant * 4, 4);
    }
-   else {
-      /* use a section of the GRF for constants */
-      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
-      for (i = 0; i < nr_params; i++) {
-         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
-      }
-      reg += (nr_params + 1) / 2;
-      c->prog_data.curb_read_length = reg - 1;
 
-      c->prog_data.nr_params = nr_params * 4;
+   /* If we ran out of push constant space, then we'll also upload all
+    * constants through the pull constant buffer so that they can be
+    * accessed no matter what.  For relative addressing (the common
+    * case) we need them all in place anyway.
+    */
+   if (constant == max_constant)
+      c->vp->use_const_buffer = GL_TRUE;
+
+   for (i = 0; i < constant; i++) {
+      c->regs[PROGRAM_STATE_VAR][i] = stride(brw_vec4_grf(reg + i / 2,
+							  (i % 2) * 4),
+					     0, 4, 1);
    }
+   reg += (constant + 1) / 2;
+   c->prog_data.curb_read_length = reg - 1;
+   c->prog_data.nr_params = constant * 4;
+   /* XXX 0 causes a bug elsewhere... */
+   if (intel->gen < 6 && c->prog_data.nr_params == 0)
+      c->prog_data.nr_params = 4;
 
    /* Allocate input regs:  
     */
@@ -270,7 +293,36 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
       mrf = 4;
 
    first_reladdr_output = get_first_reladdr_output(&c->vp->program);
-   for (i = 0; i < VERT_RESULT_MAX; i++) {
+
+   for (i = 0; i < VERT_RESULT_MAX; i++)
+       vert_result_reoder[i] = i;
+
+   /* adjust attribute order in VUE for BFC0/BFC1 on Gen6+ */
+   if (intel->gen >= 6 && c->key.two_side_color) {
+       if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL1)) &&
+           (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC1))) {
+           assert(c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0));
+           assert(c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0));
+           bfc = 2;
+       } else if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0)) &&
+           (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0)))
+           bfc = 1;
+
+       if (bfc) {
+           for (i = 0; i < bfc; i++) {
+               vert_result_reoder[VERT_RESULT_COL0 + i * 2 + 0] = VERT_RESULT_COL0 + i;
+               vert_result_reoder[VERT_RESULT_COL0 + i * 2 + 1] = VERT_RESULT_BFC0 + i;
+           }
+
+           for (i = VERT_RESULT_COL0 + bfc * 2; i < VERT_RESULT_BFC0 + bfc; i++) {
+               vert_result_reoder[i] = i - bfc;
+           }
+       }
+   }
+
+   for (j = 0; j < VERT_RESULT_MAX; j++) {
+      i = vert_result_reoder[j];
+
       if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
 	 c->nr_outputs++;
          assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
@@ -281,7 +333,6 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 	 else if (i == VERT_RESULT_PSIZ) {
 	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 	    reg++;
-	    mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
 	 }
 	 else {
 	    /* Two restrictions on our compute-to-MRF here.  The
@@ -607,6 +658,22 @@ static void emit_min( struct brw_compile *p,
    }
 }
 
+static void emit_arl(struct brw_compile *p,
+		     struct brw_reg dst,
+		     struct brw_reg src)
+{
+   struct intel_context *intel = &p->brw->intel;
+
+   if (intel->gen >= 6) {
+      struct brw_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
+
+      brw_RNDD(p, dst_f, src);
+      brw_MOV(p, dst, dst_f);
+   } else {
+      brw_RNDD(p, dst, src);
+   }
+}
+
 static void emit_math1_gen4(struct brw_vs_compile *c,
 			    GLuint function,
 			    struct brw_reg dst,
@@ -698,7 +765,7 @@ emit_math1(struct brw_vs_compile *c,
       emit_math1_gen4(c, function, dst, arg0, precision);
 }
 
-static void emit_math2( struct brw_vs_compile *c, 
+static void emit_math2_gen4( struct brw_vs_compile *c, 
 			GLuint function,
 			struct brw_reg dst,
 			struct brw_reg arg0,
@@ -706,14 +773,11 @@ static void emit_math2( struct brw_vs_compile *c,
 			GLuint precision)
 {
    struct brw_compile *p = &c->func;
-   struct intel_context *intel = &p->brw->intel;
    struct brw_reg tmp = dst;
    GLboolean need_tmp = GL_FALSE;
 
-   if (dst.file != BRW_GENERAL_REGISTER_FILE)
-      need_tmp = GL_TRUE;
-
-   if (intel->gen < 6 && dst.dw1.bits.writemask != 0xf)
+   if (dst.file != BRW_GENERAL_REGISTER_FILE ||
+       dst.dw1.bits.writemask != 0xf)
       need_tmp = GL_TRUE;
 
    if (need_tmp) 
@@ -736,6 +800,53 @@ static void emit_math2( struct brw_vs_compile *c,
    }
 }
 
+static void emit_math2_gen6( struct brw_vs_compile *c, 
+			GLuint function,
+			struct brw_reg dst,
+			struct brw_reg arg0,
+			struct brw_reg arg1,
+			GLuint precision)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg tmp_src0, tmp_src1, tmp_dst;
+
+   tmp_src0 = get_tmp(c);
+   tmp_src1 = get_tmp(c);
+   tmp_dst = get_tmp(c);
+
+   brw_MOV(p, tmp_src0, arg0);
+   brw_MOV(p, tmp_src1, arg1);
+   
+   brw_set_access_mode(p, BRW_ALIGN_1);
+   brw_math2(p,
+	    tmp_dst,
+	    function,
+	    tmp_src0,
+	    tmp_src1);
+   brw_set_access_mode(p, BRW_ALIGN_16);
+
+   brw_MOV(p, dst, tmp_dst);
+
+   release_tmp(c, tmp_src0);
+   release_tmp(c, tmp_src1);
+   release_tmp(c, tmp_dst);
+}
+
+static void emit_math2( struct brw_vs_compile *c, 
+			GLuint function,
+			struct brw_reg dst,
+			struct brw_reg arg0,
+			struct brw_reg arg1,
+			GLuint precision)
+{
+   struct brw_compile *p = &c->func;
+   struct intel_context *intel = &p->brw->intel;
+
+   if (intel->gen >= 6)
+      emit_math2_gen6(c, function, dst, arg0, arg1, precision);
+   else
+      emit_math2_gen4(c, function, dst, arg0, arg1, precision);
+}
 
 static void emit_exp_noalias( struct brw_vs_compile *c,
 			      struct brw_reg dst,
@@ -1008,8 +1119,6 @@ get_constant(struct brw_vs_compile *c,
 
    assert(argIndex < 3);
 
-   assert(c->func.brw->intel.gen < 6); /* FINISHME */
-
    if (c->current_const[argIndex].index != src->Index) {
       /* Keep track of the last constant loaded in this slot, for reuse. */
       c->current_const[argIndex].index = src->Index;
@@ -1027,7 +1136,7 @@ get_constant(struct brw_vs_compile *c,
    }
 
    /* replicate lower four floats into upper half (to get XYZWXYZW) */
-   const_reg = stride(const_reg, 0, 4, 0);
+   const_reg = stride(const_reg, 0, 4, 1);
    const_reg.subnr = 0;
 
    return const_reg;
@@ -1040,14 +1149,14 @@ get_reladdr_constant(struct brw_vs_compile *c,
 {
    const struct prog_src_register *src = &inst->SrcReg[argIndex];
    struct brw_compile *p = &c->func;
+   struct brw_context *brw = p->brw;
+   struct intel_context *intel = &brw->intel;
    struct brw_reg const_reg = c->current_const[argIndex].reg;
-   struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
-   struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);
+   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
+   uint32_t offset;
 
    assert(argIndex < 3);
 
-   assert(c->func.brw->intel.gen < 6); /* FINISHME */
-
    /* Can't reuse a reladdr constant load. */
    c->current_const[argIndex].index = -1;
 
@@ -1056,15 +1165,21 @@ get_reladdr_constant(struct brw_vs_compile *c,
 	  src->Index, argIndex, c->current_const[argIndex].reg.nr);
 #endif
 
-   brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16));
+   if (intel->gen >= 6) {
+      offset = src->Index;
+   } else {
+      struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);
+      brw_MUL(p, byte_addr_reg, addr_reg, brw_imm_d(16));
+      addr_reg = byte_addr_reg;
+      offset = 16 * src->Index;
+   }
 
    /* fetch the first vec4 */
    brw_dp_READ_4_vs_relative(p,
-			     const_reg,                     /* writeback dest */
-			     byte_addr_reg,                 /* address register */
-			     16 * src->Index,               /* byte offset */
-			     SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
-			     );
+			     const_reg,
+			     addr_reg,
+			     offset,
+			     SURF_INDEX_VERT_CONST_BUFFER);
 
    return const_reg;
 }
@@ -1259,22 +1374,18 @@ get_src_reg( struct brw_vs_compile *c,
    case PROGRAM_UNIFORM:
    case PROGRAM_ENV_PARAM:
    case PROGRAM_LOCAL_PARAM:
-      if (c->vp->use_const_buffer) {
-	 if (!relAddr && c->constant_map[index] != -1) {
-	    assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
-	    return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
-	 } else if (relAddr)
+      if (!relAddr && c->constant_map[index] != -1) {
+	 /* Take from the push constant buffer if possible. */
+	 assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
+	 return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
+      } else {
+	 /* Must be in the pull constant buffer then. */
+	 assert(c->vp->use_const_buffer);
+	 if (relAddr)
 	    return get_reladdr_constant(c, inst, argIndex);
 	 else
 	    return get_constant(c, inst, argIndex);
       }
-      else if (relAddr) {
-         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index, 16);
-      }
-      else {
-         assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
-         return c->regs[PROGRAM_STATE_VAR][index];
-      }
    case PROGRAM_ADDRESS:
       assert(index == 0);
       return c->regs[file][index];
@@ -1315,11 +1426,10 @@ static struct brw_reg get_arg( struct brw_vs_compile *c,
 					  GET_SWZ(src->Swizzle, 1),
 					  GET_SWZ(src->Swizzle, 2),
 					  GET_SWZ(src->Swizzle, 3));
-   }
 
-   /* Note this is ok for non-swizzle instructions: 
-    */
-   reg.negate = src->Negate ? 1 : 0;   
+      /* Note this is ok for non-swizzle ARB_vp instructions */
+      reg.negate = src->Negate ? 1 : 0;
+   }
 
    return reg;
 }
@@ -1603,6 +1713,8 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 	 break;
       if (!(c->prog_data.outputs_written & BITFIELD64_BIT(i)))
 	 continue;
+      if (i == VERT_RESULT_PSIZ)
+	 continue;
 
       if (i >= VERT_RESULT_TEX0 &&
 	  c->regs[PROGRAM_OUTPUT][i].file == BRW_GENERAL_REGISTER_FILE) {
@@ -1830,6 +1942,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
 
       switch (inst->Opcode) {
       case OPCODE_ABS:
+	 args[0].negate = false;
 	 brw_MOV(p, dst, brw_abs(args[0]));
 	 break;
       case OPCODE_ADD:
@@ -1866,7 +1979,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
       case OPCODE_ARL:
-	 brw_RNDD(p, dst, args[0]);
+	 emit_arl(p, dst, args[0]);
 	 break;
       case OPCODE_FLR:
 	 brw_RNDD(p, dst, args[0]);
@@ -1913,7 +2026,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
       case OPCODE_RSQ:
-	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
+	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, brw_abs(args[0]), BRW_MATH_PRECISION_FULL);
 	 break;
 
       case OPCODE_SEQ:
@@ -1987,35 +2100,42 @@ void brw_vs_emit(struct brw_vs_compile *c )
          break;
       case OPCODE_CONT:
 	 brw_set_predicate_control(p, get_predicate(inst));
-	 brw_CONT(p, if_depth_in_loop[loop_depth]);
+	 if (intel->gen >= 6) {
+	    brw_CONT_gen6(p, loop_inst[loop_depth - 1]);
+	 } else {
+	    brw_CONT(p, if_depth_in_loop[loop_depth]);
+	 }
          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
          break;
-      case OPCODE_ENDLOOP: 
-         {
-	    clear_current_const(c);
-            struct brw_instruction *inst0, *inst1;
-	    GLuint br = 1;
-
-            loop_depth--;
-
-	    if (intel->gen == 5)
-	       br = 2;
-
-            inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
-            /* patch all the BREAK/CONT instructions from last BEGINLOOP */
-            while (inst0 > loop_inst[loop_depth]) {
-               inst0--;
-               if (inst0->header.opcode == BRW_OPCODE_BREAK &&
+
+      case OPCODE_ENDLOOP: {
+	 clear_current_const(c);
+	 struct brw_instruction *inst0, *inst1;
+	 GLuint br = 1;
+
+	 loop_depth--;
+
+	 if (intel->gen == 5)
+	    br = 2;
+
+	 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
+
+	 if (intel->gen < 6) {
+	    /* patch all the BREAK/CONT instructions from last BEGINLOOP */
+	    while (inst0 > loop_inst[loop_depth]) {
+	       inst0--;
+	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
 		   inst0->bits3.if_else.jump_count == 0) {
-                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
-               }
-               else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
-			inst0->bits3.if_else.jump_count == 0) {
-                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
-               }
-            }
-         }
+		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
+	       } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
+			  inst0->bits3.if_else.jump_count == 0) {
+		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
+	       }
+	    }
+	 }
+      }
          break;
+
       case OPCODE_BRA:
 	 brw_set_predicate_control(p, get_predicate(inst));
          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
@@ -2106,6 +2226,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
    }
 
    brw_resolve_cals(p);
+   brw_set_uip_jip(p);
 
    brw_optimize(p);
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
index eabac51..b0b0544 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -82,6 +82,15 @@ prepare_vs_constants(struct brw_context *brw)
 	     params->ParameterValues[i],
 	     4 * sizeof(float));
    }
+
+   if (0) {
+      for (i = 0; i < params->NumParameters; i++) {
+	 float *row = (float *)brw->vs.const_bo->virtual + i * 4;
+	 printf("vs const surface %3d: %4.3f %4.3f %4.3f %4.3f\n",
+		i, row[0], row[1], row[2], row[3]);
+      }
+   }
+
    drm_intel_gem_bo_unmap_gtt(brw->vs.const_bo);
    brw->state.dirty.brw |= BRW_NEW_VS_CONSTBUF;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index 3d7a98c..100a21b 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -203,4 +203,5 @@ void brwInitVtbl( struct brw_context *brw )
    brw->intel.vtbl.destroy = brw_destroy_context;
    brw->intel.vtbl.set_draw_region = brw_set_draw_region;
    brw->intel.vtbl.debug_batch = brw_debug_batch;
+   brw->intel.vtbl.render_target_supported = brw_render_target_supported;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index ccdc18e..656501b 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -119,6 +119,62 @@ brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
    brw_wm_emit(c);
 }
 
+static void
+brw_wm_payload_setup(struct brw_context *brw,
+		     struct brw_wm_compile *c)
+{
+   struct intel_context *intel = &brw->intel;
+   bool uses_depth = (c->fp->program.Base.InputsRead &
+		      (1 << FRAG_ATTRIB_WPOS)) != 0;
+
+   if (intel->gen >= 6) {
+      /* R0-1: masks, pixel X/Y coordinates. */
+      c->nr_payload_regs = 2;
+      /* R2: only for 32-pixel dispatch.*/
+      /* R3-4: perspective pixel location barycentric */
+      c->nr_payload_regs += 2;
+      /* R5-6: perspective pixel location bary for dispatch width != 8 */
+      if (c->dispatch_width == 16) {
+	 c->nr_payload_regs += 2;
+      }
+      /* R7-10: perspective centroid barycentric */
+      /* R11-14: perspective sample barycentric */
+      /* R15-18: linear pixel location barycentric */
+      /* R19-22: linear centroid barycentric */
+      /* R23-26: linear sample barycentric */
+
+      /* R27: interpolated depth if uses source depth */
+      if (uses_depth) {
+	 c->source_depth_reg = c->nr_payload_regs;
+	 c->nr_payload_regs++;
+	 if (c->dispatch_width == 16) {
+	    /* R28: interpolated depth if not 8-wide. */
+	    c->nr_payload_regs++;
+	 }
+      }
+      /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W.
+       */
+      if (uses_depth) {
+	 c->source_w_reg = c->nr_payload_regs;
+	 c->nr_payload_regs++;
+	 if (c->dispatch_width == 16) {
+	    /* R30: interpolated W if not 8-wide. */
+	    c->nr_payload_regs++;
+	 }
+      }
+      /* R31: MSAA position offsets. */
+      /* R32-: bary for 32-pixel. */
+      /* R58-59: interp W for 32-pixel. */
+
+      if (c->fp->program.Base.OutputsWritten &
+	  BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
+	 c->source_depth_to_render_target = GL_TRUE;
+	 c->computes_depth = GL_TRUE;
+      }
+   } else {
+      brw_wm_lookup_iz(intel, c);
+   }
+}
 
 /**
  * All Mesa program -> GPU code generation goes through this function.
@@ -167,23 +223,18 @@ static void do_wm_prog( struct brw_context *brw,
 
    brw_init_compile(brw, &c->func);
 
-   /* temporary sanity check assertion */
-   ASSERT(fp->isGLSL == brw_wm_is_glsl(&c->fp->program));
+   brw_wm_payload_setup(brw, c);
 
    if (!brw_wm_fs_emit(brw, c)) {
       /*
        * Shader which use GLSL features such as flow control are handled
        * differently from "simple" shaders.
        */
-      if (fp->isGLSL) {
-	 c->dispatch_width = 8;
-	 brw_wm_glsl_emit(brw, c);
-      }
-      else {
-	 c->dispatch_width = 16;
-	 brw_wm_non_glsl_emit(brw, c);
-      }
+      c->dispatch_width = 16;
+      brw_wm_payload_setup(brw, c);
+      brw_wm_non_glsl_emit(brw, c);
    }
+   c->prog_data.dispatch_width = c->dispatch_width;
 
    /* Scratch space is used for register spilling */
    if (c->last_scratch) {
@@ -220,12 +271,10 @@ static void do_wm_prog( struct brw_context *brw,
 static void brw_wm_populate_key( struct brw_context *brw,
 				 struct brw_wm_prog_key *key )
 {
-   struct intel_context *intel = &brw->intel;
    struct gl_context *ctx = &brw->intel.ctx;
    /* BRW_NEW_FRAGMENT_PROGRAM */
    const struct brw_fragment_program *fp = 
       (struct brw_fragment_program *)brw->fragment_program;
-   GLboolean uses_depth = (fp->program.Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
    GLuint lookup = 0;
    GLuint line_aa;
    GLuint i;
@@ -285,57 +334,9 @@ static void brw_wm_populate_key( struct brw_context *brw,
       }
    }
 
-   if (intel->gen >= 6) {
-      /* R0-1: masks, pixel X/Y coordinates. */
-      key->nr_payload_regs = 2;
-      /* R2: only for 32-pixel dispatch.*/
-      /* R3-4: perspective pixel location barycentric */
-      key->nr_payload_regs += 2;
-      /* R5-6: perspective pixel location bary for dispatch width != 8 */
-      if (!fp->isGLSL) { /* dispatch_width != 8 */
-	 key->nr_payload_regs += 2;
-      }
-      /* R7-10: perspective centroid barycentric */
-      /* R11-14: perspective sample barycentric */
-      /* R15-18: linear pixel location barycentric */
-      /* R19-22: linear centroid barycentric */
-      /* R23-26: linear sample barycentric */
-
-      /* R27: interpolated depth if uses source depth */
-      if (uses_depth) {
-	 key->source_depth_reg = key->nr_payload_regs;
-	 key->nr_payload_regs++;
-	 if (!fp->isGLSL) { /* dispatch_width != 8 */
-	    /* R28: interpolated depth if not 8-wide. */
-	    key->nr_payload_regs++;
-	 }
-      }
-      /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W.
-       */
-      if (uses_depth) {
-	 key->source_w_reg = key->nr_payload_regs;
-	 key->nr_payload_regs++;
-	 if (!fp->isGLSL) { /* dispatch_width != 8 */
-	    /* R30: interpolated W if not 8-wide. */
-	    key->nr_payload_regs++;
-	 }
-      }
-      /* R31: MSAA position offsets. */
-      /* R32-: bary for 32-pixel. */
-      /* R58-59: interp W for 32-pixel. */
-
-      if (fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
-	 key->source_depth_to_render_target = GL_TRUE;
-	 key->computes_depth = GL_TRUE;
-      }
-
-   } else {
-      brw_wm_lookup_iz(intel,
-	      	       line_aa,
-		       lookup,
-		       uses_depth,
-		       key);
-   }
+   key->iz_lookup = lookup;
+   key->line_aa = line_aa;
+   key->stats_wm = brw->intel.stats_wm;
 
    /* BRW_NEW_WM_INPUT_DIMENSIONS */
    key->proj_attrib_mask = brw->wm.input_size_masks[4-1];
@@ -377,6 +378,10 @@ static void brw_wm_populate_key( struct brw_context *brw,
 	       swizzles[2] = SWIZZLE_ZERO;
 	    } else if (t->DepthMode == GL_LUMINANCE) {
 	       swizzles[3] = SWIZZLE_ONE;
+	    } else if (t->DepthMode == GL_RED) {
+	       swizzles[1] = SWIZZLE_ZERO;
+	       swizzles[2] = SWIZZLE_ZERO;
+	       swizzles[3] = SWIZZLE_ZERO;
 	    }
 	 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index 2ca6857..d9cae75 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -59,16 +59,9 @@
 #define AA_ALWAYS    2
 
 struct brw_wm_prog_key {
-   GLuint source_depth_reg:3;
-   GLuint source_w_reg:3;
-   GLuint aa_dest_stencil_reg:3;
-   GLuint dest_depth_reg:3;
-   GLuint nr_payload_regs:4;
-   GLuint computes_depth:1;	/* could be derived from program string */
-   GLuint source_depth_to_render_target:1;
+   GLuint stats_wm:1;
    GLuint flat_shade:1;
    GLuint linear_color:1;  /**< linear interpolation vs perspective interp */
-   GLuint runtime_check_aads_emit:1;
    GLuint nr_color_regions:5;
    GLuint render_to_fbo:1;
 
@@ -81,6 +74,8 @@ struct brw_wm_prog_key {
 
    GLushort drawable_height;
    GLbitfield64 vp_outputs_written;
+   GLuint iz_lookup;
+   GLuint line_aa;
    GLuint program_string_id:32;
 };
 
@@ -204,6 +199,15 @@ struct brw_wm_compile {
       PASS2_DONE
    } state;
 
+   GLuint source_depth_reg:3;
+   GLuint source_w_reg:3;
+   GLuint aa_dest_stencil_reg:3;
+   GLuint dest_depth_reg:3;
+   GLuint nr_payload_regs:4;
+   GLuint computes_depth:1;	/* could be derived from program string */
+   GLuint source_depth_to_render_target:1;
+   GLuint runtime_check_aads_emit:1;
+
    /* Initial pass - translate fp instructions to fp instructions,
     * simplifying and adding instructions for interpolation and
     * framebuffer writes.
@@ -306,14 +310,9 @@ void brw_wm_print_insn( struct brw_wm_compile *c,
 void brw_wm_print_program( struct brw_wm_compile *c,
 			   const char *stage );
 
-void brw_wm_lookup_iz( struct intel_context *intel,
-		       GLuint line_aa,
-		       GLuint lookup,
-		       GLboolean ps_uses_depth,
-		       struct brw_wm_prog_key *key );
+void brw_wm_lookup_iz(struct intel_context *intel,
+		      struct brw_wm_compile *c);
 
-GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp);
-void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c);
 GLboolean brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c);
 
 /* brw_wm_emit.c */
@@ -381,7 +380,6 @@ void emit_fb_write(struct brw_wm_compile *c,
 void emit_frontfacing(struct brw_compile *p,
 		      const struct brw_reg *dst,
 		      GLuint mask);
-void emit_kil_nv(struct brw_wm_compile *c);
 void emit_linterp(struct brw_compile *p,
 		  const struct brw_reg *dst,
 		  GLuint mask,
@@ -476,5 +474,6 @@ struct gl_shader *brw_new_shader(struct gl_context *ctx, GLuint name, GLuint typ
 struct gl_shader_program *brw_new_shader_program(struct gl_context *ctx, GLuint name);
 
 bool brw_color_buffer_write_enabled(struct brw_context *brw);
+bool brw_render_target_supported(gl_format format);
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index 96fecc9..2336e27 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -219,43 +219,45 @@ void emit_wpos_xy(struct brw_wm_compile *c,
 		  const struct brw_reg *arg0)
 {
    struct brw_compile *p = &c->func;
+   struct intel_context *intel = &p->brw->intel;
+   struct brw_reg delta_x = retype(arg0[0], BRW_REGISTER_TYPE_W);
+   struct brw_reg delta_y = retype(arg0[1], BRW_REGISTER_TYPE_W);
 
    if (mask & WRITEMASK_X) {
+      if (intel->gen >= 6) {
+	 struct brw_reg delta_x_f = retype(delta_x, BRW_REGISTER_TYPE_F);
+	 brw_MOV(p, delta_x_f, delta_x);
+	 delta_x = delta_x_f;
+      }
+
       if (c->fp->program.PixelCenterInteger) {
 	 /* X' = X */
-	 brw_MOV(p,
-		 dst[0],
-		 retype(arg0[0], BRW_REGISTER_TYPE_W));
+	 brw_MOV(p, dst[0], delta_x);
       } else {
 	 /* X' = X + 0.5 */
-	 brw_ADD(p,
-		 dst[0],
-		 retype(arg0[0], BRW_REGISTER_TYPE_W),
-		 brw_imm_f(0.5));
+	 brw_ADD(p, dst[0], delta_x, brw_imm_f(0.5));
       }
    }
 
    if (mask & WRITEMASK_Y) {
+      if (intel->gen >= 6) {
+	 struct brw_reg delta_y_f = retype(delta_y, BRW_REGISTER_TYPE_F);
+	 brw_MOV(p, delta_y_f, delta_y);
+	 delta_y = delta_y_f;
+      }
+
       if (c->fp->program.OriginUpperLeft) {
 	 if (c->fp->program.PixelCenterInteger) {
 	    /* Y' = Y */
-	    brw_MOV(p,
-		    dst[1],
-		    retype(arg0[1], BRW_REGISTER_TYPE_W));
+	    brw_MOV(p, dst[1], delta_y);
 	 } else {
-	    /* Y' = Y + 0.5 */
-	    brw_ADD(p,
-		    dst[1],
-		    retype(arg0[1], BRW_REGISTER_TYPE_W),
-		    brw_imm_f(0.5));
+	    brw_ADD(p, dst[1], delta_y, brw_imm_f(0.5));
 	 }
       } else {
 	 float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
 
 	 /* Y' = (height - 1) - Y + center */
-	 brw_ADD(p,
-		 dst[1],
-		 negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
+	 brw_ADD(p, dst[1], negate(delta_y),
 		 brw_imm_f(c->key.drawable_height - 1 + center_offset));
       }
    }
@@ -896,10 +898,14 @@ void emit_math1(struct brw_wm_compile *c,
 		      BRW_MATH_SATURATE_NONE);
    struct brw_reg src;
 
-   if (intel->gen >= 6 && (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
-			   arg0[0].file != BRW_GENERAL_REGISTER_FILE)) {
+   if (intel->gen >= 6 && ((arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
+			    arg0[0].file != BRW_GENERAL_REGISTER_FILE) ||
+			   arg0[0].negate || arg0[0].abs)) {
       /* Gen6 math requires that source and dst horizontal stride be 1,
        * and that the argument be in the GRF.
+       *
+       * The hardware ignores source modifiers (negate and abs) on math
+       * instructions, so we also move to a temp to set those up.
        */
       src = dst[dst_chan];
       brw_MOV(p, src, arg0[0]);
@@ -967,34 +973,23 @@ void emit_math2(struct brw_wm_compile *c,
       struct brw_reg temp_dst = dst[dst_chan];
 
       if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
-	 if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
-	    /* Both scalar arguments.  Do scalar calc. */
-	    src0.hstride = BRW_HORIZONTAL_STRIDE_1;
-	    src1.hstride = BRW_HORIZONTAL_STRIDE_1;
-	    temp_dst.hstride = BRW_HORIZONTAL_STRIDE_1;
-	    temp_dst.width = BRW_WIDTH_1;
-
-	    if (arg0[0].subnr != 0) {
-	       brw_MOV(p, temp_dst, src0);
-	       src0 = temp_dst;
-
-	       /* Ouch.  We've used the temp as a dst, and we still
-		* need a temp to store arg1 in, because src and dst
-		* offsets have to be equal.  Leaving this up to
-		* glsl2-965 to handle correctly.
-		*/
-	       assert(arg1[0].subnr == 0);
-	    } else if (arg1[0].subnr != 0) {
-	       brw_MOV(p, temp_dst, src1);
-	       src1 = temp_dst;
-	    }
-	 } else {
-	    brw_MOV(p, temp_dst, src0);
-	    src0 = temp_dst;
-	 }
-      } else if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
-	 brw_MOV(p, temp_dst, src1);
-	 src1 = temp_dst;
+	 brw_MOV(p, temp_dst, src0);
+	 src0 = temp_dst;
+      }
+
+      if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
+	 /* This is a heinous hack to get a temporary register for use
+	  * in case both arg0 and arg1 are constants.  Why you're
+	  * doing exponentiation on constant values in the shader, we
+	  * don't know.
+	  *
+	  * max_wm_grf is almost surely less than the maximum GRF, and
+	  * gen6 doesn't care about the number of GRFs used in a
+	  * shader like pre-gen6 did.
+	  */
+	 struct brw_reg temp = brw_vec8_grf(c->max_wm_grf, 0);
+	 brw_MOV(p, temp, src1);
+	 src1 = temp;
       }
 
       brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
@@ -1012,14 +1007,6 @@ void emit_math2(struct brw_wm_compile *c,
 		   sechalf(src0),
 		   sechalf(src1));
       }
-
-      /* Splat a scalar result into all the channels. */
-      if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 &&
-	  arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
-	 temp_dst.hstride = BRW_HORIZONTAL_STRIDE_0;
-	 temp_dst.vstride = BRW_VERTICAL_STRIDE_0;
-	 brw_MOV(p, dst[dst_chan], temp_dst);
-      }
    } else {
       GLuint saturate = ((mask & SATURATE) ?
 			 BRW_MATH_SATURATE_SATURATE :
@@ -1301,9 +1288,15 @@ static void emit_kil( struct brw_wm_compile *c,
 		      struct brw_reg *arg0)
 {
    struct brw_compile *p = &c->func;
-   struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+   struct intel_context *intel = &p->brw->intel;
+   struct brw_reg pixelmask;
    GLuint i, j;
 
+   if (intel->gen >= 6)
+      pixelmask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
+   else
+      pixelmask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+
    for (i = 0; i < 4; i++) {
       /* Check if we've already done the comparison for this reg
        * -- common when someone does KIL TEMP.wwww.
@@ -1319,26 +1312,11 @@ static void emit_kil( struct brw_wm_compile *c,
       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));   
       brw_set_predicate_control_flag_value(p, 0xff);
       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-      brw_AND(p, r0uw, brw_flag_reg(), r0uw);
+      brw_AND(p, pixelmask, brw_flag_reg(), pixelmask);
       brw_pop_insn_state(p);
    }
 }
 
-/* KIL_NV kills the pixels that are currently executing, not based on a test
- * of the arguments.
- */
-void emit_kil_nv( struct brw_wm_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
-
-   brw_push_insn_state(p);
-   brw_set_mask_control(p, BRW_MASK_DISABLE);
-   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
-   brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
-   brw_pop_insn_state(p);
-}
-
 static void fire_fb_write( struct brw_wm_compile *c,
 			   GLuint base_reg,
 			   GLuint nr,
@@ -1355,9 +1333,11 @@ static void fire_fb_write( struct brw_wm_compile *c,
       dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
 
    /* Pass through control information:
+    *
+    * On gen6, emit_fb_write() already did the m1 mov (current SIMD16 case).
     */
 /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
-   if (intel->gen < 6) /* gen6, use headerless for fb write */
+   if (intel->gen < 6)
    {
       brw_push_insn_state(p);
       brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
@@ -1378,7 +1358,8 @@ static void fire_fb_write( struct brw_wm_compile *c,
 		target,		
 		nr,
 		0, 
-		eot);
+		eot,
+		GL_TRUE);
 }
 
 
@@ -1387,8 +1368,8 @@ static void emit_aa( struct brw_wm_compile *c,
 		     GLuint reg )
 {
    struct brw_compile *p = &c->func;
-   GLuint comp = c->key.aa_dest_stencil_reg / 2;
-   GLuint off = c->key.aa_dest_stencil_reg % 2;
+   GLuint comp = c->aa_dest_stencil_reg / 2;
+   GLuint off = c->aa_dest_stencil_reg % 2;
    struct brw_reg aa = offset(arg1[comp], off);
 
    brw_push_insn_state(p);
@@ -1416,11 +1397,10 @@ void emit_fb_write(struct brw_wm_compile *c,
    struct intel_context *intel = &brw->intel;
    GLuint nr = 2;
    GLuint channel;
-   int base_reg; /* For gen6 fb write with no header, starting from color payload directly!. */
 
    /* Reserve a space for AA - may not be needed:
     */
-   if (c->key.aa_dest_stencil_reg)
+   if (c->aa_dest_stencil_reg)
       nr += 1;
 
    /* I don't really understand how this achieves the color interleave
@@ -1428,11 +1408,6 @@ void emit_fb_write(struct brw_wm_compile *c,
     */
    brw_push_insn_state(p);
 
-   if (intel->gen >= 6)
-	base_reg = nr;
-   else
-	base_reg = 0;
-
    for (channel = 0; channel < 4; channel++) {
       if (intel->gen >= 6) {
 	 /* gen6 SIMD16 single source DP write looks like:
@@ -1493,9 +1468,9 @@ void emit_fb_write(struct brw_wm_compile *c,
 
    brw_pop_insn_state(p);
 
-   if (c->key.source_depth_to_render_target)
+   if (c->source_depth_to_render_target)
    {
-      if (c->key.computes_depth) 
+      if (c->computes_depth)
 	 brw_MOV(p, brw_message_reg(nr), arg2[2]);
       else 
 	 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
@@ -1503,10 +1478,10 @@ void emit_fb_write(struct brw_wm_compile *c,
       nr += 2;
    }
 
-   if (c->key.dest_depth_reg)
+   if (c->dest_depth_reg)
    {
-      GLuint comp = c->key.dest_depth_reg / 2;
-      GLuint off = c->key.dest_depth_reg % 2;
+      GLuint comp = c->dest_depth_reg / 2;
+      GLuint off = c->dest_depth_reg % 2;
 
       if (off != 0) {
          brw_push_insn_state(p);
@@ -1524,15 +1499,28 @@ void emit_fb_write(struct brw_wm_compile *c,
    }
 
    if (intel->gen >= 6) {
-      /* Subtract off the message header, since we send headerless. */
-      nr -= 2;
+      /* Load the message header.  There's no implied move from src0
+       * to the base mrf on gen6.
+       */
+      brw_push_insn_state(p);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_MOV(p, retype(brw_message_reg(0), BRW_REGISTER_TYPE_UD),
+	      retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+      brw_pop_insn_state(p);
+
+      if (target != 0) {
+	 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
+					0,
+					2), BRW_REGISTER_TYPE_UD),
+		 brw_imm_ud(target));
+      }
    }
 
-   if (!c->key.runtime_check_aads_emit) {
-      if (c->key.aa_dest_stencil_reg)
+   if (!c->runtime_check_aads_emit) {
+      if (c->aa_dest_stencil_reg)
 	 emit_aa(c, arg1, 2);
 
-      fire_fb_write(c, base_reg, nr, target, eot);
+      fire_fb_write(c, 0, nr, target, eot);
    }
    else {
       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
@@ -1897,10 +1885,6 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 emit_kil(c, args[0]);
 	 break;
 
-      case OPCODE_KIL_NV:
-	 emit_kil_nv(c);
-	 break;
-
       default:
 	 printf("Unsupported opcode %i (%s) in fragment shader\n",
 		inst->opcode, inst->opcode < MAX_OPCODE ?
diff --git a/src/mesa/drivers/dri/i965/brw_wm_fp.c b/src/mesa/drivers/dri/i965/brw_wm_fp.c
index 2cae698..4759b28 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_fp.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_fp.c
@@ -338,11 +338,13 @@ static struct prog_src_register get_delta_xy( struct brw_wm_compile *c )
 
 static struct prog_src_register get_pixel_w( struct brw_wm_compile *c )
 {
-   /* This is only called for producing 1/w in pre-gen6 interp.  for
-    * gen6, the interp opcodes don't use this argument.
+   /* This is called to produce 1/w for pre-gen6 interp.  On gen6,
+    * the interp opcodes don't use this argument, but to keep the
+    * nr_args = 3 expectation of pinterp happy, just stuff delta_xy
+    * into the slot.
     */
    if (c->func.brw->intel.gen >= 6)
-      return src_undef();
+      return c->delta_xy;
 
    if (src_is_undef(c->pixel_w)) {
       struct prog_dst_register pixel_w = get_temp(c);
@@ -373,11 +375,7 @@ static void emit_interp( struct brw_wm_compile *c,
    struct prog_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
    struct prog_src_register deltas;
 
-   if (c->func.brw->intel.gen < 6) {
-      deltas = get_delta_xy(c);
-   } else {
-      deltas = src_undef();
-   }
+   deltas = get_delta_xy(c);
 
    /* Need to use PINTERP on attributes which have been
     * multiplied by 1/W in the SF program, and LINTERP on those
@@ -1133,6 +1131,11 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
 	 precalc_lit(c, inst);
 	 break;
 
+      case OPCODE_RSQ:
+	 out = emit_scalar_insn(c, inst);
+	 out->SrcReg[0].Abs = GL_TRUE;
+	 break;
+
       case OPCODE_TEX:
 	 precalc_tex(c, inst);
 	 break;
diff --git a/src/mesa/drivers/dri/i965/brw_wm_glsl.c b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
deleted file mode 100644
index 7fe8ab1..0000000
--- a/src/mesa/drivers/dri/i965/brw_wm_glsl.c
+++ /dev/null
@@ -1,1035 +0,0 @@
-#include "main/macros.h"
-#include "program/prog_parameter.h"
-#include "program/prog_print.h"
-#include "program/prog_optimize.h"
-#include "brw_context.h"
-#include "brw_eu.h"
-#include "brw_wm.h"
-
-static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
-                                  const struct prog_instruction *inst,
-                                  GLuint component);
-
-/**
- * Determine if the given fragment program uses GLSL features such
- * as flow conditionals, loops, subroutines.
- * Some GLSL shaders may use these features, others might not.
- */
-GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
-{
-    int i;
-
-    if (unlikely(INTEL_DEBUG & DEBUG_GLSL_FORCE))
-       return GL_TRUE;
-
-    for (i = 0; i < fp->Base.NumInstructions; i++) {
-	const struct prog_instruction *inst = &fp->Base.Instructions[i];
-	switch (inst->Opcode) {
-	    case OPCODE_ARL:
-	    case OPCODE_IF:
-	    case OPCODE_ENDIF:
-	    case OPCODE_CAL:
-	    case OPCODE_BRK:
-	    case OPCODE_RET:
-	    case OPCODE_BGNLOOP:
-		return GL_TRUE; 
-	    default:
-		break;
-	}
-    }
-    return GL_FALSE; 
-}
-
-
-
-static void
-reclaim_temps(struct brw_wm_compile *c);
-
-
-/** Mark GRF register as used. */
-static void
-prealloc_grf(struct brw_wm_compile *c, int r)
-{
-   c->used_grf[r] = GL_TRUE;
-}
-
-
-/** Mark given GRF register as not in use. */
-static void
-release_grf(struct brw_wm_compile *c, int r)
-{
-   /*assert(c->used_grf[r]);*/
-   c->used_grf[r] = GL_FALSE;
-   c->first_free_grf = MIN2(c->first_free_grf, r);
-}
-
-
-/** Return index of a free GRF, mark it as used. */
-static int
-alloc_grf(struct brw_wm_compile *c)
-{
-   GLuint r;
-   for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
-      if (!c->used_grf[r]) {
-         c->used_grf[r] = GL_TRUE;
-         c->first_free_grf = r + 1;  /* a guess */
-         return r;
-      }
-   }
-
-   /* no free temps, try to reclaim some */
-   reclaim_temps(c);
-   c->first_free_grf = 0;
-
-   /* try alloc again */
-   for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
-      if (!c->used_grf[r]) {
-         c->used_grf[r] = GL_TRUE;
-         c->first_free_grf = r + 1;  /* a guess */
-         return r;
-      }
-   }
-
-   for (r = 0; r < BRW_WM_MAX_GRF; r++) {
-      assert(c->used_grf[r]);
-   }
-
-   /* really, no free GRF regs found */
-   if (!c->out_of_regs) {
-      /* print warning once per compilation */
-      _mesa_warning(NULL, "i965: ran out of registers for fragment program");
-      c->out_of_regs = GL_TRUE;
-   }
-
-   return -1;
-}
-
-
-/** Return number of GRF registers used */
-static int
-num_grf_used(const struct brw_wm_compile *c)
-{
-   int r;
-   for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
-      if (c->used_grf[r])
-         return r + 1;
-   return 0;
-}
-
-
-
-/**
- * Record the mapping of a Mesa register to a hardware register.
- */
-static void set_reg(struct brw_wm_compile *c, int file, int index, 
-	int component, struct brw_reg reg)
-{
-    c->wm_regs[file][index][component].reg = reg;
-    c->wm_regs[file][index][component].inited = GL_TRUE;
-}
-
-static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
-{
-    struct brw_reg reg;
-
-    /* if we need to allocate another temp, grow the tmp_regs[] array */
-    if (c->tmp_index == c->tmp_max) {
-       int r = alloc_grf(c);
-       if (r < 0) {
-          /*printf("Out of temps in %s\n", __FUNCTION__);*/
-          r = 50; /* XXX random register! */
-       }
-       c->tmp_regs[ c->tmp_max++ ] = r;
-    }
-
-    /* form the GRF register */
-    reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
-    /*printf("alloc_temp %d\n", reg.nr);*/
-    assert(reg.nr < BRW_WM_MAX_GRF);
-    return reg;
-
-}
-
-/**
- * Save current temp register info.
- * There must be a matching call to release_tmps().
- */
-static int mark_tmps(struct brw_wm_compile *c)
-{
-    return c->tmp_index;
-}
-
-static void release_tmps(struct brw_wm_compile *c, int mark)
-{
-    c->tmp_index = mark;
-}
-
-/**
- * Convert Mesa src register to brw register.
- *
- * Since we're running in SOA mode each Mesa register corresponds to four
- * hardware registers.  We allocate the hardware registers as needed here.
- *
- * \param file  register file, one of PROGRAM_x
- * \param index  register number
- * \param component  src component (X=0, Y=1, Z=2, W=3)
- * \param nr  not used?!?
- * \param neg  negate value?
- * \param abs  take absolute value?
- */
-static struct brw_reg 
-get_reg(struct brw_wm_compile *c, int file, int index, int component,
-        int nr, GLuint neg, GLuint abs)
-{
-    struct brw_reg reg;
-    switch (file) {
-	case PROGRAM_STATE_VAR:
-	case PROGRAM_CONSTANT:
-	case PROGRAM_UNIFORM:
-	    file = PROGRAM_STATE_VAR;
-	    break;
-	case PROGRAM_UNDEFINED:
-	    return brw_null_reg();	
-	case PROGRAM_TEMPORARY:
-	case PROGRAM_INPUT:
-	case PROGRAM_OUTPUT:
-	case PROGRAM_PAYLOAD:
-	    break;
-	default:
-	    _mesa_problem(NULL, "Unexpected file in get_reg()");
-	    return brw_null_reg();
-    }
-
-    assert(index < 256);
-    assert(component < 4);
-
-    /* see if we've already allocated a HW register for this Mesa register */
-    if (c->wm_regs[file][index][component].inited) {
-       /* yes, re-use */
-       reg = c->wm_regs[file][index][component].reg;
-    }
-    else {
-	/* no, allocate new register */
-       int grf = alloc_grf(c);
-       /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
-       if (grf < 0) {
-          /* totally out of temps */
-          grf = 51; /* XXX random register! */
-       }
-
-       reg = brw_vec8_grf(grf, 0);
-       /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
-
-       set_reg(c, file, index, component, reg);
-    }
-
-    if (neg & (1 << component)) {
-	reg = negate(reg);
-    }
-    if (abs)
-	reg = brw_abs(reg);
-    return reg;
-}
-
-
-
-/**
- * This is called if we run out of GRF registers.  Examine the live intervals
- * of temp regs in the program and free those which won't be used again.
- */
-static void
-reclaim_temps(struct brw_wm_compile *c)
-{
-   GLint intBegin[MAX_PROGRAM_TEMPS];
-   GLint intEnd[MAX_PROGRAM_TEMPS];
-   int index;
-
-   /*printf("Reclaim temps:\n");*/
-
-   _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
-                             intBegin, intEnd);
-
-   for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
-      if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
-         /* program temp[i] can be freed */
-         int component;
-         /*printf("  temp[%d] is dead\n", index);*/
-         for (component = 0; component < 4; component++) {
-            if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
-               int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
-               release_grf(c, r);
-               /*
-               printf("  Reclaim temp %d, reg %d at inst %d\n",
-                      index, r, c->cur_inst);
-               */
-               c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
-            }
-         }
-      }
-   }
-}
-
-
-
-
-/**
- * Preallocate registers.  This sets up the Mesa to hardware register
- * mapping for certain registers, such as constants (uniforms/state vars)
- * and shader inputs.
- */
-static void prealloc_reg(struct brw_wm_compile *c)
-{
-    struct intel_context *intel = &c->func.brw->intel;
-    int i, j;
-    struct brw_reg reg;
-    int urb_read_length = 0;
-    GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
-    GLuint reg_index = 0;
-
-    memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
-    c->first_free_grf = 0;
-
-    for (i = 0; i < 4; i++) {
-	if (i < (c->key.nr_payload_regs + 1) / 2)
-            reg = brw_vec8_grf(i * 2, 0);
-        else
-            reg = brw_vec8_grf(0, 0);
-	set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
-    }
-    set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_W, 0,
-	    brw_vec8_grf(c->key.source_w_reg, 0));
-    reg_index += c->key.nr_payload_regs;
-
-    /* constants */
-    {
-        const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
-        const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
-
-        /* use a real constant buffer, or just use a section of the GRF? */
-        /* XXX this heuristic may need adjustment... */
-        if ((nr_params + nr_temps) * 4 + reg_index > 80) {
-	   for (i = 0; i < nr_params; i++) {
-	      float *pv = c->fp->program.Base.Parameters->ParameterValues[i];
-	      for (j = 0; j < 4; j++) {
-		 c->prog_data.pull_param[c->prog_data.nr_pull_params] = &pv[j];
-		 c->prog_data.nr_pull_params++;
-	      }
-	   }
-
-	   c->prog_data.nr_params = 0;
-	}
-        /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
-
-        if (!c->prog_data.nr_pull_params) {
-           const struct gl_program_parameter_list *plist = 
-              c->fp->program.Base.Parameters;
-           int index = 0;
-
-           /* number of float constants in CURBE */
-           c->prog_data.nr_params = 4 * nr_params;
-
-           /* loop over program constants (float[4]) */
-           for (i = 0; i < nr_params; i++) {
-              /* loop over XYZW channels */
-              for (j = 0; j < 4; j++, index++) {
-                 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
-                 /* Save pointer to parameter/constant value.
-                  * Constants will be copied in prepare_constant_buffer()
-                  */
-                 c->prog_data.param[index] = &plist->ParameterValues[i][j];
-                 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
-              }
-           }
-           /* number of constant regs used (each reg is float[8]) */
-	   c->nr_creg = ALIGN(nr_params, 2) / 2;
-	   reg_index += c->nr_creg;
-        }
-    }
-
-    /* fragment shader inputs: One 2-reg pair of interpolation
-     * coefficients for each vec4 to be set up.
-     */
-    if (intel->gen >= 6) {
-       for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
-	  if (!(c->fp->program.Base.InputsRead & BITFIELD64_BIT(i)))
-	     continue;
-
-	  reg = brw_vec8_grf(reg_index, 0);
-	  for (j = 0; j < 4; j++) {
-	     set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
-	  }
-	  reg_index += 2;
-       }
-       urb_read_length = reg_index;
-    } else {
-       for (i = 0; i < VERT_RESULT_MAX; i++) {
-	  int fp_input;
-
-	  if (i >= VERT_RESULT_VAR0)
-	     fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
-	  else if (i <= VERT_RESULT_TEX7)
-	     fp_input = i;
-	  else
-	     fp_input = -1;
-
-	  if (fp_input >= 0 && inputs & (1 << fp_input)) {
-	     urb_read_length = reg_index;
-	     reg = brw_vec8_grf(reg_index, 0);
-	     for (j = 0; j < 4; j++)
-		set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
-	  }
-	  if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
-	     reg_index += 2;
-	  }
-       }
-    }
-
-    c->prog_data.first_curbe_grf = c->key.nr_payload_regs;
-    c->prog_data.urb_read_length = urb_read_length;
-    c->prog_data.curb_read_length = c->nr_creg;
-    c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
-    reg_index++;
-    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
-    reg_index += 2;
-
-    /* mark GRF regs [0..reg_index-1] as in-use */
-    for (i = 0; i < reg_index; i++)
-       prealloc_grf(c, i);
-
-    /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
-    prealloc_grf(c, 126);
-    prealloc_grf(c, 127);
-
-    for (i = 0; i < c->nr_fp_insns; i++) {
-	const struct prog_instruction *inst = &c->prog_instructions[i];
-	struct brw_reg dst[4];
-
-	switch (inst->Opcode) {
-	case OPCODE_TEX:
-	case OPCODE_TXB:
-	    /* Allocate the channels of texture results contiguously,
-	     * since they are written out that way by the sampler unit.
-	     */
-	    for (j = 0; j < 4; j++) {
-		dst[j] = get_dst_reg(c, inst, j);
-		if (j != 0)
-		    assert(dst[j].nr == dst[j - 1].nr + 1);
-	    }
-	    break;
-	default:
-	    break;
-	}
-    }
-
-    for (i = 0; i < c->nr_fp_insns; i++) {
-	const struct prog_instruction *inst = &c->prog_instructions[i];
-
-	switch (inst->Opcode) {
-	case WM_DELTAXY:
-	    /* Allocate WM_DELTAXY destination on G45/GM45 to an
-	     * even-numbered GRF if possible so that we can use the PLN
-	     * instruction.
-	     */
-	    if (inst->DstReg.WriteMask == WRITEMASK_XY &&
-		!c->wm_regs[inst->DstReg.File][inst->DstReg.Index][0].inited &&
-		!c->wm_regs[inst->DstReg.File][inst->DstReg.Index][1].inited &&
-		(IS_G4X(intel->intelScreen->deviceID) || intel->gen == 5)) {
-		int grf;
-
-		for (grf = c->first_free_grf & ~1;
-		     grf < BRW_WM_MAX_GRF;
-		     grf += 2)
-		{
-		    if (!c->used_grf[grf] && !c->used_grf[grf + 1]) {
-			c->used_grf[grf] = GL_TRUE;
-			c->used_grf[grf + 1] = GL_TRUE;
-			c->first_free_grf = grf + 2;  /* a guess */
-
-			set_reg(c, inst->DstReg.File, inst->DstReg.Index, 0,
-				brw_vec8_grf(grf, 0));
-			set_reg(c, inst->DstReg.File, inst->DstReg.Index, 1,
-				brw_vec8_grf(grf + 1, 0));
-			break;
-		    }
-		}
-	    }
-	default:
-	    break;
-	}
-    }
-
-    /* An instruction may reference up to three constants.
-     * They'll be found in these registers.
-     * XXX alloc these on demand!
-     */
-    if (c->prog_data.nr_pull_params) {
-       for (i = 0; i < 3; i++) {
-          c->current_const[i].index = -1;
-          c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
-       }
-    }
-#if 0
-    printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
-    printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
-#endif
-}
-
-
-/**
- * Check if any of the instruction's src registers are constants, uniforms,
- * or statevars.  If so, fetch any constants that we don't already have in
- * the three GRF slots.
- */
-static void fetch_constants(struct brw_wm_compile *c,
-                            const struct prog_instruction *inst)
-{
-   struct brw_compile *p = &c->func;
-   GLuint i;
-
-   /* loop over instruction src regs */
-   for (i = 0; i < 3; i++) {
-      const struct prog_src_register *src = &inst->SrcReg[i];
-      if (src->File == PROGRAM_STATE_VAR ||
-          src->File == PROGRAM_CONSTANT ||
-          src->File == PROGRAM_UNIFORM) {
-	 c->current_const[i].index = src->Index;
-
-#if 0
-	 printf("  fetch const[%d] for arg %d into reg %d\n",
-		src->Index, i, c->current_const[i].reg.nr);
-#endif
-
-	 /* need to fetch the constant now */
-	 brw_oword_block_read(p,
-			      c->current_const[i].reg,
-			      brw_message_reg(1),
-			      16 * src->Index,
-			      SURF_INDEX_FRAG_CONST_BUFFER);
-      }
-   }
-}
-
-
-/**
- * Convert Mesa dst register to brw register.
- */
-static struct brw_reg get_dst_reg(struct brw_wm_compile *c, 
-                                  const struct prog_instruction *inst,
-                                  GLuint component)
-{
-    const int nr = 1;
-    return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
-	    0, 0);
-}
-
-
-static struct brw_reg
-get_src_reg_const(struct brw_wm_compile *c,
-                  const struct prog_instruction *inst,
-                  GLuint srcRegIndex, GLuint component)
-{
-   /* We should have already fetched the constant from the constant
-    * buffer in fetch_constants().  Now we just have to return a
-    * register description that extracts the needed component and
-    * smears it across all eight vector components.
-    */
-   const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
-   struct brw_reg const_reg;
-
-   assert(component < 4);
-   assert(srcRegIndex < 3);
-   assert(c->current_const[srcRegIndex].index != -1);
-   const_reg = c->current_const[srcRegIndex].reg;
-
-   /* extract desired float from the const_reg, and smear */
-   const_reg = stride(const_reg, 0, 1, 0);
-   const_reg.subnr = component * 4;
-
-   if (src->Negate & (1 << component))
-      const_reg = negate(const_reg);
-   if (src->Abs)
-      const_reg = brw_abs(const_reg);
-
-#if 0
-   printf("  form const[%d].%d for arg %d, reg %d\n",
-          c->current_const[srcRegIndex].index,
-          component,
-          srcRegIndex,
-          const_reg.nr);
-#endif
-
-   return const_reg;
-}
-
-
-/**
- * Convert Mesa src register to brw register.
- */
-static struct brw_reg get_src_reg(struct brw_wm_compile *c, 
-                                  const struct prog_instruction *inst,
-                                  GLuint srcRegIndex, GLuint channel)
-{
-    const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
-    const GLuint nr = 1;
-    const GLuint component = GET_SWZ(src->Swizzle, channel);
-
-    /* Only one immediate value can be used per native opcode, and it
-     * has be in the src1 slot, so not all Mesa instructions will get
-     * to take advantage of immediate constants.
-     */
-    if (brw_wm_arg_can_be_immediate(inst->Opcode, srcRegIndex)) {
-       const struct gl_program_parameter_list *params;
-
-       params = c->fp->program.Base.Parameters;
-
-       /* Extended swizzle terms */
-       if (component == SWIZZLE_ZERO) {
-	  return brw_imm_f(0.0F);
-       } else if (component == SWIZZLE_ONE) {
-	  if (src->Negate)
-	     return brw_imm_f(-1.0F);
-	  else
-	     return brw_imm_f(1.0F);
-       }
-
-       if (src->File == PROGRAM_CONSTANT) {
-	  float f = params->ParameterValues[src->Index][component];
-
-	  if (src->Abs)
-	     f = fabs(f);
-	  if (src->Negate)
-	     f = -f;
-
-	  return brw_imm_f(f);
-       }
-    }
-
-    if (c->prog_data.nr_pull_params &&
-        (src->File == PROGRAM_STATE_VAR ||
-         src->File == PROGRAM_CONSTANT ||
-         src->File == PROGRAM_UNIFORM)) {
-       return get_src_reg_const(c, inst, srcRegIndex, component);
-    }
-    else {
-       /* other type of source register */
-       return get_reg(c, src->File, src->Index, component, nr, 
-                      src->Negate, src->Abs);
-    }
-}
-
-static void emit_arl(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, addr_reg;
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE, 
-                           BRW_ARF_ADDRESS, 0);
-    src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
-    brw_MOV(p, addr_reg, src0);
-    brw_set_saturate(p, 0);
-}
-
-static INLINE struct brw_reg high_words( struct brw_reg reg )
-{
-    return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
-		   0, 8, 2 );
-}
-
-static INLINE struct brw_reg low_words( struct brw_reg reg )
-{
-    return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
-}
-
-static INLINE struct brw_reg even_bytes( struct brw_reg reg )
-{
-    return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
-}
-
-static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
-{
-    return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
-		   0, 16, 2 );
-}
-
-/**
- * Resolve subroutine calls after code emit is done.
- */
-static void post_wm_emit( struct brw_wm_compile *c )
-{
-    brw_resolve_cals(&c->func);
-}
-
-static void
-get_argument_regs(struct brw_wm_compile *c,
-		  const struct prog_instruction *inst,
-		  int index,
-		  struct brw_reg *dst,
-		  struct brw_reg *regs,
-		  int mask)
-{
-    struct brw_compile *p = &c->func;
-    int i, j;
-
-    for (i = 0; i < 4; i++) {
-	if (mask & (1 << i)) {
-	    regs[i] = get_src_reg(c, inst, index, i);
-
-	    /* Unalias destination registers from our sources. */
-	    if (regs[i].file == BRW_GENERAL_REGISTER_FILE) {
-	       for (j = 0; j < 4; j++) {
-		   if (memcmp(&regs[i], &dst[j], sizeof(regs[0])) == 0) {
-		       struct brw_reg tmp = alloc_tmp(c);
-		       brw_MOV(p, tmp, regs[i]);
-		       regs[i] = tmp;
-		       break;
-		   }
-	       }
-	    }
-	}
-    }
-}
-
-static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
-{
-   struct intel_context *intel = &brw->intel;
-#define MAX_IF_DEPTH 32
-#define MAX_LOOP_DEPTH 32
-    struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
-    int if_depth_in_loop[MAX_LOOP_DEPTH];
-    GLuint i, if_depth = 0, loop_depth = 0;
-    struct brw_compile *p = &c->func;
-    struct brw_indirect stack_index = brw_indirect(0, 0);
-
-    c->out_of_regs = GL_FALSE;
-
-    if_depth_in_loop[loop_depth] = 0;
-
-    prealloc_reg(c);
-    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-    brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
-
-    if (intel->gen >= 6)
-	brw_set_acc_write_control(p, 1);
-
-    for (i = 0; i < c->nr_fp_insns; i++) {
-        const struct prog_instruction *inst = &c->prog_instructions[i];
-	int dst_flags;
-	struct brw_reg args[3][4], dst[4];
-	int j;
-	int mark = mark_tmps( c );
-
-        c->cur_inst = i;
-
-#if 0
-        printf("Inst %d: ", i);
-        _mesa_print_instruction(inst);
-#endif
-
-        /* fetch any constants that this instruction needs */
-        if (c->prog_data.nr_pull_params)
-           fetch_constants(c, inst);
-
-	if (inst->Opcode != OPCODE_ARL) {
-	   for (j = 0; j < 4; j++) {
-	      if (inst->DstReg.WriteMask & (1 << j))
-		 dst[j] = get_dst_reg(c, inst, j);
-	      else
-		 dst[j] = brw_null_reg();
-	   }
-	}
-	for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
-	    get_argument_regs(c, inst, j, dst, args[j], WRITEMASK_XYZW);
-
-	dst_flags = inst->DstReg.WriteMask;
-	if (inst->SaturateMode == SATURATE_ZERO_ONE)
-	    dst_flags |= SATURATE;
-
-	if (inst->CondUpdate)
-	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-	else
-	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
-
-	switch (inst->Opcode) {
-	    case WM_PIXELXY:
-		emit_pixel_xy(c, dst, dst_flags);
-		break;
-	    case WM_DELTAXY: 
-		emit_delta_xy(p, dst, dst_flags, args[0]);
-		break;
-	    case WM_PIXELW:
-		emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
-		break;	
-	    case WM_LINTERP:
-		emit_linterp(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case WM_PINTERP:
-		emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
-		break;
-	    case WM_CINTERP:
-		emit_cinterp(p, dst, dst_flags, args[0]);
-		break;
-	    case WM_WPOSXY:
-		emit_wpos_xy(c, dst, dst_flags, args[0]);
-		break;
-	    case WM_FB_WRITE:
-		emit_fb_write(c, args[0], args[1], args[2],
-			      INST_AUX_GET_TARGET(inst->Aux),
-			      inst->Aux & INST_AUX_EOT);
-		break;
-	    case WM_FRONTFACING:
-		emit_frontfacing(p, dst, dst_flags);
-		break;
-	    case OPCODE_ADD:
-		emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_ARL:
-		emit_arl(c, inst);
-		break;
-	    case OPCODE_FRC:
-		emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_FLR:
-		emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_LRP:
-		emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
-		break;
-	    case OPCODE_TRUNC:
-		emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_MOV:
-	    case OPCODE_SWZ:
-		emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_DP2:
-		emit_dp2(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_DP3:
-		emit_dp3(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_DP4:
-		emit_dp4(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_XPD:
-		emit_xpd(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_DPH:
-		emit_dph(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_RCP:
-		emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_RSQ:
-		emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_SIN:
-		emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_COS:
-		emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_EX2:
-		emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_LG2:
-		emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_CMP:
-		emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
-		break;
-	    case OPCODE_MIN:	
-		emit_min(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_MAX:	
-		emit_max(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_DDX:
-	    case OPCODE_DDY:
-		emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
-			  args[0]);
-                break;
-	    case OPCODE_SLT:
-		emit_sop(p, dst, dst_flags,
-			 BRW_CONDITIONAL_L, args[0], args[1]);
-		break;
-	    case OPCODE_SLE:
-		emit_sop(p, dst, dst_flags,
-			 BRW_CONDITIONAL_LE, args[0], args[1]);
-		break;
-	    case OPCODE_SGT:
-		emit_sop(p, dst, dst_flags,
-			 BRW_CONDITIONAL_G, args[0], args[1]);
-		break;
-	    case OPCODE_SGE:
-		emit_sop(p, dst, dst_flags,
-			 BRW_CONDITIONAL_GE, args[0], args[1]);
-		break;
-	    case OPCODE_SEQ:
-		emit_sop(p, dst, dst_flags,
-			 BRW_CONDITIONAL_EQ, args[0], args[1]);
-		break;
-	    case OPCODE_SNE:
-		emit_sop(p, dst, dst_flags,
-			 BRW_CONDITIONAL_NEQ, args[0], args[1]);
-		break;
-	    case OPCODE_SSG:
-		emit_sign(p, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_MUL:
-		emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_POW:
-		emit_math2(c, BRW_MATH_FUNCTION_POW,
-			   dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_MAD:
-		emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
-		break;
-	    case OPCODE_TEX:
-		emit_tex(c, dst, dst_flags, args[0],
-			 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
-				 0, 1, 0, 0),
-			 inst->TexSrcTarget,
-			 inst->TexSrcUnit,
-			 (c->key.shadowtex_mask & (1 << inst->TexSrcUnit)) != 0);
-		break;
-	    case OPCODE_TXB:
-		emit_txb(c, dst, dst_flags, args[0],
-			 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
-				 0, 1, 0, 0),
-			 inst->TexSrcTarget,
-			 c->fp->program.Base.SamplerUnits[inst->TexSrcUnit]);
-		break;
-	    case OPCODE_KIL_NV:
-		emit_kil_nv(c);
-		break;
-	    case OPCODE_IF:
-		assert(if_depth < MAX_IF_DEPTH);
-		if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
-		if_depth_in_loop[loop_depth]++;
-		break;
-	    case OPCODE_ELSE:
-		assert(if_depth > 0);
-		if_inst[if_depth-1]  = brw_ELSE(p, if_inst[if_depth-1]);
-		break;
-	    case OPCODE_ENDIF:
-		assert(if_depth > 0);
-		brw_ENDIF(p, if_inst[--if_depth]);
-		if_depth_in_loop[loop_depth]--;
-		break;
-	    case OPCODE_BGNSUB:
-		brw_save_label(p, inst->Comment, p->nr_insn);
-		break;
-	    case OPCODE_ENDSUB:
-		/* no-op */
-		break;
-	    case OPCODE_CAL: 
-		brw_push_insn_state(p);
-		brw_set_mask_control(p, BRW_MASK_DISABLE);
-                brw_set_access_mode(p, BRW_ALIGN_1);
-                brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
-                brw_set_access_mode(p, BRW_ALIGN_16);
-                brw_ADD(p, get_addr_reg(stack_index),
-                         get_addr_reg(stack_index), brw_imm_d(4));
-		brw_save_call(&c->func, inst->Comment, p->nr_insn);
-                brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
-                brw_pop_insn_state(p);
-		break;
-
-	    case OPCODE_RET:
-		brw_push_insn_state(p);
-		brw_set_mask_control(p, BRW_MASK_DISABLE);
-                brw_ADD(p, get_addr_reg(stack_index),
-                        get_addr_reg(stack_index), brw_imm_d(-4));
-                brw_set_access_mode(p, BRW_ALIGN_1);
-                brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
-                brw_set_access_mode(p, BRW_ALIGN_16);
-		brw_pop_insn_state(p);
-
-		break;
-	    case OPCODE_BGNLOOP:
-                /* XXX may need to invalidate the current_constant regs */
-		loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
-		if_depth_in_loop[loop_depth] = 0;
-		break;
-	    case OPCODE_BRK:
-		brw_BREAK(p, if_depth_in_loop[loop_depth]);
-		brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-		break;
-	    case OPCODE_CONT:
-		brw_CONT(p, if_depth_in_loop[loop_depth]);
-		brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-		break;
-	    case OPCODE_ENDLOOP: 
-               {
-                  struct brw_instruction *inst0, *inst1;
-                  GLuint br = 1;
-
-                  if (intel->gen == 5)
-                     br = 2;
-
-		  assert(loop_depth > 0);
-                  loop_depth--;
-                  inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
-                  /* patch all the BREAK/CONT instructions from last BGNLOOP */
-                  while (inst0 > loop_inst[loop_depth]) {
-                     inst0--;
-                     if (inst0->header.opcode == BRW_OPCODE_BREAK &&
-			 inst0->bits3.if_else.jump_count == 0) {
-			inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
-                     }
-                     else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
-			      inst0->bits3.if_else.jump_count == 0) {
-                        inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
-                     }
-                  }
-               }
-               break;
-	    default:
-		printf("unsupported opcode %d (%s) in fragment shader\n",
-		       inst->Opcode, inst->Opcode < MAX_OPCODE ?
-		       _mesa_opcode_string(inst->Opcode) : "unknown");
-	}
-
-	/* Release temporaries containing any unaliased source regs. */
-	release_tmps( c, mark );
-
-	if (inst->CondUpdate)
-	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-	else
-	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    }
-    post_wm_emit(c);
-
-    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-      printf("wm-native:\n");
-      for (i = 0; i < p->nr_insn; i++)
-	 brw_disasm(stdout, &p->store[i], intel->gen);
-      printf("\n");
-    }
-}
-
-/**
- * Do GPU code generation for shaders that use GLSL features such as
- * flow control.  Other shaders will be compiled with the 
- */
-void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
-{
-    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-        printf("brw_wm_glsl_emit:\n");
-    }
-
-    /* initial instruction translation/simplification */
-    brw_wm_pass_fp(c);
-
-    /* actual code generation */
-    brw_wm_emit_glsl(brw, c);
-
-    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-        brw_wm_print_program(c, "brw_wm_glsl_emit done");
-    }
-
-    c->prog_data.total_grf = num_grf_used(c);
-    c->prog_data.total_scratch = 0;
-}
diff --git a/src/mesa/drivers/dri/i965/brw_wm_iz.c b/src/mesa/drivers/dri/i965/brw_wm_iz.c
index 62e5566..471ea1c 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_iz.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_iz.c
@@ -120,14 +120,14 @@ const struct {
  * \param line_aa  AA_NEVER, AA_ALWAYS or AA_SOMETIMES
  * \param lookup  bitmask of IZ_* flags
  */
-void brw_wm_lookup_iz( struct intel_context *intel,
-		       GLuint line_aa,
-		       GLuint lookup,
-		       GLboolean ps_uses_depth,
-		       struct brw_wm_prog_key *key )
+void brw_wm_lookup_iz(struct intel_context *intel,
+		      struct brw_wm_compile *c)
 {
    GLuint reg = 2;
    GLboolean kill_stats_promoted_workaround = GL_FALSE;
+   int lookup = c->key.iz_lookup;
+   bool uses_depth = (c->fp->program.Base.InputsRead &
+		      (1 << FRAG_ATTRIB_WPOS)) != 0;
 
    assert (lookup < IZ_BIT_MAX);
 
@@ -136,36 +136,36 @@ void brw_wm_lookup_iz( struct intel_context *intel,
     * statistics are enabled..." paragraph of 11.5.3.2: Early Depth
     * Test Cases [Pre-DevGT] of the 3D Pipeline - Windower B-Spec.
     */
-   if (intel->stats_wm &&
+   if (c->key.stats_wm &&
        (lookup & IZ_PS_KILL_ALPHATEST_BIT) &&
        wm_iz_table[lookup].mode == P) {
       kill_stats_promoted_workaround = GL_TRUE;
    }
 
    if (lookup & IZ_PS_COMPUTES_DEPTH_BIT)
-      key->computes_depth = 1;
+      c->computes_depth = 1;
 
-   if (wm_iz_table[lookup].sd_present || ps_uses_depth ||
+   if (wm_iz_table[lookup].sd_present || uses_depth ||
        kill_stats_promoted_workaround) {
-      key->source_depth_reg = reg;
+      c->source_depth_reg = reg;
       reg += 2;
    }
 
    if (wm_iz_table[lookup].sd_to_rt || kill_stats_promoted_workaround)
-      key->source_depth_to_render_target = 1;
+      c->source_depth_to_render_target = 1;
 
-   if (wm_iz_table[lookup].ds_present || line_aa != AA_NEVER) {
-      key->aa_dest_stencil_reg = reg;
-      key->runtime_check_aads_emit = (!wm_iz_table[lookup].ds_present &&
-				      line_aa == AA_SOMETIMES);
+   if (wm_iz_table[lookup].ds_present || c->key.line_aa != AA_NEVER) {
+      c->aa_dest_stencil_reg = reg;
+      c->runtime_check_aads_emit = (!wm_iz_table[lookup].ds_present &&
+				    c->key.line_aa == AA_SOMETIMES);
       reg++;
    }
 
    if (wm_iz_table[lookup].dd_present) {
-      key->dest_depth_reg = reg;
+      c->dest_depth_reg = reg;
       reg+=2;
    }
 
-   key->nr_payload_regs = reg;
+   c->nr_payload_regs = reg;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass0.c b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
index 8315252..f78bdc3 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass0.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
@@ -380,7 +380,7 @@ static void pass0_init_payload( struct brw_wm_compile *c )
    GLuint i;
 
    for (i = 0; i < 4; i++) {
-      GLuint j = i >= (c->key.nr_payload_regs + 1) / 2 ? 0 : i;
+      GLuint j = i >= (c->nr_payload_regs + 1) / 2 ? 0 : i;
       pass0_set_fpreg_value( c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, 
 			     &c->payload.depth[j] );
    }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass1.c b/src/mesa/drivers/dri/i965/brw_wm_pass1.c
index 3a2874b..7d6a3fa 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass1.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass1.c
@@ -128,8 +128,7 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       if (inst->opcode == WM_FB_WRITE) {
 	 track_arg(c, inst, 0, WRITEMASK_XYZW); 
 	 track_arg(c, inst, 1, WRITEMASK_XYZW); 
-	 if (c->key.source_depth_to_render_target &&
-	     c->key.computes_depth)
+	 if (c->source_depth_to_render_target && c->computes_depth)
 	    track_arg(c, inst, 2, WRITEMASK_Z); 
 	 else
 	    track_arg(c, inst, 2, 0); 
@@ -281,7 +280,6 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 
       case OPCODE_DST:
       case WM_FRONTFACING:
-      case OPCODE_KIL_NV:
       default:
 	 break;
       }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass2.c b/src/mesa/drivers/dri/i965/brw_wm_pass2.c
index 44e3953..8c2b9e7 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass2.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass2.c
@@ -69,6 +69,8 @@ static void prealloc_reg(struct brw_wm_compile *c,
  */
 static void init_registers( struct brw_wm_compile *c )
 {
+   struct brw_context *brw = c->func.brw;
+   struct intel_context *intel = &brw->intel;
    GLuint nr_interp_regs = 0;
    GLuint i = 0;
    GLuint j;
@@ -76,32 +78,41 @@ static void init_registers( struct brw_wm_compile *c )
    for (j = 0; j < c->grf_limit; j++) 
       c->pass2_grf[j].nextuse = BRW_WM_MAX_INSN;
 
-   for (j = 0; j < (c->key.nr_payload_regs + 1) / 2; j++)
+   for (j = 0; j < (c->nr_payload_regs + 1) / 2; j++)
       prealloc_reg(c, &c->payload.depth[j], i++);
 
    for (j = 0; j < c->nr_creg; j++) 
       prealloc_reg(c, &c->creg[j], i++);
 
-   for (j = 0; j < VERT_RESULT_MAX; j++) {
-      if (c->key.vp_outputs_written & BITFIELD64_BIT(j)) {
-	 int fp_index;
-
-	 if (j >= VERT_RESULT_VAR0)
-	    fp_index = j - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
-	 else if (j <= VERT_RESULT_TEX7)
-	    fp_index = j;
-	 else
-	    fp_index = -1;
-
-	 nr_interp_regs++;
-	 if (fp_index >= 0)
-	    prealloc_reg(c, &c->payload.input_interp[fp_index], i++);
+   if (intel->gen >= 6) {
+      for (unsigned int j = 0; j < FRAG_ATTRIB_MAX; j++) {
+	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(j)) {
+	    nr_interp_regs++;
+	    prealloc_reg(c, &c->payload.input_interp[j], i++);
+	 }
+      }
+   } else {
+      for (j = 0; j < VERT_RESULT_MAX; j++) {
+	 if (c->key.vp_outputs_written & BITFIELD64_BIT(j)) {
+	    int fp_index;
+
+	    if (j >= VERT_RESULT_VAR0)
+	       fp_index = j - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
+	    else if (j <= VERT_RESULT_TEX7)
+	       fp_index = j;
+	    else
+	       fp_index = -1;
+
+	    nr_interp_regs++;
+	    if (fp_index >= 0)
+	       prealloc_reg(c, &c->payload.input_interp[fp_index], i++);
+	 }
       }
+      assert(nr_interp_regs >= 1);
    }
 
-   assert(nr_interp_regs >= 1);
 
-   c->prog_data.first_curbe_grf = ALIGN(c->key.nr_payload_regs, 2);
+   c->prog_data.first_curbe_grf = ALIGN(c->nr_payload_regs, 2);
    c->prog_data.urb_read_length = nr_interp_regs * 2;
    c->prog_data.curb_read_length = c->nr_creg * 2;
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
index fea96d3..30672b4 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
@@ -69,12 +69,43 @@ static GLuint translate_wrap_mode( GLenum wrap )
 static drm_intel_bo *upload_default_color( struct brw_context *brw,
 				     const GLfloat *color )
 {
-   struct brw_sampler_default_color sdc;
+   struct intel_context *intel = &brw->intel;
 
-   COPY_4V(sdc.color, color); 
-   
-   return brw_cache_data(&brw->cache, BRW_SAMPLER_DEFAULT_COLOR,
-			 &sdc, sizeof(sdc));
+   if (intel->gen >= 5) {
+      struct gen5_sampler_default_color sdc;
+
+      memset(&sdc, 0, sizeof(sdc));
+
+      UNCLAMPED_FLOAT_TO_UBYTE(sdc.ub[0], color[0]);
+      UNCLAMPED_FLOAT_TO_UBYTE(sdc.ub[1], color[1]);
+      UNCLAMPED_FLOAT_TO_UBYTE(sdc.ub[2], color[2]);
+      UNCLAMPED_FLOAT_TO_UBYTE(sdc.ub[3], color[3]);
+
+      UNCLAMPED_FLOAT_TO_USHORT(sdc.us[0], color[0]);
+      UNCLAMPED_FLOAT_TO_USHORT(sdc.us[1], color[1]);
+      UNCLAMPED_FLOAT_TO_USHORT(sdc.us[2], color[2]);
+      UNCLAMPED_FLOAT_TO_USHORT(sdc.us[3], color[3]);
+
+      UNCLAMPED_FLOAT_TO_SHORT(sdc.s[0], color[0]);
+      UNCLAMPED_FLOAT_TO_SHORT(sdc.s[1], color[1]);
+      UNCLAMPED_FLOAT_TO_SHORT(sdc.s[2], color[2]);
+      UNCLAMPED_FLOAT_TO_SHORT(sdc.s[3], color[3]);
+
+      /* XXX: half-float values are not filled in yet (left as memset zero). */
+      /* XXX: signed-byte values are not filled in yet (left as memset zero). */
+
+      COPY_4V(sdc.f, color);
+
+      return brw_cache_data(&brw->cache, BRW_SAMPLER_DEFAULT_COLOR,
+			    &sdc, sizeof(sdc));
+   } else {
+      struct brw_sampler_default_color sdc;
+
+      COPY_4V(sdc.color, color);
+
+      return brw_cache_data(&brw->cache, BRW_SAMPLER_DEFAULT_COLOR,
+			    &sdc, sizeof(sdc));
+   }
 }
 
 
@@ -245,9 +276,8 @@ brw_wm_sampler_populate_key(struct brw_context *brw,
 	 struct wm_sampler_entry *entry = &key->sampler[unit];
 	 struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
 	 struct gl_texture_object *texObj = texUnit->_Current;
-	 struct intel_texture_object *intelObj = intel_texture_object(texObj);
 	 struct gl_texture_image *firstImage =
-	    texObj->Image[0][intelObj->firstLevel];
+	    texObj->Image[0][texObj->BaseLevel];
 
 	 memset(last_entry_end, 0, 
 		(char*)entry - last_entry_end + sizeof(*entry));
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index 76de7b7..e9ef635 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -87,7 +87,6 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 {
    struct gl_context *ctx = &brw->intel.ctx;
    const struct gl_fragment_program *fp = brw->fragment_program;
-   const struct brw_fragment_program *bfp = (struct brw_fragment_program *) fp;
    struct intel_context *intel = &brw->intel;
 
    memset(key, 0, sizeof(*key));
@@ -132,7 +131,6 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 
    /* _NEW_COLOR */
    key->uses_kill = fp->UsesKill || ctx->Color.AlphaEnabled;
-   key->is_glsl = bfp->isGLSL;
 
    /* If using the fragment shader backend, the program is always
     * 8-wide.
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 76fc94d..233fe3b 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -42,7 +42,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-
+#include "brw_wm.h"
 
 static GLuint translate_tex_target( GLenum target )
 {
@@ -68,104 +68,72 @@ static GLuint translate_tex_target( GLenum target )
    }
 }
 
+static const uint32_t brw_format_for_mesa_format[MESA_FORMAT_COUNT] =
+{  /* gl_format -> BRW_SURFACEFORMAT_*; unlisted entries stay 0 = "no translation" */
+   [MESA_FORMAT_L8] = BRW_SURFACEFORMAT_L8_UNORM,
+   [MESA_FORMAT_I8] = BRW_SURFACEFORMAT_I8_UNORM,
+   [MESA_FORMAT_A8] = BRW_SURFACEFORMAT_A8_UNORM,
+   [MESA_FORMAT_AL88] = BRW_SURFACEFORMAT_L8A8_UNORM,
+   [MESA_FORMAT_AL1616] = BRW_SURFACEFORMAT_L16A16_UNORM,
+   [MESA_FORMAT_R8] = BRW_SURFACEFORMAT_R8_UNORM,
+   [MESA_FORMAT_R16] = BRW_SURFACEFORMAT_R16_UNORM,
+   [MESA_FORMAT_RG88] = BRW_SURFACEFORMAT_R8G8_UNORM,
+   [MESA_FORMAT_RG1616] = BRW_SURFACEFORMAT_R16G16_UNORM,
+   [MESA_FORMAT_ARGB8888] = BRW_SURFACEFORMAT_B8G8R8A8_UNORM,
+   [MESA_FORMAT_XRGB8888] = BRW_SURFACEFORMAT_B8G8R8X8_UNORM,
+   [MESA_FORMAT_RGB565] = BRW_SURFACEFORMAT_B5G6R5_UNORM,
+   [MESA_FORMAT_ARGB1555] = BRW_SURFACEFORMAT_B5G5R5A1_UNORM,
+   [MESA_FORMAT_ARGB4444] = BRW_SURFACEFORMAT_B4G4R4A4_UNORM,
+   [MESA_FORMAT_YCBCR_REV] = BRW_SURFACEFORMAT_YCRCB_NORMAL,
+   [MESA_FORMAT_YCBCR] = BRW_SURFACEFORMAT_YCRCB_SWAPUVY,
+   [MESA_FORMAT_RGB_FXT1] = BRW_SURFACEFORMAT_FXT1,
+   [MESA_FORMAT_RGBA_FXT1] = BRW_SURFACEFORMAT_FXT1,
+   [MESA_FORMAT_RGB_DXT1] = BRW_SURFACEFORMAT_DXT1_RGB,
+   [MESA_FORMAT_RGBA_DXT1] = BRW_SURFACEFORMAT_BC1_UNORM,
+   [MESA_FORMAT_RGBA_DXT3] = BRW_SURFACEFORMAT_BC2_UNORM,
+   [MESA_FORMAT_RGBA_DXT5] = BRW_SURFACEFORMAT_BC3_UNORM,
+   [MESA_FORMAT_SRGB_DXT1] = BRW_SURFACEFORMAT_DXT1_RGB_SRGB,
+   [MESA_FORMAT_SRGBA_DXT1] = BRW_SURFACEFORMAT_BC1_UNORM_SRGB,
+   [MESA_FORMAT_SRGBA_DXT3] = BRW_SURFACEFORMAT_BC2_UNORM_SRGB,
+   [MESA_FORMAT_SRGBA_DXT5] = BRW_SURFACEFORMAT_BC3_UNORM_SRGB,
+   [MESA_FORMAT_SARGB8] = BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB,
+   [MESA_FORMAT_SLA8] = BRW_SURFACEFORMAT_L8A8_UNORM_SRGB,
+   [MESA_FORMAT_SL8] = BRW_SURFACEFORMAT_L8_UNORM_SRGB,
+   [MESA_FORMAT_DUDV8] = BRW_SURFACEFORMAT_R8G8_SNORM,
+   [MESA_FORMAT_SIGNED_RGBA8888_REV] = BRW_SURFACEFORMAT_R8G8B8A8_SNORM,
+};
+
+bool
+brw_render_target_supported(gl_format format)
+{
+   switch (format) {
+   case MESA_FORMAT_S8_Z24:
+   case MESA_FORMAT_X8_Z24:
+   case MESA_FORMAT_Z16:
+      return true;   /* depth/stencil: accepted without a table entry */
+   default:
+      /* "Translatable" only approximates "renderable": some formats in
+       * the table cannot actually be rendered to. */
+      return brw_format_for_mesa_format[format] != 0;
+   }
+}
 
 static GLuint translate_tex_format( gl_format mesa_format,
                                     GLenum internal_format,
 				    GLenum depth_mode )
 {
    switch( mesa_format ) {
-   case MESA_FORMAT_L8:
-      return BRW_SURFACEFORMAT_L8_UNORM;
-
-   case MESA_FORMAT_I8:
-      return BRW_SURFACEFORMAT_I8_UNORM;
-
-   case MESA_FORMAT_A8:
-      return BRW_SURFACEFORMAT_A8_UNORM; 
-
-   case MESA_FORMAT_AL88:
-      return BRW_SURFACEFORMAT_L8A8_UNORM;
-
-   case MESA_FORMAT_AL1616:
-      return BRW_SURFACEFORMAT_L16A16_UNORM;
-
-   case MESA_FORMAT_R8:
-      return BRW_SURFACEFORMAT_R8_UNORM;
-
-   case MESA_FORMAT_R16:
-      return BRW_SURFACEFORMAT_R16_UNORM;
-
-   case MESA_FORMAT_RG88:
-      return BRW_SURFACEFORMAT_R8G8_UNORM;
-
-   case MESA_FORMAT_RG1616:
-      return BRW_SURFACEFORMAT_R16G16_UNORM;
-
-   case MESA_FORMAT_RGB888:
-      assert(0);		/* not supported for sampling */
-      return BRW_SURFACEFORMAT_R8G8B8_UNORM;      
-
-   case MESA_FORMAT_ARGB8888:
-      return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
-
-   case MESA_FORMAT_XRGB8888:
-      return BRW_SURFACEFORMAT_B8G8R8X8_UNORM;
-
-   case MESA_FORMAT_RGBA8888_REV:
-      _mesa_problem(NULL, "unexpected format in i965:translate_tex_format()");
-      return BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
-
-   case MESA_FORMAT_RGB565:
-      return BRW_SURFACEFORMAT_B5G6R5_UNORM;
-
-   case MESA_FORMAT_ARGB1555:
-      return BRW_SURFACEFORMAT_B5G5R5A1_UNORM;
-
-   case MESA_FORMAT_ARGB4444:
-      return BRW_SURFACEFORMAT_B4G4R4A4_UNORM;
-
-   case MESA_FORMAT_YCBCR_REV:
-      return BRW_SURFACEFORMAT_YCRCB_NORMAL;
-
-   case MESA_FORMAT_YCBCR:
-      return BRW_SURFACEFORMAT_YCRCB_SWAPUVY;
-
-   case MESA_FORMAT_RGB_FXT1:
-   case MESA_FORMAT_RGBA_FXT1:
-      return BRW_SURFACEFORMAT_FXT1;
 
    case MESA_FORMAT_Z16:
       if (depth_mode == GL_INTENSITY) 
 	  return BRW_SURFACEFORMAT_I16_UNORM;
       else if (depth_mode == GL_ALPHA)
 	  return BRW_SURFACEFORMAT_A16_UNORM;
+      else if (depth_mode == GL_RED)
+	  return BRW_SURFACEFORMAT_R16_UNORM;
       else
 	  return BRW_SURFACEFORMAT_L16_UNORM;
 
-   case MESA_FORMAT_RGB_DXT1:
-       return BRW_SURFACEFORMAT_DXT1_RGB;
-
-   case MESA_FORMAT_RGBA_DXT1:
-       return BRW_SURFACEFORMAT_BC1_UNORM;
-       
-   case MESA_FORMAT_RGBA_DXT3:
-       return BRW_SURFACEFORMAT_BC2_UNORM;
-       
-   case MESA_FORMAT_RGBA_DXT5:
-       return BRW_SURFACEFORMAT_BC3_UNORM;
-
-   case MESA_FORMAT_SARGB8:
-      return BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB;
-
-   case MESA_FORMAT_SLA8:
-      return BRW_SURFACEFORMAT_L8A8_UNORM_SRGB;
-
-   case MESA_FORMAT_SL8:
-      return BRW_SURFACEFORMAT_L8_UNORM_SRGB;
-
-   case MESA_FORMAT_SRGB_DXT1:
-      return BRW_SURFACEFORMAT_BC1_UNORM_SRGB;
-
    case MESA_FORMAT_S8_Z24:
       /* XXX: these different surface formats don't seem to
        * make any difference for shadow sampler/compares.
@@ -174,18 +142,14 @@ static GLuint translate_tex_format( gl_format mesa_format,
          return BRW_SURFACEFORMAT_I24X8_UNORM;
       else if (depth_mode == GL_ALPHA)
          return BRW_SURFACEFORMAT_A24X8_UNORM;
+      else if (depth_mode == GL_RED)
+         return BRW_SURFACEFORMAT_R24_UNORM_X8_TYPELESS;
       else
          return BRW_SURFACEFORMAT_L24X8_UNORM;
 
-   case MESA_FORMAT_DUDV8:
-      return BRW_SURFACEFORMAT_R8G8_SNORM;
-
-   case MESA_FORMAT_SIGNED_RGBA8888_REV:
-      return BRW_SURFACEFORMAT_R8G8B8A8_SNORM;
-
    default:
-      assert(0);
-      return 0;
+      assert(brw_format_for_mesa_format[mesa_format] != 0);
+      return brw_format_for_mesa_format[mesa_format];
    }
 }
 
@@ -214,7 +178,7 @@ brw_update_texture_surface( struct gl_context *ctx, GLuint unit )
    struct brw_context *brw = brw_context(ctx);
    struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
    struct intel_texture_object *intelObj = intel_texture_object(tObj);
-   struct gl_texture_image *firstImage = tObj->Image[0][intelObj->firstLevel];
+   struct gl_texture_image *firstImage = tObj->Image[0][tObj->BaseLevel];
    const GLuint surf_index = SURF_INDEX_TEXTURE(unit);
    struct brw_surface_state surf;
    void *map;
@@ -232,7 +196,7 @@ brw_update_texture_surface( struct gl_context *ctx, GLuint unit )
 /*    surf.ss0.data_return_format = BRW_SURFACERETURNFORMAT_S1; */
    surf.ss1.base_addr = intelObj->mt->region->buffer->offset; /* reloc */
 
-   surf.ss2.mip_count = intelObj->lastLevel - intelObj->firstLevel;
+   surf.ss2.mip_count = intelObj->_MaxLevel - tObj->BaseLevel;
    surf.ss2.width = firstImage->Width - 1;
    surf.ss2.height = firstImage->Height - 1;
    brw_set_surface_tiling(&surf, intelObj->mt->region->tiling);
@@ -274,6 +238,7 @@ brw_create_constant_surface(struct brw_context *brw,
 			    drm_intel_bo **out_bo,
 			    uint32_t *out_offset)
 {
+   struct intel_context *intel = &brw->intel;
    const GLint w = width - 1;
    struct brw_surface_state surf;
    void *map;
@@ -284,6 +249,9 @@ brw_create_constant_surface(struct brw_context *brw,
    surf.ss0.surface_type = BRW_SURFACE_BUFFER;
    surf.ss0.surface_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
 
+   if (intel->gen >= 6)
+      surf.ss0.render_cache_read_write = 1;
+
    assert(bo);
    surf.ss1.base_addr = bo->offset; /* reloc */
 
@@ -404,6 +372,38 @@ const struct brw_tracked_state brw_wm_constant_surface = {
    .emit = upload_wm_constant_surface,
 };
 
+static void
+brw_update_null_renderbuffer_surface(struct brw_context *brw, unsigned int unit)
+{
+   struct brw_surface_state null_surf;
+   void *dst;
+   /* Emits a BRW_SURFACE_NULL surface state into wm surface slot `unit`.
+    * Starts all-zero; the zero stores below only spell out intent. */
+   memset(&null_surf, 0, sizeof(null_surf));
+
+   null_surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+   null_surf.ss0.surface_type = BRW_SURFACE_NULL;
+   null_surf.ss1.base_addr = 0;   /* no buffer: no relocation is emitted */
+
+   null_surf.ss2.height = 0;
+   null_surf.ss2.width = 0;
+   brw_set_surface_tiling(&null_surf, I915_TILING_NONE);
+   null_surf.ss3.pitch = 0;
+
+   /* _NEW_COLOR: before gen6 the surface state itself turns blending off
+    * and write-disables every color channel. */
+   if (brw->intel.gen < 6) {
+      null_surf.ss0.writedisable_alpha = 1;
+      null_surf.ss0.writedisable_blue =  1;
+      null_surf.ss0.writedisable_green = 1;
+      null_surf.ss0.writedisable_red =   1;
+      null_surf.ss0.color_blend = 0;
+   }
+
+   dst = brw_state_batch(brw, sizeof(null_surf), 32,
+			 &brw->wm.surf_bo[unit], &brw->wm.surf_offset[unit]);
+   memcpy(dst, &null_surf, sizeof(null_surf));
+}
 
 /**
  * Sets up a surface state structure to point at the given region.
@@ -417,123 +417,53 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
 {
    struct intel_context *intel = &brw->intel;
    struct gl_context *ctx = &intel->ctx;
-   drm_intel_bo *region_bo = NULL;
    struct intel_renderbuffer *irb = intel_renderbuffer(rb);
-   struct intel_region *region = irb ? irb->region : NULL;
-   struct {
-      unsigned int surface_type;
-      unsigned int surface_format;
-      unsigned int width, height, pitch, cpp;
-      GLubyte color_mask[4];
-      GLboolean color_blend;
-      uint32_t tiling;
-      uint32_t draw_x;
-      uint32_t draw_y;
-   } key;
+   struct intel_region *region = irb->region;
    struct brw_surface_state surf;
    void *map;
 
-   memset(&key, 0, sizeof(key));
-
-   if (region != NULL) {
-      region_bo = region->buffer;
-
-      key.surface_type = BRW_SURFACE_2D;
-      switch (irb->Base.Format) {
-      /* XRGB and ARGB are treated the same here because the chips in this
-       * family cannot render to XRGB targets.  This means that we have to
-       * mask writes to alpha (ala glColorMask) and reconfigure the alpha
-       * blending hardware to use GL_ONE (or GL_ZERO) for cases where
-       * GL_DST_ALPHA (or GL_ONE_MINUS_DST_ALPHA) is used.
-       */
-      case MESA_FORMAT_ARGB8888:
-      case MESA_FORMAT_XRGB8888:
-	 key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
-	 break;
-      case MESA_FORMAT_SARGB8:
-	 key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB;
-	 break;
-      case MESA_FORMAT_RGB565:
-	 key.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
-	 break;
-      case MESA_FORMAT_ARGB1555:
-	 key.surface_format = BRW_SURFACEFORMAT_B5G5R5A1_UNORM;
-	 break;
-      case MESA_FORMAT_ARGB4444:
-	 key.surface_format = BRW_SURFACEFORMAT_B4G4R4A4_UNORM;
-	 break;
-      case MESA_FORMAT_A8:
-	 key.surface_format = BRW_SURFACEFORMAT_A8_UNORM;
-	 break;
-      case MESA_FORMAT_R8:
-	 key.surface_format = BRW_SURFACEFORMAT_R8_UNORM;
-	 break;
-      case MESA_FORMAT_R16:
-	 key.surface_format = BRW_SURFACEFORMAT_R16_UNORM;
-	 break;
-      case MESA_FORMAT_RG88:
-	 key.surface_format = BRW_SURFACEFORMAT_R8G8_UNORM;
-	 break;
-      case MESA_FORMAT_RG1616:
-	 key.surface_format = BRW_SURFACEFORMAT_R16G16_UNORM;
-	 break;
-      default:
-	 _mesa_problem(ctx, "Bad renderbuffer format: %d\n", irb->Base.Format);
-      }
-      key.tiling = region->tiling;
-      key.width = rb->Width;
-      key.height = rb->Height;
-      key.pitch = region->pitch;
-      key.cpp = region->cpp;
-      key.draw_x = region->draw_x;
-      key.draw_y = region->draw_y;
-   } else {
-      key.surface_type = BRW_SURFACE_NULL;
-      key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
-      key.tiling = I915_TILING_X;
-      key.width = 1;
-      key.height = 1;
-      key.cpp = 4;
-      key.draw_x = 0;
-      key.draw_y = 0;
-   }
-
-   if (intel->gen < 6) {
-      /* _NEW_COLOR */
-      memcpy(key.color_mask, ctx->Color.ColorMask[unit],
-	     sizeof(key.color_mask));
+   memset(&surf, 0, sizeof(surf));
 
-      /* As mentioned above, disable writes to the alpha component when the
-       * renderbuffer is XRGB.
+   switch (irb->Base.Format) {
+   case MESA_FORMAT_XRGB8888:
+      /* XRGB is handled as ARGB because the chips in this family
+       * cannot render to XRGB targets.  This means that we have to
+       * mask writes to alpha (ala glColorMask) and reconfigure the
+       * alpha blending hardware to use GL_ONE (or GL_ZERO) for
+       * cases where GL_DST_ALPHA (or GL_ONE_MINUS_DST_ALPHA) is
+       * used.
        */
-      if (ctx->DrawBuffer->Visual.alphaBits == 0)
-	 key.color_mask[3] = GL_FALSE;
-
-      key.color_blend = (!ctx->Color._LogicOpEnabled &&
-			 (ctx->Color.BlendEnabled & (1 << unit)));
+      surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+      break;
+   case MESA_FORMAT_SARGB8:
+      /* Without GL_EXT_framebuffer_sRGB, an sRGB renderbuffer must not be
+       * bound as sRGB for blending/writes, so use the plain UNORM format. */
+      surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+      break;
+   default:
+      surf.ss0.surface_format = brw_format_for_mesa_format[irb->Base.Format];
+      assert(surf.ss0.surface_format != 0);
    }
 
-   memset(&surf, 0, sizeof(surf));
-
-   surf.ss0.surface_format = key.surface_format;
-   surf.ss0.surface_type = key.surface_type;
-   if (key.tiling == I915_TILING_NONE) {
-      surf.ss1.base_addr = (key.draw_x + key.draw_y * key.pitch) * key.cpp;
+   surf.ss0.surface_type = BRW_SURFACE_2D;
+   if (region->tiling == I915_TILING_NONE) {
+      surf.ss1.base_addr = (region->draw_x +
+			    region->draw_y * region->pitch) * region->cpp;
    } else {
       uint32_t tile_base, tile_x, tile_y;
-      uint32_t pitch = key.pitch * key.cpp;
+      uint32_t pitch = region->pitch * region->cpp;
 
-      if (key.tiling == I915_TILING_X) {
-	 tile_x = key.draw_x % (512 / key.cpp);
-	 tile_y = key.draw_y % 8;
-	 tile_base = ((key.draw_y / 8) * (8 * pitch));
-	 tile_base += (key.draw_x - tile_x) / (512 / key.cpp) * 4096;
+      if (region->tiling == I915_TILING_X) {
+	 tile_x = region->draw_x % (512 / region->cpp);
+	 tile_y = region->draw_y % 8;
+	 tile_base = ((region->draw_y / 8) * (8 * pitch));
+	 tile_base += (region->draw_x - tile_x) / (512 / region->cpp) * 4096;
       } else {
 	 /* Y */
-	 tile_x = key.draw_x % (128 / key.cpp);
-	 tile_y = key.draw_y % 32;
-	 tile_base = ((key.draw_y / 32) * (32 * pitch));
-	 tile_base += (key.draw_x - tile_x) / (128 / key.cpp) * 4096;
+	 tile_x = region->draw_x % (128 / region->cpp);
+	 tile_y = region->draw_y % 32;
+	 tile_base = ((region->draw_y / 32) * (32 * pitch));
+	 tile_base += (region->draw_x - tile_x) / (128 / region->cpp) * 4096;
       }
       assert(brw->has_surface_tile_offset || (tile_x == 0 && tile_y == 0));
       assert(tile_x % 4 == 0);
@@ -545,21 +475,27 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
       surf.ss5.x_offset = tile_x / 4;
       surf.ss5.y_offset = tile_y / 2;
    }
-   if (region_bo != NULL)
-      surf.ss1.base_addr += region_bo->offset; /* reloc */
+   surf.ss1.base_addr += region->buffer->offset; /* reloc */
 
-   surf.ss2.width = key.width - 1;
-   surf.ss2.height = key.height - 1;
-   brw_set_surface_tiling(&surf, key.tiling);
-   surf.ss3.pitch = (key.pitch * key.cpp) - 1;
+   surf.ss2.width = rb->Width - 1;
+   surf.ss2.height = rb->Height - 1;
+   brw_set_surface_tiling(&surf, region->tiling);
+   surf.ss3.pitch = (region->pitch * region->cpp) - 1;
 
    if (intel->gen < 6) {
       /* _NEW_COLOR */
-      surf.ss0.color_blend = key.color_blend;
-      surf.ss0.writedisable_red =   !key.color_mask[0];
-      surf.ss0.writedisable_green = !key.color_mask[1];
-      surf.ss0.writedisable_blue =  !key.color_mask[2];
-      surf.ss0.writedisable_alpha = !key.color_mask[3];
+      surf.ss0.color_blend = (!ctx->Color._LogicOpEnabled &&
+			      (ctx->Color.BlendEnabled & (1 << unit)));
+      surf.ss0.writedisable_red =   !ctx->Color.ColorMask[unit][0];
+      surf.ss0.writedisable_green = !ctx->Color.ColorMask[unit][1];
+      surf.ss0.writedisable_blue =  !ctx->Color.ColorMask[unit][2];
+      /* As mentioned above, disable writes to the alpha component when the
+       * renderbuffer is XRGB.
+       */
+      if (ctx->DrawBuffer->Visual.alphaBits == 0)
+	 surf.ss0.writedisable_alpha = 1;
+      else
+	 surf.ss0.writedisable_alpha = !ctx->Color.ColorMask[unit][3];
    }
 
    map = brw_state_batch(brw, sizeof(surf), 32,
@@ -567,15 +503,13 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
 			 &brw->wm.surf_offset[unit]);
    memcpy(map, &surf, sizeof(surf));
 
-   if (region_bo != NULL) {
-      drm_intel_bo_emit_reloc(brw->wm.surf_bo[unit],
-			      brw->wm.surf_offset[unit] +
-			      offsetof(struct brw_surface_state, ss1),
-			      region_bo,
-			      surf.ss1.base_addr - region_bo->offset,
-			      I915_GEM_DOMAIN_RENDER,
-			      I915_GEM_DOMAIN_RENDER);
-   }
+   drm_intel_bo_emit_reloc(brw->wm.surf_bo[unit],
+			   brw->wm.surf_offset[unit] +
+			   offsetof(struct brw_surface_state, ss1),
+			   region->buffer,
+			   surf.ss1.base_addr - region->buffer->offset,
+			   I915_GEM_DOMAIN_RENDER,
+			   I915_GEM_DOMAIN_RENDER);
 }
 
 static void
@@ -635,12 +569,16 @@ upload_wm_surfaces(struct brw_context *brw)
    /* Update surfaces for drawing buffers */
    if (ctx->DrawBuffer->_NumColorDrawBuffers >= 1) {
       for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
-         brw_update_renderbuffer_surface(brw,
-					 ctx->DrawBuffer->_ColorDrawBuffers[i],
-					 i);
+	 if (intel_renderbuffer(ctx->DrawBuffer->_ColorDrawBuffers[i])) {
+	    brw_update_renderbuffer_surface(brw,
+					    ctx->DrawBuffer->_ColorDrawBuffers[i],
+					    i);
+	 } else {
+	    brw_update_null_renderbuffer_surface(brw, i);
+	 }
       }
    } else {
-      brw_update_renderbuffer_surface(brw, NULL, 0);
+      brw_update_null_renderbuffer_surface(brw, 0);
    }
 
    /* Update surfaces for textures */
diff --git a/src/mesa/drivers/dri/i965/gen6_cc.c b/src/mesa/drivers/dri/i965/gen6_cc.c
index 800a255..dbcdc5b 100644
--- a/src/mesa/drivers/dri/i965/gen6_cc.c
+++ b/src/mesa/drivers/dri/i965/gen6_cc.c
@@ -35,6 +35,7 @@
 struct gen6_blend_state_key {
    GLboolean color_blend, alpha_enabled;
    GLboolean dither;
+   GLboolean color_mask[BRW_MAX_DRAW_BUFFERS][4];
 
    GLenum logic_op;
 
@@ -54,6 +55,9 @@ blend_state_populate_key(struct brw_context *brw,
    memset(key, 0, sizeof(*key));
 
    /* _NEW_COLOR */
+   memcpy(key->color_mask, ctx->Color.ColorMask, sizeof(key->color_mask));
+
+   /* _NEW_COLOR */
    if (ctx->Color._LogicOpEnabled)
       key->logic_op = ctx->Color.LogicOp;
    else
@@ -87,54 +91,62 @@ static drm_intel_bo *
 blend_state_create_from_key(struct brw_context *brw,
 			    struct gen6_blend_state_key *key)
 {
-   struct gen6_blend_state blend;
+   struct gen6_blend_state blend[BRW_MAX_DRAW_BUFFERS];
    drm_intel_bo *bo;
+   int b;
 
    memset(&blend, 0, sizeof(blend));
 
-   if (key->logic_op != GL_COPY) {
-      blend.blend1.logic_op_enable = 1;
-      blend.blend1.logic_op_func = intel_translate_logic_op(key->logic_op);
-   } else if (key->color_blend) {
-      GLenum eqRGB = key->blend_eq_rgb;
-      GLenum eqA = key->blend_eq_a;
-      GLenum srcRGB = key->blend_src_rgb;
-      GLenum dstRGB = key->blend_dst_rgb;
-      GLenum srcA = key->blend_src_a;
-      GLenum dstA = key->blend_dst_a;
-
-      if (eqRGB == GL_MIN || eqRGB == GL_MAX) {
-	 srcRGB = dstRGB = GL_ONE;
-      }
-
-      if (eqA == GL_MIN || eqA == GL_MAX) {
-	 srcA = dstA = GL_ONE;
+   for (b = 0; b < BRW_MAX_DRAW_BUFFERS; b++) {
+      if (key->logic_op != GL_COPY) {
+	 blend[b].blend1.logic_op_enable = 1;
+	 blend[b].blend1.logic_op_func = intel_translate_logic_op(key->logic_op);
+      } else if (key->color_blend & (1 << b)) {
+	 GLenum eqRGB = key->blend_eq_rgb;
+	 GLenum eqA = key->blend_eq_a;
+	 GLenum srcRGB = key->blend_src_rgb;
+	 GLenum dstRGB = key->blend_dst_rgb;
+	 GLenum srcA = key->blend_src_a;
+	 GLenum dstA = key->blend_dst_a;
+
+	 if (eqRGB == GL_MIN || eqRGB == GL_MAX) {
+	    srcRGB = dstRGB = GL_ONE;
+	 }
+
+	 if (eqA == GL_MIN || eqA == GL_MAX) {
+	    srcA = dstA = GL_ONE;
+	 }
+
+	 blend[b].blend0.dest_blend_factor = brw_translate_blend_factor(dstRGB);
+	 blend[b].blend0.source_blend_factor = brw_translate_blend_factor(srcRGB);
+	 blend[b].blend0.blend_func = brw_translate_blend_equation(eqRGB);
+
+	 blend[b].blend0.ia_dest_blend_factor = brw_translate_blend_factor(dstA);
+	 blend[b].blend0.ia_source_blend_factor = brw_translate_blend_factor(srcA);
+	 blend[b].blend0.ia_blend_func = brw_translate_blend_equation(eqA);
+
+	 blend[b].blend0.blend_enable = 1;
+	 blend[b].blend0.ia_blend_enable = (srcA != srcRGB ||
+					 dstA != dstRGB ||
+					 eqA != eqRGB);
       }
 
-      blend.blend0.dest_blend_factor = brw_translate_blend_factor(dstRGB);
-      blend.blend0.source_blend_factor = brw_translate_blend_factor(srcRGB);
-      blend.blend0.blend_func = brw_translate_blend_equation(eqRGB);
-
-      blend.blend0.ia_dest_blend_factor = brw_translate_blend_factor(dstA);
-      blend.blend0.ia_source_blend_factor = brw_translate_blend_factor(srcA);
-      blend.blend0.ia_blend_func = brw_translate_blend_equation(eqA);
+      if (key->alpha_enabled) {
+	 blend[b].blend1.alpha_test_enable = 1;
+	 blend[b].blend1.alpha_test_func = intel_translate_compare_func(key->alpha_func);
 
-      blend.blend0.blend_enable = 1;
-      blend.blend0.ia_blend_enable = (srcA != srcRGB ||
-				      dstA != dstRGB ||
-				      eqA != eqRGB);
-   }
-
-   if (key->alpha_enabled) {
-      blend.blend1.alpha_test_enable = 1;
-      blend.blend1.alpha_test_func = intel_translate_compare_func(key->alpha_func);
+      }
 
-   }
+      if (key->dither) {
+	 blend[b].blend1.dither_enable = 1;
+	 blend[b].blend1.y_dither_offset = 0;
+	 blend[b].blend1.x_dither_offset = 0;
+      }
 
-   if (key->dither) {
-      blend.blend1.dither_enable = 1;
-      blend.blend1.y_dither_offset = 0;
-      blend.blend1.x_dither_offset = 0;
+      blend[b].blend1.write_disable_r = !key->color_mask[b][0];
+      blend[b].blend1.write_disable_g = !key->color_mask[b][1];
+      blend[b].blend1.write_disable_b = !key->color_mask[b][2];
+      blend[b].blend1.write_disable_a = !key->color_mask[b][3];
    }
 
    bo = brw_upload_cache(&brw->cache, BRW_BLEND_STATE,
@@ -172,7 +184,7 @@ const struct brw_tracked_state gen6_blend_state = {
 };
 
 struct gen6_color_calc_state_key {
-   GLubyte blend_constant_color[4];
+   float blend_constant_color[4];
    GLclampf alpha_ref;
    GLubyte stencil_ref[2];
 };
@@ -266,7 +278,7 @@ static void upload_cc_state_pointers(struct brw_context *brw)
    struct intel_context *intel = &brw->intel;
 
    BEGIN_BATCH(4);
-   OUT_BATCH(CMD_3D_CC_STATE_POINTERS << 16 | (4 - 2));
+   OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (4 - 2));
    OUT_RELOC(brw->cc.blend_state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1);
    OUT_RELOC(brw->cc.depth_stencil_state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1);
    OUT_RELOC(brw->cc.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1);
diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c b/src/mesa/drivers/dri/i965/gen6_clip_state.c
index c65b41e..38c98f3 100644
--- a/src/mesa/drivers/dri/i965/gen6_clip_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c
@@ -43,7 +43,10 @@ upload_clip_state(struct brw_context *brw)
       depth_clamp = GEN6_CLIP_Z_TEST;
 
    if (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION) {
-      provoking = 0;
+      provoking =
+	 (0 << GEN6_CLIP_TRI_PROVOKE_SHIFT) |
+	 (1 << GEN6_CLIP_TRIFAN_PROVOKE_SHIFT) |
+	 (0 << GEN6_CLIP_LINE_PROVOKE_SHIFT);
    } else {
       provoking =
 	 (2 << GEN6_CLIP_TRI_PROVOKE_SHIFT) |
@@ -55,7 +58,7 @@ upload_clip_state(struct brw_context *brw)
    userclip = (1 << brw_count_bits(ctx->Transform.ClipPlanesEnabled)) - 1;
 
    BEGIN_BATCH(4);
-   OUT_BATCH(CMD_3D_CLIP_STATE << 16 | (4 - 2));
+   OUT_BATCH(_3DSTATE_CLIP << 16 | (4 - 2));
    OUT_BATCH(GEN6_CLIP_STATISTICS_ENABLE);
    OUT_BATCH(GEN6_CLIP_ENABLE |
 	     GEN6_CLIP_API_OGL |
@@ -64,7 +67,9 @@ upload_clip_state(struct brw_context *brw)
 	     userclip << GEN6_USER_CLIP_CLIP_DISTANCES_SHIFT |
 	     depth_clamp |
 	     provoking);
-   OUT_BATCH(GEN6_CLIP_FORCE_ZERO_RTAINDEX);
+   OUT_BATCH(U_FIXED(0.125, 3) << GEN6_CLIP_MIN_POINT_WIDTH_SHIFT |
+             U_FIXED(225.875, 3) << GEN6_CLIP_MAX_POINT_WIDTH_SHIFT |
+             GEN6_CLIP_FORCE_ZERO_RTAINDEX);
    ADVANCE_BATCH();
 }
 
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_state.c b/src/mesa/drivers/dri/i965/gen6_gs_state.c
index 6127b91..7296c7c 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_gs_state.c
@@ -37,7 +37,7 @@ upload_gs_state(struct brw_context *brw)
 
    /* Disable all the constant buffers. */
    BEGIN_BATCH(5);
-   OUT_BATCH(CMD_3D_CONSTANT_GS_STATE << 16 | (5 - 2));
+   OUT_BATCH(_3DSTATE_CONSTANT_GS << 16 | (5 - 2));
    OUT_BATCH(0);
    OUT_BATCH(0);
    OUT_BATCH(0);
@@ -46,7 +46,7 @@ upload_gs_state(struct brw_context *brw)
 
    if (brw->gs.prog_bo) {
       BEGIN_BATCH(7);
-      OUT_BATCH(CMD_3D_GS_STATE << 16 | (7 - 2));
+      OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
       OUT_RELOC(brw->gs.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
       OUT_BATCH(GEN6_GS_SPF_MODE |
 		(0 << GEN6_GS_SAMPLER_COUNT_SHIFT) |
@@ -62,7 +62,7 @@ upload_gs_state(struct brw_context *brw)
       ADVANCE_BATCH();
    } else {
       BEGIN_BATCH(7);
-      OUT_BATCH(CMD_3D_GS_STATE << 16 | (7 - 2));
+      OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
       OUT_BATCH(0); /* prog_bo */
       OUT_BATCH((0 << GEN6_GS_SAMPLER_COUNT_SHIFT) |
 		(0 << GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
diff --git a/src/mesa/drivers/dri/i965/gen6_sampler_state.c b/src/mesa/drivers/dri/i965/gen6_sampler_state.c
index fc5d391..f65c651 100644
--- a/src/mesa/drivers/dri/i965/gen6_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sampler_state.c
@@ -36,7 +36,7 @@ upload_sampler_state_pointers(struct brw_context *brw)
    struct intel_context *intel = &brw->intel;
 
    BEGIN_BATCH(4);
-   OUT_BATCH(CMD_3D_SAMPLER_STATE_POINTERS << 16 |
+   OUT_BATCH(_3DSTATE_SAMPLER_STATE_POINTERS << 16 |
 	     VS_SAMPLER_STATE_CHANGE |
 	     GS_SAMPLER_STATE_CHANGE |
 	     PS_SAMPLER_STATE_CHANGE |
diff --git a/src/mesa/drivers/dri/i965/gen6_scissor_state.c b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
index b57126c..12b6582 100644
--- a/src/mesa/drivers/dri/i965/gen6_scissor_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
@@ -92,7 +92,7 @@ static void upload_scissor_state_pointers(struct brw_context *brw)
    struct intel_context *intel = &brw->intel;
 
    BEGIN_BATCH(2);
-   OUT_BATCH(CMD_3D_SCISSOR_STATE_POINTERS << 16 | (2 - 2));
+   OUT_BATCH(_3DSTATE_SCISSOR_STATE_POINTERS << 16 | (2 - 2));
    OUT_RELOC(brw->sf.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
    ADVANCE_BATCH();
 
diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index 471067e..f277829 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -33,9 +33,10 @@
 #include "intel_batchbuffer.h"
 
 static uint32_t
-get_attr_override(struct brw_context *brw, int fs_attr)
+get_attr_override(struct brw_context *brw, int fs_attr, int two_side_color)
 {
    int attr_index = 0, i, vs_attr;
+   int bfc = 0;
 
    if (fs_attr <= FRAG_ATTRIB_TEX7)
       vs_attr = fs_attr;
@@ -57,6 +58,30 @@ get_attr_override(struct brw_context *brw, int fs_attr)
 	 attr_index++;
    }
 
+   assert(attr_index < 32);
+
+   if (two_side_color) {
+       if ((brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_COL1)) &&
+           (brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC1))) {
+           assert(brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0));
+           assert(brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0));
+           bfc = 2;
+       } else if ((brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0)) &&
+                (brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0)))
+           bfc = 1;
+   }
+
+   if (bfc && (fs_attr <= FRAG_ATTRIB_TEX7 && fs_attr > FRAG_ATTRIB_WPOS)) {
+       if (fs_attr == FRAG_ATTRIB_COL0)
+           attr_index |= (ATTRIBUTE_SWIZZLE_INPUTATTR_FACING << ATTRIBUTE_SWIZZLE_SHIFT);
+       else if (fs_attr == FRAG_ATTRIB_COL1 && bfc == 2) {
+           attr_index++;
+           attr_index |= (ATTRIBUTE_SWIZZLE_INPUTATTR_FACING << ATTRIBUTE_SWIZZLE_SHIFT);
+       } else {
+           attr_index += bfc;
+       }
+   }
+
    return attr_index;
 }
 
@@ -67,13 +92,15 @@ upload_sf_state(struct brw_context *brw)
    struct gl_context *ctx = &intel->ctx;
    /* CACHE_NEW_VS_PROG */
    uint32_t num_inputs = brw_count_bits(brw->vs.prog_data->outputs_written);
+   /* BRW_NEW_FRAGMENT_PROGRAM */
    uint32_t num_outputs = brw_count_bits(brw->fragment_program->Base.InputsRead);
-   uint32_t dw1, dw2, dw3, dw4, dw16;
+   uint32_t dw1, dw2, dw3, dw4, dw16, dw17;
    int i;
    /* _NEW_BUFFER */
    GLboolean render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
    int attr = 0;
    int urb_start;
+   int two_side_color = (ctx->Light.Enabled && ctx->Light.Model.TwoSide);
 
    /* _NEW_TRANSFORM */
    if (ctx->Transform.ClipPlanesEnabled)
@@ -91,6 +118,7 @@ upload_sf_state(struct brw_context *brw)
    dw3 = 0;
    dw4 = 0;
    dw16 = 0;
+   dw17 = 0;
 
    /* _NEW_POLYGON */
    if ((ctx->Polygon.FrontFace == GL_CCW) ^ render_to_fbo)
@@ -99,6 +127,48 @@ upload_sf_state(struct brw_context *brw)
    if (ctx->Polygon.OffsetFill)
        dw2 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_SOLID;
 
+   if (ctx->Polygon.OffsetLine)
+       dw2 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_WIREFRAME;
+
+   if (ctx->Polygon.OffsetPoint)
+       dw2 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_POINT;
+
+   switch (ctx->Polygon.FrontMode) {
+   case GL_FILL:
+       dw2 |= GEN6_SF_FRONT_SOLID;
+       break;
+
+   case GL_LINE:
+       dw2 |= GEN6_SF_FRONT_WIREFRAME;
+       break;
+
+   case GL_POINT:
+       dw2 |= GEN6_SF_FRONT_POINT;
+       break;
+
+   default:
+       assert(0);
+       break;
+   }
+
+   switch (ctx->Polygon.BackMode) {
+   case GL_FILL:
+       dw2 |= GEN6_SF_BACK_SOLID;
+       break;
+
+   case GL_LINE:
+       dw2 |= GEN6_SF_BACK_WIREFRAME;
+       break;
+
+   case GL_POINT:
+       dw2 |= GEN6_SF_BACK_POINT;
+       break;
+
+   default:
+       assert(0);
+       break;
+   }
+
    /* _NEW_SCISSOR */
    if (ctx->Scissor.Enabled)
       dw3 |= GEN6_SF_SCISSOR_ENABLE;
@@ -160,8 +230,14 @@ upload_sf_state(struct brw_context *brw)
        }
    }
 
+   /* Flat shading: flag the color inputs read by the FS in the constant interp bitmask. */
+   if (ctx->Light.ShadeModel == GL_FLAT) {
+       dw17 |= ((brw->fragment_program->Base.InputsRead & (FRAG_BIT_COL0 | FRAG_BIT_COL1)) >>
+                ((brw->fragment_program->Base.InputsRead & FRAG_BIT_WPOS) ? 0 : 1));
+   }
+
    BEGIN_BATCH(20);
-   OUT_BATCH(CMD_3D_SF_STATE << 16 | (20 - 2));
+   OUT_BATCH(_3DSTATE_SF << 16 | (20 - 2));
    OUT_BATCH(dw1);
    OUT_BATCH(dw2);
    OUT_BATCH(dw3);
@@ -174,7 +250,7 @@ upload_sf_state(struct brw_context *brw)
 
       for (; attr < 64; attr++) {
 	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(attr)) {
-	    attr_overrides |= get_attr_override(brw, attr);
+	    attr_overrides |= get_attr_override(brw, attr, two_side_color);
 	    attr++;
 	    break;
 	 }
@@ -182,7 +258,7 @@ upload_sf_state(struct brw_context *brw)
 
       for (; attr < 64; attr++) {
 	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(attr)) {
-	    attr_overrides |= get_attr_override(brw, attr) << 16;
+	    attr_overrides |= get_attr_override(brw, attr, two_side_color) << 16;
 	    attr++;
 	    break;
 	 }
@@ -190,7 +266,7 @@ upload_sf_state(struct brw_context *brw)
       OUT_BATCH(attr_overrides);
    }
    OUT_BATCH(dw16); /* point sprite texcoord bitmask */
-   OUT_BATCH(0); /* constant interp bitmask */
+   OUT_BATCH(dw17); /* constant interp bitmask */
    OUT_BATCH(0); /* wrapshortest enables 0-7 */
    OUT_BATCH(0); /* wrapshortest enables 8-15 */
    ADVANCE_BATCH();
@@ -205,7 +281,8 @@ const struct brw_tracked_state gen6_sf_state = {
 		_NEW_BUFFERS |
 		_NEW_POINT |
 		_NEW_TRANSFORM),
-      .brw   = BRW_NEW_CONTEXT,
+      .brw   = (BRW_NEW_CONTEXT |
+		BRW_NEW_FRAGMENT_PROGRAM),
       .cache = CACHE_NEW_VS_PROG
    },
    .emit = upload_sf_state,
diff --git a/src/mesa/drivers/dri/i965/gen6_urb.c b/src/mesa/drivers/dri/i965/gen6_urb.c
index a341234..fc46c4c 100644
--- a/src/mesa/drivers/dri/i965/gen6_urb.c
+++ b/src/mesa/drivers/dri/i965/gen6_urb.c
@@ -60,7 +60,7 @@ upload_urb(struct brw_context *brw)
    assert(!brw->gs.prog_bo || brw->urb.vs_size < 5);
 
    BEGIN_BATCH(3);
-   OUT_BATCH(CMD_URB << 16 | (3 - 2));
+   OUT_BATCH(_3DSTATE_URB << 16 | (3 - 2));
    OUT_BATCH(((brw->urb.vs_size - 1) << GEN6_URB_VS_SIZE_SHIFT) |
 	     ((brw->urb.nr_vs_entries) << GEN6_URB_VS_ENTRIES_SHIFT));
    OUT_BATCH(((brw->urb.vs_size - 1) << GEN6_URB_GS_SIZE_SHIFT) |
@@ -72,7 +72,7 @@ const struct brw_tracked_state gen6_urb = {
    .dirty = {
       .mesa = 0,
       .brw = BRW_NEW_CONTEXT,
-      .cache = CACHE_NEW_VS_PROG,
+      .cache = (CACHE_NEW_VS_PROG | CACHE_NEW_GS_PROG),
    },
    .prepare = prepare_urb,
    .emit = upload_urb,
diff --git a/src/mesa/drivers/dri/i965/gen6_viewport_state.c b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
index d691bbe..cd7d209 100644
--- a/src/mesa/drivers/dri/i965/gen6_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
@@ -117,7 +117,7 @@ static void upload_viewport_state_pointers(struct brw_context *brw)
    struct intel_context *intel = &brw->intel;
 
    BEGIN_BATCH(4);
-   OUT_BATCH(CMD_VIEWPORT_STATE_POINTERS << 16 | (4 - 2) |
+   OUT_BATCH(_3DSTATE_VIEWPORT_STATE_POINTERS << 16 | (4 - 2) |
 	     GEN6_CC_VIEWPORT_MODIFY |
 	     GEN6_SF_VIEWPORT_MODIFY |
 	     GEN6_CLIP_VIEWPORT_MODIFY);
diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c
index e94d0c0..e68c0ac 100644
--- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
@@ -47,14 +47,14 @@ upload_vs_state(struct brw_context *brw)
    if (brw->vs.prog_data->nr_params == 0 && !ctx->Transform.ClipPlanesEnabled) {
       /* Disable the push constant buffers. */
       BEGIN_BATCH(5);
-      OUT_BATCH(CMD_3D_CONSTANT_VS_STATE << 16 | (5 - 2));
+      OUT_BATCH(_3DSTATE_CONSTANT_VS << 16 | (5 - 2));
       OUT_BATCH(0);
       OUT_BATCH(0);
       OUT_BATCH(0);
       OUT_BATCH(0);
       ADVANCE_BATCH();
    } else {
-      int params_uploaded = 0;
+      int params_uploaded = 0, param_regs;
       float *param;
 
       if (brw->vertex_program->IsNVProgram)
@@ -88,20 +88,11 @@ upload_vs_state(struct brw_context *brw)
 	 params_uploaded++;
       }
 
-      if (vp->use_const_buffer) {
-	 for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) {
-	    if (brw->vs.constant_map[i] != -1) {
-	       memcpy(param + brw->vs.constant_map[i] * 4,
-		      vp->program.Base.Parameters->ParameterValues[i],
-		      4 * sizeof(float));
-	       params_uploaded++;
-	    }
-	 }
-      } else {
-	 for (i = 0; i < nr_params; i++) {
-	    memcpy(param, vp->program.Base.Parameters->ParameterValues[i],
+      for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) {
+	 if (brw->vs.constant_map[i] != -1) {
+	    memcpy(param + brw->vs.constant_map[i] * 4,
+		   vp->program.Base.Parameters->ParameterValues[i],
 		   4 * sizeof(float));
-	    param += 4;
 	    params_uploaded++;
 	 }
       }
@@ -117,13 +108,16 @@ upload_vs_state(struct brw_context *brw)
 
       drm_intel_gem_bo_unmap_gtt(constant_bo);
 
+      param_regs = (params_uploaded + 1) / 2;
+      assert(param_regs <= 32);
+
       BEGIN_BATCH(5);
-      OUT_BATCH(CMD_3D_CONSTANT_VS_STATE << 16 |
+      OUT_BATCH(_3DSTATE_CONSTANT_VS << 16 |
 		GEN6_CONSTANT_BUFFER_0_ENABLE |
 		(5 - 2));
       OUT_RELOC(constant_bo,
 		I915_GEM_DOMAIN_RENDER, 0, /* XXX: bad domain */
-		ALIGN(params_uploaded, 2) / 2 - 1);
+		param_regs - 1);
       OUT_BATCH(0);
       OUT_BATCH(0);
       OUT_BATCH(0);
@@ -133,9 +127,10 @@ upload_vs_state(struct brw_context *brw)
    }
 
    BEGIN_BATCH(6);
-   OUT_BATCH(CMD_3D_VS_STATE << 16 | (6 - 2));
+   OUT_BATCH(_3DSTATE_VS << 16 | (6 - 2));
    OUT_RELOC(brw->vs.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
    OUT_BATCH((0 << GEN6_VS_SAMPLER_COUNT_SHIFT) |
+	     GEN6_VS_FLOATING_POINT_MODE_ALT |
 	     (brw->vs.nr_surfaces << GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
    OUT_BATCH(0); /* scratch space base offset */
    OUT_BATCH((1 << GEN6_VS_DISPATCH_START_GRF_SHIFT) |
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index ea5418b..78901ec 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -66,6 +66,21 @@ prepare_wm_constants(struct brw_context *brw)
 	 constants[i] = convert_param(brw->wm.prog_data->param_convert[i],
 				      *brw->wm.prog_data->param[i]);
       }
+
+      if (0) {
+	 printf("WM constants:\n");
+	 for (i = 0; i < brw->wm.prog_data->nr_params; i++) {
+	    if ((i & 7) == 0)
+	       printf("g%d: ", brw->wm.prog_data->first_curbe_grf + i / 8);
+	    printf("%8f ", constants[i]);
+	    if ((i & 7) == 7)
+	       printf("\n");
+	 }
+	 if ((i & 7) != 0)
+	    printf("\n");
+	 printf("\n");
+      }
+
       drm_intel_gem_bo_unmap_gtt(brw->wm.push_const_bo);
    }
 }
@@ -88,10 +103,11 @@ upload_wm_state(struct brw_context *brw)
       brw_fragment_program_const(brw->fragment_program);
    uint32_t dw2, dw4, dw5, dw6;
 
+   /* CACHE_NEW_WM_PROG */
    if (brw->wm.prog_data->nr_params == 0) {
       /* Disable the push constant buffers. */
       BEGIN_BATCH(5);
-      OUT_BATCH(CMD_3D_CONSTANT_PS_STATE << 16 | (5 - 2));
+      OUT_BATCH(_3DSTATE_CONSTANT_PS << 16 | (5 - 2));
       OUT_BATCH(0);
       OUT_BATCH(0);
       OUT_BATCH(0);
@@ -99,12 +115,13 @@ upload_wm_state(struct brw_context *brw)
       ADVANCE_BATCH();
    } else {
       BEGIN_BATCH(5);
-      OUT_BATCH(CMD_3D_CONSTANT_PS_STATE << 16 |
+      OUT_BATCH(_3DSTATE_CONSTANT_PS << 16 |
 		GEN6_CONSTANT_BUFFER_0_ENABLE |
 		(5 - 2));
       OUT_RELOC(brw->wm.push_const_bo,
 		I915_GEM_DOMAIN_RENDER, 0, /* XXX: bad domain */
-		ALIGN(brw->wm.prog_data->nr_params, 8) / 8 - 1);
+		ALIGN(brw->wm.prog_data->nr_params,
+		      brw->wm.prog_data->dispatch_width) / 8 - 1);
       OUT_BATCH(0);
       OUT_BATCH(0);
       OUT_BATCH(0);
@@ -116,6 +133,9 @@ upload_wm_state(struct brw_context *brw)
    dw5 |= GEN6_WM_LINE_AA_WIDTH_1_0;
    dw5 |= GEN6_WM_LINE_END_CAP_AA_WIDTH_0_5;
 
+   /* OpenGL uses the ALT (non-IEEE) floating point mode */
+   dw2 |= GEN6_WM_FLOATING_POINT_MODE_ALT;
+
    /* BRW_NEW_NR_WM_SURFACES */
    dw2 |= brw->wm.nr_surfaces << GEN6_WM_BINDING_TABLE_ENTRY_COUNT_SHIFT;
 
@@ -126,8 +146,8 @@ upload_wm_state(struct brw_context *brw)
 
    dw5 |= (40 - 1) << GEN6_WM_MAX_THREADS_SHIFT;
 
-   /* BRW_NEW_FRAGMENT_PROGRAM */
-   if (fp->isGLSL)
+   /* CACHE_NEW_WM_PROG */
+   if (brw->wm.prog_data->dispatch_width == 8)
       dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
    else
       dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
@@ -161,7 +181,7 @@ upload_wm_state(struct brw_context *brw)
       GEN6_WM_NUM_SF_OUTPUTS_SHIFT;
 
    BEGIN_BATCH(9);
-   OUT_BATCH(CMD_3D_WM_STATE << 16 | (9 - 2));
+   OUT_BATCH(_3DSTATE_WM << 16 | (9 - 2));
    OUT_RELOC(brw->wm.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
    OUT_BATCH(dw2);
    OUT_BATCH(0); /* scratch space base offset */
@@ -176,13 +196,14 @@ upload_wm_state(struct brw_context *brw)
 const struct brw_tracked_state gen6_wm_state = {
    .dirty = {
       .mesa  = (_NEW_LINE | _NEW_POLYGONSTIPPLE | _NEW_COLOR | _NEW_BUFFERS |
-		_NEW_PROGRAM_CONSTANTS),
+		_NEW_PROGRAM_CONSTANTS | _NEW_POLYGON),
       .brw   = (BRW_NEW_CURBE_OFFSETS |
 		BRW_NEW_FRAGMENT_PROGRAM |
                 BRW_NEW_NR_WM_SURFACES |
 		BRW_NEW_URB_FENCE |
 		BRW_NEW_BATCH),
-      .cache = CACHE_NEW_SAMPLER
+      .cache = (CACHE_NEW_SAMPLER |
+		CACHE_NEW_WM_PROG)
    },
    .emit = upload_wm_state,
 };