69 files changed, 879 insertions, 435 deletions
diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
index c4516ba..dd48956 100644
--- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
+++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
@@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63728 bytes, from 2015-08-05 18:07:28)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  67120 bytes, from 2015-08-14 23:22:03)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63915 bytes, from 2015-08-24 16:56:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
index 8e8cf6a..441bfec 100644
--- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
+++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
@@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63728 bytes, from 2015-08-05 18:07:28)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  67120 bytes, from 2015-08-14 23:22:03)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63915 bytes, from 2015-08-24 16:56:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
@@ -326,6 +326,13 @@ enum a3xx_tex_type {
 	A3XX_TEX_3D = 3,
 };
 
+enum a3xx_tex_msaa {
+	A3XX_TPL1_MSAA1X = 0,
+	A3XX_TPL1_MSAA2X = 1,
+	A3XX_TPL1_MSAA4X = 2,
+	A3XX_TPL1_MSAA8X = 3,
+};
+
 #define A3XX_INT0_RBBM_GPU_IDLE					0x00000001
 #define A3XX_INT0_RBBM_AHB_ERROR				0x00000002
 #define A3XX_INT0_RBBM_REG_TIMEOUT				0x00000004
@@ -2652,6 +2659,7 @@ static inline uint32_t A3XX_VGT_DRAW_INITIATOR_NUM_INSTANCES(uint32_t val)
 #define REG_A3XX_VGT_IMMED_DATA					0x000021fd
 
 #define REG_A3XX_TEX_SAMP_0					0x00000000
+#define A3XX_TEX_SAMP_0_CLAMPENABLE				0x00000001
 #define A3XX_TEX_SAMP_0_MIPFILTER_LINEAR			0x00000002
 #define A3XX_TEX_SAMP_0_XY_MAG__MASK				0x0000000c
 #define A3XX_TEX_SAMP_0_XY_MAG__SHIFT				2
@@ -2695,6 +2703,7 @@ static inline uint32_t A3XX_TEX_SAMP_0_COMPARE_FUNC(enum adreno_compare_func val
 {
 	return ((val) << A3XX_TEX_SAMP_0_COMPARE_FUNC__SHIFT) & A3XX_TEX_SAMP_0_COMPARE_FUNC__MASK;
 }
+#define A3XX_TEX_SAMP_0_CUBEMAPSEAMLESSFILTOFF			0x01000000
 #define A3XX_TEX_SAMP_0_UNNORM_COORDS				0x80000000
 
 #define REG_A3XX_TEX_SAMP_1					0x00000001
@@ -2750,6 +2759,12 @@ static inline uint32_t A3XX_TEX_CONST_0_MIPLVLS(uint32_t val)
 {
 	return ((val) << A3XX_TEX_CONST_0_MIPLVLS__SHIFT) & A3XX_TEX_CONST_0_MIPLVLS__MASK;
 }
+#define A3XX_TEX_CONST_0_MSAATEX__MASK				0x00300000
+#define A3XX_TEX_CONST_0_MSAATEX__SHIFT				20
+static inline uint32_t A3XX_TEX_CONST_0_MSAATEX(enum a3xx_tex_msaa val)
+{
+	return ((val) << A3XX_TEX_CONST_0_MSAATEX__SHIFT) & A3XX_TEX_CONST_0_MSAATEX__MASK;
+}
 #define A3XX_TEX_CONST_0_FMT__MASK				0x1fc00000
 #define A3XX_TEX_CONST_0_FMT__SHIFT				22
 static inline uint32_t A3XX_TEX_CONST_0_FMT(enum a3xx_tex_fmt val)
@@ -2785,7 +2800,7 @@ static inline uint32_t A3XX_TEX_CONST_1_FETCHSIZE(enum a3xx_tex_fetchsize val)
 }
 
 #define REG_A3XX_TEX_CONST_2					0x00000002
-#define A3XX_TEX_CONST_2_INDX__MASK				0x000000ff
+#define A3XX_TEX_CONST_2_INDX__MASK				0x000001ff
 #define A3XX_TEX_CONST_2_INDX__SHIFT				0
 static inline uint32_t A3XX_TEX_CONST_2_INDX(uint32_t val)
 {
@@ -2805,7 +2820,7 @@ static inline uint32_t A3XX_TEX_CONST_2_SWAP(enum a3xx_color_swap val)
 }
 
 #define REG_A3XX_TEX_CONST_3					0x00000003
-#define A3XX_TEX_CONST_3_LAYERSZ1__MASK				0x00007fff
+#define A3XX_TEX_CONST_3_LAYERSZ1__MASK				0x0001ffff
 #define A3XX_TEX_CONST_3_LAYERSZ1__SHIFT			0
 static inline uint32_t A3XX_TEX_CONST_3_LAYERSZ1(uint32_t val)
 {
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_format.c b/src/gallium/drivers/freedreno/a3xx/fd3_format.c
index ec87aa9..04cb9b9 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_format.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_format.c
@@ -262,6 +262,15 @@ static struct fd3_format formats[PIPE_FORMAT_COUNT] = {
 	_T(ETC2_R11_SNORM, ETC2_R11_SNORM, NONE, WZYX),
 	_T(ETC2_RG11_UNORM, ETC2_RG11_UNORM, NONE, WZYX),
 	_T(ETC2_RG11_SNORM, ETC2_RG11_SNORM, NONE, WZYX),
+
+	_T(DXT1_RGB,   DXT1, NONE, WZYX),
+	_T(DXT1_SRGB,  DXT1, NONE, WZYX),
+	_T(DXT1_RGBA,  DXT1, NONE, WZYX),
+	_T(DXT1_SRGBA, DXT1, NONE, WZYX),
+	_T(DXT3_RGBA,  DXT3, NONE, WZYX),
+	_T(DXT3_SRGBA, DXT3, NONE, WZYX),
+	_T(DXT5_RGBA,  DXT5, NONE, WZYX),
+	_T(DXT5_SRGBA, DXT5, NONE, WZYX),
 };
 
 enum a3xx_vtx_fmt
@@ -301,7 +310,7 @@ fd3_pipe2fetchsize(enum pipe_format format)
 {
 	if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
 		format = PIPE_FORMAT_Z32_FLOAT;
-	switch (util_format_get_blocksizebits(format)) {
+	switch (util_format_get_blocksizebits(format) / util_format_get_blockwidth(format)) {
 	case 8: return TFETCH_1_BYTE;
 	case 16: return TFETCH_2_BYTE;
 	case 32: return TFETCH_4_BYTE;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c
index 9c16804..583caaa 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c
@@ -73,7 +73,7 @@ fd3_rasterizer_state_create(struct pipe_context *pctx,
 	so->gras_su_poly_offset_scale =
 			A3XX_GRAS_SU_POLY_OFFSET_SCALE_VAL(cso->offset_scale);
 	so->gras_su_poly_offset_offset =
-			A3XX_GRAS_SU_POLY_OFFSET_OFFSET(cso->offset_units);
+			A3XX_GRAS_SU_POLY_OFFSET_OFFSET(cso->offset_units * 2.0f);
 
 	so->gras_su_mode_control =
 			A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(cso->line_width/2.0);
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
index c30658d..2d6ecb2 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
@@ -115,6 +115,7 @@ fd3_sampler_state_create(struct pipe_context *pctx,
 
 	so->texsamp0 =
 			COND(!cso->normalized_coords, A3XX_TEX_SAMP_0_UNNORM_COORDS) |
+			COND(!cso->seamless_cube_map, A3XX_TEX_SAMP_0_CUBEMAPSEAMLESSFILTOFF) |
 			COND(miplinear, A3XX_TEX_SAMP_0_MIPFILTER_LINEAR) |
 			A3XX_TEX_SAMP_0_XY_MAG(tex_filter(cso->mag_img_filter, aniso)) |
 			A3XX_TEX_SAMP_0_XY_MIN(tex_filter(cso->min_img_filter, aniso)) |
@@ -239,7 +240,7 @@ fd3_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
 			A3XX_TEX_CONST_1_HEIGHT(u_minify(prsc->height0, lvl));
 	/* when emitted, A3XX_TEX_CONST_2_INDX() must be OR'd in: */
 	so->texconst2 =
-			A3XX_TEX_CONST_2_PITCH(rsc->slices[lvl].pitch * rsc->cpp);
+			A3XX_TEX_CONST_2_PITCH(util_format_get_nblocksx(cso->format, rsc->slices[lvl].pitch) * rsc->cpp);
 	switch (prsc->target) {
 	case PIPE_TEXTURE_1D_ARRAY:
 	case PIPE_TEXTURE_2D_ARRAY:
diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
index 563f70a..2e1d712 100644
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63728 bytes, from 2015-08-05 18:07:28)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  67120 bytes, from 2015-08-14 23:22:03)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63915 bytes, from 2015-08-24 16:56:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
@@ -162,10 +162,13 @@ enum a4xx_tex_fmt {
 	TFMT4_8_UNORM = 4,
 	TFMT4_8_8_UNORM = 14,
 	TFMT4_8_8_8_8_UNORM = 28,
+	TFMT4_8_SNORM = 5,
 	TFMT4_8_8_SNORM = 15,
 	TFMT4_8_8_8_8_SNORM = 29,
+	TFMT4_8_UINT = 6,
 	TFMT4_8_8_UINT = 16,
 	TFMT4_8_8_8_8_UINT = 30,
+	TFMT4_8_SINT = 7,
 	TFMT4_8_8_SINT = 17,
 	TFMT4_8_8_8_8_SINT = 31,
 	TFMT4_16_UINT = 21,
@@ -430,7 +433,7 @@ static inline uint32_t A4XX_RB_MRT_BUF_INFO_COLOR_SWAP(enum a3xx_color_swap val)
 	return ((val) << A4XX_RB_MRT_BUF_INFO_COLOR_SWAP__SHIFT) & A4XX_RB_MRT_BUF_INFO_COLOR_SWAP__MASK;
 }
 #define A4XX_RB_MRT_BUF_INFO_COLOR_SRGB				0x00002000
-#define A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH__MASK		0x007fc000
+#define A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH__MASK		0xffffc000
 #define A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH__SHIFT		14
 static inline uint32_t A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(uint32_t val)
 {
@@ -440,7 +443,7 @@ static inline uint32_t A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(uint32_t val)
 static inline uint32_t REG_A4XX_RB_MRT_BASE(uint32_t i0) { return 0x000020a6 + 0x5*i0; }
 
 static inline uint32_t REG_A4XX_RB_MRT_CONTROL3(uint32_t i0) { return 0x000020a7 + 0x5*i0; }
-#define A4XX_RB_MRT_CONTROL3_STRIDE__MASK			0x0001fff8
+#define A4XX_RB_MRT_CONTROL3_STRIDE__MASK			0x03fffff8
 #define A4XX_RB_MRT_CONTROL3_STRIDE__SHIFT			3
 static inline uint32_t A4XX_RB_MRT_CONTROL3_STRIDE(uint32_t val)
 {
@@ -1460,6 +1463,7 @@ static inline uint32_t A4XX_SP_FS_MRT_REG_MRTFORMAT(enum a4xx_color_fmt val)
 {
 	return ((val) << A4XX_SP_FS_MRT_REG_MRTFORMAT__SHIFT) & A4XX_SP_FS_MRT_REG_MRTFORMAT__MASK;
 }
+#define A4XX_SP_FS_MRT_REG_COLOR_SRGB				0x00040000
 
 #define REG_A4XX_SP_CS_CTRL_REG0				0x00002300
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
index ab7850e..3a1d4b6 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
@@ -56,6 +56,7 @@ struct fd4_emit {
 	uint32_t sprite_coord_enable;  /* bitmask */
 	bool sprite_coord_mode;
 	bool rasterflat;
+	bool no_decode_srgb;
 
 	/* cached to avoid repeated lookups of same variants: */
 	struct ir3_shader_variant *vp, *fp;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_format.c b/src/gallium/drivers/freedreno/a4xx/fd4_format.c
index 3e00454..6c9e217 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_format.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_format.c
@@ -79,9 +79,9 @@ struct fd4_format {
 static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
 	/* 8-bit */
 	VT(R8_UNORM,   8_UNORM, R8_UNORM, WZYX),
-	V_(R8_SNORM,   8_SNORM, NONE,     WZYX),
-	V_(R8_UINT,    8_UINT,  NONE,     WZYX),
-	V_(R8_SINT,    8_SINT,  NONE,     WZYX),
+	VT(R8_SNORM,   8_SNORM, NONE,     WZYX),
+	VT(R8_UINT,    8_UINT,  NONE,     WZYX),
+	VT(R8_SINT,    8_SINT,  NONE,     WZYX),
 	V_(R8_USCALED, 8_UINT,  NONE,     WZYX),
 	V_(R8_SSCALED, 8_UINT,  NONE,     WZYX),
 
@@ -115,8 +115,8 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
 
 	VT(R8G8_UNORM,   8_8_UNORM, R8G8_UNORM, WZYX),
 	VT(R8G8_SNORM,   8_8_SNORM, R8G8_SNORM, WZYX),
-	VT(R8G8_UINT,    8_8_UINT,  NONE,       WZYX),
-	VT(R8G8_SINT,    8_8_SINT,  NONE,       WZYX),
+	VT(R8G8_UINT,    8_8_UINT,  R8G8_UINT,  WZYX),
+	VT(R8G8_SINT,    8_8_SINT,  R8G8_SINT,  WZYX),
 	V_(R8G8_USCALED, 8_8_UINT,  NONE,       WZYX),
 	V_(R8G8_SSCALED, 8_8_SINT,  NONE,       WZYX),
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
index 81c37f7..3f8bbf3 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
@@ -46,7 +46,8 @@
 
 static void
 emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
-		struct pipe_surface **bufs, uint32_t *bases, uint32_t bin_w)
+		struct pipe_surface **bufs, uint32_t *bases,
+		uint32_t bin_w, bool decode_srgb)
 {
 	enum a4xx_tile_mode tile_mode;
 	unsigned i;
@@ -60,6 +61,7 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 	for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
 		enum a4xx_color_fmt format = 0;
 		enum a3xx_color_swap swap = WZYX;
+		bool srgb = false;
 		struct fd_resource *rsc = NULL;
 		struct fd_resource_slice *slice = NULL;
 		uint32_t stride = 0;
@@ -68,10 +70,9 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 
 		if ((i < nr_bufs) && bufs[i]) {
 			struct pipe_surface *psurf = bufs[i];
-			enum pipe_format pformat = 0;
+			enum pipe_format pformat = psurf->format;
 
 			rsc = fd_resource(psurf->texture);
-			pformat = psurf->format;
 
 			/* In case we're drawing to Z32F_S8, the "color" actually goes to
 			 * the stencil
@@ -86,6 +87,11 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 			format = fd4_pipe2color(pformat);
 			swap = fd4_pipe2swap(pformat);
 
+			if (decode_srgb)
+				srgb = util_format_is_srgb(pformat);
+			else
+				pformat = util_format_linear(pformat);
+
 			debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer);
 
 			offset = fd_resource_offset(rsc, psurf->u.tex.level,
@@ -108,7 +114,8 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 		OUT_RING(ring, A4XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format) |
 				A4XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) |
 				A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(stride) |
-				A4XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap));
+				A4XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap) |
+				COND(srgb, A4XX_RB_MRT_BUF_INFO_COLOR_SRGB));
 		if (bin_w || (i >= nr_bufs) || !bufs[i]) {
 			OUT_RING(ring, base);
 			OUT_RING(ring, A4XX_RB_MRT_CONTROL3_STRIDE(stride));
@@ -282,7 +289,7 @@ emit_mem2gmem_surf(struct fd_context *ctx, uint32_t *bases,
 	struct fd_ringbuffer *ring = ctx->ring;
 	struct pipe_surface *zsbufs[2];
 
-	emit_mrt(ring, nr_bufs, bufs, bases, bin_w);
+	emit_mrt(ring, nr_bufs, bufs, bases, bin_w, false);
 
 	if (bufs[0] && (bufs[0]->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)) {
 		/* The gmem_restore_tex logic will put the first buffer's stencil
@@ -315,6 +322,7 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 			.key = {
 				.half_precision = fd_half_precision(pfb),
 			},
+			.no_decode_srgb = true,
 	};
 	unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0};
 	float x0, y0, x1, y1;
@@ -520,7 +528,7 @@ fd4_emit_sysmem_prep(struct fd_context *ctx)
 	OUT_RING(ring, A4XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) |
 			A4XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height));
 
-	emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0);
+	emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0, true);
 
 	/* setup scissor/offset for current tile: */
 	OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1);
@@ -677,7 +685,7 @@ fd4_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile)
 	OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1));
 	OUT_RING(ring, CP_SET_BIN_2_X2(x2) | CP_SET_BIN_2_Y2(y2));
 
-	emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem->cbuf_base, gmem->bin_w);
+	emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem->cbuf_base, gmem->bin_w, true);
 
 	/* setup scissor/offset for current tile: */
 	OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1);
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
index 1a6d014..a3d7123 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
@@ -450,10 +450,15 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
 	OUT_PKT0(ring, REG_A4XX_SP_FS_MRT_REG(0), 8);
 	for (i = 0; i < 8; i++) {
 		enum a4xx_color_fmt format = 0;
-		if (i < nr)
+		bool srgb = false;
+		if (i < nr) {
 			format = fd4_emit_format(bufs[i]);
+			if (bufs[i] && !emit->no_decode_srgb)
+				srgb = util_format_is_srgb(bufs[i]->format);
+		}
 		OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(color_regid[i]) |
 				A4XX_SP_FS_MRT_REG_MRTFORMAT(format) |
+				COND(srgb, A4XX_SP_FS_MRT_REG_COLOR_SRGB) |
 				COND(emit->key.half_precision,
 					A4XX_SP_FS_MRT_REG_HALF_PRECISION));
 	}
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
index d2bc5fe..213b29c 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
@@ -187,9 +187,9 @@ fd4_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
 			A4XX_TEX_CONST_3_LAYERSZ(rsc->layer_size);
 		break;
 	case PIPE_TEXTURE_CUBE:
-	case PIPE_TEXTURE_CUBE_ARRAY:  /* ?? not sure about _CUBE_ARRAY */
+	case PIPE_TEXTURE_CUBE_ARRAY:
 		so->texconst3 =
-			A4XX_TEX_CONST_3_DEPTH(1) |
+			A4XX_TEX_CONST_3_DEPTH(prsc->array_size / 6) |
 			A4XX_TEX_CONST_3_LAYERSZ(rsc->layer_size);
 		break;
 	case PIPE_TEXTURE_3D:
diff --git a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h
index 00b6acb..29944b7 100644
--- a/src/gallium/drivers/freedreno/adreno_common.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_common.xml.h
@@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63728 bytes, from 2015-08-05 18:07:28)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  67120 bytes, from 2015-08-14 23:22:03)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63915 bytes, from 2015-08-24 16:56:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
index 98a90e2..432dce3 100644
--- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
@@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63728 bytes, from 2015-08-05 18:07:28)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  67120 bytes, from 2015-08-14 23:22:03)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63915 bytes, from 2015-08-24 16:56:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c
index 8e6d431..0b6b9fb 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.c
+++ b/src/gallium/drivers/freedreno/freedreno_context.c
@@ -131,11 +131,13 @@ static void
 fd_context_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence,
 		unsigned flags)
 {
+	struct fd_ringbuffer *ring = fd_context(pctx)->ring;
+
 	fd_context_render(pctx);
 
 	if (fence) {
 		fd_screen_fence_ref(pctx->screen, fence, NULL);
-		*fence = fd_fence_create(pctx);
+		*fence = fd_fence_create(pctx, fd_ringbuffer_timestamp(ring));
 	}
 }
 
diff --git a/src/gallium/drivers/freedreno/freedreno_fence.c b/src/gallium/drivers/freedreno/freedreno_fence.c
index 04a9fea..5125f09 100644
--- a/src/gallium/drivers/freedreno/freedreno_fence.c
+++ b/src/gallium/drivers/freedreno/freedreno_fence.c
@@ -50,35 +50,18 @@ fd_screen_fence_ref(struct pipe_screen *pscreen,
 	*ptr = pfence;
 }
 
-/* TODO we need to spiff out libdrm_freedreno a bit to allow passing
- * the timeout.. and maybe a better way to check if fence has been
- * signaled.  The current implementation is a bit lame for now to
- * avoid bumping libdrm version requirement.
- */
-
-boolean fd_screen_fence_signalled(struct pipe_screen *screen,
-		struct pipe_fence_handle *fence)
-{
-	uint32_t timestamp = fd_ringbuffer_timestamp(fence->ctx->ring);
-
-	/* TODO util helper for compare w/ rollover? */
-	return timestamp >= fence->timestamp;
-}
-
 boolean fd_screen_fence_finish(struct pipe_screen *screen,
 		struct pipe_fence_handle *fence,
 		uint64_t timeout)
 {
-	if (!timeout)
-		return fd_screen_fence_signalled(screen, fence);
-
-	if (fd_pipe_wait(fence->screen->pipe, fence->timestamp))
+	if (fd_pipe_wait_timeout(fence->screen->pipe, fence->timestamp, timeout))
 		return false;
 
 	return true;
 }
 
-struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx)
+struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx,
+		uint32_t timestamp)
 {
 	struct pipe_fence_handle *fence;
 	struct fd_context *ctx = fd_context(pctx);
@@ -91,7 +74,7 @@ struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx)
 
 	fence->ctx = ctx;
 	fence->screen = ctx->screen;
-	fence->timestamp = fd_ringbuffer_timestamp(ctx->ring);
+	fence->timestamp = timestamp;
 
 	return fence;
 }
diff --git a/src/gallium/drivers/freedreno/freedreno_fence.h b/src/gallium/drivers/freedreno/freedreno_fence.h
index e36bcc4..06c314a 100644
--- a/src/gallium/drivers/freedreno/freedreno_fence.h
+++ b/src/gallium/drivers/freedreno/freedreno_fence.h
@@ -34,11 +34,10 @@
 void fd_screen_fence_ref(struct pipe_screen *pscreen,
 		struct pipe_fence_handle **ptr,
 		struct pipe_fence_handle *pfence);
-boolean fd_screen_fence_signalled(struct pipe_screen *screen,
-		struct pipe_fence_handle *pfence);
 boolean fd_screen_fence_finish(struct pipe_screen *screen,
 		struct pipe_fence_handle *pfence,
 		uint64_t timeout);
-struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx);
+struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx,
+		uint32_t timestamp);
 
 #endif /* FREEDRENO_FENCE_H_ */
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c
index 709ad4e..98de096 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.c
+++ b/src/gallium/drivers/freedreno/freedreno_resource.c
@@ -222,7 +222,7 @@ fd_resource_transfer_map(struct pipe_context *pctx,
 	ptrans->level = level;
 	ptrans->usage = usage;
 	ptrans->box = *box;
-	ptrans->stride = slice->pitch * rsc->cpp;
+	ptrans->stride = util_format_get_nblocksx(format, slice->pitch) * rsc->cpp;
 	ptrans->layer_stride = slice->size0;
 
 	if (usage & PIPE_TRANSFER_READ)
@@ -375,9 +375,11 @@ setup_slices(struct fd_resource *rsc, uint32_t alignment)
 
 	for (level = 0; level <= prsc->last_level; level++) {
 		struct fd_resource_slice *slice = fd_resource_slice(rsc, level);
+		uint32_t blocks;
 
 		slice->pitch = width = align(width, 32);
 		slice->offset = size;
+		blocks = util_format_get_nblocks(prsc->format, width, height);
 		/* 1d array and 2d array textures must all have the same layer size
 		 * for each miplevel on a3xx. 3d textures can have different layer
 		 * sizes for high levels, but the hw auto-sizer is buggy (or at least
@@ -387,9 +389,9 @@ setup_slices(struct fd_resource *rsc, uint32_t alignment)
 		if (prsc->target == PIPE_TEXTURE_3D && (
 					level == 1 ||
 					(level > 1 && rsc->slices[level - 1].size0 > 0xf000)))
-			slice->size0 = align(slice->pitch * height * rsc->cpp, alignment);
+			slice->size0 = align(blocks * rsc->cpp, alignment);
 		else if (level == 0 || rsc->layer_first || alignment == 1)
-			slice->size0 = align(slice->pitch * height * rsc->cpp, alignment);
+			slice->size0 = align(blocks * rsc->cpp, alignment);
 		else
 			slice->size0 = rsc->slices[level - 1].size0;
 
@@ -459,7 +461,6 @@ fd_resource_create(struct pipe_screen *pscreen,
 	if (is_a4xx(fd_screen(pscreen))) {
 		switch (tmpl->target) {
 		case PIPE_TEXTURE_3D:
-			/* TODO 3D_ARRAY? */
 			rsc->layer_first = false;
 			break;
 		default:
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index b55f5b3..86e9a21 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -163,7 +163,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_MULTISAMPLE:
 	case PIPE_CAP_TEXTURE_BARRIER:
 	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
-	case PIPE_CAP_CUBE_MAP_ARRAY:
 	case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
 	case PIPE_CAP_START_INSTANCE:
 	case PIPE_CAP_COMPUTE:
@@ -176,6 +175,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_INDEP_BLEND_ENABLE:
 	case PIPE_CAP_INDEP_BLEND_FUNC:
 	case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
+	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
 		return is_a3xx(screen) || is_a4xx(screen);
 
 	case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
@@ -191,8 +191,13 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 		return 16383;
 
 	case PIPE_CAP_DEPTH_CLIP_DISABLE:
+	case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
 		return is_a3xx(screen);
 
+	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+	case PIPE_CAP_CUBE_MAP_ARRAY:
+		return is_a4xx(screen);
+
 	case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
 		return 256;
 
@@ -202,7 +207,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 		return is_ir3(screen) ? 130 : 120;
 
 	/* Unsupported features. */
-	case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
 	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
 	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
 	case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
@@ -230,8 +234,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
 	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
-	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
-	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
 	case PIPE_CAP_DEPTH_BOUNDS_TEST:
 		return 0;
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 0ab3345..071901a 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1636,6 +1636,11 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 			coord[i] = ir3_SHL_B(b, coord[i], 0, lod, 0);
 	}
 
+	/* the array coord for cube arrays needs 0.5 added to it */
+	if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE && tex->is_array &&
+		opc != OPC_ISAML)
+		coord[3] = ir3_ADD_F(b, coord[3], 0, create_immed(b, fui(0.5)), 0);
+
 	/*
 	 * lay out the first argument in the proper order:
 	 *  - actual coordinates first
@@ -1759,6 +1764,12 @@ emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex)
 
 	tex_info(tex, &flags, &coords);
 
+	/* Actually we want the number of dimensions, not coordinates. This
+	 * distinction only matters for cubes.
+	 */
+	if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
+		coords = 2;
+
 	dst = get_dst(ctx, &tex->dest, 4);
 
 	compile_assert(ctx, tex->num_srcs == 1);
@@ -2301,7 +2312,7 @@ emit_instructions(struct ir3_compile *ctx)
 	ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs);
 
 	/* Create inputs in first block: */
-	ctx->block = get_block(ctx, fxn->start_block);
+	ctx->block = get_block(ctx, nir_start_block(fxn));
 	ctx->in_block = ctx->block;
 	list_addtail(&ctx->block->node, &ctx->ir->block_list);
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
index dc9e462..bed7b7b 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
@@ -29,6 +29,7 @@
 
 #include "ir3_nir.h"
 #include "glsl/nir/nir_builder.h"
+#include "glsl/nir/nir_control_flow.h"
 
 /* Based on nir_opt_peephole_select, and hacked up to more aggressively
  * flatten anything that can be flattened
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index eaf3b3c..8801839 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -189,7 +189,7 @@ ir3_ra_alloc_reg_set(void *memctx)
 	}
 
 	/* allocate the reg-set.. */
-	set->regs = ra_alloc_reg_set(set, ra_reg_count);
+	set->regs = ra_alloc_reg_set(set, ra_reg_count, true);
 	set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
 	set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);
 
diff --git a/src/gallium/drivers/i915/i915_surface.c b/src/gallium/drivers/i915/i915_surface.c
index 24e0156..b2a639c 100644
--- a/src/gallium/drivers/i915/i915_surface.c
+++ b/src/gallium/drivers/i915/i915_surface.c
@@ -120,7 +120,8 @@ i915_surface_copy_render(struct pipe_context *pipe,
 
    util_blitter_blit_generic(i915->blitter, dst_view, &dstbox,
                              src_view, src_box, src_width0, src_height0,
-                             PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL);
+                             PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL,
+                             FALSE);
    return;
 
 fallback:
diff --git a/src/gallium/drivers/nouveau/Makefile.sources b/src/gallium/drivers/nouveau/Makefile.sources
index 3fae3bc..9346ea3 100644
--- a/src/gallium/drivers/nouveau/Makefile.sources
+++ b/src/gallium/drivers/nouveau/Makefile.sources
@@ -121,7 +121,8 @@ NV50_CODEGEN_SOURCES := \
 	codegen/nv50_ir_target_nv50.cpp \
 	codegen/nv50_ir_target_nv50.h \
 	codegen/nv50_ir_util.cpp \
-	codegen/nv50_ir_util.h
+	codegen/nv50_ir_util.h \
+	codegen/unordered_set.h
 
 NVC0_CODEGEN_SOURCES := \
 	codegen/nv50_ir_emit_gk110.cpp \
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index 3ddaeaf..ba1b085 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -29,8 +29,8 @@
 #include <deque>
 #include <list>
 #include <vector>
-#include <tr1/unordered_set>
 
+#include "codegen/unordered_set.h"
 #include "codegen/nv50_ir_util.h"
 #include "codegen/nv50_ir_graph.h"
 
@@ -585,10 +585,10 @@ public:
 
    static inline Value *get(Iterator&);
 
-   std::tr1::unordered_set<ValueRef *> uses;
+   unordered_set<ValueRef *> uses;
    std::list<ValueDef *> defs;
-   typedef std::tr1::unordered_set<ValueRef *>::iterator UseIterator;
-   typedef std::tr1::unordered_set<ValueRef *>::const_iterator UseCIterator;
+   typedef unordered_set<ValueRef *>::iterator UseIterator;
+   typedef unordered_set<ValueRef *>::const_iterator UseCIterator;
    typedef std::list<ValueDef *>::iterator DefIterator;
    typedef std::list<ValueDef *>::const_iterator DefCIterator;
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index f06056f..8f15429 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -933,6 +933,7 @@ CodeEmitterGK110::emitCVT(const Instruction *i)
 
    code[0] |= typeSizeofLog2(dType) << 10;
    code[0] |= typeSizeofLog2(i->sType) << 12;
+   code[1] |= i->subOp << 12;
 
    if (isSignedIntType(dType))
       code[0] |= 0x4000;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index ef5c87d..6e22788 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -818,6 +818,7 @@ CodeEmitterGM107::emitI2F()
    emitField(0x31, 1, (insn->op == OP_ABS) || insn->src(0).mod.abs());
    emitCC   (0x2f);
    emitField(0x2d, 1, (insn->op == OP_NEG) || insn->src(0).mod.neg());
+   emitField(0x29, 2, insn->subOp);
    emitRND  (0x27, rnd, -1);
    emitField(0x0d, 1, isSignedType(insn->sType));
    emitField(0x0a, 2, util_logbase2(typeSizeof(insn->sType)));
@@ -850,6 +851,7 @@ CodeEmitterGM107::emitI2I()
    emitField(0x31, 1, (insn->op == OP_ABS) || insn->src(0).mod.abs());
    emitCC   (0x2f);
    emitField(0x2d, 1, (insn->op == OP_NEG) || insn->src(0).mod.neg());
+   emitField(0x29, 2, insn->subOp);
    emitField(0x0d, 1, isSignedType(insn->sType));
    emitField(0x0c, 1, isSignedType(insn->dType));
    emitField(0x0a, 2, util_logbase2(typeSizeof(insn->sType)));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index f607f3b..6bf5219 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -1020,6 +1020,10 @@ CodeEmitterNVC0::emitCVT(Instruction *i)
       code[0] |= util_logbase2(typeSizeof(dType)) << 20;
       code[0] |= util_logbase2(typeSizeof(i->sType)) << 23;
 
+      // for 8/16 source types, the byte/word is in subOp. word 1 is
+      // represented as 2.
+      code[1] |= i->subOp << 0x17;
+
       if (sat)
          code[0] |= 0x20;
       if (abs)
@@ -2614,11 +2618,12 @@ private:
          int imul; // integer MUL to MUL delay 3
       } res;
       struct ScoreData {
-         int r[64];
+         int r[256];
          int p[8];
          int c;
       } rd, wr;
       int base;
+      int regs;
 
       void rebase(const int base)
       {
@@ -2627,7 +2632,7 @@ private:
             return;
          this->base = 0;
 
-         for (int i = 0; i < 64; ++i) {
+         for (int i = 0; i < regs; ++i) {
             rd.r[i] += delta;
             wr.r[i] += delta;
          }
@@ -2646,16 +2651,17 @@ private:
          res.imul += delta;
          res.tex += delta;
       }
-      void wipe()
+      void wipe(int regs)
       {
          memset(&rd, 0, sizeof(rd));
          memset(&wr, 0, sizeof(wr));
          memset(&res, 0, sizeof(res));
+         this->regs = regs;
       }
       int getLatest(const ScoreData& d) const
       {
          int max = 0;
-         for (int i = 0; i < 64; ++i)
+         for (int i = 0; i < regs; ++i)
             if (d.r[i] > max)
                max = d.r[i];
          for (int i = 0; i < 8; ++i)
@@ -2690,7 +2696,7 @@ private:
       }
       void setMax(const RegScores *that)
       {
-         for (int i = 0; i < 64; ++i) {
+         for (int i = 0; i < regs; ++i) {
             rd.r[i] = MAX2(rd.r[i], that->rd.r[i]);
             wr.r[i] = MAX2(wr.r[i], that->wr.r[i]);
          }
@@ -2711,7 +2717,7 @@ private:
       }
       void print(int cycle)
       {
-         for (int i = 0; i < 64; ++i) {
+         for (int i = 0; i < regs; ++i) {
             if (rd.r[i] > cycle)
                INFO("rd $r%i @ %i\n", i, rd.r[i]);
             if (wr.r[i] > cycle)
@@ -2806,9 +2812,10 @@ SchedDataCalculator::getCycles(const Instruction *insn, int origDelay) const
 bool
 SchedDataCalculator::visit(Function *func)
 {
+   int regs = targ->getFileSize(FILE_GPR) + 1;
    scoreBoards.resize(func->cfg.getSize());
    for (size_t i = 0; i < scoreBoards.size(); ++i)
-      scoreBoards[i].wipe();
+      scoreBoards[i].wipe(regs);
    return true;
 }
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 4847a0f..f153674 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -2990,9 +2990,15 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
    case TGSI_OPCODE_UBFE:
       FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
          src0 = fetchSrc(0, c);
-         src1 = fetchSrc(1, c);
-         src2 = fetchSrc(2, c);
-         mkOp3(OP_INSBF, TYPE_U32, src1, src2, mkImm(0x808), src1);
+         if (tgsi.getSrc(1).getFile() == TGSI_FILE_IMMEDIATE &&
+             tgsi.getSrc(2).getFile() == TGSI_FILE_IMMEDIATE) {
+            src1 = loadImm(NULL, tgsi.getSrc(2).getValueU32(c, info) << 8 |
+                           tgsi.getSrc(1).getValueU32(c, info));
+         } else {
+            src1 = fetchSrc(1, c);
+            src2 = fetchSrc(2, c);
+            mkOp3(OP_INSBF, TYPE_U32, src1, src2, mkImm(0x808), src1);
+         }
          mkOp2(OP_EXTBF, dstTy, dst0[c], src0, src1);
       }
       break;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
index 1f3fce2..420cc4e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
@@ -193,100 +193,16 @@ GM107LoweringPass::visit(Instruction *i)
       checkPredicate(i);
 
    switch (i->op) {
-   case OP_TEX:
-   case OP_TXB:
-   case OP_TXL:
-   case OP_TXF:
-   case OP_TXG:
-      return handleTEX(i->asTex());
-   case OP_TXD:
-      return handleTXD(i->asTex());
-   case OP_TXLQ:
-      return handleTXLQ(i->asTex());
-   case OP_TXQ:
-      return handleTXQ(i->asTex());
-   case OP_EX2:
-      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
-      i->setSrc(0, i->getDef(0));
-      break;
-   case OP_POW:
-      return handlePOW(i);
-   case OP_DIV:
-      return handleDIV(i);
-   case OP_MOD:
-      return handleMOD(i);
-   case OP_SQRT:
-      return handleSQRT(i);
-   case OP_EXPORT:
-      return handleEXPORT(i);
    case OP_PFETCH:
       return handlePFETCH(i);
-   case OP_EMIT:
-   case OP_RESTART:
-      return handleOUT(i);
-   case OP_RDSV:
-      return handleRDSV(i);
-   case OP_WRSV:
-      return handleWRSV(i);
-   case OP_LOAD:
-      if (i->src(0).getFile() == FILE_SHADER_INPUT) {
-         if (prog->getType() == Program::TYPE_COMPUTE) {
-            i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
-            i->getSrc(0)->reg.fileIndex = 0;
-         } else
-         if (prog->getType() == Program::TYPE_GEOMETRY &&
-             i->src(0).isIndirect(0)) {
-            // XXX: this assumes vec4 units
-            Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
-                                    i->getIndirect(0, 0), bld.mkImm(4));
-            i->setIndirect(0, 0, ptr);
-            i->op = OP_VFETCH;
-         } else {
-            i->op = OP_VFETCH;
-            assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
-         }
-      } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
-         if (i->src(0).isIndirect(1)) {
-            Value *ptr;
-            if (i->src(0).isIndirect(0))
-               ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
-                                i->getIndirect(0, 1), bld.mkImm(0x1010),
-                                i->getIndirect(0, 0));
-            else
-               ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
-                                i->getIndirect(0, 1), bld.mkImm(16));
-            i->setIndirect(0, 1, NULL);
-            i->setIndirect(0, 0, ptr);
-            i->subOp = NV50_IR_SUBOP_LDC_IS;
-         }
-      }
-      break;
-   case OP_ATOM:
-   {
-      const bool cctl = i->src(0).getFile() == FILE_MEMORY_GLOBAL;
-      handleATOM(i);
-      handleCasExch(i, cctl);
-   }
-      break;
-   case OP_SULDB:
-   case OP_SULDP:
-   case OP_SUSTB:
-   case OP_SUSTP:
-   case OP_SUREDB:
-   case OP_SUREDP:
-      handleSurfaceOpNVE4(i->asTex());
-      break;
    case OP_DFDX:
    case OP_DFDY:
-      handleDFDX(i);
-      break;
+      return handleDFDX(i);
    case OP_POPCNT:
-      handlePOPCNT(i);
-      break;
+      return handlePOPCNT(i);
    default:
-      break;
+      return NVC0LoweringPass::visit(i);
    }
-   return true;
 }
 
 } // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index c3c302d..b1f4065 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -224,7 +224,7 @@ NVC0LegalizePostRA::findFirstUses(
       const Instruction *texi,
       const Instruction *insn,
       std::list<TexUse> &uses,
-      std::tr1::unordered_set<const Instruction *>& visited)
+      unordered_set<const Instruction *>& visited)
 {
    for (int d = 0; insn->defExists(d); ++d) {
       Value *v = insn->getDef(d);
@@ -323,7 +323,7 @@ NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
    if (!uses)
       return false;
    for (size_t i = 0; i < texes.size(); ++i) {
-      std::tr1::unordered_set<const Instruction *> visited;
+      unordered_set<const Instruction *> visited;
       findFirstUses(texes[i], texes[i], uses[i], visited);
    }
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
index 260e101..2ce52e5 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -20,8 +20,6 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include <tr1/unordered_set>
-
 #include "codegen/nv50_ir.h"
 #include "codegen/nv50_ir_build_util.h"
 
@@ -73,7 +71,7 @@ private:
    inline bool insnDominatedBy(const Instruction *, const Instruction *) const;
    void findFirstUses(const Instruction *tex, const Instruction *def,
                       std::list<TexUse>&,
-                      std::tr1::unordered_set<const Instruction *>&);
+                      unordered_set<const Instruction *>&);
    void findOverwritingDefs(const Instruction *tex, Instruction *insn,
                             const BasicBlock *term,
                             std::list<TexUse>&);
@@ -111,10 +109,11 @@ protected:
 
    void checkPredicate(Instruction *);
 
+   virtual bool visit(Instruction *);
+
 private:
    virtual bool visit(Function *);
    virtual bool visit(BasicBlock *);
-   virtual bool visit(Instruction *);
 
    void readTessCoord(LValue *dst, int c);
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index cea96dc..b01ef41 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -1023,27 +1023,53 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
 
    case OP_AND:
    {
-      CmpInstruction *cmp = i->getSrc(t)->getInsn()->asCmp();
-      if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
-         return;
-      if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
-         return;
-      if (imm0.reg.data.f32 != 1.0)
-         return;
-      if (i->getSrc(t)->getInsn()->dType != TYPE_U32)
-         return;
+      Instruction *src = i->getSrc(t)->getInsn();
+      ImmediateValue imm1;
+      if (imm0.reg.data.u32 == 0) {
+         i->op = OP_MOV;
+         i->setSrc(0, new_ImmediateValue(prog, 0u));
+         i->src(0).mod = Modifier(0);
+         i->setSrc(1, NULL);
+      } else if (imm0.reg.data.u32 == ~0U) {
+         i->op = i->src(t).mod.getOp();
+         if (t) {
+            i->setSrc(0, i->getSrc(t));
+            i->src(0).mod = i->src(t).mod;
+         }
+         i->setSrc(1, NULL);
+      } else if (src->asCmp()) {
+         CmpInstruction *cmp = src->asCmp();
+         if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
+            return;
+         if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
+            return;
+         if (imm0.reg.data.f32 != 1.0)
+            return;
+         if (cmp->dType != TYPE_U32)
+            return;
 
-      i->getSrc(t)->getInsn()->dType = TYPE_F32;
-      if (i->src(t).mod != Modifier(0)) {
-         assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
-         i->src(t).mod = Modifier(0);
-         cmp->setCond = inverseCondCode(cmp->setCond);
-      }
-      i->op = OP_MOV;
-      i->setSrc(s, NULL);
-      if (t) {
-         i->setSrc(0, i->getSrc(t));
-         i->setSrc(t, NULL);
+         cmp->dType = TYPE_F32;
+         if (i->src(t).mod != Modifier(0)) {
+            assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
+            i->src(t).mod = Modifier(0);
+            cmp->setCond = inverseCondCode(cmp->setCond);
+         }
+         i->op = OP_MOV;
+         i->setSrc(s, NULL);
+         if (t) {
+            i->setSrc(0, i->getSrc(t));
+            i->setSrc(t, NULL);
+         }
+      } else if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32) &&
+                 src->op == OP_SHR &&
+                 src->src(1).getImmediate(imm1) &&
+                 i->src(t).mod == Modifier(0) &&
+                 util_is_power_of_two(imm0.reg.data.u32 + 1)) {
+         // low byte = offset, high byte = width
+         uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32;
+         i->op = OP_EXTBF;
+         i->setSrc(0, src->getSrc(0));
+         i->setSrc(1, new_ImmediateValue(prog, ext));
       }
    }
       break;
@@ -1106,6 +1132,84 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
       i->op = OP_MOV;
       break;
    }
+   case OP_CVT: {
+      Storage res;
+
+      // TODO: handle 64-bit values properly
+      if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8)
+         return;
+
+      // TODO: handle single byte/word extractions
+      if (i->subOp)
+         return;
+
+      bld.setPosition(i, true); /* make sure bld is init'ed */
+
+#define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
+   case type: \
+      switch (i->sType) { \
+      case TYPE_F32: \
+         res.data.dst = util_iround(i->saturate ? \
+                                    CLAMP(imm0.reg.data.f32, fmin, fmax) : \
+                                    imm0.reg.data.f32); \
+         break; \
+      case TYPE_S32: \
+         res.data.dst = i->saturate ? \
+                        CLAMP(imm0.reg.data.s32, imin, imax) : \
+                        imm0.reg.data.s32; \
+         break; \
+      case TYPE_U32: \
+         res.data.dst = i->saturate ? \
+                        CLAMP(imm0.reg.data.u32, umin, umax) : \
+                        imm0.reg.data.u32; \
+         break; \
+      case TYPE_S16: \
+         res.data.dst = i->saturate ? \
+                        CLAMP(imm0.reg.data.s16, imin, imax) : \
+                        imm0.reg.data.s16; \
+         break; \
+      case TYPE_U16: \
+         res.data.dst = i->saturate ? \
+                        CLAMP(imm0.reg.data.u16, umin, umax) : \
+                        imm0.reg.data.u16; \
+         break; \
+      default: return; \
+      } \
+      i->setSrc(0, bld.mkImm(res.data.dst)); \
+      break
+
+      switch(i->dType) {
+      CASE(TYPE_U16, u16, 0, UINT16_MAX, 0, UINT16_MAX, 0, UINT16_MAX);
+      CASE(TYPE_S16, s16, INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX, 0, INT16_MAX);
+      CASE(TYPE_U32, u32, 0, UINT32_MAX, 0, INT32_MAX, 0, UINT32_MAX);
+      CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
+      case TYPE_F32:
+         switch (i->sType) {
+         case TYPE_F32:
+            res.data.f32 = i->saturate ?
+               CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
+               imm0.reg.data.f32;
+            break;
+         case TYPE_U16: res.data.f32 = (float) imm0.reg.data.u16; break;
+         case TYPE_U32: res.data.f32 = (float) imm0.reg.data.u32; break;
+         case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break;
+         case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break;
+         default:
+            return;
+         }
+         i->setSrc(0, bld.mkImm(res.data.f32));
+         break;
+      default:
+         return;
+      }
+#undef CASE
+
+      i->setType(i->dType); /* Remove i->sType, which we don't need anymore */
+      i->op = OP_MOV;
+      i->saturate = 0;
+      i->src(0).mod = Modifier(0); /* Clear the already applied modifier */
+      break;
+   }
    default:
       return;
    }
@@ -1212,7 +1316,8 @@ private:
    void handleRCP(Instruction *);
    void handleSLCT(Instruction *);
    void handleLOGOP(Instruction *);
-   void handleCVT(Instruction *);
+   void handleCVT_NEG(Instruction *);
+   void handleCVT_EXTBF(Instruction *);
    void handleSUCLAMP(Instruction *);
 
    BuildUtil bld;
@@ -1463,12 +1568,12 @@ AlgebraicOpt::handleLOGOP(Instruction *logop)
 // nv50:
 //  F2I(NEG(I2F(ABS(SET))))
 void
-AlgebraicOpt::handleCVT(Instruction *cvt)
+AlgebraicOpt::handleCVT_NEG(Instruction *cvt)
 {
+   Instruction *insn = cvt->getSrc(0)->getInsn();
    if (cvt->sType != TYPE_F32 ||
        cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
       return;
-   Instruction *insn = cvt->getSrc(0)->getInsn();
    if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
       return;
    if (insn->src(0).mod != Modifier(0))
@@ -1498,6 +1603,104 @@ AlgebraicOpt::handleCVT(Instruction *cvt)
    delete_Instruction(prog, cvt);
 }
 
+// Some shaders extract packed bytes out of words and convert them to
+// e.g. float. The Fermi+ CVT instruction can extract those directly, as can
+// nv50 for word sizes.
+//
+// CVT(EXTBF(x, byte/word))
+// CVT(AND(bytemask, x))
+// CVT(AND(bytemask, SHR(x, 8/16/24)))
+// CVT(SHR(x, 16/24))
+void
+AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
+{
+   Instruction *insn = cvt->getSrc(0)->getInsn();
+   ImmediateValue imm;
+   Value *arg = NULL;
+   unsigned width, offset;
+   if ((cvt->sType != TYPE_U32 && cvt->sType != TYPE_S32) || !insn)
+      return;
+   if (insn->op == OP_EXTBF && insn->src(1).getImmediate(imm)) {
+      width = (imm.reg.data.u32 >> 8) & 0xff;
+      offset = imm.reg.data.u32 & 0xff;
+      arg = insn->getSrc(0);
+
+      if (width != 8 && width != 16)
+         return;
+      if (width == 8 && offset & 0x7)
+         return;
+      if (width == 16 && offset & 0xf)
+         return;
+   } else if (insn->op == OP_AND) {
+      int s;
+      if (insn->src(0).getImmediate(imm))
+         s = 0;
+      else if (insn->src(1).getImmediate(imm))
+         s = 1;
+      else
+         return;
+
+      if (imm.reg.data.u32 == 0xff)
+         width = 8;
+      else if (imm.reg.data.u32 == 0xffff)
+         width = 16;
+      else
+         return;
+
+      arg = insn->getSrc(!s);
+      Instruction *shift = arg->getInsn();
+      offset = 0;
+      if (shift && shift->op == OP_SHR &&
+          shift->sType == cvt->sType &&
+          shift->src(1).getImmediate(imm) &&
+          ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
+           (width == 16 && (imm.reg.data.u32 & 0xf) == 0))) {
+         arg = shift->getSrc(0);
+         offset = imm.reg.data.u32;
+      }
+   } else if (insn->op == OP_SHR &&
+              insn->sType == cvt->sType &&
+              insn->src(1).getImmediate(imm)) {
+      arg = insn->getSrc(0);
+      if (imm.reg.data.u32 == 24) {
+         width = 8;
+         offset = 24;
+      } else if (imm.reg.data.u32 == 16) {
+         width = 16;
+         offset = 16;
+      } else {
+         return;
+      }
+   }
+
+   if (!arg)
+      return;
+
+   // Irrespective of what came earlier, we can undo a shift on the argument
+   // by adjusting the offset.
+   Instruction *shift = arg->getInsn();
+   if (shift && shift->op == OP_SHL &&
+       shift->src(1).getImmediate(imm) &&
+       ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
+        (width == 16 && (imm.reg.data.u32 & 0xf) == 0)) &&
+       imm.reg.data.u32 <= offset) {
+      arg = shift->getSrc(0);
+      offset -= imm.reg.data.u32;
+   }
+
+   // The unpackSnorm lowering still leaves a few shifts behind, but it's too
+   // annoying to detect them.
+
+   if (width == 8) {
+      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U8 : TYPE_S8;
+   } else {
+      assert(width == 16);
+      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U16 : TYPE_S16;
+   }
+   cvt->setSrc(0, arg);
+   cvt->subOp = offset >> 3;
+}
+
 // SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
 void
 AlgebraicOpt::handleSUCLAMP(Instruction *insn)
@@ -1568,7 +1771,9 @@ AlgebraicOpt::visit(BasicBlock *bb)
          handleLOGOP(i);
          break;
       case OP_CVT:
-         handleCVT(i);
+         handleCVT_NEG(i);
+         if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32))
+             handleCVT_EXTBF(i);
          break;
       case OP_SUCLAMP:
          handleSUCLAMP(i);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index 78bc97f..0cd21cf 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -25,7 +25,6 @@
 
 #include <stack>
 #include <limits>
-#include <tr1/unordered_set>
 
 namespace nv50_ir {
 
@@ -1551,7 +1550,7 @@ SpillCodeInserter::run(const std::list<ValuePair>& lst)
       // Keep track of which instructions to delete later. Deleting them
       // inside the loop is unsafe since a single instruction may have
       // multiple destinations that all need to be spilled (like OP_SPLIT).
-      std::tr1::unordered_set<Instruction *> to_del;
+      unordered_set<Instruction *> to_del;
 
       for (Value::DefIterator d = lval->defs.begin(); d != lval->defs.end();
            ++d) {
@@ -1593,7 +1592,7 @@ SpillCodeInserter::run(const std::list<ValuePair>& lst)
          }
       }
 
-      for (std::tr1::unordered_set<Instruction *>::const_iterator it = to_del.begin();
+      for (unordered_set<Instruction *>::const_iterator it = to_del.begin();
            it != to_del.end(); ++it)
          delete_Instruction(func->getProgram(), *it);
    }
diff --git a/src/gallium/drivers/nouveau/codegen/unordered_set.h b/src/gallium/drivers/nouveau/codegen/unordered_set.h
new file mode 100644
index 0000000..8ef6d46
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/unordered_set.h
@@ -0,0 +1,48 @@
+#ifndef __NV50_UNORDERED_SET_H__
+#define __NV50_UNORDERED_SET_H__
+
+#if (__cplusplus >= 201103L) || defined(ANDROID)
+#include <unordered_set>
+#else
+#include <tr1/unordered_set>
+#endif
+
+namespace nv50_ir {
+
+#if __cplusplus >= 201103L
+using std::unordered_set;
+#elif !defined(ANDROID)
+using std::tr1::unordered_set;
+#else // Android release before lollipop
+using std::isfinite;
+typedef std::tr1::unordered_set<void *> voidptr_unordered_set;
+
+template <typename V>
+class unordered_set : public voidptr_unordered_set {
+  public:
+    typedef voidptr_unordered_set _base;
+    typedef _base::iterator _biterator;
+    typedef _base::const_iterator const_biterator;
+
+    class iterator : public _biterator {
+      public:
+        iterator(const _biterator & i) : _biterator(i) {}
+        V operator*() const { return reinterpret_cast<V>(*_biterator(*this)); }
+    };
+    class const_iterator : public const_biterator {
+      public:
+        const_iterator(const iterator & i) : const_biterator(i) {}
+        const_iterator(const const_biterator & i) : const_biterator(i) {}
+        const V operator*() const { return reinterpret_cast<const V>(*const_biterator(*this)); }
+    };
+
+    iterator begin() { return _base::begin(); }
+    iterator end() { return _base::end(); }
+    const_iterator begin() const { return _base::begin(); }
+    const_iterator end() const { return _base::end(); }
+};
+#endif
+
+} // namespace nv50_ir
+
+#endif // __NV50_UNORDERED_SET_H__
diff --git a/src/gallium/drivers/nouveau/nouveau_compiler.c b/src/gallium/drivers/nouveau/nouveau_compiler.c
index 8660498..495450b 100644
--- a/src/gallium/drivers/nouveau/nouveau_compiler.c
+++ b/src/gallium/drivers/nouveau/nouveau_compiler.c
@@ -190,6 +190,10 @@ main(int argc, char *argv[])
       type = PIPE_SHADER_GEOMETRY;
    else if (!strncmp(text, "COMP", 4))
       type = PIPE_SHADER_COMPUTE;
+   else if (!strncmp(text, "TESS_CTRL", 9))
+      type = PIPE_SHADER_TESS_CTRL;
+   else if (!strncmp(text, "TESS_EVAL", 9))
+      type = PIPE_SHADER_TESS_EVAL;
    else {
       _debug_printf("Unrecognized TGSI header\n");
       return 1;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index 9505a0b..410e631 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -117,7 +117,6 @@ nv50_blend_state_create(struct pipe_context *pipe,
    struct nv50_blend_stateobj *so = CALLOC_STRUCT(nv50_blend_stateobj);
    int i;
    bool emit_common_func = cso->rt[0].blend_enable;
-   uint32_t ms;
 
    if (nv50_context(pipe)->screen->tesla->oclass >= NVA3_3D_CLASS) {
       SB_BEGIN_3D(so, BLEND_INDEPENDENT, 1);
@@ -189,15 +188,6 @@ nv50_blend_state_create(struct pipe_context *pipe,
       SB_DATA    (so, nv50_colormask(cso->rt[0].colormask));
    }
 
-   ms = 0;
-   if (cso->alpha_to_coverage)
-      ms |= NV50_3D_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE;
-   if (cso->alpha_to_one)
-      ms |= NV50_3D_MULTISAMPLE_CTRL_ALPHA_TO_ONE;
-
-   SB_BEGIN_3D(so, MULTISAMPLE_CTRL, 1);
-   SB_DATA    (so, ms);
-
    assert(so->size <= (sizeof(so->state) / sizeof(so->state[0])));
    return so;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
index 985603d..b304a17 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
@@ -1,4 +1,6 @@
 
+#include "util/u_format.h"
+
 #include "nv50/nv50_context.h"
 #include "nv50/nv50_defs.xml.h"
 
@@ -314,6 +316,25 @@ nv50_validate_derived_2(struct nv50_context *nv50)
 }
 
 static void
+nv50_validate_derived_3(struct nv50_context *nv50)
+{
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct pipe_framebuffer_state *fb = &nv50->framebuffer;
+   uint32_t ms = 0;
+
+   if ((!fb->nr_cbufs || !fb->cbufs[0] ||
+        !util_format_is_pure_integer(fb->cbufs[0]->format)) && nv50->blend) {
+      if (nv50->blend->pipe.alpha_to_coverage)
+         ms |= NV50_3D_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE;
+      if (nv50->blend->pipe.alpha_to_one)
+         ms |= NV50_3D_MULTISAMPLE_CTRL_ALPHA_TO_ONE;
+   }
+
+   BEGIN_NV04(push, NV50_3D(MULTISAMPLE_CTRL), 1);
+   PUSH_DATA (push, ms);
+}
+
+static void
 nv50_validate_clip(struct nv50_context *nv50)
 {
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
@@ -474,6 +495,7 @@ static struct state_validate {
     { nv50_validate_derived_rs,    NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER |
                                    NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
     { nv50_validate_derived_2,     NV50_NEW_ZSA | NV50_NEW_FRAMEBUFFER },
+    { nv50_validate_derived_3,     NV50_NEW_BLEND | NV50_NEW_FRAMEBUFFER },
     { nv50_validate_clip,          NV50_NEW_CLIP | NV50_NEW_RASTERIZER |
                                    NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
     { nv50_constbufs_validate,     NV50_NEW_CONSTBUF },
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
index cf75d1e..4b1d00c 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
@@ -19,7 +19,7 @@
 struct nv50_blend_stateobj {
    struct pipe_blend_state pipe;
    int size;
-   uint32_t state[84]; // TODO: allocate less if !independent_blend_enable
+   uint32_t state[82]; // TODO: allocate less if !independent_blend_enable
 };
 
 struct nv50_rasterizer_stateobj {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
index b1ae016..64348b3 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -68,6 +68,10 @@ nv50_2d_format(enum pipe_format format, bool dst, bool dst_src_equal)
       return NV50_SURFACE_FORMAT_R16_UNORM;
    case 4:
       return NV50_SURFACE_FORMAT_BGRA8_UNORM;
+   case 8:
+      return NV50_SURFACE_FORMAT_RGBA16_FLOAT;
+   case 16:
+      return NV50_SURFACE_FORMAT_RGBA32_FLOAT;
    default:
       return 0;
    }
@@ -1003,6 +1007,8 @@ nv50_blitctx_prepare_state(struct nv50_blitctx *blit)
    /* zsa state */
    BEGIN_NV04(push, NV50_3D(DEPTH_TEST_ENABLE), 1);
    PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV50_3D(DEPTH_BOUNDS_EN), 1);
+   PUSH_DATA (push, 0);
    BEGIN_NV04(push, NV50_3D(STENCIL_ENABLE), 1);
    PUSH_DATA (push, 0);
    BEGIN_NV04(push, NV50_3D(ALPHA_TEST_ENABLE), 1);
@@ -1387,18 +1393,24 @@ nv50_blit_eng2d(struct nv50_context *nv50, const struct pipe_blit_info *info)
             PUSH_DATA (push, info->dst.box.z + i);
          } else {
             const unsigned z = info->dst.box.z + i;
+            const uint64_t address = dst->base.address +
+               dst->level[info->dst.level].offset +
+               z * dst->layer_stride;
             BEGIN_NV04(push, NV50_2D(DST_ADDRESS_HIGH), 2);
-            PUSH_DATAh(push, dst->base.address + z * dst->layer_stride);
-            PUSH_DATA (push, dst->base.address + z * dst->layer_stride);
+            PUSH_DATAh(push, address);
+            PUSH_DATA (push, address);
          }
          if (src->layout_3d) {
             /* not possible because of depth tiling */
             assert(0);
          } else {
             const unsigned z = info->src.box.z + i;
+            const uint64_t address = src->base.address +
+               src->level[info->src.level].offset +
+               z * src->layer_stride;
             BEGIN_NV04(push, NV50_2D(SRC_ADDRESS_HIGH), 2);
-            PUSH_DATAh(push, src->base.address + z * src->layer_stride);
-            PUSH_DATA (push, src->base.address + z * src->layer_stride);
+            PUSH_DATAh(push, address);
+            PUSH_DATA (push, address);
          }
          BEGIN_NV04(push, NV50_2D(BLIT_SRC_Y_INT), 1); /* trigger */
          PUSH_DATA (push, srcy >> 32);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
index 84f8db6..7a15a11 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -132,6 +132,9 @@ nvc0_context_unreference_resources(struct nvc0_context *nvc0)
       pipe_resource_reference(res, NULL);
    }
    util_dynarray_fini(&nvc0->global_residents);
+
+   if (nvc0->tcp_empty)
+      nvc0->base.pipe.delete_tcs_state(&nvc0->base.pipe, nvc0->tcp_empty);
 }
 
 static void
@@ -306,13 +309,6 @@ nvc0_create(struct pipe_screen *pscreen, void *priv)
    pipe->memory_barrier = nvc0_memory_barrier;
    pipe->get_sample_position = nvc0_context_get_sample_position;
 
-   if (!screen->cur_ctx) {
-      nvc0->state = screen->save_state;
-      screen->cur_ctx = nvc0;
-      nouveau_pushbuf_bufctx(screen->base.pushbuf, nvc0->bufctx);
-   }
-   screen->base.pushbuf->kick_notify = nvc0_default_kick_notify;
-
    nvc0_init_query_functions(nvc0);
    nvc0_init_surface_functions(nvc0);
    nvc0_init_state_functions(nvc0);
@@ -326,6 +322,21 @@ nvc0_create(struct pipe_screen *pscreen, void *priv)
 
    /* shader builtin library is per-screen, but we need a context for m2mf */
    nvc0_program_library_upload(nvc0);
+   nvc0_program_init_tcp_empty(nvc0);
+   if (!nvc0->tcp_empty)
+      goto out_err;
+   /* set the empty tctl prog on next draw in case one is never set */
+   nvc0->dirty |= NVC0_NEW_TCTLPROG;
+
+   /* now that there are no more opportunities for errors, set the current
+    * context if there isn't already one.
+    */
+   if (!screen->cur_ctx) {
+      nvc0->state = screen->save_state;
+      screen->cur_ctx = nvc0;
+      nouveau_pushbuf_bufctx(screen->base.pushbuf, nvc0->bufctx);
+   }
+   screen->base.pushbuf->kick_notify = nvc0_default_kick_notify;
 
    /* add permanently resident buffers to bufctxts */
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index f449942..df1a891 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -128,6 +128,8 @@ struct nvc0_context {
    struct nvc0_program *fragprog;
    struct nvc0_program *compprog;
 
+   struct nvc0_program *tcp_empty;
+
    struct nvc0_constbuf constbuf[6][NVC0_MAX_PIPE_CONSTBUFS];
    uint16_t constbuf_dirty[6];
    uint16_t constbuf_valid[6];
@@ -227,6 +229,7 @@ void nvc0_program_destroy(struct nvc0_context *, struct nvc0_program *);
 void nvc0_program_library_upload(struct nvc0_context *);
 uint32_t nvc0_program_symbol_offset(const struct nvc0_program *,
                                     uint32_t label);
+void nvc0_program_init_tcp_empty(struct nvc0_context *);
 
 /* nvc0_query.c */
 void nvc0_init_query_functions(struct nvc0_context *);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index 507a250..12f1bb7 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -22,6 +22,8 @@
 
 #include "pipe/p_defines.h"
 
+#include "tgsi/tgsi_ureg.h"
+
 #include "nvc0/nvc0_context.h"
 
 #include "codegen/nv50_ir_driver.h"
@@ -799,3 +801,18 @@ nvc0_program_symbol_offset(const struct nvc0_program *prog, uint32_t label)
          return prog->code_base + base + syms[i].offset;
    return prog->code_base; /* no symbols or symbol not found */
 }
+
+void
+nvc0_program_init_tcp_empty(struct nvc0_context *nvc0)
+{
+   struct ureg_program *ureg;
+
+   ureg = ureg_create(TGSI_PROCESSOR_TESS_CTRL);
+   if (!ureg)
+      return;
+
+   ureg_property(ureg, TGSI_PROPERTY_TCS_VERTICES_OUT, 1);
+   ureg_END(ureg);
+
+   nvc0->tcp_empty = ureg_create_shader_and_destroy(ureg, &nvc0->base.pipe);
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index 8aa127a..8f8ac2d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -148,8 +148,13 @@ nvc0_tctlprog_validate(struct nvc0_context *nvc0)
       BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(2)), 1);
       PUSH_DATA (push, tp->num_gprs);
    } else {
-      BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 1);
+      tp = nvc0->tcp_empty;
+      /* not a whole lot we can do to handle this failure */
+      if (!nvc0_program_validate(nvc0, tp))
+         assert(!"unable to validate empty tcp");
+      BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 2);
       PUSH_DATA (push, 0x20);
+      PUSH_DATA (push, tp->code_base);
    }
    nvc0_program_update_context_state(nvc0, tp, 1);
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index 2a33857..ee29912 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -90,7 +90,6 @@ nvc0_blend_state_create(struct pipe_context *pipe,
    struct nvc0_blend_stateobj *so = CALLOC_STRUCT(nvc0_blend_stateobj);
    int i;
    int r; /* reference */
-   uint32_t ms;
    uint8_t blend_en = 0;
    bool indep_masks = false;
    bool indep_funcs = false;
@@ -176,15 +175,6 @@ nvc0_blend_state_create(struct pipe_context *pipe,
       }
    }
 
-   ms = 0;
-   if (cso->alpha_to_coverage)
-      ms |= NVC0_3D_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE;
-   if (cso->alpha_to_one)
-      ms |= NVC0_3D_MULTISAMPLE_CTRL_ALPHA_TO_ONE;
-
-   SB_BEGIN_3D(so, MULTISAMPLE_CTRL, 1);
-   SB_DATA    (so, ms);
-
    assert(so->size <= (sizeof(so->state) / sizeof(so->state[0])));
    return so;
 }
@@ -234,7 +224,7 @@ nvc0_rasterizer_state_create(struct pipe_context *pipe,
     SB_IMMED_3D(so, MULTISAMPLE_ENABLE, cso->multisample);
 
     SB_IMMED_3D(so, LINE_SMOOTH_ENABLE, cso->line_smooth);
-    if (cso->line_smooth)
+    if (cso->line_smooth || cso->multisample)
        SB_BEGIN_3D(so, LINE_WIDTH_SMOOTH, 1);
     else
        SB_BEGIN_3D(so, LINE_WIDTH_ALIASED, 1);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index ce1119c..47bd66d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -1,4 +1,5 @@
 
+#include "util/u_format.h"
 #include "util/u_math.h"
 
 #include "nvc0/nvc0_context.h"
@@ -555,6 +556,25 @@ nvc0_validate_derived_2(struct nvc0_context *nvc0)
 }
 
 static void
+nvc0_validate_derived_3(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct pipe_framebuffer_state *fb = &nvc0->framebuffer;
+   uint32_t ms = 0;
+
+   if ((!fb->nr_cbufs || !fb->cbufs[0] ||
+        !util_format_is_pure_integer(fb->cbufs[0]->format)) && nvc0->blend) {
+      if (nvc0->blend->pipe.alpha_to_coverage)
+         ms |= NVC0_3D_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE;
+      if (nvc0->blend->pipe.alpha_to_one)
+         ms |= NVC0_3D_MULTISAMPLE_CTRL_ALPHA_TO_ONE;
+   }
+
+   BEGIN_NVC0(push, NVC0_3D(MULTISAMPLE_CTRL), 1);
+   PUSH_DATA (push, ms);
+}
+
+static void
 nvc0_validate_tess_state(struct nvc0_context *nvc0)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
@@ -628,6 +648,7 @@ static struct state_validate {
     { nvc0_validate_derived_1,     NVC0_NEW_FRAGPROG | NVC0_NEW_ZSA |
                                    NVC0_NEW_RASTERIZER },
     { nvc0_validate_derived_2,     NVC0_NEW_ZSA | NVC0_NEW_FRAMEBUFFER },
+    { nvc0_validate_derived_3,     NVC0_NEW_BLEND | NVC0_NEW_FRAMEBUFFER },
     { nvc0_validate_clip,          NVC0_NEW_CLIP | NVC0_NEW_RASTERIZER |
                                    NVC0_NEW_VERTPROG |
                                    NVC0_NEW_TEVLPROG |
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
index 18fcc12..8bc33c6 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
@@ -17,7 +17,7 @@
 struct nvc0_blend_stateobj {
    struct pipe_blend_state pipe;
    int size;
-   uint32_t state[72];
+   uint32_t state[70];
 };
 
 struct nvc0_rasterizer_stateobj {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index 51a6f93..dbdf292 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -887,6 +887,7 @@ nvc0_blitctx_prepare_state(struct nvc0_blitctx *blit)
 
    /* zsa state */
    IMMED_NVC0(push, NVC0_3D(DEPTH_TEST_ENABLE), 0);
+   IMMED_NVC0(push, NVC0_3D(DEPTH_BOUNDS_EN), 0);
    IMMED_NVC0(push, NVC0_3D(STENCIL_ENABLE), 0);
    IMMED_NVC0(push, NVC0_3D(ALPHA_TEST_ENABLE), 0);
 
@@ -1336,18 +1337,24 @@ nvc0_blit_eng2d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
             PUSH_DATA (push, info->dst.box.z + i);
          } else {
             const unsigned z = info->dst.box.z + i;
+            const uint64_t address = dst->base.address +
+               dst->level[info->dst.level].offset +
+               z * dst->layer_stride;
             BEGIN_NVC0(push, NVC0_2D(DST_ADDRESS_HIGH), 2);
-            PUSH_DATAh(push, dst->base.address + z * dst->layer_stride);
-            PUSH_DATA (push, dst->base.address + z * dst->layer_stride);
+            PUSH_DATAh(push, address);
+            PUSH_DATA (push, address);
          }
          if (src->layout_3d) {
             /* not possible because of depth tiling */
             assert(0);
          } else {
             const unsigned z = info->src.box.z + i;
+            const uint64_t address = src->base.address +
+               src->level[info->src.level].offset +
+               z * src->layer_stride;
             BEGIN_NVC0(push, NVC0_2D(SRC_ADDRESS_HIGH), 2);
-            PUSH_DATAh(push, src->base.address + z * src->layer_stride);
-            PUSH_DATA (push, src->base.address + z * src->layer_stride);
+            PUSH_DATAh(push, address);
+            PUSH_DATA (push, address);
          }
          BEGIN_NVC0(push, NVC0_2D(BLIT_SRC_Y_INT), 1); /* trigger */
          PUSH_DATA (push, srcy >> 32);
diff --git a/src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c b/src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c
index 14f93fb..e8f4087 100644
--- a/src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c
+++ b/src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c
@@ -693,7 +693,8 @@ void rc_init_regalloc_state(struct rc_regalloc_state *s)
 	};
 
 	/* Allocate the main ra data structure */
-	s->regs = ra_alloc_reg_set(NULL, R500_PFS_NUM_TEMP_REGS * RC_MASK_XYZW);
+	s->regs = ra_alloc_reg_set(NULL, R500_PFS_NUM_TEMP_REGS * RC_MASK_XYZW,
+                                   true);
 
 	/* Create the register classes */
 	for (i = 0; i < RC_REG_CLASS_COUNT; i++) {
diff --git a/src/gallium/drivers/r300/r300_blit.c b/src/gallium/drivers/r300/r300_blit.c
index 6ea8f24..b8cc316 100644
--- a/src/gallium/drivers/r300/r300_blit.c
+++ b/src/gallium/drivers/r300/r300_blit.c
@@ -667,7 +667,8 @@ static void r300_resource_copy_region(struct pipe_context *pipe,
     r300_blitter_begin(r300, R300_COPY);
     util_blitter_blit_generic(r300->blitter, dst_view, &dstbox,
                               src_view, src_box, src_width0, src_height0,
-                              PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL);
+                              PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL,
+                              FALSE);
     r300_blitter_end(r300);
 
     pipe_surface_reference(&dst_view, NULL);
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index b0002c3..22a0950 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -732,7 +732,8 @@ void r600_resource_copy_region(struct pipe_context *ctx,
 	r600_blitter_begin(ctx, R600_COPY_TEXTURE);
 	util_blitter_blit_generic(rctx->blitter, dst_view, &dstbox,
 				  src_view, src_box, src_width0, src_height0,
-				  PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL);
+				  PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL,
+				  FALSE);
 	r600_blitter_end(ctx);
 
 	pipe_surface_reference(&dst_view, NULL);
diff --git a/src/gallium/drivers/r600/r600_isa.h b/src/gallium/drivers/r600/r600_isa.h
index 381f06d..fdbe1c0 100644
--- a/src/gallium/drivers/r600/r600_isa.h
+++ b/src/gallium/drivers/r600/r600_isa.h
@@ -262,7 +262,7 @@ static const struct alu_op_info alu_op_table[] = {
 		{"PRED_SETNE_PUSH_INT",       2, { 0x4D, 0x4D },{  AF_VS, AF_VS, AF_VS, AF_VS},  AF_PRED_PUSH | AF_CC_NE | AF_INT_CMP },
 		{"PRED_SETLT_PUSH_INT",       2, { 0x4E, 0x4E },{  AF_VS, AF_VS, AF_VS, AF_VS},  AF_PRED_PUSH | AF_CC_LT | AF_INT_CMP },
 		{"PRED_SETLE_PUSH_INT",       2, { 0x4F, 0x4F },{  AF_VS, AF_VS, AF_VS, AF_VS},  AF_PRED_PUSH | AF_CC_LE | AF_INT_CMP },
-		{"FLT_TO_INT",                1, { 0x6B, 0x50 },{   AF_S,  AF_S, AF_VS, AF_VS},  AF_INT_DST | AF_CVT },
+		{"FLT_TO_INT",                1, { 0x6B, 0x50 },{   AF_S,  AF_S,  AF_V,  AF_V},  AF_INT_DST | AF_CVT },
 		{"BFREV_INT",                 1, {   -1, 0x51 },{      0,     0, AF_VS, AF_VS},  AF_INT_DST },
 		{"ADDC_UINT",                 2, {   -1, 0x52 },{      0,     0, AF_VS, AF_VS},  AF_UINT_DST },
 		{"SUBB_UINT",                 2, {   -1, 0x53 },{      0,     0, AF_VS, AF_VS},  AF_UINT_DST },
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 9b66105..384ba80 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -90,7 +90,7 @@
 
 struct r600_context;
 struct r600_bytecode;
-struct r600_shader_key;
+union  r600_shader_key;
 
 /* This is an atom containing GPU commands that never change.
  * This is supposed to be copied directly into the CS. */
@@ -643,7 +643,7 @@ void r600_resource_copy_region(struct pipe_context *ctx,
 /* r600_shader.c */
 int r600_pipe_shader_create(struct pipe_context *ctx,
 			    struct r600_pipe_shader *shader,
-			    struct r600_shader_key key);
+			    union r600_shader_key key);
 
 void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader);
 
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 8d1f95a..4c4b600 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -62,7 +62,7 @@ The compiler must issue the source argument to slots z, y, and x
 
 static int r600_shader_from_tgsi(struct r600_context *rctx,
 				 struct r600_pipe_shader *pipeshader,
-				 struct r600_shader_key key);
+				 union r600_shader_key key);
 
 
 static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
@@ -133,7 +133,7 @@ static int store_shader(struct pipe_context *ctx,
 
 int r600_pipe_shader_create(struct pipe_context *ctx,
 			    struct r600_pipe_shader *shader,
-			    struct r600_shader_key key)
+			    union r600_shader_key key)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct r600_pipe_shader_selector *sel = shader->selector;
@@ -141,7 +141,7 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
 	bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens);
 	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
 	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
-	unsigned export_shader = key.vs_as_es;
+	unsigned export_shader = key.vs.as_es;
 
 	shader->shader.bc.isa = rctx->isa;
 
@@ -1802,7 +1802,7 @@ static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind)
 
 static int r600_shader_from_tgsi(struct r600_context *rctx,
 				 struct r600_pipe_shader *pipeshader,
-				 struct r600_shader_key key)
+				 union r600_shader_key key)
 {
 	struct r600_screen *rscreen = rctx->screen;
 	struct r600_shader *shader = &pipeshader->shader;
@@ -1816,7 +1816,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	unsigned opcode;
 	int i, j, k, r = 0;
 	int next_param_base = 0, next_clip_base;
-	int max_color_exports = MAX2(key.nr_cbufs, 1);
+	int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
 	/* Declarations used by llvm code */
 	bool use_llvm = false;
 	bool indirect_gprs;
@@ -1830,8 +1830,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	ctx.shader = shader;
 	ctx.native_integers = true;
 
-	shader->vs_as_gs_a = key.vs_as_gs_a;
-	shader->vs_as_es = key.vs_as_es;
+	shader->vs_as_gs_a = key.vs.as_gs_a;
+	shader->vs_as_es = key.vs.as_es;
 
 	r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
 			   rscreen->has_compressed_msaa_texturing);
@@ -1844,9 +1844,9 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	shader->processor_type = ctx.type;
 	ctx.bc->type = shader->processor_type;
 
-	ring_outputs = key.vs_as_es || (ctx.type == TGSI_PROCESSOR_GEOMETRY);
+	ring_outputs = key.vs.as_es || (ctx.type == TGSI_PROCESSOR_GEOMETRY);
 
-	if (key.vs_as_es) {
+	if (key.vs.as_es) {
 		ctx.gs_for_vs = &rctx->gs_shader->current->shader;
 	} else {
 		ctx.gs_for_vs = NULL;
@@ -1866,7 +1866,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	shader->nr_ps_color_exports = 0;
 	shader->nr_ps_max_color_exports = 0;
 
-	shader->two_side = key.color_two_side;
+	shader->two_side = key.ps.color_two_side;
 
 	/* register allocations */
 	/* Values [0,127] correspond to GPR[0..127].
@@ -1970,7 +1970,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	shader->fs_write_all = FALSE;
 
 	if (shader->vs_as_gs_a)
-		vs_add_primid_output(&ctx, key.vs_prim_id_out);
+		vs_add_primid_output(&ctx, key.vs.prim_id_out);
 
 	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
 		tgsi_parse_token(&ctx.parse);
@@ -2091,7 +2091,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 		radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
 		radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN);
 		radeon_llvm_ctx.stream_outputs = &so;
-		radeon_llvm_ctx.alpha_to_one = key.alpha_to_one;
+		radeon_llvm_ctx.alpha_to_one = key.ps.alpha_to_one;
 		radeon_llvm_ctx.has_compressed_msaa_texturing =
 			ctx.bc->has_compressed_msaa_texturing;
 		mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
@@ -2270,7 +2270,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	convert_edgeflag_to_int(&ctx);
 
 	if (ring_outputs) {
-		if (key.vs_as_es)
+		if (key.vs.as_es)
 			emit_gs_ring_writes(&ctx, FALSE);
 	} else {
 		/* Export output */
@@ -2386,7 +2386,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 						j--;
 						continue;
 					}
-					output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
+					output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
 					output[j].array_base = shader->output[i].sid;
 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
 					shader->nr_ps_color_exports++;
@@ -2399,7 +2399,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 							output[j].swizzle_x = 0;
 							output[j].swizzle_y = 1;
 							output[j].swizzle_z = 2;
-							output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
+							output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
 							output[j].burst_count = 1;
 							output[j].array_base = k;
 							output[j].op = CF_OP_EXPORT;
@@ -6151,10 +6151,10 @@ static int tgsi_cmp(struct r600_shader_ctx *ctx)
 		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
 		if (r)
 			return r;
-		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[2]);
+		r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
 		if (r)
 			return r;
-		r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[2], &ctx->src[1]);
+		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
 		if (r)
 			return r;
 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
index 5d05c81..927bac5 100644
--- a/src/gallium/drivers/r600/r600_shader.h
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -95,13 +95,17 @@ struct r600_shader {
 	struct r600_shader_array * arrays;
 };
 
-struct r600_shader_key {
-	unsigned color_two_side:1;
-	unsigned alpha_to_one:1;
-	unsigned nr_cbufs:4;
-	unsigned vs_as_es:1;
-	unsigned vs_as_gs_a:1;
-	unsigned vs_prim_id_out:8;
+union r600_shader_key {
+	struct {
+		unsigned	nr_cbufs:4;
+		unsigned	color_two_side:1;
+		unsigned	alpha_to_one:1;
+	} ps;
+	struct {
+		unsigned	prim_id_out:8;
+		unsigned	as_es:1; /* export shader */
+		unsigned	as_gs_a:1;
+	} vs;
 };
 
 struct r600_shader_array {
@@ -122,7 +126,7 @@ struct r600_pipe_shader {
 	unsigned		flatshade;
 	unsigned		pa_cl_vs_out_cntl;
 	unsigned		nr_ps_color_outputs;
-	struct r600_shader_key	key;
+	union r600_shader_key	key;
 	unsigned		db_shader_control;
 	unsigned		ps_depth_export;
 	unsigned		enabled_stream_buffers_mask;
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index aa4a8d0..a05dd83 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -702,29 +702,39 @@ void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom)
 }
 
 /* Compute the key for the hw shader variant */
-static inline struct r600_shader_key r600_shader_selector_key(struct pipe_context * ctx,
+static inline union r600_shader_key r600_shader_selector_key(struct pipe_context * ctx,
 		struct r600_pipe_shader_selector * sel)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
-	struct r600_shader_key key;
+	union r600_shader_key key;
 	memset(&key, 0, sizeof(key));
 
-	if (sel->type == PIPE_SHADER_FRAGMENT) {
-		key.color_two_side = rctx->rasterizer && rctx->rasterizer->two_side;
-		key.alpha_to_one = rctx->alpha_to_one &&
-				   rctx->rasterizer && rctx->rasterizer->multisample_enable &&
-				   !rctx->framebuffer.cb0_is_integer;
-		key.nr_cbufs = rctx->framebuffer.state.nr_cbufs;
-		/* Dual-source blending only makes sense with nr_cbufs == 1. */
-		if (key.nr_cbufs == 1 && rctx->dual_src_blend)
-			key.nr_cbufs = 2;
-	} else if (sel->type == PIPE_SHADER_VERTEX) {
-		key.vs_as_es = (rctx->gs_shader != NULL);
+	switch (sel->type) {
+	case PIPE_SHADER_VERTEX: {
+		key.vs.as_es = (rctx->gs_shader != NULL);
 		if (rctx->ps_shader->current->shader.gs_prim_id_input && !rctx->gs_shader) {
-			key.vs_as_gs_a = true;
-			key.vs_prim_id_out = rctx->ps_shader->current->shader.input[rctx->ps_shader->current->shader.ps_prim_id_input].spi_sid;
+			key.vs.as_gs_a = true;
+			key.vs.prim_id_out = rctx->ps_shader->current->shader.input[rctx->ps_shader->current->shader.ps_prim_id_input].spi_sid;
 		}
+		break;
+	}
+	case PIPE_SHADER_GEOMETRY:
+		break;
+	case PIPE_SHADER_FRAGMENT: {
+		key.ps.color_two_side = rctx->rasterizer && rctx->rasterizer->two_side;
+		key.ps.alpha_to_one = rctx->alpha_to_one &&
+				      rctx->rasterizer && rctx->rasterizer->multisample_enable &&
+				      !rctx->framebuffer.cb0_is_integer;
+		key.ps.nr_cbufs = rctx->framebuffer.state.nr_cbufs;
+		/* Dual-source blending only makes sense with nr_cbufs == 1. */
+		if (key.ps.nr_cbufs == 1 && rctx->dual_src_blend)
+			key.ps.nr_cbufs = 2;
+		break;
 	}
+	default:
+		assert(0);
+	}
+
 	return key;
 }
 
@@ -734,7 +744,7 @@ static int r600_shader_select(struct pipe_context *ctx,
         struct r600_pipe_shader_selector* sel,
         bool *dirty)
 {
-	struct r600_shader_key key;
+	union r600_shader_key key;
 	struct r600_pipe_shader * shader = NULL;
 	int r;
 
diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c
index 16ee541..81f3f45 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -209,8 +209,6 @@ static uint32_t profile2stream_type(struct ruvd_decoder *dec, unsigned family)
 
 static unsigned calc_ctx_size(struct ruvd_decoder *dec)
 {
-	unsigned width_in_mb, height_in_mb, ctx_size;
-
 	unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH);
 	unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT);
 
@@ -223,8 +221,7 @@ static unsigned calc_ctx_size(struct ruvd_decoder *dec)
 
 	width = align (width, 16);
 	height = align (height, 16);
-	ctx_size = ((width + 255) / 16)*((height + 255) / 16) * 16 * max_references + 52 * 1024;
-	return ctx_size;
+	return ((width + 255) / 16) * ((height + 255) / 16) * 16 * max_references + 52 * 1024;
 }
 
 /* calculate size of reference picture buffer */
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 48972bd..b7450b6 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -586,7 +586,8 @@ void si_resource_copy_region(struct pipe_context *ctx,
 	si_blitter_begin(ctx, SI_COPY);
 	util_blitter_blit_generic(sctx->blitter, dst_view, &dstbox,
 				  src_view, src_box, src_width0, src_height0,
-				  PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL);
+				  PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL,
+				  FALSE);
 	si_blitter_end(ctx);
 
 	pipe_surface_reference(&dst_view, NULL);
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 4288e9b..fa6c15a 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2277,7 +2277,7 @@ static void tex_fetch_args(
 	unsigned sampler_index;
 	unsigned num_deriv_channels = 0;
 	bool has_offset = HAVE_LLVM >= 0x0305 ? inst->Texture.NumOffsets > 0 : false;
-	LLVMValueRef res_ptr, samp_ptr;
+	LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
 
 	sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
 	sampler_index = emit_data->inst->Src[sampler_src].Register.Index;
@@ -2293,9 +2293,19 @@ static void tex_fetch_args(
 
 		samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER);
 		samp_ptr = build_indexed_load_const(si_shader_ctx, samp_ptr, ind_index);
+
+		if (target == TGSI_TEXTURE_2D_MSAA ||
+		    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
+			ind_index = LLVMBuildAdd(gallivm->builder, ind_index,
+						 lp_build_const_int32(gallivm,
+								      SI_FMASK_TEX_OFFSET), "");
+			fmask_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
+			fmask_ptr = build_indexed_load_const(si_shader_ctx, res_ptr, ind_index);
+		}
 	} else {
 		res_ptr = si_shader_ctx->resources[sampler_index];
 		samp_ptr = si_shader_ctx->samplers[sampler_index];
+		fmask_ptr = si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index];
 	}
 
 	if (target == TGSI_TEXTURE_BUFFER) {
@@ -2493,7 +2503,7 @@ static void tex_fetch_args(
 		txf_emit_data.dst_type = LLVMVectorType(
 			LLVMInt32TypeInContext(gallivm->context), 4);
 		txf_emit_data.args[0] = lp_build_gather_values(gallivm, txf_address, txf_count);
-		txf_emit_data.args[1] = si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index];
+		txf_emit_data.args[1] = fmask_ptr;
 		txf_emit_data.args[2] = lp_build_const_int32(gallivm, inst.Texture.Texture);
 		txf_emit_data.arg_count = 3;
 
@@ -2524,8 +2534,7 @@ static void tex_fetch_args(
 		 * resource descriptor is 0 (invalid),
 		 */
 		LLVMValueRef fmask_desc =
-			LLVMBuildBitCast(gallivm->builder,
-					 si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index],
+			LLVMBuildBitCast(gallivm->builder, fmask_ptr,
 					 LLVMVectorType(uint_bld->elem_type, 8), "");
 
 		LLVMValueRef fmask_word1 =
@@ -3973,7 +3982,7 @@ static void si_dump_key(unsigned shader, union si_shader_key *key)
 			fprintf(stderr, "  es_enabled_outputs = 0x%"PRIx64"\n",
 				key->vs.es_enabled_outputs);
 		fprintf(stderr, "  as_es = %u\n", key->vs.as_es);
-		fprintf(stderr, "  as_es = %u\n", key->vs.as_ls);
+		fprintf(stderr, "  as_ls = %u\n", key->vs.as_ls);
 		break;
 
 	case PIPE_SHADER_TESS_CTRL:
diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index 654c46f..3a63af8 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -270,6 +270,7 @@ struct vc4_context {
 
         struct ra_regs *regs;
         unsigned int reg_class_any;
+        unsigned int reg_class_a_or_b_or_acc;
         unsigned int reg_class_r4_or_a;
         unsigned int reg_class_a;
 
diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
index 7978ea1..5b43583 100644
--- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c
+++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
@@ -143,15 +143,6 @@ qir_opt_algebraic(struct vc4_compile *c)
                 case QOP_SEL_X_Y_ZC:
                 case QOP_SEL_X_Y_NS:
                 case QOP_SEL_X_Y_NC:
-                        if (qir_reg_equals(inst->src[0], inst->src[1])) {
-                                /* Turn "dst = (sf == x) ? a : a)" into
-                                 * "dst = a"
-                                 */
-                                replace_with_mov(c, inst, inst->src[1]);
-                                progress = true;
-                                break;
-                        }
-
                         if (is_zero(c, inst->src[1])) {
                                 /* Replace references to a 0 uniform value
                                  * with the SEL_X_0 equivalent.
@@ -207,6 +198,7 @@ qir_opt_algebraic(struct vc4_compile *c)
 
                         /* FADD(a, FSUB(0, b)) -> FSUB(a, b) */
                         if (inst->src[1].file == QFILE_TEMP &&
+                            c->defs[inst->src[1].index] &&
                             c->defs[inst->src[1].index]->op == QOP_FSUB) {
                                 struct qinst *fsub = c->defs[inst->src[1].index];
                                 if (is_zero(c, fsub->src[0])) {
@@ -221,6 +213,7 @@ qir_opt_algebraic(struct vc4_compile *c)
 
                         /* FADD(FSUB(0, b), a) -> FSUB(a, b) */
                         if (inst->src[0].file == QFILE_TEMP &&
+                            c->defs[inst->src[0].index] &&
                             c->defs[inst->src[0].index]->op == QOP_FSUB) {
                                 struct qinst *fsub = c->defs[inst->src[0].index];
                                 if (is_zero(c, fsub->src[0])) {
@@ -236,18 +229,20 @@ qir_opt_algebraic(struct vc4_compile *c)
                         break;
 
                 case QOP_FMUL:
-                        if (replace_x_0_with_0(c, inst, 0) ||
-                            replace_x_0_with_0(c, inst, 1) ||
-                            fmul_replace_one(c, inst, 0) ||
-                            fmul_replace_one(c, inst, 1)) {
+                        if (!inst->dst.pack &&
+                            (replace_x_0_with_0(c, inst, 0) ||
+                             replace_x_0_with_0(c, inst, 1) ||
+                             fmul_replace_one(c, inst, 0) ||
+                             fmul_replace_one(c, inst, 1))) {
                                 progress = true;
                                 break;
                         }
                         break;
 
                 case QOP_MUL24:
-                        if (replace_x_0_with_0(c, inst, 0) ||
-                            replace_x_0_with_0(c, inst, 1)) {
+                        if (!inst->dst.pack &&
+                            (replace_x_0_with_0(c, inst, 0) ||
+                             replace_x_0_with_0(c, inst, 1))) {
                                 progress = true;
                                 break;
                         }
@@ -280,6 +275,14 @@ qir_opt_algebraic(struct vc4_compile *c)
                         }
                         break;
 
+                case QOP_RCP:
+                        if (is_1f(c, inst->src[0])) {
+                                replace_with_mov(c, inst, inst->src[0]);
+                                progress = true;
+                                break;
+                        }
+                        break;
+
                 default:
                         break;
                 }
diff --git a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
index a755de9..fd2539a 100644
--- a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
+++ b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
@@ -39,21 +39,27 @@ qir_opt_copy_propagation(struct vc4_compile *c)
 {
         bool progress = false;
         bool debug = false;
-        struct qreg *movs = calloc(c->num_temps, sizeof(struct qreg));
 
         list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
                         int index = inst->src[i].index;
                         if (inst->src[i].file == QFILE_TEMP &&
-                            (movs[index].file == QFILE_TEMP ||
-                             movs[index].file == QFILE_UNIF)) {
+                            c->defs[index] &&
+                            c->defs[index]->op == QOP_MOV &&
+                            (c->defs[index]->src[0].file == QFILE_TEMP ||
+                             c->defs[index]->src[0].file == QFILE_UNIF)) {
+                                /* If it has a pack, it shouldn't be an SSA
+                                 * def.
+                                 */
+                                assert(!c->defs[index]->dst.pack);
+
                                 if (debug) {
                                         fprintf(stderr, "Copy propagate: ");
                                         qir_dump_inst(c, inst);
                                         fprintf(stderr, "\n");
                                 }
 
-                                inst->src[i] = movs[index];
+                                inst->src[i] = c->defs[index]->src[0];
 
                                 if (debug) {
                                         fprintf(stderr, "to: ");
@@ -64,14 +70,6 @@ qir_opt_copy_propagation(struct vc4_compile *c)
                                 progress = true;
                         }
                 }
-
-                if (inst->op == QOP_MOV &&
-                    inst->dst.file == QFILE_TEMP &&
-                    inst->src[0].file != QFILE_VPM) {
-                        movs[inst->dst.index] = inst->src[0];
-                }
         }
-
-        free(movs);
         return progress;
 }
diff --git a/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c b/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c
index e04f028..f2cdf8f 100644
--- a/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c
+++ b/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c
@@ -68,7 +68,7 @@ qir_opt_vpm_writes(struct vc4_compile *c)
                         continue;
 
                 struct qinst *inst = c->defs[temp];
-                if (qir_is_multi_instruction(inst))
+                if (!inst || qir_is_multi_instruction(inst))
                         continue;
 
                 if (qir_depends_on_flags(inst) || inst->sf)
@@ -79,22 +79,6 @@ qir_opt_vpm_writes(struct vc4_compile *c)
                         continue;
                 }
 
-                /* A QOP_TEX_RESULT destination is r4, so we can't move
-                 * accesses to it past another QOP_TEX_RESULT which would
-                 * update it.
-                 */
-                int src;
-                for (src = 0; src < qir_get_op_nsrc(inst->op); src++) {
-                        if (inst->src[src].file == QFILE_TEMP) {
-                                if (c->defs[inst->src[src].index]->op ==
-                                    QOP_TEX_RESULT) {
-                                        break;
-                                }
-                        }
-                }
-                if (src != qir_get_op_nsrc(inst->op))
-                        continue;
-
                 /* Move the generating instruction to the end of the program
                  * to maintain the order of the VPM writes.
                  */
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 13c4721..e002983 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -818,6 +818,72 @@ declare_uniform_range(struct vc4_compile *c, uint32_t start, uint32_t size)
         c->ubo_ranges[array_id].used = false;
 }
 
+static bool
+ntq_src_is_only_ssa_def_user(nir_src *src)
+{
+        if (!src->is_ssa)
+                return false;
+
+        if (!list_empty(&src->ssa->if_uses))
+                return false;
+
+        return (src->ssa->uses.next == &src->use_link &&
+                src->ssa->uses.next->next == &src->ssa->uses);
+}
+
+/**
+ * In general, emits a nir_pack_unorm_4x8 as a series of MOVs with the pack
+ * bit set.
+ *
+ * However, as an optimization, it tries to find the instructions generating
+ * the sources to be packed and just emit the pack flag there, if possible.
+ */
+static void
+ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
+{
+        struct qreg result = qir_get_temp(c);
+        struct nir_alu_instr *vec4 = NULL;
+
+        /* If packing from a vec4 op (as expected), identify it so that we can
+         * peek back at what generated its sources.
+         */
+        if (instr->src[0].src.is_ssa &&
+            instr->src[0].src.ssa->parent_instr->type == nir_instr_type_alu &&
+            nir_instr_as_alu(instr->src[0].src.ssa->parent_instr)->op ==
+            nir_op_vec4) {
+                vec4 = nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
+        }
+
+        for (int i = 0; i < 4; i++) {
+                int swiz = instr->src[0].swizzle[i];
+                struct qreg src;
+                if (vec4) {
+                        src = ntq_get_src(c, vec4->src[swiz].src,
+                                          vec4->src[swiz].swizzle[0]);
+                } else {
+                        src = ntq_get_src(c, instr->src[0].src, swiz);
+                }
+
+                if (vec4 &&
+                    ntq_src_is_only_ssa_def_user(&vec4->src[swiz].src) &&
+                    src.file == QFILE_TEMP &&
+                    c->defs[src.index] &&
+                    qir_is_mul(c->defs[src.index]) &&
+                    !c->defs[src.index]->dst.pack) {
+                        struct qinst *rewrite = c->defs[src.index];
+                        c->defs[src.index] = NULL;
+                        rewrite->dst = result;
+                        rewrite->dst.pack = QPU_PACK_MUL_8A + i;
+                        continue;
+                }
+
+                qir_PACK_8_F(c, result, src, i);
+        }
+
+        struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
+        *dest = result;
+}
+
 static void
 ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
 {
@@ -839,17 +905,7 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
         }
 
         if (instr->op == nir_op_pack_unorm_4x8) {
-                struct qreg result;
-                for (int i = 0; i < 4; i++) {
-                        struct qreg src = ntq_get_src(c, instr->src[0].src,
-                                                      instr->src[0].swizzle[i]);
-                        if (i == 0)
-                                result = qir_PACK_8888_F(c, src);
-                        else
-                                result = qir_PACK_8_F(c, result, src, i);
-                }
-                struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
-                *dest = result;
+                ntq_emit_pack_unorm_4x8(c, instr);
                 return;
         }
 
@@ -1130,20 +1186,24 @@ emit_frag_end(struct vc4_compile *c)
 static void
 emit_scaled_viewport_write(struct vc4_compile *c, struct qreg rcp_w)
 {
-        struct qreg xyi[2];
+        struct qreg packed = qir_get_temp(c);
 
         for (int i = 0; i < 2; i++) {
                 struct qreg scale =
                         qir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, 0);
 
-                xyi[i] = qir_FTOI(c, qir_FMUL(c,
-                                              qir_FMUL(c,
-                                                       c->outputs[c->output_position_index + i],
-                                                       scale),
-                                              rcp_w));
+                struct qreg packed_chan = packed;
+                packed_chan.pack = QPU_PACK_A_16A + i;
+
+                qir_FTOI_dest(c, packed_chan,
+                              qir_FMUL(c,
+                                       qir_FMUL(c,
+                                                c->outputs[c->output_position_index + i],
+                                                scale),
+                                       rcp_w));
         }
 
-        qir_VPM_WRITE(c, qir_PACK_SCALED(c, xyi[0], xyi[1]));
+        qir_VPM_WRITE(c, packed);
 }
 
 static void
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index 254140a..9d93071 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -71,12 +71,11 @@ static const struct qir_op_info qir_op_info[] = {
         [QOP_RSQ] = { "rsq", 1, 1, false, true },
         [QOP_EXP2] = { "exp2", 1, 2, false, true },
         [QOP_LOG2] = { "log2", 1, 2, false, true },
-        [QOP_PACK_8888_F] = { "pack_8888_f", 1, 1, false, true },
-        [QOP_PACK_8A_F] = { "pack_8a_f", 1, 2, false, true },
-        [QOP_PACK_8B_F] = { "pack_8b_f", 1, 2, false, true },
-        [QOP_PACK_8C_F] = { "pack_8c_f", 1, 2, false, true },
-        [QOP_PACK_8D_F] = { "pack_8d_f", 1, 2, false, true },
-        [QOP_PACK_SCALED] = { "pack_scaled", 1, 2, false, true },
+        [QOP_PACK_8888_F] = { "pack_8888_f", 1, 1 },
+        [QOP_PACK_8A_F] = { "pack_8a_f", 1, 1 },
+        [QOP_PACK_8B_F] = { "pack_8b_f", 1, 1 },
+        [QOP_PACK_8C_F] = { "pack_8c_f", 1, 1 },
+        [QOP_PACK_8D_F] = { "pack_8d_f", 1, 1 },
         [QOP_TLB_DISCARD_SETUP] = { "discard", 0, 1, true },
         [QOP_TLB_STENCIL_SETUP] = { "tlb_stencil_setup", 0, 1, true },
         [QOP_TLB_Z_WRITE] = { "tlb_z", 0, 1, true },
@@ -169,6 +168,18 @@ qir_is_multi_instruction(struct qinst *inst)
 }
 
 bool
+qir_is_mul(struct qinst *inst)
+{
+        switch (inst->op) {
+        case QOP_FMUL:
+        case QOP_MUL24:
+                return true;
+        default:
+                return false;
+        }
+}
+
+bool
 qir_is_tex(struct qinst *inst)
 {
         return inst->op >= QOP_TEX_S && inst->op <= QOP_TEX_DIRECT;
@@ -273,6 +284,14 @@ qir_dump_inst(struct vc4_compile *c, struct qinst *inst)
                 inst->sf ? ".sf" : "");
 
         qir_print_reg(c, inst->dst, true);
+        if (inst->dst.pack) {
+                if (inst->dst.pack) {
+                        if (qir_is_mul(inst))
+                                vc4_qpu_disasm_pack_mul(stderr, inst->dst.pack);
+                        else
+                                vc4_qpu_disasm_pack_a(stderr, inst->dst.pack);
+                }
+        }
         for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
                 fprintf(stderr, ", ");
                 qir_print_reg(c, inst->src[i], false);
@@ -348,7 +367,7 @@ qir_emit(struct vc4_compile *c, struct qinst *inst)
         if (inst->dst.file == QFILE_TEMP)
                 c->defs[inst->dst.index] = inst;
 
-        list_addtail(&inst->link, &c->instructions);
+        qir_emit_nodef(c, inst);
 }
 
 bool
@@ -389,8 +408,11 @@ qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst)
 struct qreg
 qir_follow_movs(struct vc4_compile *c, struct qreg reg)
 {
-        while (reg.file == QFILE_TEMP && c->defs[reg.index]->op == QOP_MOV)
+        while (reg.file == QFILE_TEMP &&
+               c->defs[reg.index] &&
+               c->defs[reg.index]->op == QOP_MOV) {
                 reg = c->defs[reg.index]->src[0];
+        }
 
         return reg;
 }
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index cade795..a2b21fa 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -58,6 +58,7 @@ enum qfile {
 struct qreg {
         enum qfile file;
         uint32_t index;
+        int pack;
 };
 
 enum qop {
@@ -104,7 +105,6 @@ enum qop {
         QOP_LOG2,
         QOP_VW_SETUP,
         QOP_VR_SETUP,
-        QOP_PACK_SCALED,
         QOP_PACK_8888_F,
         QOP_PACK_8A_F,
         QOP_PACK_8B_F,
@@ -444,13 +444,20 @@ struct qreg qir_uniform(struct vc4_compile *c,
                         enum quniform_contents contents,
                         uint32_t data);
 void qir_reorder_uniforms(struct vc4_compile *c);
+
 void qir_emit(struct vc4_compile *c, struct qinst *inst);
+static inline void qir_emit_nodef(struct vc4_compile *c, struct qinst *inst)
+{
+        list_addtail(&inst->link, &c->instructions);
+}
+
 struct qreg qir_get_temp(struct vc4_compile *c);
 int qir_get_op_nsrc(enum qop qop);
 bool qir_reg_equals(struct qreg a, struct qreg b);
 bool qir_has_side_effects(struct vc4_compile *c, struct qinst *inst);
 bool qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst);
 bool qir_is_multi_instruction(struct qinst *inst);
+bool qir_is_mul(struct qinst *inst);
 bool qir_is_tex(struct qinst *inst);
 bool qir_depends_on_flags(struct qinst *inst);
 bool qir_writes_r4(struct qinst *inst);
@@ -509,6 +516,12 @@ qir_##name(struct vc4_compile *c, struct qreg a)                         \
         struct qreg t = qir_get_temp(c);                                 \
         qir_emit(c, qir_inst(QOP_##name, t, a, c->undef));               \
         return t;                                                        \
+}                                                                        \
+static inline void                                                       \
+qir_##name##_dest(struct vc4_compile *c, struct qreg dest,               \
+                  struct qreg a)                                         \
+{                                                                        \
+        qir_emit_nodef(c, qir_inst(QOP_##name, dest, a, c->undef));      \
 }
 
 #define QIR_ALU2(name)                                                   \
@@ -518,6 +531,12 @@ qir_##name(struct vc4_compile *c, struct qreg a, struct qreg b)          \
         struct qreg t = qir_get_temp(c);                                 \
         qir_emit(c, qir_inst(QOP_##name, t, a, b));                      \
         return t;                                                        \
+}                                                                        \
+static inline void                                                       \
+qir_##name##_dest(struct vc4_compile *c, struct qreg dest,               \
+                  struct qreg a, struct qreg b)                          \
+{                                                                        \
+        qir_emit_nodef(c, qir_inst(QOP_##name, dest, a, b));             \
 }
 
 #define QIR_NODST_1(name)                                               \
@@ -534,6 +553,14 @@ qir_##name(struct vc4_compile *c, struct qreg a, struct qreg b)         \
         qir_emit(c, qir_inst(QOP_##name, c->undef, a, b));       \
 }
 
+#define QIR_PACK(name)                                                   \
+static inline struct qreg                                                \
+qir_##name(struct vc4_compile *c, struct qreg dest, struct qreg a)       \
+{                                                                        \
+        qir_emit_nodef(c, qir_inst(QOP_##name, dest, a, c->undef));      \
+        return dest;                                                     \
+}
+
 QIR_ALU1(MOV)
 QIR_ALU2(FADD)
 QIR_ALU2(FSUB)
@@ -570,12 +597,11 @@ QIR_ALU1(RCP)
 QIR_ALU1(RSQ)
 QIR_ALU1(EXP2)
 QIR_ALU1(LOG2)
-QIR_ALU2(PACK_SCALED)
 QIR_ALU1(PACK_8888_F)
-QIR_ALU2(PACK_8A_F)
-QIR_ALU2(PACK_8B_F)
-QIR_ALU2(PACK_8C_F)
-QIR_ALU2(PACK_8D_F)
+QIR_PACK(PACK_8A_F)
+QIR_PACK(PACK_8B_F)
+QIR_PACK(PACK_8C_F)
+QIR_PACK(PACK_8D_F)
 QIR_ALU1(VARY_ADD_C)
 QIR_NODST_2(TEX_S)
 QIR_NODST_2(TEX_T)
@@ -627,11 +653,12 @@ qir_UNPACK_16_I(struct vc4_compile *c, struct qreg src, int i)
 }
 
 static inline struct qreg
-qir_PACK_8_F(struct vc4_compile *c, struct qreg rest, struct qreg val, int chan)
+qir_PACK_8_F(struct vc4_compile *c, struct qreg dest, struct qreg val, int chan)
 {
-        struct qreg t = qir_get_temp(c);
-        qir_emit(c, qir_inst(QOP_PACK_8A_F + chan, t, rest, val));
-        return t;
+        qir_emit(c, qir_inst(QOP_PACK_8A_F + chan, dest, val, c->undef));
+        if (dest.file == QFILE_TEMP)
+                c->defs[dest.index] = NULL;
+        return dest;
 }
 
 static inline struct qreg
diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h
index fbb90ba..0719d28 100644
--- a/src/gallium/drivers/vc4/vc4_qpu.h
+++ b/src/gallium/drivers/vc4/vc4_qpu.h
@@ -24,6 +24,7 @@
 #ifndef VC4_QPU_H
 #define VC4_QPU_H
 
+#include <stdio.h>
 #include <stdint.h>
 
 #include "util/u_math.h"
@@ -206,6 +207,12 @@ void
 vc4_qpu_disasm(const uint64_t *instructions, int num_instructions);
 
 void
+vc4_qpu_disasm_pack_mul(FILE *out, uint32_t pack);
+
+void
+vc4_qpu_disasm_pack_a(FILE *out, uint32_t pack);
+
+void
 vc4_qpu_validate(uint64_t *insts, uint32_t num_inst);
 
 #endif /* VC4_QPU_H */
diff --git a/src/gallium/drivers/vc4/vc4_qpu_disasm.c b/src/gallium/drivers/vc4/vc4_qpu_disasm.c
index 00aeb30..0879787 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_disasm.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_disasm.c
@@ -245,6 +245,18 @@ get_special_write_desc(int reg, bool is_a)
         return special_write[reg];
 }
 
+void
+vc4_qpu_disasm_pack_mul(FILE *out, uint32_t pack)
+{
+        fprintf(out, ".%s", DESC(qpu_pack_mul, pack));
+}
+
+void
+vc4_qpu_disasm_pack_a(FILE *out, uint32_t pack)
+{
+        fprintf(out, "%s", DESC(qpu_pack_a, pack));
+}
+
 static void
 print_alu_dst(uint64_t inst, bool is_mul)
 {
@@ -263,9 +275,9 @@ print_alu_dst(uint64_t inst, bool is_mul)
                 fprintf(stderr, "%s%d?", file, waddr);
 
         if (is_mul && (inst & QPU_PM)) {
-                fprintf(stderr, ".%s", DESC(qpu_pack_mul, pack));
+                vc4_qpu_disasm_pack_mul(stderr, pack);
         } else if (is_a && !(inst & QPU_PM)) {
-                fprintf(stderr, "%s", DESC(qpu_pack_a, pack));
+                vc4_qpu_disasm_pack_a(stderr, pack);
         }
 }
 
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index f324056..adf3a8b 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -179,10 +179,9 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 
                 static const struct {
                         uint32_t op;
-                        bool is_mul;
                 } translate[] = {
-#define A(name) [QOP_##name] = {QPU_A_##name, false}
-#define M(name) [QOP_##name] = {QPU_M_##name, true}
+#define A(name) [QOP_##name] = {QPU_A_##name}
+#define M(name) [QOP_##name] = {QPU_M_##name}
                         A(FADD),
                         A(FSUB),
                         A(FMIN),
@@ -336,28 +335,12 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                 case QOP_PACK_8B_F:
                 case QOP_PACK_8C_F:
                 case QOP_PACK_8D_F:
-                        /* If dst doesn't happen to already contain src[0],
-                         * then we have to move it in.
-                         */
-                        if (qinst->src[0].file != QFILE_NULL &&
-                            (src[0].mux != dst.mux || src[0].addr != dst.addr)) {
-                                /* Don't overwrite src1 while setting up
-                                 * the dst!
-                                 */
-                                if (dst.mux == src[1].mux &&
-                                    dst.addr == src[1].addr) {
-                                        queue(c, qpu_m_MOV(qpu_rb(31), src[1]));
-                                        src[1] = qpu_rb(31);
-                                }
-
-                                queue(c, qpu_m_MOV(dst, src[0]));
-                        }
-
-                        queue(c, qpu_m_MOV(dst, src[1]));
-                        *last_inst(c) |= QPU_PM;
-                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A +
-                                                       qinst->op - QOP_PACK_8A_F,
-                                                       QPU_PACK);
+                        queue(c,
+                              qpu_m_MOV(dst, src[0]) |
+                              QPU_PM |
+                              QPU_SET_FIELD(QPU_PACK_MUL_8A +
+                                            qinst->op - QOP_PACK_8A_F,
+                                            QPU_PACK));
                         break;
 
                 case QOP_FRAG_X:
@@ -419,24 +402,6 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                         break;
 
-                case QOP_PACK_SCALED: {
-                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
-                                      QPU_SET_FIELD(QPU_PACK_A_16A,
-                                                    QPU_PACK));
-                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
-                                      QPU_SET_FIELD(QPU_PACK_A_16B,
-                                                    QPU_PACK));
-
-                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
-                                queue(c, b);
-                                queue(c, a);
-                        } else {
-                                queue(c, a);
-                                queue(c, b);
-                        }
-                        break;
-                }
-
                 case QOP_TEX_S:
                 case QOP_TEX_T:
                 case QOP_TEX_R:
@@ -529,14 +494,24 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 
                         fixup_raddr_conflict(c, dst, &src[0], &src[1]);
 
-                        if (translate[qinst->op].is_mul) {
+                        if (qir_is_mul(qinst)) {
                                 queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                     dst,
                                                     src[0], src[1]));
+                                if (qinst->dst.pack) {
+                                        *last_inst(c) |= QPU_PM;
+                                        *last_inst(c) |= QPU_SET_FIELD(qinst->dst.pack,
+                                                                       QPU_PACK);
+                                }
                         } else {
                                 queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                     dst,
                                                     src[0], src[1]));
+                                if (qinst->dst.pack) {
+                                        assert(dst.mux == QPU_MUX_A);
+                                        *last_inst(c) |= QPU_SET_FIELD(qinst->dst.pack,
+                                                                       QPU_PACK);
+                                }
                         }
 
                         break;
diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c
index a29db1f..3ced50f 100644
--- a/src/gallium/drivers/vc4/vc4_register_allocate.c
+++ b/src/gallium/drivers/vc4/vc4_register_allocate.c
@@ -113,9 +113,10 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
         if (vc4->regs)
                 return;
 
-        vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs));
+        vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs), true);
 
         vc4->reg_class_any = ra_alloc_reg_class(vc4->regs);
+        vc4->reg_class_a_or_b_or_acc = ra_alloc_reg_class(vc4->regs);
         vc4->reg_class_r4_or_a = ra_alloc_reg_class(vc4->regs);
         vc4->reg_class_a = ra_alloc_reg_class(vc4->regs);
         for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) {
@@ -130,10 +131,12 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
                  */
                 if (vc4_regs[i].mux == QPU_MUX_R4) {
                         ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i);
+                        ra_class_add_reg(vc4->regs, vc4->reg_class_any, i);
                         continue;
                 }
 
                 ra_class_add_reg(vc4->regs, vc4->reg_class_any, i);
+                ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc, i);
         }
 
         for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) {
@@ -177,7 +180,8 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
         uint8_t class_bits[c->num_temps];
         struct qpu_reg *temp_registers = calloc(c->num_temps,
                                                 sizeof(*temp_registers));
-        memset(def, 0, sizeof(def));
+        for (int i = 0; i < ARRAY_SIZE(def); i++)
+                def[i] = ~0;
         memset(use, 0, sizeof(use));
 
         /* If things aren't ever written (undefined values), just read from
@@ -196,7 +200,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
         uint32_t ip = 0;
         list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 if (inst->dst.file == QFILE_TEMP) {
-                        def[inst->dst.index] = ip;
+                        def[inst->dst.index] = MIN2(ip, def[inst->dst.index]);
                         use[inst->dst.index] = ip;
                 }
 
@@ -267,17 +271,33 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
                                         AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2);
                         break;
 
-                case QOP_PACK_SCALED:
-                        /* The pack flags require an A-file dst register. */
-                        class_bits[inst->dst.index] &= CLASS_BIT_A;
-                        break;
-
                 default:
                         break;
                 }
 
+                if (inst->dst.pack && !qir_is_mul(inst)) {
+                        /* The non-MUL pack flags require an A-file dst
+                         * register.
+                         */
+                        class_bits[inst->dst.index] &= CLASS_BIT_A;
+                }
+
                 if (qir_src_needs_a_file(inst)) {
-                        class_bits[inst->src[0].index] &= CLASS_BIT_A;
+                        switch (inst->op) {
+                        case QOP_UNPACK_8A_F:
+                        case QOP_UNPACK_8B_F:
+                        case QOP_UNPACK_8C_F:
+                        case QOP_UNPACK_8D_F:
+                                /* Special case: these can be done as R4
+                                 * unpacks, as well.
+                                 */
+                                class_bits[inst->src[0].index] &= (CLASS_BIT_A |
+                                                                   CLASS_BIT_R4);
+                                break;
+                        default:
+                                class_bits[inst->src[0].index] &= CLASS_BIT_A;
+                                break;
+                        }
                 }
                 ip++;
         }
@@ -287,9 +307,11 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
 
                 switch (class_bits[i]) {
                 case CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4:
-                case CLASS_BIT_A | CLASS_BIT_B_OR_ACC:
                         ra_set_node_class(g, node, vc4->reg_class_any);
                         break;
+                case CLASS_BIT_A | CLASS_BIT_B_OR_ACC:
+                        ra_set_node_class(g, node, vc4->reg_class_a_or_b_or_acc);
+                        break;
                 case CLASS_BIT_A | CLASS_BIT_R4:
                         ra_set_node_class(g, node, vc4->reg_class_r4_or_a);
                         break;