summaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers')
-rw-r--r--src/gallium/drivers/freedreno/a2xx/a2xx.xml.h4
-rw-r--r--src/gallium/drivers/freedreno/a3xx/a3xx.xml.h23
-rw-r--r--src/gallium/drivers/freedreno/a3xx/fd3_format.c11
-rw-r--r--src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c2
-rw-r--r--src/gallium/drivers/freedreno/a3xx/fd3_texture.c3
-rw-r--r--src/gallium/drivers/freedreno/a4xx/a4xx.xml.h12
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_emit.h1
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_format.c10
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_gmem.c22
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_program.c7
-rw-r--r--src/gallium/drivers/freedreno/a4xx/fd4_texture.c4
-rw-r--r--src/gallium/drivers/freedreno/adreno_common.xml.h4
-rw-r--r--src/gallium/drivers/freedreno/adreno_pm4.xml.h4
-rw-r--r--src/gallium/drivers/freedreno/freedreno_context.c4
-rw-r--r--src/gallium/drivers/freedreno/freedreno_fence.c25
-rw-r--r--src/gallium/drivers/freedreno/freedreno_fence.h5
-rw-r--r--src/gallium/drivers/freedreno/freedreno_resource.c9
-rw-r--r--src/gallium/drivers/freedreno/freedreno_screen.c10
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c13
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c1
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_ra.c2
-rw-r--r--src/gallium/drivers/i915/i915_surface.c3
-rw-r--r--src/gallium/drivers/nouveau/Makefile.sources3
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir.h8
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp1
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp2
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp21
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp12
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp90
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp4
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h7
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp253
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp5
-rw-r--r--src/gallium/drivers/nouveau/codegen/unordered_set.h48
-rw-r--r--src/gallium/drivers/nouveau/nouveau_compiler.c4
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_state.c10
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_state_validate.c22
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_stateobj.h2
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_surface.c20
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_context.c25
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_context.h3
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_program.c17
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c7
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_state.c12
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c21
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h2
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_surface.c15
-rw-r--r--src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c3
-rw-r--r--src/gallium/drivers/r300/r300_blit.c3
-rw-r--r--src/gallium/drivers/r600/r600_blit.c3
-rw-r--r--src/gallium/drivers/r600/r600_isa.h2
-rw-r--r--src/gallium/drivers/r600/r600_pipe.h4
-rw-r--r--src/gallium/drivers/r600/r600_shader.c34
-rw-r--r--src/gallium/drivers/r600/r600_shader.h20
-rw-r--r--src/gallium/drivers/r600/r600_state_common.c42
-rw-r--r--src/gallium/drivers/radeon/radeon_uvd.c5
-rw-r--r--src/gallium/drivers/radeonsi/si_blit.c3
-rw-r--r--src/gallium/drivers/radeonsi/si_shader.c19
-rw-r--r--src/gallium/drivers/vc4/vc4_context.h1
-rw-r--r--src/gallium/drivers/vc4/vc4_opt_algebraic.c33
-rw-r--r--src/gallium/drivers/vc4/vc4_opt_copy_propagation.c22
-rw-r--r--src/gallium/drivers/vc4/vc4_opt_vpm_writes.c18
-rw-r--r--src/gallium/drivers/vc4/vc4_program.c96
-rw-r--r--src/gallium/drivers/vc4/vc4_qir.c38
-rw-r--r--src/gallium/drivers/vc4/vc4_qir.h47
-rw-r--r--src/gallium/drivers/vc4/vc4_qpu.h7
-rw-r--r--src/gallium/drivers/vc4/vc4_qpu_disasm.c16
-rw-r--r--src/gallium/drivers/vc4/vc4_qpu_emit.c63
-rw-r--r--src/gallium/drivers/vc4/vc4_register_allocate.c42
69 files changed, 879 insertions, 435 deletions
diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
index c4516ba..dd48956 100644
--- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
+++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
@@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63728 bytes, from 2015-08-05 18:07:28)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67120 bytes, from 2015-08-14 23:22:03)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63915 bytes, from 2015-08-24 16:56:28)
Copyright (C) 2013-2015 by the following authors:
- Rob Clark <robdclark@gmail.com> (robclark)
diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
index 8e8cf6a..441bfec 100644
--- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
+++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
@@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63728 bytes, from 2015-08-05 18:07:28)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67120 bytes, from 2015-08-14 23:22:03)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63915 bytes, from 2015-08-24 16:56:28)
Copyright (C) 2013-2015 by the following authors:
- Rob Clark <robdclark@gmail.com> (robclark)
@@ -326,6 +326,13 @@ enum a3xx_tex_type {
A3XX_TEX_3D = 3,
};
+enum a3xx_tex_msaa {
+ A3XX_TPL1_MSAA1X = 0,
+ A3XX_TPL1_MSAA2X = 1,
+ A3XX_TPL1_MSAA4X = 2,
+ A3XX_TPL1_MSAA8X = 3,
+};
+
#define A3XX_INT0_RBBM_GPU_IDLE 0x00000001
#define A3XX_INT0_RBBM_AHB_ERROR 0x00000002
#define A3XX_INT0_RBBM_REG_TIMEOUT 0x00000004
@@ -2652,6 +2659,7 @@ static inline uint32_t A3XX_VGT_DRAW_INITIATOR_NUM_INSTANCES(uint32_t val)
#define REG_A3XX_VGT_IMMED_DATA 0x000021fd
#define REG_A3XX_TEX_SAMP_0 0x00000000
+#define A3XX_TEX_SAMP_0_CLAMPENABLE 0x00000001
#define A3XX_TEX_SAMP_0_MIPFILTER_LINEAR 0x00000002
#define A3XX_TEX_SAMP_0_XY_MAG__MASK 0x0000000c
#define A3XX_TEX_SAMP_0_XY_MAG__SHIFT 2
@@ -2695,6 +2703,7 @@ static inline uint32_t A3XX_TEX_SAMP_0_COMPARE_FUNC(enum adreno_compare_func val
{
return ((val) << A3XX_TEX_SAMP_0_COMPARE_FUNC__SHIFT) & A3XX_TEX_SAMP_0_COMPARE_FUNC__MASK;
}
+#define A3XX_TEX_SAMP_0_CUBEMAPSEAMLESSFILTOFF 0x01000000
#define A3XX_TEX_SAMP_0_UNNORM_COORDS 0x80000000
#define REG_A3XX_TEX_SAMP_1 0x00000001
@@ -2750,6 +2759,12 @@ static inline uint32_t A3XX_TEX_CONST_0_MIPLVLS(uint32_t val)
{
return ((val) << A3XX_TEX_CONST_0_MIPLVLS__SHIFT) & A3XX_TEX_CONST_0_MIPLVLS__MASK;
}
+#define A3XX_TEX_CONST_0_MSAATEX__MASK 0x00300000
+#define A3XX_TEX_CONST_0_MSAATEX__SHIFT 20
+static inline uint32_t A3XX_TEX_CONST_0_MSAATEX(enum a3xx_tex_msaa val)
+{
+ return ((val) << A3XX_TEX_CONST_0_MSAATEX__SHIFT) & A3XX_TEX_CONST_0_MSAATEX__MASK;
+}
#define A3XX_TEX_CONST_0_FMT__MASK 0x1fc00000
#define A3XX_TEX_CONST_0_FMT__SHIFT 22
static inline uint32_t A3XX_TEX_CONST_0_FMT(enum a3xx_tex_fmt val)
@@ -2785,7 +2800,7 @@ static inline uint32_t A3XX_TEX_CONST_1_FETCHSIZE(enum a3xx_tex_fetchsize val)
}
#define REG_A3XX_TEX_CONST_2 0x00000002
-#define A3XX_TEX_CONST_2_INDX__MASK 0x000000ff
+#define A3XX_TEX_CONST_2_INDX__MASK 0x000001ff
#define A3XX_TEX_CONST_2_INDX__SHIFT 0
static inline uint32_t A3XX_TEX_CONST_2_INDX(uint32_t val)
{
@@ -2805,7 +2820,7 @@ static inline uint32_t A3XX_TEX_CONST_2_SWAP(enum a3xx_color_swap val)
}
#define REG_A3XX_TEX_CONST_3 0x00000003
-#define A3XX_TEX_CONST_3_LAYERSZ1__MASK 0x00007fff
+#define A3XX_TEX_CONST_3_LAYERSZ1__MASK 0x0001ffff
#define A3XX_TEX_CONST_3_LAYERSZ1__SHIFT 0
static inline uint32_t A3XX_TEX_CONST_3_LAYERSZ1(uint32_t val)
{
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_format.c b/src/gallium/drivers/freedreno/a3xx/fd3_format.c
index ec87aa9..04cb9b9 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_format.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_format.c
@@ -262,6 +262,15 @@ static struct fd3_format formats[PIPE_FORMAT_COUNT] = {
_T(ETC2_R11_SNORM, ETC2_R11_SNORM, NONE, WZYX),
_T(ETC2_RG11_UNORM, ETC2_RG11_UNORM, NONE, WZYX),
_T(ETC2_RG11_SNORM, ETC2_RG11_SNORM, NONE, WZYX),
+
+ _T(DXT1_RGB, DXT1, NONE, WZYX),
+ _T(DXT1_SRGB, DXT1, NONE, WZYX),
+ _T(DXT1_RGBA, DXT1, NONE, WZYX),
+ _T(DXT1_SRGBA, DXT1, NONE, WZYX),
+ _T(DXT3_RGBA, DXT3, NONE, WZYX),
+ _T(DXT3_SRGBA, DXT3, NONE, WZYX),
+ _T(DXT5_RGBA, DXT5, NONE, WZYX),
+ _T(DXT5_SRGBA, DXT5, NONE, WZYX),
};
enum a3xx_vtx_fmt
@@ -301,7 +310,7 @@ fd3_pipe2fetchsize(enum pipe_format format)
{
if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
format = PIPE_FORMAT_Z32_FLOAT;
- switch (util_format_get_blocksizebits(format)) {
+ switch (util_format_get_blocksizebits(format) / util_format_get_blockwidth(format)) {
case 8: return TFETCH_1_BYTE;
case 16: return TFETCH_2_BYTE;
case 32: return TFETCH_4_BYTE;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c
index 9c16804..583caaa 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c
@@ -73,7 +73,7 @@ fd3_rasterizer_state_create(struct pipe_context *pctx,
so->gras_su_poly_offset_scale =
A3XX_GRAS_SU_POLY_OFFSET_SCALE_VAL(cso->offset_scale);
so->gras_su_poly_offset_offset =
- A3XX_GRAS_SU_POLY_OFFSET_OFFSET(cso->offset_units);
+ A3XX_GRAS_SU_POLY_OFFSET_OFFSET(cso->offset_units * 2.0f);
so->gras_su_mode_control =
A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(cso->line_width/2.0);
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
index c30658d..2d6ecb2 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
@@ -115,6 +115,7 @@ fd3_sampler_state_create(struct pipe_context *pctx,
so->texsamp0 =
COND(!cso->normalized_coords, A3XX_TEX_SAMP_0_UNNORM_COORDS) |
+ COND(!cso->seamless_cube_map, A3XX_TEX_SAMP_0_CUBEMAPSEAMLESSFILTOFF) |
COND(miplinear, A3XX_TEX_SAMP_0_MIPFILTER_LINEAR) |
A3XX_TEX_SAMP_0_XY_MAG(tex_filter(cso->mag_img_filter, aniso)) |
A3XX_TEX_SAMP_0_XY_MIN(tex_filter(cso->min_img_filter, aniso)) |
@@ -239,7 +240,7 @@ fd3_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
A3XX_TEX_CONST_1_HEIGHT(u_minify(prsc->height0, lvl));
/* when emitted, A3XX_TEX_CONST_2_INDX() must be OR'd in: */
so->texconst2 =
- A3XX_TEX_CONST_2_PITCH(rsc->slices[lvl].pitch * rsc->cpp);
+ A3XX_TEX_CONST_2_PITCH(util_format_get_nblocksx(cso->format, rsc->slices[lvl].pitch) * rsc->cpp);
switch (prsc->target) {
case PIPE_TEXTURE_1D_ARRAY:
case PIPE_TEXTURE_2D_ARRAY:
diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
index 563f70a..2e1d712 100644
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63728 bytes, from 2015-08-05 18:07:28)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67120 bytes, from 2015-08-14 23:22:03)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63915 bytes, from 2015-08-24 16:56:28)
Copyright (C) 2013-2015 by the following authors:
- Rob Clark <robdclark@gmail.com> (robclark)
@@ -162,10 +162,13 @@ enum a4xx_tex_fmt {
TFMT4_8_UNORM = 4,
TFMT4_8_8_UNORM = 14,
TFMT4_8_8_8_8_UNORM = 28,
+ TFMT4_8_SNORM = 5,
TFMT4_8_8_SNORM = 15,
TFMT4_8_8_8_8_SNORM = 29,
+ TFMT4_8_UINT = 6,
TFMT4_8_8_UINT = 16,
TFMT4_8_8_8_8_UINT = 30,
+ TFMT4_8_SINT = 7,
TFMT4_8_8_SINT = 17,
TFMT4_8_8_8_8_SINT = 31,
TFMT4_16_UINT = 21,
@@ -430,7 +433,7 @@ static inline uint32_t A4XX_RB_MRT_BUF_INFO_COLOR_SWAP(enum a3xx_color_swap val)
return ((val) << A4XX_RB_MRT_BUF_INFO_COLOR_SWAP__SHIFT) & A4XX_RB_MRT_BUF_INFO_COLOR_SWAP__MASK;
}
#define A4XX_RB_MRT_BUF_INFO_COLOR_SRGB 0x00002000
-#define A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH__MASK 0x007fc000
+#define A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH__MASK 0xffffc000
#define A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH__SHIFT 14
static inline uint32_t A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(uint32_t val)
{
@@ -440,7 +443,7 @@ static inline uint32_t A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(uint32_t val)
static inline uint32_t REG_A4XX_RB_MRT_BASE(uint32_t i0) { return 0x000020a6 + 0x5*i0; }
static inline uint32_t REG_A4XX_RB_MRT_CONTROL3(uint32_t i0) { return 0x000020a7 + 0x5*i0; }
-#define A4XX_RB_MRT_CONTROL3_STRIDE__MASK 0x0001fff8
+#define A4XX_RB_MRT_CONTROL3_STRIDE__MASK 0x03fffff8
#define A4XX_RB_MRT_CONTROL3_STRIDE__SHIFT 3
static inline uint32_t A4XX_RB_MRT_CONTROL3_STRIDE(uint32_t val)
{
@@ -1460,6 +1463,7 @@ static inline uint32_t A4XX_SP_FS_MRT_REG_MRTFORMAT(enum a4xx_color_fmt val)
{
return ((val) << A4XX_SP_FS_MRT_REG_MRTFORMAT__SHIFT) & A4XX_SP_FS_MRT_REG_MRTFORMAT__MASK;
}
+#define A4XX_SP_FS_MRT_REG_COLOR_SRGB 0x00040000
#define REG_A4XX_SP_CS_CTRL_REG0 0x00002300
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
index ab7850e..3a1d4b6 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
@@ -56,6 +56,7 @@ struct fd4_emit {
uint32_t sprite_coord_enable; /* bitmask */
bool sprite_coord_mode;
bool rasterflat;
+ bool no_decode_srgb;
/* cached to avoid repeated lookups of same variants: */
struct ir3_shader_variant *vp, *fp;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_format.c b/src/gallium/drivers/freedreno/a4xx/fd4_format.c
index 3e00454..6c9e217 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_format.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_format.c
@@ -79,9 +79,9 @@ struct fd4_format {
static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
/* 8-bit */
VT(R8_UNORM, 8_UNORM, R8_UNORM, WZYX),
- V_(R8_SNORM, 8_SNORM, NONE, WZYX),
- V_(R8_UINT, 8_UINT, NONE, WZYX),
- V_(R8_SINT, 8_SINT, NONE, WZYX),
+ VT(R8_SNORM, 8_SNORM, NONE, WZYX),
+ VT(R8_UINT, 8_UINT, NONE, WZYX),
+ VT(R8_SINT, 8_SINT, NONE, WZYX),
V_(R8_USCALED, 8_UINT, NONE, WZYX),
V_(R8_SSCALED, 8_UINT, NONE, WZYX),
@@ -115,8 +115,8 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
VT(R8G8_UNORM, 8_8_UNORM, R8G8_UNORM, WZYX),
VT(R8G8_SNORM, 8_8_SNORM, R8G8_SNORM, WZYX),
- VT(R8G8_UINT, 8_8_UINT, NONE, WZYX),
- VT(R8G8_SINT, 8_8_SINT, NONE, WZYX),
+ VT(R8G8_UINT, 8_8_UINT, R8G8_UINT, WZYX),
+ VT(R8G8_SINT, 8_8_SINT, R8G8_SINT, WZYX),
V_(R8G8_USCALED, 8_8_UINT, NONE, WZYX),
V_(R8G8_SSCALED, 8_8_SINT, NONE, WZYX),
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
index 81c37f7..3f8bbf3 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
@@ -46,7 +46,8 @@
static void
emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
- struct pipe_surface **bufs, uint32_t *bases, uint32_t bin_w)
+ struct pipe_surface **bufs, uint32_t *bases,
+ uint32_t bin_w, bool decode_srgb)
{
enum a4xx_tile_mode tile_mode;
unsigned i;
@@ -60,6 +61,7 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
enum a4xx_color_fmt format = 0;
enum a3xx_color_swap swap = WZYX;
+ bool srgb = false;
struct fd_resource *rsc = NULL;
struct fd_resource_slice *slice = NULL;
uint32_t stride = 0;
@@ -68,10 +70,9 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
if ((i < nr_bufs) && bufs[i]) {
struct pipe_surface *psurf = bufs[i];
- enum pipe_format pformat = 0;
+ enum pipe_format pformat = psurf->format;
rsc = fd_resource(psurf->texture);
- pformat = psurf->format;
/* In case we're drawing to Z32F_S8, the "color" actually goes to
* the stencil
@@ -86,6 +87,11 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
format = fd4_pipe2color(pformat);
swap = fd4_pipe2swap(pformat);
+ if (decode_srgb)
+ srgb = util_format_is_srgb(pformat);
+ else
+ pformat = util_format_linear(pformat);
+
debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer);
offset = fd_resource_offset(rsc, psurf->u.tex.level,
@@ -108,7 +114,8 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
OUT_RING(ring, A4XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format) |
A4XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) |
A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(stride) |
- A4XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap));
+ A4XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap) |
+ COND(srgb, A4XX_RB_MRT_BUF_INFO_COLOR_SRGB));
if (bin_w || (i >= nr_bufs) || !bufs[i]) {
OUT_RING(ring, base);
OUT_RING(ring, A4XX_RB_MRT_CONTROL3_STRIDE(stride));
@@ -282,7 +289,7 @@ emit_mem2gmem_surf(struct fd_context *ctx, uint32_t *bases,
struct fd_ringbuffer *ring = ctx->ring;
struct pipe_surface *zsbufs[2];
- emit_mrt(ring, nr_bufs, bufs, bases, bin_w);
+ emit_mrt(ring, nr_bufs, bufs, bases, bin_w, false);
if (bufs[0] && (bufs[0]->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)) {
/* The gmem_restore_tex logic will put the first buffer's stencil
@@ -315,6 +322,7 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
.key = {
.half_precision = fd_half_precision(pfb),
},
+ .no_decode_srgb = true,
};
unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0};
float x0, y0, x1, y1;
@@ -520,7 +528,7 @@ fd4_emit_sysmem_prep(struct fd_context *ctx)
OUT_RING(ring, A4XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) |
A4XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height));
- emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0);
+ emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0, true);
/* setup scissor/offset for current tile: */
OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1);
@@ -677,7 +685,7 @@ fd4_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile)
OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1));
OUT_RING(ring, CP_SET_BIN_2_X2(x2) | CP_SET_BIN_2_Y2(y2));
- emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem->cbuf_base, gmem->bin_w);
+ emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem->cbuf_base, gmem->bin_w, true);
/* setup scissor/offset for current tile: */
OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1);
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
index 1a6d014..a3d7123 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
@@ -450,10 +450,15 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
OUT_PKT0(ring, REG_A4XX_SP_FS_MRT_REG(0), 8);
for (i = 0; i < 8; i++) {
enum a4xx_color_fmt format = 0;
- if (i < nr)
+ bool srgb = false;
+ if (i < nr) {
format = fd4_emit_format(bufs[i]);
+ if (bufs[i] && !emit->no_decode_srgb)
+ srgb = util_format_is_srgb(bufs[i]->format);
+ }
OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(color_regid[i]) |
A4XX_SP_FS_MRT_REG_MRTFORMAT(format) |
+ COND(srgb, A4XX_SP_FS_MRT_REG_COLOR_SRGB) |
COND(emit->key.half_precision,
A4XX_SP_FS_MRT_REG_HALF_PRECISION));
}
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
index d2bc5fe..213b29c 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
@@ -187,9 +187,9 @@ fd4_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
A4XX_TEX_CONST_3_LAYERSZ(rsc->layer_size);
break;
case PIPE_TEXTURE_CUBE:
- case PIPE_TEXTURE_CUBE_ARRAY: /* ?? not sure about _CUBE_ARRAY */
+ case PIPE_TEXTURE_CUBE_ARRAY:
so->texconst3 =
- A4XX_TEX_CONST_3_DEPTH(1) |
+ A4XX_TEX_CONST_3_DEPTH(prsc->array_size / 6) |
A4XX_TEX_CONST_3_LAYERSZ(rsc->layer_size);
break;
case PIPE_TEXTURE_3D:
diff --git a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h
index 00b6acb..29944b7 100644
--- a/src/gallium/drivers/freedreno/adreno_common.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_common.xml.h
@@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63728 bytes, from 2015-08-05 18:07:28)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67120 bytes, from 2015-08-14 23:22:03)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63915 bytes, from 2015-08-24 16:56:28)
Copyright (C) 2013-2015 by the following authors:
- Rob Clark <robdclark@gmail.com> (robclark)
diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
index 98a90e2..432dce3 100644
--- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
@@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63728 bytes, from 2015-08-05 18:07:28)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67120 bytes, from 2015-08-14 23:22:03)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63915 bytes, from 2015-08-24 16:56:28)
Copyright (C) 2013-2015 by the following authors:
- Rob Clark <robdclark@gmail.com> (robclark)
diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c
index 8e6d431..0b6b9fb 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.c
+++ b/src/gallium/drivers/freedreno/freedreno_context.c
@@ -131,11 +131,13 @@ static void
fd_context_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence,
unsigned flags)
{
+ struct fd_ringbuffer *ring = fd_context(pctx)->ring;
+
fd_context_render(pctx);
if (fence) {
fd_screen_fence_ref(pctx->screen, fence, NULL);
- *fence = fd_fence_create(pctx);
+ *fence = fd_fence_create(pctx, fd_ringbuffer_timestamp(ring));
}
}
diff --git a/src/gallium/drivers/freedreno/freedreno_fence.c b/src/gallium/drivers/freedreno/freedreno_fence.c
index 04a9fea..5125f09 100644
--- a/src/gallium/drivers/freedreno/freedreno_fence.c
+++ b/src/gallium/drivers/freedreno/freedreno_fence.c
@@ -50,35 +50,18 @@ fd_screen_fence_ref(struct pipe_screen *pscreen,
*ptr = pfence;
}
-/* TODO we need to spiff out libdrm_freedreno a bit to allow passing
- * the timeout.. and maybe a better way to check if fence has been
- * signaled. The current implementation is a bit lame for now to
- * avoid bumping libdrm version requirement.
- */
-
-boolean fd_screen_fence_signalled(struct pipe_screen *screen,
- struct pipe_fence_handle *fence)
-{
- uint32_t timestamp = fd_ringbuffer_timestamp(fence->ctx->ring);
-
- /* TODO util helper for compare w/ rollover? */
- return timestamp >= fence->timestamp;
-}
-
boolean fd_screen_fence_finish(struct pipe_screen *screen,
struct pipe_fence_handle *fence,
uint64_t timeout)
{
- if (!timeout)
- return fd_screen_fence_signalled(screen, fence);
-
- if (fd_pipe_wait(fence->screen->pipe, fence->timestamp))
+ if (fd_pipe_wait_timeout(fence->screen->pipe, fence->timestamp, timeout))
return false;
return true;
}
-struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx)
+struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx,
+ uint32_t timestamp)
{
struct pipe_fence_handle *fence;
struct fd_context *ctx = fd_context(pctx);
@@ -91,7 +74,7 @@ struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx)
fence->ctx = ctx;
fence->screen = ctx->screen;
- fence->timestamp = fd_ringbuffer_timestamp(ctx->ring);
+ fence->timestamp = timestamp;
return fence;
}
diff --git a/src/gallium/drivers/freedreno/freedreno_fence.h b/src/gallium/drivers/freedreno/freedreno_fence.h
index e36bcc4..06c314a 100644
--- a/src/gallium/drivers/freedreno/freedreno_fence.h
+++ b/src/gallium/drivers/freedreno/freedreno_fence.h
@@ -34,11 +34,10 @@
void fd_screen_fence_ref(struct pipe_screen *pscreen,
struct pipe_fence_handle **ptr,
struct pipe_fence_handle *pfence);
-boolean fd_screen_fence_signalled(struct pipe_screen *screen,
- struct pipe_fence_handle *pfence);
boolean fd_screen_fence_finish(struct pipe_screen *screen,
struct pipe_fence_handle *pfence,
uint64_t timeout);
-struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx);
+struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx,
+ uint32_t timestamp);
#endif /* FREEDRENO_FENCE_H_ */
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c
index 709ad4e..98de096 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.c
+++ b/src/gallium/drivers/freedreno/freedreno_resource.c
@@ -222,7 +222,7 @@ fd_resource_transfer_map(struct pipe_context *pctx,
ptrans->level = level;
ptrans->usage = usage;
ptrans->box = *box;
- ptrans->stride = slice->pitch * rsc->cpp;
+ ptrans->stride = util_format_get_nblocksx(format, slice->pitch) * rsc->cpp;
ptrans->layer_stride = slice->size0;
if (usage & PIPE_TRANSFER_READ)
@@ -375,9 +375,11 @@ setup_slices(struct fd_resource *rsc, uint32_t alignment)
for (level = 0; level <= prsc->last_level; level++) {
struct fd_resource_slice *slice = fd_resource_slice(rsc, level);
+ uint32_t blocks;
slice->pitch = width = align(width, 32);
slice->offset = size;
+ blocks = util_format_get_nblocks(prsc->format, width, height);
/* 1d array and 2d array textures must all have the same layer size
* for each miplevel on a3xx. 3d textures can have different layer
* sizes for high levels, but the hw auto-sizer is buggy (or at least
@@ -387,9 +389,9 @@ setup_slices(struct fd_resource *rsc, uint32_t alignment)
if (prsc->target == PIPE_TEXTURE_3D && (
level == 1 ||
(level > 1 && rsc->slices[level - 1].size0 > 0xf000)))
- slice->size0 = align(slice->pitch * height * rsc->cpp, alignment);
+ slice->size0 = align(blocks * rsc->cpp, alignment);
else if (level == 0 || rsc->layer_first || alignment == 1)
- slice->size0 = align(slice->pitch * height * rsc->cpp, alignment);
+ slice->size0 = align(blocks * rsc->cpp, alignment);
else
slice->size0 = rsc->slices[level - 1].size0;
@@ -459,7 +461,6 @@ fd_resource_create(struct pipe_screen *pscreen,
if (is_a4xx(fd_screen(pscreen))) {
switch (tmpl->target) {
case PIPE_TEXTURE_3D:
- /* TODO 3D_ARRAY? */
rsc->layer_first = false;
break;
default:
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index b55f5b3..86e9a21 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -163,7 +163,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_TEXTURE_MULTISAMPLE:
case PIPE_CAP_TEXTURE_BARRIER:
case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
- case PIPE_CAP_CUBE_MAP_ARRAY:
case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
case PIPE_CAP_START_INSTANCE:
case PIPE_CAP_COMPUTE:
@@ -176,6 +175,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_INDEP_BLEND_ENABLE:
case PIPE_CAP_INDEP_BLEND_FUNC:
case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
+ case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
return is_a3xx(screen) || is_a4xx(screen);
case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
@@ -191,8 +191,13 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
return 16383;
case PIPE_CAP_DEPTH_CLIP_DISABLE:
+ case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
return is_a3xx(screen);
+ case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+ case PIPE_CAP_CUBE_MAP_ARRAY:
+ return is_a4xx(screen);
+
case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
return 256;
@@ -202,7 +207,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
return is_ir3(screen) ? 130 : 120;
/* Unsupported features. */
- case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
@@ -230,8 +234,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
- case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
- case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
case PIPE_CAP_DEPTH_BOUNDS_TEST:
return 0;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 0ab3345..071901a 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1636,6 +1636,11 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
coord[i] = ir3_SHL_B(b, coord[i], 0, lod, 0);
}
+ /* the array coord for cube arrays needs 0.5 added to it */
+ if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE && tex->is_array &&
+ opc != OPC_ISAML)
+ coord[3] = ir3_ADD_F(b, coord[3], 0, create_immed(b, fui(0.5)), 0);
+
/*
* lay out the first argument in the proper order:
* - actual coordinates first
@@ -1759,6 +1764,12 @@ emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex)
tex_info(tex, &flags, &coords);
+ /* Actually we want the number of dimensions, not coordinates. This
+ * distinction only matters for cubes.
+ */
+ if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
+ coords = 2;
+
dst = get_dst(ctx, &tex->dest, 4);
compile_assert(ctx, tex->num_srcs == 1);
@@ -2301,7 +2312,7 @@ emit_instructions(struct ir3_compile *ctx)
ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs);
/* Create inputs in first block: */
- ctx->block = get_block(ctx, fxn->start_block);
+ ctx->block = get_block(ctx, nir_start_block(fxn));
ctx->in_block = ctx->block;
list_addtail(&ctx->block->node, &ctx->ir->block_list);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
index dc9e462..bed7b7b 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
@@ -29,6 +29,7 @@
#include "ir3_nir.h"
#include "glsl/nir/nir_builder.h"
+#include "glsl/nir/nir_control_flow.h"
/* Based on nir_opt_peephole_select, and hacked up to more aggressively
* flatten anything that can be flattened
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index eaf3b3c..8801839 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -189,7 +189,7 @@ ir3_ra_alloc_reg_set(void *memctx)
}
/* allocate the reg-set.. */
- set->regs = ra_alloc_reg_set(set, ra_reg_count);
+ set->regs = ra_alloc_reg_set(set, ra_reg_count, true);
set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);
diff --git a/src/gallium/drivers/i915/i915_surface.c b/src/gallium/drivers/i915/i915_surface.c
index 24e0156..b2a639c 100644
--- a/src/gallium/drivers/i915/i915_surface.c
+++ b/src/gallium/drivers/i915/i915_surface.c
@@ -120,7 +120,8 @@ i915_surface_copy_render(struct pipe_context *pipe,
util_blitter_blit_generic(i915->blitter, dst_view, &dstbox,
src_view, src_box, src_width0, src_height0,
- PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL);
+ PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL,
+ FALSE);
return;
fallback:
diff --git a/src/gallium/drivers/nouveau/Makefile.sources b/src/gallium/drivers/nouveau/Makefile.sources
index 3fae3bc..9346ea3 100644
--- a/src/gallium/drivers/nouveau/Makefile.sources
+++ b/src/gallium/drivers/nouveau/Makefile.sources
@@ -121,7 +121,8 @@ NV50_CODEGEN_SOURCES := \
codegen/nv50_ir_target_nv50.cpp \
codegen/nv50_ir_target_nv50.h \
codegen/nv50_ir_util.cpp \
- codegen/nv50_ir_util.h
+ codegen/nv50_ir_util.h \
+ codegen/unordered_set.h
NVC0_CODEGEN_SOURCES := \
codegen/nv50_ir_emit_gk110.cpp \
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index 3ddaeaf..ba1b085 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -29,8 +29,8 @@
#include <deque>
#include <list>
#include <vector>
-#include <tr1/unordered_set>
+#include "codegen/unordered_set.h"
#include "codegen/nv50_ir_util.h"
#include "codegen/nv50_ir_graph.h"
@@ -585,10 +585,10 @@ public:
static inline Value *get(Iterator&);
- std::tr1::unordered_set<ValueRef *> uses;
+ unordered_set<ValueRef *> uses;
std::list<ValueDef *> defs;
- typedef std::tr1::unordered_set<ValueRef *>::iterator UseIterator;
- typedef std::tr1::unordered_set<ValueRef *>::const_iterator UseCIterator;
+ typedef unordered_set<ValueRef *>::iterator UseIterator;
+ typedef unordered_set<ValueRef *>::const_iterator UseCIterator;
typedef std::list<ValueDef *>::iterator DefIterator;
typedef std::list<ValueDef *>::const_iterator DefCIterator;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index f06056f..8f15429 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -933,6 +933,7 @@ CodeEmitterGK110::emitCVT(const Instruction *i)
code[0] |= typeSizeofLog2(dType) << 10;
code[0] |= typeSizeofLog2(i->sType) << 12;
+ code[1] |= i->subOp << 12;
if (isSignedIntType(dType))
code[0] |= 0x4000;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index ef5c87d..6e22788 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -818,6 +818,7 @@ CodeEmitterGM107::emitI2F()
emitField(0x31, 1, (insn->op == OP_ABS) || insn->src(0).mod.abs());
emitCC (0x2f);
emitField(0x2d, 1, (insn->op == OP_NEG) || insn->src(0).mod.neg());
+ emitField(0x29, 2, insn->subOp);
emitRND (0x27, rnd, -1);
emitField(0x0d, 1, isSignedType(insn->sType));
emitField(0x0a, 2, util_logbase2(typeSizeof(insn->sType)));
@@ -850,6 +851,7 @@ CodeEmitterGM107::emitI2I()
emitField(0x31, 1, (insn->op == OP_ABS) || insn->src(0).mod.abs());
emitCC (0x2f);
emitField(0x2d, 1, (insn->op == OP_NEG) || insn->src(0).mod.neg());
+ emitField(0x29, 2, insn->subOp);
emitField(0x0d, 1, isSignedType(insn->sType));
emitField(0x0c, 1, isSignedType(insn->dType));
emitField(0x0a, 2, util_logbase2(typeSizeof(insn->sType)));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index f607f3b..6bf5219 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -1020,6 +1020,10 @@ CodeEmitterNVC0::emitCVT(Instruction *i)
code[0] |= util_logbase2(typeSizeof(dType)) << 20;
code[0] |= util_logbase2(typeSizeof(i->sType)) << 23;
+ // for 8/16 source types, the byte/word is in subOp. word 1 is
+ // represented as 2.
+ code[1] |= i->subOp << 0x17;
+
if (sat)
code[0] |= 0x20;
if (abs)
@@ -2614,11 +2618,12 @@ private:
int imul; // integer MUL to MUL delay 3
} res;
struct ScoreData {
- int r[64];
+ int r[256];
int p[8];
int c;
} rd, wr;
int base;
+ int regs;
void rebase(const int base)
{
@@ -2627,7 +2632,7 @@ private:
return;
this->base = 0;
- for (int i = 0; i < 64; ++i) {
+ for (int i = 0; i < regs; ++i) {
rd.r[i] += delta;
wr.r[i] += delta;
}
@@ -2646,16 +2651,17 @@ private:
res.imul += delta;
res.tex += delta;
}
- void wipe()
+ void wipe(int regs)
{
memset(&rd, 0, sizeof(rd));
memset(&wr, 0, sizeof(wr));
memset(&res, 0, sizeof(res));
+ this->regs = regs;
}
int getLatest(const ScoreData& d) const
{
int max = 0;
- for (int i = 0; i < 64; ++i)
+ for (int i = 0; i < regs; ++i)
if (d.r[i] > max)
max = d.r[i];
for (int i = 0; i < 8; ++i)
@@ -2690,7 +2696,7 @@ private:
}
void setMax(const RegScores *that)
{
- for (int i = 0; i < 64; ++i) {
+ for (int i = 0; i < regs; ++i) {
rd.r[i] = MAX2(rd.r[i], that->rd.r[i]);
wr.r[i] = MAX2(wr.r[i], that->wr.r[i]);
}
@@ -2711,7 +2717,7 @@ private:
}
void print(int cycle)
{
- for (int i = 0; i < 64; ++i) {
+ for (int i = 0; i < regs; ++i) {
if (rd.r[i] > cycle)
INFO("rd $r%i @ %i\n", i, rd.r[i]);
if (wr.r[i] > cycle)
@@ -2806,9 +2812,10 @@ SchedDataCalculator::getCycles(const Instruction *insn, int origDelay) const
bool
SchedDataCalculator::visit(Function *func)
{
+ int regs = targ->getFileSize(FILE_GPR) + 1;
scoreBoards.resize(func->cfg.getSize());
for (size_t i = 0; i < scoreBoards.size(); ++i)
- scoreBoards[i].wipe();
+ scoreBoards[i].wipe(regs);
return true;
}
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 4847a0f..f153674 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -2990,9 +2990,15 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
case TGSI_OPCODE_UBFE:
FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
src0 = fetchSrc(0, c);
- src1 = fetchSrc(1, c);
- src2 = fetchSrc(2, c);
- mkOp3(OP_INSBF, TYPE_U32, src1, src2, mkImm(0x808), src1);
+ if (tgsi.getSrc(1).getFile() == TGSI_FILE_IMMEDIATE &&
+ tgsi.getSrc(2).getFile() == TGSI_FILE_IMMEDIATE) {
+ src1 = loadImm(NULL, tgsi.getSrc(2).getValueU32(c, info) << 8 |
+ tgsi.getSrc(1).getValueU32(c, info));
+ } else {
+ src1 = fetchSrc(1, c);
+ src2 = fetchSrc(2, c);
+ mkOp3(OP_INSBF, TYPE_U32, src1, src2, mkImm(0x808), src1);
+ }
mkOp2(OP_EXTBF, dstTy, dst0[c], src0, src1);
}
break;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
index 1f3fce2..420cc4e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
@@ -193,100 +193,16 @@ GM107LoweringPass::visit(Instruction *i)
checkPredicate(i);
switch (i->op) {
- case OP_TEX:
- case OP_TXB:
- case OP_TXL:
- case OP_TXF:
- case OP_TXG:
- return handleTEX(i->asTex());
- case OP_TXD:
- return handleTXD(i->asTex());
- case OP_TXLQ:
- return handleTXLQ(i->asTex());
- case OP_TXQ:
- return handleTXQ(i->asTex());
- case OP_EX2:
- bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
- i->setSrc(0, i->getDef(0));
- break;
- case OP_POW:
- return handlePOW(i);
- case OP_DIV:
- return handleDIV(i);
- case OP_MOD:
- return handleMOD(i);
- case OP_SQRT:
- return handleSQRT(i);
- case OP_EXPORT:
- return handleEXPORT(i);
case OP_PFETCH:
return handlePFETCH(i);
- case OP_EMIT:
- case OP_RESTART:
- return handleOUT(i);
- case OP_RDSV:
- return handleRDSV(i);
- case OP_WRSV:
- return handleWRSV(i);
- case OP_LOAD:
- if (i->src(0).getFile() == FILE_SHADER_INPUT) {
- if (prog->getType() == Program::TYPE_COMPUTE) {
- i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
- i->getSrc(0)->reg.fileIndex = 0;
- } else
- if (prog->getType() == Program::TYPE_GEOMETRY &&
- i->src(0).isIndirect(0)) {
- // XXX: this assumes vec4 units
- Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
- i->getIndirect(0, 0), bld.mkImm(4));
- i->setIndirect(0, 0, ptr);
- i->op = OP_VFETCH;
- } else {
- i->op = OP_VFETCH;
- assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
- }
- } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
- if (i->src(0).isIndirect(1)) {
- Value *ptr;
- if (i->src(0).isIndirect(0))
- ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
- i->getIndirect(0, 1), bld.mkImm(0x1010),
- i->getIndirect(0, 0));
- else
- ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
- i->getIndirect(0, 1), bld.mkImm(16));
- i->setIndirect(0, 1, NULL);
- i->setIndirect(0, 0, ptr);
- i->subOp = NV50_IR_SUBOP_LDC_IS;
- }
- }
- break;
- case OP_ATOM:
- {
- const bool cctl = i->src(0).getFile() == FILE_MEMORY_GLOBAL;
- handleATOM(i);
- handleCasExch(i, cctl);
- }
- break;
- case OP_SULDB:
- case OP_SULDP:
- case OP_SUSTB:
- case OP_SUSTP:
- case OP_SUREDB:
- case OP_SUREDP:
- handleSurfaceOpNVE4(i->asTex());
- break;
case OP_DFDX:
case OP_DFDY:
- handleDFDX(i);
- break;
+ return handleDFDX(i);
case OP_POPCNT:
- handlePOPCNT(i);
- break;
+ return handlePOPCNT(i);
default:
- break;
+ return NVC0LoweringPass::visit(i);
}
- return true;
}
} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index c3c302d..b1f4065 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -224,7 +224,7 @@ NVC0LegalizePostRA::findFirstUses(
const Instruction *texi,
const Instruction *insn,
std::list<TexUse> &uses,
- std::tr1::unordered_set<const Instruction *>& visited)
+ unordered_set<const Instruction *>& visited)
{
for (int d = 0; insn->defExists(d); ++d) {
Value *v = insn->getDef(d);
@@ -323,7 +323,7 @@ NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
if (!uses)
return false;
for (size_t i = 0; i < texes.size(); ++i) {
- std::tr1::unordered_set<const Instruction *> visited;
+ unordered_set<const Instruction *> visited;
findFirstUses(texes[i], texes[i], uses[i], visited);
}
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
index 260e101..2ce52e5 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -20,8 +20,6 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-#include <tr1/unordered_set>
-
#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_build_util.h"
@@ -73,7 +71,7 @@ private:
inline bool insnDominatedBy(const Instruction *, const Instruction *) const;
void findFirstUses(const Instruction *tex, const Instruction *def,
std::list<TexUse>&,
- std::tr1::unordered_set<const Instruction *>&);
+ unordered_set<const Instruction *>&);
void findOverwritingDefs(const Instruction *tex, Instruction *insn,
const BasicBlock *term,
std::list<TexUse>&);
@@ -111,10 +109,11 @@ protected:
void checkPredicate(Instruction *);
+ virtual bool visit(Instruction *);
+
private:
virtual bool visit(Function *);
virtual bool visit(BasicBlock *);
- virtual bool visit(Instruction *);
void readTessCoord(LValue *dst, int c);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index cea96dc..b01ef41 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -1023,27 +1023,53 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
case OP_AND:
{
- CmpInstruction *cmp = i->getSrc(t)->getInsn()->asCmp();
- if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
- return;
- if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
- return;
- if (imm0.reg.data.f32 != 1.0)
- return;
- if (i->getSrc(t)->getInsn()->dType != TYPE_U32)
- return;
+ Instruction *src = i->getSrc(t)->getInsn();
+ ImmediateValue imm1;
+ if (imm0.reg.data.u32 == 0) {
+ i->op = OP_MOV;
+ i->setSrc(0, new_ImmediateValue(prog, 0u));
+ i->src(0).mod = Modifier(0);
+ i->setSrc(1, NULL);
+ } else if (imm0.reg.data.u32 == ~0U) {
+ i->op = i->src(t).mod.getOp();
+ if (t) {
+ i->setSrc(0, i->getSrc(t));
+ i->src(0).mod = i->src(t).mod;
+ }
+ i->setSrc(1, NULL);
+ } else if (src->asCmp()) {
+ CmpInstruction *cmp = src->asCmp();
+ if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
+ return;
+ if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
+ return;
+ if (imm0.reg.data.f32 != 1.0)
+ return;
+ if (cmp->dType != TYPE_U32)
+ return;
- i->getSrc(t)->getInsn()->dType = TYPE_F32;
- if (i->src(t).mod != Modifier(0)) {
- assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
- i->src(t).mod = Modifier(0);
- cmp->setCond = inverseCondCode(cmp->setCond);
- }
- i->op = OP_MOV;
- i->setSrc(s, NULL);
- if (t) {
- i->setSrc(0, i->getSrc(t));
- i->setSrc(t, NULL);
+ cmp->dType = TYPE_F32;
+ if (i->src(t).mod != Modifier(0)) {
+ assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
+ i->src(t).mod = Modifier(0);
+ cmp->setCond = inverseCondCode(cmp->setCond);
+ }
+ i->op = OP_MOV;
+ i->setSrc(s, NULL);
+ if (t) {
+ i->setSrc(0, i->getSrc(t));
+ i->setSrc(t, NULL);
+ }
+ } else if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32) &&
+ src->op == OP_SHR &&
+ src->src(1).getImmediate(imm1) &&
+ i->src(t).mod == Modifier(0) &&
+ util_is_power_of_two(imm0.reg.data.u32 + 1)) {
+ // low byte = offset, high byte = width
+ uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32;
+ i->op = OP_EXTBF;
+ i->setSrc(0, src->getSrc(0));
+ i->setSrc(1, new_ImmediateValue(prog, ext));
}
}
break;
@@ -1106,6 +1132,84 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
i->op = OP_MOV;
break;
}
+ case OP_CVT: {
+ Storage res;
+
+ // TODO: handle 64-bit values properly
+ if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8)
+ return;
+
+ // TODO: handle single byte/word extractions
+ if (i->subOp)
+ return;
+
+ bld.setPosition(i, true); /* make sure bld is init'ed */
+
+#define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
+ case type: \
+ switch (i->sType) { \
+ case TYPE_F32: \
+ res.data.dst = util_iround(i->saturate ? \
+ CLAMP(imm0.reg.data.f32, fmin, fmax) : \
+ imm0.reg.data.f32); \
+ break; \
+ case TYPE_S32: \
+ res.data.dst = i->saturate ? \
+ CLAMP(imm0.reg.data.s32, imin, imax) : \
+ imm0.reg.data.s32; \
+ break; \
+ case TYPE_U32: \
+ res.data.dst = i->saturate ? \
+ CLAMP(imm0.reg.data.u32, umin, umax) : \
+ imm0.reg.data.u32; \
+ break; \
+ case TYPE_S16: \
+ res.data.dst = i->saturate ? \
+ CLAMP(imm0.reg.data.s16, imin, imax) : \
+ imm0.reg.data.s16; \
+ break; \
+ case TYPE_U16: \
+ res.data.dst = i->saturate ? \
+ CLAMP(imm0.reg.data.u16, umin, umax) : \
+ imm0.reg.data.u16; \
+ break; \
+ default: return; \
+ } \
+ i->setSrc(0, bld.mkImm(res.data.dst)); \
+ break
+
+ switch(i->dType) {
+ CASE(TYPE_U16, u16, 0, UINT16_MAX, 0, UINT16_MAX, 0, UINT16_MAX);
+ CASE(TYPE_S16, s16, INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX, 0, INT16_MAX);
+ CASE(TYPE_U32, u32, 0, UINT32_MAX, 0, INT32_MAX, 0, UINT32_MAX);
+ CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
+ case TYPE_F32:
+ switch (i->sType) {
+ case TYPE_F32:
+ res.data.f32 = i->saturate ?
+ CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
+ imm0.reg.data.f32;
+ break;
+ case TYPE_U16: res.data.f32 = (float) imm0.reg.data.u16; break;
+ case TYPE_U32: res.data.f32 = (float) imm0.reg.data.u32; break;
+ case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break;
+ case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break;
+ default:
+ return;
+ }
+ i->setSrc(0, bld.mkImm(res.data.f32));
+ break;
+ default:
+ return;
+ }
+#undef CASE
+
+ i->setType(i->dType); /* Remove i->sType, which we don't need anymore */
+ i->op = OP_MOV;
+ i->saturate = 0;
+ i->src(0).mod = Modifier(0); /* Clear the already applied modifier */
+ break;
+ }
default:
return;
}
@@ -1212,7 +1316,8 @@ private:
void handleRCP(Instruction *);
void handleSLCT(Instruction *);
void handleLOGOP(Instruction *);
- void handleCVT(Instruction *);
+ void handleCVT_NEG(Instruction *);
+ void handleCVT_EXTBF(Instruction *);
void handleSUCLAMP(Instruction *);
BuildUtil bld;
@@ -1463,12 +1568,12 @@ AlgebraicOpt::handleLOGOP(Instruction *logop)
// nv50:
// F2I(NEG(I2F(ABS(SET))))
void
-AlgebraicOpt::handleCVT(Instruction *cvt)
+AlgebraicOpt::handleCVT_NEG(Instruction *cvt)
{
+ Instruction *insn = cvt->getSrc(0)->getInsn();
if (cvt->sType != TYPE_F32 ||
cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
return;
- Instruction *insn = cvt->getSrc(0)->getInsn();
if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
return;
if (insn->src(0).mod != Modifier(0))
@@ -1498,6 +1603,104 @@ AlgebraicOpt::handleCVT(Instruction *cvt)
delete_Instruction(prog, cvt);
}
+// Some shaders extract packed bytes out of words and convert them to
+// e.g. float. The Fermi+ CVT instruction can extract those directly, as can
+// nv50 for word sizes.
+//
+// CVT(EXTBF(x, byte/word))
+// CVT(AND(bytemask, x))
+// CVT(AND(bytemask, SHR(x, 8/16/24)))
+// CVT(SHR(x, 16/24))
+void
+AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
+{
+ Instruction *insn = cvt->getSrc(0)->getInsn();
+ ImmediateValue imm;
+ Value *arg = NULL;
+ unsigned width, offset;
+ if ((cvt->sType != TYPE_U32 && cvt->sType != TYPE_S32) || !insn)
+ return;
+ if (insn->op == OP_EXTBF && insn->src(1).getImmediate(imm)) {
+ width = (imm.reg.data.u32 >> 8) & 0xff;
+ offset = imm.reg.data.u32 & 0xff;
+ arg = insn->getSrc(0);
+
+ if (width != 8 && width != 16)
+ return;
+ if (width == 8 && offset & 0x7)
+ return;
+ if (width == 16 && offset & 0xf)
+ return;
+ } else if (insn->op == OP_AND) {
+ int s;
+ if (insn->src(0).getImmediate(imm))
+ s = 0;
+ else if (insn->src(1).getImmediate(imm))
+ s = 1;
+ else
+ return;
+
+ if (imm.reg.data.u32 == 0xff)
+ width = 8;
+ else if (imm.reg.data.u32 == 0xffff)
+ width = 16;
+ else
+ return;
+
+ arg = insn->getSrc(!s);
+ Instruction *shift = arg->getInsn();
+ offset = 0;
+ if (shift && shift->op == OP_SHR &&
+ shift->sType == cvt->sType &&
+ shift->src(1).getImmediate(imm) &&
+ ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
+ (width == 16 && (imm.reg.data.u32 & 0xf) == 0))) {
+ arg = shift->getSrc(0);
+ offset = imm.reg.data.u32;
+ }
+ } else if (insn->op == OP_SHR &&
+ insn->sType == cvt->sType &&
+ insn->src(1).getImmediate(imm)) {
+ arg = insn->getSrc(0);
+ if (imm.reg.data.u32 == 24) {
+ width = 8;
+ offset = 24;
+ } else if (imm.reg.data.u32 == 16) {
+ width = 16;
+ offset = 16;
+ } else {
+ return;
+ }
+ }
+
+ if (!arg)
+ return;
+
+ // Irrespective of what came earlier, we can undo a shift on the argument
+ // by adjusting the offset.
+ Instruction *shift = arg->getInsn();
+ if (shift && shift->op == OP_SHL &&
+ shift->src(1).getImmediate(imm) &&
+ ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
+ (width == 16 && (imm.reg.data.u32 & 0xf) == 0)) &&
+ imm.reg.data.u32 <= offset) {
+ arg = shift->getSrc(0);
+ offset -= imm.reg.data.u32;
+ }
+
+ // The unpackSnorm lowering still leaves a few shifts behind, but it's too
+ // annoying to detect them.
+
+ if (width == 8) {
+ cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U8 : TYPE_S8;
+ } else {
+ assert(width == 16);
+ cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U16 : TYPE_S16;
+ }
+ cvt->setSrc(0, arg);
+ cvt->subOp = offset >> 3;
+}
+
// SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
void
AlgebraicOpt::handleSUCLAMP(Instruction *insn)
@@ -1568,7 +1771,9 @@ AlgebraicOpt::visit(BasicBlock *bb)
handleLOGOP(i);
break;
case OP_CVT:
- handleCVT(i);
+ handleCVT_NEG(i);
+ if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32))
+ handleCVT_EXTBF(i);
break;
case OP_SUCLAMP:
handleSUCLAMP(i);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index 78bc97f..0cd21cf 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -25,7 +25,6 @@
#include <stack>
#include <limits>
-#include <tr1/unordered_set>
namespace nv50_ir {
@@ -1551,7 +1550,7 @@ SpillCodeInserter::run(const std::list<ValuePair>& lst)
// Keep track of which instructions to delete later. Deleting them
// inside the loop is unsafe since a single instruction may have
// multiple destinations that all need to be spilled (like OP_SPLIT).
- std::tr1::unordered_set<Instruction *> to_del;
+ unordered_set<Instruction *> to_del;
for (Value::DefIterator d = lval->defs.begin(); d != lval->defs.end();
++d) {
@@ -1593,7 +1592,7 @@ SpillCodeInserter::run(const std::list<ValuePair>& lst)
}
}
- for (std::tr1::unordered_set<Instruction *>::const_iterator it = to_del.begin();
+ for (unordered_set<Instruction *>::const_iterator it = to_del.begin();
it != to_del.end(); ++it)
delete_Instruction(func->getProgram(), *it);
}
diff --git a/src/gallium/drivers/nouveau/codegen/unordered_set.h b/src/gallium/drivers/nouveau/codegen/unordered_set.h
new file mode 100644
index 0000000..8ef6d46
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/unordered_set.h
@@ -0,0 +1,48 @@
+#ifndef __NV50_UNORDERED_SET_H__
+#define __NV50_UNORDERED_SET_H__
+
+#if (__cplusplus >= 201103L) || defined(ANDROID)
+#include <unordered_set>
+#else
+#include <tr1/unordered_set>
+#endif
+
+namespace nv50_ir {
+
+#if __cplusplus >= 201103L
+using std::unordered_set;
+#elif !defined(ANDROID)
+using std::tr1::unordered_set;
+#else // Android release before lollipop
+using std::isfinite;
+typedef std::tr1::unordered_set<void *> voidptr_unordered_set;
+
+template <typename V>
+class unordered_set : public voidptr_unordered_set {
+ public:
+ typedef voidptr_unordered_set _base;
+ typedef _base::iterator _biterator;
+ typedef _base::const_iterator const_biterator;
+
+ class iterator : public _biterator {
+ public:
+ iterator(const _biterator & i) : _biterator(i) {}
+ V operator*() const { return reinterpret_cast<V>(*_biterator(*this)); }
+ };
+ class const_iterator : public const_biterator {
+ public:
+ const_iterator(const iterator & i) : const_biterator(i) {}
+ const_iterator(const const_biterator & i) : const_biterator(i) {}
+ const V operator*() const { return reinterpret_cast<const V>(*const_biterator(*this)); }
+ };
+
+ iterator begin() { return _base::begin(); }
+ iterator end() { return _base::end(); }
+ const_iterator begin() const { return _base::begin(); }
+ const_iterator end() const { return _base::end(); }
+};
+#endif
+
+} // namespace nv50_ir
+
+#endif // __NV50_UNORDERED_SET_H__
diff --git a/src/gallium/drivers/nouveau/nouveau_compiler.c b/src/gallium/drivers/nouveau/nouveau_compiler.c
index 8660498..495450b 100644
--- a/src/gallium/drivers/nouveau/nouveau_compiler.c
+++ b/src/gallium/drivers/nouveau/nouveau_compiler.c
@@ -190,6 +190,10 @@ main(int argc, char *argv[])
type = PIPE_SHADER_GEOMETRY;
else if (!strncmp(text, "COMP", 4))
type = PIPE_SHADER_COMPUTE;
+ else if (!strncmp(text, "TESS_CTRL", 9))
+ type = PIPE_SHADER_TESS_CTRL;
+ else if (!strncmp(text, "TESS_EVAL", 9))
+ type = PIPE_SHADER_TESS_EVAL;
else {
_debug_printf("Unrecognized TGSI header\n");
return 1;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index 9505a0b..410e631 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -117,7 +117,6 @@ nv50_blend_state_create(struct pipe_context *pipe,
struct nv50_blend_stateobj *so = CALLOC_STRUCT(nv50_blend_stateobj);
int i;
bool emit_common_func = cso->rt[0].blend_enable;
- uint32_t ms;
if (nv50_context(pipe)->screen->tesla->oclass >= NVA3_3D_CLASS) {
SB_BEGIN_3D(so, BLEND_INDEPENDENT, 1);
@@ -189,15 +188,6 @@ nv50_blend_state_create(struct pipe_context *pipe,
SB_DATA (so, nv50_colormask(cso->rt[0].colormask));
}
- ms = 0;
- if (cso->alpha_to_coverage)
- ms |= NV50_3D_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE;
- if (cso->alpha_to_one)
- ms |= NV50_3D_MULTISAMPLE_CTRL_ALPHA_TO_ONE;
-
- SB_BEGIN_3D(so, MULTISAMPLE_CTRL, 1);
- SB_DATA (so, ms);
-
assert(so->size <= (sizeof(so->state) / sizeof(so->state[0])));
return so;
}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
index 985603d..b304a17 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
@@ -1,4 +1,6 @@
+#include "util/u_format.h"
+
#include "nv50/nv50_context.h"
#include "nv50/nv50_defs.xml.h"
@@ -314,6 +316,25 @@ nv50_validate_derived_2(struct nv50_context *nv50)
}
static void
+nv50_validate_derived_3(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ struct pipe_framebuffer_state *fb = &nv50->framebuffer;
+ uint32_t ms = 0;
+
+ if ((!fb->nr_cbufs || !fb->cbufs[0] ||
+ !util_format_is_pure_integer(fb->cbufs[0]->format)) && nv50->blend) {
+ if (nv50->blend->pipe.alpha_to_coverage)
+ ms |= NV50_3D_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE;
+ if (nv50->blend->pipe.alpha_to_one)
+ ms |= NV50_3D_MULTISAMPLE_CTRL_ALPHA_TO_ONE;
+ }
+
+ BEGIN_NV04(push, NV50_3D(MULTISAMPLE_CTRL), 1);
+ PUSH_DATA (push, ms);
+}
+
+static void
nv50_validate_clip(struct nv50_context *nv50)
{
struct nouveau_pushbuf *push = nv50->base.pushbuf;
@@ -474,6 +495,7 @@ static struct state_validate {
{ nv50_validate_derived_rs, NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER |
NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
{ nv50_validate_derived_2, NV50_NEW_ZSA | NV50_NEW_FRAMEBUFFER },
+ { nv50_validate_derived_3, NV50_NEW_BLEND | NV50_NEW_FRAMEBUFFER },
{ nv50_validate_clip, NV50_NEW_CLIP | NV50_NEW_RASTERIZER |
NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
{ nv50_constbufs_validate, NV50_NEW_CONSTBUF },
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
index cf75d1e..4b1d00c 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
@@ -19,7 +19,7 @@
struct nv50_blend_stateobj {
struct pipe_blend_state pipe;
int size;
- uint32_t state[84]; // TODO: allocate less if !independent_blend_enable
+ uint32_t state[82]; // TODO: allocate less if !independent_blend_enable
};
struct nv50_rasterizer_stateobj {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
index b1ae016..64348b3 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -68,6 +68,10 @@ nv50_2d_format(enum pipe_format format, bool dst, bool dst_src_equal)
return NV50_SURFACE_FORMAT_R16_UNORM;
case 4:
return NV50_SURFACE_FORMAT_BGRA8_UNORM;
+ case 8:
+ return NV50_SURFACE_FORMAT_RGBA16_FLOAT;
+ case 16:
+ return NV50_SURFACE_FORMAT_RGBA32_FLOAT;
default:
return 0;
}
@@ -1003,6 +1007,8 @@ nv50_blitctx_prepare_state(struct nv50_blitctx *blit)
/* zsa state */
BEGIN_NV04(push, NV50_3D(DEPTH_TEST_ENABLE), 1);
PUSH_DATA (push, 0);
+ BEGIN_NV04(push, NV50_3D(DEPTH_BOUNDS_EN), 1);
+ PUSH_DATA (push, 0);
BEGIN_NV04(push, NV50_3D(STENCIL_ENABLE), 1);
PUSH_DATA (push, 0);
BEGIN_NV04(push, NV50_3D(ALPHA_TEST_ENABLE), 1);
@@ -1387,18 +1393,24 @@ nv50_blit_eng2d(struct nv50_context *nv50, const struct pipe_blit_info *info)
PUSH_DATA (push, info->dst.box.z + i);
} else {
const unsigned z = info->dst.box.z + i;
+ const uint64_t address = dst->base.address +
+ dst->level[info->dst.level].offset +
+ z * dst->layer_stride;
BEGIN_NV04(push, NV50_2D(DST_ADDRESS_HIGH), 2);
- PUSH_DATAh(push, dst->base.address + z * dst->layer_stride);
- PUSH_DATA (push, dst->base.address + z * dst->layer_stride);
+ PUSH_DATAh(push, address);
+ PUSH_DATA (push, address);
}
if (src->layout_3d) {
/* not possible because of depth tiling */
assert(0);
} else {
const unsigned z = info->src.box.z + i;
+ const uint64_t address = src->base.address +
+ src->level[info->src.level].offset +
+ z * src->layer_stride;
BEGIN_NV04(push, NV50_2D(SRC_ADDRESS_HIGH), 2);
- PUSH_DATAh(push, src->base.address + z * src->layer_stride);
- PUSH_DATA (push, src->base.address + z * src->layer_stride);
+ PUSH_DATAh(push, address);
+ PUSH_DATA (push, address);
}
BEGIN_NV04(push, NV50_2D(BLIT_SRC_Y_INT), 1); /* trigger */
PUSH_DATA (push, srcy >> 32);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
index 84f8db6..7a15a11 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -132,6 +132,9 @@ nvc0_context_unreference_resources(struct nvc0_context *nvc0)
pipe_resource_reference(res, NULL);
}
util_dynarray_fini(&nvc0->global_residents);
+
+ if (nvc0->tcp_empty)
+ nvc0->base.pipe.delete_tcs_state(&nvc0->base.pipe, nvc0->tcp_empty);
}
static void
@@ -306,13 +309,6 @@ nvc0_create(struct pipe_screen *pscreen, void *priv)
pipe->memory_barrier = nvc0_memory_barrier;
pipe->get_sample_position = nvc0_context_get_sample_position;
- if (!screen->cur_ctx) {
- nvc0->state = screen->save_state;
- screen->cur_ctx = nvc0;
- nouveau_pushbuf_bufctx(screen->base.pushbuf, nvc0->bufctx);
- }
- screen->base.pushbuf->kick_notify = nvc0_default_kick_notify;
-
nvc0_init_query_functions(nvc0);
nvc0_init_surface_functions(nvc0);
nvc0_init_state_functions(nvc0);
@@ -326,6 +322,21 @@ nvc0_create(struct pipe_screen *pscreen, void *priv)
/* shader builtin library is per-screen, but we need a context for m2mf */
nvc0_program_library_upload(nvc0);
+ nvc0_program_init_tcp_empty(nvc0);
+ if (!nvc0->tcp_empty)
+ goto out_err;
+ /* set the empty tctl prog on next draw in case one is never set */
+ nvc0->dirty |= NVC0_NEW_TCTLPROG;
+
+ /* now that there are no more opportunities for errors, set the current
+ * context if there isn't already one.
+ */
+ if (!screen->cur_ctx) {
+ nvc0->state = screen->save_state;
+ screen->cur_ctx = nvc0;
+ nouveau_pushbuf_bufctx(screen->base.pushbuf, nvc0->bufctx);
+ }
+ screen->base.pushbuf->kick_notify = nvc0_default_kick_notify;
/* add permanently resident buffers to bufctxts */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index f449942..df1a891 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -128,6 +128,8 @@ struct nvc0_context {
struct nvc0_program *fragprog;
struct nvc0_program *compprog;
+ struct nvc0_program *tcp_empty;
+
struct nvc0_constbuf constbuf[6][NVC0_MAX_PIPE_CONSTBUFS];
uint16_t constbuf_dirty[6];
uint16_t constbuf_valid[6];
@@ -227,6 +229,7 @@ void nvc0_program_destroy(struct nvc0_context *, struct nvc0_program *);
void nvc0_program_library_upload(struct nvc0_context *);
uint32_t nvc0_program_symbol_offset(const struct nvc0_program *,
uint32_t label);
+void nvc0_program_init_tcp_empty(struct nvc0_context *);
/* nvc0_query.c */
void nvc0_init_query_functions(struct nvc0_context *);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index 507a250..12f1bb7 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -22,6 +22,8 @@
#include "pipe/p_defines.h"
+#include "tgsi/tgsi_ureg.h"
+
#include "nvc0/nvc0_context.h"
#include "codegen/nv50_ir_driver.h"
@@ -799,3 +801,18 @@ nvc0_program_symbol_offset(const struct nvc0_program *prog, uint32_t label)
return prog->code_base + base + syms[i].offset;
return prog->code_base; /* no symbols or symbol not found */
}
+
+void
+nvc0_program_init_tcp_empty(struct nvc0_context *nvc0)
+{
+ struct ureg_program *ureg;
+
+ ureg = ureg_create(TGSI_PROCESSOR_TESS_CTRL);
+ if (!ureg)
+ return;
+
+ ureg_property(ureg, TGSI_PROPERTY_TCS_VERTICES_OUT, 1);
+ ureg_END(ureg);
+
+ nvc0->tcp_empty = ureg_create_shader_and_destroy(ureg, &nvc0->base.pipe);
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index 8aa127a..8f8ac2d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -148,8 +148,13 @@ nvc0_tctlprog_validate(struct nvc0_context *nvc0)
BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(2)), 1);
PUSH_DATA (push, tp->num_gprs);
} else {
- BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 1);
+ tp = nvc0->tcp_empty;
+ /* not a whole lot we can do to handle this failure */
+ if (!nvc0_program_validate(nvc0, tp))
+ assert(!"unable to validate empty tcp");
+ BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 2);
PUSH_DATA (push, 0x20);
+ PUSH_DATA (push, tp->code_base);
}
nvc0_program_update_context_state(nvc0, tp, 1);
}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index 2a33857..ee29912 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -90,7 +90,6 @@ nvc0_blend_state_create(struct pipe_context *pipe,
struct nvc0_blend_stateobj *so = CALLOC_STRUCT(nvc0_blend_stateobj);
int i;
int r; /* reference */
- uint32_t ms;
uint8_t blend_en = 0;
bool indep_masks = false;
bool indep_funcs = false;
@@ -176,15 +175,6 @@ nvc0_blend_state_create(struct pipe_context *pipe,
}
}
- ms = 0;
- if (cso->alpha_to_coverage)
- ms |= NVC0_3D_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE;
- if (cso->alpha_to_one)
- ms |= NVC0_3D_MULTISAMPLE_CTRL_ALPHA_TO_ONE;
-
- SB_BEGIN_3D(so, MULTISAMPLE_CTRL, 1);
- SB_DATA (so, ms);
-
assert(so->size <= (sizeof(so->state) / sizeof(so->state[0])));
return so;
}
@@ -234,7 +224,7 @@ nvc0_rasterizer_state_create(struct pipe_context *pipe,
SB_IMMED_3D(so, MULTISAMPLE_ENABLE, cso->multisample);
SB_IMMED_3D(so, LINE_SMOOTH_ENABLE, cso->line_smooth);
- if (cso->line_smooth)
+ if (cso->line_smooth || cso->multisample)
SB_BEGIN_3D(so, LINE_WIDTH_SMOOTH, 1);
else
SB_BEGIN_3D(so, LINE_WIDTH_ALIASED, 1);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index ce1119c..47bd66d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -1,4 +1,5 @@
+#include "util/u_format.h"
#include "util/u_math.h"
#include "nvc0/nvc0_context.h"
@@ -555,6 +556,25 @@ nvc0_validate_derived_2(struct nvc0_context *nvc0)
}
static void
+nvc0_validate_derived_3(struct nvc0_context *nvc0)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ struct pipe_framebuffer_state *fb = &nvc0->framebuffer;
+ uint32_t ms = 0;
+
+ if ((!fb->nr_cbufs || !fb->cbufs[0] ||
+ !util_format_is_pure_integer(fb->cbufs[0]->format)) && nvc0->blend) {
+ if (nvc0->blend->pipe.alpha_to_coverage)
+ ms |= NVC0_3D_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE;
+ if (nvc0->blend->pipe.alpha_to_one)
+ ms |= NVC0_3D_MULTISAMPLE_CTRL_ALPHA_TO_ONE;
+ }
+
+ BEGIN_NVC0(push, NVC0_3D(MULTISAMPLE_CTRL), 1);
+ PUSH_DATA (push, ms);
+}
+
+static void
nvc0_validate_tess_state(struct nvc0_context *nvc0)
{
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
@@ -628,6 +648,7 @@ static struct state_validate {
{ nvc0_validate_derived_1, NVC0_NEW_FRAGPROG | NVC0_NEW_ZSA |
NVC0_NEW_RASTERIZER },
{ nvc0_validate_derived_2, NVC0_NEW_ZSA | NVC0_NEW_FRAMEBUFFER },
+ { nvc0_validate_derived_3, NVC0_NEW_BLEND | NVC0_NEW_FRAMEBUFFER },
{ nvc0_validate_clip, NVC0_NEW_CLIP | NVC0_NEW_RASTERIZER |
NVC0_NEW_VERTPROG |
NVC0_NEW_TEVLPROG |
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
index 18fcc12..8bc33c6 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
@@ -17,7 +17,7 @@
struct nvc0_blend_stateobj {
struct pipe_blend_state pipe;
int size;
- uint32_t state[72];
+ uint32_t state[70];
};
struct nvc0_rasterizer_stateobj {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index 51a6f93..dbdf292 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -887,6 +887,7 @@ nvc0_blitctx_prepare_state(struct nvc0_blitctx *blit)
/* zsa state */
IMMED_NVC0(push, NVC0_3D(DEPTH_TEST_ENABLE), 0);
+ IMMED_NVC0(push, NVC0_3D(DEPTH_BOUNDS_EN), 0);
IMMED_NVC0(push, NVC0_3D(STENCIL_ENABLE), 0);
IMMED_NVC0(push, NVC0_3D(ALPHA_TEST_ENABLE), 0);
@@ -1336,18 +1337,24 @@ nvc0_blit_eng2d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
PUSH_DATA (push, info->dst.box.z + i);
} else {
const unsigned z = info->dst.box.z + i;
+ const uint64_t address = dst->base.address +
+ dst->level[info->dst.level].offset +
+ z * dst->layer_stride;
BEGIN_NVC0(push, NVC0_2D(DST_ADDRESS_HIGH), 2);
- PUSH_DATAh(push, dst->base.address + z * dst->layer_stride);
- PUSH_DATA (push, dst->base.address + z * dst->layer_stride);
+ PUSH_DATAh(push, address);
+ PUSH_DATA (push, address);
}
if (src->layout_3d) {
/* not possible because of depth tiling */
assert(0);
} else {
const unsigned z = info->src.box.z + i;
+ const uint64_t address = src->base.address +
+ src->level[info->src.level].offset +
+ z * src->layer_stride;
BEGIN_NVC0(push, NVC0_2D(SRC_ADDRESS_HIGH), 2);
- PUSH_DATAh(push, src->base.address + z * src->layer_stride);
- PUSH_DATA (push, src->base.address + z * src->layer_stride);
+ PUSH_DATAh(push, address);
+ PUSH_DATA (push, address);
}
BEGIN_NVC0(push, NVC0_2D(BLIT_SRC_Y_INT), 1); /* trigger */
PUSH_DATA (push, srcy >> 32);
diff --git a/src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c b/src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c
index 14f93fb..e8f4087 100644
--- a/src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c
+++ b/src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c
@@ -693,7 +693,8 @@ void rc_init_regalloc_state(struct rc_regalloc_state *s)
};
/* Allocate the main ra data structure */
- s->regs = ra_alloc_reg_set(NULL, R500_PFS_NUM_TEMP_REGS * RC_MASK_XYZW);
+ s->regs = ra_alloc_reg_set(NULL, R500_PFS_NUM_TEMP_REGS * RC_MASK_XYZW,
+ true);
/* Create the register classes */
for (i = 0; i < RC_REG_CLASS_COUNT; i++) {
diff --git a/src/gallium/drivers/r300/r300_blit.c b/src/gallium/drivers/r300/r300_blit.c
index 6ea8f24..b8cc316 100644
--- a/src/gallium/drivers/r300/r300_blit.c
+++ b/src/gallium/drivers/r300/r300_blit.c
@@ -667,7 +667,8 @@ static void r300_resource_copy_region(struct pipe_context *pipe,
r300_blitter_begin(r300, R300_COPY);
util_blitter_blit_generic(r300->blitter, dst_view, &dstbox,
src_view, src_box, src_width0, src_height0,
- PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL);
+ PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL,
+ FALSE);
r300_blitter_end(r300);
pipe_surface_reference(&dst_view, NULL);
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index b0002c3..22a0950 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -732,7 +732,8 @@ void r600_resource_copy_region(struct pipe_context *ctx,
r600_blitter_begin(ctx, R600_COPY_TEXTURE);
util_blitter_blit_generic(rctx->blitter, dst_view, &dstbox,
src_view, src_box, src_width0, src_height0,
- PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL);
+ PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL,
+ FALSE);
r600_blitter_end(ctx);
pipe_surface_reference(&dst_view, NULL);
diff --git a/src/gallium/drivers/r600/r600_isa.h b/src/gallium/drivers/r600/r600_isa.h
index 381f06d..fdbe1c0 100644
--- a/src/gallium/drivers/r600/r600_isa.h
+++ b/src/gallium/drivers/r600/r600_isa.h
@@ -262,7 +262,7 @@ static const struct alu_op_info alu_op_table[] = {
{"PRED_SETNE_PUSH_INT", 2, { 0x4D, 0x4D },{ AF_VS, AF_VS, AF_VS, AF_VS}, AF_PRED_PUSH | AF_CC_NE | AF_INT_CMP },
{"PRED_SETLT_PUSH_INT", 2, { 0x4E, 0x4E },{ AF_VS, AF_VS, AF_VS, AF_VS}, AF_PRED_PUSH | AF_CC_LT | AF_INT_CMP },
{"PRED_SETLE_PUSH_INT", 2, { 0x4F, 0x4F },{ AF_VS, AF_VS, AF_VS, AF_VS}, AF_PRED_PUSH | AF_CC_LE | AF_INT_CMP },
- {"FLT_TO_INT", 1, { 0x6B, 0x50 },{ AF_S, AF_S, AF_VS, AF_VS}, AF_INT_DST | AF_CVT },
+ {"FLT_TO_INT", 1, { 0x6B, 0x50 },{ AF_S, AF_S, AF_V, AF_V}, AF_INT_DST | AF_CVT },
{"BFREV_INT", 1, { -1, 0x51 },{ 0, 0, AF_VS, AF_VS}, AF_INT_DST },
{"ADDC_UINT", 2, { -1, 0x52 },{ 0, 0, AF_VS, AF_VS}, AF_UINT_DST },
{"SUBB_UINT", 2, { -1, 0x53 },{ 0, 0, AF_VS, AF_VS}, AF_UINT_DST },
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 9b66105..384ba80 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -90,7 +90,7 @@
struct r600_context;
struct r600_bytecode;
-struct r600_shader_key;
+union r600_shader_key;
/* This is an atom containing GPU commands that never change.
* This is supposed to be copied directly into the CS. */
@@ -643,7 +643,7 @@ void r600_resource_copy_region(struct pipe_context *ctx,
/* r600_shader.c */
int r600_pipe_shader_create(struct pipe_context *ctx,
struct r600_pipe_shader *shader,
- struct r600_shader_key key);
+ union r600_shader_key key);
void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader);
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 8d1f95a..4c4b600 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -62,7 +62,7 @@ The compiler must issue the source argument to slots z, y, and x
static int r600_shader_from_tgsi(struct r600_context *rctx,
struct r600_pipe_shader *pipeshader,
- struct r600_shader_key key);
+ union r600_shader_key key);
static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
@@ -133,7 +133,7 @@ static int store_shader(struct pipe_context *ctx,
int r600_pipe_shader_create(struct pipe_context *ctx,
struct r600_pipe_shader *shader,
- struct r600_shader_key key)
+ union r600_shader_key key)
{
struct r600_context *rctx = (struct r600_context *)ctx;
struct r600_pipe_shader_selector *sel = shader->selector;
@@ -141,7 +141,7 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens);
unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
- unsigned export_shader = key.vs_as_es;
+ unsigned export_shader = key.vs.as_es;
shader->shader.bc.isa = rctx->isa;
@@ -1802,7 +1802,7 @@ static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind)
static int r600_shader_from_tgsi(struct r600_context *rctx,
struct r600_pipe_shader *pipeshader,
- struct r600_shader_key key)
+ union r600_shader_key key)
{
struct r600_screen *rscreen = rctx->screen;
struct r600_shader *shader = &pipeshader->shader;
@@ -1816,7 +1816,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
unsigned opcode;
int i, j, k, r = 0;
int next_param_base = 0, next_clip_base;
- int max_color_exports = MAX2(key.nr_cbufs, 1);
+ int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
/* Declarations used by llvm code */
bool use_llvm = false;
bool indirect_gprs;
@@ -1830,8 +1830,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
ctx.shader = shader;
ctx.native_integers = true;
- shader->vs_as_gs_a = key.vs_as_gs_a;
- shader->vs_as_es = key.vs_as_es;
+ shader->vs_as_gs_a = key.vs.as_gs_a;
+ shader->vs_as_es = key.vs.as_es;
r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
rscreen->has_compressed_msaa_texturing);
@@ -1844,9 +1844,9 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
shader->processor_type = ctx.type;
ctx.bc->type = shader->processor_type;
- ring_outputs = key.vs_as_es || (ctx.type == TGSI_PROCESSOR_GEOMETRY);
+ ring_outputs = key.vs.as_es || (ctx.type == TGSI_PROCESSOR_GEOMETRY);
- if (key.vs_as_es) {
+ if (key.vs.as_es) {
ctx.gs_for_vs = &rctx->gs_shader->current->shader;
} else {
ctx.gs_for_vs = NULL;
@@ -1866,7 +1866,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
shader->nr_ps_color_exports = 0;
shader->nr_ps_max_color_exports = 0;
- shader->two_side = key.color_two_side;
+ shader->two_side = key.ps.color_two_side;
/* register allocations */
/* Values [0,127] correspond to GPR[0..127].
@@ -1970,7 +1970,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
shader->fs_write_all = FALSE;
if (shader->vs_as_gs_a)
- vs_add_primid_output(&ctx, key.vs_prim_id_out);
+ vs_add_primid_output(&ctx, key.vs.prim_id_out);
while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
tgsi_parse_token(&ctx.parse);
@@ -2091,7 +2091,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN);
radeon_llvm_ctx.stream_outputs = &so;
- radeon_llvm_ctx.alpha_to_one = key.alpha_to_one;
+ radeon_llvm_ctx.alpha_to_one = key.ps.alpha_to_one;
radeon_llvm_ctx.has_compressed_msaa_texturing =
ctx.bc->has_compressed_msaa_texturing;
mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
@@ -2270,7 +2270,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
convert_edgeflag_to_int(&ctx);
if (ring_outputs) {
- if (key.vs_as_es)
+ if (key.vs.as_es)
emit_gs_ring_writes(&ctx, FALSE);
} else {
/* Export output */
@@ -2386,7 +2386,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
j--;
continue;
}
- output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
+ output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
output[j].array_base = shader->output[i].sid;
output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
shader->nr_ps_color_exports++;
@@ -2399,7 +2399,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
output[j].swizzle_x = 0;
output[j].swizzle_y = 1;
output[j].swizzle_z = 2;
- output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
+ output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
output[j].burst_count = 1;
output[j].array_base = k;
output[j].op = CF_OP_EXPORT;
@@ -6151,10 +6151,10 @@ static int tgsi_cmp(struct r600_shader_ctx *ctx)
r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
if (r)
return r;
- r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[2]);
+ r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
if (r)
return r;
- r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[2], &ctx->src[1]);
+ r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
if (r)
return r;
tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
index 5d05c81..927bac5 100644
--- a/src/gallium/drivers/r600/r600_shader.h
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -95,13 +95,17 @@ struct r600_shader {
struct r600_shader_array * arrays;
};
-struct r600_shader_key {
- unsigned color_two_side:1;
- unsigned alpha_to_one:1;
- unsigned nr_cbufs:4;
- unsigned vs_as_es:1;
- unsigned vs_as_gs_a:1;
- unsigned vs_prim_id_out:8;
+union r600_shader_key {
+ struct {
+ unsigned nr_cbufs:4;
+ unsigned color_two_side:1;
+ unsigned alpha_to_one:1;
+ } ps;
+ struct {
+ unsigned prim_id_out:8;
+ unsigned as_es:1; /* export shader */
+ unsigned as_gs_a:1;
+ } vs;
};
struct r600_shader_array {
@@ -122,7 +126,7 @@ struct r600_pipe_shader {
unsigned flatshade;
unsigned pa_cl_vs_out_cntl;
unsigned nr_ps_color_outputs;
- struct r600_shader_key key;
+ union r600_shader_key key;
unsigned db_shader_control;
unsigned ps_depth_export;
unsigned enabled_stream_buffers_mask;
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index aa4a8d0..a05dd83 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -702,29 +702,39 @@ void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom)
}
/* Compute the key for the hw shader variant */
-static inline struct r600_shader_key r600_shader_selector_key(struct pipe_context * ctx,
+static inline union r600_shader_key r600_shader_selector_key(struct pipe_context * ctx,
struct r600_pipe_shader_selector * sel)
{
struct r600_context *rctx = (struct r600_context *)ctx;
- struct r600_shader_key key;
+ union r600_shader_key key;
memset(&key, 0, sizeof(key));
- if (sel->type == PIPE_SHADER_FRAGMENT) {
- key.color_two_side = rctx->rasterizer && rctx->rasterizer->two_side;
- key.alpha_to_one = rctx->alpha_to_one &&
- rctx->rasterizer && rctx->rasterizer->multisample_enable &&
- !rctx->framebuffer.cb0_is_integer;
- key.nr_cbufs = rctx->framebuffer.state.nr_cbufs;
- /* Dual-source blending only makes sense with nr_cbufs == 1. */
- if (key.nr_cbufs == 1 && rctx->dual_src_blend)
- key.nr_cbufs = 2;
- } else if (sel->type == PIPE_SHADER_VERTEX) {
- key.vs_as_es = (rctx->gs_shader != NULL);
+ switch (sel->type) {
+ case PIPE_SHADER_VERTEX: {
+ key.vs.as_es = (rctx->gs_shader != NULL);
if (rctx->ps_shader->current->shader.gs_prim_id_input && !rctx->gs_shader) {
- key.vs_as_gs_a = true;
- key.vs_prim_id_out = rctx->ps_shader->current->shader.input[rctx->ps_shader->current->shader.ps_prim_id_input].spi_sid;
+ key.vs.as_gs_a = true;
+ key.vs.prim_id_out = rctx->ps_shader->current->shader.input[rctx->ps_shader->current->shader.ps_prim_id_input].spi_sid;
}
+ break;
+ }
+ case PIPE_SHADER_GEOMETRY:
+ break;
+ case PIPE_SHADER_FRAGMENT: {
+ key.ps.color_two_side = rctx->rasterizer && rctx->rasterizer->two_side;
+ key.ps.alpha_to_one = rctx->alpha_to_one &&
+ rctx->rasterizer && rctx->rasterizer->multisample_enable &&
+ !rctx->framebuffer.cb0_is_integer;
+ key.ps.nr_cbufs = rctx->framebuffer.state.nr_cbufs;
+ /* Dual-source blending only makes sense with nr_cbufs == 1. */
+ if (key.ps.nr_cbufs == 1 && rctx->dual_src_blend)
+ key.ps.nr_cbufs = 2;
+ break;
}
+ default:
+ assert(0);
+ }
+
return key;
}
@@ -734,7 +744,7 @@ static int r600_shader_select(struct pipe_context *ctx,
struct r600_pipe_shader_selector* sel,
bool *dirty)
{
- struct r600_shader_key key;
+ union r600_shader_key key;
struct r600_pipe_shader * shader = NULL;
int r;
diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c
index 16ee541..81f3f45 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -209,8 +209,6 @@ static uint32_t profile2stream_type(struct ruvd_decoder *dec, unsigned family)
static unsigned calc_ctx_size(struct ruvd_decoder *dec)
{
- unsigned width_in_mb, height_in_mb, ctx_size;
-
unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH);
unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT);
@@ -223,8 +221,7 @@ static unsigned calc_ctx_size(struct ruvd_decoder *dec)
width = align (width, 16);
height = align (height, 16);
- ctx_size = ((width + 255) / 16)*((height + 255) / 16) * 16 * max_references + 52 * 1024;
- return ctx_size;
+ return ((width + 255) / 16) * ((height + 255) / 16) * 16 * max_references + 52 * 1024;
}
/* calculate size of reference picture buffer */
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 48972bd..b7450b6 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -586,7 +586,8 @@ void si_resource_copy_region(struct pipe_context *ctx,
si_blitter_begin(ctx, SI_COPY);
util_blitter_blit_generic(sctx->blitter, dst_view, &dstbox,
src_view, src_box, src_width0, src_height0,
- PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL);
+ PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL,
+ FALSE);
si_blitter_end(ctx);
pipe_surface_reference(&dst_view, NULL);
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 4288e9b..fa6c15a 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2277,7 +2277,7 @@ static void tex_fetch_args(
unsigned sampler_index;
unsigned num_deriv_channels = 0;
bool has_offset = HAVE_LLVM >= 0x0305 ? inst->Texture.NumOffsets > 0 : false;
- LLVMValueRef res_ptr, samp_ptr;
+ LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
sampler_index = emit_data->inst->Src[sampler_src].Register.Index;
@@ -2293,9 +2293,19 @@ static void tex_fetch_args(
samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER);
samp_ptr = build_indexed_load_const(si_shader_ctx, samp_ptr, ind_index);
+
+ if (target == TGSI_TEXTURE_2D_MSAA ||
+ target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
+ ind_index = LLVMBuildAdd(gallivm->builder, ind_index,
+ lp_build_const_int32(gallivm,
+ SI_FMASK_TEX_OFFSET), "");
+ fmask_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
+ fmask_ptr = build_indexed_load_const(si_shader_ctx, res_ptr, ind_index);
+ }
} else {
res_ptr = si_shader_ctx->resources[sampler_index];
samp_ptr = si_shader_ctx->samplers[sampler_index];
+ fmask_ptr = si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index];
}
if (target == TGSI_TEXTURE_BUFFER) {
@@ -2493,7 +2503,7 @@ static void tex_fetch_args(
txf_emit_data.dst_type = LLVMVectorType(
LLVMInt32TypeInContext(gallivm->context), 4);
txf_emit_data.args[0] = lp_build_gather_values(gallivm, txf_address, txf_count);
- txf_emit_data.args[1] = si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index];
+ txf_emit_data.args[1] = fmask_ptr;
txf_emit_data.args[2] = lp_build_const_int32(gallivm, inst.Texture.Texture);
txf_emit_data.arg_count = 3;
@@ -2524,8 +2534,7 @@ static void tex_fetch_args(
* resource descriptor is 0 (invalid),
*/
LLVMValueRef fmask_desc =
- LLVMBuildBitCast(gallivm->builder,
- si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index],
+ LLVMBuildBitCast(gallivm->builder, fmask_ptr,
LLVMVectorType(uint_bld->elem_type, 8), "");
LLVMValueRef fmask_word1 =
@@ -3973,7 +3982,7 @@ static void si_dump_key(unsigned shader, union si_shader_key *key)
fprintf(stderr, " es_enabled_outputs = 0x%"PRIx64"\n",
key->vs.es_enabled_outputs);
fprintf(stderr, " as_es = %u\n", key->vs.as_es);
- fprintf(stderr, " as_es = %u\n", key->vs.as_ls);
+ fprintf(stderr, " as_ls = %u\n", key->vs.as_ls);
break;
case PIPE_SHADER_TESS_CTRL:
diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index 654c46f..3a63af8 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -270,6 +270,7 @@ struct vc4_context {
struct ra_regs *regs;
unsigned int reg_class_any;
+ unsigned int reg_class_a_or_b_or_acc;
unsigned int reg_class_r4_or_a;
unsigned int reg_class_a;
diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
index 7978ea1..5b43583 100644
--- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c
+++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
@@ -143,15 +143,6 @@ qir_opt_algebraic(struct vc4_compile *c)
case QOP_SEL_X_Y_ZC:
case QOP_SEL_X_Y_NS:
case QOP_SEL_X_Y_NC:
- if (qir_reg_equals(inst->src[0], inst->src[1])) {
- /* Turn "dst = (sf == x) ? a : a)" into
- * "dst = a"
- */
- replace_with_mov(c, inst, inst->src[1]);
- progress = true;
- break;
- }
-
if (is_zero(c, inst->src[1])) {
/* Replace references to a 0 uniform value
* with the SEL_X_0 equivalent.
@@ -207,6 +198,7 @@ qir_opt_algebraic(struct vc4_compile *c)
/* FADD(a, FSUB(0, b)) -> FSUB(a, b) */
if (inst->src[1].file == QFILE_TEMP &&
+ c->defs[inst->src[1].index] &&
c->defs[inst->src[1].index]->op == QOP_FSUB) {
struct qinst *fsub = c->defs[inst->src[1].index];
if (is_zero(c, fsub->src[0])) {
@@ -221,6 +213,7 @@ qir_opt_algebraic(struct vc4_compile *c)
/* FADD(FSUB(0, b), a) -> FSUB(a, b) */
if (inst->src[0].file == QFILE_TEMP &&
+ c->defs[inst->src[0].index] &&
c->defs[inst->src[0].index]->op == QOP_FSUB) {
struct qinst *fsub = c->defs[inst->src[0].index];
if (is_zero(c, fsub->src[0])) {
@@ -236,18 +229,20 @@ qir_opt_algebraic(struct vc4_compile *c)
break;
case QOP_FMUL:
- if (replace_x_0_with_0(c, inst, 0) ||
- replace_x_0_with_0(c, inst, 1) ||
- fmul_replace_one(c, inst, 0) ||
- fmul_replace_one(c, inst, 1)) {
+ if (!inst->dst.pack &&
+ (replace_x_0_with_0(c, inst, 0) ||
+ replace_x_0_with_0(c, inst, 1) ||
+ fmul_replace_one(c, inst, 0) ||
+ fmul_replace_one(c, inst, 1))) {
progress = true;
break;
}
break;
case QOP_MUL24:
- if (replace_x_0_with_0(c, inst, 0) ||
- replace_x_0_with_0(c, inst, 1)) {
+ if (!inst->dst.pack &&
+ (replace_x_0_with_0(c, inst, 0) ||
+ replace_x_0_with_0(c, inst, 1))) {
progress = true;
break;
}
@@ -280,6 +275,14 @@ qir_opt_algebraic(struct vc4_compile *c)
}
break;
+ case QOP_RCP:
+ if (is_1f(c, inst->src[0])) {
+ replace_with_mov(c, inst, inst->src[0]);
+ progress = true;
+ break;
+ }
+ break;
+
default:
break;
}
diff --git a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
index a755de9..fd2539a 100644
--- a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
+++ b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
@@ -39,21 +39,27 @@ qir_opt_copy_propagation(struct vc4_compile *c)
{
bool progress = false;
bool debug = false;
- struct qreg *movs = calloc(c->num_temps, sizeof(struct qreg));
list_for_each_entry(struct qinst, inst, &c->instructions, link) {
for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
int index = inst->src[i].index;
if (inst->src[i].file == QFILE_TEMP &&
- (movs[index].file == QFILE_TEMP ||
- movs[index].file == QFILE_UNIF)) {
+ c->defs[index] &&
+ c->defs[index]->op == QOP_MOV &&
+ (c->defs[index]->src[0].file == QFILE_TEMP ||
+ c->defs[index]->src[0].file == QFILE_UNIF)) {
+ /* If it has a pack, it shouldn't be an SSA
+ * def.
+ */
+ assert(!c->defs[index]->dst.pack);
+
if (debug) {
fprintf(stderr, "Copy propagate: ");
qir_dump_inst(c, inst);
fprintf(stderr, "\n");
}
- inst->src[i] = movs[index];
+ inst->src[i] = c->defs[index]->src[0];
if (debug) {
fprintf(stderr, "to: ");
@@ -64,14 +70,6 @@ qir_opt_copy_propagation(struct vc4_compile *c)
progress = true;
}
}
-
- if (inst->op == QOP_MOV &&
- inst->dst.file == QFILE_TEMP &&
- inst->src[0].file != QFILE_VPM) {
- movs[inst->dst.index] = inst->src[0];
- }
}
-
- free(movs);
return progress;
}
diff --git a/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c b/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c
index e04f028..f2cdf8f 100644
--- a/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c
+++ b/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c
@@ -68,7 +68,7 @@ qir_opt_vpm_writes(struct vc4_compile *c)
continue;
struct qinst *inst = c->defs[temp];
- if (qir_is_multi_instruction(inst))
+ if (!inst || qir_is_multi_instruction(inst))
continue;
if (qir_depends_on_flags(inst) || inst->sf)
@@ -79,22 +79,6 @@ qir_opt_vpm_writes(struct vc4_compile *c)
continue;
}
- /* A QOP_TEX_RESULT destination is r4, so we can't move
- * accesses to it past another QOP_TEX_RESULT which would
- * update it.
- */
- int src;
- for (src = 0; src < qir_get_op_nsrc(inst->op); src++) {
- if (inst->src[src].file == QFILE_TEMP) {
- if (c->defs[inst->src[src].index]->op ==
- QOP_TEX_RESULT) {
- break;
- }
- }
- }
- if (src != qir_get_op_nsrc(inst->op))
- continue;
-
/* Move the generating instruction to the end of the program
* to maintain the order of the VPM writes.
*/
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 13c4721..e002983 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -818,6 +818,72 @@ declare_uniform_range(struct vc4_compile *c, uint32_t start, uint32_t size)
c->ubo_ranges[array_id].used = false;
}
+static bool
+ntq_src_is_only_ssa_def_user(nir_src *src)
+{
+ if (!src->is_ssa)
+ return false;
+
+ if (!list_empty(&src->ssa->if_uses))
+ return false;
+
+ return (src->ssa->uses.next == &src->use_link &&
+ src->ssa->uses.next->next == &src->ssa->uses);
+}
+
+/**
+ * In general, emits a nir_pack_unorm_4x8 as a series of MOVs with the pack
+ * bit set.
+ *
+ * However, as an optimization, it tries to find the instructions generating
+ * the sources to be packed and just emit the pack flag there, if possible.
+ */
+static void
+ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
+{
+ struct qreg result = qir_get_temp(c);
+ struct nir_alu_instr *vec4 = NULL;
+
+ /* If packing from a vec4 op (as expected), identify it so that we can
+ * peek back at what generated its sources.
+ */
+ if (instr->src[0].src.is_ssa &&
+ instr->src[0].src.ssa->parent_instr->type == nir_instr_type_alu &&
+ nir_instr_as_alu(instr->src[0].src.ssa->parent_instr)->op ==
+ nir_op_vec4) {
+ vec4 = nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
+ }
+
+ for (int i = 0; i < 4; i++) {
+ int swiz = instr->src[0].swizzle[i];
+ struct qreg src;
+ if (vec4) {
+ src = ntq_get_src(c, vec4->src[swiz].src,
+ vec4->src[swiz].swizzle[0]);
+ } else {
+ src = ntq_get_src(c, instr->src[0].src, swiz);
+ }
+
+ if (vec4 &&
+ ntq_src_is_only_ssa_def_user(&vec4->src[swiz].src) &&
+ src.file == QFILE_TEMP &&
+ c->defs[src.index] &&
+ qir_is_mul(c->defs[src.index]) &&
+ !c->defs[src.index]->dst.pack) {
+ struct qinst *rewrite = c->defs[src.index];
+ c->defs[src.index] = NULL;
+ rewrite->dst = result;
+ rewrite->dst.pack = QPU_PACK_MUL_8A + i;
+ continue;
+ }
+
+ qir_PACK_8_F(c, result, src, i);
+ }
+
+ struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
+ *dest = result;
+}
+
static void
ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
{
@@ -839,17 +905,7 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
}
if (instr->op == nir_op_pack_unorm_4x8) {
- struct qreg result;
- for (int i = 0; i < 4; i++) {
- struct qreg src = ntq_get_src(c, instr->src[0].src,
- instr->src[0].swizzle[i]);
- if (i == 0)
- result = qir_PACK_8888_F(c, src);
- else
- result = qir_PACK_8_F(c, result, src, i);
- }
- struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
- *dest = result;
+ ntq_emit_pack_unorm_4x8(c, instr);
return;
}
@@ -1130,20 +1186,24 @@ emit_frag_end(struct vc4_compile *c)
static void
emit_scaled_viewport_write(struct vc4_compile *c, struct qreg rcp_w)
{
- struct qreg xyi[2];
+ struct qreg packed = qir_get_temp(c);
for (int i = 0; i < 2; i++) {
struct qreg scale =
qir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, 0);
- xyi[i] = qir_FTOI(c, qir_FMUL(c,
- qir_FMUL(c,
- c->outputs[c->output_position_index + i],
- scale),
- rcp_w));
+ struct qreg packed_chan = packed;
+ packed_chan.pack = QPU_PACK_A_16A + i;
+
+ qir_FTOI_dest(c, packed_chan,
+ qir_FMUL(c,
+ qir_FMUL(c,
+ c->outputs[c->output_position_index + i],
+ scale),
+ rcp_w));
}
- qir_VPM_WRITE(c, qir_PACK_SCALED(c, xyi[0], xyi[1]));
+ qir_VPM_WRITE(c, packed);
}
static void
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index 254140a..9d93071 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -71,12 +71,11 @@ static const struct qir_op_info qir_op_info[] = {
[QOP_RSQ] = { "rsq", 1, 1, false, true },
[QOP_EXP2] = { "exp2", 1, 2, false, true },
[QOP_LOG2] = { "log2", 1, 2, false, true },
- [QOP_PACK_8888_F] = { "pack_8888_f", 1, 1, false, true },
- [QOP_PACK_8A_F] = { "pack_8a_f", 1, 2, false, true },
- [QOP_PACK_8B_F] = { "pack_8b_f", 1, 2, false, true },
- [QOP_PACK_8C_F] = { "pack_8c_f", 1, 2, false, true },
- [QOP_PACK_8D_F] = { "pack_8d_f", 1, 2, false, true },
- [QOP_PACK_SCALED] = { "pack_scaled", 1, 2, false, true },
+ [QOP_PACK_8888_F] = { "pack_8888_f", 1, 1 },
+ [QOP_PACK_8A_F] = { "pack_8a_f", 1, 1 },
+ [QOP_PACK_8B_F] = { "pack_8b_f", 1, 1 },
+ [QOP_PACK_8C_F] = { "pack_8c_f", 1, 1 },
+ [QOP_PACK_8D_F] = { "pack_8d_f", 1, 1 },
[QOP_TLB_DISCARD_SETUP] = { "discard", 0, 1, true },
[QOP_TLB_STENCIL_SETUP] = { "tlb_stencil_setup", 0, 1, true },
[QOP_TLB_Z_WRITE] = { "tlb_z", 0, 1, true },
@@ -169,6 +168,18 @@ qir_is_multi_instruction(struct qinst *inst)
}
bool
+qir_is_mul(struct qinst *inst)
+{
+ switch (inst->op) {
+ case QOP_FMUL:
+ case QOP_MUL24:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool
qir_is_tex(struct qinst *inst)
{
return inst->op >= QOP_TEX_S && inst->op <= QOP_TEX_DIRECT;
@@ -273,6 +284,14 @@ qir_dump_inst(struct vc4_compile *c, struct qinst *inst)
inst->sf ? ".sf" : "");
qir_print_reg(c, inst->dst, true);
+ if (inst->dst.pack) {
+ if (inst->dst.pack) {
+ if (qir_is_mul(inst))
+ vc4_qpu_disasm_pack_mul(stderr, inst->dst.pack);
+ else
+ vc4_qpu_disasm_pack_a(stderr, inst->dst.pack);
+ }
+ }
for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
fprintf(stderr, ", ");
qir_print_reg(c, inst->src[i], false);
@@ -348,7 +367,7 @@ qir_emit(struct vc4_compile *c, struct qinst *inst)
if (inst->dst.file == QFILE_TEMP)
c->defs[inst->dst.index] = inst;
- list_addtail(&inst->link, &c->instructions);
+ qir_emit_nodef(c, inst);
}
bool
@@ -389,8 +408,11 @@ qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst)
struct qreg
qir_follow_movs(struct vc4_compile *c, struct qreg reg)
{
- while (reg.file == QFILE_TEMP && c->defs[reg.index]->op == QOP_MOV)
+ while (reg.file == QFILE_TEMP &&
+ c->defs[reg.index] &&
+ c->defs[reg.index]->op == QOP_MOV) {
reg = c->defs[reg.index]->src[0];
+ }
return reg;
}
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index cade795..a2b21fa 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -58,6 +58,7 @@ enum qfile {
struct qreg {
enum qfile file;
uint32_t index;
+ int pack;
};
enum qop {
@@ -104,7 +105,6 @@ enum qop {
QOP_LOG2,
QOP_VW_SETUP,
QOP_VR_SETUP,
- QOP_PACK_SCALED,
QOP_PACK_8888_F,
QOP_PACK_8A_F,
QOP_PACK_8B_F,
@@ -444,13 +444,20 @@ struct qreg qir_uniform(struct vc4_compile *c,
enum quniform_contents contents,
uint32_t data);
void qir_reorder_uniforms(struct vc4_compile *c);
+
void qir_emit(struct vc4_compile *c, struct qinst *inst);
+static inline void qir_emit_nodef(struct vc4_compile *c, struct qinst *inst)
+{
+ list_addtail(&inst->link, &c->instructions);
+}
+
struct qreg qir_get_temp(struct vc4_compile *c);
int qir_get_op_nsrc(enum qop qop);
bool qir_reg_equals(struct qreg a, struct qreg b);
bool qir_has_side_effects(struct vc4_compile *c, struct qinst *inst);
bool qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst);
bool qir_is_multi_instruction(struct qinst *inst);
+bool qir_is_mul(struct qinst *inst);
bool qir_is_tex(struct qinst *inst);
bool qir_depends_on_flags(struct qinst *inst);
bool qir_writes_r4(struct qinst *inst);
@@ -509,6 +516,12 @@ qir_##name(struct vc4_compile *c, struct qreg a) \
struct qreg t = qir_get_temp(c); \
qir_emit(c, qir_inst(QOP_##name, t, a, c->undef)); \
return t; \
+} \
+static inline void \
+qir_##name##_dest(struct vc4_compile *c, struct qreg dest, \
+ struct qreg a) \
+{ \
+ qir_emit_nodef(c, qir_inst(QOP_##name, dest, a, c->undef)); \
}
#define QIR_ALU2(name) \
@@ -518,6 +531,12 @@ qir_##name(struct vc4_compile *c, struct qreg a, struct qreg b) \
struct qreg t = qir_get_temp(c); \
qir_emit(c, qir_inst(QOP_##name, t, a, b)); \
return t; \
+} \
+static inline void \
+qir_##name##_dest(struct vc4_compile *c, struct qreg dest, \
+ struct qreg a, struct qreg b) \
+{ \
+ qir_emit_nodef(c, qir_inst(QOP_##name, dest, a, b)); \
}
#define QIR_NODST_1(name) \
@@ -534,6 +553,14 @@ qir_##name(struct vc4_compile *c, struct qreg a, struct qreg b) \
qir_emit(c, qir_inst(QOP_##name, c->undef, a, b)); \
}
+#define QIR_PACK(name) \
+static inline struct qreg \
+qir_##name(struct vc4_compile *c, struct qreg dest, struct qreg a) \
+{ \
+ qir_emit_nodef(c, qir_inst(QOP_##name, dest, a, c->undef)); \
+ return dest; \
+}
+
QIR_ALU1(MOV)
QIR_ALU2(FADD)
QIR_ALU2(FSUB)
@@ -570,12 +597,11 @@ QIR_ALU1(RCP)
QIR_ALU1(RSQ)
QIR_ALU1(EXP2)
QIR_ALU1(LOG2)
-QIR_ALU2(PACK_SCALED)
QIR_ALU1(PACK_8888_F)
-QIR_ALU2(PACK_8A_F)
-QIR_ALU2(PACK_8B_F)
-QIR_ALU2(PACK_8C_F)
-QIR_ALU2(PACK_8D_F)
+QIR_PACK(PACK_8A_F)
+QIR_PACK(PACK_8B_F)
+QIR_PACK(PACK_8C_F)
+QIR_PACK(PACK_8D_F)
QIR_ALU1(VARY_ADD_C)
QIR_NODST_2(TEX_S)
QIR_NODST_2(TEX_T)
@@ -627,11 +653,12 @@ qir_UNPACK_16_I(struct vc4_compile *c, struct qreg src, int i)
}
static inline struct qreg
-qir_PACK_8_F(struct vc4_compile *c, struct qreg rest, struct qreg val, int chan)
+qir_PACK_8_F(struct vc4_compile *c, struct qreg dest, struct qreg val, int chan)
{
- struct qreg t = qir_get_temp(c);
- qir_emit(c, qir_inst(QOP_PACK_8A_F + chan, t, rest, val));
- return t;
+ qir_emit(c, qir_inst(QOP_PACK_8A_F + chan, dest, val, c->undef));
+ if (dest.file == QFILE_TEMP)
+ c->defs[dest.index] = NULL;
+ return dest;
}
static inline struct qreg
diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h
index fbb90ba..0719d28 100644
--- a/src/gallium/drivers/vc4/vc4_qpu.h
+++ b/src/gallium/drivers/vc4/vc4_qpu.h
@@ -24,6 +24,7 @@
#ifndef VC4_QPU_H
#define VC4_QPU_H
+#include <stdio.h>
#include <stdint.h>
#include "util/u_math.h"
@@ -206,6 +207,12 @@ void
vc4_qpu_disasm(const uint64_t *instructions, int num_instructions);
void
+vc4_qpu_disasm_pack_mul(FILE *out, uint32_t pack);
+
+void
+vc4_qpu_disasm_pack_a(FILE *out, uint32_t pack);
+
+void
vc4_qpu_validate(uint64_t *insts, uint32_t num_inst);
#endif /* VC4_QPU_H */
diff --git a/src/gallium/drivers/vc4/vc4_qpu_disasm.c b/src/gallium/drivers/vc4/vc4_qpu_disasm.c
index 00aeb30..0879787 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_disasm.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_disasm.c
@@ -245,6 +245,18 @@ get_special_write_desc(int reg, bool is_a)
return special_write[reg];
}
+void
+vc4_qpu_disasm_pack_mul(FILE *out, uint32_t pack)
+{
+ fprintf(out, ".%s", DESC(qpu_pack_mul, pack));
+}
+
+void
+vc4_qpu_disasm_pack_a(FILE *out, uint32_t pack)
+{
+ fprintf(out, "%s", DESC(qpu_pack_a, pack));
+}
+
static void
print_alu_dst(uint64_t inst, bool is_mul)
{
@@ -263,9 +275,9 @@ print_alu_dst(uint64_t inst, bool is_mul)
fprintf(stderr, "%s%d?", file, waddr);
if (is_mul && (inst & QPU_PM)) {
- fprintf(stderr, ".%s", DESC(qpu_pack_mul, pack));
+ vc4_qpu_disasm_pack_mul(stderr, pack);
} else if (is_a && !(inst & QPU_PM)) {
- fprintf(stderr, "%s", DESC(qpu_pack_a, pack));
+ vc4_qpu_disasm_pack_a(stderr, pack);
}
}
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index f324056..adf3a8b 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -179,10 +179,9 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
static const struct {
uint32_t op;
- bool is_mul;
} translate[] = {
-#define A(name) [QOP_##name] = {QPU_A_##name, false}
-#define M(name) [QOP_##name] = {QPU_M_##name, true}
+#define A(name) [QOP_##name] = {QPU_A_##name}
+#define M(name) [QOP_##name] = {QPU_M_##name}
A(FADD),
A(FSUB),
A(FMIN),
@@ -336,28 +335,12 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
case QOP_PACK_8B_F:
case QOP_PACK_8C_F:
case QOP_PACK_8D_F:
- /* If dst doesn't happen to already contain src[0],
- * then we have to move it in.
- */
- if (qinst->src[0].file != QFILE_NULL &&
- (src[0].mux != dst.mux || src[0].addr != dst.addr)) {
- /* Don't overwrite src1 while setting up
- * the dst!
- */
- if (dst.mux == src[1].mux &&
- dst.addr == src[1].addr) {
- queue(c, qpu_m_MOV(qpu_rb(31), src[1]));
- src[1] = qpu_rb(31);
- }
-
- queue(c, qpu_m_MOV(dst, src[0]));
- }
-
- queue(c, qpu_m_MOV(dst, src[1]));
- *last_inst(c) |= QPU_PM;
- *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A +
- qinst->op - QOP_PACK_8A_F,
- QPU_PACK);
+ queue(c,
+ qpu_m_MOV(dst, src[0]) |
+ QPU_PM |
+ QPU_SET_FIELD(QPU_PACK_MUL_8A +
+ qinst->op - QOP_PACK_8A_F,
+ QPU_PACK));
break;
case QOP_FRAG_X:
@@ -419,24 +402,6 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
break;
- case QOP_PACK_SCALED: {
- uint64_t a = (qpu_a_MOV(dst, src[0]) |
- QPU_SET_FIELD(QPU_PACK_A_16A,
- QPU_PACK));
- uint64_t b = (qpu_a_MOV(dst, src[1]) |
- QPU_SET_FIELD(QPU_PACK_A_16B,
- QPU_PACK));
-
- if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
- queue(c, b);
- queue(c, a);
- } else {
- queue(c, a);
- queue(c, b);
- }
- break;
- }
-
case QOP_TEX_S:
case QOP_TEX_T:
case QOP_TEX_R:
@@ -529,14 +494,24 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
fixup_raddr_conflict(c, dst, &src[0], &src[1]);
- if (translate[qinst->op].is_mul) {
+ if (qir_is_mul(qinst)) {
queue(c, qpu_m_alu2(translate[qinst->op].op,
dst,
src[0], src[1]));
+ if (qinst->dst.pack) {
+ *last_inst(c) |= QPU_PM;
+ *last_inst(c) |= QPU_SET_FIELD(qinst->dst.pack,
+ QPU_PACK);
+ }
} else {
queue(c, qpu_a_alu2(translate[qinst->op].op,
dst,
src[0], src[1]));
+ if (qinst->dst.pack) {
+ assert(dst.mux == QPU_MUX_A);
+ *last_inst(c) |= QPU_SET_FIELD(qinst->dst.pack,
+ QPU_PACK);
+ }
}
break;
diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c
index a29db1f..3ced50f 100644
--- a/src/gallium/drivers/vc4/vc4_register_allocate.c
+++ b/src/gallium/drivers/vc4/vc4_register_allocate.c
@@ -113,9 +113,10 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
if (vc4->regs)
return;
- vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs));
+ vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs), true);
vc4->reg_class_any = ra_alloc_reg_class(vc4->regs);
+ vc4->reg_class_a_or_b_or_acc = ra_alloc_reg_class(vc4->regs);
vc4->reg_class_r4_or_a = ra_alloc_reg_class(vc4->regs);
vc4->reg_class_a = ra_alloc_reg_class(vc4->regs);
for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) {
@@ -130,10 +131,12 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
*/
if (vc4_regs[i].mux == QPU_MUX_R4) {
ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i);
+ ra_class_add_reg(vc4->regs, vc4->reg_class_any, i);
continue;
}
ra_class_add_reg(vc4->regs, vc4->reg_class_any, i);
+ ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc, i);
}
for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) {
@@ -177,7 +180,8 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
uint8_t class_bits[c->num_temps];
struct qpu_reg *temp_registers = calloc(c->num_temps,
sizeof(*temp_registers));
- memset(def, 0, sizeof(def));
+ for (int i = 0; i < ARRAY_SIZE(def); i++)
+ def[i] = ~0;
memset(use, 0, sizeof(use));
/* If things aren't ever written (undefined values), just read from
@@ -196,7 +200,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
uint32_t ip = 0;
list_for_each_entry(struct qinst, inst, &c->instructions, link) {
if (inst->dst.file == QFILE_TEMP) {
- def[inst->dst.index] = ip;
+ def[inst->dst.index] = MIN2(ip, def[inst->dst.index]);
use[inst->dst.index] = ip;
}
@@ -267,17 +271,33 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2);
break;
- case QOP_PACK_SCALED:
- /* The pack flags require an A-file dst register. */
- class_bits[inst->dst.index] &= CLASS_BIT_A;
- break;
-
default:
break;
}
+ if (inst->dst.pack && !qir_is_mul(inst)) {
+ /* The non-MUL pack flags require an A-file dst
+ * register.
+ */
+ class_bits[inst->dst.index] &= CLASS_BIT_A;
+ }
+
if (qir_src_needs_a_file(inst)) {
- class_bits[inst->src[0].index] &= CLASS_BIT_A;
+ switch (inst->op) {
+ case QOP_UNPACK_8A_F:
+ case QOP_UNPACK_8B_F:
+ case QOP_UNPACK_8C_F:
+ case QOP_UNPACK_8D_F:
+ /* Special case: these can be done as R4
+ * unpacks, as well.
+ */
+ class_bits[inst->src[0].index] &= (CLASS_BIT_A |
+ CLASS_BIT_R4);
+ break;
+ default:
+ class_bits[inst->src[0].index] &= CLASS_BIT_A;
+ break;
+ }
}
ip++;
}
@@ -287,9 +307,11 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
switch (class_bits[i]) {
case CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4:
- case CLASS_BIT_A | CLASS_BIT_B_OR_ACC:
ra_set_node_class(g, node, vc4->reg_class_any);
break;
+ case CLASS_BIT_A | CLASS_BIT_B_OR_ACC:
+ ra_set_node_class(g, node, vc4->reg_class_a_or_b_or_acc);
+ break;
case CLASS_BIT_A | CLASS_BIT_R4:
ra_set_node_class(g, node, vc4->reg_class_r4_or_a);
break;