diff options
Diffstat (limited to 'src/gallium/drivers')
69 files changed, 879 insertions, 435 deletions
diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h index c4516ba..dd48956 100644 --- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h +++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h @@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are: - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-05-20 20:03:14) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63728 bytes, from 2015-08-05 18:07:28) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67120 bytes, from 2015-08-14 23:22:03) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63915 bytes, from 2015-08-24 16:56:28) Copyright (C) 2013-2015 by the following authors: - Rob Clark <robdclark@gmail.com> (robclark) diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h index 8e8cf6a..441bfec 100644 --- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h +++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h @@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are: - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-05-20 20:03:14) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63728 bytes, from 2015-08-05 18:07:28) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67120 bytes, from 2015-08-14 23:22:03) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63915 bytes, from 2015-08-24 16:56:28) Copyright (C) 2013-2015 by the following authors: - Rob Clark <robdclark@gmail.com> (robclark) @@ -326,6 +326,13 @@ enum a3xx_tex_type { A3XX_TEX_3D = 3, }; +enum a3xx_tex_msaa { + A3XX_TPL1_MSAA1X = 0, + A3XX_TPL1_MSAA2X = 1, + A3XX_TPL1_MSAA4X = 2, + A3XX_TPL1_MSAA8X = 3, +}; + #define A3XX_INT0_RBBM_GPU_IDLE 0x00000001 #define A3XX_INT0_RBBM_AHB_ERROR 0x00000002 #define A3XX_INT0_RBBM_REG_TIMEOUT 0x00000004 @@ -2652,6 +2659,7 @@ static inline uint32_t A3XX_VGT_DRAW_INITIATOR_NUM_INSTANCES(uint32_t val) #define REG_A3XX_VGT_IMMED_DATA 0x000021fd #define REG_A3XX_TEX_SAMP_0 0x00000000 +#define A3XX_TEX_SAMP_0_CLAMPENABLE 0x00000001 #define A3XX_TEX_SAMP_0_MIPFILTER_LINEAR 0x00000002 #define A3XX_TEX_SAMP_0_XY_MAG__MASK 0x0000000c #define A3XX_TEX_SAMP_0_XY_MAG__SHIFT 2 @@ -2695,6 +2703,7 @@ static inline uint32_t A3XX_TEX_SAMP_0_COMPARE_FUNC(enum adreno_compare_func val { return ((val) << A3XX_TEX_SAMP_0_COMPARE_FUNC__SHIFT) & A3XX_TEX_SAMP_0_COMPARE_FUNC__MASK; } +#define A3XX_TEX_SAMP_0_CUBEMAPSEAMLESSFILTOFF 0x01000000 #define A3XX_TEX_SAMP_0_UNNORM_COORDS 0x80000000 #define REG_A3XX_TEX_SAMP_1 0x00000001 @@ -2750,6 +2759,12 @@ static inline uint32_t A3XX_TEX_CONST_0_MIPLVLS(uint32_t val) { return ((val) << A3XX_TEX_CONST_0_MIPLVLS__SHIFT) & A3XX_TEX_CONST_0_MIPLVLS__MASK; } +#define A3XX_TEX_CONST_0_MSAATEX__MASK 0x00300000 +#define A3XX_TEX_CONST_0_MSAATEX__SHIFT 20 +static inline uint32_t A3XX_TEX_CONST_0_MSAATEX(enum a3xx_tex_msaa val) +{ + return ((val) << A3XX_TEX_CONST_0_MSAATEX__SHIFT) & A3XX_TEX_CONST_0_MSAATEX__MASK; +} #define A3XX_TEX_CONST_0_FMT__MASK 0x1fc00000 #define A3XX_TEX_CONST_0_FMT__SHIFT 22 static inline uint32_t A3XX_TEX_CONST_0_FMT(enum a3xx_tex_fmt val) @@ -2785,7 +2800,7 @@ static inline uint32_t A3XX_TEX_CONST_1_FETCHSIZE(enum a3xx_tex_fetchsize val) } #define REG_A3XX_TEX_CONST_2 0x00000002 -#define A3XX_TEX_CONST_2_INDX__MASK 0x000000ff +#define A3XX_TEX_CONST_2_INDX__MASK 0x000001ff #define A3XX_TEX_CONST_2_INDX__SHIFT 0 static inline uint32_t A3XX_TEX_CONST_2_INDX(uint32_t val) { @@ -2805,7 +2820,7 @@ static inline uint32_t A3XX_TEX_CONST_2_SWAP(enum a3xx_color_swap val) } #define REG_A3XX_TEX_CONST_3 0x00000003 -#define A3XX_TEX_CONST_3_LAYERSZ1__MASK 0x00007fff +#define A3XX_TEX_CONST_3_LAYERSZ1__MASK 0x0001ffff #define A3XX_TEX_CONST_3_LAYERSZ1__SHIFT 0 static inline uint32_t A3XX_TEX_CONST_3_LAYERSZ1(uint32_t val) { diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_format.c b/src/gallium/drivers/freedreno/a3xx/fd3_format.c index ec87aa9..04cb9b9 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_format.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_format.c @@ -262,6 +262,15 @@ static struct fd3_format formats[PIPE_FORMAT_COUNT] = { _T(ETC2_R11_SNORM, ETC2_R11_SNORM, NONE, WZYX), _T(ETC2_RG11_UNORM, ETC2_RG11_UNORM, NONE, WZYX), _T(ETC2_RG11_SNORM, ETC2_RG11_SNORM, NONE, WZYX), + + _T(DXT1_RGB, DXT1, NONE, WZYX), + _T(DXT1_SRGB, DXT1, NONE, WZYX), + _T(DXT1_RGBA, DXT1, NONE, WZYX), + _T(DXT1_SRGBA, DXT1, NONE, WZYX), + _T(DXT3_RGBA, DXT3, NONE, WZYX), + _T(DXT3_SRGBA, DXT3, NONE, WZYX), + _T(DXT5_RGBA, DXT5, NONE, WZYX), + _T(DXT5_SRGBA, DXT5, NONE, WZYX), }; enum a3xx_vtx_fmt @@ -301,7 +310,7 @@ fd3_pipe2fetchsize(enum pipe_format format) { if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) format = PIPE_FORMAT_Z32_FLOAT; - switch (util_format_get_blocksizebits(format)) { + switch (util_format_get_blocksizebits(format) / util_format_get_blockwidth(format)) { case 8: return TFETCH_1_BYTE; case 16: return TFETCH_2_BYTE; case 32: return TFETCH_4_BYTE; diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c index 9c16804..583caaa 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c @@ -73,7 +73,7 @@ fd3_rasterizer_state_create(struct pipe_context *pctx, so->gras_su_poly_offset_scale = A3XX_GRAS_SU_POLY_OFFSET_SCALE_VAL(cso->offset_scale); so->gras_su_poly_offset_offset = - A3XX_GRAS_SU_POLY_OFFSET_OFFSET(cso->offset_units); + A3XX_GRAS_SU_POLY_OFFSET_OFFSET(cso->offset_units * 2.0f); so->gras_su_mode_control = A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(cso->line_width/2.0); diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c index c30658d..2d6ecb2 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c @@ -115,6 +115,7 @@ fd3_sampler_state_create(struct pipe_context *pctx, so->texsamp0 = COND(!cso->normalized_coords, A3XX_TEX_SAMP_0_UNNORM_COORDS) | + COND(!cso->seamless_cube_map, A3XX_TEX_SAMP_0_CUBEMAPSEAMLESSFILTOFF) | COND(miplinear, A3XX_TEX_SAMP_0_MIPFILTER_LINEAR) | A3XX_TEX_SAMP_0_XY_MAG(tex_filter(cso->mag_img_filter, aniso)) | A3XX_TEX_SAMP_0_XY_MIN(tex_filter(cso->min_img_filter, aniso)) | @@ -239,7 +240,7 @@ fd3_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc, A3XX_TEX_CONST_1_HEIGHT(u_minify(prsc->height0, lvl)); /* when emitted, A3XX_TEX_CONST_2_INDX() must be OR'd in: */ so->texconst2 = - A3XX_TEX_CONST_2_PITCH(rsc->slices[lvl].pitch * rsc->cpp); + A3XX_TEX_CONST_2_PITCH(util_format_get_nblocksx(cso->format, rsc->slices[lvl].pitch) * rsc->cpp); switch (prsc->target) { case PIPE_TEXTURE_1D_ARRAY: case PIPE_TEXTURE_2D_ARRAY: diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h index 563f70a..2e1d712 100644 --- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h +++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h @@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are: - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-05-20 20:03:14) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63728 bytes, from 2015-08-05 18:07:28) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67120 bytes, from 2015-08-14 23:22:03) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63915 bytes, from 2015-08-24 16:56:28) Copyright (C) 2013-2015 by the following authors: - Rob Clark <robdclark@gmail.com> (robclark) @@ -162,10 +162,13 @@ enum a4xx_tex_fmt { TFMT4_8_UNORM = 4, TFMT4_8_8_UNORM = 14, TFMT4_8_8_8_8_UNORM = 28, + TFMT4_8_SNORM = 5, TFMT4_8_8_SNORM = 15, TFMT4_8_8_8_8_SNORM = 29, + TFMT4_8_UINT = 6, TFMT4_8_8_UINT = 16, TFMT4_8_8_8_8_UINT = 30, + TFMT4_8_SINT = 7, TFMT4_8_8_SINT = 17, TFMT4_8_8_8_8_SINT = 31, TFMT4_16_UINT = 21, @@ -430,7 +433,7 @@ static inline uint32_t A4XX_RB_MRT_BUF_INFO_COLOR_SWAP(enum a3xx_color_swap val) return ((val) << A4XX_RB_MRT_BUF_INFO_COLOR_SWAP__SHIFT) & A4XX_RB_MRT_BUF_INFO_COLOR_SWAP__MASK; } #define A4XX_RB_MRT_BUF_INFO_COLOR_SRGB 0x00002000 -#define A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH__MASK 0x007fc000 +#define A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH__MASK 0xffffc000 #define A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH__SHIFT 14 static inline uint32_t A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(uint32_t val) { @@ -440,7 +443,7 @@ static inline uint32_t A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(uint32_t val) static inline uint32_t REG_A4XX_RB_MRT_BASE(uint32_t i0) { return 0x000020a6 + 0x5*i0; } static inline uint32_t REG_A4XX_RB_MRT_CONTROL3(uint32_t i0) { return 0x000020a7 + 0x5*i0; } -#define A4XX_RB_MRT_CONTROL3_STRIDE__MASK 0x0001fff8 +#define A4XX_RB_MRT_CONTROL3_STRIDE__MASK 0x03fffff8 #define A4XX_RB_MRT_CONTROL3_STRIDE__SHIFT 3 static inline uint32_t A4XX_RB_MRT_CONTROL3_STRIDE(uint32_t val) { @@ -1460,6 +1463,7 @@ static inline uint32_t A4XX_SP_FS_MRT_REG_MRTFORMAT(enum a4xx_color_fmt val) { return ((val) << A4XX_SP_FS_MRT_REG_MRTFORMAT__SHIFT) & A4XX_SP_FS_MRT_REG_MRTFORMAT__MASK; } +#define A4XX_SP_FS_MRT_REG_COLOR_SRGB 0x00040000 #define REG_A4XX_SP_CS_CTRL_REG0 0x00002300 diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h index ab7850e..3a1d4b6 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h @@ -56,6 +56,7 @@ struct fd4_emit { uint32_t sprite_coord_enable; /* bitmask */ bool sprite_coord_mode; bool rasterflat; + bool no_decode_srgb; /* cached to avoid repeated lookups of same variants: */ struct ir3_shader_variant *vp, *fp; diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_format.c b/src/gallium/drivers/freedreno/a4xx/fd4_format.c index 3e00454..6c9e217 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_format.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_format.c @@ -79,9 +79,9 @@ struct fd4_format { static struct fd4_format formats[PIPE_FORMAT_COUNT] = { /* 8-bit */ VT(R8_UNORM, 8_UNORM, R8_UNORM, WZYX), - V_(R8_SNORM, 8_SNORM, NONE, WZYX), - V_(R8_UINT, 8_UINT, NONE, WZYX), - V_(R8_SINT, 8_SINT, NONE, WZYX), + VT(R8_SNORM, 8_SNORM, NONE, WZYX), + VT(R8_UINT, 8_UINT, NONE, WZYX), + VT(R8_SINT, 8_SINT, NONE, WZYX), V_(R8_USCALED, 8_UINT, NONE, WZYX), V_(R8_SSCALED, 8_UINT, NONE, WZYX), @@ -115,8 +115,8 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = { VT(R8G8_UNORM, 8_8_UNORM, R8G8_UNORM, WZYX), VT(R8G8_SNORM, 8_8_SNORM, R8G8_SNORM, WZYX), - VT(R8G8_UINT, 8_8_UINT, NONE, WZYX), - VT(R8G8_SINT, 8_8_SINT, NONE, WZYX), + VT(R8G8_UINT, 8_8_UINT, R8G8_UINT, WZYX), + VT(R8G8_SINT, 8_8_SINT, R8G8_SINT, WZYX), V_(R8G8_USCALED, 8_8_UINT, NONE, WZYX), V_(R8G8_SSCALED, 8_8_SINT, NONE, WZYX), diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c index 81c37f7..3f8bbf3 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c @@ -46,7 +46,8 @@ static void emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, - struct pipe_surface **bufs, uint32_t *bases, uint32_t bin_w) + struct pipe_surface **bufs, uint32_t *bases, + uint32_t bin_w, bool decode_srgb) { enum a4xx_tile_mode tile_mode; unsigned i; @@ -60,6 +61,7 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { enum a4xx_color_fmt format = 0; enum a3xx_color_swap swap = WZYX; + bool srgb = false; struct fd_resource *rsc = NULL; struct fd_resource_slice *slice = NULL; uint32_t stride = 0; @@ -68,10 +70,9 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, if ((i < nr_bufs) && bufs[i]) { struct pipe_surface *psurf = bufs[i]; - enum pipe_format pformat = 0; + enum pipe_format pformat = psurf->format; rsc = fd_resource(psurf->texture); - pformat = psurf->format; /* In case we're drawing to Z32F_S8, the "color" actually goes to * the stencil @@ -86,6 +87,11 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, format = fd4_pipe2color(pformat); swap = fd4_pipe2swap(pformat); + if (decode_srgb) + srgb = util_format_is_srgb(pformat); + else + pformat = util_format_linear(pformat); + debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); offset = fd_resource_offset(rsc, psurf->u.tex.level, @@ -108,7 +114,8 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, OUT_RING(ring, A4XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format) | A4XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) | A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(stride) | - A4XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap)); + A4XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap) | + COND(srgb, A4XX_RB_MRT_BUF_INFO_COLOR_SRGB)); if (bin_w || (i >= nr_bufs) || !bufs[i]) { OUT_RING(ring, base); OUT_RING(ring, A4XX_RB_MRT_CONTROL3_STRIDE(stride)); @@ -282,7 +289,7 @@ emit_mem2gmem_surf(struct fd_context *ctx, uint32_t *bases, struct fd_ringbuffer *ring = ctx->ring; struct pipe_surface *zsbufs[2]; - emit_mrt(ring, nr_bufs, bufs, bases, bin_w); + emit_mrt(ring, nr_bufs, bufs, bases, bin_w, false); if (bufs[0] && (bufs[0]->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)) { /* The gmem_restore_tex logic will put the first buffer's stencil @@ -315,6 +322,7 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile) .key = { .half_precision = fd_half_precision(pfb), }, + .no_decode_srgb = true, }; unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0}; float x0, y0, x1, y1; @@ -520,7 +528,7 @@ fd4_emit_sysmem_prep(struct fd_context *ctx) OUT_RING(ring, A4XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | A4XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); - emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0); + emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0, true); /* setup scissor/offset for current tile: */ OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1); @@ -677,7 +685,7 @@ fd4_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile) OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1)); OUT_RING(ring, CP_SET_BIN_2_X2(x2) | CP_SET_BIN_2_Y2(y2)); - emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem->cbuf_base, gmem->bin_w); + emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem->cbuf_base, gmem->bin_w, true); /* setup scissor/offset for current tile: */ OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c index 1a6d014..a3d7123 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c @@ -450,10 +450,15 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, OUT_PKT0(ring, REG_A4XX_SP_FS_MRT_REG(0), 8); for (i = 0; i < 8; i++) { enum a4xx_color_fmt format = 0; - if (i < nr) + bool srgb = false; + if (i < nr) { format = fd4_emit_format(bufs[i]); + if (bufs[i] && !emit->no_decode_srgb) + srgb = util_format_is_srgb(bufs[i]->format); + } OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(color_regid[i]) | A4XX_SP_FS_MRT_REG_MRTFORMAT(format) | + COND(srgb, A4XX_SP_FS_MRT_REG_COLOR_SRGB) | COND(emit->key.half_precision, A4XX_SP_FS_MRT_REG_HALF_PRECISION)); } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c index d2bc5fe..213b29c 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c @@ -187,9 +187,9 @@ fd4_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc, A4XX_TEX_CONST_3_LAYERSZ(rsc->layer_size); break; case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_CUBE_ARRAY: /* ?? not sure about _CUBE_ARRAY */ + case PIPE_TEXTURE_CUBE_ARRAY: so->texconst3 = - A4XX_TEX_CONST_3_DEPTH(1) | + A4XX_TEX_CONST_3_DEPTH(prsc->array_size / 6) | A4XX_TEX_CONST_3_LAYERSZ(rsc->layer_size); break; case PIPE_TEXTURE_3D: diff --git a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h index 00b6acb..29944b7 100644 --- a/src/gallium/drivers/freedreno/adreno_common.xml.h +++ b/src/gallium/drivers/freedreno/adreno_common.xml.h @@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are: - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-05-20 20:03:14) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63728 bytes, from 2015-08-05 18:07:28) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67120 bytes, from 2015-08-14 23:22:03) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63915 bytes, from 2015-08-24 16:56:28) Copyright (C) 2013-2015 by the following authors: - Rob Clark <robdclark@gmail.com> (robclark) diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h index 98a90e2..432dce3 100644 --- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h +++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h @@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are: - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10551 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 66709 bytes, from 2015-05-20 20:03:14) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63728 bytes, from 2015-08-05 18:07:28) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67120 bytes, from 2015-08-14 23:22:03) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63915 bytes, from 2015-08-24 16:56:28) Copyright (C) 2013-2015 by the following authors: - Rob Clark <robdclark@gmail.com> (robclark) diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c index 8e6d431..0b6b9fb 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.c +++ b/src/gallium/drivers/freedreno/freedreno_context.c @@ -131,11 +131,13 @@ static void fd_context_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence, unsigned flags) { + struct fd_ringbuffer *ring = fd_context(pctx)->ring; + fd_context_render(pctx); if (fence) { fd_screen_fence_ref(pctx->screen, fence, NULL); - *fence = fd_fence_create(pctx); + *fence = fd_fence_create(pctx, fd_ringbuffer_timestamp(ring)); } } diff --git a/src/gallium/drivers/freedreno/freedreno_fence.c b/src/gallium/drivers/freedreno/freedreno_fence.c index 04a9fea..5125f09 100644 --- a/src/gallium/drivers/freedreno/freedreno_fence.c +++ b/src/gallium/drivers/freedreno/freedreno_fence.c @@ -50,35 +50,18 @@ fd_screen_fence_ref(struct pipe_screen *pscreen, *ptr = pfence; } -/* TODO we need to spiff out libdrm_freedreno a bit to allow passing - * the timeout.. and maybe a better way to check if fence has been - * signaled. The current implementation is a bit lame for now to - * avoid bumping libdrm version requirement. - */ - -boolean fd_screen_fence_signalled(struct pipe_screen *screen, - struct pipe_fence_handle *fence) -{ - uint32_t timestamp = fd_ringbuffer_timestamp(fence->ctx->ring); - - /* TODO util helper for compare w/ rollover? */ - return timestamp >= fence->timestamp; -} - boolean fd_screen_fence_finish(struct pipe_screen *screen, struct pipe_fence_handle *fence, uint64_t timeout) { - if (!timeout) - return fd_screen_fence_signalled(screen, fence); - - if (fd_pipe_wait(fence->screen->pipe, fence->timestamp)) + if (fd_pipe_wait_timeout(fence->screen->pipe, fence->timestamp, timeout)) return false; return true; } -struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx) +struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx, + uint32_t timestamp) { struct pipe_fence_handle *fence; struct fd_context *ctx = fd_context(pctx); @@ -91,7 +74,7 @@ struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx) fence->ctx = ctx; fence->screen = ctx->screen; - fence->timestamp = fd_ringbuffer_timestamp(ctx->ring); + fence->timestamp = timestamp; return fence; } diff --git a/src/gallium/drivers/freedreno/freedreno_fence.h b/src/gallium/drivers/freedreno/freedreno_fence.h index e36bcc4..06c314a 100644 --- a/src/gallium/drivers/freedreno/freedreno_fence.h +++ b/src/gallium/drivers/freedreno/freedreno_fence.h @@ -34,11 +34,10 @@ void fd_screen_fence_ref(struct pipe_screen *pscreen, struct pipe_fence_handle **ptr, struct pipe_fence_handle *pfence); -boolean fd_screen_fence_signalled(struct pipe_screen *screen, - struct pipe_fence_handle *pfence); boolean fd_screen_fence_finish(struct pipe_screen *screen, struct pipe_fence_handle *pfence, uint64_t timeout); -struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx); +struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx, + uint32_t timestamp); #endif /* FREEDRENO_FENCE_H_ */ diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c index 709ad4e..98de096 100644 --- a/src/gallium/drivers/freedreno/freedreno_resource.c +++ b/src/gallium/drivers/freedreno/freedreno_resource.c @@ -222,7 +222,7 @@ fd_resource_transfer_map(struct pipe_context *pctx, ptrans->level = level; ptrans->usage = usage; ptrans->box = *box; - ptrans->stride = slice->pitch * rsc->cpp; + ptrans->stride = util_format_get_nblocksx(format, slice->pitch) * rsc->cpp; ptrans->layer_stride = slice->size0; if (usage & PIPE_TRANSFER_READ) @@ -375,9 +375,11 @@ setup_slices(struct fd_resource *rsc, uint32_t alignment) for (level = 0; level <= prsc->last_level; level++) { struct fd_resource_slice *slice = fd_resource_slice(rsc, level); + uint32_t blocks; slice->pitch = width = align(width, 32); slice->offset = size; + blocks = util_format_get_nblocks(prsc->format, width, height); /* 1d array and 2d array textures must all have the same layer size * for each miplevel on a3xx. 3d textures can have different layer * sizes for high levels, but the hw auto-sizer is buggy (or at least @@ -387,9 +389,9 @@ setup_slices(struct fd_resource *rsc, uint32_t alignment) if (prsc->target == PIPE_TEXTURE_3D && ( level == 1 || (level > 1 && rsc->slices[level - 1].size0 > 0xf000))) - slice->size0 = align(slice->pitch * height * rsc->cpp, alignment); + slice->size0 = align(blocks * rsc->cpp, alignment); else if (level == 0 || rsc->layer_first || alignment == 1) - slice->size0 = align(slice->pitch * height * rsc->cpp, alignment); + slice->size0 = align(blocks * rsc->cpp, alignment); else slice->size0 = rsc->slices[level - 1].size0; @@ -459,7 +461,6 @@ fd_resource_create(struct pipe_screen *pscreen, if (is_a4xx(fd_screen(pscreen))) { switch (tmpl->target) { case PIPE_TEXTURE_3D: - /* TODO 3D_ARRAY? */ rsc->layer_first = false; break; default: diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index b55f5b3..86e9a21 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -163,7 +163,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TEXTURE_MULTISAMPLE: case PIPE_CAP_TEXTURE_BARRIER: case PIPE_CAP_TEXTURE_MIRROR_CLAMP: - case PIPE_CAP_CUBE_MAP_ARRAY: case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: case PIPE_CAP_START_INSTANCE: case PIPE_CAP_COMPUTE: @@ -176,6 +175,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_INDEP_BLEND_ENABLE: case PIPE_CAP_INDEP_BLEND_FUNC: case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: return is_a3xx(screen) || is_a4xx(screen); case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: @@ -191,8 +191,13 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return 16383; case PIPE_CAP_DEPTH_CLIP_DISABLE: + case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: return is_a3xx(screen); + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_CUBE_MAP_ARRAY: + return is_a4xx(screen); + case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: return 256; @@ -202,7 +207,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return is_ir3(screen) ? 130 : 120; /* Unsupported features. */ - case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS: @@ -230,8 +234,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: - case PIPE_CAP_TEXTURE_FLOAT_LINEAR: - case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: case PIPE_CAP_DEPTH_BOUNDS_TEST: return 0; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index 0ab3345..071901a 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -1636,6 +1636,11 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex) coord[i] = ir3_SHL_B(b, coord[i], 0, lod, 0); } + /* the array coord for cube arrays needs 0.5 added to it */ + if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE && tex->is_array && + opc != OPC_ISAML) + coord[3] = ir3_ADD_F(b, coord[3], 0, create_immed(b, fui(0.5)), 0); + /* * lay out the first argument in the proper order: * - actual coordinates first @@ -1759,6 +1764,12 @@ emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex) tex_info(tex, &flags, &coords); + /* Actually we want the number of dimensions, not coordinates. This + * distinction only matters for cubes. + */ + if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE) + coords = 2; + dst = get_dst(ctx, &tex->dest, 4); compile_assert(ctx, tex->num_srcs == 1); @@ -2301,7 +2312,7 @@ emit_instructions(struct ir3_compile *ctx) ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs); /* Create inputs in first block: */ - ctx->block = get_block(ctx, fxn->start_block); + ctx->block = get_block(ctx, nir_start_block(fxn)); ctx->in_block = ctx->block; list_addtail(&ctx->block->node, &ctx->ir->block_list); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c index dc9e462..bed7b7b 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c @@ -29,6 +29,7 @@ #include "ir3_nir.h" #include "glsl/nir/nir_builder.h" +#include "glsl/nir/nir_control_flow.h" /* Based on nir_opt_peephole_select, and hacked up to more aggressively * flatten anything that can be flattened diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c index eaf3b3c..8801839 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c @@ -189,7 +189,7 @@ ir3_ra_alloc_reg_set(void *memctx) } /* allocate the reg-set.. */ - set->regs = ra_alloc_reg_set(set, ra_reg_count); + set->regs = ra_alloc_reg_set(set, ra_reg_count, true); set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count); set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count); diff --git a/src/gallium/drivers/i915/i915_surface.c b/src/gallium/drivers/i915/i915_surface.c index 24e0156..b2a639c 100644 --- a/src/gallium/drivers/i915/i915_surface.c +++ b/src/gallium/drivers/i915/i915_surface.c @@ -120,7 +120,8 @@ i915_surface_copy_render(struct pipe_context *pipe, util_blitter_blit_generic(i915->blitter, dst_view, &dstbox, src_view, src_box, src_width0, src_height0, - PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL); + PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, + FALSE); return; fallback: diff --git a/src/gallium/drivers/nouveau/Makefile.sources b/src/gallium/drivers/nouveau/Makefile.sources index 3fae3bc..9346ea3 100644 --- a/src/gallium/drivers/nouveau/Makefile.sources +++ b/src/gallium/drivers/nouveau/Makefile.sources @@ -121,7 +121,8 @@ NV50_CODEGEN_SOURCES := \ codegen/nv50_ir_target_nv50.cpp \ codegen/nv50_ir_target_nv50.h \ codegen/nv50_ir_util.cpp \ - codegen/nv50_ir_util.h + codegen/nv50_ir_util.h \ + codegen/unordered_set.h NVC0_CODEGEN_SOURCES := \ codegen/nv50_ir_emit_gk110.cpp \ diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h index 3ddaeaf..ba1b085 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h @@ -29,8 +29,8 @@ #include <deque> #include <list> #include <vector> -#include <tr1/unordered_set> +#include "codegen/unordered_set.h" #include "codegen/nv50_ir_util.h" #include "codegen/nv50_ir_graph.h" @@ -585,10 +585,10 @@ public: static inline Value *get(Iterator&); - std::tr1::unordered_set<ValueRef *> uses; + unordered_set<ValueRef *> uses; std::list<ValueDef *> defs; - typedef std::tr1::unordered_set<ValueRef *>::iterator UseIterator; - typedef std::tr1::unordered_set<ValueRef *>::const_iterator UseCIterator; + typedef unordered_set<ValueRef *>::iterator UseIterator; + typedef unordered_set<ValueRef *>::const_iterator UseCIterator; typedef std::list<ValueDef *>::iterator DefIterator; typedef std::list<ValueDef *>::const_iterator DefCIterator; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp index f06056f..8f15429 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp @@ -933,6 +933,7 @@ CodeEmitterGK110::emitCVT(const Instruction *i) code[0] |= typeSizeofLog2(dType) << 10; code[0] |= typeSizeofLog2(i->sType) << 12; + code[1] |= i->subOp << 12; if (isSignedIntType(dType)) code[0] |= 0x4000; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index ef5c87d..6e22788 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -818,6 +818,7 @@ CodeEmitterGM107::emitI2F() emitField(0x31, 1, (insn->op == OP_ABS) || insn->src(0).mod.abs()); emitCC (0x2f); emitField(0x2d, 1, (insn->op == OP_NEG) || insn->src(0).mod.neg()); + emitField(0x29, 2, insn->subOp); emitRND (0x27, rnd, -1); emitField(0x0d, 1, isSignedType(insn->sType)); emitField(0x0a, 2, util_logbase2(typeSizeof(insn->sType))); @@ -850,6 +851,7 @@ CodeEmitterGM107::emitI2I() emitField(0x31, 1, (insn->op == OP_ABS) || insn->src(0).mod.abs()); emitCC (0x2f); emitField(0x2d, 1, (insn->op == OP_NEG) || insn->src(0).mod.neg()); + emitField(0x29, 2, insn->subOp); emitField(0x0d, 1, isSignedType(insn->sType)); emitField(0x0c, 1, isSignedType(insn->dType)); emitField(0x0a, 2, util_logbase2(typeSizeof(insn->sType))); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp index f607f3b..6bf5219 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp @@ -1020,6 +1020,10 @@ CodeEmitterNVC0::emitCVT(Instruction *i) code[0] |= util_logbase2(typeSizeof(dType)) << 20; code[0] |= util_logbase2(typeSizeof(i->sType)) << 23; + // for 8/16 source types, the byte/word is in subOp. word 1 is + // represented as 2. + code[1] |= i->subOp << 0x17; + if (sat) code[0] |= 0x20; if (abs) @@ -2614,11 +2618,12 @@ private: int imul; // integer MUL to MUL delay 3 } res; struct ScoreData { - int r[64]; + int r[256]; int p[8]; int c; } rd, wr; int base; + int regs; void rebase(const int base) { @@ -2627,7 +2632,7 @@ private: return; this->base = 0; - for (int i = 0; i < 64; ++i) { + for (int i = 0; i < regs; ++i) { rd.r[i] += delta; wr.r[i] += delta; } @@ -2646,16 +2651,17 @@ private: res.imul += delta; res.tex += delta; } - void wipe() + void wipe(int regs) { memset(&rd, 0, sizeof(rd)); memset(&wr, 0, sizeof(wr)); memset(&res, 0, sizeof(res)); + this->regs = regs; } int getLatest(const ScoreData& d) const { int max = 0; - for (int i = 0; i < 64; ++i) + for (int i = 0; i < regs; ++i) if (d.r[i] > max) max = d.r[i]; for (int i = 0; i < 8; ++i) @@ -2690,7 +2696,7 @@ private: } void setMax(const RegScores *that) { - for (int i = 0; i < 64; ++i) { + for (int i = 0; i < regs; ++i) { rd.r[i] = MAX2(rd.r[i], that->rd.r[i]); wr.r[i] = MAX2(wr.r[i], that->wr.r[i]); } @@ -2711,7 +2717,7 @@ private: } void print(int cycle) { - for (int i = 0; i < 64; ++i) { + for (int i = 0; i < regs; ++i) { if (rd.r[i] > cycle) INFO("rd $r%i @ %i\n", i, rd.r[i]); if (wr.r[i] > cycle) @@ -2806,9 +2812,10 @@ SchedDataCalculator::getCycles(const Instruction *insn, int origDelay) const bool SchedDataCalculator::visit(Function *func) { + int regs = targ->getFileSize(FILE_GPR) + 1; scoreBoards.resize(func->cfg.getSize()); for (size_t i = 0; i < scoreBoards.size(); ++i) - scoreBoards[i].wipe(); + scoreBoards[i].wipe(regs); return true; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index 4847a0f..f153674 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -2990,9 +2990,15 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) case TGSI_OPCODE_UBFE: FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { src0 = fetchSrc(0, c); - src1 = fetchSrc(1, c); - src2 = fetchSrc(2, c); - mkOp3(OP_INSBF, TYPE_U32, src1, src2, mkImm(0x808), src1); + if (tgsi.getSrc(1).getFile() == TGSI_FILE_IMMEDIATE && + tgsi.getSrc(2).getFile() == TGSI_FILE_IMMEDIATE) { + src1 = loadImm(NULL, tgsi.getSrc(2).getValueU32(c, info) << 8 | + tgsi.getSrc(1).getValueU32(c, info)); + } else { + src1 = fetchSrc(1, c); + src2 = fetchSrc(2, c); + mkOp3(OP_INSBF, TYPE_U32, src1, src2, mkImm(0x808), src1); + } mkOp2(OP_EXTBF, dstTy, dst0[c], src0, src1); } break; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp index 1f3fce2..420cc4e 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp @@ -193,100 +193,16 @@ GM107LoweringPass::visit(Instruction *i) checkPredicate(i); switch (i->op) { - case OP_TEX: - case OP_TXB: - case OP_TXL: - case OP_TXF: - case OP_TXG: - return handleTEX(i->asTex()); - case OP_TXD: - return handleTXD(i->asTex()); - case OP_TXLQ: - return handleTXLQ(i->asTex()); - case OP_TXQ: - return handleTXQ(i->asTex()); - case OP_EX2: - bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0)); - i->setSrc(0, i->getDef(0)); - break; - case OP_POW: - return handlePOW(i); - case OP_DIV: - return handleDIV(i); - case OP_MOD: - return handleMOD(i); - case OP_SQRT: - return handleSQRT(i); - case OP_EXPORT: - return handleEXPORT(i); case OP_PFETCH: return handlePFETCH(i); - case OP_EMIT: - case OP_RESTART: - return handleOUT(i); - case OP_RDSV: - return handleRDSV(i); - case OP_WRSV: - return handleWRSV(i); - case OP_LOAD: - if (i->src(0).getFile() == FILE_SHADER_INPUT) { - if (prog->getType() == Program::TYPE_COMPUTE) { - i->getSrc(0)->reg.file = FILE_MEMORY_CONST; - i->getSrc(0)->reg.fileIndex = 0; - } else - if (prog->getType() == Program::TYPE_GEOMETRY && - i->src(0).isIndirect(0)) { - // XXX: this assumes vec4 units - Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), - i->getIndirect(0, 0), bld.mkImm(4)); - i->setIndirect(0, 0, ptr); - i->op = OP_VFETCH; - } else { - i->op = OP_VFETCH; - assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP - } - } else if (i->src(0).getFile() == FILE_MEMORY_CONST) { - if (i->src(0).isIndirect(1)) { - Value *ptr; - if (i->src(0).isIndirect(0)) - ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(), - i->getIndirect(0, 1), bld.mkImm(0x1010), - i->getIndirect(0, 0)); - else - ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), - i->getIndirect(0, 1), bld.mkImm(16)); - i->setIndirect(0, 1, NULL); - i->setIndirect(0, 0, ptr); - i->subOp = NV50_IR_SUBOP_LDC_IS; - } - } - break; - case OP_ATOM: - { - const bool cctl = i->src(0).getFile() == FILE_MEMORY_GLOBAL; - handleATOM(i); - handleCasExch(i, cctl); - } - break; - case OP_SULDB: - case OP_SULDP: - case OP_SUSTB: - case OP_SUSTP: - case OP_SUREDB: - case OP_SUREDP: - handleSurfaceOpNVE4(i->asTex()); - break; case OP_DFDX: case OP_DFDY: - handleDFDX(i); - break; + return handleDFDX(i); case OP_POPCNT: - handlePOPCNT(i); - break; + return handlePOPCNT(i); default: - break; + return NVC0LoweringPass::visit(i); } - return true; } } // namespace nv50_ir diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index c3c302d..b1f4065 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -224,7 +224,7 @@ NVC0LegalizePostRA::findFirstUses( const Instruction *texi, const Instruction *insn, std::list<TexUse> &uses, - std::tr1::unordered_set<const Instruction *>& visited) + unordered_set<const Instruction *>& visited) { for (int d = 0; insn->defExists(d); ++d) { Value *v = insn->getDef(d); @@ -323,7 +323,7 @@ NVC0LegalizePostRA::insertTextureBarriers(Function *fn) if (!uses) return false; for (size_t i = 0; i < texes.size(); ++i) { - std::tr1::unordered_set<const Instruction *> visited; + unordered_set<const Instruction *> visited; findFirstUses(texes[i], texes[i], uses[i], visited); } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h index 260e101..2ce52e5 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h @@ -20,8 +20,6 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#include <tr1/unordered_set> - #include "codegen/nv50_ir.h" #include "codegen/nv50_ir_build_util.h" @@ -73,7 +71,7 @@ private: inline bool insnDominatedBy(const Instruction *, const Instruction *) const; void findFirstUses(const Instruction *tex, const Instruction *def, std::list<TexUse>&, - std::tr1::unordered_set<const Instruction *>&); + unordered_set<const Instruction *>&); void findOverwritingDefs(const Instruction *tex, Instruction *insn, const BasicBlock *term, std::list<TexUse>&); @@ -111,10 +109,11 @@ protected: void checkPredicate(Instruction *); + virtual bool visit(Instruction *); + private: virtual bool visit(Function *); virtual bool visit(BasicBlock *); - virtual bool visit(Instruction *); void readTessCoord(LValue *dst, int c); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index cea96dc..b01ef41 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -1023,27 +1023,53 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) case OP_AND: { - CmpInstruction *cmp = i->getSrc(t)->getInsn()->asCmp(); - if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1) - return; - if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32)) - return; - if (imm0.reg.data.f32 != 1.0) - return; - if (i->getSrc(t)->getInsn()->dType != TYPE_U32) - return; + Instruction *src = i->getSrc(t)->getInsn(); + ImmediateValue imm1; + if (imm0.reg.data.u32 == 0) { + i->op = OP_MOV; + i->setSrc(0, new_ImmediateValue(prog, 0u)); + i->src(0).mod = Modifier(0); + i->setSrc(1, NULL); + } else if (imm0.reg.data.u32 == ~0U) { + i->op = i->src(t).mod.getOp(); + if (t) { + i->setSrc(0, i->getSrc(t)); + i->src(0).mod = i->src(t).mod; + } + i->setSrc(1, NULL); + } else if (src->asCmp()) { + CmpInstruction *cmp = src->asCmp(); + if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1) + return; + if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32)) + return; + if (imm0.reg.data.f32 != 1.0) + return; + if (cmp->dType != TYPE_U32) + return; - i->getSrc(t)->getInsn()->dType = TYPE_F32; - if (i->src(t).mod != Modifier(0)) { - assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT)); - i->src(t).mod = Modifier(0); - cmp->setCond = inverseCondCode(cmp->setCond); - } - i->op = OP_MOV; - i->setSrc(s, NULL); - if (t) { - i->setSrc(0, i->getSrc(t)); - i->setSrc(t, NULL); + cmp->dType = TYPE_F32; + if (i->src(t).mod != Modifier(0)) { + assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT)); + i->src(t).mod = Modifier(0); + cmp->setCond = inverseCondCode(cmp->setCond); + } + i->op = OP_MOV; + i->setSrc(s, NULL); + if (t) { + i->setSrc(0, i->getSrc(t)); + i->setSrc(t, NULL); + } + } else if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32) && + src->op == OP_SHR && + src->src(1).getImmediate(imm1) && + i->src(t).mod == Modifier(0) && + util_is_power_of_two(imm0.reg.data.u32 + 1)) { + // low byte = offset, high byte = width + uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32; + i->op = OP_EXTBF; + i->setSrc(0, src->getSrc(0)); + i->setSrc(1, new_ImmediateValue(prog, ext)); } } break; @@ -1106,6 +1132,84 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) i->op = OP_MOV; break; } + case OP_CVT: { + Storage res; + + // TODO: handle 64-bit values properly + if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8) + return; + + // TODO: handle single byte/word extractions + if (i->subOp) + return; + + bld.setPosition(i, true); /* make sure bld is init'ed */ + +#define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \ + case type: \ + switch (i->sType) { \ + case TYPE_F32: \ + res.data.dst = util_iround(i->saturate ? \ + CLAMP(imm0.reg.data.f32, fmin, fmax) : \ + imm0.reg.data.f32); \ + break; \ + case TYPE_S32: \ + res.data.dst = i->saturate ? \ + CLAMP(imm0.reg.data.s32, imin, imax) : \ + imm0.reg.data.s32; \ + break; \ + case TYPE_U32: \ + res.data.dst = i->saturate ? \ + CLAMP(imm0.reg.data.u32, umin, umax) : \ + imm0.reg.data.u32; \ + break; \ + case TYPE_S16: \ + res.data.dst = i->saturate ? \ + CLAMP(imm0.reg.data.s16, imin, imax) : \ + imm0.reg.data.s16; \ + break; \ + case TYPE_U16: \ + res.data.dst = i->saturate ? \ + CLAMP(imm0.reg.data.u16, umin, umax) : \ + imm0.reg.data.u16; \ + break; \ + default: return; \ + } \ + i->setSrc(0, bld.mkImm(res.data.dst)); \ + break + + switch(i->dType) { + CASE(TYPE_U16, u16, 0, UINT16_MAX, 0, UINT16_MAX, 0, UINT16_MAX); + CASE(TYPE_S16, s16, INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX, 0, INT16_MAX); + CASE(TYPE_U32, u32, 0, UINT32_MAX, 0, INT32_MAX, 0, UINT32_MAX); + CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX); + case TYPE_F32: + switch (i->sType) { + case TYPE_F32: + res.data.f32 = i->saturate ? + CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) : + imm0.reg.data.f32; + break; + case TYPE_U16: res.data.f32 = (float) imm0.reg.data.u16; break; + case TYPE_U32: res.data.f32 = (float) imm0.reg.data.u32; break; + case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break; + case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break; + default: + return; + } + i->setSrc(0, bld.mkImm(res.data.f32)); + break; + default: + return; + } +#undef CASE + + i->setType(i->dType); /* Remove i->sType, which we don't need anymore */ + i->op = OP_MOV; + i->saturate = 0; + i->src(0).mod = Modifier(0); /* Clear the already applied modifier */ + break; + } default: return; } @@ -1212,7 +1316,8 @@ private: void handleRCP(Instruction *); void handleSLCT(Instruction *); void handleLOGOP(Instruction *); - void handleCVT(Instruction *); + void handleCVT_NEG(Instruction *); + void handleCVT_EXTBF(Instruction *); void handleSUCLAMP(Instruction *); BuildUtil bld; @@ -1463,12 +1568,12 @@ AlgebraicOpt::handleLOGOP(Instruction *logop) // nv50: // F2I(NEG(I2F(ABS(SET)))) void -AlgebraicOpt::handleCVT(Instruction *cvt) +AlgebraicOpt::handleCVT_NEG(Instruction *cvt) { + Instruction *insn = cvt->getSrc(0)->getInsn(); if (cvt->sType != TYPE_F32 || cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0)) return; - Instruction *insn = cvt->getSrc(0)->getInsn(); if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32) return; if (insn->src(0).mod != Modifier(0)) @@ -1498,6 +1603,104 @@ AlgebraicOpt::handleCVT(Instruction *cvt) delete_Instruction(prog, cvt); } +// Some shaders extract packed bytes out of words and convert them to +// e.g. float. The Fermi+ CVT instruction can extract those directly, as can +// nv50 for word sizes. +// +// CVT(EXTBF(x, byte/word)) +// CVT(AND(bytemask, x)) +// CVT(AND(bytemask, SHR(x, 8/16/24))) +// CVT(SHR(x, 16/24)) +void +AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt) +{ + Instruction *insn = cvt->getSrc(0)->getInsn(); + ImmediateValue imm; + Value *arg = NULL; + unsigned width, offset; + if ((cvt->sType != TYPE_U32 && cvt->sType != TYPE_S32) || !insn) + return; + if (insn->op == OP_EXTBF && insn->src(1).getImmediate(imm)) { + width = (imm.reg.data.u32 >> 8) & 0xff; + offset = imm.reg.data.u32 & 0xff; + arg = insn->getSrc(0); + + if (width != 8 && width != 16) + return; + if (width == 8 && offset & 0x7) + return; + if (width == 16 && offset & 0xf) + return; + } else if (insn->op == OP_AND) { + int s; + if (insn->src(0).getImmediate(imm)) + s = 0; + else if (insn->src(1).getImmediate(imm)) + s = 1; + else + return; + + if (imm.reg.data.u32 == 0xff) + width = 8; + else if (imm.reg.data.u32 == 0xffff) + width = 16; + else + return; + + arg = insn->getSrc(!s); + Instruction *shift = arg->getInsn(); + offset = 0; + if (shift && shift->op == OP_SHR && + shift->sType == cvt->sType && + shift->src(1).getImmediate(imm) && + ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) || + (width == 16 && (imm.reg.data.u32 & 0xf) == 0))) { + arg = shift->getSrc(0); + offset = imm.reg.data.u32; + } + } else if (insn->op == OP_SHR && + insn->sType == cvt->sType && + insn->src(1).getImmediate(imm)) { + arg = insn->getSrc(0); + if (imm.reg.data.u32 == 24) { + width = 8; + offset = 24; + } else if (imm.reg.data.u32 == 16) { + width = 16; + offset = 16; + } else { + return; + } + } + + if (!arg) + return; + + // Irrespective of what came earlier, we can undo a shift on the argument + // by adjusting the offset. + Instruction *shift = arg->getInsn(); + if (shift && shift->op == OP_SHL && + shift->src(1).getImmediate(imm) && + ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) || + (width == 16 && (imm.reg.data.u32 & 0xf) == 0)) && + imm.reg.data.u32 <= offset) { + arg = shift->getSrc(0); + offset -= imm.reg.data.u32; + } + + // The unpackSnorm lowering still leaves a few shifts behind, but it's too + // annoying to detect them. + + if (width == 8) { + cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U8 : TYPE_S8; + } else { + assert(width == 16); + cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U16 : TYPE_S16; + } + cvt->setSrc(0, arg); + cvt->subOp = offset >> 3; +} + // SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6) void AlgebraicOpt::handleSUCLAMP(Instruction *insn) @@ -1568,7 +1771,9 @@ AlgebraicOpt::visit(BasicBlock *bb) handleLOGOP(i); break; case OP_CVT: - handleCVT(i); + handleCVT_NEG(i); + if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32)) + handleCVT_EXTBF(i); break; case OP_SUCLAMP: handleSUCLAMP(i); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp index 78bc97f..0cd21cf 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp @@ -25,7 +25,6 @@ #include <stack> #include <limits> -#include <tr1/unordered_set> namespace nv50_ir { @@ -1551,7 +1550,7 @@ SpillCodeInserter::run(const std::list<ValuePair>& lst) // Keep track of which instructions to delete later. Deleting them // inside the loop is unsafe since a single instruction may have // multiple destinations that all need to be spilled (like OP_SPLIT). - std::tr1::unordered_set<Instruction *> to_del; + unordered_set<Instruction *> to_del; for (Value::DefIterator d = lval->defs.begin(); d != lval->defs.end(); ++d) { @@ -1593,7 +1592,7 @@ SpillCodeInserter::run(const std::list<ValuePair>& lst) } } - for (std::tr1::unordered_set<Instruction *>::const_iterator it = to_del.begin(); + for (unordered_set<Instruction *>::const_iterator it = to_del.begin(); it != to_del.end(); ++it) delete_Instruction(func->getProgram(), *it); } diff --git a/src/gallium/drivers/nouveau/codegen/unordered_set.h b/src/gallium/drivers/nouveau/codegen/unordered_set.h new file mode 100644 index 0000000..8ef6d46 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/unordered_set.h @@ -0,0 +1,48 @@ +#ifndef __NV50_UNORDERED_SET_H__ +#define __NV50_UNORDERED_SET_H__ + +#if (__cplusplus >= 201103L) || defined(ANDROID) +#include <unordered_set> +#else +#include <tr1/unordered_set> +#endif + +namespace nv50_ir { + +#if __cplusplus >= 201103L +using std::unordered_set; +#elif !defined(ANDROID) +using std::tr1::unordered_set; +#else // Android release before lollipop +using std::isfinite; +typedef std::tr1::unordered_set<void *> voidptr_unordered_set; + +template <typename V> +class unordered_set : public voidptr_unordered_set { + public: + typedef voidptr_unordered_set _base; + typedef _base::iterator _biterator; + typedef _base::const_iterator const_biterator; + + class iterator : public _biterator { + public: + iterator(const _biterator & i) : _biterator(i) {} + V operator*() const { return reinterpret_cast<V>(*_biterator(*this)); } + }; + class const_iterator : public const_biterator { + public: + const_iterator(const iterator & i) : const_biterator(i) {} + const_iterator(const const_biterator & i) : const_biterator(i) {} + const V operator*() const { return reinterpret_cast<const V>(*const_biterator(*this)); } + }; + + iterator begin() { return _base::begin(); } + iterator end() { return _base::end(); } + const_iterator begin() const { return _base::begin(); } + const_iterator end() const { return _base::end(); } +}; +#endif + +} // namespace nv50_ir + +#endif // __NV50_UNORDERED_SET_H__ diff --git a/src/gallium/drivers/nouveau/nouveau_compiler.c b/src/gallium/drivers/nouveau/nouveau_compiler.c index 8660498..495450b 100644 --- a/src/gallium/drivers/nouveau/nouveau_compiler.c +++ b/src/gallium/drivers/nouveau/nouveau_compiler.c @@ -190,6 +190,10 @@ main(int argc, char *argv[]) type = PIPE_SHADER_GEOMETRY; else if (!strncmp(text, "COMP", 4)) type = PIPE_SHADER_COMPUTE; + else if (!strncmp(text, "TESS_CTRL", 9)) + type = PIPE_SHADER_TESS_CTRL; + else if (!strncmp(text, "TESS_EVAL", 9)) + type = PIPE_SHADER_TESS_EVAL; else { _debug_printf("Unrecognized TGSI header\n"); return 1; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c index 9505a0b..410e631 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_state.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c @@ -117,7 +117,6 @@ nv50_blend_state_create(struct pipe_context *pipe, struct nv50_blend_stateobj *so = CALLOC_STRUCT(nv50_blend_stateobj); int i; bool emit_common_func = cso->rt[0].blend_enable; - uint32_t ms; if (nv50_context(pipe)->screen->tesla->oclass >= NVA3_3D_CLASS) { SB_BEGIN_3D(so, BLEND_INDEPENDENT, 1); @@ -189,15 +188,6 @@ nv50_blend_state_create(struct pipe_context *pipe, SB_DATA (so, nv50_colormask(cso->rt[0].colormask)); } - ms = 0; - if (cso->alpha_to_coverage) - ms |= NV50_3D_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE; - if (cso->alpha_to_one) - ms |= NV50_3D_MULTISAMPLE_CTRL_ALPHA_TO_ONE; - - SB_BEGIN_3D(so, MULTISAMPLE_CTRL, 1); - SB_DATA (so, ms); - assert(so->size <= (sizeof(so->state) / sizeof(so->state[0]))); return so; } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c index 985603d..b304a17 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c @@ -1,4 +1,6 @@ +#include "util/u_format.h" + #include "nv50/nv50_context.h" #include "nv50/nv50_defs.xml.h" @@ -314,6 +316,25 @@ nv50_validate_derived_2(struct nv50_context *nv50) } static void +nv50_validate_derived_3(struct nv50_context *nv50) +{ + struct nouveau_pushbuf *push = nv50->base.pushbuf; + struct pipe_framebuffer_state *fb = &nv50->framebuffer; + uint32_t ms = 0; + + if ((!fb->nr_cbufs || !fb->cbufs[0] || + !util_format_is_pure_integer(fb->cbufs[0]->format)) && nv50->blend) { + if (nv50->blend->pipe.alpha_to_coverage) + ms |= NV50_3D_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE; + if (nv50->blend->pipe.alpha_to_one) + ms |= NV50_3D_MULTISAMPLE_CTRL_ALPHA_TO_ONE; + } + + BEGIN_NV04(push, NV50_3D(MULTISAMPLE_CTRL), 1); + PUSH_DATA (push, ms); +} + +static void nv50_validate_clip(struct nv50_context *nv50) { struct nouveau_pushbuf *push = nv50->base.pushbuf; @@ -474,6 +495,7 @@ static struct state_validate { { nv50_validate_derived_rs, NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER | NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG }, { nv50_validate_derived_2, NV50_NEW_ZSA | NV50_NEW_FRAMEBUFFER }, + { nv50_validate_derived_3, NV50_NEW_BLEND | NV50_NEW_FRAMEBUFFER }, { nv50_validate_clip, NV50_NEW_CLIP | NV50_NEW_RASTERIZER | NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG }, { nv50_constbufs_validate, NV50_NEW_CONSTBUF }, diff --git a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h index cf75d1e..4b1d00c 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h @@ -19,7 +19,7 @@ struct nv50_blend_stateobj { struct pipe_blend_state pipe; int size; - uint32_t state[84]; // TODO: allocate less if !independent_blend_enable + uint32_t state[82]; // TODO: allocate less if !independent_blend_enable }; struct nv50_rasterizer_stateobj { diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c index b1ae016..64348b3 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c @@ -68,6 +68,10 @@ nv50_2d_format(enum pipe_format format, bool dst, bool dst_src_equal) return NV50_SURFACE_FORMAT_R16_UNORM; case 4: return NV50_SURFACE_FORMAT_BGRA8_UNORM; + case 8: + return NV50_SURFACE_FORMAT_RGBA16_FLOAT; + case 16: + return NV50_SURFACE_FORMAT_RGBA32_FLOAT; default: return 0; } @@ -1003,6 +1007,8 @@ nv50_blitctx_prepare_state(struct nv50_blitctx *blit) /* zsa state */ BEGIN_NV04(push, NV50_3D(DEPTH_TEST_ENABLE), 1); PUSH_DATA (push, 0); + BEGIN_NV04(push, NV50_3D(DEPTH_BOUNDS_EN), 1); + PUSH_DATA (push, 0); BEGIN_NV04(push, NV50_3D(STENCIL_ENABLE), 1); PUSH_DATA (push, 0); BEGIN_NV04(push, NV50_3D(ALPHA_TEST_ENABLE), 1); @@ -1387,18 +1393,24 @@ nv50_blit_eng2d(struct nv50_context *nv50, const struct pipe_blit_info *info) PUSH_DATA (push, info->dst.box.z + i); } else { const unsigned z = info->dst.box.z + i; + const uint64_t address = dst->base.address + + dst->level[info->dst.level].offset + + z * dst->layer_stride; BEGIN_NV04(push, NV50_2D(DST_ADDRESS_HIGH), 2); - PUSH_DATAh(push, dst->base.address + z * dst->layer_stride); - PUSH_DATA (push, dst->base.address + z * dst->layer_stride); + PUSH_DATAh(push, address); + PUSH_DATA (push, address); } if (src->layout_3d) { /* not possible because of depth tiling */ assert(0); } else { const unsigned z = info->src.box.z + i; + const uint64_t address = src->base.address + + src->level[info->src.level].offset + + z * src->layer_stride; BEGIN_NV04(push, NV50_2D(SRC_ADDRESS_HIGH), 2); - PUSH_DATAh(push, src->base.address + z * src->layer_stride); - PUSH_DATA (push, src->base.address + z * src->layer_stride); + PUSH_DATAh(push, address); + PUSH_DATA (push, address); } BEGIN_NV04(push, NV50_2D(BLIT_SRC_Y_INT), 1); /* trigger */ PUSH_DATA (push, srcy >> 32); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c index 84f8db6..7a15a11 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c @@ -132,6 +132,9 @@ nvc0_context_unreference_resources(struct nvc0_context *nvc0) pipe_resource_reference(res, NULL); } util_dynarray_fini(&nvc0->global_residents); + + if (nvc0->tcp_empty) + nvc0->base.pipe.delete_tcs_state(&nvc0->base.pipe, nvc0->tcp_empty); } static void @@ -306,13 +309,6 @@ nvc0_create(struct pipe_screen *pscreen, void *priv) pipe->memory_barrier = nvc0_memory_barrier; pipe->get_sample_position = nvc0_context_get_sample_position; - if (!screen->cur_ctx) { - nvc0->state = screen->save_state; - screen->cur_ctx = nvc0; - nouveau_pushbuf_bufctx(screen->base.pushbuf, nvc0->bufctx); - } - screen->base.pushbuf->kick_notify = nvc0_default_kick_notify; - nvc0_init_query_functions(nvc0); nvc0_init_surface_functions(nvc0); nvc0_init_state_functions(nvc0); @@ -326,6 +322,21 @@ nvc0_create(struct pipe_screen *pscreen, void *priv) /* shader builtin library is per-screen, but we need a context for m2mf */ nvc0_program_library_upload(nvc0); + nvc0_program_init_tcp_empty(nvc0); + if (!nvc0->tcp_empty) + goto out_err; + /* set the empty tctl prog on next draw in case one is never set */ + nvc0->dirty |= NVC0_NEW_TCTLPROG; + + /* now that there are no more opportunities for errors, set the current + * context if there isn't already one. + */ + if (!screen->cur_ctx) { + nvc0->state = screen->save_state; + screen->cur_ctx = nvc0; + nouveau_pushbuf_bufctx(screen->base.pushbuf, nvc0->bufctx); + } + screen->base.pushbuf->kick_notify = nvc0_default_kick_notify; /* add permanently resident buffers to bufctxts */ diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index f449942..df1a891 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -128,6 +128,8 @@ struct nvc0_context { struct nvc0_program *fragprog; struct nvc0_program *compprog; + struct nvc0_program *tcp_empty; + struct nvc0_constbuf constbuf[6][NVC0_MAX_PIPE_CONSTBUFS]; uint16_t constbuf_dirty[6]; uint16_t constbuf_valid[6]; @@ -227,6 +229,7 @@ void nvc0_program_destroy(struct nvc0_context *, struct nvc0_program *); void nvc0_program_library_upload(struct nvc0_context *); uint32_t nvc0_program_symbol_offset(const struct nvc0_program *, uint32_t label); +void nvc0_program_init_tcp_empty(struct nvc0_context *); /* nvc0_query.c */ void nvc0_init_query_functions(struct nvc0_context *); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c index 507a250..12f1bb7 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c @@ -22,6 +22,8 @@ #include "pipe/p_defines.h" +#include "tgsi/tgsi_ureg.h" + #include "nvc0/nvc0_context.h" #include "codegen/nv50_ir_driver.h" @@ -799,3 +801,18 @@ nvc0_program_symbol_offset(const struct nvc0_program *prog, uint32_t label) return prog->code_base + base + syms[i].offset; return prog->code_base; /* no symbols or symbol not found */ } + +void +nvc0_program_init_tcp_empty(struct nvc0_context *nvc0) +{ + struct ureg_program *ureg; + + ureg = ureg_create(TGSI_PROCESSOR_TESS_CTRL); + if (!ureg) + return; + + ureg_property(ureg, TGSI_PROPERTY_TCS_VERTICES_OUT, 1); + ureg_END(ureg); + + nvc0->tcp_empty = ureg_create_shader_and_destroy(ureg, &nvc0->base.pipe); +} diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c index 8aa127a..8f8ac2d 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c @@ -148,8 +148,13 @@ nvc0_tctlprog_validate(struct nvc0_context *nvc0) BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(2)), 1); PUSH_DATA (push, tp->num_gprs); } else { - BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 1); + tp = nvc0->tcp_empty; + /* not a whole lot we can do to handle this failure */ + if (!nvc0_program_validate(nvc0, tp)) + assert(!"unable to validate empty tcp"); + BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 2); PUSH_DATA (push, 0x20); + PUSH_DATA (push, tp->code_base); } nvc0_program_update_context_state(nvc0, tp, 1); } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c index 2a33857..ee29912 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c @@ -90,7 +90,6 @@ nvc0_blend_state_create(struct pipe_context *pipe, struct nvc0_blend_stateobj *so = CALLOC_STRUCT(nvc0_blend_stateobj); int i; int r; /* reference */ - uint32_t ms; uint8_t blend_en = 0; bool indep_masks = false; bool indep_funcs = false; @@ -176,15 +175,6 @@ nvc0_blend_state_create(struct pipe_context *pipe, } } - ms = 0; - if (cso->alpha_to_coverage) - ms |= NVC0_3D_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE; - if (cso->alpha_to_one) - ms |= NVC0_3D_MULTISAMPLE_CTRL_ALPHA_TO_ONE; - - SB_BEGIN_3D(so, MULTISAMPLE_CTRL, 1); - SB_DATA (so, ms); - assert(so->size <= (sizeof(so->state) / sizeof(so->state[0]))); return so; } @@ -234,7 +224,7 @@ nvc0_rasterizer_state_create(struct pipe_context *pipe, SB_IMMED_3D(so, MULTISAMPLE_ENABLE, cso->multisample); SB_IMMED_3D(so, LINE_SMOOTH_ENABLE, cso->line_smooth); - if (cso->line_smooth) + if (cso->line_smooth || cso->multisample) SB_BEGIN_3D(so, LINE_WIDTH_SMOOTH, 1); else SB_BEGIN_3D(so, LINE_WIDTH_ALIASED, 1); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c index ce1119c..47bd66d 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c @@ -1,4 +1,5 @@ +#include "util/u_format.h" #include "util/u_math.h" #include "nvc0/nvc0_context.h" @@ -555,6 +556,25 @@ nvc0_validate_derived_2(struct nvc0_context *nvc0) } static void +nvc0_validate_derived_3(struct nvc0_context *nvc0) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct pipe_framebuffer_state *fb = &nvc0->framebuffer; + uint32_t ms = 0; + + if ((!fb->nr_cbufs || !fb->cbufs[0] || + !util_format_is_pure_integer(fb->cbufs[0]->format)) && nvc0->blend) { + if (nvc0->blend->pipe.alpha_to_coverage) + ms |= NVC0_3D_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE; + if (nvc0->blend->pipe.alpha_to_one) + ms |= NVC0_3D_MULTISAMPLE_CTRL_ALPHA_TO_ONE; + } + + BEGIN_NVC0(push, NVC0_3D(MULTISAMPLE_CTRL), 1); + PUSH_DATA (push, ms); +} + +static void nvc0_validate_tess_state(struct nvc0_context *nvc0) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; @@ -628,6 +648,7 @@ static struct state_validate { { nvc0_validate_derived_1, NVC0_NEW_FRAGPROG | NVC0_NEW_ZSA | NVC0_NEW_RASTERIZER }, { nvc0_validate_derived_2, NVC0_NEW_ZSA | NVC0_NEW_FRAMEBUFFER }, + { nvc0_validate_derived_3, NVC0_NEW_BLEND | NVC0_NEW_FRAMEBUFFER }, { nvc0_validate_clip, NVC0_NEW_CLIP | NVC0_NEW_RASTERIZER | NVC0_NEW_VERTPROG | NVC0_NEW_TEVLPROG | diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h index 18fcc12..8bc33c6 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h @@ -17,7 +17,7 @@ struct nvc0_blend_stateobj { struct pipe_blend_state pipe; int size; - uint32_t state[72]; + uint32_t state[70]; }; struct nvc0_rasterizer_stateobj { diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c index 51a6f93..dbdf292 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c @@ -887,6 +887,7 @@ nvc0_blitctx_prepare_state(struct nvc0_blitctx *blit) /* zsa state */ IMMED_NVC0(push, NVC0_3D(DEPTH_TEST_ENABLE), 0); + IMMED_NVC0(push, NVC0_3D(DEPTH_BOUNDS_EN), 0); IMMED_NVC0(push, NVC0_3D(STENCIL_ENABLE), 0); IMMED_NVC0(push, NVC0_3D(ALPHA_TEST_ENABLE), 0); @@ -1336,18 +1337,24 @@ nvc0_blit_eng2d(struct nvc0_context *nvc0, const struct pipe_blit_info *info) PUSH_DATA (push, info->dst.box.z + i); } else { const unsigned z = info->dst.box.z + i; + const uint64_t address = dst->base.address + + dst->level[info->dst.level].offset + + z * dst->layer_stride; BEGIN_NVC0(push, NVC0_2D(DST_ADDRESS_HIGH), 2); - PUSH_DATAh(push, dst->base.address + z * dst->layer_stride); - PUSH_DATA (push, dst->base.address + z * dst->layer_stride); + PUSH_DATAh(push, address); + PUSH_DATA (push, address); } if (src->layout_3d) { /* not possible because of depth tiling */ assert(0); } else { const unsigned z = info->src.box.z + i; + const uint64_t address = src->base.address + + src->level[info->src.level].offset + + z * src->layer_stride; BEGIN_NVC0(push, NVC0_2D(SRC_ADDRESS_HIGH), 2); - PUSH_DATAh(push, src->base.address + z * src->layer_stride); - PUSH_DATA (push, src->base.address + z * src->layer_stride); + PUSH_DATAh(push, address); + PUSH_DATA (push, address); } BEGIN_NVC0(push, NVC0_2D(BLIT_SRC_Y_INT), 1); /* trigger */ PUSH_DATA (push, srcy >> 32); diff --git a/src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c b/src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c index 14f93fb..e8f4087 100644 --- a/src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c +++ b/src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c @@ -693,7 +693,8 @@ void rc_init_regalloc_state(struct rc_regalloc_state *s) }; /* Allocate the main ra data structure */ - s->regs = ra_alloc_reg_set(NULL, R500_PFS_NUM_TEMP_REGS * RC_MASK_XYZW); + s->regs = ra_alloc_reg_set(NULL, R500_PFS_NUM_TEMP_REGS * RC_MASK_XYZW, + true); /* Create the register classes */ for (i = 0; i < RC_REG_CLASS_COUNT; i++) { diff --git a/src/gallium/drivers/r300/r300_blit.c b/src/gallium/drivers/r300/r300_blit.c index 6ea8f24..b8cc316 100644 --- a/src/gallium/drivers/r300/r300_blit.c +++ b/src/gallium/drivers/r300/r300_blit.c @@ -667,7 +667,8 @@ static void r300_resource_copy_region(struct pipe_context *pipe, r300_blitter_begin(r300, R300_COPY); util_blitter_blit_generic(r300->blitter, dst_view, &dstbox, src_view, src_box, src_width0, src_height0, - PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL); + PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, + FALSE); r300_blitter_end(r300); pipe_surface_reference(&dst_view, NULL); diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c index b0002c3..22a0950 100644 --- a/src/gallium/drivers/r600/r600_blit.c +++ b/src/gallium/drivers/r600/r600_blit.c @@ -732,7 +732,8 @@ void r600_resource_copy_region(struct pipe_context *ctx, r600_blitter_begin(ctx, R600_COPY_TEXTURE); util_blitter_blit_generic(rctx->blitter, dst_view, &dstbox, src_view, src_box, src_width0, src_height0, - PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL); + PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, + FALSE); r600_blitter_end(ctx); pipe_surface_reference(&dst_view, NULL); diff --git a/src/gallium/drivers/r600/r600_isa.h b/src/gallium/drivers/r600/r600_isa.h index 381f06d..fdbe1c0 100644 --- a/src/gallium/drivers/r600/r600_isa.h +++ b/src/gallium/drivers/r600/r600_isa.h @@ -262,7 +262,7 @@ static const struct alu_op_info alu_op_table[] = { {"PRED_SETNE_PUSH_INT", 2, { 0x4D, 0x4D },{ AF_VS, AF_VS, AF_VS, AF_VS}, AF_PRED_PUSH | AF_CC_NE | AF_INT_CMP }, {"PRED_SETLT_PUSH_INT", 2, { 0x4E, 0x4E },{ AF_VS, AF_VS, AF_VS, AF_VS}, AF_PRED_PUSH | AF_CC_LT | AF_INT_CMP }, {"PRED_SETLE_PUSH_INT", 2, { 0x4F, 0x4F },{ AF_VS, AF_VS, AF_VS, AF_VS}, AF_PRED_PUSH | AF_CC_LE | AF_INT_CMP }, - {"FLT_TO_INT", 1, { 0x6B, 0x50 },{ AF_S, AF_S, AF_VS, AF_VS}, AF_INT_DST | AF_CVT }, + {"FLT_TO_INT", 1, { 0x6B, 0x50 },{ AF_S, AF_S, AF_V, AF_V}, AF_INT_DST | AF_CVT }, {"BFREV_INT", 1, { -1, 0x51 },{ 0, 0, AF_VS, AF_VS}, AF_INT_DST }, {"ADDC_UINT", 2, { -1, 0x52 },{ 0, 0, AF_VS, AF_VS}, AF_UINT_DST }, {"SUBB_UINT", 2, { -1, 0x53 },{ 0, 0, AF_VS, AF_VS}, AF_UINT_DST }, diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index 9b66105..384ba80 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -90,7 +90,7 @@ struct r600_context; struct r600_bytecode; -struct r600_shader_key; +union r600_shader_key; /* This is an atom containing GPU commands that never change. * This is supposed to be copied directly into the CS. */ @@ -643,7 +643,7 @@ void r600_resource_copy_region(struct pipe_context *ctx, /* r600_shader.c */ int r600_pipe_shader_create(struct pipe_context *ctx, struct r600_pipe_shader *shader, - struct r600_shader_key key); + union r600_shader_key key); void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader); diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 8d1f95a..4c4b600 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -62,7 +62,7 @@ The compiler must issue the source argument to slots z, y, and x static int r600_shader_from_tgsi(struct r600_context *rctx, struct r600_pipe_shader *pipeshader, - struct r600_shader_key key); + union r600_shader_key key); static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr, @@ -133,7 +133,7 @@ static int store_shader(struct pipe_context *ctx, int r600_pipe_shader_create(struct pipe_context *ctx, struct r600_pipe_shader *shader, - struct r600_shader_key key) + union r600_shader_key key) { struct r600_context *rctx = (struct r600_context *)ctx; struct r600_pipe_shader_selector *sel = shader->selector; @@ -141,7 +141,7 @@ int r600_pipe_shader_create(struct pipe_context *ctx, bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens); unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB); unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM); - unsigned export_shader = key.vs_as_es; + unsigned export_shader = key.vs.as_es; shader->shader.bc.isa = rctx->isa; @@ -1802,7 +1802,7 @@ static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind) static int r600_shader_from_tgsi(struct r600_context *rctx, struct r600_pipe_shader *pipeshader, - struct r600_shader_key key) + union r600_shader_key key) { struct r600_screen *rscreen = rctx->screen; struct r600_shader *shader = &pipeshader->shader; @@ -1816,7 +1816,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, unsigned opcode; int i, j, k, r = 0; int next_param_base = 0, next_clip_base; - int max_color_exports = MAX2(key.nr_cbufs, 1); + int max_color_exports = MAX2(key.ps.nr_cbufs, 1); /* Declarations used by llvm code */ bool use_llvm = false; bool indirect_gprs; @@ -1830,8 +1830,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.shader = shader; ctx.native_integers = true; - shader->vs_as_gs_a = key.vs_as_gs_a; - shader->vs_as_es = key.vs_as_es; + shader->vs_as_gs_a = key.vs.as_gs_a; + shader->vs_as_es = key.vs.as_es; r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family, rscreen->has_compressed_msaa_texturing); @@ -1844,9 +1844,9 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, shader->processor_type = ctx.type; ctx.bc->type = shader->processor_type; - ring_outputs = key.vs_as_es || (ctx.type == TGSI_PROCESSOR_GEOMETRY); + ring_outputs = key.vs.as_es || (ctx.type == TGSI_PROCESSOR_GEOMETRY); - if (key.vs_as_es) { + if (key.vs.as_es) { ctx.gs_for_vs = &rctx->gs_shader->current->shader; } else { ctx.gs_for_vs = NULL; @@ -1866,7 +1866,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, shader->nr_ps_color_exports = 0; shader->nr_ps_max_color_exports = 0; - shader->two_side = key.color_two_side; + shader->two_side = key.ps.color_two_side; /* register allocations */ /* Values [0,127] correspond to GPR[0..127]. @@ -1970,7 +1970,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, shader->fs_write_all = FALSE; if (shader->vs_as_gs_a) - vs_add_primid_output(&ctx, key.vs_prim_id_out); + vs_add_primid_output(&ctx, key.vs.prim_id_out); while (!tgsi_parse_end_of_tokens(&ctx.parse)) { tgsi_parse_token(&ctx.parse); @@ -2091,7 +2091,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, radeon_llvm_ctx.chip_class = ctx.bc->chip_class; radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN); radeon_llvm_ctx.stream_outputs = &so; - radeon_llvm_ctx.alpha_to_one = key.alpha_to_one; + radeon_llvm_ctx.alpha_to_one = key.ps.alpha_to_one; radeon_llvm_ctx.has_compressed_msaa_texturing = ctx.bc->has_compressed_msaa_texturing; mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens); @@ -2270,7 +2270,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, convert_edgeflag_to_int(&ctx); if (ring_outputs) { - if (key.vs_as_es) + if (key.vs.as_es) emit_gs_ring_writes(&ctx, FALSE); } else { /* Export output */ @@ -2386,7 +2386,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, j--; continue; } - output[j].swizzle_w = key.alpha_to_one ? 5 : 3; + output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3; output[j].array_base = shader->output[i].sid; output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; shader->nr_ps_color_exports++; @@ -2399,7 +2399,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, output[j].swizzle_x = 0; output[j].swizzle_y = 1; output[j].swizzle_z = 2; - output[j].swizzle_w = key.alpha_to_one ? 5 : 3; + output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3; output[j].burst_count = 1; output[j].array_base = k; output[j].op = CF_OP_EXPORT; @@ -6151,10 +6151,10 @@ static int tgsi_cmp(struct r600_shader_ctx *ctx) r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]); if (r) return r; - r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[2]); + r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]); if (r) return r; - r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[2], &ctx->src[1]); + r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]); if (r) return r; tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h index 5d05c81..927bac5 100644 --- a/src/gallium/drivers/r600/r600_shader.h +++ b/src/gallium/drivers/r600/r600_shader.h @@ -95,13 +95,17 @@ struct r600_shader { struct r600_shader_array * arrays; }; -struct r600_shader_key { - unsigned color_two_side:1; - unsigned alpha_to_one:1; - unsigned nr_cbufs:4; - unsigned vs_as_es:1; - unsigned vs_as_gs_a:1; - unsigned vs_prim_id_out:8; +union r600_shader_key { + struct { + unsigned nr_cbufs:4; + unsigned color_two_side:1; + unsigned alpha_to_one:1; + } ps; + struct { + unsigned prim_id_out:8; + unsigned as_es:1; /* export shader */ + unsigned as_gs_a:1; + } vs; }; struct r600_shader_array { @@ -122,7 +126,7 @@ struct r600_pipe_shader { unsigned flatshade; unsigned pa_cl_vs_out_cntl; unsigned nr_ps_color_outputs; - struct r600_shader_key key; + union r600_shader_key key; unsigned db_shader_control; unsigned ps_depth_export; unsigned enabled_stream_buffers_mask; diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index aa4a8d0..a05dd83 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -702,29 +702,39 @@ void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom) } /* Compute the key for the hw shader variant */ -static inline struct r600_shader_key r600_shader_selector_key(struct pipe_context * ctx, +static inline union r600_shader_key r600_shader_selector_key(struct pipe_context * ctx, struct r600_pipe_shader_selector * sel) { struct r600_context *rctx = (struct r600_context *)ctx; - struct r600_shader_key key; + union r600_shader_key key; memset(&key, 0, sizeof(key)); - if (sel->type == PIPE_SHADER_FRAGMENT) { - key.color_two_side = rctx->rasterizer && rctx->rasterizer->two_side; - key.alpha_to_one = rctx->alpha_to_one && - rctx->rasterizer && rctx->rasterizer->multisample_enable && - !rctx->framebuffer.cb0_is_integer; - key.nr_cbufs = rctx->framebuffer.state.nr_cbufs; - /* Dual-source blending only makes sense with nr_cbufs == 1. */ - if (key.nr_cbufs == 1 && rctx->dual_src_blend) - key.nr_cbufs = 2; - } else if (sel->type == PIPE_SHADER_VERTEX) { - key.vs_as_es = (rctx->gs_shader != NULL); + switch (sel->type) { + case PIPE_SHADER_VERTEX: { + key.vs.as_es = (rctx->gs_shader != NULL); if (rctx->ps_shader->current->shader.gs_prim_id_input && !rctx->gs_shader) { - key.vs_as_gs_a = true; - key.vs_prim_id_out = rctx->ps_shader->current->shader.input[rctx->ps_shader->current->shader.ps_prim_id_input].spi_sid; + key.vs.as_gs_a = true; + key.vs.prim_id_out = rctx->ps_shader->current->shader.input[rctx->ps_shader->current->shader.ps_prim_id_input].spi_sid; } + break; + } + case PIPE_SHADER_GEOMETRY: + break; + case PIPE_SHADER_FRAGMENT: { + key.ps.color_two_side = rctx->rasterizer && rctx->rasterizer->two_side; + key.ps.alpha_to_one = rctx->alpha_to_one && + rctx->rasterizer && rctx->rasterizer->multisample_enable && + !rctx->framebuffer.cb0_is_integer; + key.ps.nr_cbufs = rctx->framebuffer.state.nr_cbufs; + /* Dual-source blending only makes sense with nr_cbufs == 1. */ + if (key.ps.nr_cbufs == 1 && rctx->dual_src_blend) + key.ps.nr_cbufs = 2; + break; } + default: + assert(0); + } + return key; } @@ -734,7 +744,7 @@ static int r600_shader_select(struct pipe_context *ctx, struct r600_pipe_shader_selector* sel, bool *dirty) { - struct r600_shader_key key; + union r600_shader_key key; struct r600_pipe_shader * shader = NULL; int r; diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c index 16ee541..81f3f45 100644 --- a/src/gallium/drivers/radeon/radeon_uvd.c +++ b/src/gallium/drivers/radeon/radeon_uvd.c @@ -209,8 +209,6 @@ static uint32_t profile2stream_type(struct ruvd_decoder *dec, unsigned family) static unsigned calc_ctx_size(struct ruvd_decoder *dec) { - unsigned width_in_mb, height_in_mb, ctx_size; - unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH); unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT); @@ -223,8 +221,7 @@ static unsigned calc_ctx_size(struct ruvd_decoder *dec) width = align (width, 16); height = align (height, 16); - ctx_size = ((width + 255) / 16)*((height + 255) / 16) * 16 * max_references + 52 * 1024; - return ctx_size; + return ((width + 255) / 16) * ((height + 255) / 16) * 16 * max_references + 52 * 1024; } /* calculate size of reference picture buffer */ diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 48972bd..b7450b6 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -586,7 +586,8 @@ void si_resource_copy_region(struct pipe_context *ctx, si_blitter_begin(ctx, SI_COPY); util_blitter_blit_generic(sctx->blitter, dst_view, &dstbox, src_view, src_box, src_width0, src_height0, - PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL); + PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, + FALSE); si_blitter_end(ctx); pipe_surface_reference(&dst_view, NULL); diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 4288e9b..fa6c15a 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -2277,7 +2277,7 @@ static void tex_fetch_args( unsigned sampler_index; unsigned num_deriv_channels = 0; bool has_offset = HAVE_LLVM >= 0x0305 ? inst->Texture.NumOffsets > 0 : false; - LLVMValueRef res_ptr, samp_ptr; + LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL; sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1; sampler_index = emit_data->inst->Src[sampler_src].Register.Index; @@ -2293,9 +2293,19 @@ static void tex_fetch_args( samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER); samp_ptr = build_indexed_load_const(si_shader_ctx, samp_ptr, ind_index); + + if (target == TGSI_TEXTURE_2D_MSAA || + target == TGSI_TEXTURE_2D_ARRAY_MSAA) { + ind_index = LLVMBuildAdd(gallivm->builder, ind_index, + lp_build_const_int32(gallivm, + SI_FMASK_TEX_OFFSET), ""); + fmask_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE); + fmask_ptr = build_indexed_load_const(si_shader_ctx, res_ptr, ind_index); + } } else { res_ptr = si_shader_ctx->resources[sampler_index]; samp_ptr = si_shader_ctx->samplers[sampler_index]; + fmask_ptr = si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index]; } if (target == TGSI_TEXTURE_BUFFER) { @@ -2493,7 +2503,7 @@ static void tex_fetch_args( txf_emit_data.dst_type = LLVMVectorType( LLVMInt32TypeInContext(gallivm->context), 4); txf_emit_data.args[0] = lp_build_gather_values(gallivm, txf_address, txf_count); - txf_emit_data.args[1] = si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index]; + txf_emit_data.args[1] = fmask_ptr; txf_emit_data.args[2] = lp_build_const_int32(gallivm, inst.Texture.Texture); txf_emit_data.arg_count = 3; @@ -2524,8 +2534,7 @@ static void tex_fetch_args( * resource descriptor is 0 (invalid), */ LLVMValueRef fmask_desc = - LLVMBuildBitCast(gallivm->builder, - si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index], + LLVMBuildBitCast(gallivm->builder, fmask_ptr, LLVMVectorType(uint_bld->elem_type, 8), ""); LLVMValueRef fmask_word1 = @@ -3973,7 +3982,7 @@ static void si_dump_key(unsigned shader, union si_shader_key *key) fprintf(stderr, " es_enabled_outputs = 0x%"PRIx64"\n", key->vs.es_enabled_outputs); fprintf(stderr, " as_es = %u\n", key->vs.as_es); - fprintf(stderr, " as_es = %u\n", key->vs.as_ls); + fprintf(stderr, " as_ls = %u\n", key->vs.as_ls); break; case PIPE_SHADER_TESS_CTRL: diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h index 654c46f..3a63af8 100644 --- a/src/gallium/drivers/vc4/vc4_context.h +++ b/src/gallium/drivers/vc4/vc4_context.h @@ -270,6 +270,7 @@ struct vc4_context { struct ra_regs *regs; unsigned int reg_class_any; + unsigned int reg_class_a_or_b_or_acc; unsigned int reg_class_r4_or_a; unsigned int reg_class_a; diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c index 7978ea1..5b43583 100644 --- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c +++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c @@ -143,15 +143,6 @@ qir_opt_algebraic(struct vc4_compile *c) case QOP_SEL_X_Y_ZC: case QOP_SEL_X_Y_NS: case QOP_SEL_X_Y_NC: - if (qir_reg_equals(inst->src[0], inst->src[1])) { - /* Turn "dst = (sf == x) ? a : a)" into - * "dst = a" - */ - replace_with_mov(c, inst, inst->src[1]); - progress = true; - break; - } - if (is_zero(c, inst->src[1])) { /* Replace references to a 0 uniform value * with the SEL_X_0 equivalent. @@ -207,6 +198,7 @@ qir_opt_algebraic(struct vc4_compile *c) /* FADD(a, FSUB(0, b)) -> FSUB(a, b) */ if (inst->src[1].file == QFILE_TEMP && + c->defs[inst->src[1].index] && c->defs[inst->src[1].index]->op == QOP_FSUB) { struct qinst *fsub = c->defs[inst->src[1].index]; if (is_zero(c, fsub->src[0])) { @@ -221,6 +213,7 @@ qir_opt_algebraic(struct vc4_compile *c) /* FADD(FSUB(0, b), a) -> FSUB(a, b) */ if (inst->src[0].file == QFILE_TEMP && + c->defs[inst->src[0].index] && c->defs[inst->src[0].index]->op == QOP_FSUB) { struct qinst *fsub = c->defs[inst->src[0].index]; if (is_zero(c, fsub->src[0])) { @@ -236,18 +229,20 @@ qir_opt_algebraic(struct vc4_compile *c) break; case QOP_FMUL: - if (replace_x_0_with_0(c, inst, 0) || - replace_x_0_with_0(c, inst, 1) || - fmul_replace_one(c, inst, 0) || - fmul_replace_one(c, inst, 1)) { + if (!inst->dst.pack && + (replace_x_0_with_0(c, inst, 0) || + replace_x_0_with_0(c, inst, 1) || + fmul_replace_one(c, inst, 0) || + fmul_replace_one(c, inst, 1))) { progress = true; break; } break; case QOP_MUL24: - if (replace_x_0_with_0(c, inst, 0) || - replace_x_0_with_0(c, inst, 1)) { + if (!inst->dst.pack && + (replace_x_0_with_0(c, inst, 0) || + replace_x_0_with_0(c, inst, 1))) { progress = true; break; } @@ -280,6 +275,14 @@ qir_opt_algebraic(struct vc4_compile *c) } break; + case QOP_RCP: + if (is_1f(c, inst->src[0])) { + replace_with_mov(c, inst, inst->src[0]); + progress = true; + break; + } + break; + default: break; } diff --git a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c index a755de9..fd2539a 100644 --- a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c +++ b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c @@ -39,21 +39,27 @@ qir_opt_copy_propagation(struct vc4_compile *c) { bool progress = false; bool debug = false; - struct qreg *movs = calloc(c->num_temps, sizeof(struct qreg)); list_for_each_entry(struct qinst, inst, &c->instructions, link) { for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { int index = inst->src[i].index; if (inst->src[i].file == QFILE_TEMP && - (movs[index].file == QFILE_TEMP || - movs[index].file == QFILE_UNIF)) { + c->defs[index] && + c->defs[index]->op == QOP_MOV && + (c->defs[index]->src[0].file == QFILE_TEMP || + c->defs[index]->src[0].file == QFILE_UNIF)) { + /* If it has a pack, it shouldn't be an SSA + * def. + */ + assert(!c->defs[index]->dst.pack); + if (debug) { fprintf(stderr, "Copy propagate: "); qir_dump_inst(c, inst); fprintf(stderr, "\n"); } - inst->src[i] = movs[index]; + inst->src[i] = c->defs[index]->src[0]; if (debug) { fprintf(stderr, "to: "); @@ -64,14 +70,6 @@ qir_opt_copy_propagation(struct vc4_compile *c) progress = true; } } - - if (inst->op == QOP_MOV && - inst->dst.file == QFILE_TEMP && - inst->src[0].file != QFILE_VPM) { - movs[inst->dst.index] = inst->src[0]; - } } - - free(movs); return progress; } diff --git a/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c b/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c index e04f028..f2cdf8f 100644 --- a/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c +++ b/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c @@ -68,7 +68,7 @@ qir_opt_vpm_writes(struct vc4_compile *c) continue; struct qinst *inst = c->defs[temp]; - if (qir_is_multi_instruction(inst)) + if (!inst || qir_is_multi_instruction(inst)) continue; if (qir_depends_on_flags(inst) || inst->sf) @@ -79,22 +79,6 @@ qir_opt_vpm_writes(struct vc4_compile *c) continue; } - /* A QOP_TEX_RESULT destination is r4, so we can't move - * accesses to it past another QOP_TEX_RESULT which would - * update it. - */ - int src; - for (src = 0; src < qir_get_op_nsrc(inst->op); src++) { - if (inst->src[src].file == QFILE_TEMP) { - if (c->defs[inst->src[src].index]->op == - QOP_TEX_RESULT) { - break; - } - } - } - if (src != qir_get_op_nsrc(inst->op)) - continue; - /* Move the generating instruction to the end of the program * to maintain the order of the VPM writes. */ diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 13c4721..e002983 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -818,6 +818,72 @@ declare_uniform_range(struct vc4_compile *c, uint32_t start, uint32_t size) c->ubo_ranges[array_id].used = false; } +static bool +ntq_src_is_only_ssa_def_user(nir_src *src) +{ + if (!src->is_ssa) + return false; + + if (!list_empty(&src->ssa->if_uses)) + return false; + + return (src->ssa->uses.next == &src->use_link && + src->ssa->uses.next->next == &src->ssa->uses); +} + +/** + * In general, emits a nir_pack_unorm_4x8 as a series of MOVs with the pack + * bit set. + * + * However, as an optimization, it tries to find the instructions generating + * the sources to be packed and just emit the pack flag there, if possible. + */ +static void +ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr) +{ + struct qreg result = qir_get_temp(c); + struct nir_alu_instr *vec4 = NULL; + + /* If packing from a vec4 op (as expected), identify it so that we can + * peek back at what generated its sources. + */ + if (instr->src[0].src.is_ssa && + instr->src[0].src.ssa->parent_instr->type == nir_instr_type_alu && + nir_instr_as_alu(instr->src[0].src.ssa->parent_instr)->op == + nir_op_vec4) { + vec4 = nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); + } + + for (int i = 0; i < 4; i++) { + int swiz = instr->src[0].swizzle[i]; + struct qreg src; + if (vec4) { + src = ntq_get_src(c, vec4->src[swiz].src, + vec4->src[swiz].swizzle[0]); + } else { + src = ntq_get_src(c, instr->src[0].src, swiz); + } + + if (vec4 && + ntq_src_is_only_ssa_def_user(&vec4->src[swiz].src) && + src.file == QFILE_TEMP && + c->defs[src.index] && + qir_is_mul(c->defs[src.index]) && + !c->defs[src.index]->dst.pack) { + struct qinst *rewrite = c->defs[src.index]; + c->defs[src.index] = NULL; + rewrite->dst = result; + rewrite->dst.pack = QPU_PACK_MUL_8A + i; + continue; + } + + qir_PACK_8_F(c, result, src, i); + } + + struct qreg *dest = ntq_get_dest(c, &instr->dest.dest); + *dest = result; +} + static void ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) { @@ -839,17 +905,7 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) } if (instr->op == nir_op_pack_unorm_4x8) { - struct qreg result; - for (int i = 0; i < 4; i++) { - struct qreg src = ntq_get_src(c, instr->src[0].src, - instr->src[0].swizzle[i]); - if (i == 0) - result = qir_PACK_8888_F(c, src); - else - result = qir_PACK_8_F(c, result, src, i); - } - struct qreg *dest = ntq_get_dest(c, &instr->dest.dest); - *dest = result; + ntq_emit_pack_unorm_4x8(c, instr); return; } @@ -1130,20 +1186,24 @@ emit_frag_end(struct vc4_compile *c) static void emit_scaled_viewport_write(struct vc4_compile *c, struct qreg rcp_w) { - struct qreg xyi[2]; + struct qreg packed = qir_get_temp(c); for (int i = 0; i < 2; i++) { struct qreg scale = qir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, 0); - xyi[i] = qir_FTOI(c, qir_FMUL(c, - qir_FMUL(c, - c->outputs[c->output_position_index + i], - scale), - rcp_w)); + struct qreg packed_chan = packed; + packed_chan.pack = QPU_PACK_A_16A + i; + + qir_FTOI_dest(c, packed_chan, + qir_FMUL(c, + qir_FMUL(c, + c->outputs[c->output_position_index + i], + scale), + rcp_w)); } - qir_VPM_WRITE(c, qir_PACK_SCALED(c, xyi[0], xyi[1])); + qir_VPM_WRITE(c, packed); } static void diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index 254140a..9d93071 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -71,12 +71,11 @@ static const struct qir_op_info qir_op_info[] = { [QOP_RSQ] = { "rsq", 1, 1, false, true }, [QOP_EXP2] = { "exp2", 1, 2, false, true }, [QOP_LOG2] = { "log2", 1, 2, false, true }, - [QOP_PACK_8888_F] = { "pack_8888_f", 1, 1, false, true }, - [QOP_PACK_8A_F] = { "pack_8a_f", 1, 2, false, true }, - [QOP_PACK_8B_F] = { "pack_8b_f", 1, 2, false, true }, - [QOP_PACK_8C_F] = { "pack_8c_f", 1, 2, false, true }, - [QOP_PACK_8D_F] = { "pack_8d_f", 1, 2, false, true }, - [QOP_PACK_SCALED] = { "pack_scaled", 1, 2, false, true }, + [QOP_PACK_8888_F] = { "pack_8888_f", 1, 1 }, + [QOP_PACK_8A_F] = { "pack_8a_f", 1, 1 }, + [QOP_PACK_8B_F] = { "pack_8b_f", 1, 1 }, + [QOP_PACK_8C_F] = { "pack_8c_f", 1, 1 }, + [QOP_PACK_8D_F] = { "pack_8d_f", 1, 1 }, [QOP_TLB_DISCARD_SETUP] = { "discard", 0, 1, true }, [QOP_TLB_STENCIL_SETUP] = { "tlb_stencil_setup", 0, 1, true }, [QOP_TLB_Z_WRITE] = { "tlb_z", 0, 1, true }, @@ -169,6 +168,18 @@ qir_is_multi_instruction(struct qinst *inst) } bool +qir_is_mul(struct qinst *inst) +{ + switch (inst->op) { + case QOP_FMUL: + case QOP_MUL24: + return true; + default: + return false; + } +} + +bool qir_is_tex(struct qinst *inst) { return inst->op >= QOP_TEX_S && inst->op <= QOP_TEX_DIRECT; @@ -273,6 +284,14 @@ qir_dump_inst(struct vc4_compile *c, struct qinst *inst) inst->sf ? ".sf" : ""); qir_print_reg(c, inst->dst, true); + if (inst->dst.pack) { + if (inst->dst.pack) { + if (qir_is_mul(inst)) + vc4_qpu_disasm_pack_mul(stderr, inst->dst.pack); + else + vc4_qpu_disasm_pack_a(stderr, inst->dst.pack); + } + } for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { fprintf(stderr, ", "); qir_print_reg(c, inst->src[i], false); @@ -348,7 +367,7 @@ qir_emit(struct vc4_compile *c, struct qinst *inst) if (inst->dst.file == QFILE_TEMP) c->defs[inst->dst.index] = inst; - list_addtail(&inst->link, &c->instructions); + qir_emit_nodef(c, inst); } bool @@ -389,8 +408,11 @@ qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst) struct qreg qir_follow_movs(struct vc4_compile *c, struct qreg reg) { - while (reg.file == QFILE_TEMP && c->defs[reg.index]->op == QOP_MOV) + while (reg.file == QFILE_TEMP && + c->defs[reg.index] && + c->defs[reg.index]->op == QOP_MOV) { reg = c->defs[reg.index]->src[0]; + } return reg; } diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index cade795..a2b21fa 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -58,6 +58,7 @@ enum qfile { struct qreg { enum qfile file; uint32_t index; + int pack; }; enum qop { @@ -104,7 +105,6 @@ enum qop { QOP_LOG2, QOP_VW_SETUP, QOP_VR_SETUP, - QOP_PACK_SCALED, QOP_PACK_8888_F, QOP_PACK_8A_F, QOP_PACK_8B_F, @@ -444,13 +444,20 @@ struct qreg qir_uniform(struct vc4_compile *c, enum quniform_contents contents, uint32_t data); void qir_reorder_uniforms(struct vc4_compile *c); + void qir_emit(struct vc4_compile *c, struct qinst *inst); +static inline void qir_emit_nodef(struct vc4_compile *c, struct qinst *inst) +{ + list_addtail(&inst->link, &c->instructions); +} + struct qreg qir_get_temp(struct vc4_compile *c); int qir_get_op_nsrc(enum qop qop); bool qir_reg_equals(struct qreg a, struct qreg b); bool qir_has_side_effects(struct vc4_compile *c, struct qinst *inst); bool qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst); bool qir_is_multi_instruction(struct qinst *inst); +bool qir_is_mul(struct qinst *inst); bool qir_is_tex(struct qinst *inst); bool qir_depends_on_flags(struct qinst *inst); bool qir_writes_r4(struct qinst *inst); @@ -509,6 +516,12 @@ qir_##name(struct vc4_compile *c, struct qreg a) \ struct qreg t = qir_get_temp(c); \ qir_emit(c, qir_inst(QOP_##name, t, a, c->undef)); \ return t; \ +} \ +static inline void \ +qir_##name##_dest(struct vc4_compile *c, struct qreg dest, \ + struct qreg a) \ +{ \ + qir_emit_nodef(c, qir_inst(QOP_##name, dest, a, c->undef)); \ } #define QIR_ALU2(name) \ @@ -518,6 +531,12 @@ qir_##name(struct vc4_compile *c, struct qreg a, struct qreg b) \ struct qreg t = qir_get_temp(c); \ qir_emit(c, qir_inst(QOP_##name, t, a, b)); \ return t; \ +} \ +static inline void \ +qir_##name##_dest(struct vc4_compile *c, struct qreg dest, \ + struct qreg a, struct qreg b) \ +{ \ + qir_emit_nodef(c, qir_inst(QOP_##name, dest, a, b)); \ } #define QIR_NODST_1(name) \ @@ -534,6 +553,14 @@ qir_##name(struct vc4_compile *c, struct qreg a, struct qreg b) \ qir_emit(c, qir_inst(QOP_##name, c->undef, a, b)); \ } +#define QIR_PACK(name) \ +static inline struct qreg \ +qir_##name(struct vc4_compile *c, struct qreg dest, struct qreg a) \ +{ \ + qir_emit_nodef(c, qir_inst(QOP_##name, dest, a, c->undef)); \ + return dest; \ +} + QIR_ALU1(MOV) QIR_ALU2(FADD) QIR_ALU2(FSUB) @@ -570,12 +597,11 @@ QIR_ALU1(RCP) QIR_ALU1(RSQ) QIR_ALU1(EXP2) QIR_ALU1(LOG2) -QIR_ALU2(PACK_SCALED) QIR_ALU1(PACK_8888_F) -QIR_ALU2(PACK_8A_F) -QIR_ALU2(PACK_8B_F) -QIR_ALU2(PACK_8C_F) -QIR_ALU2(PACK_8D_F) +QIR_PACK(PACK_8A_F) +QIR_PACK(PACK_8B_F) +QIR_PACK(PACK_8C_F) +QIR_PACK(PACK_8D_F) QIR_ALU1(VARY_ADD_C) QIR_NODST_2(TEX_S) QIR_NODST_2(TEX_T) @@ -627,11 +653,12 @@ qir_UNPACK_16_I(struct vc4_compile *c, struct qreg src, int i) } static inline struct qreg -qir_PACK_8_F(struct vc4_compile *c, struct qreg rest, struct qreg val, int chan) +qir_PACK_8_F(struct vc4_compile *c, struct qreg dest, struct qreg val, int chan) { - struct qreg t = qir_get_temp(c); - qir_emit(c, qir_inst(QOP_PACK_8A_F + chan, t, rest, val)); - return t; + qir_emit(c, qir_inst(QOP_PACK_8A_F + chan, dest, val, c->undef)); + if (dest.file == QFILE_TEMP) + c->defs[dest.index] = NULL; + return dest; } static inline struct qreg diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h index fbb90ba..0719d28 100644 --- a/src/gallium/drivers/vc4/vc4_qpu.h +++ b/src/gallium/drivers/vc4/vc4_qpu.h @@ -24,6 +24,7 @@ #ifndef VC4_QPU_H #define VC4_QPU_H +#include <stdio.h> #include <stdint.h> #include "util/u_math.h" @@ -206,6 +207,12 @@ void vc4_qpu_disasm(const uint64_t *instructions, int num_instructions); void +vc4_qpu_disasm_pack_mul(FILE *out, uint32_t pack); + +void +vc4_qpu_disasm_pack_a(FILE *out, uint32_t pack); + +void vc4_qpu_validate(uint64_t *insts, uint32_t num_inst); #endif /* VC4_QPU_H */ diff --git a/src/gallium/drivers/vc4/vc4_qpu_disasm.c b/src/gallium/drivers/vc4/vc4_qpu_disasm.c index 00aeb30..0879787 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_disasm.c +++ b/src/gallium/drivers/vc4/vc4_qpu_disasm.c @@ -245,6 +245,18 @@ get_special_write_desc(int reg, bool is_a) return special_write[reg]; } +void +vc4_qpu_disasm_pack_mul(FILE *out, uint32_t pack) +{ + fprintf(out, ".%s", DESC(qpu_pack_mul, pack)); +} + +void +vc4_qpu_disasm_pack_a(FILE *out, uint32_t pack) +{ + fprintf(out, "%s", DESC(qpu_pack_a, pack)); +} + static void print_alu_dst(uint64_t inst, bool is_mul) { @@ -263,9 +275,9 @@ print_alu_dst(uint64_t inst, bool is_mul) fprintf(stderr, "%s%d?", file, waddr); if (is_mul && (inst & QPU_PM)) { - fprintf(stderr, ".%s", DESC(qpu_pack_mul, pack)); + vc4_qpu_disasm_pack_mul(stderr, pack); } else if (is_a && !(inst & QPU_PM)) { - fprintf(stderr, "%s", DESC(qpu_pack_a, pack)); + vc4_qpu_disasm_pack_a(stderr, pack); } } diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index f324056..adf3a8b 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -179,10 +179,9 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) static const struct { uint32_t op; - bool is_mul; } translate[] = { -#define A(name) [QOP_##name] = {QPU_A_##name, false} -#define M(name) [QOP_##name] = {QPU_M_##name, true} +#define A(name) [QOP_##name] = {QPU_A_##name} +#define M(name) [QOP_##name] = {QPU_M_##name} A(FADD), A(FSUB), A(FMIN), @@ -336,28 +335,12 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) case QOP_PACK_8B_F: case QOP_PACK_8C_F: case QOP_PACK_8D_F: - /* If dst doesn't happen to already contain src[0], - * then we have to move it in. - */ - if (qinst->src[0].file != QFILE_NULL && - (src[0].mux != dst.mux || src[0].addr != dst.addr)) { - /* Don't overwrite src1 while setting up - * the dst! - */ - if (dst.mux == src[1].mux && - dst.addr == src[1].addr) { - queue(c, qpu_m_MOV(qpu_rb(31), src[1])); - src[1] = qpu_rb(31); - } - - queue(c, qpu_m_MOV(dst, src[0])); - } - - queue(c, qpu_m_MOV(dst, src[1])); - *last_inst(c) |= QPU_PM; - *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + - qinst->op - QOP_PACK_8A_F, - QPU_PACK); + queue(c, + qpu_m_MOV(dst, src[0]) | + QPU_PM | + QPU_SET_FIELD(QPU_PACK_MUL_8A + + qinst->op - QOP_PACK_8A_F, + QPU_PACK)); break; case QOP_FRAG_X: @@ -419,24 +402,6 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) queue(c, qpu_a_FADD(dst, src[0], qpu_r5())); break; - case QOP_PACK_SCALED: { - uint64_t a = (qpu_a_MOV(dst, src[0]) | - QPU_SET_FIELD(QPU_PACK_A_16A, - QPU_PACK)); - uint64_t b = (qpu_a_MOV(dst, src[1]) | - QPU_SET_FIELD(QPU_PACK_A_16B, - QPU_PACK)); - - if (dst.mux == src[1].mux && dst.addr == src[1].addr) { - queue(c, b); - queue(c, a); - } else { - queue(c, a); - queue(c, b); - } - break; - } - case QOP_TEX_S: case QOP_TEX_T: case QOP_TEX_R: @@ -529,14 +494,24 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) fixup_raddr_conflict(c, dst, &src[0], &src[1]); - if (translate[qinst->op].is_mul) { + if (qir_is_mul(qinst)) { queue(c, qpu_m_alu2(translate[qinst->op].op, dst, src[0], src[1])); + if (qinst->dst.pack) { + *last_inst(c) |= QPU_PM; + *last_inst(c) |= QPU_SET_FIELD(qinst->dst.pack, + QPU_PACK); + } } else { queue(c, qpu_a_alu2(translate[qinst->op].op, dst, src[0], src[1])); + if (qinst->dst.pack) { + assert(dst.mux == QPU_MUX_A); + *last_inst(c) |= QPU_SET_FIELD(qinst->dst.pack, + QPU_PACK); + } } break; diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c index a29db1f..3ced50f 100644 --- a/src/gallium/drivers/vc4/vc4_register_allocate.c +++ b/src/gallium/drivers/vc4/vc4_register_allocate.c @@ -113,9 +113,10 @@ vc4_alloc_reg_set(struct vc4_context *vc4) if (vc4->regs) return; - vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs)); + vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs), true); vc4->reg_class_any = ra_alloc_reg_class(vc4->regs); + vc4->reg_class_a_or_b_or_acc = ra_alloc_reg_class(vc4->regs); vc4->reg_class_r4_or_a = ra_alloc_reg_class(vc4->regs); vc4->reg_class_a = ra_alloc_reg_class(vc4->regs); for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) { @@ -130,10 +131,12 @@ vc4_alloc_reg_set(struct vc4_context *vc4) */ if (vc4_regs[i].mux == QPU_MUX_R4) { ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i); + ra_class_add_reg(vc4->regs, vc4->reg_class_any, i); continue; } ra_class_add_reg(vc4->regs, vc4->reg_class_any, i); + ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc, i); } for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) { @@ -177,7 +180,8 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) uint8_t class_bits[c->num_temps]; struct qpu_reg *temp_registers = calloc(c->num_temps, sizeof(*temp_registers)); - memset(def, 0, sizeof(def)); + for (int i = 0; i < ARRAY_SIZE(def); i++) + def[i] = ~0; memset(use, 0, sizeof(use)); /* If things aren't ever written (undefined values), just read from @@ -196,7 +200,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) uint32_t ip = 0; list_for_each_entry(struct qinst, inst, &c->instructions, link) { if (inst->dst.file == QFILE_TEMP) { - def[inst->dst.index] = ip; + def[inst->dst.index] = MIN2(ip, def[inst->dst.index]); use[inst->dst.index] = ip; } @@ -267,17 +271,33 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2); break; - case QOP_PACK_SCALED: - /* The pack flags require an A-file dst register. */ - class_bits[inst->dst.index] &= CLASS_BIT_A; - break; - default: break; } + if (inst->dst.pack && !qir_is_mul(inst)) { + /* The non-MUL pack flags require an A-file dst + * register. + */ + class_bits[inst->dst.index] &= CLASS_BIT_A; + } + if (qir_src_needs_a_file(inst)) { - class_bits[inst->src[0].index] &= CLASS_BIT_A; + switch (inst->op) { + case QOP_UNPACK_8A_F: + case QOP_UNPACK_8B_F: + case QOP_UNPACK_8C_F: + case QOP_UNPACK_8D_F: + /* Special case: these can be done as R4 + * unpacks, as well. + */ + class_bits[inst->src[0].index] &= (CLASS_BIT_A | + CLASS_BIT_R4); + break; + default: + class_bits[inst->src[0].index] &= CLASS_BIT_A; + break; + } } ip++; } @@ -287,9 +307,11 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) switch (class_bits[i]) { case CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4: - case CLASS_BIT_A | CLASS_BIT_B_OR_ACC: ra_set_node_class(g, node, vc4->reg_class_any); break; + case CLASS_BIT_A | CLASS_BIT_B_OR_ACC: + ra_set_node_class(g, node, vc4->reg_class_a_or_b_or_acc); + break; case CLASS_BIT_A | CLASS_BIT_R4: ra_set_node_class(g, node, vc4->reg_class_r4_or_a); break; |