diff options
Diffstat (limited to 'src/mesa')
37 files changed, 1109 insertions, 630 deletions
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources index cea1e87..5d69039 100644 --- a/src/mesa/drivers/dri/i965/Makefile.sources +++ b/src/mesa/drivers/dri/i965/Makefile.sources @@ -1,6 +1,7 @@ i965_compiler_FILES = \ brw_cfg.cpp \ brw_cfg.h \ + brw_compiler.c \ brw_compiler.h \ brw_dead_control_flow.cpp \ brw_dead_control_flow.h \ @@ -51,6 +52,7 @@ i965_compiler_FILES = \ brw_shader.cpp \ brw_shader.h \ brw_surface_formats.c \ + brw_surface_formats.h \ brw_util.c \ brw_util.h \ brw_vec4_builder.h \ diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp index c6ae3d8..fd23e23 100644 --- a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp +++ b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp @@ -85,7 +85,7 @@ brw_blorp_eu_emitter::emit_texture_lookup(const struct brw_reg &dst, unsigned msg_length) { fs_inst *inst = new (mem_ctx) fs_inst(op, 16, dst, brw_message_reg(base_mrf), - brw_imm_ud(0u)); + brw_imm_ud(0u), brw_imm_ud(0u)); inst->base_mrf = base_mrf; inst->mlen = msg_length; diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c b/src/mesa/drivers/dri/i965/brw_compiler.c new file mode 100644 index 0000000..3d93772 --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_compiler.c @@ -0,0 +1,180 @@ +/* + * Copyright © 2015-2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_compiler.h" +#include "brw_context.h" +#include "glsl/nir/nir.h" +#include "main/errors.h" +#include "util/debug.h" + +static void +shader_debug_log_mesa(void *data, const char *fmt, ...) +{ + struct brw_context *brw = (struct brw_context *)data; + va_list args; + + va_start(args, fmt); + GLuint msg_id = 0; + _mesa_gl_vdebug(&brw->ctx, &msg_id, + MESA_DEBUG_SOURCE_SHADER_COMPILER, + MESA_DEBUG_TYPE_OTHER, + MESA_DEBUG_SEVERITY_NOTIFICATION, fmt, args); + va_end(args); +} + +static void +shader_perf_log_mesa(void *data, const char *fmt, ...) +{ + struct brw_context *brw = (struct brw_context *)data; + + va_list args; + va_start(args, fmt); + + if (unlikely(INTEL_DEBUG & DEBUG_PERF)) { + va_list args_copy; + va_copy(args_copy, args); + vfprintf(stderr, fmt, args_copy); + va_end(args_copy); + } + + if (brw->perf_debug) { + GLuint msg_id = 0; + _mesa_gl_vdebug(&brw->ctx, &msg_id, + MESA_DEBUG_SOURCE_SHADER_COMPILER, + MESA_DEBUG_TYPE_PERFORMANCE, + MESA_DEBUG_SEVERITY_MEDIUM, fmt, args); + } + va_end(args); +} + +#define COMMON_OPTIONS \ + /* In order to help allow for better CSE at the NIR level we tell NIR to \ + * split all ffma instructions during opt_algebraic and we then re-combine \ + * them as a later step. \ + */ \ + .lower_ffma = true, \ + .lower_sub = true, \ + .lower_fdiv = true, \ + .lower_scmp = true, \ + .lower_fmod = true, \ + .lower_bitfield_extract = true, \ + .lower_bitfield_insert = true, \ + .lower_uadd_carry = true, \ + .lower_usub_borrow = true, \ + .lower_fdiv = true, \ + .native_integers = true, \ + .vertex_id_zero_based = true + +static const struct nir_shader_compiler_options scalar_nir_options = { + COMMON_OPTIONS, + .lower_pack_half_2x16 = true, + .lower_pack_snorm_2x16 = true, + .lower_pack_snorm_4x8 = true, + .lower_pack_unorm_2x16 = true, + .lower_pack_unorm_4x8 = true, + .lower_unpack_half_2x16 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_snorm_4x8 = true, + .lower_unpack_unorm_2x16 = true, + .lower_unpack_unorm_4x8 = true, +}; + +static const struct nir_shader_compiler_options vector_nir_options = { + COMMON_OPTIONS, + + /* In the vec4 backend, our dpN instruction replicates its result to all the + * components of a vec4. We would like NIR to give us replicated fdot + * instructions because it can optimize better for us. + */ + .fdot_replicates = true, + + .lower_pack_snorm_2x16 = true, + .lower_pack_unorm_2x16 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_unorm_2x16 = true, + .lower_extract_byte = true, + .lower_extract_word = true, +}; + +struct brw_compiler * +brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo) +{ + struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler); + + compiler->devinfo = devinfo; + compiler->shader_debug_log = shader_debug_log_mesa; + compiler->shader_perf_log = shader_perf_log_mesa; + + brw_fs_alloc_reg_sets(compiler); + brw_vec4_alloc_reg_set(compiler); + + compiler->scalar_stage[MESA_SHADER_VERTEX] = + devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS); + compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false; + compiler->scalar_stage[MESA_SHADER_TESS_EVAL] = + devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TES", true); + compiler->scalar_stage[MESA_SHADER_GEOMETRY] = + devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", true); + compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true; + compiler->scalar_stage[MESA_SHADER_COMPUTE] = true; + + /* We want the GLSL compiler to emit code that uses condition codes */ + for (int i = 0; i < MESA_SHADER_STAGES; i++) { + compiler->glsl_compiler_options[i].MaxUnrollIterations = 32; + compiler->glsl_compiler_options[i].MaxIfDepth = + devinfo->gen < 6 ? 16 : UINT_MAX; + + compiler->glsl_compiler_options[i].EmitCondCodes = true; + compiler->glsl_compiler_options[i].EmitNoNoise = true; + compiler->glsl_compiler_options[i].EmitNoMainReturn = true; + compiler->glsl_compiler_options[i].EmitNoIndirectInput = true; + compiler->glsl_compiler_options[i].EmitNoIndirectUniform = false; + compiler->glsl_compiler_options[i].LowerClipDistance = true; + + bool is_scalar = compiler->scalar_stage[i]; + + compiler->glsl_compiler_options[i].EmitNoIndirectOutput = is_scalar; + compiler->glsl_compiler_options[i].EmitNoIndirectTemp = is_scalar; + compiler->glsl_compiler_options[i].OptimizeForAOS = !is_scalar; + + /* !ARB_gpu_shader5 */ + if (devinfo->gen < 7) + compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true; + + compiler->glsl_compiler_options[i].NirOptions = + is_scalar ? &scalar_nir_options : &vector_nir_options; + + compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true; + } + + compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false; + compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false; + + if (compiler->scalar_stage[MESA_SHADER_GEOMETRY]) + compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false; + + compiler->glsl_compiler_options[MESA_SHADER_COMPUTE] + .LowerShaderSharedVariables = true; + + return compiler; +} diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h index 748ffe5..cd28bbb 100644 --- a/src/mesa/drivers/dri/i965/brw_compiler.h +++ b/src/mesa/drivers/dri/i965/brw_compiler.h @@ -94,6 +94,9 @@ struct brw_compiler { struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES]; }; +struct brw_compiler * +brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo); + /** * Program key structures. @@ -687,6 +690,9 @@ struct brw_gs_prog_data /** @} */ +struct brw_compiler * +brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo); + /** * Compile a vertex shader. * diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 10a6d39..9edb6f5 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -63,6 +63,7 @@ # define GEN7_3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL (0 << 8) # define GEN7_3DPRIM_VERTEXBUFFER_ACCESS_RANDOM (1 << 8) +#ifndef _3DPRIM_POINTLIST /* FIXME: Avoid clashing with defines from bdw_pack.h */ #define _3DPRIM_POINTLIST 0x01 #define _3DPRIM_LINELIST 0x02 #define _3DPRIM_LINESTRIP 0x03 @@ -86,6 +87,7 @@ #define _3DPRIM_TRIFAN_NOSTIPPLE 0x16 #define _3DPRIM_PATCHLIST(n) ({ assert(n > 0 && n <= 32); 0x20 + (n - 1); }) +#endif /* bdw_pack.h */ /* We use this offset to be able to pass native primitive types in struct * _mesa_prim::mode. Native primitive types are BRW_PRIM_OFFSET + @@ -1085,6 +1087,18 @@ enum opcode { */ SHADER_OPCODE_BROADCAST, + /** + * Pick the byte from its first source register given by the index + * specified as second source. + */ + SHADER_OPCODE_EXTRACT_BYTE, + + /** + * Pick the word from its first source register given by the index + * specified as second source. + */ + SHADER_OPCODE_EXTRACT_WORD, + VEC4_OPCODE_MOV_BYTES, VEC4_OPCODE_PACK_BYTES, VEC4_OPCODE_UNPACK_UNIFORM, diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c index 7ab70fe..0587225 100644 --- a/src/mesa/drivers/dri/i965/brw_device_info.c +++ b/src/mesa/drivers/dri/i965/brw_device_info.c @@ -483,3 +483,15 @@ brw_get_device_info(int devid) return devinfo; } + +const char * +brw_get_device_name(int devid) +{ + switch (devid) { +#undef CHIPSET +#define CHIPSET(id, family, name) case id: return name; +#include "pci_ids/i965_pci_ids.h" + default: + return NULL; + } +} diff --git a/src/mesa/drivers/dri/i965/brw_device_info.h b/src/mesa/drivers/dri/i965/brw_device_info.h index 73d6820..48e0dee 100644 --- a/src/mesa/drivers/dri/i965/brw_device_info.h +++ b/src/mesa/drivers/dri/i965/brw_device_info.h @@ -98,3 +98,4 @@ struct brw_device_info }; const struct brw_device_info *brw_get_device_info(int devid); +const char *brw_get_device_name(int devid); diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c index 8737c64..23e71fd 100644 --- a/src/mesa/drivers/dri/i965/brw_draw.c +++ b/src/mesa/drivers/dri/i965/brw_draw.c @@ -54,23 +54,6 @@ #define FILE_DEBUG_FLAG DEBUG_PRIMS -static const GLuint prim_to_hw_prim[GL_TRIANGLE_STRIP_ADJACENCY+1] = { - [GL_POINTS] =_3DPRIM_POINTLIST, - [GL_LINES] = _3DPRIM_LINELIST, - [GL_LINE_LOOP] = _3DPRIM_LINELOOP, - [GL_LINE_STRIP] = _3DPRIM_LINESTRIP, - [GL_TRIANGLES] = _3DPRIM_TRILIST, - [GL_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP, - [GL_TRIANGLE_FAN] = _3DPRIM_TRIFAN, - [GL_QUADS] = _3DPRIM_QUADLIST, - [GL_QUAD_STRIP] = _3DPRIM_QUADSTRIP, - [GL_POLYGON] = _3DPRIM_POLYGON, - [GL_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ, - [GL_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ, - [GL_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ, - [GL_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ, -}; - static const GLenum reduced_prim[GL_POLYGON+1] = { [GL_POINTS] = GL_POINTS, @@ -85,18 +68,6 @@ static const GLenum reduced_prim[GL_POLYGON+1] = { [GL_POLYGON] = GL_TRIANGLES }; -uint32_t -get_hw_prim_for_gl_prim(int mode) -{ - if (mode >= BRW_PRIM_OFFSET) - return mode - BRW_PRIM_OFFSET; - else { - assert(mode < ARRAY_SIZE(prim_to_hw_prim)); - return prim_to_hw_prim[mode]; - } -} - - /* When the primitive changes, set a state bit and re-validate. Not * the nicest and would rather deal with this by having all the * programs be immune to the active primitive (ie. cope with all diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index cbeab6f..922f720 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -174,7 +174,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, * CSE can later notice that those loads are all the same and eliminate * the redundant ones. */ - fs_reg vec4_offset = vgrf(glsl_type::int_type); + fs_reg vec4_offset = vgrf(glsl_type::uint_type); bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf)); int scale = 1; @@ -433,7 +433,6 @@ fs_reg::fs_reg(struct ::brw_reg reg) : { this->reg_offset = 0; this->subreg_offset = 0; - this->reladdr = NULL; this->stride = 1; if (this->file == IMM && (this->type != BRW_REGISTER_TYPE_V && @@ -448,7 +447,6 @@ fs_reg::equals(const fs_reg &r) const { return (this->backend_reg::equals(r) && subreg_offset == r.subreg_offset && - !reladdr && !r.reladdr && stride == r.stride); } @@ -510,6 +508,7 @@ type_size_scalar(const struct glsl_type *type) case GLSL_TYPE_ERROR: case GLSL_TYPE_INTERFACE: case GLSL_TYPE_DOUBLE: + case GLSL_TYPE_FUNCTION: unreachable("not reached"); } @@ -739,15 +738,15 @@ fs_inst::components_read(unsigned i) const case SHADER_OPCODE_LOD_LOGICAL: case SHADER_OPCODE_TG4_LOGICAL: case SHADER_OPCODE_TG4_OFFSET_LOGICAL: - assert(src[8].file == IMM && src[9].file == IMM); + assert(src[9].file == IMM && src[10].file == IMM); /* Texture coordinates. */ if (i == 0) - return src[8].ud; + return src[9].ud; /* Texture derivatives. */ else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL) - return src[9].ud; + return src[10].ud; /* Texture offset. */ - else if (i == 7) + else if (i == 8) return 2; /* MCS */ else if (i == 5 && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL) @@ -850,7 +849,10 @@ fs_inst::regs_read(int arg) const assert(src[2].file == IMM); unsigned region_length = src[2].ud; - if (src[0].file == FIXED_GRF) { + if (src[0].file == UNIFORM) { + assert(region_length % 4 == 0); + return region_length / 4; + } else if (src[0].file == FIXED_GRF) { /* If the start of the region is not register aligned, then * there's some portion of the register that's technically * unread at the beginning. @@ -864,7 +866,7 @@ fs_inst::regs_read(int arg) const * unread portion at the beginning. */ if (src[0].subnr) - region_length += src[0].subnr * type_sz(src[0].type); + region_length += src[0].subnr; return DIV_ROUND_UP(region_length, REG_SIZE); } else { @@ -1020,7 +1022,6 @@ fs_visitor::import_uniforms(fs_visitor *v) this->push_constant_loc = v->push_constant_loc; this->pull_constant_loc = v->pull_constant_loc; this->uniforms = v->uniforms; - this->param_size = v->param_size; } fs_reg * @@ -1923,9 +1924,7 @@ fs_visitor::compact_virtual_grfs() * maximum number of fragment shader uniform components (64). If * there are too many of these, they'd fill up all of register space. * So, this will push some of them out to the pull constant buffer and - * update the program to load them. We also use pull constants for all - * indirect constant loads because we don't support indirect accesses in - * registers yet. + * update the program to load them. */ void fs_visitor::assign_constant_locations() @@ -1934,20 +1933,21 @@ fs_visitor::assign_constant_locations() if (dispatch_width != 8) return; - unsigned int num_pull_constants = 0; - - pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); - memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms); - bool is_live[uniforms]; memset(is_live, 0, sizeof(is_live)); + /* For each uniform slot, a value of true indicates that the given slot and + * the next slot must remain contiguous. This is used to keep us from + * splitting arrays apart. + */ + bool contiguous[uniforms]; + memset(contiguous, 0, sizeof(contiguous)); + /* First, we walk through the instructions and do two things: * * 1) Figure out which uniforms are live. * - * 2) Find all indirect access of uniform arrays and flag them as needing - * to go into the pull constant buffer. + * 2) Mark any indirectly used ranges of registers as contiguous. * * Note that we don't move constant-indexed accesses to arrays. No * testing has been done of the performance impact of this choice. @@ -1957,20 +1957,19 @@ fs_visitor::assign_constant_locations() if (inst->src[i].file != UNIFORM) continue; - if (inst->src[i].reladdr) { - int uniform = inst->src[i].nr; + int constant_nr = inst->src[i].nr + inst->src[i].reg_offset; - /* If this array isn't already present in the pull constant buffer, - * add it. - */ - if (pull_constant_loc[uniform] == -1) { - assert(param_size[uniform]); - for (int j = 0; j < param_size[uniform]; j++) - pull_constant_loc[uniform + j] = num_pull_constants++; + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { + assert(inst->src[2].ud % 4 == 0); + unsigned last = constant_nr + (inst->src[2].ud / 4) - 1; + assert(last < uniforms); + + for (unsigned j = constant_nr; j < last; j++) { + is_live[j] = true; + contiguous[j] = true; } + is_live[last] = true; } else { - /* Mark the the one accessed uniform as live */ - int constant_nr = inst->src[i].nr + inst->src[i].reg_offset; if (constant_nr >= 0 && constant_nr < (int) uniforms) is_live[constant_nr] = true; } @@ -1985,29 +1984,48 @@ fs_visitor::assign_constant_locations() * If changing this value, note the limitation about total_regs in * brw_curbe.c. */ - unsigned int max_push_components = 16 * 8; + const unsigned int max_push_components = 16 * 8; + + /* For vulkan we don't limit the max_chunk_size. We set it to 32 float = + * 128 bytes, which is the maximum vulkan push constant size. + */ + const unsigned int max_chunk_size = 32; + unsigned int num_push_constants = 0; + unsigned int num_pull_constants = 0; push_constant_loc = ralloc_array(mem_ctx, int, uniforms); + pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); - for (unsigned int i = 0; i < uniforms; i++) { - if (!is_live[i] || pull_constant_loc[i] != -1) { - /* This UNIFORM register is either dead, or has already been demoted - * to a pull const. Mark it as no longer living in the param[] array. - */ - push_constant_loc[i] = -1; + int chunk_start = -1; + for (unsigned u = 0; u < uniforms; u++) { + push_constant_loc[u] = -1; + pull_constant_loc[u] = -1; + + if (!is_live[u]) continue; - } - if (num_push_constants < max_push_components) { - /* Retain as a push constant. Record the location in the params[] - * array. - */ - push_constant_loc[i] = num_push_constants++; - } else { - /* Demote to a pull constant. */ - push_constant_loc[i] = -1; - pull_constant_loc[i] = num_pull_constants++; + /* This is the first live uniform in the chunk */ + if (chunk_start < 0) + chunk_start = u; + + /* If this element does not need to be contiguous with the next, we + * split at this point and everthing between chunk_start and u forms a + * single chunk. + */ + if (!contiguous[u]) { + unsigned chunk_size = u - chunk_start + 1; + + if (num_push_constants + chunk_size <= max_push_components && + chunk_size <= max_chunk_size) { + for (unsigned j = chunk_start; j <= u; j++) + push_constant_loc[j] = num_push_constants++; + } else { + for (unsigned j = chunk_start; j <= u; j++) + pull_constant_loc[j] = num_pull_constants++; + } + + chunk_start = -1; } } @@ -2038,51 +2056,67 @@ fs_visitor::assign_constant_locations() * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs. */ void -fs_visitor::demote_pull_constants() +fs_visitor::lower_constant_loads() { - foreach_block_and_inst (block, fs_inst, inst, cfg) { + const unsigned index = stage_prog_data->binding_table.pull_constants_start; + + foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { + /* Set up the annotation tracking for new generated instructions. */ + const fs_builder ibld(this, block, inst); + for (int i = 0; i < inst->sources; i++) { if (inst->src[i].file != UNIFORM) continue; - int pull_index; + /* We'll handle this case later */ + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) + continue; + unsigned location = inst->src[i].nr + inst->src[i].reg_offset; - if (location >= uniforms) /* Out of bounds access */ - pull_index = -1; - else - pull_index = pull_constant_loc[location]; + if (location >= uniforms) + continue; /* Out of bounds access */ + + int pull_index = pull_constant_loc[location]; if (pull_index == -1) continue; - /* Set up the annotation tracking for new generated instructions. */ - const fs_builder ibld(this, block, inst); - const unsigned index = stage_prog_data->binding_table.pull_constants_start; - fs_reg dst = vgrf(glsl_type::float_type); - assert(inst->src[i].stride == 0); - /* Generate a pull load into dst. */ - if (inst->src[i].reladdr) { - VARYING_PULL_CONSTANT_LOAD(ibld, dst, - brw_imm_ud(index), - *inst->src[i].reladdr, - pull_index * 4); - inst->src[i].reladdr = NULL; - inst->src[i].stride = 1; - } else { - const fs_builder ubld = ibld.exec_all().group(8, 0); - struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15); - ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, - dst, brw_imm_ud(index), offset); - inst->src[i].set_smear(pull_index & 3); - } - brw_mark_surface_used(prog_data, index); + fs_reg dst = vgrf(glsl_type::float_type); + const fs_builder ubld = ibld.exec_all().group(8, 0); + struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15); + ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, + dst, brw_imm_ud(index), offset); /* Rewrite the instruction to use the temporary VGRF. */ inst->src[i].file = VGRF; inst->src[i].nr = dst.nr; inst->src[i].reg_offset = 0; + inst->src[i].set_smear(pull_index & 3); + + brw_mark_surface_used(prog_data, index); + } + + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && + inst->src[0].file == UNIFORM) { + + unsigned location = inst->src[0].nr + inst->src[0].reg_offset; + if (location >= uniforms) + continue; /* Out of bounds access */ + + int pull_index = pull_constant_loc[location]; + + if (pull_index == -1) + continue; + + VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst, + brw_imm_ud(index), + inst->src[1], + pull_index * 4); + inst->remove(block); + + brw_mark_surface_used(prog_data, index); } } invalidate_live_intervals(); @@ -2792,10 +2826,23 @@ fs_visitor::emit_repclear_shader() brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; int base_mrf = 1; int color_mrf = base_mrf + 2; + fs_inst *mov; - fs_inst *mov = bld.exec_all().group(4, 0) - .MOV(brw_message_reg(color_mrf), - fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)); + if (uniforms == 1) { + mov = bld.exec_all().group(4, 0) + .MOV(brw_message_reg(color_mrf), + fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)); + } else { + struct brw_reg reg = + brw_reg(BRW_GENERAL_REGISTER_FILE, + 2, 3, 0, 0, BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_8, + BRW_WIDTH_2, + BRW_HORIZONTAL_STRIDE_4, BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); + + mov = bld.exec_all().group(4, 0) + .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg)); + } fs_inst *write; if (key->nr_color_regions == 1) { @@ -2824,8 +2871,10 @@ fs_visitor::emit_repclear_shader() assign_curb_setup(); /* Now that we have the uniform assigned, go ahead and force it to a vec4. */ - assert(mov->src[0].file == FIXED_GRF); - mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0); + if (uniforms == 1) { + assert(mov->src[0].file == FIXED_GRF); + mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0); + } } /** @@ -3651,6 +3700,7 @@ lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op, const fs_reg &coordinate, const fs_reg &shadow_c, const fs_reg &lod, const fs_reg &lod2, + const fs_reg &surface, const fs_reg &sampler, unsigned coord_components, unsigned grad_components) @@ -3743,8 +3793,9 @@ lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op, inst->opcode = op; inst->src[0] = reg_undef; - inst->src[1] = sampler; - inst->resize_sources(2); + inst->src[1] = surface; + inst->src[2] = sampler; + inst->resize_sources(3); inst->base_mrf = msg_begin.nr; inst->mlen = msg_end.nr - msg_begin.nr; inst->header_size = 1; @@ -3756,6 +3807,7 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op, const fs_reg &shadow_c, fs_reg lod, fs_reg lod2, const fs_reg &sample_index, + const fs_reg &surface, const fs_reg &sampler, const fs_reg &offset_value, unsigned coord_components, @@ -3838,8 +3890,9 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op, inst->opcode = op; inst->src[0] = reg_undef; - inst->src[1] = sampler; - inst->resize_sources(2); + inst->src[1] = surface; + inst->src[2] = sampler; + inst->resize_sources(3); inst->base_mrf = message.nr; inst->mlen = msg_end.nr - message.nr; inst->header_size = header_size; @@ -3863,7 +3916,9 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op, const fs_reg &shadow_c, fs_reg lod, fs_reg lod2, const fs_reg &sample_index, - const fs_reg &mcs, const fs_reg &sampler, + const fs_reg &mcs, + const fs_reg &surface, + const fs_reg &sampler, fs_reg offset_value, unsigned coord_components, unsigned grad_components) @@ -4066,8 +4121,9 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op, /* Generate the SEND. */ inst->opcode = op; inst->src[0] = src_payload; - inst->src[1] = sampler; - inst->resize_sources(2); + inst->src[1] = surface; + inst->src[2] = sampler; + inst->resize_sources(3); inst->base_mrf = -1; inst->mlen = mlen; inst->header_size = header_size; @@ -4086,25 +4142,27 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op) const fs_reg &lod2 = inst->src[3]; const fs_reg &sample_index = inst->src[4]; const fs_reg &mcs = inst->src[5]; - const fs_reg &sampler = inst->src[6]; - const fs_reg &offset_value = inst->src[7]; - assert(inst->src[8].file == IMM && inst->src[9].file == IMM); - const unsigned coord_components = inst->src[8].ud; - const unsigned grad_components = inst->src[9].ud; + const fs_reg &surface = inst->src[6]; + const fs_reg &sampler = inst->src[7]; + const fs_reg &offset_value = inst->src[8]; + assert(inst->src[9].file == IMM && inst->src[10].file == IMM); + const unsigned coord_components = inst->src[9].ud; + const unsigned grad_components = inst->src[10].ud; if (devinfo->gen >= 7) { lower_sampler_logical_send_gen7(bld, inst, op, coordinate, shadow_c, lod, lod2, sample_index, - mcs, sampler, offset_value, + mcs, surface, sampler, offset_value, coord_components, grad_components); } else if (devinfo->gen >= 5) { lower_sampler_logical_send_gen5(bld, inst, op, coordinate, shadow_c, lod, lod2, sample_index, - sampler, offset_value, + surface, sampler, offset_value, coord_components, grad_components); } else { lower_sampler_logical_send_gen4(bld, inst, op, coordinate, - shadow_c, lod, lod2, sampler, + shadow_c, lod, lod2, + surface, sampler, coord_components, grad_components); } } @@ -4431,6 +4489,10 @@ get_lowered_simd_width(const struct brw_device_info *devinfo, case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: return 8; + case SHADER_OPCODE_MOV_INDIRECT: + /* Prior to Broadwell, we only have 8 address subregisters */ + return devinfo->gen < 8 ? 8 : inst->exec_size; + default: return inst->exec_size; } @@ -4713,9 +4775,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) break; case UNIFORM: fprintf(file, "u%d", inst->src[i].nr + inst->src[i].reg_offset); - if (inst->src[i].reladdr) { - fprintf(file, "+reladdr"); - } else if (inst->src[i].subreg_offset) { + if (inst->src[i].subreg_offset) { fprintf(file, "+%d.%d", inst->src[i].reg_offset, inst->src[i].subreg_offset); } @@ -4826,7 +4886,6 @@ fs_visitor::get_instruction_generating_reg(fs_inst *start, { if (end == start || end->is_partial_write() || - reg.reladdr || !reg.equals(end->dst)) { return NULL; } else { @@ -5039,7 +5098,7 @@ fs_visitor::optimize() bld = fs_builder(this, 64); assign_constant_locations(); - demote_pull_constants(); + lower_constant_loads(); validate(); diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 9a54c2d..c931910 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -139,7 +139,7 @@ public: void split_virtual_grfs(); bool compact_virtual_grfs(); void assign_constant_locations(); - void demote_pull_constants(); + void lower_constant_loads(); void invalidate_live_intervals(); void calculate_live_intervals(); void calculate_register_pressure(); @@ -207,6 +207,8 @@ public: fs_reg mcs, int gather_component, bool is_cube_array, + uint32_t surface, + fs_reg surface_reg, uint32_t sampler, fs_reg sampler_reg); fs_reg emit_mcs_fetch(const fs_reg &coordinate, unsigned components, @@ -222,7 +224,7 @@ public: void emit_unspill(bblock_t *block, fs_inst *inst, fs_reg reg, uint32_t spill_offset, int count); void emit_spill(bblock_t *block, fs_inst *inst, fs_reg reg, - uint32_t spill_offset, int count); + uint32_t spill_offset, int count, bool we_all); void emit_nir_code(); void nir_setup_inputs(); @@ -321,8 +323,6 @@ public: const struct brw_vue_map *input_vue_map; - int *param_size; - int *virtual_grf_start; int *virtual_grf_end; brw::fs_live_variables *live_intervals; @@ -448,6 +448,7 @@ private: void generate_linterp(fs_inst *inst, struct brw_reg dst, struct brw_reg *src); void generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src, + struct brw_reg surface_index, struct brw_reg sampler_index); void generate_get_buffer_size(fs_inst *inst, struct brw_reg dst, struct brw_reg src, diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp index 21f0b70..cbad47e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp @@ -72,6 +72,13 @@ channel_expressions_predicate(ir_instruction *ir) return false; switch (expr->operation) { + case ir_unop_pack_half_2x16: + case ir_unop_pack_snorm_2x16: + case ir_unop_pack_snorm_4x8: + case ir_unop_pack_unorm_2x16: + case ir_unop_pack_unorm_4x8: + return false; + /* these opcodes need to act on the whole vector, * just like texturing. */ @@ -162,6 +169,11 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir) return visit_continue; switch (expr->operation) { + case ir_unop_pack_half_2x16: + case ir_unop_pack_snorm_2x16: + case ir_unop_pack_snorm_4x8: + case ir_unop_pack_unorm_2x16: + case ir_unop_pack_unorm_4x8: case ir_unop_interpolate_at_centroid: case ir_binop_interpolate_at_offset: case ir_binop_interpolate_at_sample: @@ -399,9 +411,6 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir) case ir_unop_ssbo_unsized_array_length: unreachable("should have been lowered"); - case ir_unop_unpack_half_2x16_split_x: - case ir_unop_unpack_half_2x16_split_y: - case ir_binop_pack_half_2x16_split: case ir_unop_interpolate_at_centroid: case ir_binop_interpolate_at_offset: case ir_binop_interpolate_at_sample: diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp index 3b65a38..cde6566 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp @@ -78,6 +78,8 @@ is_expression(const fs_visitor *v, const fs_inst *const inst) case FS_OPCODE_LINTERP: case SHADER_OPCODE_FIND_LIVE_CHANNEL: case SHADER_OPCODE_BROADCAST: + case SHADER_OPCODE_EXTRACT_BYTE: + case SHADER_OPCODE_EXTRACT_WORD: case SHADER_OPCODE_MOV_INDIRECT: return true; case SHADER_OPCODE_RCP: diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index e05622a..cac92b3 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -351,23 +351,47 @@ fs_generator::generate_mov_indirect(fs_inst *inst, unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr; - /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */ - struct brw_reg addr = vec8(brw_address_reg(0)); + if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) { + imm_byte_offset += indirect_byte_offset.ud; - /* The destination stride of an instruction (in bytes) must be greater - * than or equal to the size of the rest of the instruction. Since the - * address register is of type UW, we can't use a D-type instruction. - * In order to get around this, re re-type to UW and use a stride. - */ - indirect_byte_offset = - retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW); + reg.nr = imm_byte_offset / REG_SIZE; + reg.subnr = imm_byte_offset % REG_SIZE; + brw_MOV(p, dst, reg); + } else { + /* Prior to Broadwell, there are only 8 address registers. */ + assert(inst->exec_size == 8 || devinfo->gen >= 8); - /* Prior to Broadwell, there are only 8 address registers. */ - assert(inst->exec_size == 8 || devinfo->gen >= 8); + /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */ + struct brw_reg addr = vec8(brw_address_reg(0)); - brw_MOV(p, addr, indirect_byte_offset); - brw_inst_set_mask_control(devinfo, brw_last_inst, BRW_MASK_DISABLE); - brw_MOV(p, dst, retype(brw_VxH_indirect(0, imm_byte_offset), dst.type)); + /* The destination stride of an instruction (in bytes) must be greater + * than or equal to the size of the rest of the instruction. Since the + * address register is of type UW, we can't use a D-type instruction. + * In order to get around this, re re-type to UW and use a stride. + */ + indirect_byte_offset = + retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW); + + if (devinfo->gen < 8) { + /* Prior to broadwell, we have a restriction that the bottom 5 bits + * of the base offset and the bottom 5 bits of the indirect must add + * to less than 32. In other words, the hardware needs to be able to + * add the bottom five bits of the two to get the subnumber and add + * the next 7 bits of each to get the actual register number. Since + * the indirect may cause us to cross a register boundary, this makes + * it almost useless. We could try and do something clever where we + * use a actual base offset if base_offset % 32 == 0 but that would + * mean we were generating different code depending on the base + * offset. Instead, for the sake of consistency, we'll just do the + * add ourselves. + */ + brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset)); + brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), dst.type)); + } else { + brw_MOV(p, addr, indirect_byte_offset); + brw_MOV(p, dst, retype(brw_VxH_indirect(0, imm_byte_offset), dst.type)); + } + } } void @@ -678,6 +702,7 @@ fs_generator::generate_get_buffer_size(fs_inst *inst, void fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src, + struct brw_reg surface_index, struct brw_reg sampler_index) { int msg_type = -1; @@ -933,14 +958,16 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src ? prog_data->binding_table.gather_texture_start : prog_data->binding_table.texture_start; - if (sampler_index.file == BRW_IMMEDIATE_VALUE) { + if (surface_index.file == BRW_IMMEDIATE_VALUE && + sampler_index.file == BRW_IMMEDIATE_VALUE) { + uint32_t surface = surface_index.ud; uint32_t sampler = sampler_index.ud; brw_SAMPLE(p, retype(dst, BRW_REGISTER_TYPE_UW), inst->base_mrf, src, - sampler + base_binding_table_index, + surface + base_binding_table_index, sampler % 16, msg_type, rlen, @@ -949,19 +976,24 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src simd_mode, return_format); - brw_mark_surface_used(prog_data, sampler + base_binding_table_index); + brw_mark_surface_used(prog_data, surface + base_binding_table_index); } else { /* Non-const sampler index */ struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); + struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD)); struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD)); brw_push_insn_state(p); brw_set_default_mask_control(p, BRW_MASK_DISABLE); brw_set_default_access_mode(p, BRW_ALIGN_1); - /* addr = ((sampler * 0x101) + base_binding_table_index) & 0xfff */ - brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101)); + if (memcmp(&surface_reg, &sampler_reg, sizeof(surface_reg)) == 0) { + brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101)); + } else { + brw_SHL(p, addr, sampler_reg, brw_imm_ud(8)); + brw_OR(p, addr, addr, surface_reg); + } if (base_binding_table_index) brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index)); brw_AND(p, addr, addr, brw_imm_ud(0xfff)); @@ -2070,7 +2102,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) case SHADER_OPCODE_TG4: case SHADER_OPCODE_TG4_OFFSET: case SHADER_OPCODE_SAMPLEINFO: - generate_tex(inst, dst, src[0], src[1]); + generate_tex(inst, dst, src[0], src[1], src[2]); break; case FS_OPCODE_DDX_COARSE: case FS_OPCODE_DDX_FINE: @@ -2201,6 +2233,28 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) brw_broadcast(p, dst, src[0], src[1]); break; + case SHADER_OPCODE_EXTRACT_BYTE: { + assert(src[0].type == BRW_REGISTER_TYPE_D || + src[0].type == BRW_REGISTER_TYPE_UD); + + enum brw_reg_type type = + src[0].type == BRW_REGISTER_TYPE_D ? BRW_REGISTER_TYPE_B + : BRW_REGISTER_TYPE_UB; + brw_MOV(p, dst, spread(suboffset(retype(src[0], type), src[1].ud), 4)); + break; + } + + case SHADER_OPCODE_EXTRACT_WORD: { + assert(src[0].type == BRW_REGISTER_TYPE_D || + src[0].type == BRW_REGISTER_TYPE_UD); + + enum brw_reg_type type = + src[0].type == BRW_REGISTER_TYPE_D ? BRW_REGISTER_TYPE_W + : BRW_REGISTER_TYPE_UW; + brw_MOV(p, dst, spread(suboffset(retype(src[0], type), src[1].ud), 2)); + break; + } + case FS_OPCODE_SET_SAMPLE_ID: generate_set_sample_id(inst, dst, src[0], src[1]); break; diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index f9df2a4..48cdaf6 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -179,15 +179,6 @@ fs_visitor::nir_setup_uniforms() return; uniforms = nir->num_uniforms / 4; - - nir_foreach_variable(var, &nir->uniforms) { - /* UBO's and atomics don't take up space in the uniform file */ - if (var->interface_type != NULL || var->type->contains_atomic()) - continue; - - if (type_size_scalar(var->type) > 0) - param_size[var->data.driver_location / 4] = type_size_scalar(var->type); - } } static bool @@ -728,15 +719,29 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) inst->saturate = instr->dest.saturate; break; - case nir_op_fsin: - inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]); - inst->saturate = instr->dest.saturate; + case nir_op_fsin: { + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F); + inst = bld.emit(SHADER_OPCODE_SIN, tmp, op[0]); + if (instr->dest.saturate) { + inst->dst = result; + inst->saturate = true; + } else { + bld.MUL(result, tmp, brw_imm_f(0.99997)); + } break; + } - case nir_op_fcos: - inst = bld.emit(SHADER_OPCODE_COS, result, op[0]); - inst->saturate = instr->dest.saturate; + case nir_op_fcos: { + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F); + inst = bld.emit(SHADER_OPCODE_COS, tmp, op[0]); + if (instr->dest.saturate) { + inst->dst = result; + inst->saturate = true; + } else { + bld.MUL(result, tmp, brw_imm_f(0.99997)); + } break; + } case nir_op_fddx: if (fs_key->high_quality_derivatives) { @@ -807,9 +812,41 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) unreachable("Should have been lowered by borrow_to_arith()."); case nir_op_umod: + case nir_op_irem: + /* According to the sign table for INT DIV in the Ivy Bridge PRM, it + * appears that our hardware just does the right thing for signed + * remainder. + */ bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]); break; + case nir_op_imod: { + /* Get a regular C-style remainder. If a % b == 0, set the predicate. */ + bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]); + + /* Math instructions don't support conditional mod */ + inst = bld.MOV(bld.null_reg_d(), result); + inst->conditional_mod = BRW_CONDITIONAL_NZ; + + /* Now, we need to determine if signs of the sources are different. + * When we XOR the sources, the top bit is 0 if they are the same and 1 + * if they are different. We can then use a conditional modifier to + * turn that into a predicate. This leads us to an XOR.l instruction. + */ + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D); + inst = bld.XOR(tmp, op[0], op[1]); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->conditional_mod = BRW_CONDITIONAL_L; + + /* If the result of the initial remainder operation is non-zero and the + * two sources have different signs, add in a copy of op[1] to get the + * final integer modulus value. + */ + inst = bld.ADD(result, result, op[1]); + inst->predicate = BRW_PREDICATE_NORMAL; + break; + } + case nir_op_flt: case nir_op_ilt: case nir_op_ult: @@ -947,6 +984,34 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) inst->saturate = instr->dest.saturate; break; + case nir_op_fquantize2f16: { + fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D); + fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F); + fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F); + + /* The destination stride must be at least as big as the source stride. */ + tmp16.type = BRW_REGISTER_TYPE_W; + tmp16.stride = 2; + + /* Check for denormal */ + fs_reg abs_src0 = op[0]; + abs_src0.abs = true; + bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)), + BRW_CONDITIONAL_L); + /* Get the appropriately signed zero */ + bld.AND(retype(zero, BRW_REGISTER_TYPE_UD), + retype(op[0], BRW_REGISTER_TYPE_UD), + brw_imm_ud(0x80000000)); + /* Do the actual F32 -> F16 -> F32 conversion */ + bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]); + bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16); + /* Select that or zero based on normal status */ + inst = bld.SEL(result, zero, tmp32); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->saturate = instr->dest.saturate; + break; + } + case nir_op_fmin: case nir_op_imin: case nir_op_umin: @@ -1079,6 +1144,22 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) inst->predicate = BRW_PREDICATE_NORMAL; break; + case nir_op_extract_ubyte: + case nir_op_extract_ibyte: { + nir_const_value *byte = nir_src_as_const_value(instr->src[1].src); + bld.emit(SHADER_OPCODE_EXTRACT_BYTE, + result, op[0], brw_imm_ud(byte->u[0])); + break; + } + + case nir_op_extract_uword: + case nir_op_extract_iword: { + nir_const_value *word = nir_src_as_const_value(instr->src[1].src); + bld.emit(SHADER_OPCODE_EXTRACT_WORD, + result, op[0], brw_imm_ud(word->u[0])); + break; + } + default: unreachable("unhandled instruction"); } @@ -1154,6 +1235,8 @@ fs_visitor::get_nir_image_deref(const nir_deref_var *deref) { fs_reg image(UNIFORM, deref->var->data.driver_location / 4, BRW_REGISTER_TYPE_UD); + fs_reg indirect; + unsigned indirect_max = 0; for (const nir_deref *tail = &deref->deref; tail->child; tail = tail->child) { @@ -1165,7 +1248,7 @@ fs_visitor::get_nir_image_deref(const nir_deref_var *deref) image = offset(image, bld, base * element_size); if (deref_array->deref_array_type == nir_deref_array_type_indirect) { - fs_reg tmp = vgrf(glsl_type::int_type); + fs_reg tmp = vgrf(glsl_type::uint_type); if (devinfo->gen == 7 && !devinfo->is_haswell) { /* IVB hangs when trying to access an invalid surface index with @@ -1183,15 +1266,31 @@ fs_visitor::get_nir_image_deref(const nir_deref_var *deref) bld.MOV(tmp, get_nir_src(deref_array->indirect)); } + indirect_max += element_size * (tail->type->length - 1); + bld.MUL(tmp, tmp, brw_imm_ud(element_size * 4)); - if (image.reladdr) - bld.ADD(*image.reladdr, *image.reladdr, tmp); - else - image.reladdr = new(mem_ctx) fs_reg(tmp); + if (indirect.file == BAD_FILE) { + indirect = tmp; + } else { + bld.ADD(indirect, indirect, tmp); + } } } - return image; + if (indirect.file == BAD_FILE) { + return image; + } else { + /* Emit a pile of MOVs to load the uniform into a temporary. The + * dead-code elimination pass will get rid of what we don't use. + */ + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, BRW_IMAGE_PARAM_SIZE); + for (unsigned j = 0; j < BRW_IMAGE_PARAM_SIZE; j++) { + bld.emit(SHADER_OPCODE_MOV_INDIRECT, + offset(tmp, bld, j), offset(image, bld, j), + indirect, brw_imm_ud((indirect_max + 1) * 4)); + } + return tmp; + } } void @@ -2280,6 +2379,82 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr); break; + case nir_intrinsic_load_shared: { + assert(devinfo->gen >= 7); + + fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM); + + /* Get the offset to read from */ + fs_reg offset_reg; + nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); + if (const_offset) { + offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0]); + } else { + offset_reg = vgrf(glsl_type::uint_type); + bld.ADD(offset_reg, + retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD), + brw_imm_ud(instr->const_index[0])); + } + + /* Read the vector */ + fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg, + 1 /* dims */, + instr->num_components, + BRW_PREDICATE_NONE); + read_result.type = dest.type; + for (int i = 0; i < instr->num_components; i++) + bld.MOV(offset(dest, bld, i), offset(read_result, bld, i)); + + break; + } + + case nir_intrinsic_store_shared: { + assert(devinfo->gen >= 7); + + /* Block index */ + fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM); + + /* Value */ + fs_reg val_reg = get_nir_src(instr->src[0]); + + /* Writemask */ + unsigned writemask = instr->const_index[1]; + + /* Combine groups of consecutive enabled channels in one write + * message. We use ffs to find the first enabled channel and then ffs on + * the bit-inverse, down-shifted writemask to determine the length of + * the block of enabled bits. + */ + while (writemask) { + unsigned first_component = ffs(writemask) - 1; + unsigned length = ffs(~(writemask >> first_component)) - 1; + fs_reg offset_reg; + + nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); + if (const_offset) { + offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0] + + 4 * first_component); + } else { + offset_reg = vgrf(glsl_type::uint_type); + bld.ADD(offset_reg, + retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD), + brw_imm_ud(instr->const_index[0] + 4 * first_component)); + } + + emit_untyped_write(bld, surf_index, offset_reg, + offset(val_reg, bld, first_component), + 1 /* dims */, length, + BRW_PREDICATE_NONE); + + /* Clear the bits in the writemask that we just wrote, then try + * again to see if more channels are left. + */ + writemask &= (15 << (first_component + length)); + } + + break; + } + default: nir_emit_intrinsic(bld, instr); break; @@ -2492,12 +2667,28 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr /* Offsets are in bytes but they should always be multiples of 4 */ assert(const_offset->u[0] % 4 == 0); src.reg_offset = const_offset->u[0] / 4; + + for (unsigned j = 0; j < instr->num_components; j++) { + bld.MOV(offset(dest, bld, j), offset(src, bld, j)); + } } else { - src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0])); - } + fs_reg indirect = retype(get_nir_src(instr->src[0]), + BRW_REGISTER_TYPE_UD); - for (unsigned j = 0; j < instr->num_components; j++) { - bld.MOV(offset(dest, bld, j), offset(src, bld, j)); + /* We need to pass a size to the MOV_INDIRECT but we don't want it to + * go past the end of the uniform. In order to keep the n'th + * component from running past, we subtract off the size of all but + * one component of the vector. + */ + assert(instr->const_index[1] >= instr->num_components * 4); + unsigned read_size = instr->const_index[1] - + (instr->num_components - 1) * 4; + + for (unsigned j = 0; j < instr->num_components; j++) { + bld.emit(SHADER_OPCODE_MOV_INDIRECT, + offset(dest, bld, j), offset(src, bld, j), + indirect, brw_imm_ud(read_size)); + } } break; } @@ -2605,82 +2796,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr break; } - case nir_intrinsic_load_shared: { - assert(devinfo->gen >= 7); - - fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM); - - /* Get the offset to read from */ - fs_reg offset_reg; - nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); - if (const_offset) { - offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0]); - } else { - offset_reg = vgrf(glsl_type::uint_type); - bld.ADD(offset_reg, - retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD), - brw_imm_ud(instr->const_index[0])); - } - - /* Read the vector */ - fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg, - 1 /* dims */, - instr->num_components, - BRW_PREDICATE_NONE); - read_result.type = dest.type; - for (int i = 0; i < instr->num_components; i++) - bld.MOV(offset(dest, bld, i), offset(read_result, bld, i)); - - break; - } - - case nir_intrinsic_store_shared: { - assert(devinfo->gen >= 7); - - /* Block index */ - fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM); - - /* Value */ - fs_reg val_reg = get_nir_src(instr->src[0]); - - /* Writemask */ - unsigned writemask = instr->const_index[1]; - - /* Combine groups of consecutive enabled channels in one write - * message. We use ffs to find the first enabled channel and then ffs on - * the bit-inverse, down-shifted writemask to determine the length of - * the block of enabled bits. - */ - while (writemask) { - unsigned first_component = ffs(writemask) - 1; - unsigned length = ffs(~(writemask >> first_component)) - 1; - fs_reg offset_reg; - - nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); - if (const_offset) { - offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0] + - 4 * first_component); - } else { - offset_reg = vgrf(glsl_type::uint_type); - bld.ADD(offset_reg, - retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD), - brw_imm_ud(instr->const_index[0] + 4 * first_component)); - } - - emit_untyped_write(bld, surf_index, offset_reg, - offset(val_reg, bld, first_component), - 1 /* dims */, length, - BRW_PREDICATE_NONE); - - /* Clear the bits in the writemask that we just wrote, then try - * again to see if more channels are left. - */ - writemask &= (15 << (first_component + length)); - } - - break; - } - case nir_intrinsic_load_input: { fs_reg src; if (stage == MESA_SHADER_VERTEX) { @@ -2924,7 +3039,9 @@ fs_visitor::nir_emit_shared_atomic(const fs_builder &bld, void fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) { + unsigned texture = instr->texture_index; unsigned sampler = instr->sampler_index; + fs_reg texture_reg(brw_imm_ud(texture)); fs_reg sampler_reg(brw_imm_ud(sampler)); int gather_component = instr->component; @@ -2937,6 +3054,10 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) fs_reg coordinate, shadow_comparitor, lod, lod2, sample_index, mcs, tex_offset; + /* Our hardware requires a LOD for buffer textures */ + if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) + lod = brw_imm_d(0); + for (unsigned i = 0; i < instr->num_srcs; i++) { fs_reg src = get_nir_src(instr->src[i].src); switch (instr->src[i].src_type) { @@ -2991,9 +3112,9 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) case nir_tex_src_projector: unreachable("should be lowered"); - case nir_tex_src_sampler_offset: { - /* Figure out the highest possible sampler index and mark it as used */ - uint32_t max_used = sampler + instr->sampler_array_size - 1; + case nir_tex_src_texture_offset: { + /* Figure out the highest possible texture index and mark it as used */ + uint32_t max_used = texture + instr->texture_array_size - 1; if (instr->op == nir_texop_tg4 && devinfo->gen < 8) { max_used += stage_prog_data->binding_table.gather_texture_start; } else { @@ -3002,6 +3123,14 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) brw_mark_surface_used(prog_data, max_used); /* Emit code to evaluate the actual indexing expression */ + texture_reg = vgrf(glsl_type::uint_type); + bld.ADD(texture_reg, src, brw_imm_ud(texture)); + texture_reg = bld.emit_uniformize(texture_reg); + break; + } + + case nir_tex_src_sampler_offset: { + /* Emit code to evaluate the actual indexing expression */ sampler_reg = vgrf(glsl_type::uint_type); bld.ADD(sampler_reg, src, brw_imm_ud(sampler)); sampler_reg = bld.emit_uniformize(sampler_reg); @@ -3016,8 +3145,8 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) if (instr->op == nir_texop_txf_ms || instr->op == nir_texop_samples_identical) { if (devinfo->gen >= 7 && - key_tex->compressed_multisample_layout_mask & (1 << sampler)) { - mcs = emit_mcs_fetch(coordinate, instr->coord_components, sampler_reg); + key_tex->compressed_multisample_layout_mask & (1 << texture)) { + mcs = emit_mcs_fetch(coordinate, instr->coord_components, texture_reg); } else { mcs = brw_imm_ud(0u); } @@ -3054,7 +3183,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D); fs_inst *inst = bld.emit(SHADER_OPCODE_SAMPLEINFO, dst, bld.vgrf(BRW_REGISTER_TYPE_D, 1), - sampler_reg); + texture_reg, texture_reg); inst->mlen = 1; inst->header_size = 1; inst->base_mrf = -1; @@ -3068,7 +3197,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) emit_texture(op, dest_type, coordinate, instr->coord_components, shadow_comparitor, lod, lod2, lod_components, sample_index, tex_offset, mcs, gather_component, - is_cube_array, sampler, sampler_reg); + is_cube_array, texture, texture_reg, sampler, sampler_reg); fs_reg dest = get_nir_dest(instr->dest); dest.type = this->result.type; diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index 2347cd5..8396854 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -751,6 +751,7 @@ fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst, dst); unspill_inst->offset = spill_offset; unspill_inst->regs_written = reg_size; + unspill_inst->force_writemask_all = true; if (!gen7_read) { unspill_inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1; @@ -764,11 +765,11 @@ fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst, void fs_visitor::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src, - uint32_t spill_offset, int count) + uint32_t spill_offset, int count, bool we_all) { int reg_size = 1; int spill_base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1; - if (dispatch_width == 16 && count % 2 == 0) { + if (inst->exec_size == 16 && count % 2 == 0) { spill_base_mrf = FIRST_SPILL_MRF(devinfo->gen); reg_size = 2; } @@ -784,6 +785,8 @@ fs_visitor::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src, spill_inst->offset = spill_offset + i * reg_size * REG_SIZE; spill_inst->mlen = 1 + reg_size; /* header, value */ spill_inst->base_mrf = spill_base_mrf; + spill_inst->force_writemask_all = we_all; + spill_inst->force_sechalf = inst->force_sechalf; } } @@ -805,30 +808,13 @@ fs_visitor::choose_spill_reg(struct ra_graph *g) */ foreach_block_and_inst(block, fs_inst, inst, cfg) { for (unsigned int i = 0; i < inst->sources; i++) { - if (inst->src[i].file == VGRF) { + if (inst->src[i].file == VGRF) spill_costs[inst->src[i].nr] += loop_scale; - - /* Register spilling logic assumes full-width registers; smeared - * registers have a width of 1 so if we try to spill them we'll - * generate invalid assembly. This shouldn't be a problem because - * smeared registers are only used as short-term temporaries when - * loading pull constants, so spilling them is unlikely to reduce - * register pressure anyhow. - */ - if (!inst->src[i].is_contiguous()) { - no_spill[inst->src[i].nr] = true; - } - } } - if (inst->dst.file == VGRF) { + if (inst->dst.file == VGRF) spill_costs[inst->dst.nr] += inst->regs_written * loop_scale; - if (!inst->dst.is_contiguous()) { - no_spill[inst->dst.nr] = true; - } - } - switch (inst->opcode) { case BRW_OPCODE_DO: @@ -938,12 +924,15 @@ fs_visitor::spill_reg(int spill_reg) * inst->regs_written(), then we need to unspill the destination * since we write back out all of the regs_written(). */ - if (inst->is_partial_write()) + bool need_unspill = inst->is_partial_write() || + type_sz(inst->dst.type) != 4; + if (need_unspill) emit_unspill(block, inst, spill_src, subset_spill_offset, inst->regs_written); emit_spill(block, inst, spill_src, subset_spill_offset, - inst->regs_written); + inst->regs_written, + need_unspill || inst->force_writemask_all); } } diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp index 45694ec..f2facee 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp @@ -717,6 +717,15 @@ namespace { bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c), brw_imm_d(-(int)scale(widths[c] - s) - 1), BRW_CONDITIONAL_GE); + + /* Mask off all but the bits we actually want. Otherwise, if + * we pass a negative number into the hardware when it's + * expecting something like UINT8, it will happily clamp it to + * +255 for us. + */ + if (is_signed && widths[c] < 32) + bld.AND(offset(dst, bld, c), offset(dst, bld, c), + brw_imm_d((1 << widths[c]) - 1)); } } @@ -787,6 +796,15 @@ namespace { /* Convert to integer. */ bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c)); bld.MOV(offset(dst, bld, c), offset(fdst, bld, c)); + + /* Mask off all but the bits we actually want. Otherwise, if + * we pass a negative number into the hardware when it's + * expecting something like UINT8, it will happily clamp it to + * +255 for us. + */ + if (is_signed && widths[c] < 32) + bld.AND(offset(dst, bld, c), offset(dst, bld, c), + brw_imm_d((1 << widths[c]) - 1)); } } diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 25240ad..4011bf5 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -79,12 +79,12 @@ fs_visitor::emit_vs_system_value(int location) /* Sample from the MCS surface attached to this multisample texture. */ fs_reg fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components, - const fs_reg &sampler) + const fs_reg &texture) { const fs_reg dest = vgrf(glsl_type::uvec4_type); const fs_reg srcs[] = { coordinate, fs_reg(), fs_reg(), fs_reg(), fs_reg(), fs_reg(), - sampler, fs_reg(), brw_imm_ud(components), brw_imm_d(0) + texture, texture, fs_reg(), brw_imm_ud(components), brw_imm_d(0) }; fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs, ARRAY_SIZE(srcs)); @@ -108,6 +108,8 @@ fs_visitor::emit_texture(ir_texture_opcode op, fs_reg mcs, int gather_component, bool is_cube_array, + uint32_t surface, + fs_reg surface_reg, uint32_t sampler, fs_reg sampler_reg) { @@ -147,7 +149,7 @@ fs_visitor::emit_texture(ir_texture_opcode op, fs_reg dst = vgrf(glsl_type::get_instance(dest_type->base_type, 4, 1)); const fs_reg srcs[] = { coordinate, shadow_c, lod, lod2, - sample_index, mcs, sampler_reg, offset_value, + sample_index, mcs, surface_reg, sampler_reg, offset_value, brw_imm_d(coord_components), brw_imm_d(grad_components) }; enum opcode opcode; @@ -200,7 +202,7 @@ fs_visitor::emit_texture(ir_texture_opcode op, if (op == ir_tg4) { if (gather_component == 1 && - key_tex->gather_channel_quirk_mask & (1 << sampler)) { + key_tex->gather_channel_quirk_mask & (1 << surface)) { /* gather4 sampler is broken for green channel on RG32F -- * we must ask for blue instead. */ @@ -210,7 +212,7 @@ fs_visitor::emit_texture(ir_texture_opcode op, } if (devinfo->gen == 6) - emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], dst); + emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], dst); } /* fixup #layers for cube map arrays */ @@ -924,6 +926,8 @@ void fs_visitor::emit_barrier() { assert(devinfo->gen >= 7); + const uint32_t barrier_id_mask = + devinfo->gen >= 9 ? 0x8f000000u : 0x0f000000u; /* We are getting the barrier ID from the compute shader header */ assert(stage == MESA_SHADER_COMPUTE); @@ -937,7 +941,7 @@ fs_visitor::emit_barrier() /* Copy bits 27:24 of r0.2 (barrier id) to the message payload reg.2 */ fs_reg r0_2 = fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)); - pbld.AND(component(payload, 2), r0_2, brw_imm_ud(0x0f000000u)); + pbld.AND(component(payload, 2), r0_2, brw_imm_ud(barrier_id_mask)); /* Emit a gateway "barrier" message using the payload we set up, followed * by a wait instruction. @@ -1035,9 +1039,6 @@ fs_visitor::init() this->spilled_any_registers = false; this->do_dual_src = false; - - if (dispatch_width == 8) - this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params); } fs_visitor::~fs_visitor() diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h index c3eec2e..e4f20f4 100644 --- a/src/mesa/drivers/dri/i965/brw_ir_fs.h +++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h @@ -58,8 +58,6 @@ public: */ int subreg_offset; - fs_reg *reladdr; - /** Register region horizontal stride */ uint8_t stride; }; @@ -136,8 +134,7 @@ component(fs_reg reg, unsigned idx) static inline bool is_uniform(const fs_reg ®) { - return (reg.stride == 0 || reg.is_null()) && - (!reg.reladdr || is_uniform(*reg.reladdr)); + return (reg.stride == 0 || reg.is_null()); } /** diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp index 234afd5..ab9d792 100644 --- a/src/mesa/drivers/dri/i965/brw_link.cpp +++ b/src/mesa/drivers/dri/i965/brw_link.cpp @@ -73,36 +73,13 @@ brw_lower_packing_builtins(struct brw_context *brw, gl_shader_stage shader_type, exec_list *ir) { - const struct brw_compiler *compiler = brw->intelScreen->compiler; - - int ops = LOWER_PACK_SNORM_2x16 - | LOWER_UNPACK_SNORM_2x16 - | LOWER_PACK_UNORM_2x16 - | LOWER_UNPACK_UNORM_2x16; - - if (compiler->scalar_stage[shader_type]) { - ops |= LOWER_UNPACK_UNORM_4x8 - | LOWER_UNPACK_SNORM_4x8 - | LOWER_PACK_UNORM_4x8 - | LOWER_PACK_SNORM_4x8; - } - - if (brw->gen >= 7) { - /* Gen7 introduced the f32to16 and f16to32 instructions, which can be - * used to execute packHalf2x16 and unpackHalf2x16. For AOS code, no - * lowering is needed. For SOA code, the Half2x16 ops must be - * scalarized. - */ - if (compiler->scalar_stage[shader_type]) { - ops |= LOWER_PACK_HALF_2x16_TO_SPLIT - | LOWER_UNPACK_HALF_2x16_TO_SPLIT; - } - } else { - ops |= LOWER_PACK_HALF_2x16 - | LOWER_UNPACK_HALF_2x16; - } + /* Gens < 7 don't have instructions to convert to or from half-precision, + * and Gens < 6 don't expose that functionality. + */ + if (brw->gen != 6) + return; - lower_packing_builtins(ir, ops); + lower_packing_builtins(ir, LOWER_PACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16); } static void diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c index 935529a..d6987c8 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.c +++ b/src/mesa/drivers/dri/i965/brw_nir.c @@ -409,6 +409,14 @@ brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar) } } +static void +brw_nir_lower_shared(nir_shader *nir) +{ + nir_assign_var_locations(&nir->shared, &nir->num_shared, + type_size_scalar_bytes); + nir_lower_io(nir, nir_var_shared, type_size_scalar_bytes); +} + #define OPT(pass, ...) ({ \ bool this_progress = false; \ NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \ @@ -488,7 +496,7 @@ brw_preprocess_nir(nir_shader *nir, bool is_scalar) /* Get rid of split copies */ nir = nir_optimize(nir, is_scalar); - OPT(nir_remove_dead_variables); + OPT(nir_remove_dead_variables, nir_var_local); return nir; } @@ -504,6 +512,8 @@ brw_nir_lower_io(nir_shader *nir, OPT_V(brw_nir_lower_inputs, devinfo, is_scalar); OPT_V(brw_nir_lower_outputs, devinfo, is_scalar); + if (nir->stage == MESA_SHADER_COMPUTE) + OPT_V(brw_nir_lower_shared); OPT_V(nir_lower_io, nir_var_all, is_scalar ? type_size_scalar : type_size_vec4); return nir_optimize(nir, is_scalar); @@ -529,7 +539,7 @@ brw_postprocess_nir(nir_shader *nir, if (devinfo->gen >= 6) { /* Try and fuse multiply-adds */ - OPT(brw_nir_opt_peephole_ffma); +// OPT(brw_nir_opt_peephole_ffma); } OPT(nir_opt_algebraic_late); diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c index 20d4e0d..94ceb52 100644 --- a/src/mesa/drivers/dri/i965/brw_program.c +++ b/src/mesa/drivers/dri/i965/brw_program.c @@ -279,7 +279,7 @@ brw_get_scratch_bo(struct brw_context *brw, void brwInitFragProgFuncs( struct dd_function_table *functions ) { - assert(functions->ProgramStringNotify == _tnl_program_string); + /* assert(functions->ProgramStringNotify == _tnl_program_string); */ functions->NewProgram = brwNewProgram; functions->DeleteProgram = brwDeleteProgram; diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index d92bad2..e4ce8cb 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -29,136 +29,6 @@ #include "brw_vec4_tes.h" #include "main/shaderobj.h" #include "main/uniforms.h" -#include "util/debug.h" - -static void -shader_debug_log_mesa(void *data, const char *fmt, ...) -{ - struct brw_context *brw = (struct brw_context *)data; - va_list args; - - va_start(args, fmt); - GLuint msg_id = 0; - _mesa_gl_vdebug(&brw->ctx, &msg_id, - MESA_DEBUG_SOURCE_SHADER_COMPILER, - MESA_DEBUG_TYPE_OTHER, - MESA_DEBUG_SEVERITY_NOTIFICATION, fmt, args); - va_end(args); -} - -static void -shader_perf_log_mesa(void *data, const char *fmt, ...) -{ - struct brw_context *brw = (struct brw_context *)data; - - va_list args; - va_start(args, fmt); - - if (unlikely(INTEL_DEBUG & DEBUG_PERF)) { - va_list args_copy; - va_copy(args_copy, args); - vfprintf(stderr, fmt, args_copy); - va_end(args_copy); - } - - if (brw->perf_debug) { - GLuint msg_id = 0; - _mesa_gl_vdebug(&brw->ctx, &msg_id, - MESA_DEBUG_SOURCE_SHADER_COMPILER, - MESA_DEBUG_TYPE_PERFORMANCE, - MESA_DEBUG_SEVERITY_MEDIUM, fmt, args); - } - va_end(args); -} - -struct brw_compiler * -brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo) -{ - struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler); - - compiler->devinfo = devinfo; - compiler->shader_debug_log = shader_debug_log_mesa; - compiler->shader_perf_log = shader_perf_log_mesa; - - brw_fs_alloc_reg_sets(compiler); - brw_vec4_alloc_reg_set(compiler); - - compiler->scalar_stage[MESA_SHADER_VERTEX] = - devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS); - compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false; - compiler->scalar_stage[MESA_SHADER_TESS_EVAL] = - devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TES", true); - compiler->scalar_stage[MESA_SHADER_GEOMETRY] = - devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", false); - compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true; - compiler->scalar_stage[MESA_SHADER_COMPUTE] = true; - - nir_shader_compiler_options *nir_options = - rzalloc(compiler, nir_shader_compiler_options); - nir_options->native_integers = true; - nir_options->lower_fdiv = true; - /* In order to help allow for better CSE at the NIR level we tell NIR - * to split all ffma instructions during opt_algebraic and we then - * re-combine them as a later step. - */ - nir_options->lower_ffma = true; - nir_options->lower_sub = true; - nir_options->lower_fdiv = true; - nir_options->lower_scmp = true; - nir_options->lower_fmod = true; - nir_options->lower_bitfield_extract = true; - nir_options->lower_bitfield_insert = true; - nir_options->lower_uadd_carry = true; - nir_options->lower_usub_borrow = true; - - /* In the vec4 backend, our dpN instruction replicates its result to all - * the components of a vec4. We would like NIR to give us replicated fdot - * instructions because it can optimize better for us. - * - * For the FS backend, it should be lowered away by the scalarizing pass so - * we should never see fdot anyway. - */ - nir_options->fdot_replicates = true; - - /* We want the GLSL compiler to emit code that uses condition codes */ - for (int i = 0; i < MESA_SHADER_STAGES; i++) { - compiler->glsl_compiler_options[i].MaxUnrollIterations = 32; - compiler->glsl_compiler_options[i].MaxIfDepth = - devinfo->gen < 6 ? 16 : UINT_MAX; - - compiler->glsl_compiler_options[i].EmitCondCodes = true; - compiler->glsl_compiler_options[i].EmitNoNoise = true; - compiler->glsl_compiler_options[i].EmitNoMainReturn = true; - compiler->glsl_compiler_options[i].EmitNoIndirectInput = true; - compiler->glsl_compiler_options[i].EmitNoIndirectUniform = false; - compiler->glsl_compiler_options[i].LowerClipDistance = true; - - bool is_scalar = compiler->scalar_stage[i]; - - compiler->glsl_compiler_options[i].EmitNoIndirectOutput = is_scalar; - compiler->glsl_compiler_options[i].EmitNoIndirectTemp = is_scalar; - compiler->glsl_compiler_options[i].OptimizeForAOS = !is_scalar; - - /* !ARB_gpu_shader5 */ - if (devinfo->gen < 7) - compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true; - - compiler->glsl_compiler_options[i].NirOptions = nir_options; - - compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true; - } - - compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false; - compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false; - - if (compiler->scalar_stage[MESA_SHADER_GEOMETRY]) - compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false; - - compiler->glsl_compiler_options[MESA_SHADER_COMPUTE] - .LowerShaderSharedVariables = true; - - return compiler; -} extern "C" struct gl_shader * brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type) @@ -214,6 +84,7 @@ brw_type_for_base_type(const struct glsl_type *type) case GLSL_TYPE_ERROR: case GLSL_TYPE_INTERFACE: case GLSL_TYPE_DOUBLE: + case GLSL_TYPE_FUNCTION: unreachable("not reached"); } @@ -442,6 +313,10 @@ brw_instruction_name(enum opcode op) case SHADER_OPCODE_BROADCAST: return "broadcast"; + case SHADER_OPCODE_EXTRACT_BYTE: + return "extract_byte"; + case SHADER_OPCODE_EXTRACT_WORD: + return "extract_word"; case VEC4_OPCODE_MOV_BYTES: return "mov_bytes"; case VEC4_OPCODE_PACK_BYTES: diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h index 5933613..82374a4 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.h +++ b/src/mesa/drivers/dri/i965/brw_shader.h @@ -259,9 +259,6 @@ struct brw_gs_compile unsigned control_data_header_size_bits; }; -struct brw_compiler * -brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo); - void brw_assign_common_binding_table_offsets(gl_shader_stage stage, const struct brw_device_info *devinfo, diff --git a/src/mesa/drivers/dri/i965/brw_surface_formats.c b/src/mesa/drivers/dri/i965/brw_surface_formats.c index b5c1a35..f42a953 100644 --- a/src/mesa/drivers/dri/i965/brw_surface_formats.c +++ b/src/mesa/drivers/dri/i965/brw_surface_formats.c @@ -25,21 +25,8 @@ #include "brw_context.h" #include "brw_state.h" #include "brw_defines.h" - -struct surface_format_info { - bool exists; - int sampling; - int filtering; - int shadow_compare; - int chroma_key; - int render_target; - int alpha_blend; - int input_vb; - int streamed_output_vb; - int color_processing; - int lossless_compression; - const char *name; -}; +#include "brw_wm.h" +#include "brw_surface_formats.h" /* This macro allows us to write the table almost as it appears in the PRM, * while restructuring it to turn it into the C code we want. @@ -86,7 +73,7 @@ struct surface_format_info { * - VOL4_Part1 section 3.9.11 Render Target Write. * - Render Target Surface Types [SKL+] */ -const struct surface_format_info surface_formats[] = { +const struct brw_surface_format_info surface_formats[] = { /* smpl filt shad CK RT AB VB SO color ccs_e */ SF( Y, 50, x, x, Y, Y, Y, Y, x, 90, R32G32B32A32_FLOAT) SF( Y, x, x, x, Y, x, Y, Y, x, 90, R32G32B32A32_SINT) @@ -218,7 +205,7 @@ const struct surface_format_info surface_formats[] = { SF(50, 50, x, x, x, x, x, x, x, x, P8A8_UNORM_PALETTE0) SF(50, 50, x, x, x, x, x, x, x, x, P8A8_UNORM_PALETTE1) SF( x, x, x, x, x, x, x, x, x, x, A1B5G5R5_UNORM) - SF( x, x, x, x, x, x, x, x, x, x, A4B4G4R4_UNORM) + SF( Y, Y, x, Y, 90, x, x, x, x, x, A4B4G4R4_UNORM) SF( x, x, x, x, x, x, x, x, x, x, L8A8_UINT) SF( x, x, x, x, x, x, x, x, x, x, L8A8_SINT) SF( Y, Y, x, 45, Y, Y, Y, x, x, x, R8_UNORM) @@ -281,13 +268,13 @@ const struct surface_format_info surface_formats[] = { SF(70, 70, x, x, x, x, x, x, x, x, BC6H_UF16) SF( x, x, x, x, x, x, x, x, x, x, PLANAR_420_8) SF( x, x, x, x, x, x, x, x, x, x, R8G8B8_UNORM_SRGB) - SF( x, x, x, x, x, x, x, x, x, x, ETC1_RGB8) - SF( x, x, x, x, x, x, x, x, x, x, ETC2_RGB8) - SF( x, x, x, x, x, x, x, x, x, x, EAC_R11) - SF( x, x, x, x, x, x, x, x, x, x, EAC_RG11) - SF( x, x, x, x, x, x, x, x, x, x, EAC_SIGNED_R11) - SF( x, x, x, x, x, x, x, x, x, x, EAC_SIGNED_RG11) - SF( x, x, x, x, x, x, x, x, x, x, ETC2_SRGB8) + SF(80, 80, x, x, x, x, x, x, x, x, ETC1_RGB8) + SF(80, 80, x, x, x, x, x, x, x, x, ETC2_RGB8) + SF(80, 80, x, x, x, x, x, x, x, x, EAC_R11) + SF(80, 80, x, x, x, x, x, x, x, x, EAC_RG11) + SF(80, 80, x, x, x, x, x, x, x, x, EAC_SIGNED_R11) + SF(80, 80, x, x, x, x, x, x, x, x, EAC_SIGNED_RG11) + SF(80, 80, x, x, x, x, x, x, x, x, ETC2_SRGB8) SF( x, x, x, x, x, x, x, x, x, x, R16G16B16_UINT) SF( x, x, x, x, x, x, x, x, x, x, R16G16B16_SINT) SF( x, x, x, x, x, x, x, x, x, x, R32_SFIXED) @@ -302,10 +289,10 @@ const struct surface_format_info surface_formats[] = { SF( x, x, x, x, x, x, x, x, x, x, B10G10R10A2_SINT) SF( x, x, x, x, x, x, x, x, x, x, R64G64B64A64_PASSTHRU) SF( x, x, x, x, x, x, x, x, x, x, R64G64B64_PASSTHRU) - SF( x, x, x, x, x, x, x, x, x, x, ETC2_RGB8_PTA) - SF( x, x, x, x, x, x, x, x, x, x, ETC2_SRGB8_PTA) - SF( x, x, x, x, x, x, x, x, x, x, ETC2_EAC_RGBA8) - SF( x, x, x, x, x, x, x, x, x, x, ETC2_EAC_SRGB8_A8) + SF(80, 80, x, x, x, x, x, x, x, x, ETC2_RGB8_PTA) + SF(80, 80, x, x, x, x, x, x, x, x, ETC2_SRGB8_PTA) + SF(80, 80, x, x, x, x, x, x, x, x, ETC2_EAC_RGBA8) + SF(80, 80, x, x, x, x, x, x, x, x, ETC2_EAC_SRGB8_A8) SF( x, x, x, x, x, x, x, x, x, x, R8G8B8_UINT) SF( x, x, x, x, x, x, x, x, x, x, R8G8B8_SINT) SF(80, 80, x, x, x, x, x, x, x, x, ASTC_LDR_2D_4x4_FLT16) @@ -618,7 +605,7 @@ brw_init_surface_formats(struct brw_context *brw) for (format = MESA_FORMAT_NONE + 1; format < MESA_FORMAT_COUNT; format++) { uint32_t texture, render; - const struct surface_format_info *rinfo, *tinfo; + const struct brw_surface_format_info *rinfo, *tinfo; bool is_integer = _mesa_is_format_integer_color(format); render = texture = brw_format_for_mesa_format(format); @@ -827,7 +814,7 @@ bool brw_losslessly_compressible_format(struct brw_context *brw, uint32_t brw_format) { - const struct surface_format_info * const sinfo = + const struct brw_surface_format_info * const sinfo = &surface_formats[brw_format]; const int gen = brw->gen * 10; diff --git a/src/mesa/drivers/dri/i965/brw_surface_formats.h b/src/mesa/drivers/dri/i965/brw_surface_formats.h new file mode 100644 index 0000000..a5cd49f --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_surface_formats.h @@ -0,0 +1,41 @@ +/* + * Copyright © 2011 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +struct brw_surface_format_info { + bool exists; + int sampling; + int filtering; + int shadow_compare; + int chroma_key; + int render_target; + int alpha_blend; + int input_vb; + int streamed_output_vb; + int color_processing; + int lossless_compression; + const char *name; +}; + +extern const struct brw_surface_format_info surface_formats[]; diff --git a/src/mesa/drivers/dri/i965/brw_util.c b/src/mesa/drivers/dri/i965/brw_util.c index bf7f9c6..934b6b8 100644 --- a/src/mesa/drivers/dri/i965/brw_util.c +++ b/src/mesa/drivers/dri/i965/brw_util.c @@ -98,3 +98,31 @@ GLuint brw_translate_blend_factor( GLenum factor ) unreachable("not reached"); } } + +static const GLuint prim_to_hw_prim[GL_TRIANGLE_STRIP_ADJACENCY+1] = { + [GL_POINTS] =_3DPRIM_POINTLIST, + [GL_LINES] = _3DPRIM_LINELIST, + [GL_LINE_LOOP] = _3DPRIM_LINELOOP, + [GL_LINE_STRIP] = _3DPRIM_LINESTRIP, + [GL_TRIANGLES] = _3DPRIM_TRILIST, + [GL_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP, + [GL_TRIANGLE_FAN] = _3DPRIM_TRIFAN, + [GL_QUADS] = _3DPRIM_QUADLIST, + [GL_QUAD_STRIP] = _3DPRIM_QUADSTRIP, + [GL_POLYGON] = _3DPRIM_POLYGON, + [GL_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ, + [GL_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ, + [GL_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ, + [GL_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ, +}; + +uint32_t +get_hw_prim_for_gl_prim(int mode) +{ + if (mode >= BRW_PRIM_OFFSET) + return mode - BRW_PRIM_OFFSET; + else { + assert(mode < ARRAY_SIZE(prim_to_hw_prim)); + return prim_to_hw_prim[mode]; + } +} diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index e7aec1f..394e321 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -477,11 +477,6 @@ vec4_visitor::split_uniform_registers() inst->src[i].reg_offset = 0; } } - - /* Update that everything is now vector-sized. */ - for (int i = 0; i < this->uniforms; i++) { - this->uniform_size[i] = 1; - } } void @@ -539,7 +534,6 @@ vec4_visitor::pack_uniform_registers() * push constants. */ for (int src = 0; src < uniforms; src++) { - assert(src < uniform_array_size); int size = chans_used[src]; if (size == 0) @@ -786,7 +780,7 @@ vec4_visitor::move_push_constants_to_pull_constants() dst_reg temp = dst_reg(this, glsl_type::vec4_type); emit_pull_constant_load(block, inst, temp, inst->src[i], - pull_constant_loc[uniform]); + pull_constant_loc[uniform], src_reg()); inst->src[i].file = temp.file; inst->src[i].nr = temp.nr; @@ -1606,8 +1600,6 @@ vec4_visitor::setup_uniforms(int reg) * matter what, or the GPU would hang. */ if (devinfo->gen < 6 && this->uniforms == 0) { - assert(this->uniforms < this->uniform_array_size); - stage_prog_data->param = reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4); for (unsigned int i = 0; i < 4; i++) { diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index ddfd87d..83d9eda 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -115,8 +115,6 @@ public: */ dst_reg output_reg[BRW_VARYING_SLOT_COUNT]; const char *output_reg_annotation[BRW_VARYING_SLOT_COUNT]; - int *uniform_size; - int uniform_array_size; /*< Size of the uniform_size array */ int uniforms; src_reg shader_start_time; @@ -260,6 +258,7 @@ public: src_reg offset_value, src_reg mcs, bool is_cube_array, + uint32_t surface, src_reg surface_reg, uint32_t sampler, src_reg sampler_reg); src_reg emit_mcs_fetch(const glsl_type *coordinate_type, src_reg coordinate, @@ -284,8 +283,6 @@ public: src_reg get_scratch_offset(bblock_t *block, vec4_instruction *inst, src_reg *reladdr, int reg_offset); - src_reg get_pull_constant_offset(bblock_t *block, vec4_instruction *inst, - src_reg *reladdr, int reg_offset); void emit_scratch_read(bblock_t *block, vec4_instruction *inst, dst_reg dst, src_reg orig_src, @@ -295,7 +292,8 @@ public: void emit_pull_constant_load(bblock_t *block, vec4_instruction *inst, dst_reg dst, src_reg orig_src, - int base_offset); + int base_offset, + src_reg indirect); void emit_pull_constant_load_reg(dst_reg dst, src_reg surf_index, src_reg offset, diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index 730be21..237534d 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -109,6 +109,7 @@ generate_tex(struct brw_codegen *p, vec4_instruction *inst, struct brw_reg dst, struct brw_reg src, + struct brw_reg surface_index, struct brw_reg sampler_index) { const struct brw_device_info *devinfo = p->devinfo; @@ -264,14 +265,16 @@ generate_tex(struct brw_codegen *p, ? prog_data->base.binding_table.gather_texture_start : prog_data->base.binding_table.texture_start; - if (sampler_index.file == BRW_IMMEDIATE_VALUE) { + if (surface_index.file == BRW_IMMEDIATE_VALUE && + sampler_index.file == BRW_IMMEDIATE_VALUE) { + uint32_t surface = surface_index.ud; uint32_t sampler = sampler_index.ud; brw_SAMPLE(p, dst, inst->base_mrf, src, - sampler + base_binding_table_index, + surface + base_binding_table_index, sampler % 16, msg_type, 1, /* response length */ @@ -285,14 +288,19 @@ generate_tex(struct brw_codegen *p, /* Non-constant sampler index. */ struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); + struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD)); struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD)); brw_push_insn_state(p); brw_set_default_mask_control(p, BRW_MASK_DISABLE); brw_set_default_access_mode(p, BRW_ALIGN_1); - /* addr = ((sampler * 0x101) + base_binding_table_index) & 0xfff */ - brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101)); + if (memcmp(&surface_reg, &sampler_reg, sizeof(surface_reg)) == 0) { + brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101)); + } else { + brw_SHL(p, addr, sampler_reg, brw_imm_ud(8)); + brw_OR(p, addr, addr, surface_reg); + } if (base_binding_table_index) brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index)); brw_AND(p, addr, addr, brw_imm_ud(0xfff)); @@ -1383,6 +1391,48 @@ generate_set_simd4x2_header_gen9(struct brw_codegen *p, } static void +generate_mov_indirect(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst, struct brw_reg reg, + struct brw_reg indirect, struct brw_reg length) +{ + assert(indirect.type == BRW_REGISTER_TYPE_UD); + + unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr * (REG_SIZE / 2); + + /* This instruction acts in align1 mode */ + assert(inst->force_writemask_all || reg.writemask == 0xf); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + struct brw_reg addr = vec2(brw_address_reg(0)); + + /* We need to move the indirect value into the address register. In order + * to make things make some sense, we want to respect at least the X + * component of the swizzle. In order to do that, we need to convert the + * subnr (probably 0) to an align1 subnr and add in the swizzle. We then + * use a region of <8,4,0>:uw to pick off the first 2 bytes of the indirect + * and splat it out to all four channels of the given half of a0. + */ + assert(brw_is_single_value_swizzle(indirect.swizzle)); + indirect.subnr = (indirect.subnr * 4 + BRW_GET_SWZ(indirect.swizzle, 0)) * 2; + indirect = stride(retype(indirect, BRW_REGISTER_TYPE_UW), 8, 4, 0); + + brw_ADD(p, addr, indirect, brw_imm_uw(imm_byte_offset)); + + /* Use a <4,1> region Vx1 region*/ + struct brw_reg src = brw_VxH_indirect(0, 0); + src.width = BRW_WIDTH_4; + src.hstride = BRW_HORIZONTAL_STRIDE_1; + + brw_MOV(p, dst, retype(src, reg.type)); + + brw_pop_insn_state(p); +} + +static void generate_code(struct brw_codegen *p, const struct brw_compiler *compiler, void *log_data, @@ -1664,7 +1714,7 @@ generate_code(struct brw_codegen *p, case SHADER_OPCODE_TG4: case SHADER_OPCODE_TG4_OFFSET: case SHADER_OPCODE_SAMPLEINFO: - generate_tex(p, prog_data, inst, dst, src[0], src[1]); + generate_tex(p, prog_data, inst, dst, src[0], src[1], src[2]); break; case VS_OPCODE_URB_WRITE: @@ -1928,6 +1978,9 @@ generate_code(struct brw_codegen *p, brw_WAIT(p); break; + case SHADER_OPCODE_MOV_INDIRECT: + generate_mov_indirect(p, inst, dst, src[0], src[1], src[2]); + default: unreachable("Unsupported opcode"); } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp index 1b87e30..2b261fc 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp @@ -132,15 +132,6 @@ void vec4_visitor::nir_setup_uniforms() { uniforms = nir->num_uniforms / 16; - - nir_foreach_variable(var, &nir->uniforms) { - /* UBO's and atomics don't take up space in the uniform file */ - if (var->interface_type != NULL || var->type->contains_atomic()) - continue; - - if (type_size_vec4(var->type) > 0) - uniform_size[var->data.driver_location / 16] = type_size_vec4(var->type); - } } void @@ -710,12 +701,14 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) /* Offsets are in bytes but they should always be multiples of 16 */ assert(const_offset->u[0] % 16 == 0); src.reg_offset = const_offset->u[0] / 16; + + emit(MOV(dest, src)); } else { - src_reg tmp = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_D, 1); - src.reladdr = new(mem_ctx) src_reg(tmp); - } + src_reg indirect = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1); - emit(MOV(dest, src)); + emit(SHADER_OPCODE_MOV_INDIRECT, dest, src, + indirect, brw_imm_ud(instr->const_index[1])); + } break; } @@ -1093,15 +1086,29 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) inst->saturate = instr->dest.saturate; break; - case nir_op_fsin: - inst = emit_math(SHADER_OPCODE_SIN, dst, op[0]); - inst->saturate = instr->dest.saturate; + case nir_op_fsin: { + src_reg tmp = src_reg(this, glsl_type::vec4_type); + inst = emit_math(SHADER_OPCODE_SIN, dst_reg(tmp), op[0]); + if (instr->dest.saturate) { + inst->dst = dst; + inst->saturate = true; + } else { + emit(MUL(dst, tmp, brw_imm_f(0.99997))); + } break; + } - case nir_op_fcos: - inst = emit_math(SHADER_OPCODE_COS, dst, op[0]); - inst->saturate = instr->dest.saturate; + case nir_op_fcos: { + src_reg tmp = src_reg(this, glsl_type::vec4_type); + inst = emit_math(SHADER_OPCODE_COS, dst_reg(tmp), op[0]); + if (instr->dest.saturate) { + inst->dst = dst; + inst->saturate = true; + } else { + emit(MUL(dst, tmp, brw_imm_f(0.99997))); + } break; + } case nir_op_idiv: case nir_op_udiv: @@ -1109,9 +1116,41 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) break; case nir_op_umod: + case nir_op_irem: + /* According to the sign table for INT DIV in the Ivy Bridge PRM, it + * appears that our hardware just does the right thing for signed + * remainder. + */ emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]); break; + case nir_op_imod: { + /* Get a regular C-style remainder. If a % b == 0, set the predicate. */ + inst = emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]); + + /* Math instructions don't support conditional mod */ + inst = emit(MOV(dst_null_d(), src_reg(dst))); + inst->conditional_mod = BRW_CONDITIONAL_NZ; + + /* Now, we need to determine if signs of the sources are different. + * When we XOR the sources, the top bit is 0 if they are the same and 1 + * if they are different. We can then use a conditional modifier to + * turn that into a predicate. This leads us to an XOR.l instruction. + */ + src_reg tmp = src_reg(this, glsl_type::ivec4_type); + inst = emit(XOR(dst_reg(tmp), op[0], op[1])); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->conditional_mod = BRW_CONDITIONAL_L; + + /* If the result of the initial remainder operation is non-zero and the + * two sources have different signs, add in a copy of op[1] to get the + * final integer modulus value. + */ + inst = emit(ADD(dst, src_reg(dst), op[1])); + inst->predicate = BRW_PREDICATE_NORMAL; + break; + } + case nir_op_ldexp: unreachable("not reached: should be handled by ldexp_to_arith()"); @@ -1181,6 +1220,32 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) inst->saturate = instr->dest.saturate; break; + case nir_op_fquantize2f16: { + /* See also vec4_visitor::emit_pack_half_2x16() */ + src_reg tmp16 = src_reg(this, glsl_type::uvec4_type); + src_reg tmp32 = src_reg(this, glsl_type::vec4_type); + src_reg zero = src_reg(this, glsl_type::vec4_type); + + /* Check for denormal */ + src_reg abs_src0 = op[0]; + abs_src0.abs = true; + emit(CMP(dst_null_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)), + BRW_CONDITIONAL_L)); + /* Get the appropriately signed zero */ + emit(AND(retype(dst_reg(zero), BRW_REGISTER_TYPE_UD), + retype(op[0], BRW_REGISTER_TYPE_UD), + brw_imm_ud(0x80000000))); + /* Do the actual F32 -> F16 -> F32 conversion */ + emit(F32TO16(dst_reg(tmp16), op[0])); + emit(F16TO32(dst_reg(tmp32), tmp16)); + /* Select that or zero based on normal status */ + inst = emit(BRW_OPCODE_SEL, dst, zero, tmp32); + inst->predicate = BRW_PREDICATE_NORMAL; + inst->predicate_inverse = true; + inst->saturate = instr->dest.saturate; + break; + } + case nir_op_fmin: case nir_op_imin: case nir_op_umin: @@ -1325,6 +1390,24 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) case nir_op_pack_unorm_2x16: unreachable("not reached: should be handled by lower_packing_builtins"); + case nir_op_pack_uvec4_to_uint: + unreachable("not reached"); + + case nir_op_pack_uvec2_to_uint: { + dst_reg tmp1 = dst_reg(this, glsl_type::uint_type); + tmp1.writemask = WRITEMASK_X; + op[0].swizzle = BRW_SWIZZLE_YYYY; + emit(SHL(tmp1, op[0], src_reg(brw_imm_ud(16u)))); + + dst_reg tmp2 = dst_reg(this, glsl_type::uint_type); + tmp2.writemask = WRITEMASK_X; + op[0].swizzle = BRW_SWIZZLE_XXXX; + emit(AND(tmp2, op[0], src_reg(brw_imm_ud(0xffffu)))); + + emit(OR(dst, src_reg(tmp1), src_reg(tmp2))); + break; + } + case nir_op_unpack_half_2x16: /* As NIR does not guarantee that we have a correct swizzle outside the * boundaries of a vector, and the implementation of emit_unpack_half_2x16 @@ -1568,7 +1651,6 @@ vec4_visitor::nir_emit_jump(nir_jump_instr *instr) break; case nir_jump_return: - /* fall through */ default: unreachable("unknown jump"); } @@ -1621,7 +1703,9 @@ glsl_type_for_nir_alu_type(nir_alu_type alu_type, void vec4_visitor::nir_emit_texture(nir_tex_instr *instr) { + unsigned texture = instr->texture_index; unsigned sampler = instr->sampler_index; + src_reg texture_reg = brw_imm_ud(texture); src_reg sampler_reg = brw_imm_ud(sampler); src_reg coordinate; const glsl_type *coord_type = NULL; @@ -1636,6 +1720,10 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr) nir_tex_instr_dest_size(instr)); dst_reg dest = get_nir_dest(instr->dest, instr->dest_type); + /* Our hardware requires a LOD for buffer textures */ + if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) + lod = brw_imm_d(0); + /* Load the texture operation sources */ for (unsigned i = 0; i < instr->num_srcs; i++) { switch (instr->src[i].src_type) { @@ -1697,13 +1785,12 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr) offset_value = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 2); break; - case nir_tex_src_sampler_offset: { - /* The highest sampler which may be used by this operation is + case nir_tex_src_texture_offset: { + /* The highest texture which may be used by this operation is * the last element of the array. Mark it here, because the generator * doesn't have enough information to determine the bound. */ - uint32_t array_size = instr->sampler_array_size; - uint32_t max_used = sampler + array_size - 1; + uint32_t max_used = texture + instr->texture_array_size - 1; if (instr->op == nir_texop_tg4) { max_used += prog_data->base.binding_table.gather_texture_start; } else { @@ -1715,6 +1802,15 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr) /* Emit code to evaluate the actual indexing expression */ src_reg src = get_nir_src(instr->src[i].src, 1); src_reg temp(this, glsl_type::uint_type); + emit(ADD(dst_reg(temp), src, brw_imm_ud(texture))); + texture_reg = emit_uniformize(temp); + break; + } + + case nir_tex_src_sampler_offset: { + /* Emit code to evaluate the actual indexing expression */ + src_reg src = get_nir_src(instr->src[i].src, 1); + src_reg temp(this, glsl_type::uint_type); emit(ADD(dst_reg(temp), src, brw_imm_ud(sampler))); sampler_reg = emit_uniformize(temp); break; @@ -1753,7 +1849,7 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr) /* Stuff the channel select bits in the top of the texture offset */ if (instr->op == nir_texop_tg4) { if (instr->component == 1 && - (key_tex->gather_channel_quirk_mask & (1 << sampler))) { + (key_tex->gather_channel_quirk_mask & (1 << texture))) { /* gather4 sampler is broken for green channel on RG32F -- * we must ask for blue instead. */ @@ -1774,7 +1870,8 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr) shadow_comparitor, lod, lod2, sample_index, constant_offset, offset_value, - mcs, is_cube_array, sampler, sampler_reg); + mcs, is_cube_array, + texture, texture_reg, sampler, sampler_reg); } void diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp index 9b75f45..b7d02e9 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp @@ -59,8 +59,6 @@ vec4_tcs_visitor::emit_nir_code() * copies VS outputs to TES inputs. */ uniforms = 2; - uniform_size[0] = 1; - uniform_size[1] = 1; uint64_t varyings = key->outputs_written; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index 443d0eb..0c5bfb8 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -622,6 +622,7 @@ type_size_vec4(const struct glsl_type *type) case GLSL_TYPE_DOUBLE: case GLSL_TYPE_ERROR: case GLSL_TYPE_INTERFACE: + case GLSL_TYPE_FUNCTION: unreachable("not reached"); } @@ -877,6 +878,8 @@ vec4_visitor::emit_texture(ir_texture_opcode op, src_reg offset_value, src_reg mcs, bool is_cube_array, + uint32_t surface, + src_reg surface_reg, uint32_t sampler, src_reg sampler_reg) { @@ -942,7 +945,8 @@ vec4_visitor::emit_texture(ir_texture_opcode op, inst->dst.writemask = WRITEMASK_XYZW; inst->shadow_compare = shadow_comparitor.file != BAD_FILE; - inst->src[1] = sampler_reg; + inst->src[1] = surface_reg; + inst->src[2] = sampler_reg; /* MRF for the first parameter */ int param_base = inst->base_mrf + inst->header_size; @@ -1068,7 +1072,7 @@ vec4_visitor::emit_texture(ir_texture_opcode op, } if (devinfo->gen == 6 && op == ir_tg4) { - emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst); + emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst); } if (op == ir_query_levels) { @@ -1464,27 +1468,6 @@ vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst, } } -src_reg -vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst, - src_reg *reladdr, int reg_offset) -{ - if (reladdr) { - src_reg index = src_reg(this, glsl_type::int_type); - - emit_before(block, inst, ADD(dst_reg(index), *reladdr, - brw_imm_d(reg_offset * 16))); - - return index; - } else if (devinfo->gen >= 8) { - /* Store the offset in a GRF so we can send-from-GRF. */ - src_reg offset = src_reg(this, glsl_type::int_type); - emit_before(block, inst, MOV(dst_reg(offset), brw_imm_d(reg_offset * 16))); - return offset; - } else { - return brw_imm_d(reg_offset * 16); - } -} - /** * Emits an instruction before @inst to load the value named by @orig_src * from scratch space at @base_offset to @temp. @@ -1662,12 +1645,24 @@ vec4_visitor::move_grf_array_access_to_scratch() void vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst, dst_reg temp, src_reg orig_src, - int base_offset) + int base_offset, src_reg indirect) { int reg_offset = base_offset + orig_src.reg_offset; const unsigned index = prog_data->base.binding_table.pull_constants_start; - src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr, - reg_offset); + + src_reg offset; + if (indirect.file != BAD_FILE) { + offset = src_reg(this, glsl_type::int_type); + + emit_before(block, inst, ADD(dst_reg(offset), indirect, + brw_imm_d(reg_offset * 16))); + } else if (devinfo->gen >= 8) { + /* Store the offset in a GRF so we can send-from-GRF. */ + offset = src_reg(this, glsl_type::int_type); + emit_before(block, inst, MOV(dst_reg(offset), brw_imm_d(reg_offset * 16))); + } else { + offset = brw_imm_d(reg_offset * 16); + } emit_pull_constant_load_reg(temp, brw_imm_ud(index), @@ -1694,59 +1689,55 @@ vec4_visitor::move_uniform_array_access_to_pull_constants() { int pull_constant_loc[this->uniforms]; memset(pull_constant_loc, -1, sizeof(pull_constant_loc)); - bool nested_reladdr; - /* Walk through and find array access of uniforms. Put a copy of that - * uniform in the pull constant buffer. - * - * Note that we don't move constant-indexed accesses to arrays. No - * testing has been done of the performance impact of this choice. + /* First, walk through the instructions and determine which things need to + * be pulled. We mark something as needing to be pulled by setting + * pull_constant_loc to 0. */ - do { - nested_reladdr = false; - - foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { - for (int i = 0 ; i < 3; i++) { - if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr) - continue; + foreach_block_and_inst(block, vec4_instruction, inst, cfg) { + /* We only care about MOV_INDIRECT of a uniform */ + if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT || + inst->src[0].file != UNIFORM) + continue; - int uniform = inst->src[i].nr; + int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset; - if (inst->src[i].reladdr->reladdr) - nested_reladdr = true; /* will need another pass */ + for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++) + pull_constant_loc[uniform_nr + j] = 0; + } - /* If this array isn't already present in the pull constant buffer, - * add it. - */ - if (pull_constant_loc[uniform] == -1) { - const gl_constant_value **values = - &stage_prog_data->param[uniform * 4]; + /* Next, we walk the list of uniforms and assign real pull constant + * locations and set their corresponding entries in pull_param. + */ + for (int j = 0; j < this->uniforms; j++) { + if (pull_constant_loc[j] < 0) + continue; - pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4; + pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4; - assert(uniform < uniform_array_size); - for (int j = 0; j < uniform_size[uniform] * 4; j++) { - stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] - = values[j]; - } - } + for (int i = 0; i < 4; i++) { + stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] + = stage_prog_data->param[j * 4 + i]; + } + } - /* Set up the annotation tracking for new generated instructions. */ - base_ir = inst->ir; - current_annotation = inst->annotation; + /* Finally, we can walk through the instructions and lower MOV_INDIRECT + * instructions to actual uniform pulls. + */ + foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { + /* We only care about MOV_INDIRECT of a uniform */ + if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT || + inst->src[0].file != UNIFORM) + continue; - dst_reg temp = dst_reg(this, glsl_type::vec4_type); + int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset; - emit_pull_constant_load(block, inst, temp, inst->src[i], - pull_constant_loc[uniform]); + assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP); - inst->src[i].file = temp.file; - inst->src[i].nr = temp.nr; - inst->src[i].reg_offset = temp.reg_offset; - inst->src[i].reladdr = NULL; - } - } - } while (nested_reladdr); + emit_pull_constant_load(block, inst, inst->dst, inst->src[0], + pull_constant_loc[uniform_nr], inst->src[1]); + inst->remove(block); + } /* Now there are no accesses of the UNIFORM file with a reladdr, so * no need to track them as larger-than-vec4 objects. This will be @@ -1799,17 +1790,6 @@ vec4_visitor::vec4_visitor(const struct brw_compiler *compiler, this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF; this->uniforms = 0; - - /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires - * at least one. See setup_uniforms() in brw_vec4.cpp. - */ - this->uniform_array_size = 1; - if (prog_data) { - this->uniform_array_size = - MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1); - } - - this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size); } vec4_visitor::~vec4_visitor() diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp index 1d69149..86701f3 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp @@ -270,7 +270,6 @@ void vec4_vs_visitor::setup_uniform_clipplane_values() { for (int i = 0; i < key->nr_userclip_plane_consts; ++i) { - assert(this->uniforms < uniform_array_size); this->userplane[i] = dst_reg(UNIFORM, this->uniforms); this->userplane[i].type = BRW_REGISTER_TYPE_F; for (int j = 0; j < 4; ++j) { diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index 15dd1ca..c5400ab 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -2485,6 +2485,11 @@ struct gl_uniform_block GLuint Binding; /** + * Vulkan descriptor set qualifier for this block. + */ + GLuint Set; + + /** * Minimum size (in bytes) of a buffer object to back this uniform buffer * (GL_UNIFORM_BLOCK_DATA_SIZE). */ diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp index 9cde28d..88d8337 100644 --- a/src/mesa/program/ir_to_mesa.cpp +++ b/src/mesa/program/ir_to_mesa.cpp @@ -541,6 +541,7 @@ type_size(const struct glsl_type *type) case GLSL_TYPE_VOID: case GLSL_TYPE_ERROR: case GLSL_TYPE_INTERFACE: + case GLSL_TYPE_FUNCTION: assert(!"Invalid type in type_size"); break; } @@ -1244,10 +1245,7 @@ ir_to_mesa_visitor::visit(ir_expression *ir) case ir_unop_unpack_unorm_2x16: case ir_unop_unpack_unorm_4x8: case ir_unop_unpack_half_2x16: - case ir_unop_unpack_half_2x16_split_x: - case ir_unop_unpack_half_2x16_split_y: case ir_unop_unpack_double_2x32: - case ir_binop_pack_half_2x16_split: case ir_unop_bitfield_reverse: case ir_unop_bit_count: case ir_unop_find_msb: @@ -2446,6 +2444,7 @@ _mesa_associate_uniform_storage(struct gl_context *ctx, case GLSL_TYPE_STRUCT: case GLSL_TYPE_ERROR: case GLSL_TYPE_INTERFACE: + case GLSL_TYPE_FUNCTION: assert(!"Should not get here."); break; } diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c index ce6f699..72c9e97 100644 --- a/src/mesa/program/prog_to_nir.c +++ b/src/mesa/program/prog_to_nir.c @@ -609,6 +609,7 @@ ptn_tex(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src, instr->op = op; instr->dest_type = nir_type_float; instr->is_shadow = prog_inst->TexShadow; + instr->texture_index = prog_inst->TexSrcUnit; instr->sampler_index = prog_inst->TexSrcUnit; switch (prog_inst->TexSrcTarget) { diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index d424e3b..a06683f 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -2177,12 +2177,9 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir) case ir_unop_unpack_snorm_2x16: case ir_unop_unpack_unorm_2x16: - case ir_unop_unpack_half_2x16_split_x: - case ir_unop_unpack_half_2x16_split_y: case ir_unop_unpack_snorm_4x8: case ir_unop_unpack_unorm_4x8: - case ir_binop_pack_half_2x16_split: case ir_quadop_vector: case ir_binop_vector_extract: case ir_triop_vector_insert: |