Diffstat (limited to 'src/compiler/nir')
40 files changed, 1028 insertions, 385 deletions
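The change running through this whole patch: every SSA definition and register now carries an explicit channel bit size (8, 16, 32, or 64) instead of an implicit 32 bits, so nir_ssa_dest_init() and nir_ssa_def_init() grow a bit_size parameter and most existing call sites simply pass 32. A minimal sketch of the new call pattern, mirroring the glsl_to_nir.cpp hunks below ('shader', 'b', and 'type' stand in for the visitor state; this is illustrative, not part of the patch):

/* Sized destinations: derive the channel width from the GLSL base type
 * rather than assuming 32 bits everywhere. (Source operand setup elided.) */
nir_intrinsic_instr *load =
   nir_intrinsic_instr_create(shader, nir_intrinsic_load_ssbo);
load->num_components = type->vector_elements;
unsigned bit_size = glsl_get_bit_size(type->base_type);
nir_ssa_dest_init(&load->instr, &load->dest,
                  type->vector_elements, bit_size, NULL);
nir_builder_instr_insert(&b, &load->instr);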
diff --git a/src/compiler/nir/glsl_to_nir.cpp b/src/compiler/nir/glsl_to_nir.cpp index da5d730..7b8b466 100644 --- a/src/compiler/nir/glsl_to_nir.cpp +++ b/src/compiler/nir/glsl_to_nir.cpp @@ -731,7 +731,7 @@ nir_visitor::visit(ir_call *ir) ir_dereference *param = (ir_dereference *) ir->actual_parameters.get_head(); instr->variables[0] = evaluate_deref(&instr->instr, param); - nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL); + nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL); nir_builder_instr_insert(&b, &instr->instr); break; } @@ -765,7 +765,7 @@ nir_visitor::visit(ir_call *ir) const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; nir_ssa_dest_init(&instr->instr, &instr->dest, - info->dest_components, NULL); + info->dest_components, 32, NULL); } if (op == nir_intrinsic_image_size || @@ -826,7 +826,7 @@ nir_visitor::visit(ir_call *ir) nir_builder_instr_insert(&b, &instr->instr); break; case nir_intrinsic_shader_clock: - nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL); + nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL); nir_builder_instr_insert(&b, &instr->instr); break; case nir_intrinsic_store_ssbo: { @@ -867,7 +867,7 @@ nir_visitor::visit(ir_call *ir) /* Setup destination register */ nir_ssa_dest_init(&instr->instr, &instr->dest, - type->vector_elements, NULL); + type->vector_elements, 32, NULL); /* Insert the created nir instruction now since in the case of boolean * result we will need to emit another instruction after it @@ -890,7 +890,7 @@ nir_visitor::visit(ir_call *ir) load_ssbo_compare->src[1].swizzle[i] = 0; nir_ssa_dest_init(&load_ssbo_compare->instr, &load_ssbo_compare->dest.dest, - type->vector_elements, NULL); + type->vector_elements, 32, NULL); load_ssbo_compare->dest.write_mask = (1 << type->vector_elements) - 1; nir_builder_instr_insert(&b, &load_ssbo_compare->instr); dest = &load_ssbo_compare->dest.dest; @@ -936,7 +936,7 @@ nir_visitor::visit(ir_call *ir) /* Atomic result */ assert(ir->return_deref); nir_ssa_dest_init(&instr->instr, &instr->dest, - ir->return_deref->type->vector_elements, NULL); + ir->return_deref->type->vector_elements, 32, NULL); nir_builder_instr_insert(&b, &instr->instr); break; } @@ -951,8 +951,9 @@ nir_visitor::visit(ir_call *ir) instr->num_components = type->vector_elements; /* Setup destination register */ + unsigned bit_size = glsl_get_bit_size(type->base_type); nir_ssa_dest_init(&instr->instr, &instr->dest, - type->vector_elements, NULL); + type->vector_elements, bit_size, NULL); nir_builder_instr_insert(&b, &instr->instr); break; @@ -1013,8 +1014,10 @@ nir_visitor::visit(ir_call *ir) /* Atomic result */ assert(ir->return_deref); + unsigned bit_size = glsl_get_bit_size(ir->return_deref->type->base_type); nir_ssa_dest_init(&instr->instr, &instr->dest, - ir->return_deref->type->vector_elements, NULL); + ir->return_deref->type->vector_elements, + bit_size, NULL); nir_builder_instr_insert(&b, &instr->instr); break; } @@ -1061,6 +1064,9 @@ nir_visitor::visit(ir_assignment *ir) { unsigned num_components = ir->lhs->type->vector_elements; + b.exact = ir->lhs->variable_referenced()->data.invariant || + ir->lhs->variable_referenced()->data.precise; + if ((ir->rhs->as_dereference() || ir->rhs->as_constant()) && (ir->write_mask == (1 << num_components) - 1 || ir->write_mask == 0)) { /* We're doing a plain-as-can-be copy, so emit a copy_var */ @@ -1163,7 +1169,7 @@ nir_visitor::add_instr(nir_instr *instr, unsigned num_components) nir_dest *dest = get_instr_dest(instr); if (dest) - 
nir_ssa_dest_init(instr, dest, num_components, NULL); + nir_ssa_dest_init(instr, dest, num_components, 32, NULL); nir_builder_instr_insert(&b, instr); @@ -1203,6 +1209,7 @@ nir_visitor::visit(ir_expression *ir) nir_intrinsic_instr *load = nir_intrinsic_instr_create(this->shader, nir_intrinsic_load_ubo); load->num_components = ir->type->vector_elements; + load->dest.ssa.bit_size = glsl_get_bit_size(ir->type->base_type); load->src[0] = nir_src_for_ssa(evaluate_rvalue(ir->operands[0])); load->src[1] = nir_src_for_ssa(evaluate_rvalue(ir->operands[1])); add_instr(&load->instr, ir->type->vector_elements); diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c index 7e41ed3..b67916d 100644 --- a/src/compiler/nir/nir.c +++ b/src/compiler/nir/nir.c @@ -70,6 +70,7 @@ reg_create(void *mem_ctx, struct exec_list *list) list_inithead(®->if_uses); reg->num_components = 0; + reg->bit_size = 32; reg->num_array_elems = 0; reg->is_packed = false; reg->name = NULL; @@ -473,7 +474,7 @@ nir_load_const_instr_create(nir_shader *shader, unsigned num_components) nir_load_const_instr *instr = ralloc(shader, nir_load_const_instr); instr_init(&instr->instr, nir_instr_type_load_const); - nir_ssa_def_init(&instr->instr, &instr->def, num_components, NULL); + nir_ssa_def_init(&instr->instr, &instr->def, num_components, 32, NULL); return instr; } @@ -562,7 +563,7 @@ nir_ssa_undef_instr_create(nir_shader *shader, unsigned num_components) nir_ssa_undef_instr *instr = ralloc(shader, nir_ssa_undef_instr); instr_init(&instr->instr, nir_instr_type_ssa_undef); - nir_ssa_def_init(&instr->instr, &instr->def, num_components, NULL); + nir_ssa_def_init(&instr->instr, &instr->def, num_components, 32, NULL); return instr; } @@ -699,10 +700,10 @@ nir_deref_get_const_initializer_load(nir_shader *shader, nir_deref_var *deref) case GLSL_TYPE_FLOAT: case GLSL_TYPE_INT: case GLSL_TYPE_UINT: - load->value.u[i] = constant->value.u[matrix_offset + i]; + load->value.u32[i] = constant->value.u[matrix_offset + i]; break; case GLSL_TYPE_BOOL: - load->value.u[i] = constant->value.b[matrix_offset + i] ? + load->value.u32[i] = constant->value.b[matrix_offset + i] ? NIR_TRUE : NIR_FALSE; break; default: @@ -731,18 +732,11 @@ reduce_cursor(nir_cursor cursor) { switch (cursor.option) { case nir_cursor_before_block: + assert(nir_cf_node_prev(&cursor.block->cf_node) == NULL || + nir_cf_node_prev(&cursor.block->cf_node)->type != nir_cf_node_block); if (exec_list_is_empty(&cursor.block->instr_list)) { /* Empty block. After is as good as before. */ cursor.option = nir_cursor_after_block; - } else { - /* Try to switch to after the previous block if there is one. - * (This isn't likely, but it can happen.) 
- */ - nir_cf_node *prev_node = nir_cf_node_prev(&cursor.block->cf_node); - if (prev_node && prev_node->type == nir_cf_node_block) { - cursor.block = nir_cf_node_as_block(prev_node); - cursor.option = nir_cursor_after_block; - } } return cursor; @@ -1379,15 +1373,18 @@ nir_instr_rewrite_dest(nir_instr *instr, nir_dest *dest, nir_dest new_dest) src_add_all_uses(dest->reg.indirect, instr, NULL); } +/* note: does *not* take ownership of 'name' */ void nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def, - unsigned num_components, const char *name) + unsigned num_components, + unsigned bit_size, const char *name) { - def->name = name; + def->name = ralloc_strdup(instr, name); def->parent_instr = instr; list_inithead(&def->uses); list_inithead(&def->if_uses); def->num_components = num_components; + def->bit_size = bit_size; if (instr->block) { nir_function_impl *impl = @@ -1399,12 +1396,14 @@ nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def, } } +/* note: does *not* take ownership of 'name' */ void nir_ssa_dest_init(nir_instr *instr, nir_dest *dest, - unsigned num_components, const char *name) + unsigned num_components, unsigned bit_size, + const char *name) { dest->is_ssa = true; - nir_ssa_def_init(instr, &dest->ssa, num_components, name); + nir_ssa_def_init(instr, &dest->ssa, num_components, bit_size, name); } void diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index ab1afdb..2fd75ec 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -101,6 +101,7 @@ union nir_constant_data { int i[16]; float f[16]; bool b[16]; + double d[16]; }; typedef struct nir_constant { @@ -381,6 +382,9 @@ typedef struct nir_register { unsigned num_components; /** < number of vector components */ unsigned num_array_elems; /** < size of array (0 for no array) */ + /* The bit-size of each channel; must be one of 8, 16, 32, or 64 */ + uint8_t bit_size; + /** generic register index. */ unsigned index; @@ -488,6 +492,9 @@ typedef struct nir_ssa_def { struct list_head if_uses; uint8_t num_components; + + /* The bit-size of each channel; must be one of 8, 16, 32, or 64 */ + uint8_t bit_size; } nir_ssa_def; struct nir_src; @@ -594,6 +601,18 @@ nir_dest_for_reg(nir_register *reg) return dest; } +static inline unsigned +nir_src_bit_size(nir_src src) +{ + return src.is_ssa ? src.ssa->bit_size : src.reg.reg->bit_size; +} + +static inline unsigned +nir_dest_bit_size(nir_dest dest) +{ + return dest.is_ssa ? 
dest.ssa.bit_size : dest.reg.reg->bit_size; +} + void nir_src_copy(nir_src *dest, const nir_src *src, void *instr_or_if); void nir_dest_copy(nir_dest *dest, const nir_dest *src, nir_instr *instr); @@ -649,9 +668,36 @@ typedef enum { nir_type_float, nir_type_int, nir_type_uint, - nir_type_bool + nir_type_bool, + nir_type_bool32 = 32 | nir_type_bool, + nir_type_int8 = 8 | nir_type_int, + nir_type_int16 = 16 | nir_type_int, + nir_type_int32 = 32 | nir_type_int, + nir_type_int64 = 64 | nir_type_int, + nir_type_uint8 = 8 | nir_type_uint, + nir_type_uint16 = 16 | nir_type_uint, + nir_type_uint32 = 32 | nir_type_uint, + nir_type_uint64 = 64 | nir_type_uint, + nir_type_float16 = 16 | nir_type_float, + nir_type_float32 = 32 | nir_type_float, + nir_type_float64 = 64 | nir_type_float, } nir_alu_type; +#define NIR_ALU_TYPE_SIZE_MASK 0xfffffff8 +#define NIR_ALU_TYPE_BASE_TYPE_MASK 0x00000007 + +static inline unsigned +nir_alu_type_get_type_size(nir_alu_type type) +{ + return type & NIR_ALU_TYPE_SIZE_MASK; +} + +static inline unsigned +nir_alu_type_get_base_type(nir_alu_type type) +{ + return type & NIR_ALU_TYPE_BASE_TYPE_MASK; +} + typedef enum { NIR_OP_IS_COMMUTATIVE = (1 << 0), NIR_OP_IS_ASSOCIATIVE = (1 << 1), @@ -708,6 +754,17 @@ extern const nir_op_info nir_op_infos[nir_num_opcodes]; typedef struct nir_alu_instr { nir_instr instr; nir_op op; + + /** Indicates that this ALU instruction generates an exact value + * + * This is kind of a mixture of GLSL "precise" and "invariant" and not + * really equivalent to either. This indicates that the value generated by + * this operation is high-precision and any code transformations that touch + * it must ensure that the resulting value is bit-for-bit identical to the + * original. + */ + bool exact; + nir_alu_dest dest; nir_alu_src src[]; } nir_alu_instr; @@ -1218,9 +1275,12 @@ nir_tex_instr_src_index(nir_tex_instr *instr, nir_tex_src_type type) typedef struct { union { - float f[4]; - int32_t i[4]; - uint32_t u[4]; + float f32[4]; + double f64[4]; + int32_t i32[4]; + uint32_t u32[4]; + int64_t i64[4]; + uint64_t u64[4]; }; } nir_const_value; @@ -2061,9 +2121,11 @@ void nir_instr_rewrite_dest(nir_instr *instr, nir_dest *dest, nir_dest new_dest); void nir_ssa_dest_init(nir_instr *instr, nir_dest *dest, - unsigned num_components, const char *name); + unsigned num_components, unsigned bit_size, + const char *name); void nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def, - unsigned num_components, const char *name); + unsigned num_components, unsigned bit_size, + const char *name); void nir_ssa_def_rewrite_uses(nir_ssa_def *def, nir_src new_src); void nir_ssa_def_rewrite_uses_after(nir_ssa_def *def, nir_src new_src, nir_instr *after_me); @@ -2094,9 +2156,10 @@ void nir_index_blocks(nir_function_impl *impl); void nir_print_shader(nir_shader *shader, FILE *fp); void nir_print_instr(const nir_instr *instr, FILE *fp); -nir_shader * nir_shader_clone(void *mem_ctx, const nir_shader *s); +nir_shader *nir_shader_clone(void *mem_ctx, const nir_shader *s); nir_function_impl *nir_function_impl_clone(const nir_function_impl *fi); nir_constant *nir_constant_clone(const nir_constant *c, nir_variable *var); +nir_variable *nir_variable_clone(const nir_variable *c, nir_shader *shader); #ifdef DEBUG void nir_validate_shader(nir_shader *shader); diff --git a/src/compiler/nir/nir_algebraic.py b/src/compiler/nir/nir_algebraic.py index 2357b57..d05564f 100644 --- a/src/compiler/nir/nir_algebraic.py +++ b/src/compiler/nir/nir_algebraic.py @@ -63,12 +63,13 @@ class Value(object): 
static const ${val.c_type} ${val.name} = { { ${val.type_enum} }, % if isinstance(val, Constant): - { ${hex(val)} /* ${val.value} */ }, + ${val.type()}, { ${hex(val)} /* ${val.value} */ }, % elif isinstance(val, Variable): ${val.index}, /* ${val.var_name} */ ${'true' if val.is_constant else 'false'}, - nir_type_${ val.required_type or 'invalid' }, + ${val.type() or 'nir_type_invalid' }, % elif isinstance(val, Expression): + ${'true' if val.inexact else 'false'}, nir_op_${val.opcode}, { ${', '.join(src.c_ptr for src in val.sources)} }, % endif @@ -107,10 +108,18 @@ class Constant(Value): if isinstance(self.value, (int, long)): return hex(self.value) elif isinstance(self.value, float): - return hex(struct.unpack('I', struct.pack('f', self.value))[0]) + return hex(struct.unpack('Q', struct.pack('d', self.value))[0]) else: assert False + def type(self): + if isinstance(self.value, (bool)): + return "nir_type_bool32" + elif isinstance(self.value, (int, long)): + return "nir_type_int" + elif isinstance(self.value, float): + return "nir_type_float" + _var_name_re = re.compile(r"(?P<const>#)?(?P<name>\w+)(?:@(?P<type>\w+))?") class Variable(Value): @@ -129,12 +138,26 @@ class Variable(Value): self.index = varset[self.var_name] + def type(self): + if self.required_type == 'bool': + return "nir_type_bool32" + elif self.required_type in ('int', 'unsigned'): + return "nir_type_int" + elif self.required_type == 'float': + return "nir_type_float" + +_opcode_re = re.compile(r"(?P<inexact>~)?(?P<opcode>\w+)") + class Expression(Value): def __init__(self, expr, name_base, varset): Value.__init__(self, name_base, "expression") assert isinstance(expr, tuple) - self.opcode = expr[0] + m = _opcode_re.match(expr[0]) + assert m and m.group('opcode') is not None + + self.opcode = m.group('opcode') + self.inexact = m.group('inexact') is not None self.sources = [ Value.create(src, "{0}_{1}".format(name_base, i), varset) for (i, src) in enumerate(expr[1:]) ] diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h index b4dde54..94f183c 100644 --- a/src/compiler/nir/nir_builder.h +++ b/src/compiler/nir/nir_builder.h @@ -31,6 +31,9 @@ struct exec_list; typedef struct nir_builder { nir_cursor cursor; + /* Whether new ALU instructions will be marked "exact" */ + bool exact; + nir_shader *shader; nir_function_impl *impl; } nir_builder; @@ -39,6 +42,7 @@ static inline void nir_builder_init(nir_builder *build, nir_function_impl *impl) { memset(build, 0, sizeof(*build)); + build->exact = false; build->impl = impl; build->shader = impl->function->shader; } @@ -50,6 +54,7 @@ nir_builder_init_simple_shader(nir_builder *build, void *mem_ctx, { build->shader = nir_shader_create(mem_ctx, stage, options); nir_function *func = nir_function_create(build->shader, "main"); + build->exact = false; build->impl = nir_function_impl_create(func); build->cursor = nir_after_cf_list(&build->impl->body); } @@ -104,7 +109,7 @@ nir_imm_float(nir_builder *build, float x) nir_const_value v; memset(&v, 0, sizeof(v)); - v.f[0] = x; + v.f32[0] = x; return nir_build_imm(build, 1, v); } @@ -115,10 +120,10 @@ nir_imm_vec4(nir_builder *build, float x, float y, float z, float w) nir_const_value v; memset(&v, 0, sizeof(v)); - v.f[0] = x; - v.f[1] = y; - v.f[2] = z; - v.f[3] = w; + v.f32[0] = x; + v.f32[1] = y; + v.f32[2] = z; + v.f32[3] = w; return nir_build_imm(build, 4, v); } @@ -129,7 +134,7 @@ nir_imm_int(nir_builder *build, int x) nir_const_value v; memset(&v, 0, sizeof(v)); - v.i[0] = x; + v.i32[0] = x; return nir_build_imm(build, 
1, v); } @@ -140,10 +145,10 @@ nir_imm_ivec4(nir_builder *build, int x, int y, int z, int w) nir_const_value v; memset(&v, 0, sizeof(v)); - v.i[0] = x; - v.i[1] = y; - v.i[2] = z; - v.i[3] = w; + v.i32[0] = x; + v.i32[1] = y; + v.i32[2] = z; + v.i32[3] = w; return nir_build_imm(build, 4, v); } @@ -157,6 +162,8 @@ nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0, if (!instr) return NULL; + instr->exact = build->exact; + instr->src[0].src = nir_src_for_ssa(src0); if (src1) instr->src[1].src = nir_src_for_ssa(src1); @@ -178,6 +185,25 @@ nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0, } assert(num_components != 0); + /* Figure out the bitwidth based on the source bitwidth if the instruction + * is variable-width. + */ + unsigned bit_size = nir_alu_type_get_type_size(op_info->output_type); + if (bit_size == 0) { + for (unsigned i = 0; i < op_info->num_inputs; i++) { + unsigned src_bit_size = instr->src[i].src.ssa->bit_size; + if (nir_alu_type_get_type_size(op_info->input_types[i]) == 0) { + if (bit_size) + assert(src_bit_size == bit_size); + else + bit_size = src_bit_size; + } else { + assert(src_bit_size == + nir_alu_type_get_type_size(op_info->input_types[i])); + } + } + } + /* Make sure we don't swizzle from outside of our source vector (like if a * scalar value was passed into a multiply with a vector). */ @@ -187,7 +213,8 @@ nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0, } } - nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, NULL); + nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, + bit_size, NULL); instr->dest.write_mask = (1 << num_components) - 1; nir_builder_instr_insert(build, &instr->instr); @@ -252,7 +279,9 @@ static inline nir_ssa_def * nir_fmov_alu(nir_builder *build, nir_alu_src src, unsigned num_components) { nir_alu_instr *mov = nir_alu_instr_create(build->shader, nir_op_fmov); - nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, NULL); + nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, + nir_src_bit_size(src.src), NULL); + mov->exact = build->exact; mov->dest.write_mask = (1 << num_components) - 1; mov->src[0] = src; nir_builder_instr_insert(build, &mov->instr); @@ -264,7 +293,9 @@ static inline nir_ssa_def * nir_imov_alu(nir_builder *build, nir_alu_src src, unsigned num_components) { nir_alu_instr *mov = nir_alu_instr_create(build->shader, nir_op_imov); - nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, NULL); + nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, + nir_src_bit_size(src.src), NULL); + mov->exact = build->exact; mov->dest.write_mask = (1 << num_components) - 1; mov->src[0] = src; nir_builder_instr_insert(build, &mov->instr); @@ -360,7 +391,8 @@ nir_load_var(nir_builder *build, nir_variable *var) nir_intrinsic_instr_create(build->shader, nir_intrinsic_load_var); load->num_components = num_components; load->variables[0] = nir_deref_var_create(load, var); - nir_ssa_dest_init(&load->instr, &load->dest, num_components, NULL); + nir_ssa_dest_init(&load->instr, &load->dest, num_components, + glsl_get_bit_size(glsl_get_base_type(var->type)), NULL); nir_builder_instr_insert(build, &load->instr); return &load->dest.ssa; } @@ -426,7 +458,7 @@ nir_load_system_value(nir_builder *build, nir_intrinsic_op op, int index) load->num_components = nir_intrinsic_infos[op].dest_components; load->const_index[0] = index; nir_ssa_dest_init(&load->instr, &load->dest, - nir_intrinsic_infos[op].dest_components, NULL); + 
nir_intrinsic_infos[op].dest_components, 32, NULL); nir_builder_instr_insert(build, &load->instr); return &load->dest.ssa; } diff --git a/src/compiler/nir/nir_clone.c b/src/compiler/nir/nir_clone.c index 3268deb..7d2e383 100644 --- a/src/compiler/nir/nir_clone.c +++ b/src/compiler/nir/nir_clone.c @@ -127,11 +127,10 @@ nir_constant_clone(const nir_constant *c, nir_variable *nvar) /* NOTE: for cloning nir_variable's, bypass nir_variable_create to avoid * having to deal with locals and globals separately: */ -static nir_variable * -clone_variable(clone_state *state, const nir_variable *var) +nir_variable * +nir_variable_clone(const nir_variable *var, nir_shader *shader) { - nir_variable *nvar = rzalloc(state->ns, nir_variable); - add_remap(state, nvar, var); + nir_variable *nvar = rzalloc(shader, nir_variable); nvar->type = var->type; nvar->name = ralloc_strdup(nvar, var->name); @@ -149,6 +148,15 @@ clone_variable(clone_state *state, const nir_variable *var) return nvar; } +static nir_variable * +clone_variable(clone_state *state, const nir_variable *var) +{ + nir_variable *nvar = nir_variable_clone(var, state->ns); + add_remap(state, nvar, var); + + return nvar; +} + /* clone list of nir_variable: */ static void clone_var_list(clone_state *state, struct exec_list *dst, @@ -220,7 +228,8 @@ __clone_dst(clone_state *state, nir_instr *ninstr, { ndst->is_ssa = dst->is_ssa; if (dst->is_ssa) { - nir_ssa_dest_init(ninstr, ndst, dst->ssa.num_components, dst->ssa.name); + nir_ssa_dest_init(ninstr, ndst, dst->ssa.num_components, + dst->ssa.bit_size, dst->ssa.name); add_remap(state, &ndst->ssa, &dst->ssa); } else { ndst->reg.reg = remap_reg(state, dst->reg.reg); @@ -303,6 +312,7 @@ static nir_alu_instr * clone_alu(clone_state *state, const nir_alu_instr *alu) { nir_alu_instr *nalu = nir_alu_instr_create(state->ns, alu->op); + nalu->exact = alu->exact; __clone_dst(state, &nalu->instr, &nalu->dest.dest, &alu->dest.dest); nalu->dest.saturate = alu->dest.saturate; diff --git a/src/compiler/nir/nir_constant_expressions.h b/src/compiler/nir/nir_constant_expressions.h index 97997f2..201f278 100644 --- a/src/compiler/nir/nir_constant_expressions.h +++ b/src/compiler/nir/nir_constant_expressions.h @@ -28,4 +28,4 @@ #include "nir.h" nir_const_value nir_eval_const_opcode(nir_op op, unsigned num_components, - nir_const_value *src); + unsigned bit_size, nir_const_value *src); diff --git a/src/compiler/nir/nir_constant_expressions.py b/src/compiler/nir/nir_constant_expressions.py index 32784f6..e36dc48 100644 --- a/src/compiler/nir/nir_constant_expressions.py +++ b/src/compiler/nir/nir_constant_expressions.py @@ -1,4 +1,43 @@ #! 
/usr/bin/python2 + +def type_has_size(type_): + return type_[-1:].isdigit() + +def type_sizes(type_): + if type_.endswith("8"): + return [8] + elif type_.endswith("16"): + return [16] + elif type_.endswith("32"): + return [32] + elif type_.endswith("64"): + return [64] + else: + return [32, 64] + +def type_add_size(type_, size): + if type_has_size(type_): + return type_ + return type_ + str(size) + +def get_const_field(type_): + if type_ == "int32": + return "i32" + if type_ == "uint32": + return "u32" + if type_ == "int64": + return "i64" + if type_ == "uint64": + return "u64" + if type_ == "bool32": + return "u32" + if type_ == "float32": + return "f32" + if type_ == "float64": + return "f64" + raise Exception(str(type_)) + assert(0) + template = """\ /* * Copyright (C) 2014 Intel Corporation @@ -205,110 +244,140 @@ unpack_half_1x16(uint16_t u) } /* Some typed vector structures to make things like src0.y work */ -% for type in ["float", "int", "uint", "bool"]: -struct ${type}_vec { - ${type} x; - ${type} y; - ${type} z; - ${type} w; +typedef float float32_t; +typedef double float64_t; +typedef bool bool32_t; +% for type in ["float", "int", "uint"]: +% for width in [32, 64]: +struct ${type}${width}_vec { + ${type}${width}_t x; + ${type}${width}_t y; + ${type}${width}_t z; + ${type}${width}_t w; }; % endfor +% endfor + +struct bool32_vec { + bool x; + bool y; + bool z; + bool w; +}; % for name, op in sorted(opcodes.iteritems()): static nir_const_value -evaluate_${name}(unsigned num_components, nir_const_value *_src) +evaluate_${name}(unsigned num_components, unsigned bit_size, + nir_const_value *_src) { nir_const_value _dst_val = { { {0, 0, 0, 0} } }; - ## For each non-per-component input, create a variable srcN that - ## contains x, y, z, and w elements which are filled in with the - ## appropriately-typed values. - % for j in range(op.num_inputs): - % if op.input_sizes[j] == 0: - <% continue %> - % elif "src" + str(j) not in op.const_expr: - ## Avoid unused variable warnings - <% continue %> - %endif - - struct ${op.input_types[j]}_vec src${j} = { - % for k in range(op.input_sizes[j]): - % if op.input_types[j] == "bool": - _src[${j}].u[${k}] != 0, - % else: - _src[${j}].${op.input_types[j][:1]}[${k}], - % endif - % endfor - }; - % endfor + switch (bit_size) { + % for bit_size in [32, 64]: + case ${bit_size}: { + <% + output_type = type_add_size(op.output_type, bit_size) + input_types = [type_add_size(type_, bit_size) for type_ in op.input_types] + %> + + ## For each non-per-component input, create a variable srcN that + ## contains x, y, z, and w elements which are filled in with the + ## appropriately-typed values. + % for j in range(op.num_inputs): + % if op.input_sizes[j] == 0: + <% continue %> + % elif "src" + str(j) not in op.const_expr: + ## Avoid unused variable warnings + <% continue %> + %endif - % if op.output_size == 0: - ## For per-component instructions, we need to iterate over the - ## components and apply the constant expression one component - ## at a time. - for (unsigned _i = 0; _i < num_components; _i++) { - ## For each per-component input, create a variable srcN that - ## contains the value of the current (_i'th) component. 
- % for j in range(op.num_inputs): - % if op.input_sizes[j] != 0: - <% continue %> - % elif "src" + str(j) not in op.const_expr: - ## Avoid unused variable warnings - <% continue %> - % elif op.input_types[j] == "bool": - bool src${j} = _src[${j}].u[_i] != 0; + struct ${input_types[j]}_vec src${j} = { + % for k in range(op.input_sizes[j]): + % if input_types[j] == "bool32": + _src[${j}].u32[${k}] != 0, % else: - ${op.input_types[j]} src${j} = _src[${j}].${op.input_types[j][:1]}[_i]; + _src[${j}].${get_const_field(input_types[j])}[${k}], % endif % endfor + }; + % endfor + + % if op.output_size == 0: + ## For per-component instructions, we need to iterate over the + ## components and apply the constant expression one component + ## at a time. + for (unsigned _i = 0; _i < num_components; _i++) { + ## For each per-component input, create a variable srcN that + ## contains the value of the current (_i'th) component. + % for j in range(op.num_inputs): + % if op.input_sizes[j] != 0: + <% continue %> + % elif "src" + str(j) not in op.const_expr: + ## Avoid unused variable warnings + <% continue %> + % elif input_types[j] == "bool32": + bool src${j} = _src[${j}].u32[_i] != 0; + % else: + ${input_types[j]}_t src${j} = + _src[${j}].${get_const_field(input_types[j])}[_i]; + % endif + % endfor + + ## Create an appropriately-typed variable dst and assign the + ## result of the const_expr to it. If const_expr already contains + ## writes to dst, just include const_expr directly. + % if "dst" in op.const_expr: + ${output_type}_t dst; + ${op.const_expr} + % else: + ${output_type}_t dst = ${op.const_expr}; + % endif + + ## Store the current component of the actual destination to the + ## value of dst. + % if output_type == "bool32": + ## Sanitize the C value to a proper NIR bool + _dst_val.u32[_i] = dst ? NIR_TRUE : NIR_FALSE; + % else: + _dst_val.${get_const_field(output_type)}[_i] = dst; + % endif + } + % else: + ## In the non-per-component case, create a struct dst with + ## appropriately-typed elements x, y, z, and w and assign the result + ## of the const_expr to all components of dst, or include the + ## const_expr directly if it writes to dst already. + struct ${output_type}_vec dst; - ## Create an appropriately-typed variable dst and assign the - ## result of the const_expr to it. If const_expr already contains - ## writes to dst, just include const_expr directly. % if "dst" in op.const_expr: - ${op.output_type} dst; ${op.const_expr} % else: - ${op.output_type} dst = ${op.const_expr}; + ## Splat the value to all components. This way expressions which + ## write the same value to all components don't need to explicitly + ## write to dest. One such example is fnoise which has a + ## const_expr of 0.0f. + dst.x = dst.y = dst.z = dst.w = ${op.const_expr}; % endif - ## Store the current component of the actual destination to the - ## value of dst. - % if op.output_type == "bool": - ## Sanitize the C value to a proper NIR bool - _dst_val.u[_i] = dst ? NIR_TRUE : NIR_FALSE; - % else: - _dst_val.${op.output_type[:1]}[_i] = dst; - % endif - } - % else: - ## In the non-per-component case, create a struct dst with - ## appropriately-typed elements x, y, z, and w and assign the result - ## of the const_expr to all components of dst, or include the - ## const_expr directly if it writes to dst already. - struct ${op.output_type}_vec dst; - - % if "dst" in op.const_expr: - ${op.const_expr} - % else: - ## Splat the value to all components. 
This way expressions which - ## write the same value to all components don't need to explicitly - ## write to dest. One such example is fnoise which has a - ## const_expr of 0.0f. - dst.x = dst.y = dst.z = dst.w = ${op.const_expr}; + ## For each component in the destination, copy the value of dst to + ## the actual destination. + % for k in range(op.output_size): + % if output_type == "bool32": + ## Sanitize the C value to a proper NIR bool + _dst_val.u32[${k}] = dst.${"xyzw"[k]} ? NIR_TRUE : NIR_FALSE; + % else: + _dst_val.${get_const_field(output_type)}[${k}] = dst.${"xyzw"[k]}; + % endif + % endfor % endif - ## For each component in the destination, copy the value of dst to - ## the actual destination. - % for k in range(op.output_size): - % if op.output_type == "bool": - ## Sanitize the C value to a proper NIR bool - _dst_val.u[${k}] = dst.${"xyzw"[k]} ? NIR_TRUE : NIR_FALSE; - % else: - _dst_val.${op.output_type[:1]}[${k}] = dst.${"xyzw"[k]}; - % endif - % endfor - % endif + break; + } + % endfor + + default: + unreachable("unknown bit width"); + } return _dst_val; } @@ -316,12 +385,12 @@ evaluate_${name}(unsigned num_components, nir_const_value *_src) nir_const_value nir_eval_const_opcode(nir_op op, unsigned num_components, - nir_const_value *src) + unsigned bit_width, nir_const_value *src) { switch (op) { % for name in sorted(opcodes.iterkeys()): case nir_op_${name}: { - return evaluate_${name}(num_components, src); + return evaluate_${name}(num_components, bit_width, src); break; } % endfor @@ -333,4 +402,7 @@ nir_eval_const_opcode(nir_op op, unsigned num_components, from nir_opcodes import opcodes from mako.template import Template -print Template(template).render(opcodes=opcodes) +print Template(template).render(opcodes=opcodes, type_sizes=type_sizes, + type_has_size=type_has_size, + type_add_size=type_add_size, + get_const_field=get_const_field) diff --git a/src/compiler/nir/nir_from_ssa.c b/src/compiler/nir/nir_from_ssa.c index 8bc9f24..82317c2 100644 --- a/src/compiler/nir/nir_from_ssa.c +++ b/src/compiler/nir/nir_from_ssa.c @@ -342,7 +342,8 @@ isolate_phi_nodes_block(nir_block *block, void *void_state) nir_parallel_copy_entry *entry = rzalloc(state->dead_ctx, nir_parallel_copy_entry); nir_ssa_dest_init(&pcopy->instr, &entry->dest, - phi->dest.ssa.num_components, src->src.ssa->name); + phi->dest.ssa.num_components, + phi->dest.ssa.bit_size, src->src.ssa->name); exec_list_push_tail(&pcopy->entries, &entry->node); assert(src->src.is_ssa); @@ -355,7 +356,8 @@ isolate_phi_nodes_block(nir_block *block, void *void_state) nir_parallel_copy_entry *entry = rzalloc(state->dead_ctx, nir_parallel_copy_entry); nir_ssa_dest_init(&block_pcopy->instr, &entry->dest, - phi->dest.ssa.num_components, phi->dest.ssa.name); + phi->dest.ssa.num_components, phi->dest.ssa.bit_size, + phi->dest.ssa.name); exec_list_push_tail(&block_pcopy->entries, &entry->node); nir_ssa_def_rewrite_uses(&phi->dest.ssa, diff --git a/src/compiler/nir/nir_gs_count_vertices.c b/src/compiler/nir/nir_gs_count_vertices.c index db15d16..3c1bd2a 100644 --- a/src/compiler/nir/nir_gs_count_vertices.c +++ b/src/compiler/nir/nir_gs_count_vertices.c @@ -77,13 +77,13 @@ nir_gs_count_vertices(const nir_shader *shader) return -1; if (count == -1) - count = val->i[0]; + count = val->i32[0]; /* We've found contradictory set_vertex_count intrinsics. * This can happen if there are early-returns in main() and * different paths emit different numbers of vertices. 
*/ - if (count != val->i[0]) + if (count != val->i32[0]) return -1; } } diff --git a/src/compiler/nir/nir_instr_set.c b/src/compiler/nir/nir_instr_set.c index 159ded0..e244122 100644 --- a/src/compiler/nir/nir_instr_set.c +++ b/src/compiler/nir/nir_instr_set.c @@ -52,6 +52,7 @@ hash_alu(uint32_t hash, const nir_alu_instr *instr) { hash = HASH(hash, instr->op); hash = HASH(hash, instr->dest.dest.ssa.num_components); + /* We explicitly don't hash instr->exact. */ if (nir_op_infos[instr->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) { assert(nir_op_infos[instr->op].num_inputs == 2); @@ -81,9 +82,9 @@ hash_load_const(uint32_t hash, const nir_load_const_instr *instr) { hash = HASH(hash, instr->def.num_components); - hash = _mesa_fnv32_1a_accumulate_block(hash, instr->value.f, + hash = _mesa_fnv32_1a_accumulate_block(hash, instr->value.f32, instr->def.num_components - * sizeof(instr->value.f[0])); + * sizeof(instr->value.f32[0])); return hash; } @@ -267,6 +268,8 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2) if (alu1->dest.dest.ssa.num_components != alu2->dest.dest.ssa.num_components) return false; + /* We explicitly don't compare alu1->exact and alu2->exact. */ + if (nir_op_infos[alu1->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) { assert(nir_op_infos[alu1->op].num_inputs == 2); return (nir_alu_srcs_equal(alu1, alu2, 0, 0) && @@ -322,8 +325,8 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2) if (load1->def.num_components != load2->def.num_components) return false; - return memcmp(load1->value.f, load2->value.f, - load1->def.num_components * sizeof(*load2->value.f)) == 0; + return memcmp(load1->value.f32, load2->value.f32, + load1->def.num_components * sizeof(*load2->value.f32)) == 0; } case nir_instr_type_phi: { nir_phi_instr *phi1 = nir_instr_as_phi(instr1); @@ -496,8 +499,17 @@ nir_instr_set_add_or_rewrite(struct set *instr_set, nir_instr *instr) struct set_entry *entry = _mesa_set_search(instr_set, instr); if (entry) { nir_ssa_def *def = nir_instr_get_dest_ssa_def(instr); - nir_ssa_def *new_def = - nir_instr_get_dest_ssa_def((nir_instr *) entry->key); + nir_instr *match = (nir_instr *) entry->key; + nir_ssa_def *new_def = nir_instr_get_dest_ssa_def(match); + + /* It's safe to replace an exact instruction with an inexact one as + * long as we make it exact. If we got here, the two instructions are + * exactly identical in every other way so, once we've set the exact + * bit, they are the same. 
+ */ + if (instr->type == nir_instr_type_alu && nir_instr_as_alu(instr)->exact) + nir_instr_as_alu(match)->exact = true; + nir_ssa_def_rewrite_uses(def, nir_src_for_ssa(new_def)); return true; } diff --git a/src/compiler/nir/nir_lower_alu_to_scalar.c b/src/compiler/nir/nir_lower_alu_to_scalar.c index 312d2f9..e8ba640 100644 --- a/src/compiler/nir/nir_lower_alu_to_scalar.c +++ b/src/compiler/nir/nir_lower_alu_to_scalar.c @@ -31,9 +31,11 @@ */ static void -nir_alu_ssa_dest_init(nir_alu_instr *instr, unsigned num_components) +nir_alu_ssa_dest_init(nir_alu_instr *instr, unsigned num_components, + unsigned bit_size) { - nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, NULL); + nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, + bit_size, NULL); instr->dest.write_mask = (1 << num_components) - 1; } @@ -46,7 +48,7 @@ lower_reduction(nir_alu_instr *instr, nir_op chan_op, nir_op merge_op, nir_ssa_def *last = NULL; for (unsigned i = 0; i < num_components; i++) { nir_alu_instr *chan = nir_alu_instr_create(builder->shader, chan_op); - nir_alu_ssa_dest_init(chan, 1); + nir_alu_ssa_dest_init(chan, 1, instr->dest.dest.ssa.bit_size); nir_alu_src_copy(&chan->src[0], &instr->src[0], chan); chan->src[0].swizzle[0] = chan->src[0].swizzle[i]; if (nir_op_infos[chan_op].num_inputs > 1) { @@ -80,6 +82,7 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b) assert(instr->dest.write_mask != 0); b->cursor = nir_before_instr(&instr->instr); + b->exact = instr->exact; #define LOWER_REDUCTION(name, chan, merge) \ case name##2: \ @@ -220,7 +223,7 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b) lower->src[i].swizzle[j] = instr->src[i].swizzle[src_chan]; } - nir_alu_ssa_dest_init(lower, 1); + nir_alu_ssa_dest_init(lower, 1, instr->dest.dest.ssa.bit_size); lower->dest.saturate = instr->dest.saturate; comps[chan] = &lower->dest.dest.ssa; diff --git a/src/compiler/nir/nir_lower_atomics.c b/src/compiler/nir/nir_lower_atomics.c index eefcb55..70381a7 100644 --- a/src/compiler/nir/nir_lower_atomics.c +++ b/src/compiler/nir/nir_lower_atomics.c @@ -75,7 +75,7 @@ lower_instr(nir_intrinsic_instr *instr, state->shader_program->UniformStorage[uniform_loc].opaque[state->shader->stage].index); nir_load_const_instr *offset_const = nir_load_const_instr_create(mem_ctx, 1); - offset_const->value.u[0] = instr->variables[0]->var->data.offset; + offset_const->value.u32[0] = instr->variables[0]->var->data.offset; nir_instr_insert_before(&instr->instr, &offset_const->instr); @@ -90,17 +90,17 @@ lower_instr(nir_intrinsic_instr *instr, unsigned child_array_elements = tail->child != NULL ? 
glsl_get_aoa_size(tail->type) : 1; - offset_const->value.u[0] += deref_array->base_offset * + offset_const->value.u32[0] += deref_array->base_offset * child_array_elements * ATOMIC_COUNTER_SIZE; if (deref_array->deref_array_type == nir_deref_array_type_indirect) { nir_load_const_instr *atomic_counter_size = nir_load_const_instr_create(mem_ctx, 1); - atomic_counter_size->value.u[0] = child_array_elements * ATOMIC_COUNTER_SIZE; + atomic_counter_size->value.u32[0] = child_array_elements * ATOMIC_COUNTER_SIZE; nir_instr_insert_before(&instr->instr, &atomic_counter_size->instr); nir_alu_instr *mul = nir_alu_instr_create(mem_ctx, nir_op_imul); - nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, NULL); + nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, 32, NULL); mul->dest.write_mask = 0x1; nir_src_copy(&mul->src[0].src, &deref_array->indirect, mul); mul->src[1].src.is_ssa = true; @@ -108,7 +108,7 @@ lower_instr(nir_intrinsic_instr *instr, nir_instr_insert_before(&instr->instr, &mul->instr); nir_alu_instr *add = nir_alu_instr_create(mem_ctx, nir_op_iadd); - nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, NULL); + nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, 32, NULL); add->dest.write_mask = 0x1; add->src[0].src.is_ssa = true; add->src[0].src.ssa = &mul->dest.dest.ssa; @@ -125,7 +125,7 @@ lower_instr(nir_intrinsic_instr *instr, if (instr->dest.is_ssa) { nir_ssa_dest_init(&new_instr->instr, &new_instr->dest, - instr->dest.ssa.num_components, NULL); + instr->dest.ssa.num_components, 32, NULL); nir_ssa_def_rewrite_uses(&instr->dest.ssa, nir_src_for_ssa(&new_instr->dest.ssa)); } else { diff --git a/src/compiler/nir/nir_lower_clip.c b/src/compiler/nir/nir_lower_clip.c index bcbad53..c711230 100644 --- a/src/compiler/nir/nir_lower_clip.c +++ b/src/compiler/nir/nir_lower_clip.c @@ -88,7 +88,7 @@ load_clipdist_input(nir_builder *b, nir_variable *in, nir_ssa_def **val) load->num_components = 4; nir_intrinsic_set_base(load, in->data.driver_location); load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0)); - nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL); + nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL); nir_builder_instr_insert(b, &load->instr); val[0] = nir_channel(b, &load->dest.ssa, 0); diff --git a/src/compiler/nir/nir_lower_indirect_derefs.c b/src/compiler/nir/nir_lower_indirect_derefs.c index a4affa7..62b8c84 100644 --- a/src/compiler/nir/nir_lower_indirect_derefs.c +++ b/src/compiler/nir/nir_lower_indirect_derefs.c @@ -75,8 +75,9 @@ emit_indirect_load_store(nir_builder *b, nir_intrinsic_instr *orig_instr, if (src == NULL) { /* We're a load. 
We need to insert a phi node */ nir_phi_instr *phi = nir_phi_instr_create(b->shader); + unsigned bit_size = then_dest->bit_size; nir_ssa_dest_init(&phi->instr, &phi->dest, - then_dest->num_components, NULL); + then_dest->num_components, bit_size, NULL); nir_phi_src *src0 = ralloc(phi, nir_phi_src); src0->pred = nir_cf_node_as_block(nir_if_last_then_node(if_stmt)); @@ -125,8 +126,9 @@ emit_load_store(nir_builder *b, nir_intrinsic_instr *orig_instr, load->num_components = orig_instr->num_components; load->variables[0] = nir_deref_as_var(nir_copy_deref(load, &deref->deref)); + unsigned bit_size = orig_instr->dest.ssa.bit_size; nir_ssa_dest_init(&load->instr, &load->dest, - load->num_components, NULL); + load->num_components, bit_size, NULL); nir_builder_instr_insert(b, &load->instr); *dest = &load->dest.ssa; } else { diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c index 9d502ee..a30061d 100644 --- a/src/compiler/nir/nir_lower_io.c +++ b/src/compiler/nir/nir_lower_io.c @@ -289,7 +289,8 @@ nir_lower_io_block(nir_block *block, void *void_state) if (intrin->dest.is_ssa) { nir_ssa_dest_init(&load->instr, &load->dest, - intrin->num_components, NULL); + intrin->num_components, + intrin->dest.ssa.bit_size, NULL); nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(&load->dest.ssa)); } else { @@ -369,7 +370,8 @@ nir_lower_io_block(nir_block *block, void *void_state) if (intrin->dest.is_ssa) { nir_ssa_dest_init(&atomic->instr, &atomic->dest, - intrin->dest.ssa.num_components, NULL); + intrin->dest.ssa.num_components, + intrin->dest.ssa.bit_size, NULL); nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(&atomic->dest.ssa)); } else { diff --git a/src/compiler/nir/nir_lower_load_const_to_scalar.c b/src/compiler/nir/nir_lower_load_const_to_scalar.c index 1eeed13..b5df464 100644 --- a/src/compiler/nir/nir_lower_load_const_to_scalar.c +++ b/src/compiler/nir/nir_lower_load_const_to_scalar.c @@ -49,7 +49,7 @@ lower_load_const_instr_scalar(nir_load_const_instr *lower) nir_ssa_def *loads[4]; for (unsigned i = 0; i < lower->def.num_components; i++) { nir_load_const_instr *load_comp = nir_load_const_instr_create(b.shader, 1); - load_comp->value.u[0] = lower->value.u[i]; + load_comp->value.u32[0] = lower->value.u32[i]; nir_builder_instr_insert(&b, &load_comp->instr); loads[i] = &load_comp->def; } diff --git a/src/compiler/nir/nir_lower_locals_to_regs.c b/src/compiler/nir/nir_lower_locals_to_regs.c index 45036fa..0438802 100644 --- a/src/compiler/nir/nir_lower_locals_to_regs.c +++ b/src/compiler/nir/nir_lower_locals_to_regs.c @@ -161,7 +161,7 @@ get_deref_reg_src(nir_deref_var *deref, nir_instr *instr, if (src.reg.indirect) { nir_load_const_instr *load_const = nir_load_const_instr_create(state->shader, 1); - load_const->value.u[0] = glsl_get_length(parent_type); + load_const->value.u32[0] = glsl_get_length(parent_type); nir_instr_insert_before(instr, &load_const->instr); nir_alu_instr *mul = nir_alu_instr_create(state->shader, nir_op_imul); @@ -169,7 +169,7 @@ get_deref_reg_src(nir_deref_var *deref, nir_instr *instr, mul->src[1].src.is_ssa = true; mul->src[1].src.ssa = &load_const->def; mul->dest.write_mask = 1; - nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, NULL); + nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, 32, NULL); nir_instr_insert_before(instr, &mul->instr); src.reg.indirect->is_ssa = true; @@ -187,7 +187,7 @@ get_deref_reg_src(nir_deref_var *deref, nir_instr *instr, add->src[0].src = *src.reg.indirect; nir_src_copy(&add->src[1].src, 
&deref_array->indirect, add); add->dest.write_mask = 1; - nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, NULL); + nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, 32, NULL); nir_instr_insert_before(instr, &add->instr); src.reg.indirect->is_ssa = true; @@ -221,7 +221,8 @@ lower_locals_to_regs_block(nir_block *block, void *void_state) mov->dest.write_mask = (1 << intrin->num_components) - 1; if (intrin->dest.is_ssa) { nir_ssa_dest_init(&mov->instr, &mov->dest.dest, - intrin->num_components, NULL); + intrin->num_components, + intrin->dest.ssa.bit_size, NULL); nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(&mov->dest.dest.ssa)); } else { diff --git a/src/compiler/nir/nir_lower_phis_to_scalar.c b/src/compiler/nir/nir_lower_phis_to_scalar.c index dd2abcf..026c866 100644 --- a/src/compiler/nir/nir_lower_phis_to_scalar.c +++ b/src/compiler/nir/nir_lower_phis_to_scalar.c @@ -188,6 +188,8 @@ lower_phis_to_scalar_block(nir_block *block, void *void_state) if (!should_lower_phi(phi, state)) continue; + unsigned bit_size = phi->dest.ssa.bit_size; + /* Create a vecN operation to combine the results. Most of these * will be redundant, but copy propagation should clean them up for * us. No need to add the complexity here. @@ -202,12 +204,14 @@ lower_phis_to_scalar_block(nir_block *block, void *void_state) nir_alu_instr *vec = nir_alu_instr_create(state->mem_ctx, vec_op); nir_ssa_dest_init(&vec->instr, &vec->dest.dest, - phi->dest.ssa.num_components, NULL); + phi->dest.ssa.num_components, + bit_size, NULL); vec->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1; for (unsigned i = 0; i < phi->dest.ssa.num_components; i++) { nir_phi_instr *new_phi = nir_phi_instr_create(state->mem_ctx); - nir_ssa_dest_init(&new_phi->instr, &new_phi->dest, 1, NULL); + nir_ssa_dest_init(&new_phi->instr, &new_phi->dest, 1, + phi->dest.ssa.bit_size, NULL); vec->src[i].src = nir_src_for_ssa(&new_phi->dest.ssa); @@ -215,7 +219,7 @@ lower_phis_to_scalar_block(nir_block *block, void *void_state) /* We need to insert a mov to grab the i'th component of src */ nir_alu_instr *mov = nir_alu_instr_create(state->mem_ctx, nir_op_imov); - nir_ssa_dest_init(&mov->instr, &mov->dest.dest, 1, NULL); + nir_ssa_dest_init(&mov->instr, &mov->dest.dest, 1, bit_size, NULL); mov->dest.write_mask = 1; nir_src_copy(&mov->src[0].src, &src->src, state->mem_ctx); mov->src[0].swizzle[0] = i; diff --git a/src/compiler/nir/nir_lower_system_values.c b/src/compiler/nir/nir_lower_system_values.c index 79f6bed..c1cd139 100644 --- a/src/compiler/nir/nir_lower_system_values.c +++ b/src/compiler/nir/nir_lower_system_values.c @@ -65,9 +65,9 @@ convert_block(nir_block *block, void *void_state) */ nir_const_value local_size; - local_size.u[0] = b->shader->info.cs.local_size[0]; - local_size.u[1] = b->shader->info.cs.local_size[1]; - local_size.u[2] = b->shader->info.cs.local_size[2]; + local_size.u32[0] = b->shader->info.cs.local_size[0]; + local_size.u32[1] = b->shader->info.cs.local_size[1]; + local_size.u32[2] = b->shader->info.cs.local_size[2]; nir_ssa_def *group_id = nir_load_system_value(b, nir_intrinsic_load_work_group_id, 0); diff --git a/src/compiler/nir/nir_lower_tex.c b/src/compiler/nir/nir_lower_tex.c index 806acd8..4999603 100644 --- a/src/compiler/nir/nir_lower_tex.c +++ b/src/compiler/nir/nir_lower_tex.c @@ -140,7 +140,7 @@ get_texture_size(nir_builder *b, nir_tex_instr *tex) txs->src[0].src = nir_src_for_ssa(nir_imm_int(b, 0)); txs->src[0].src_type = nir_tex_src_lod; - nir_ssa_dest_init(&txs->instr, &txs->dest, 2, NULL); + 
nir_ssa_dest_init(&txs->instr, &txs->dest, 2, 32, NULL); nir_builder_instr_insert(b, &txs->instr); return nir_i2f(b, &txs->dest.ssa); @@ -223,13 +223,13 @@ get_zero_or_one(nir_builder *b, nir_alu_type type, uint8_t swizzle_val) memset(&v, 0, sizeof(v)); if (swizzle_val == 4) { - v.u[0] = v.u[1] = v.u[2] = v.u[3] = 0; + v.u32[0] = v.u32[1] = v.u32[2] = v.u32[3] = 0; } else { assert(swizzle_val == 5); if (type == nir_type_float) - v.f[0] = v.f[1] = v.f[2] = v.f[3] = 1.0; + v.f32[0] = v.f32[1] = v.f32[2] = v.f32[3] = 1.0; else - v.u[0] = v.u[1] = v.u[2] = v.u[3] = 1; + v.u32[0] = v.u32[1] = v.u32[2] = v.u32[3] = 1; } return nir_build_imm(b, 4, v); diff --git a/src/compiler/nir/nir_lower_two_sided_color.c b/src/compiler/nir/nir_lower_two_sided_color.c index fe3507c..c7fb67e 100644 --- a/src/compiler/nir/nir_lower_two_sided_color.c +++ b/src/compiler/nir/nir_lower_two_sided_color.c @@ -74,7 +74,7 @@ load_input(nir_builder *b, nir_variable *in) load->num_components = 4; nir_intrinsic_set_base(load, in->data.driver_location); load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0)); - nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL); + nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL); nir_builder_instr_insert(b, &load->instr); return &load->dest.ssa; diff --git a/src/compiler/nir/nir_lower_var_copies.c b/src/compiler/nir/nir_lower_var_copies.c index 7db9839..c994f0f 100644 --- a/src/compiler/nir/nir_lower_var_copies.c +++ b/src/compiler/nir/nir_lower_var_copies.c @@ -116,12 +116,15 @@ emit_copy_load_store(nir_intrinsic_instr *copy_instr, assert(src_tail->type == dest_tail->type); unsigned num_components = glsl_get_vector_elements(src_tail->type); + unsigned bit_size = + glsl_get_bit_size(glsl_get_base_type(src_tail->type)); nir_intrinsic_instr *load = nir_intrinsic_instr_create(mem_ctx, nir_intrinsic_load_var); load->num_components = num_components; load->variables[0] = nir_deref_as_var(nir_copy_deref(load, &src_head->deref)); - nir_ssa_dest_init(&load->instr, &load->dest, num_components, NULL); + nir_ssa_dest_init(&load->instr, &load->dest, num_components, bit_size, + NULL); nir_instr_insert_before(©_instr->instr, &load->instr); diff --git a/src/compiler/nir/nir_lower_vars_to_ssa.c b/src/compiler/nir/nir_lower_vars_to_ssa.c index a3f3fcf..9f9e454 100644 --- a/src/compiler/nir/nir_lower_vars_to_ssa.c +++ b/src/compiler/nir/nir_lower_vars_to_ssa.c @@ -505,6 +505,7 @@ rename_variables_block(nir_block *block, struct lower_variables_state *state) nir_ssa_undef_instr *undef = nir_ssa_undef_instr_create(state->shader, intrin->num_components); + undef->def.bit_size = intrin->dest.ssa.bit_size; nir_instr_insert_before(&intrin->instr, &undef->instr); nir_instr_remove(&intrin->instr); @@ -528,7 +529,8 @@ rename_variables_block(nir_block *block, struct lower_variables_state *state) mov->dest.write_mask = (1 << intrin->num_components) - 1; nir_ssa_dest_init(&mov->instr, &mov->dest.dest, - intrin->num_components, NULL); + intrin->num_components, + intrin->dest.ssa.bit_size, NULL); nir_instr_insert_before(&intrin->instr, &mov->instr); nir_instr_remove(&intrin->instr); @@ -719,6 +721,7 @@ nir_lower_vars_to_ssa_impl(nir_function_impl *impl) node->pb_value = nir_phi_builder_add_value(state.phi_builder, glsl_get_vector_elements(node->type), + glsl_get_bit_size(glsl_get_base_type(node->type)), store_blocks); if (node->deref->var->constant_initializer) { diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py index 60ade4a..d6b658d 100644 --- a/src/compiler/nir/nir_opcodes.py +++ 
b/src/compiler/nir/nir_opcodes.py @@ -90,8 +90,12 @@ class Opcode(object): # helper variables for strings tfloat = "float" tint = "int" -tbool = "bool" +tbool = "bool32" tuint = "uint" +tfloat32 = "float32" +tint32 = "int32" +tuint32 = "uint32" +tfloat64 = "float64" commutative = "commutative " associative = "associative " @@ -155,57 +159,57 @@ unop("frsq", tfloat, "1.0f / sqrtf(src0)") unop("fsqrt", tfloat, "sqrtf(src0)") unop("fexp2", tfloat, "exp2f(src0)") unop("flog2", tfloat, "log2f(src0)") -unop_convert("f2i", tint, tfloat, "src0") # Float-to-integer conversion. -unop_convert("f2u", tuint, tfloat, "src0") # Float-to-unsigned conversion -unop_convert("i2f", tfloat, tint, "src0") # Integer-to-float conversion. +unop_convert("f2i", tint32, tfloat32, "src0") # Float-to-integer conversion. +unop_convert("f2u", tuint32, tfloat32, "src0") # Float-to-unsigned conversion +unop_convert("i2f", tfloat32, tint32, "src0") # Integer-to-float conversion. # Float-to-boolean conversion -unop_convert("f2b", tbool, tfloat, "src0 != 0.0f") +unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f") # Boolean-to-float conversion -unop_convert("b2f", tfloat, tbool, "src0 ? 1.0f : 0.0f") +unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f") # Int-to-boolean conversion -unop_convert("i2b", tbool, tint, "src0 != 0") -unop_convert("b2i", tint, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion -unop_convert("u2f", tfloat, tuint, "src0") # Unsigned-to-float conversion. +unop_convert("i2b", tbool, tint32, "src0 != 0") +unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion +unop_convert("u2f", tfloat32, tuint32, "src0") # Unsigned-to-float conversion. # Unary floating-point rounding operations. -unop("ftrunc", tfloat, "truncf(src0)") -unop("fceil", tfloat, "ceilf(src0)") -unop("ffloor", tfloat, "floorf(src0)") -unop("ffract", tfloat, "src0 - floorf(src0)") -unop("fround_even", tfloat, "_mesa_roundevenf(src0)") +unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)") +unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)") +unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)") +unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))") +unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)") unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))") # Trigonometric operations. -unop("fsin", tfloat, "sinf(src0)") -unop("fcos", tfloat, "cosf(src0)") +unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)") +unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)") # Partial derivatives. -unop("fddx", tfloat, "0.0f") # the derivative of a constant is 0. -unop("fddy", tfloat, "0.0f") -unop("fddx_fine", tfloat, "0.0f") -unop("fddy_fine", tfloat, "0.0f") -unop("fddx_coarse", tfloat, "0.0f") -unop("fddy_coarse", tfloat, "0.0f") +unop("fddx", tfloat, "0.0") # the derivative of a constant is 0. +unop("fddy", tfloat, "0.0") +unop("fddx_fine", tfloat, "0.0") +unop("fddy_fine", tfloat, "0.0") +unop("fddx_coarse", tfloat, "0.0") +unop("fddy_coarse", tfloat, "0.0") # Floating point pack and unpack operations. 
def pack_2x16(fmt): - unop_horiz("pack_" + fmt + "_2x16", 1, tuint, 2, tfloat, """ + unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """ dst.x = (uint32_t) pack_fmt_1x16(src0.x); dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16; """.replace("fmt", fmt)) def pack_4x8(fmt): - unop_horiz("pack_" + fmt + "_4x8", 1, tuint, 4, tfloat, """ + unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """ dst.x = (uint32_t) pack_fmt_1x8(src0.x); dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8; dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16; @@ -213,13 +217,13 @@ dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24; """.replace("fmt", fmt)) def unpack_2x16(fmt): - unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat, 1, tuint, """ + unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """ dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff)); dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16)); """.replace("fmt", fmt)) def unpack_4x8(fmt): - unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat, 1, tuint, """ + unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """ dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff)); dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff)); dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff)); @@ -238,11 +242,11 @@ unpack_2x16("unorm") unpack_4x8("unorm") unpack_2x16("half") -unop_horiz("pack_uvec2_to_uint", 1, tuint, 2, tuint, """ +unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """ dst.x = (src0.x & 0xffff) | (src0.y >> 16); """) -unop_horiz("pack_uvec4_to_uint", 1, tuint, 4, tuint, """ +unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """ dst.x = (src0.x << 0) | (src0.y << 8) | (src0.z << 16) | @@ -252,22 +256,22 @@ dst.x = (src0.x << 0) | # Lowered floating point unpacking operations. -unop_horiz("unpack_half_2x16_split_x", 1, tfloat, 1, tuint, +unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32, "unpack_half_1x16((uint16_t)(src0.x & 0xffff))") -unop_horiz("unpack_half_2x16_split_y", 1, tfloat, 1, tuint, +unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32, "unpack_half_1x16((uint16_t)(src0.x >> 16))") # Bit operations, part of ARB_gpu_shader5. -unop("bitfield_reverse", tuint, """ +unop("bitfield_reverse", tuint32, """ /* we're not winning any awards for speed here, but that's ok */ dst = 0; for (unsigned bit = 0; bit < 32; bit++) dst |= ((src0 >> bit) & 1) << (31 - bit); """) -unop("bit_count", tuint, """ +unop("bit_count", tuint32, """ dst = 0; for (unsigned bit = 0; bit < 32; bit++) { if ((src0 >> bit) & 1) @@ -275,7 +279,7 @@ for (unsigned bit = 0; bit < 32; bit++) { } """) -unop_convert("ufind_msb", tint, tuint, """ +unop_convert("ufind_msb", tint32, tuint32, """ dst = -1; for (int bit = 31; bit > 0; bit--) { if ((src0 >> bit) & 1) { @@ -285,7 +289,7 @@ for (int bit = 31; bit > 0; bit--) { } """) -unop("ifind_msb", tint, """ +unop("ifind_msb", tint32, """ dst = -1; for (int bit = 31; bit >= 0; bit--) { /* If src0 < 0, we're looking for the first 0 bit. 
@@ -299,7 +303,7 @@ for (int bit = 31; bit >= 0; bit--) { } """) -unop("find_lsb", tint, """ +unop("find_lsb", tint32, """ dst = -1; for (unsigned bit = 0; bit < 32; bit++) { if ((src0 >> bit) & 1) { @@ -359,10 +363,10 @@ binop("fmul", tfloat, commutative + associative, "src0 * src1") # low 32-bits of signed/unsigned integer multiply binop("imul", tint, commutative + associative, "src0 * src1") # high 32-bits of signed integer multiply -binop("imul_high", tint, commutative, +binop("imul_high", tint32, commutative, "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)") # high 32-bits of unsigned integer multiply -binop("umul_high", tuint, commutative, +binop("umul_high", tuint32, commutative, "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)") binop("fdiv", tfloat, "", "src0 / src1") @@ -427,18 +431,18 @@ binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}", # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0 -binop_reduce("fall_equal", 1, tfloat, tfloat, "{src0} == {src1}", +binop_reduce("fall_equal", 1, tfloat32, tfloat32, "{src0} == {src1}", "{src0} && {src1}", "{src} ? 1.0f : 0.0f") -binop_reduce("fany_nequal", 1, tfloat, tfloat, "{src0} != {src1}", +binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}", "{src0} || {src1}", "{src} ? 1.0f : 0.0f") # These comparisons for integer-less hardware return 1.0 and 0.0 for true # and false respectively -binop("slt", tfloat, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than -binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal -binop("seq", tfloat, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal -binop("sne", tfloat, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal +binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than +binop("sge", tfloat32, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal +binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal +binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal binop("ishl", tint, "", "src0 << src1") @@ -461,11 +465,11 @@ binop("ixor", tuint, commutative + associative, "src0 ^ src1") # These use (src != 0.0) for testing the truth of the input, and output 1.0 # for true and 0.0 for false -binop("fand", tfloat, commutative, +binop("fand", tfloat32, commutative, "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f") -binop("for", tfloat, commutative, +binop("for", tfloat32, commutative, "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f") -binop("fxor", tfloat, commutative, +binop("fxor", tfloat32, commutative, "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f") binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}", @@ -487,7 +491,7 @@ binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0") binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0") # Saturated vector add for 4 8bit ints. -binop("usadd_4x8", tint, commutative + associative, """ +binop("usadd_4x8", tint32, commutative + associative, """ dst = 0; for (int i = 0; i < 32; i += 8) { dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i; @@ -495,7 +499,7 @@ for (int i = 0; i < 32; i += 8) { """) # Saturated vector subtract for 4 8bit ints. 
-binop("ussub_4x8", tint, "", """ +binop("ussub_4x8", tint32, "", """ dst = 0; for (int i = 0; i < 32; i += 8) { int src0_chan = (src0 >> i) & 0xff; @@ -506,7 +510,7 @@ for (int i = 0; i < 32; i += 8) { """) # vector min for 4 8bit ints. -binop("umin_4x8", tint, commutative + associative, """ +binop("umin_4x8", tint32, commutative + associative, """ dst = 0; for (int i = 0; i < 32; i += 8) { dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i; @@ -514,7 +518,7 @@ for (int i = 0; i < 32; i += 8) { """) # vector max for 4 8bit ints. -binop("umax_4x8", tint, commutative + associative, """ +binop("umax_4x8", tint32, commutative + associative, """ dst = 0; for (int i = 0; i < 32; i += 8) { dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i; @@ -522,7 +526,7 @@ for (int i = 0; i < 32; i += 8) { """) # unorm multiply: (a * b) / 255. -binop("umul_unorm_4x8", tint, commutative + associative, """ +binop("umul_unorm_4x8", tint32, commutative + associative, """ dst = 0; for (int i = 0; i < 32; i += 8) { int src0_chan = (src0 >> i) & 0xff; @@ -531,15 +535,15 @@ for (int i = 0; i < 32; i += 8) { } """) -binop("fpow", tfloat, "", "powf(src0, src1)") +binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)") -binop_horiz("pack_half_2x16_split", 1, tuint, 1, tfloat, 1, tfloat, +binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32, "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)") # bfm implements the behavior of the first operation of the SM5 "bfi" assembly # and that of the "bfi1" i965 instruction. That is, it has undefined behavior # if either of its arguments are 32. -binop_convert("bfm", tuint, tint, "", """ +binop_convert("bfm", tuint32, tint32, "", """ int bits = src0, offset = src1; if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32) dst = 0; /* undefined */ @@ -548,7 +552,7 @@ else """) opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint], "", """ -dst = ldexpf(src0, src1); +dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1); /* flush denormals to zero. */ if (!isnormal(dst)) dst = copysignf(0.0f, src0); @@ -588,12 +592,12 @@ triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2") # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0). -triop("fcsel", tfloat, "(src0 != 0.0f) ? src1 : src2") +triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2") opcode("bcsel", 0, tuint, [0, 0, 0], [tbool, tuint, tuint], "", "src0 ? 
src1 : src2") # SM5 bfi assembly -triop("bfi", tuint, """ +triop("bfi", tuint32, """ unsigned mask = src0, insert = src1, base = src2; if (mask == 0) { dst = base; @@ -608,8 +612,8 @@ if (mask == 0) { """) # SM5 ubfe/ibfe assembly -opcode("ubfe", 0, tuint, - [0, 0, 0], [tuint, tint, tint], "", """ +opcode("ubfe", 0, tuint32, + [0, 0, 0], [tuint32, tint32, tint32], "", """ unsigned base = src0; int offset = src1, bits = src2; if (bits == 0) { @@ -622,8 +626,8 @@ if (bits == 0) { dst = base >> offset; } """) -opcode("ibfe", 0, tint, - [0, 0, 0], [tint, tint, tint], "", """ +opcode("ibfe", 0, tint32, + [0, 0, 0], [tint32, tint32, tint32], "", """ int base = src0; int offset = src1, bits = src2; if (bits == 0) { @@ -638,8 +642,8 @@ if (bits == 0) { """) # GLSL bitfieldExtract() -opcode("ubitfield_extract", 0, tuint, - [0, 0, 0], [tuint, tint, tint], "", """ +opcode("ubitfield_extract", 0, tuint32, + [0, 0, 0], [tuint32, tint32, tint32], "", """ unsigned base = src0; int offset = src1, bits = src2; if (bits == 0) { @@ -650,8 +654,8 @@ if (bits == 0) { dst = (base >> offset) & ((1ull << bits) - 1); } """) -opcode("ibitfield_extract", 0, tint, - [0, 0, 0], [tint, tint, tint], "", """ +opcode("ibitfield_extract", 0, tint32, + [0, 0, 0], [tint32, tint32, tint32], "", """ int base = src0; int offset = src1, bits = src2; if (bits == 0) { @@ -678,8 +682,8 @@ def quadop_horiz(name, output_size, src1_size, src2_size, src3_size, [tuint, tuint, tuint, tuint], "", const_expr) -opcode("bitfield_insert", 0, tuint, [0, 0, 0, 0], - [tuint, tuint, tint, tint], "", """ +opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0], + [tuint32, tuint32, tint32, tint32], "", """ unsigned base = src0, insert = src1; int offset = src2, bits = src3; if (bits == 0) { diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 54f7d86..ed21c5d 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -35,10 +35,17 @@ d = 'd' # Written in the form (<search>, <replace>) where <search> is an expression # and <replace> is either an expression or a value. An expression is -# defined as a tuple of the form (<op>, <src0>, <src1>, <src2>, <src3>) +# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>) # where each source is either an expression or a value. A value can be # either a numeric constant or a string representing a variable name. # +# If the opcode in a search expression is prefixed by a '~' character, this +# indicates that the operation is inexact. Such operations will only get +# applied to SSA values that do not have the exact bit set. This should be +# used by by any optimizations that are not bit-for-bit exact. It should not, +# however, be used for backend-requested lowering operations as those need to +# happen regardless of precision. 
+# # Variable names are specified as "[#]name[@type]" where "#" indicates that # the given variable will only match constants and the type indicates that # the given variable will only match values from ALU instructions with the # given output type. @@ -55,19 +62,19 @@ optimizations = [ (('fabs', ('fneg', a)), ('fabs', a)), (('iabs', ('iabs', a)), ('iabs', a)), (('iabs', ('ineg', a)), ('iabs', a)), - (('fadd', a, 0.0), a), + (('~fadd', a, 0.0), a), (('iadd', a, 0), a), (('usadd_4x8', a, 0), a), (('usadd_4x8', a, ~0), ~0), - (('fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))), + (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))), (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))), - (('fadd', ('fneg', a), a), 0.0), + (('~fadd', ('fneg', a), a), 0.0), (('iadd', ('ineg', a), a), 0), (('iadd', ('ineg', a), ('iadd', a, b)), b), (('iadd', a, ('iadd', ('ineg', a), b)), b), - (('fadd', ('fneg', a), ('fadd', a, b)), b), - (('fadd', a, ('fadd', ('fneg', a), b)), b), - (('fmul', a, 0.0), 0.0), + (('~fadd', ('fneg', a), ('fadd', a, b)), b), + (('~fadd', a, ('fadd', ('fneg', a), b)), b), + (('~fmul', a, 0.0), 0.0), (('imul', a, 0), 0), (('umul_unorm_4x8', a, 0), 0), (('umul_unorm_4x8', a, ~0), a), @@ -76,32 +83,48 @@ optimizations = [ (('fmul', a, -1.0), ('fneg', a)), (('imul', a, -1), ('ineg', a)), (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'), - (('ffma', 0.0, a, b), b), - (('ffma', a, 0.0, b), b), - (('ffma', a, b, 0.0), ('fmul', a, b)), + (('~ffma', 0.0, a, b), b), + (('~ffma', a, 0.0, b), b), + (('~ffma', a, b, 0.0), ('fmul', a, b)), (('ffma', a, 1.0, b), ('fadd', a, b)), (('ffma', 1.0, a, b), ('fadd', a, b)), - (('flrp', a, b, 0.0), a), - (('flrp', a, b, 1.0), b), - (('flrp', a, a, b), a), - (('flrp', 0.0, a, b), ('fmul', a, b)), + (('~flrp', a, b, 0.0), a), + (('~flrp', a, b, 1.0), b), + (('~flrp', a, a, b), a), + (('~flrp', 0.0, a, b), ('fmul', a, b)), + (('~flrp', a, b, ('b2f', c)), ('bcsel', c, b, a), 'options->lower_flrp'), (('flrp', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp'), (('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'), - (('fadd', ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp'), - (('fadd', a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp'), + (('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', c)))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp'), + (('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg', c ))), ('fmul', b, c )), ('flrp', a, b, c), '!options->lower_flrp'), + (('~fadd', a, ('fmul', ('b2f', c), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp'), + (('~fadd', a, ('fmul', c , ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp'), (('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'), - (('fadd', ('fmul', a, b), c), ('ffma', a, b, c), '!options->lower_ffma'), + (('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), '!options->lower_ffma'), # Comparison simplifications - (('inot', ('flt', a, b)), ('fge', a, b)), - (('inot', ('fge', a, b)), ('flt', a, b)), - (('inot', ('feq', a, b)), ('fne', a, b)), - (('inot', ('fne', a, b)), ('feq', a, b)), + (('~inot', ('flt', a, b)), ('fge', a, b)), + (('~inot', ('fge', a, b)), ('flt', a, b)), + (('~inot', ('feq', a, b)), ('fne', a, b)), + (('~inot', ('fne', a, b)), ('feq', a, b)), (('inot', ('ilt', a, b)), ('ige', a, b)), (('inot', ('ige', a, b)), ('ilt', a, b)), (('inot', 
('ieq', a, b)), ('ine', a, b)), (('inot', ('ine', a, b)), ('ieq', a, b)), + + # 0.0 >= b2f(a) + # b2f(a) <= 0.0 + # b2f(a) == 0.0 because b2f(a) can only be 0 or 1 + # inot(a) + (('fge', 0.0, ('b2f', a)), ('inot', a)), + + # 0.0 < fabs(a) + # fabs(a) > 0.0 + # fabs(a) != 0.0 because fabs(a) must be >= 0 + # a != 0.0 + (('flt', 0.0, ('fabs', a)), ('fne', a, 0.0)), + (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)), - (('bcsel', ('flt', a, b), a, b), ('fmin', a, b)), + (('bcsel', ('flt', b, a), b, a), ('fmin', a, b)), (('bcsel', ('flt', a, b), b, a), ('fmax', a, b)), (('bcsel', ('inot', 'a@bool'), b, c), ('bcsel', a, c, b)), (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)), @@ -111,15 +134,19 @@ optimizations = [ (('imax', a, a), a), (('umin', a, a), a), (('umax', a, a), a), - (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'), - (('fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'), + (('~fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'), + (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'), (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'), (('fsat', ('fsat', a)), ('fsat', a)), (('fmin', ('fmax', ('fmin', ('fmax', a, 0.0), 1.0), 0.0), 1.0), ('fmin', ('fmax', a, 0.0), 1.0)), - (('ior', ('flt', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))), - (('ior', ('flt', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)), - (('ior', ('fge', a, b), ('fge', a, c)), ('fge', a, ('fmin', b, c))), - (('ior', ('fge', a, c), ('fge', b, c)), ('fge', ('fmax', a, b), c)), + (('~ior', ('flt', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))), + (('~ior', ('flt', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)), + (('~ior', ('fge', a, b), ('fge', a, c)), ('fge', a, ('fmin', b, c))), + (('~ior', ('fge', a, c), ('fge', b, c)), ('fge', ('fmax', a, b), c)), + (('fabs', ('slt', a, b)), ('slt', a, b)), + (('fabs', ('sge', a, b)), ('sge', a, b)), + (('fabs', ('seq', a, b)), ('seq', a, b)), + (('fabs', ('sne', a, b)), ('sne', a, b)), (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'), (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'), (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'), @@ -151,7 +178,6 @@ optimizations = [ (('ior', a, 0), a), (('fxor', a, a), 0.0), (('ixor', a, a), 0), - (('fxor', a, 0.0), a), (('ixor', a, 0), a), (('inot', ('inot', a)), a), # DeMorgan's Laws @@ -167,35 +193,35 @@ optimizations = [ (('iand', 0xff, ('ushr', a, 24)), ('ushr', a, 24)), (('iand', 0xffff, ('ushr', a, 16)), ('ushr', a, 16)), # Exponential/logarithmic identities - (('fexp2', ('flog2', a)), a), # 2^lg2(a) = a - (('flog2', ('fexp2', a)), a), # lg2(2^a) = a + (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a + (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b) - (('fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b - (('fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))), - ('fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) + d) = a^b * c^d - (('fpow', a, 1.0), a), - (('fpow', a, 2.0), ('fmul', a, a)), - (('fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))), - (('fpow', 2.0, a), ('fexp2', a)), - (('fpow', ('fpow', a, 2.2), 0.454545), a), - (('fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)), - (('fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))), - (('frcp', 
('fexp2', a)), ('fexp2', ('fneg', a))), - (('frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))), - (('flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))), - (('flog2', ('frcp', a)), ('fneg', ('flog2', a))), - (('flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))), - (('flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))), - (('fadd', ('flog2', a), ('flog2', b)), ('flog2', ('fmul', a, b))), - (('fadd', ('flog2', a), ('fneg', ('flog2', b))), ('flog2', ('fdiv', a, b))), - (('fmul', ('fexp2', a), ('fexp2', b)), ('fexp2', ('fadd', a, b))), + (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b + (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))), + ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) * d) = a^b * c^d + (('~fpow', a, 1.0), a), + (('~fpow', a, 2.0), ('fmul', a, a)), + (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))), + (('~fpow', 2.0, a), ('fexp2', a)), + (('~fpow', ('fpow', a, 2.2), 0.454545), a), + (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)), + (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))), + (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))), + (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))), + (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))), + (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))), + (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))), + (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))), + (('~fadd', ('flog2', a), ('flog2', b)), ('flog2', ('fmul', a, b))), + (('~fadd', ('flog2', a), ('fneg', ('flog2', b))), ('flog2', ('fdiv', a, b))), + (('~fmul', ('fexp2', a), ('fexp2', b)), ('fexp2', ('fadd', a, b))), # Division and reciprocal - (('fdiv', 1.0, a), ('frcp', a)), + (('~fdiv', 1.0, a), ('frcp', a)), (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'), - (('frcp', ('frcp', a)), a), - (('frcp', ('fsqrt', a)), ('frsq', a)), + (('~frcp', ('frcp', a)), a), + (('~frcp', ('fsqrt', a)), ('frsq', a)), (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'), - (('frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'), + (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'), # Boolean simplifications (('ieq', 'a@bool', True), a), (('ine', 'a@bool', True), ('inot', a)), @@ -216,6 +242,10 @@ optimizations = [ (('i2b', ('b2i', a)), a), (('f2i', ('ftrunc', a)), ('f2i', a)), (('f2u', ('ftrunc', a)), ('f2u', a)), + (('i2b', ('ineg', a)), ('i2b', a)), + (('i2b', ('iabs', a)), ('i2b', a)), + (('fabs', ('b2f', a)), ('b2f', a)), + (('iabs', ('b2i', a)), ('b2i', a)), # Byte extraction (('ushr', a, 24), ('extract_u8', a, 3), '!options->lower_extract_byte'), @@ -228,7 +258,7 @@ optimizations = [ (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'), # Subtracts - (('fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)), + (('~fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)), (('isub', a, ('isub', 0, b)), ('iadd', a, b)), (('ussub_4x8', a, 0), a), (('ussub_4x8', a, ~0), 0), @@ -236,7 +266,7 @@ optimizations = [ (('isub', a, b), ('iadd', a, ('ineg', b)), 'options->lower_sub'), (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'), (('ineg', a), ('isub', 0, a), 'options->lower_negate'), - (('fadd', a, ('fsub', 0.0, b)), ('fsub', a, b)), + (('~fadd', a, ('fsub', 0.0, b)), ('fsub', a, b)), (('iadd', a, ('isub', 0, b)), ('isub', a, b)), (('fabs', ('fsub', 0.0, a)), ('fabs', a)), (('iabs', ('isub', 0, a)), ('iabs', a)), @@ -368,10 +398,13 @@ for op in ['flt', 'fge', 
'feq', 'fne', # they help code generation but do not necessarily produce code that is # more easily optimizable. late_optimizations = [ + # Most of these optimizations aren't quite safe when you get infinity or + # NaN involved, but the first one should be fine. (('flt', ('fadd', a, b), 0.0), ('flt', a, ('fneg', b))), - (('fge', ('fadd', a, b), 0.0), ('fge', a, ('fneg', b))), - (('feq', ('fadd', a, b), 0.0), ('feq', a, ('fneg', b))), - (('fne', ('fadd', a, b), 0.0), ('fne', a, ('fneg', b))), + (('~fge', ('fadd', a, b), 0.0), ('fge', a, ('fneg', b))), + (('~feq', ('fadd', a, b), 0.0), ('feq', a, ('fneg', b))), + (('~fne', ('fadd', a, b), 0.0), ('fne', a, ('fneg', b))), + (('fdot2', a, b), ('fdot_replicated2', a, b), 'options->fdot_replicates'), (('fdot3', a, b), ('fdot_replicated3', a, b), 'options->fdot_replicates'), (('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'), diff --git a/src/compiler/nir/nir_opt_constant_folding.c b/src/compiler/nir/nir_opt_constant_folding.c index 04876a4..e64ca36 100644 --- a/src/compiler/nir/nir_opt_constant_folding.c +++ b/src/compiler/nir/nir_opt_constant_folding.c @@ -46,10 +46,28 @@ constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx) if (!instr->dest.dest.is_ssa) return false; + /* If any outputs/inputs have unsized types, then we need to guess the + * bit-size. In this case, the validator ensures that all bit-sizes match + * so we can just take the bit-size from the first output/input with an + * unsized type. If all the outputs/inputs are sized, then we don't need + * to guess the bit-size at all because the code we generate for constant + * opcodes in this case already knows the sizes of the types involved and + * does not need the provided bit-size for anything (although it still + * must receive a valid bit-size). + */ + unsigned bit_size = 0; + if (!nir_alu_type_get_type_size(nir_op_infos[instr->op].output_type)) + bit_size = instr->dest.dest.ssa.bit_size; + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { if (!instr->src[i].src.is_ssa) return false; + if (bit_size == 0 && + !nir_alu_type_get_type_size(nir_op_infos[instr->op].input_types[i])) { + bit_size = instr->src[i].src.ssa->bit_size; + } + nir_instr *src_instr = instr->src[i].src.ssa->parent_instr; if (src_instr->type != nir_instr_type_load_const) @@ -58,24 +76,31 @@ constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx) for (unsigned j = 0; j < nir_ssa_alu_instr_src_components(instr, i); j++) { - src[i].u[j] = load_const->value.u[instr->src[i].swizzle[j]]; + if (load_const->def.bit_size == 64) + src[i].u64[j] = load_const->value.u64[instr->src[i].swizzle[j]]; + else + src[i].u32[j] = load_const->value.u32[instr->src[i].swizzle[j]]; } /* We shouldn't have any source modifiers in the optimization loop. */ assert(!instr->src[i].abs && !instr->src[i].negate); } + if (bit_size == 0) + bit_size = 32; + /* We shouldn't have any saturate modifiers in the optimization loop. 
*/ assert(!instr->dest.saturate); nir_const_value dest = nir_eval_const_opcode(instr->op, instr->dest.dest.ssa.num_components, - src); + bit_size, src); nir_load_const_instr *new_instr = nir_load_const_instr_create(mem_ctx, instr->dest.dest.ssa.num_components); + new_instr->def.bit_size = instr->dest.dest.ssa.bit_size; new_instr->value = dest; nir_instr_insert_before(&instr->instr, &new_instr->instr); @@ -106,7 +131,7 @@ constant_fold_deref(nir_instr *instr, nir_deref_var *deref) nir_load_const_instr *indirect = nir_instr_as_load_const(arr->indirect.ssa->parent_instr); - arr->base_offset += indirect->value.u[0]; + arr->base_offset += indirect->value.u32[0]; /* Clear out the source */ nir_instr_rewrite_src(instr, &arr->indirect, nir_src_for_ssa(NULL)); diff --git a/src/compiler/nir/nir_opt_dead_cf.c b/src/compiler/nir/nir_opt_dead_cf.c index 4cc6798..4658b23 100644 --- a/src/compiler/nir/nir_opt_dead_cf.c +++ b/src/compiler/nir/nir_opt_dead_cf.c @@ -228,7 +228,7 @@ dead_cf_block(nir_block *block) if (!const_value) return false; - opt_constant_if(following_if, const_value->u[0] != 0); + opt_constant_if(following_if, const_value->u32[0] != 0); return true; } diff --git a/src/compiler/nir/nir_opt_peephole_select.c b/src/compiler/nir/nir_opt_peephole_select.c index 0fc658d..bad9dc4 100644 --- a/src/compiler/nir/nir_opt_peephole_select.c +++ b/src/compiler/nir/nir_opt_peephole_select.c @@ -210,7 +210,8 @@ nir_opt_peephole_select_block(nir_block *block, void *void_state) } nir_ssa_dest_init(&sel->instr, &sel->dest.dest, - phi->dest.ssa.num_components, phi->dest.ssa.name); + phi->dest.ssa.num_components, + phi->dest.ssa.bit_size, phi->dest.ssa.name); sel->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1; nir_ssa_def_rewrite_uses(&phi->dest.ssa, diff --git a/src/compiler/nir/nir_phi_builder.c b/src/compiler/nir/nir_phi_builder.c index 5429083..a39e360 100644 --- a/src/compiler/nir/nir_phi_builder.c +++ b/src/compiler/nir/nir_phi_builder.c @@ -52,6 +52,7 @@ struct nir_phi_builder_value { /* Needed so we can create phis and undefs */ unsigned num_components; + unsigned bit_size; /* The list of phi nodes associated with this value. Phi nodes are not * added directly. Instead, they are created, the instr->block pointer * @@ -61,8 +62,18 @@ struct nir_phi_builder_value { */ struct exec_list phis; - /* Array of SSA defs, indexed by block. If a phi needs to be inserted - * in a given block, it will have the magic value NEEDS_PHI. + /* Array of SSA defs, indexed by block. For each block, this array has + * one of three types of values: + * + * - NULL. Indicates that there is no known definition in this block. If + * you need to find one, look at the block's immediate dominator. + * + * - NEEDS_PHI. Indicates that the block may need a phi node but none has + * been created yet. If a def is requested for a block, a phi will need + * to be created. + * + * - A regular SSA def. This will be either the result of a phi node or + * one of the defs provided by nir_phi_builder_value_set_block_def(). 
*/ nir_ssa_def *defs[0]; }; @@ -101,7 +112,7 @@ nir_phi_builder_create(nir_function_impl *impl) struct nir_phi_builder_value * nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components, - const BITSET_WORD *defs) + unsigned bit_size, const BITSET_WORD *defs) { struct nir_phi_builder_value *val; unsigned i, w_start = 0, w_end = 0; @@ -109,6 +120,7 @@ nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components, val = rzalloc_size(pb, sizeof(*val) + sizeof(val->defs[0]) * pb->num_blocks); val->builder = pb; val->num_components = num_components; + val->bit_size = bit_size; exec_list_make_empty(&val->phis); exec_list_push_tail(&pb->values, &val->node); @@ -127,8 +139,7 @@ nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components, set_foreach(cur->dom_frontier, dom_entry) { nir_block *next = (nir_block *) dom_entry->key; - /* - * If there's more than one return statement, then the end block + /* If there's more than one return statement, then the end block * can be a join point for some definitions. However, there are * no instructions in the end block, so nothing would use those * phi nodes. Of course, we couldn't place those phi nodes @@ -139,6 +150,10 @@ nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components, continue; if (val->defs[next->index] == NULL) { + /* Instead of creating a phi node immediately, we simply set the + * value to the magic value NEEDS_PHI. Later, we create phi nodes + * on demand in nir_phi_builder_value_get_block_def(). + */ val->defs[next->index] = NEEDS_PHI; if (pb->work[next->index] < pb->iter_count) { @@ -163,7 +178,9 @@ nir_ssa_def * nir_phi_builder_value_get_block_def(struct nir_phi_builder_value *val, nir_block *block) { + /* For each block, we have one of three types of values */ if (val->defs[block->index] == NULL) { + /* NULL indicates that we have no SSA def for this block. */ if (block->imm_dom) { /* Grab it from our immediate dominator. We'll stash it here for * easy access later. @@ -185,17 +202,36 @@ nir_phi_builder_value_get_block_def(struct nir_phi_builder_value *val, return &undef->def; } } else if (val->defs[block->index] == NEEDS_PHI) { - /* If we need a phi instruction, go ahead and create one but don't - * add it to the program yet. Later, we'll go through and set up phi - * sources and add the instructions will be added at that time. + /* The magic value NEEDS_PHI indicates that the block needs a phi node + * but none has been created. We need to create one now so we can + * return it to the caller. + * + * Because a phi node may use SSA defs that it does not dominate (this + * happens in loops), we do not yet have enough information to fully + * fill out the phi node. Instead, the phi nodes we create here will be + * empty (have no sources) and won't actually be placed in the block's + * instruction list yet. Later, in nir_phi_builder_finish(), we walk + * over all of the phi instructions, fill out the sources lists, and + * place them at the top of their respective block's instruction list. + * + * Creating phi nodes on-demand allows us to avoid creating dead phi + * nodes that will just get deleted later. While this probably isn't a + * big win for a full into-SSA pass, other users may use the phi builder + * to make small SSA form repairs where most of the phi nodes will never + * be used. 
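(Editor's sketch, not part of the patch: the three-state lookup described above, modeled in Python. Block, get_block_def, and the string stand-ins for SSA defs are hypothetical simplifications of nir_block, nir_phi_builder_value_get_block_def(), and real nir_ssa_def pointers.)

    NEEDS_PHI = object()   # the magic marker stored in the defs[] array

    class Block:
        def __init__(self, index, imm_dom=None):
            self.index, self.imm_dom = index, imm_dom

    def get_block_def(defs, block):
        d = defs[block.index]
        if d is None:
            # No def here: inherit from the immediate dominator and cache
            # it, or fall back to an undef when there is no dominator.
            d = get_block_def(defs, block.imm_dom) if block.imm_dom else "undef"
            defs[block.index] = d
        elif d is NEEDS_PHI:
            # Create an empty phi on demand; its sources are filled in
            # later, as nir_phi_builder_finish() does.
            d = "phi@block%d" % block.index
            defs[block.index] = d
        return d

    b0 = Block(0); b1 = Block(1, b0); b2 = Block(2, b0)
    defs = {0: "def-in-b0", 1: None, 2: NEEDS_PHI}
    assert get_block_def(defs, b1) == "def-in-b0"   # walked up to b0
    assert get_block_def(defs, b2) == "phi@block2"  # phi created on demand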
*/ nir_phi_instr *phi = nir_phi_instr_create(val->builder->shader); - nir_ssa_dest_init(&phi->instr, &phi->dest, val->num_components, NULL); + nir_ssa_dest_init(&phi->instr, &phi->dest, val->num_components, + val->bit_size, NULL); phi->instr.block = block; exec_list_push_tail(&val->phis, &phi->instr.node); val->defs[block->index] = &phi->dest.ssa; return &phi->dest.ssa; } else { + /* In this case, we have an actual SSA def. It's either the result of a + * phi node created by the case above or one passed to us through + * nir_phi_builder_value_set_block_def(). + */ return val->defs[block->index]; } } @@ -216,9 +252,14 @@ nir_phi_builder_finish(struct nir_phi_builder *pb) NIR_VLA(nir_block *, preds, num_blocks); foreach_list_typed(struct nir_phi_builder_value, val, node, &pb->values) { - /* We can't iterate over the list of phis normally because we are - * removing them as we go and, in some cases, adding new phis as we - * build the source lists of others. + /* We treat the linked list of phi nodes like a worklist. The list is + * pre-populated by calls to nir_phi_builder_value_get_block_def() that + * create phi nodes. As we fill in the sources of phi nodes, more may + * be created and are added to the end of the list. + * + * Because we are adding and removing phi nodes from the list as we go, + * we can't iterate over it normally. Instead, we just iterate until + * the list is empty. */ while (!exec_list_is_empty(&val->phis)) { struct exec_node *head = exec_list_get_head(&val->phis); diff --git a/src/compiler/nir/nir_phi_builder.h b/src/compiler/nir/nir_phi_builder.h index 50251bf..edc5302 100644 --- a/src/compiler/nir/nir_phi_builder.h +++ b/src/compiler/nir/nir_phi_builder.h @@ -25,7 +25,38 @@ #include "nir.h" +/** A helper for placing phi nodes in a NIR shader + * + * Basic usage goes something like this: + * + * each variable, var, has: + * a bitset var.defs of blocks where the variable is defined + * a struct nir_phi_builder_value *pb_val + * + * // initialize bitsets + * foreach block: + * foreach def of variable var: + * var.defs[def.block] = true; + * + * // initialize phi builder + * pb = nir_phi_builder_create() + * foreach var: + * var.pb_val = nir_phi_builder_add_value(pb, var.defs) + * + * // Visit each block. This needs to visit dominators first; + * // nir_for_each_block() will be ok. + * foreach block: + * foreach instruction: + * foreach use of variable var: + * replace use with nir_phi_builder_get_block_def(var.pb_val) + * foreach def of variable var: + * create ssa def, register with + * nir_phi_builder_set_block_def(var.pb_val) + * + * nir_phi_builder_finish(pb) + */ struct nir_phi_builder; + struct nir_phi_builder_value; /* Create a new phi builder. @@ -43,7 +74,7 @@ struct nir_phi_builder *nir_phi_builder_create(nir_function_impl *impl); */ struct nir_phi_builder_value * nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components, - const BITSET_WORD *defs); + unsigned bit_size, const BITSET_WORD *defs); /* Register a definition for the given value and block. 
* diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c index 24d5281..60b74d1 100644 --- a/src/compiler/nir/nir_print.c +++ b/src/compiler/nir/nir_print.c @@ -207,6 +207,8 @@ print_alu_instr(nir_alu_instr *instr, print_state *state) print_alu_dest(&instr->dest, state); fprintf(fp, " = %s", nir_op_infos[instr->op].name); + if (instr->exact) + fprintf(fp, "!"); if (instr->dest.saturate) fprintf(fp, ".sat"); fprintf(fp, " "); @@ -714,7 +716,7 @@ print_load_const_instr(nir_load_const_instr *instr, print_state *state) * and then print the float in a comment for readability. */ - fprintf(fp, "0x%08x /* %f */", instr->value.u[i], instr->value.f[i]); + fprintf(fp, "0x%08x /* %f */", instr->value.u32[i], instr->value.f32[i]); } fprintf(fp, ")"); diff --git a/src/compiler/nir/nir_repair_ssa.c b/src/compiler/nir/nir_repair_ssa.c index 3ab4f0f..96c791c 100644 --- a/src/compiler/nir/nir_repair_ssa.c +++ b/src/compiler/nir/nir_repair_ssa.c @@ -85,7 +85,8 @@ repair_ssa_def(nir_ssa_def *def, void *void_state) BITSET_SET(state->def_set, def->parent_instr->block->index); struct nir_phi_builder_value *val = - nir_phi_builder_add_value(pb, def->num_components, state->def_set); + nir_phi_builder_add_value(pb, def->num_components, def->bit_size, + state->def_set); nir_phi_builder_value_set_block_def(val, def->parent_instr->block, def); diff --git a/src/compiler/nir/nir_search.c b/src/compiler/nir/nir_search.c index 56d7e81..6e63063 100644 --- a/src/compiler/nir/nir_search.c +++ b/src/compiler/nir/nir_search.c @@ -62,7 +62,8 @@ alu_instr_is_bool(nir_alu_instr *instr) case nir_op_inot: return src_is_bool(instr->src[0].src); default: - return nir_op_infos[instr->op].output_type == nir_type_bool; + return (nir_alu_type_get_base_type(nir_op_infos[instr->op].output_type) + == nir_type_bool); } } @@ -125,8 +126,10 @@ match_value(const nir_search_value *value, nir_alu_instr *instr, unsigned src, nir_alu_instr *src_alu = nir_instr_as_alu(instr->src[src].src.ssa->parent_instr); - if (nir_op_infos[src_alu->op].output_type != var->type && - !(var->type == nir_type_bool && alu_instr_is_bool(src_alu))) + if (nir_alu_type_get_base_type(nir_op_infos[src_alu->op].output_type) != + var->type && + !(nir_alu_type_get_base_type(var->type) == nir_type_bool && + alu_instr_is_bool(src_alu))) return false; } @@ -158,21 +161,65 @@ match_value(const nir_search_value *value, nir_alu_instr *instr, unsigned src, nir_load_const_instr *load = nir_instr_as_load_const(instr->src[src].src.ssa->parent_instr); - switch (nir_op_infos[instr->op].input_types[src]) { + switch (const_val->type) { case nir_type_float: for (unsigned i = 0; i < num_components; ++i) { - if (load->value.f[new_swizzle[i]] != const_val->data.f) + double val; + switch (load->def.bit_size) { + case 32: + val = load->value.f32[new_swizzle[i]]; + break; + case 64: + val = load->value.f64[new_swizzle[i]]; + break; + default: + unreachable("unknown bit size"); + } + + if (val != const_val->data.d) return false; } return true; + case nir_type_int: + for (unsigned i = 0; i < num_components; ++i) { + int64_t val; + switch (load->def.bit_size) { + case 32: + val = load->value.i32[new_swizzle[i]]; + break; + case 64: + val = load->value.i64[new_swizzle[i]]; + break; + default: + unreachable("unknown bit size"); + } + + if (val != const_val->data.i) + return false; + } + return true; + case nir_type_uint: - case nir_type_bool: + case nir_type_bool32: for (unsigned i = 0; i < num_components; ++i) { - if (load->value.i[new_swizzle[i]] != const_val->data.i) + uint64_t 
val; + switch (load->def.bit_size) { + case 32: + val = load->value.u32[new_swizzle[i]]; + break; + case 64: + val = load->value.u64[new_swizzle[i]]; + break; + default: + unreachable("unknown bit size"); + } + + if (val != const_val->data.u) return false; } return true; + default: unreachable("Invalid alu source type"); } @@ -191,6 +238,10 @@ match_expression(const nir_search_expression *expr, nir_alu_instr *instr, if (instr->op != expr->opcode) return false; + assert(instr->dest.dest.is_ssa); + if (expr->inexact && instr->exact) + return false; + assert(!instr->dest.saturate); assert(nir_op_infos[instr->op].num_inputs > 0); @@ -244,9 +295,123 @@ match_expression(const nir_search_expression *expr, nir_alu_instr *instr, } } +typedef struct bitsize_tree { + unsigned num_srcs; + struct bitsize_tree *srcs[4]; + + unsigned common_size; + bool is_src_sized[4]; + bool is_dest_sized; + + unsigned dest_size; + unsigned src_size[4]; +} bitsize_tree; + +static bitsize_tree * +build_bitsize_tree(void *mem_ctx, struct match_state *state, + const nir_search_value *value) +{ + bitsize_tree *tree = rzalloc(mem_ctx, bitsize_tree); + + switch (value->type) { + case nir_search_value_expression: { + nir_search_expression *expr = nir_search_value_as_expression(value); + nir_op_info info = nir_op_infos[expr->opcode]; + tree->num_srcs = info.num_inputs; + tree->common_size = 0; + for (unsigned i = 0; i < info.num_inputs; i++) { + tree->is_src_sized[i] = !!nir_alu_type_get_type_size(info.input_types[i]); + if (tree->is_src_sized[i]) + tree->src_size[i] = nir_alu_type_get_type_size(info.input_types[i]); + tree->srcs[i] = build_bitsize_tree(mem_ctx, state, expr->srcs[i]); + } + tree->is_dest_sized = !!nir_alu_type_get_type_size(info.output_type); + if (tree->is_dest_sized) + tree->dest_size = nir_alu_type_get_type_size(info.output_type); + break; + } + + case nir_search_value_variable: { + nir_search_variable *var = nir_search_value_as_variable(value); + tree->num_srcs = 0; + tree->is_dest_sized = true; + tree->dest_size = nir_src_bit_size(state->variables[var->variable].src); + break; + } + + case nir_search_value_constant: { + tree->num_srcs = 0; + tree->is_dest_sized = false; + tree->common_size = 0; + break; + } + } + + return tree; +} + +static unsigned +bitsize_tree_filter_up(bitsize_tree *tree) +{ + for (unsigned i = 0; i < tree->num_srcs; i++) { + unsigned src_size = bitsize_tree_filter_up(tree->srcs[i]); + if (src_size == 0) + continue; + + if (tree->is_src_sized[i]) { + assert(src_size == tree->src_size[i]); + } else if (tree->common_size != 0) { + assert(src_size == tree->common_size); + tree->src_size[i] = src_size; + } else { + tree->common_size = src_size; + tree->src_size[i] = src_size; + } + } + + if (tree->num_srcs && tree->common_size) { + if (tree->dest_size == 0) + tree->dest_size = tree->common_size; + else if (!tree->is_dest_sized) + assert(tree->dest_size == tree->common_size); + + for (unsigned i = 0; i < tree->num_srcs; i++) { + if (!tree->src_size[i]) + tree->src_size[i] = tree->common_size; + } + } + + return tree->dest_size; +} + +static void +bitsize_tree_filter_down(bitsize_tree *tree, unsigned size) +{ + if (tree->dest_size) + assert(tree->dest_size == size); + else + tree->dest_size = size; + + if (!tree->is_dest_sized) { + if (tree->common_size) + assert(tree->common_size == size); + else + tree->common_size = size; + } + + for (unsigned i = 0; i < tree->num_srcs; i++) { + if (!tree->src_size[i]) { + assert(tree->common_size); + tree->src_size[i] = tree->common_size; + } + 
bitsize_tree_filter_down(tree->srcs[i], tree->src_size[i]); } } static nir_alu_src -construct_value(const nir_search_value *value, nir_alu_type type, - unsigned num_components, struct match_state *state, +construct_value(const nir_search_value *value, + unsigned num_components, bitsize_tree *bitsize, bool exact, + struct match_state *state, nir_instr *instr, void *mem_ctx) { switch (value->type) { @@ -257,7 +422,9 @@ construct_value(const nir_search_value *value, nir_alu_type type, num_components = nir_op_infos[expr->opcode].output_size; nir_alu_instr *alu = nir_alu_instr_create(mem_ctx, expr->opcode); - nir_ssa_dest_init(&alu->instr, &alu->dest.dest, num_components, NULL); + nir_ssa_dest_init(&alu->instr, &alu->dest.dest, num_components, + bitsize->dest_size, NULL); + alu->exact = exact; alu->dest.write_mask = (1 << num_components) - 1; alu->dest.saturate = false; @@ -269,8 +436,7 @@ construct_value(const nir_search_value *value, nir_alu_type type, num_components = nir_op_infos[alu->op].input_sizes[i]; alu->src[i] = construct_value(expr->srcs[i], - nir_op_infos[alu->op].input_types[i], - num_components, + num_components, bitsize->srcs[i], exact, state, instr, mem_ctx); } @@ -301,23 +467,57 @@ construct_value(const nir_search_value *value, nir_alu_type type, const nir_search_constant *c = nir_search_value_as_constant(value); nir_load_const_instr *load = nir_load_const_instr_create(mem_ctx, 1); - switch (type) { + switch (c->type) { case nir_type_float: - load->def.name = ralloc_asprintf(mem_ctx, "%f", c->data.f); - load->value.f[0] = c->data.f; + load->def.name = ralloc_asprintf(load, "%f", c->data.d); + switch (bitsize->dest_size) { + case 32: + load->value.f32[0] = c->data.d; + break; + case 64: + load->value.f64[0] = c->data.d; + break; + default: + unreachable("unknown bit size"); + } break; + case nir_type_int: - load->def.name = ralloc_asprintf(mem_ctx, "%d", c->data.i); - load->value.i[0] = c->data.i; + load->def.name = ralloc_asprintf(load, "%" PRIi64, c->data.i); + switch (bitsize->dest_size) { + case 32: + load->value.i32[0] = c->data.i; + break; + case 64: + load->value.i64[0] = c->data.i; + break; + default: + unreachable("unknown bit size"); + } break; + case nir_type_uint: - case nir_type_bool: - load->value.u[0] = c->data.u; + load->def.name = ralloc_asprintf(load, "%" PRIu64, c->data.u); + switch (bitsize->dest_size) { + case 32: + load->value.u32[0] = c->data.u; + break; + case 64: + load->value.u64[0] = c->data.u; + break; + default: + unreachable("unknown bit size"); + } + break; + + case nir_type_bool32: + load->value.u32[0] = c->data.u; break; default: unreachable("Invalid alu source type"); } + load->def.bit_size = bitsize->dest_size; + nir_instr_insert_before(instr, &load->instr); nir_alu_src val; @@ -352,6 +552,11 @@ nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search, swizzle, &state)) return NULL; + void *bitsize_ctx = ralloc_context(NULL); + bitsize_tree *tree = build_bitsize_tree(bitsize_ctx, &state, replace); + bitsize_tree_filter_up(tree); + bitsize_tree_filter_down(tree, instr->dest.dest.ssa.bit_size); + /* Inserting a mov may be unnecessary. However, it's much easier to * simply let copy propagation clean this up than to try to go through * and rewrite swizzles ourselves. 
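(Editor's sketch, not part of the patch: a toy Python model of the two-pass inference performed by bitsize_tree_filter_up() and bitsize_tree_filter_down() above. It simplifies the real C structure to a single size field per node, where 0 means unknown; the real tree also tracks per-source fixed sizes from nir_op_infos.)

    def filter_up(node):
        # Pull known sizes from the leaves toward the root.
        common = 0
        for src in node["srcs"]:
            s = filter_up(src)
            if s:
                assert common in (0, s)   # unsized operands must agree
                common = s
        if node["size"] == 0:
            node["size"] = common
        return node["size"]

    def filter_down(node, size):
        # Push the destination size back down to anything still unsized.
        if node["size"] == 0:
            node["size"] = size
        for src in node["srcs"]:
            filter_down(src, node["size"])

    # Replacement ('fadd', a, b) where the matched 'a' is 64-bit: both 'b'
    # and the fadd result are inferred to be 64-bit.
    expr = {"size": 0, "srcs": [{"size": 64, "srcs": []}, {"size": 0, "srcs": []}]}
    filter_down(expr, filter_up(expr))
    assert expr["size"] == 64 and expr["srcs"][1]["size"] == 64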
@@ -359,11 +564,12 @@ nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search, nir_alu_instr *mov = nir_alu_instr_create(mem_ctx, nir_op_imov); mov->dest.write_mask = instr->dest.write_mask; nir_ssa_dest_init(&mov->instr, &mov->dest.dest, - instr->dest.dest.ssa.num_components, NULL); + instr->dest.dest.ssa.num_components, + instr->dest.dest.ssa.bit_size, NULL); - mov->src[0] = construct_value(replace, nir_op_infos[instr->op].output_type, - instr->dest.dest.ssa.num_components, &state, - &instr->instr, mem_ctx); + mov->src[0] = construct_value(replace, + instr->dest.dest.ssa.num_components, tree, + instr->exact, &state, &instr->instr, mem_ctx); nir_instr_insert_before(&instr->instr, &mov->instr); nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, @@ -375,5 +581,7 @@ nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search, */ nir_instr_remove(&instr->instr); + ralloc_free(bitsize_ctx); + return mov; } diff --git a/src/compiler/nir/nir_search.h b/src/compiler/nir/nir_search.h index 7d47792..61742f1 100644 --- a/src/compiler/nir/nir_search.h +++ b/src/compiler/nir/nir_search.h @@ -71,16 +71,24 @@ typedef struct { typedef struct { nir_search_value value; + nir_alu_type type; + union { - uint32_t u; - int32_t i; - float f; + uint64_t u; + int64_t i; + double d; } data; } nir_search_constant; typedef struct { nir_search_value value; + /* When set on a search expression, the expression will only match an SSA + * value that does *not* have the exact bit set. If unset, the exact bit + * on the SSA value is ignored. + */ + bool inexact; + nir_op opcode; const nir_search_value *srcs[4]; } nir_search_expression; diff --git a/src/compiler/nir/nir_to_ssa.c b/src/compiler/nir/nir_to_ssa.c index 44a5054..d588d7d 100644 --- a/src/compiler/nir/nir_to_ssa.c +++ b/src/compiler/nir/nir_to_ssa.c @@ -219,7 +219,9 @@ rewrite_def_forwards(nir_dest *dest, void *_state) state->states[index].num_defs); list_del(&dest->reg.def_link); - nir_ssa_dest_init(state->parent_instr, dest, reg->num_components, name); + nir_ssa_dest_init(state->parent_instr, dest, reg->num_components, + reg->bit_size, name); + ralloc_free(name); /* push our SSA destination on the stack */ state->states[index].index++; @@ -271,7 +273,9 @@ rewrite_alu_instr_forward(nir_alu_instr *instr, rewrite_state *state) instr->dest.write_mask = (1 << num_components) - 1; list_del(&instr->dest.dest.reg.def_link); - nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, name); + nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, + reg->bit_size, name); + ralloc_free(name); if (nir_op_infos[instr->op].output_size == 0) { /* diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c index 0c32d5f..9f18d1c 100644 --- a/src/compiler/nir/nir_validate.c +++ b/src/compiler/nir/nir_validate.c @@ -179,9 +179,12 @@ validate_alu_src(nir_alu_instr *instr, unsigned index, validate_state *state) nir_alu_src *src = &instr->src[index]; unsigned num_components; - if (src->src.is_ssa) + unsigned src_bit_size; + if (src->src.is_ssa) { + src_bit_size = src->src.ssa->bit_size; num_components = src->src.ssa->num_components; - else { + } else { + src_bit_size = src->src.reg.reg->bit_size; if (src->src.reg.reg->is_packed) num_components = 4; /* can't check anything */ else @@ -194,6 +197,24 @@ validate_alu_src(nir_alu_instr *instr, unsigned index, validate_state *state) assert(src->swizzle[i] < num_components); } + nir_alu_type src_type = nir_op_infos[instr->op].input_types[index]; + + /* 8-bit float 
isn't a thing */ + if (nir_alu_type_get_base_type(src_type) == nir_type_float) + assert(src_bit_size == 16 || src_bit_size == 32 || src_bit_size == 64); + + if (nir_alu_type_get_type_size(src_type)) { + /* This source has an explicit bit size */ + assert(nir_alu_type_get_type_size(src_type) == src_bit_size); + } else { + if (!nir_alu_type_get_type_size(nir_op_infos[instr->op].output_type)) { + unsigned dest_bit_size = + instr->dest.dest.is_ssa ? instr->dest.dest.ssa.bit_size + : instr->dest.dest.reg.reg->bit_size; + assert(dest_bit_size == src_bit_size); + } + } + validate_src(&src->src, state); } @@ -263,8 +284,10 @@ validate_dest(nir_dest *dest, validate_state *state) } static void -validate_alu_dest(nir_alu_dest *dest, validate_state *state) +validate_alu_dest(nir_alu_instr *instr, validate_state *state) { + nir_alu_dest *dest = &instr->dest; + unsigned dest_size = dest->dest.is_ssa ? dest->dest.ssa.num_components : dest->dest.reg.reg->num_components; @@ -282,6 +305,17 @@ validate_alu_dest(nir_alu_dest *dest, validate_state *state) assert(nir_op_infos[alu->op].output_type == nir_type_float || !dest->saturate); + unsigned bit_size = dest->dest.is_ssa ? dest->dest.ssa.bit_size + : dest->dest.reg.reg->bit_size; + nir_alu_type type = nir_op_infos[instr->op].output_type; + + /* 8-bit float isn't a thing */ + if (nir_alu_type_get_base_type(type) == nir_type_float) + assert(bit_size == 16 || bit_size == 32 || bit_size == 64); + + assert(nir_alu_type_get_type_size(type) == 0 || + nir_alu_type_get_type_size(type) == bit_size); + validate_dest(&dest->dest, state); } @@ -294,7 +328,7 @@ validate_alu_instr(nir_alu_instr *instr, validate_state *state) validate_alu_src(instr, i, state); } - validate_alu_dest(&instr->dest, state); + validate_alu_dest(instr, state); } static void diff --git a/src/compiler/nir/spirv/spirv_to_nir.c b/src/compiler/nir/spirv/spirv_to_nir.c index 5a7184a..42a1f95 100644 --- a/src/compiler/nir/spirv/spirv_to_nir.c +++ b/src/compiler/nir/spirv/spirv_to_nir.c @@ -92,7 +92,7 @@ vtn_const_ssa_value(struct vtn_builder *b, nir_constant *constant, nir_load_const_instr_create(b->shader, num_components); for (unsigned i = 0; i < num_components; i++) - load->value.u[i] = constant->value.u[i]; + load->value.u32[i] = constant->value.u[i]; nir_instr_insert_before_cf_list(&b->impl->body, &load->instr); val->def = &load->def; @@ -109,7 +109,7 @@ vtn_const_ssa_value(struct vtn_builder *b, nir_constant *constant, nir_load_const_instr_create(b->shader, rows); for (unsigned j = 0; j < rows; j++) - load->value.u[j] = constant->value.u[rows * i + j]; + load->value.u32[j] = constant->value.u[rows * i + j]; nir_instr_insert_before_cf_list(&b->impl->body, &load->instr); col_val->def = &load->def; @@ -1035,6 +1035,8 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode, nir_op op = vtn_nir_alu_op_for_spirv_opcode(opcode, &swap); unsigned num_components = glsl_get_vector_elements(val->const_type); + unsigned bit_size = + glsl_get_bit_size(glsl_get_base_type(val->const_type)); nir_const_value src[3]; assert(count <= 7); @@ -1043,14 +1045,16 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode, vtn_value(b, w[4 + i], vtn_value_type_constant)->constant; unsigned j = swap ? 
1 - i : i; + assert(bit_size == 32); for (unsigned k = 0; k < num_components; k++) - src[j].u[k] = c->value.u[k]; + src[j].u32[k] = c->value.u[k]; } - nir_const_value res = nir_eval_const_opcode(op, num_components, src); + nir_const_value res = nir_eval_const_opcode(op, num_components, + bit_size, src); for (unsigned k = 0; k < num_components; k++) - val->constant->value.u[k] = res.u[k]; + val->constant->value.u[k] = res.u32[k]; return; } /* default */ @@ -1414,7 +1418,7 @@ vtn_handle_texture(struct vtn_builder *b, SpvOp opcode, } nir_ssa_dest_init(&instr->instr, &instr->dest, - nir_tex_instr_dest_size(instr), NULL); + nir_tex_instr_dest_size(instr), 32, NULL); assert(glsl_get_vector_elements(ret_type->type) == nir_tex_instr_dest_size(instr)); @@ -1600,7 +1604,7 @@ vtn_handle_image(struct vtn_builder *b, SpvOp opcode, if (opcode != SpvOpImageWrite) { struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa); struct vtn_type *type = vtn_value(b, w[1], vtn_value_type_type)->type; - nir_ssa_dest_init(&intrin->instr, &intrin->dest, 4, NULL); + nir_ssa_dest_init(&intrin->instr, &intrin->dest, 4, 32, NULL); nir_builder_instr_insert(&b->nb, &intrin->instr); @@ -1738,7 +1742,7 @@ vtn_handle_ssbo_or_shared_atomic(struct vtn_builder *b, SpvOp opcode, fill_common_atomic_sources(b, opcode, w, &atomic->src[2]); } - nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, NULL); + nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, 32, NULL); struct vtn_type *type = vtn_value(b, w[1], vtn_value_type_type)->type; struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa); @@ -1750,7 +1754,7 @@ vtn_handle_ssbo_or_shared_atomic(struct vtn_builder *b, SpvOp opcode, } static nir_alu_instr * -create_vec(nir_shader *shader, unsigned num_components) +create_vec(nir_shader *shader, unsigned num_components, unsigned bit_size) { nir_op op; switch (num_components) { @@ -1762,7 +1766,8 @@ create_vec(nir_shader *shader, unsigned num_components) } nir_alu_instr *vec = nir_alu_instr_create(shader, op); - nir_ssa_dest_init(&vec->instr, &vec->dest.dest, num_components, NULL); + nir_ssa_dest_init(&vec->instr, &vec->dest.dest, num_components, + bit_size, NULL); vec->dest.write_mask = (1 << num_components) - 1; return vec; @@ -1779,7 +1784,8 @@ vtn_ssa_transpose(struct vtn_builder *b, struct vtn_ssa_value *src) for (unsigned i = 0; i < glsl_get_matrix_columns(dest->type); i++) { nir_alu_instr *vec = create_vec(b->shader, - glsl_get_matrix_columns(src->type)); + glsl_get_matrix_columns(src->type), + glsl_get_bit_size(glsl_get_base_type(src->type))); if (glsl_type_is_vector_or_scalar(src->type)) { vec->src[0].src = nir_src_for_ssa(src->def); vec->src[0].swizzle[0] = i; @@ -1809,7 +1815,8 @@ nir_ssa_def * vtn_vector_insert(struct vtn_builder *b, nir_ssa_def *src, nir_ssa_def *insert, unsigned index) { - nir_alu_instr *vec = create_vec(b->shader, src->num_components); + nir_alu_instr *vec = create_vec(b->shader, src->num_components, + src->bit_size); for (unsigned i = 0; i < src->num_components; i++) { if (i == index) { @@ -1854,7 +1861,7 @@ vtn_vector_shuffle(struct vtn_builder *b, unsigned num_components, nir_ssa_def *src0, nir_ssa_def *src1, const uint32_t *indices) { - nir_alu_instr *vec = create_vec(b->shader, num_components); + nir_alu_instr *vec = create_vec(b->shader, num_components, src0->bit_size); nir_ssa_undef_instr *undef = nir_ssa_undef_instr_create(b->shader, 1); nir_builder_instr_insert(&b->nb, &undef->instr); @@ -1884,7 +1891,8 @@ static nir_ssa_def * vtn_vector_construct(struct vtn_builder *b, 
unsigned num_components, unsigned num_srcs, nir_ssa_def **srcs) { - nir_alu_instr *vec = create_vec(b->shader, num_components); + nir_alu_instr *vec = create_vec(b->shader, num_components, + srcs[0]->bit_size); unsigned dest_idx = 0; for (unsigned i = 0; i < num_srcs; i++) { diff --git a/src/compiler/nir/spirv/vtn_glsl450.c b/src/compiler/nir/spirv/vtn_glsl450.c index 6b649fd..3360fda 100644 --- a/src/compiler/nir/spirv/vtn_glsl450.c +++ b/src/compiler/nir/spirv/vtn_glsl450.c @@ -627,7 +627,9 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint, nir_alu_instr *instr = nir_alu_instr_create(b->shader, op); nir_ssa_dest_init(&instr->instr, &instr->dest.dest, - glsl_get_vector_elements(val->ssa->type), val->name); + glsl_get_vector_elements(val->ssa->type), + glsl_get_bit_size(glsl_get_base_type(val->ssa->type)), + val->name); instr->dest.write_mask = (1 << instr->dest.dest.ssa.num_components) - 1; val->ssa->def = &instr->dest.dest.ssa; diff --git a/src/compiler/nir/spirv/vtn_variables.c b/src/compiler/nir/spirv/vtn_variables.c index 31bf416..3cbac1e 100644 --- a/src/compiler/nir/spirv/vtn_variables.c +++ b/src/compiler/nir/spirv/vtn_variables.c @@ -190,7 +190,9 @@ _vtn_local_load_store(struct vtn_builder *b, bool load, nir_deref_var *deref, if (load) { nir_ssa_dest_init(&intrin->instr, &intrin->dest, - intrin->num_components, NULL); + intrin->num_components, + glsl_get_bit_size(glsl_get_base_type(tail->type)), + NULL); inout->def = &intrin->dest.ssa; } else { nir_intrinsic_set_write_mask(intrin, (1 << intrin->num_components) - 1); @@ -322,7 +324,7 @@ get_vulkan_resource_index(struct vtn_builder *b, struct vtn_access_chain *chain, nir_intrinsic_set_desc_set(instr, chain->var->descriptor_set); nir_intrinsic_set_binding(instr, chain->var->binding); - nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL); + nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL); nir_builder_instr_insert(&b->nb, &instr->instr); return &instr->dest.ssa; @@ -411,7 +413,8 @@ _vtn_load_store_tail(struct vtn_builder *b, nir_intrinsic_op op, bool load, if (load) { nir_ssa_dest_init(&instr->instr, &instr->dest, - instr->num_components, NULL); + instr->num_components, + glsl_get_bit_size(glsl_get_base_type(type)), NULL); (*inout)->def = &instr->dest.ssa; } @@ -1385,7 +1388,7 @@ vtn_handle_variables(struct vtn_builder *b, SpvOp opcode, nir_intrinsic_instr_create(b->nb.shader, nir_intrinsic_get_buffer_size); instr->src[0] = nir_src_for_ssa(index); - nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL); + nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL); nir_builder_instr_insert(&b->nb, &instr->instr); nir_ssa_def *buf_size = &instr->dest.ssa;
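(Editor's closing sketch, illustrative only: the whole exact/inexact interaction added by this series reduces to one predicate, mirroring the "if (expr->inexact && instr->exact) return false;" check added to match_expression() earlier in the diff. Exact patterns may always fire; '~' patterns are refused only on instructions whose results are marked exact.)

    def may_apply(pattern_inexact, instr_exact):
        # An inexact ('~') pattern must never rewrite an instruction whose
        # result is exact; everything else is unaffected by the exact bit.
        return not (pattern_inexact and instr_exact)

    assert may_apply(False, True)       # exact pattern, exact instruction
    assert may_apply(True, False)       # inexact pattern, ordinary instruction
    assert not may_apply(True, True)    # inexact pattern, exact instruction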