/**************************************************************************
 * 
 * Copyright 2007 VMware, Inc.
 * All Rights Reserved.
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 * 
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 * 
 **************************************************************************/


#include <stdarg.h>
#include <string.h>

#include "i915_reg.h"
#include "i915_context.h"
#include "i915_fpc.h"
#include "i915_debug_private.h"

#include "pipe/p_shader_tokens.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/u_string.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_dump.h"

#include "draw/draw_vertex.h"

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

/**
 * Simple pass-through fragment shader to use when we don't have
 * a real shader (or it fails to compile for some reason).
*/ static unsigned passthrough_decl[] = { _3DSTATE_PIXEL_SHADER_PROGRAM | ((2*3)-1), /* declare input color: */ (D0_DCL | (REG_TYPE_T << D0_TYPE_SHIFT) | (T_DIFFUSE << D0_NR_SHIFT) | D0_CHANNEL_ALL), 0, 0, }; static unsigned passthrough_program[] = { /* move to output color: */ (A0_MOV | (REG_TYPE_OC << A0_DEST_TYPE_SHIFT) | A0_DEST_CHANNEL_ALL | (REG_TYPE_T << A0_SRC0_TYPE_SHIFT) | (T_DIFFUSE << A0_SRC0_NR_SHIFT)), 0x01230000, /* .xyzw */ 0 }; /* 1, -1/3!, 1/5!, -1/7! */ static const float scs_sin_constants[4] = { 1.0, -1.0f / (3 * 2 * 1), 1.0f / (5 * 4 * 3 * 2 * 1), -1.0f / (7 * 6 * 5 * 4 * 3 * 2 * 1) }; /* 1, -1/2!, 1/4!, -1/6! */ static const float scs_cos_constants[4] = { 1.0, -1.0f / (2 * 1), 1.0f / (4 * 3 * 2 * 1), -1.0f / (6 * 5 * 4 * 3 * 2 * 1) }; /* 2*pi, -(2*pi)^3/3!, (2*pi)^5/5!, -(2*pi)^7/7! */ static const float sin_constants[4] = { 2.0 * M_PI, -8.0f * M_PI * M_PI * M_PI / (3 * 2 * 1), 32.0f * M_PI * M_PI * M_PI * M_PI * M_PI / (5 * 4 * 3 * 2 * 1), -128.0f * M_PI * M_PI * M_PI * M_PI * M_PI * M_PI * M_PI / (7 * 6 * 5 * 4 * 3 * 2 * 1) }; /* 1, -(2*pi)^2/2!, (2*pi)^4/4!, -(2*pi)^6/6! */ static const float cos_constants[4] = { 1.0, -4.0f * M_PI * M_PI / (2 * 1), 16.0f * M_PI * M_PI * M_PI * M_PI / (4 * 3 * 2 * 1), -64.0f * M_PI * M_PI * M_PI * M_PI * M_PI * M_PI / (6 * 5 * 4 * 3 * 2 * 1) }; /** * component-wise negation of ureg */ static INLINE int negate(int reg, int x, int y, int z, int w) { /* Another neat thing about the UREG representation */ return reg ^ (((x & 1) << UREG_CHANNEL_X_NEGATE_SHIFT) | ((y & 1) << UREG_CHANNEL_Y_NEGATE_SHIFT) | ((z & 1) << UREG_CHANNEL_Z_NEGATE_SHIFT) | ((w & 1) << UREG_CHANNEL_W_NEGATE_SHIFT)); } /** * In the event of a translation failure, we'll generate a simple color * pass-through program. 
*/ static void i915_use_passthrough_shader(struct i915_fragment_shader *fs) { fs->program = (uint *) MALLOC(sizeof(passthrough_program)); fs->decl = (uint *) MALLOC(sizeof(passthrough_decl)); if (fs->program) { memcpy(fs->program, passthrough_program, sizeof(passthrough_program)); memcpy(fs->decl, passthrough_decl, sizeof(passthrough_decl)); fs->program_len = Elements(passthrough_program); fs->decl_len = Elements(passthrough_decl); } fs->num_constants = 0; } void i915_program_error(struct i915_fp_compile *p, const char *msg, ...) { va_list args; char buffer[1024]; debug_printf("i915_program_error: "); va_start( args, msg ); util_vsnprintf( buffer, sizeof(buffer), msg, args ); va_end( args ); debug_printf("%s", buffer); debug_printf("\n"); p->error = 1; } static uint get_mapping(struct i915_fragment_shader* fs, int unit) { int i; for (i = 0; i < I915_TEX_UNITS; i++) { if (fs->generic_mapping[i] == -1) { fs->generic_mapping[i] = unit; return i; } if (fs->generic_mapping[i] == unit) return i; } debug_printf("Exceeded max generics\n"); return 0; } /** * Construct a ureg for the given source register. Will emit * constants, apply swizzling and negation as needed. */ static uint src_vector(struct i915_fp_compile *p, const struct i915_full_src_register *source, struct i915_fragment_shader *fs) { uint index = source->Register.Index; uint src = 0, sem_name, sem_ind; switch (source->Register.File) { case TGSI_FILE_TEMPORARY: if (source->Register.Index >= I915_MAX_TEMPORARY) { i915_program_error(p, "Exceeded max temporary reg"); return 0; } src = UREG(REG_TYPE_R, index); break; case TGSI_FILE_INPUT: /* XXX: Packing COL1, FOGC into a single attribute works for * texenv programs, but will fail for real fragment programs * that use these attributes and expect them to be a full 4 * components wide. Could use a texcoord to pass these * attributes if necessary, but that won't work in the general * case. * * We also use a texture coordinate to pass wpos when possible. 
*/ sem_name = p->shader->info.input_semantic_name[index]; sem_ind = p->shader->info.input_semantic_index[index]; switch (sem_name) { case TGSI_SEMANTIC_POSITION: { /* for fragcoord */ int real_tex_unit = get_mapping(fs, I915_SEMANTIC_POS); src = i915_emit_decl(p, REG_TYPE_T, T_TEX0 + real_tex_unit, D0_CHANNEL_ALL); break; } case TGSI_SEMANTIC_COLOR: if (sem_ind == 0) { src = i915_emit_decl(p, REG_TYPE_T, T_DIFFUSE, D0_CHANNEL_ALL); } else { /* secondary color */ assert(sem_ind == 1); src = i915_emit_decl(p, REG_TYPE_T, T_SPECULAR, D0_CHANNEL_XYZ); src = swizzle(src, X, Y, Z, ONE); } break; case TGSI_SEMANTIC_FOG: src = i915_emit_decl(p, REG_TYPE_T, T_FOG_W, D0_CHANNEL_W); src = swizzle(src, W, W, W, W); break; case TGSI_SEMANTIC_GENERIC: { int real_tex_unit = get_mapping(fs, sem_ind); src = i915_emit_decl(p, REG_TYPE_T, T_TEX0 + real_tex_unit, D0_CHANNEL_ALL); break; } case TGSI_SEMANTIC_FACE: { /* for back/front faces */ int real_tex_unit = get_mapping(fs, I915_SEMANTIC_FACE); src = i915_emit_decl(p, REG_TYPE_T, T_TEX0 + real_tex_unit, D0_CHANNEL_X); break; } default: i915_program_error(p, "Bad source->Index"); return 0; } break; case TGSI_FILE_IMMEDIATE: assert(index < p->num_immediates); index = p->immediates_map[index]; /* fall-through */ case TGSI_FILE_CONSTANT: src = UREG(REG_TYPE_CONST, index); break; default: i915_program_error(p, "Bad source->File"); return 0; } src = swizzle(src, source->Register.SwizzleX, source->Register.SwizzleY, source->Register.SwizzleZ, source->Register.SwizzleW); /* There's both negate-all-components and per-component negation. * Try to handle both here. */ { int n = source->Register.Negate; src = negate(src, n, n, n, n); } /* no abs() */ #if 0 /* XXX assertions disabled to allow arbfplight.c to run */ /* XXX enable these assertions, or fix things */ assert(!source->Register.Absolute); #endif if (source->Register.Absolute) debug_printf("Unhandled absolute value\n"); return src; } /** * Construct a ureg for a destination register. 
*/ static uint get_result_vector(struct i915_fp_compile *p, const struct i915_full_dst_register *dest) { switch (dest->Register.File) { case TGSI_FILE_OUTPUT: { uint sem_name = p->shader->info.output_semantic_name[dest->Register.Index]; switch (sem_name) { case TGSI_SEMANTIC_POSITION: return UREG(REG_TYPE_OD, 0); case TGSI_SEMANTIC_COLOR: return UREG(REG_TYPE_OC, 0); default: i915_program_error(p, "Bad inst->DstReg.Index/semantics"); return 0; } } case TGSI_FILE_TEMPORARY: return UREG(REG_TYPE_R, dest->Register.Index); default: i915_program_error(p, "Bad inst->DstReg.File"); return 0; } } /** * Compute flags for saturation and writemask. */ static uint get_result_flags(const struct i915_full_instruction *inst) { const uint writeMask = inst->Dst[0].Register.WriteMask; uint flags = 0x0; if (inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE) flags |= A0_DEST_SATURATE; if (writeMask & TGSI_WRITEMASK_X) flags |= A0_DEST_CHANNEL_X; if (writeMask & TGSI_WRITEMASK_Y) flags |= A0_DEST_CHANNEL_Y; if (writeMask & TGSI_WRITEMASK_Z) flags |= A0_DEST_CHANNEL_Z; if (writeMask & TGSI_WRITEMASK_W) flags |= A0_DEST_CHANNEL_W; return flags; } /** * Convert TGSI_TEXTURE_x token to DO_SAMPLE_TYPE_x token */ static uint translate_tex_src_target(struct i915_fp_compile *p, uint tex) { switch (tex) { case TGSI_TEXTURE_SHADOW1D: /* fall-through */ case TGSI_TEXTURE_1D: return D0_SAMPLE_TYPE_2D; case TGSI_TEXTURE_SHADOW2D: /* fall-through */ case TGSI_TEXTURE_2D: return D0_SAMPLE_TYPE_2D; case TGSI_TEXTURE_SHADOWRECT: /* fall-through */ case TGSI_TEXTURE_RECT: return D0_SAMPLE_TYPE_2D; case TGSI_TEXTURE_3D: return D0_SAMPLE_TYPE_VOLUME; case TGSI_TEXTURE_CUBE: return D0_SAMPLE_TYPE_CUBE; default: i915_program_error(p, "TexSrc type"); return 0; } } /** * Return the number of coords needed to access a given TGSI_TEXTURE_* */ uint i915_num_coords(uint tex) { switch (tex) { case TGSI_TEXTURE_SHADOW1D: case TGSI_TEXTURE_1D: return 1; case TGSI_TEXTURE_SHADOW2D: case TGSI_TEXTURE_2D: case 
TGSI_TEXTURE_SHADOWRECT: case TGSI_TEXTURE_RECT: return 2; case TGSI_TEXTURE_3D: case TGSI_TEXTURE_CUBE: return 3; default: debug_printf("Unknown texture target for num coords"); return 2; } } /** * Generate texel lookup instruction. */ static void emit_tex(struct i915_fp_compile *p, const struct i915_full_instruction *inst, uint opcode, struct i915_fragment_shader* fs) { uint texture = inst->Texture.Texture; uint unit = inst->Src[1].Register.Index; uint tex = translate_tex_src_target( p, texture ); uint sampler = i915_emit_decl(p, REG_TYPE_S, unit, tex); uint coord = src_vector( p, &inst->Src[0], fs); i915_emit_texld( p, get_result_vector( p, &inst->Dst[0] ), get_result_flags( inst ), sampler, coord, opcode, i915_num_coords(texture) ); } /** * Generate a simple arithmetic instruction * \param opcode the i915 opcode * \param numArgs the number of input/src arguments */ static void emit_simple_arith(struct i915_fp_compile *p, const struct i915_full_instruction *inst, uint opcode, uint numArgs, struct i915_fragment_shader *fs) { uint arg1, arg2, arg3; assert(numArgs <= 3); arg1 = (numArgs < 1) ? 0 : src_vector( p, &inst->Src[0], fs ); arg2 = (numArgs < 2) ? 0 : src_vector( p, &inst->Src[1], fs ); arg3 = (numArgs < 3) ? 0 : src_vector( p, &inst->Src[2], fs ); i915_emit_arith( p, opcode, get_result_vector( p, &inst->Dst[0]), get_result_flags( inst ), 0, arg1, arg2, arg3 ); } /** As above, but swap the first two src regs */ static void emit_simple_arith_swap2(struct i915_fp_compile *p, const struct i915_full_instruction *inst, uint opcode, uint numArgs, struct i915_fragment_shader *fs) { struct i915_full_instruction inst2; assert(numArgs == 2); /* transpose first two registers */ inst2 = *inst; inst2.Src[0] = inst->Src[1]; inst2.Src[1] = inst->Src[0]; emit_simple_arith(p, &inst2, opcode, numArgs, fs); } /* * Translate TGSI instruction to i915 instruction. * * Possible concerns: * * DDX, DDY -- return 0 * SIN, COS -- could use another taylor step? 
* LIT -- results seem a little different to sw mesa * LOG -- different to mesa on negative numbers, but this is conformant. */ static void i915_translate_instruction(struct i915_fp_compile *p, const struct i915_full_instruction *inst, struct i915_fragment_shader *fs) { uint writemask; uint src0, src1, src2, flags; uint tmp = 0; switch (inst->Instruction.Opcode) { case TGSI_OPCODE_ABS: src0 = src_vector(p, &inst->Src[0], fs); i915_emit_arith(p, A0_MAX, get_result_vector(p, &inst->Dst[0]), get_result_flags(inst), 0, src0, negate(src0, 1, 1, 1, 1), 0); break; case TGSI_OPCODE_ADD: emit_simple_arith(p, inst, A0_ADD, 2, fs); break; case TGSI_OPCODE_CEIL: src0 = src_vector(p, &inst->Src[0], fs); tmp = i915_get_utemp(p); flags = get_result_flags(inst); i915_emit_arith(p, A0_FLR, tmp, flags & A0_DEST_CHANNEL_ALL, 0, negate(src0, 1, 1, 1, 1), 0, 0); i915_emit_arith(p, A0_MOV, get_result_vector(p, &inst->Dst[0]), flags, 0, negate(tmp, 1, 1, 1, 1), 0, 0); break; case TGSI_OPCODE_CMP: src0 = src_vector(p, &inst->Src[0], fs); src1 = src_vector(p, &inst->Src[1], fs); src2 = src_vector(p, &inst->Src[2], fs); i915_emit_arith(p, A0_CMP, get_result_vector(p, &inst->Dst[0]), get_result_flags(inst), 0, src0, src2, src1); /* NOTE: order of src2, src1 */ break; case TGSI_OPCODE_COS: src0 = src_vector(p, &inst->Src[0], fs); tmp = i915_get_utemp(p); i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_X, 0, src0, i915_emit_const1f(p, 1.0f / (float) (M_PI * 2.0)), 0); i915_emit_arith(p, A0_MOD, tmp, A0_DEST_CHANNEL_X, 0, tmp, 0, 0); /* * t0.xy = MUL x.xx11, x.x111 ; x^2, x, 1, 1 * t0 = MUL t0.xyxy t0.xx11 ; x^4, x^3, x^2, 1 * t0 = MUL t0.xxz1 t0.z111 ; x^6 x^4 x^2 1 * result = DP4 t0, cos_constants */ i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_XY, 0, swizzle(tmp, X, X, ONE, ONE), swizzle(tmp, X, ONE, ONE, ONE), 0); i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_XYZ, 0, swizzle(tmp, X, Y, X, ONE), swizzle(tmp, X, X, ONE, ONE), 0); i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_XYZ, 0, 
swizzle(tmp, X, X, Z, ONE), swizzle(tmp, Z, ONE, ONE, ONE), 0); i915_emit_arith(p, A0_DP4, get_result_vector(p, &inst->Dst[0]), get_result_flags(inst), 0, swizzle(tmp, ONE, Z, Y, X), i915_emit_const4fv(p, cos_constants), 0); break; case TGSI_OPCODE_DDX: case TGSI_OPCODE_DDY: /* XXX We just output 0 here */ debug_printf("Punting DDX/DDX\n"); src0 = get_result_vector(p, &inst->Dst[0]); i915_emit_arith(p, A0_MOV, get_result_vector(p, &inst->Dst[0]), get_result_flags(inst), 0, swizzle(src0, ZERO, ZERO, ZERO, ZERO), 0, 0); break; case TGSI_OPCODE_DP2: src0 = src_vector(p, &inst->Src[0], fs); src1 = src_vector(p, &inst->Src[1], fs); i915_emit_arith(p, A0_DP3, get_result_vector(p, &inst->Dst[0]), get_result_flags(inst), 0, swizzle(src0, X, Y, ZERO, ZERO), src1, 0); break; case TGSI_OPCODE_DP3: emit_simple_arith(p, inst, A0_DP3, 2, fs); break; case TGSI_OPCODE_DP4: emit_simple_arith(p, inst, A0_DP4, 2, fs); break; case TGSI_OPCODE_DPH: src0 = src_vector(p, &inst->Src[0], fs); src1 = src_vector(p, &inst->Src[1], fs); i915_emit_arith(p, A0_DP4, get_result_vector(p, &inst->Dst[0]), get_result_flags(inst), 0, swizzle(src0, X, Y, Z, ONE), src1, 0); break; case TGSI_OPCODE_DST: src0 = src_vector(p, &inst->Src[0], fs); src1 = src_vector(p, &inst->Src[1], fs); /* result[0] = 1 * 1; * result[1] = a[1] * b[1]; * result[2] = a[2] * 1; * result[3] = 1 * b[3]; */ i915_emit_arith(p, A0_MUL, get_result_vector(p, &inst->Dst[0]), get_result_flags(inst), 0, swizzle(src0, ONE, Y, Z, ONE), swizzle(src1, ONE, Y, ONE, W), 0); break; case TGSI_OPCODE_END: /* no-op */ break; case TGSI_OPCODE_EX2: src0 = src_vector(p, &inst->Src[0], fs); i915_emit_arith(p, A0_EXP, get_result_vector(p, &inst->Dst[0]), get_result_flags(inst), 0, swizzle(src0, X, X, X, X), 0, 0); break; case TGSI_OPCODE_FLR: emit_simple_arith(p, inst, A0_FLR, 1, fs); break; case TGSI_OPCODE_FRC: emit_simple_arith(p, inst, A0_FRC, 1, fs); break; case TGSI_OPCODE_KILL_IF: /* kill if src[0].x < 0 || src[0].y < 0 ... 
*/ src0 = src_vector(p, &inst->Src[0], fs); tmp = i915_get_utemp(p); i915_emit_texld(p, tmp, /* dest reg: a dummy reg */ A0_DEST_CHANNEL_ALL, /* dest writemask */ 0, /* sampler */ src0, /* coord*/ T0_TEXKILL, /* opcode */ 1); /* num_coord */ break; case TGSI_OPCODE_KILL: /* unconditional kill */ tmp = i915_get_utemp(p); i915_emit_texld(p, tmp, /* dest reg: a dummy reg */ A0_DEST_CHANNEL_ALL, /* dest writemask */ 0, /* sampler */ negate(swizzle(0, ONE, ONE, ONE, ONE), 1, 1, 1, 1), /* coord */ T0_TEXKILL, /* opcode */ 1); /* num_coord */ break; case TGSI_OPCODE_LG2: src0 = src_vector(p, &inst->Src[0], fs); i915_emit_arith(p, A0_LOG, get_result_vector(p, &inst->Dst[0]), get_result_flags(inst), 0, swizzle(src0, X, X, X, X), 0, 0); break; case TGSI_OPCODE_LIT: src0 = src_vector(p, &inst->Src[0], fs); tmp = i915_get_utemp(p); /* tmp = max( a.xyzw, a.00zw ) * XXX: Clamp tmp.w to -128..128 * tmp.y = log(tmp.y) * tmp.y = tmp.w * tmp.y * tmp.y = exp(tmp.y) * result = cmp (a.11-x1, a.1x01, a.1xy1 ) */ i915_emit_arith(p, A0_MAX, tmp, A0_DEST_CHANNEL_ALL, 0, src0, swizzle(src0, ZERO, ZERO, Z, W), 0); i915_emit_arith(p, A0_LOG, tmp, A0_DEST_CHANNEL_Y, 0, swizzle(tmp, Y, Y, Y, Y), 0, 0); i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_Y, 0, swizzle(tmp, ZERO, Y, ZERO, ZERO), swizzle(tmp, ZERO, W, ZERO, ZERO), 0); i915_emit_arith(p, A0_EXP, tmp, A0_DEST_CHANNEL_Y, 0, swizzle(tmp, Y, Y, Y, Y), 0, 0); i915_emit_arith(p, A0_CMP, get_result_vector(p, &inst->Dst[0]), get_result_flags(inst), 0, negate(swizzle(tmp, ONE, ONE, X, ONE), 0, 0, 1, 0), swizzle(tmp, ONE, X, ZERO, ONE), swizzle(tmp, ONE, X, Y, ONE)); break; case TGSI_OPCODE_LRP: src0 = src_vector(p, &inst->Src[0], fs); src1 = src_vector(p, &inst->Src[1], fs); src2 = src_vector(p, &inst->Src[2], fs); flags = get_result_flags(inst); tmp = i915_get_utemp(p); /* b*a + c*(1-a) * * b*a + c - ca * * tmp = b*a + c, * result = (-c)*a + tmp */ i915_emit_arith(p, A0_MAD, tmp, flags & A0_DEST_CHANNEL_ALL, 0, src1, src0, src2); 
i915_emit_arith(p, A0_MAD, get_result_vector(p, &inst->Dst[0]), flags, 0, negate(src2, 1, 1, 1, 1), src0, tmp); break; case TGSI_OPCODE_MAD: emit_simple_arith(p, inst, A0_MAD, 3, fs); break; case TGSI_OPCODE_MAX: emit_simple_arith(p, inst, A0_MAX, 2, fs); break; case TGSI_OPCODE_MIN: emit_simple_arith(p, inst, A0_MIN, 2, fs); break; case TGSI_OPCODE_MOV: emit_simple_arith(p, inst, A0_MOV, 1, fs); break; case TGSI_OPCODE_MUL: emit_simple_arith(p, inst, A0_MUL, 2, fs); break; case TGSI_OPCODE_NOP: break; case TGSI_OPCODE_POW: src0 = src_vector(p, &inst->Src[0], fs); src1 = src_vector(p, &inst->Src[1], fs); tmp = i915_get_utemp(p); flags = get_result_flags(inst); /* XXX: masking on intermediate values, here and elsewhere. */ i915_emit_arith(p, A0_LOG, tmp, A0_DEST_CHANNEL_X, 0, swizzle(src0, X, X, X, X), 0, 0); i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_X, 0, tmp, src1, 0); i915_emit_arith(p, A0_EXP, get_result_vector(p, &inst->Dst[0]), flags, 0, swizzle(tmp, X, X, X, X), 0, 0); break; case TGSI_OPCODE_RET: /* XXX: no-op? 
*/ break; case TGSI_OPCODE_RCP: src0 = src_vector(p, &inst->Src[0], fs); i915_emit_arith(p, A0_RCP, get_result_vector(p, &inst->Dst[0]), get_result_flags(inst), 0, swizzle(src0, X, X, X, X), 0, 0); break; case TGSI_OPCODE_RSQ: src0 = src_vector(p, &inst->Src[0], fs); i915_emit_arith(p, A0_RSQ, get_result_vector(p, &inst->Dst[0]), get_result_flags(inst), 0, swizzle(src0, X, X, X, X), 0, 0); break; case TGSI_OPCODE_SCS: src0 = src_vector(p, &inst->Src[0], fs); tmp = i915_get_utemp(p); /* * t0.xy = MUL x.xx11, x.x1111 ; x^2, x, 1, 1 * t0 = MUL t0.xyxy t0.xx11 ; x^4, x^3, x^2, x * t1 = MUL t0.xyyw t0.yz11 ; x^7 x^5 x^3 x * scs.x = DP4 t1, scs_sin_constants * t1 = MUL t0.xxz1 t0.z111 ; x^6 x^4 x^2 1 * scs.y = DP4 t1, scs_cos_constants */ i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_XY, 0, swizzle(src0, X, X, ONE, ONE), swizzle(src0, X, ONE, ONE, ONE), 0); i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_ALL, 0, swizzle(tmp, X, Y, X, Y), swizzle(tmp, X, X, ONE, ONE), 0); writemask = inst->Dst[0].Register.WriteMask; if (writemask & TGSI_WRITEMASK_Y) { uint tmp1; if (writemask & TGSI_WRITEMASK_X) tmp1 = i915_get_utemp(p); else tmp1 = tmp; i915_emit_arith(p, A0_MUL, tmp1, A0_DEST_CHANNEL_ALL, 0, swizzle(tmp, X, Y, Y, W), swizzle(tmp, X, Z, ONE, ONE), 0); i915_emit_arith(p, A0_DP4, get_result_vector(p, &inst->Dst[0]), A0_DEST_CHANNEL_Y, 0, swizzle(tmp1, W, Z, Y, X), i915_emit_const4fv(p, scs_sin_constants), 0); } if (writemask & TGSI_WRITEMASK_X) { i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_XYZ, 0, swizzle(tmp, X, X, Z, ONE), swizzle(tmp, Z, ONE, ONE, ONE), 0); i915_emit_arith(p, A0_DP4, get_result_vector(p, &inst->Dst[0]), A0_DEST_CHANNEL_X, 0, swizzle(tmp, ONE, Z, Y, X), i915_emit_const4fv(p, scs_cos_constants), 0); } break; case TGSI_OPCODE_SEQ: /* if we're both >= and <= then we're == */ src0 = src_vector(p, &inst->Src[0], fs); src1 = src_vector(p, &inst->Src[1], fs); tmp = i915_get_utemp(p); i915_emit_arith(p, A0_SGE, tmp, A0_DEST_CHANNEL_ALL, 0, src0, src1, 0); 
i915_emit_arith(p, A0_SGE, get_result_vector(p, &inst->Dst[0]), A0_DEST_CHANNEL_ALL, 0, src1, src0, 0); i915_emit_arith(p, A0_MUL, get_result_vector(p, &inst->Dst[0]), A0_DEST_CHANNEL_ALL, 0, get_result_vector(p, &inst->Dst[0]), tmp, 0); break; case TGSI_OPCODE_SGE: emit_simple_arith(p, inst, A0_SGE, 2, fs); break; case TGSI_OPCODE_SIN: src0 = src_vector(p, &inst->Src[0], fs); tmp = i915_get_utemp(p); i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_X, 0, src0, i915_emit_const1f(p, 1.0f / (float) (M_PI * 2.0)), 0); i915_emit_arith(p, A0_MOD, tmp, A0_DEST_CHANNEL_X, 0, tmp, 0, 0); /* * t0.xy = MUL x.xx11, x.x1111 ; x^2, x, 1, 1 * t0 = MUL t0.xyxy t0.xx11 ; x^4, x^3, x^2, x * t1 = MUL t0.xyyw t0.yz11 ; x^7 x^5 x^3 x * result = DP4 t1.wzyx, sin_constants */ i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_XY, 0, swizzle(tmp, X, X, ONE, ONE), swizzle(tmp, X, ONE, ONE, ONE), 0); i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_ALL, 0, swizzle(tmp, X, Y, X, Y), swizzle(tmp, X, X, ONE, ONE), 0); i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_ALL, 0, swizzle(tmp, X, Y, Y, W), swizzle(tmp, X, Z, ONE, ONE), 0); i915_emit_arith(p, A0_DP4, get_result_vector(p, &inst->Dst[0]), get_result_flags(inst), 0, swizzle(tmp, W, Z, Y, X), i915_emit_const4fv(p, sin_constants), 0); break; case TGSI_OPCODE_SLE: /* like SGE, but swap reg0, reg1 */ emit_simple_arith_swap2(p, inst, A0_SGE, 2, fs); break; case TGSI_OPCODE_SLT: emit_simple_arith(p, inst, A0_SLT, 2, fs); break; case TGSI_OPCODE_SGT: /* like SLT, but swap reg0, reg1 */ emit_simple_arith_swap2(p, inst, A0_SLT, 2, fs); break; case TGSI_OPCODE_SNE: /* if we're < or > then we're != */ src0 = src_vector(p, &inst->Src[0], fs); src1 = src_vector(p, &inst->Src[1], fs); tmp = i915_get_utemp(p); i915_emit_arith(p, A0_SLT, tmp, A0_DEST_CHANNEL_ALL, 0, src0, src1, 0); i915_emit_arith(p, A0_SLT, get_result_vector(p, &inst->Dst[0]), A0_DEST_CHANNEL_ALL, 0, src1, src0, 0); i915_emit_arith(p, A0_ADD, get_result_vector(p, &inst->Dst[0]), 
A0_DEST_CHANNEL_ALL, 0, get_result_vector(p, &inst->Dst[0]), tmp, 0); break; case TGSI_OPCODE_SSG: /* compute (src>0) - (src<0) */ src0 = src_vector(p, &inst->Src[0], fs); tmp = i915_get_utemp(p); i915_emit_arith(p, A0_SLT, tmp, A0_DEST_CHANNEL_ALL, 0, src0, swizzle(src0, ZERO, ZERO, ZERO, ZERO), 0); i915_emit_arith(p, A0_SLT, get_result_vector(p, &inst->Dst[0]), A0_DEST_CHANNEL_ALL, 0, swizzle(src0, ZERO, ZERO, ZERO, ZERO), src0, 0); i915_emit_arith(p, A0_ADD, get_result_vector(p, &inst->Dst[0]), A0_DEST_CHANNEL_ALL, 0, get_result_vector(p, &inst->Dst[0]), negate(tmp, 1, 1, 1, 1), 0); break; case TGSI_OPCODE_SUB: src0 = src_vector(p, &inst->Src[0], fs); src1 = src_vector(p, &inst->Src[1], fs); i915_emit_arith(p, A0_ADD, get_result_vector(p, &inst->Dst[0]), get_result_flags(inst), 0, src0, negate(src1, 1, 1, 1, 1), 0); break; case TGSI_OPCODE_TEX: emit_tex(p, inst, T0_TEXLD, fs); break; case TGSI_OPCODE_TRUNC: emit_simple_arith(p, inst, A0_TRC, 1, fs); break; case TGSI_OPCODE_TXB: emit_tex(p, inst, T0_TEXLDB, fs); break; case TGSI_OPCODE_TXP: emit_tex(p, inst, T0_TEXLDP, fs); break; case TGSI_OPCODE_XPD: /* Cross product: * result.x = src0.y * src1.z - src0.z * src1.y; * result.y = src0.z * src1.x - src0.x * src1.z; * result.z = src0.x * src1.y - src0.y * src1.x; * result.w = undef; */ src0 = src_vector(p, &inst->Src[0], fs); src1 = src_vector(p, &inst->Src[1], fs); tmp = i915_get_utemp(p); i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_ALL, 0, swizzle(src0, Z, X, Y, ONE), swizzle(src1, Y, Z, X, ONE), 0); i915_emit_arith(p, A0_MAD, get_result_vector(p, &inst->Dst[0]), get_result_flags(inst), 0, swizzle(src0, Y, Z, X, ONE), swizzle(src1, Z, X, Y, ONE), negate(tmp, 1, 1, 1, 0)); break; default: i915_program_error(p, "bad opcode %d", inst->Instruction.Opcode); p->error = 1; return; } i915_release_utemps(p); } static void i915_translate_token(struct i915_fp_compile *p, const union i915_full_token *token, struct i915_fragment_shader *fs) { struct i915_fragment_shader 
*ifs = p->shader; switch( token->Token.Type ) { case TGSI_TOKEN_TYPE_PROPERTY: /* * We only support one cbuf, but we still need to ignore the property * correctly so we don't hit the assert at the end of the switch case. */ assert(token->FullProperty.Property.PropertyName == TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS); break; case TGSI_TOKEN_TYPE_DECLARATION: if (token->FullDeclaration.Declaration.File == TGSI_FILE_CONSTANT) { uint i; for (i = token->FullDeclaration.Range.First; i <= MIN2(token->FullDeclaration.Range.Last, I915_MAX_CONSTANT - 1); i++) { assert(ifs->constant_flags[i] == 0x0); ifs->constant_flags[i] = I915_CONSTFLAG_USER; ifs->num_constants = MAX2(ifs->num_constants, i + 1); } } else if (token->FullDeclaration.Declaration.File == TGSI_FILE_TEMPORARY) { uint i; for (i = token->FullDeclaration.Range.First; i <= token->FullDeclaration.Range.Last; i++) { if (i >= I915_MAX_TEMPORARY) debug_printf("Too many temps (%d)\n",i); else /* XXX just use shader->info->file_mask[TGSI_FILE_TEMPORARY] */ p->temp_flag |= (1 << i); /* mark temp as used */ } } break; case TGSI_TOKEN_TYPE_IMMEDIATE: { const struct tgsi_full_immediate *imm = &token->FullImmediate; const uint pos = p->num_immediates++; uint j; assert( imm->Immediate.NrTokens <= 4 + 1 ); for (j = 0; j < imm->Immediate.NrTokens - 1; j++) { p->immediates[pos][j] = imm->u[j].Float; } } break; case TGSI_TOKEN_TYPE_INSTRUCTION: if (p->first_instruction) { /* resolve location of immediates */ uint i, j; for (i = 0; i < p->num_immediates; i++) { /* find constant slot for this immediate */ for (j = 0; j < I915_MAX_CONSTANT; j++) { if (ifs->constant_flags[j] == 0x0) { memcpy(ifs->constants[j], p->immediates[i], 4 * sizeof(float)); /*printf("immediate %d maps to const %d\n", i, j);*/ ifs->constant_flags[j] = 0xf; /* all four comps used */ p->immediates_map[i] = j; ifs->num_constants = MAX2(ifs->num_constants, j + 1); break; } } } p->first_instruction = FALSE; } i915_translate_instruction(p, &token->FullInstruction, fs); 
break; default: assert( 0 ); } } /** * Translate TGSI fragment shader into i915 hardware instructions. * \param p the translation state * \param tokens the TGSI token array */ static void i915_translate_instructions(struct i915_fp_compile *p, const struct i915_token_list *tokens, struct i915_fragment_shader *fs) { int i; for(i = 0; iNumTokens; i++) { i915_translate_token(p, &tokens->Tokens[i], fs); } } static struct i915_fp_compile * i915_init_compile(struct i915_context *i915, struct i915_fragment_shader *ifs) { struct i915_fp_compile *p = CALLOC_STRUCT(i915_fp_compile); int i; p->shader = ifs; /* Put new constants at end of const buffer, growing downward. * The problem is we don't know how many user-defined constants might * be specified with pipe->set_constant_buffer(). * Should pre-scan the user's program to determine the highest-numbered * constant referenced. */ ifs->num_constants = 0; memset(ifs->constant_flags, 0, sizeof(ifs->constant_flags)); memset(&p->register_phases, 0, sizeof(p->register_phases)); for (i = 0; i < I915_TEX_UNITS; i++) ifs->generic_mapping[i] = -1; p->first_instruction = TRUE; p->nr_tex_indirect = 1; /* correct? */ p->nr_tex_insn = 0; p->nr_alu_insn = 0; p->nr_decl_insn = 0; p->csr = p->program; p->decl = p->declarations; p->decl_s = 0; p->decl_t = 0; p->temp_flag = ~0x0 << I915_MAX_TEMPORARY; p->utemp_flag = ~0x7; /* initialize the first program word */ *(p->decl++) = _3DSTATE_PIXEL_SHADER_PROGRAM; return p; } /* Copy compile results to the fragment program struct and destroy the * compilation context. 
*/ static void i915_fini_compile(struct i915_context *i915, struct i915_fp_compile *p) { struct i915_fragment_shader *ifs = p->shader; unsigned long program_size = (unsigned long) (p->csr - p->program); unsigned long decl_size = (unsigned long) (p->decl - p->declarations); if (p->nr_tex_indirect > I915_MAX_TEX_INDIRECT) debug_printf("Exceeded max nr indirect texture lookups\n"); if (p->nr_tex_insn > I915_MAX_TEX_INSN) i915_program_error(p, "Exceeded max TEX instructions"); if (p->nr_alu_insn > I915_MAX_ALU_INSN) i915_program_error(p, "Exceeded max ALU instructions"); if (p->nr_decl_insn > I915_MAX_DECL_INSN) i915_program_error(p, "Exceeded max DECL instructions"); if (p->error) { p->NumNativeInstructions = 0; p->NumNativeAluInstructions = 0; p->NumNativeTexInstructions = 0; p->NumNativeTexIndirections = 0; i915_use_passthrough_shader(ifs); } else { p->NumNativeInstructions = p->nr_alu_insn + p->nr_tex_insn + p->nr_decl_insn; p->NumNativeAluInstructions = p->nr_alu_insn; p->NumNativeTexInstructions = p->nr_tex_insn; p->NumNativeTexIndirections = p->nr_tex_indirect; /* patch in the program length */ p->declarations[0] |= program_size + decl_size - 2; /* Copy compilation results to fragment program struct: */ assert(!ifs->decl); assert(!ifs->program); ifs->decl = (uint *) MALLOC(decl_size * sizeof(uint)); ifs->program = (uint *) MALLOC(program_size * sizeof(uint)); if (ifs->decl) { ifs->decl_len = decl_size; memcpy(ifs->decl, p->declarations, decl_size * sizeof(uint)); } if (ifs->program) { ifs->program_len = program_size; memcpy(ifs->program, p->program, program_size * sizeof(uint)); } } /* Release the compilation struct: */ FREE(p); } /** * Rather than trying to intercept and jiggle depth writes during * emit, just move the value into its correct position at the end of * the program: */ static void i915_fixup_depth_write(struct i915_fp_compile *p) { /* XXX assuming pos/depth is always in output[0] */ if (p->shader->info.output_semantic_name[0] == 
TGSI_SEMANTIC_POSITION) { const uint depth = UREG(REG_TYPE_OD, 0); i915_emit_arith(p, A0_MOV, /* opcode */ depth, /* dest reg */ A0_DEST_CHANNEL_W, /* write mask */ 0, /* saturate? */ swizzle(depth, X, Y, Z, Z), /* src0 */ 0, 0 /* src1, src2 */); } } void i915_translate_fragment_program( struct i915_context *i915, struct i915_fragment_shader *fs) { struct i915_fp_compile *p; const struct tgsi_token *tokens = fs->state.tokens; struct i915_token_list* i_tokens; #if 0 tgsi_dump(tokens, 0); #endif /* hw doesn't seem to like empty frag programs, even when the depth write * fixup gets emitted below - may that one is fishy, too? */ if (fs->info.num_instructions == 1) { i915_use_passthrough_shader(fs); return; } p = i915_init_compile(i915, fs); i_tokens = i915_optimize(tokens); i915_translate_instructions(p, i_tokens, fs); i915_fixup_depth_write(p); i915_fini_compile(i915, p); i915_optimize_free(i_tokens); #if 0 i915_disassemble_program(NULL, fs->program, fs->program_len); #endif }