diff options
Diffstat (limited to 'src/gallium')
73 files changed, 5214 insertions, 3266 deletions
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources index 28a176d..2807c78 100644 --- a/src/gallium/auxiliary/Makefile.sources +++ b/src/gallium/auxiliary/Makefile.sources @@ -165,6 +165,7 @@ GALLIVM_SOURCES := \ gallivm/lp_bld_conv.c \ gallivm/lp_bld_flow.c \ gallivm/lp_bld_format_aos.c \ + gallivm/lp_bld_format_aos_array.c \ gallivm/lp_bld_format_soa.c \ gallivm/lp_bld_format_yuv.c \ gallivm/lp_bld_gather.c \ @@ -187,7 +188,6 @@ GALLIVM_SOURCES := \ gallivm/lp_bld_type.c \ draw/draw_llvm.c \ draw/draw_llvm_sample.c \ - draw/draw_llvm_translate.c \ draw/draw_vs_llvm.c \ draw/draw_pt_fetch_shade_pipeline_llvm.c diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c index 20260c1..be30b7d 100644 --- a/src/gallium/auxiliary/draw/draw_context.c +++ b/src/gallium/auxiliary/draw/draw_context.c @@ -70,8 +70,7 @@ draw_get_option_use_llvm(void) * Create new draw module context with gallivm state for LLVM JIT. */ static struct draw_context * -draw_create_context(struct pipe_context *pipe, boolean try_llvm, - struct gallivm_state *gallivm) +draw_create_context(struct pipe_context *pipe, boolean try_llvm) { struct draw_context *draw = CALLOC_STRUCT( draw_context ); if (draw == NULL) @@ -79,16 +78,7 @@ draw_create_context(struct pipe_context *pipe, boolean try_llvm, #if HAVE_LLVM if (try_llvm && draw_get_option_use_llvm()) { - if (!gallivm) { - gallivm = gallivm_create(); - draw->own_gallivm = gallivm; - } - - if (!gallivm) - goto err_destroy; - - draw->llvm = draw_llvm_create(draw, gallivm); - + draw->llvm = draw_llvm_create(draw); if (!draw->llvm) goto err_destroy; } @@ -114,7 +104,7 @@ err_out: struct draw_context * draw_create(struct pipe_context *pipe) { - return draw_create_context(pipe, TRUE, NULL); + return draw_create_context(pipe, TRUE); } @@ -124,17 +114,7 @@ draw_create(struct pipe_context *pipe) struct draw_context * draw_create_no_llvm(struct pipe_context *pipe) { - return 
draw_create_context(pipe, FALSE, NULL); -} - - -/** - * Create new draw module context with gallivm state for LLVM JIT. - */ -struct draw_context * -draw_create_gallivm(struct pipe_context *pipe, struct gallivm_state *gallivm) -{ - return draw_create_context(pipe, TRUE, gallivm); + return draw_create_context(pipe, FALSE); } @@ -213,9 +193,6 @@ void draw_destroy( struct draw_context *draw ) #ifdef HAVE_LLVM if (draw->llvm) draw_llvm_destroy( draw->llvm ); - - if (draw->own_gallivm) - gallivm_destroy(draw->own_gallivm); #endif FREE( draw ); diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h index 852cbc3..cc95600 100644 --- a/src/gallium/auxiliary/draw/draw_context.h +++ b/src/gallium/auxiliary/draw/draw_context.h @@ -48,7 +48,6 @@ struct draw_vertex_shader; struct draw_geometry_shader; struct draw_fragment_shader; struct tgsi_sampler; -struct gallivm_state; /* * structure to contain driver internal information @@ -67,9 +66,6 @@ struct draw_context *draw_create( struct pipe_context *pipe ); struct draw_context *draw_create_no_llvm(struct pipe_context *pipe); -struct draw_context * -draw_create_gallivm(struct pipe_context *pipe, struct gallivm_state *gallivm); - void draw_destroy( struct draw_context *draw ); void draw_flush(struct draw_context *draw); diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c index e08221e..8d9b530 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.c +++ b/src/gallium/auxiliary/draw/draw_llvm.c @@ -43,6 +43,8 @@ #include "gallivm/lp_bld_intr.h" #include "gallivm/lp_bld_init.h" #include "gallivm/lp_bld_type.h" +#include "gallivm/lp_bld_pack.h" +#include "gallivm/lp_bld_format.h" #include "tgsi/tgsi_exec.h" #include "tgsi/tgsi_dump.h" @@ -56,40 +58,6 @@ #define DEBUG_STORE 0 -/** - * This function is called by the gallivm "garbage collector" when - * the LLVM global data structures are freed. We must free all LLVM-related - * data. 
Specifically, all JIT'd shader variants. - */ -static void -draw_llvm_garbage_collect_callback(void *cb_data) -{ - struct draw_llvm *llvm = (struct draw_llvm *) cb_data; - struct draw_context *draw = llvm->draw; - struct draw_llvm_variant_list_item *li; - - /* Ensure prepare will be run and shaders recompiled */ - assert(!draw->suspend_flushing); - draw_do_flush(draw, DRAW_FLUSH_STATE_CHANGE); - - /* free all shader variants */ - li = first_elem(&llvm->vs_variants_list); - while (!at_end(&llvm->vs_variants_list, li)) { - struct draw_llvm_variant_list_item *next = next_elem(li); - draw_llvm_destroy_variant(li->base); - li = next; - } - - /* Null-out these pointers so they get remade next time they're needed. - * See the accessor functions below. - */ - llvm->context_ptr_type = NULL; - llvm->buffer_ptr_type = NULL; - llvm->vb_ptr_type = NULL; - llvm->vertex_header_ptr_type = NULL; -} - - static void draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *var, boolean elts); @@ -316,56 +284,56 @@ create_jit_vertex_header(struct gallivm_state *gallivm, int data_elems) * Create LLVM types for various structures. 
*/ static void -create_jit_types(struct draw_llvm *llvm) +create_jit_types(struct draw_llvm_variant *variant) { - struct gallivm_state *gallivm = llvm->gallivm; + struct gallivm_state *gallivm = variant->gallivm; LLVMTypeRef texture_type, context_type, buffer_type, vb_type; texture_type = create_jit_texture_type(gallivm, "texture"); context_type = create_jit_context_type(gallivm, texture_type, "draw_jit_context"); - llvm->context_ptr_type = LLVMPointerType(context_type, 0); + variant->context_ptr_type = LLVMPointerType(context_type, 0); buffer_type = LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 8), 0); - llvm->buffer_ptr_type = LLVMPointerType(buffer_type, 0); + variant->buffer_ptr_type = LLVMPointerType(buffer_type, 0); vb_type = create_jit_vertex_buffer_type(gallivm, "pipe_vertex_buffer"); - llvm->vb_ptr_type = LLVMPointerType(vb_type, 0); + variant->vb_ptr_type = LLVMPointerType(vb_type, 0); } static LLVMTypeRef -get_context_ptr_type(struct draw_llvm *llvm) +get_context_ptr_type(struct draw_llvm_variant *variant) { - if (!llvm->context_ptr_type) - create_jit_types(llvm); - return llvm->context_ptr_type; + if (!variant->context_ptr_type) + create_jit_types(variant); + return variant->context_ptr_type; } static LLVMTypeRef -get_buffer_ptr_type(struct draw_llvm *llvm) +get_buffer_ptr_type(struct draw_llvm_variant *variant) { - if (!llvm->buffer_ptr_type) - create_jit_types(llvm); - return llvm->buffer_ptr_type; + if (!variant->buffer_ptr_type) + create_jit_types(variant); + return variant->buffer_ptr_type; } static LLVMTypeRef -get_vb_ptr_type(struct draw_llvm *llvm) +get_vb_ptr_type(struct draw_llvm_variant *variant) { - if (!llvm->vb_ptr_type) - create_jit_types(llvm); - return llvm->vb_ptr_type; + if (!variant->vb_ptr_type) + create_jit_types(variant); + return variant->vb_ptr_type; } static LLVMTypeRef -get_vertex_header_ptr_type(struct draw_llvm *llvm) +get_vertex_header_ptr_type(struct draw_llvm_variant *variant) { - if 
(!llvm->vertex_header_ptr_type) - create_jit_types(llvm); - return llvm->vertex_header_ptr_type; + if (!variant->vertex_header_ptr_type) + create_jit_types(variant); + return variant->vertex_header_ptr_type; } @@ -373,7 +341,7 @@ get_vertex_header_ptr_type(struct draw_llvm *llvm) * Create per-context LLVM info. */ struct draw_llvm * -draw_llvm_create(struct draw_context *draw, struct gallivm_state *gallivm) +draw_llvm_create(struct draw_context *draw) { struct draw_llvm *llvm; @@ -384,18 +352,10 @@ draw_llvm_create(struct draw_context *draw, struct gallivm_state *gallivm) lp_build_init(); llvm->draw = draw; - llvm->gallivm = gallivm; - - if (gallivm_debug & GALLIVM_DEBUG_IR) { - LLVMDumpModule(llvm->gallivm->module); - } llvm->nr_variants = 0; make_empty_list(&llvm->vs_variants_list); - gallivm_register_garbage_collector_callback( - draw_llvm_garbage_collect_callback, llvm); - return llvm; } @@ -406,9 +366,6 @@ draw_llvm_create(struct draw_context *draw, struct gallivm_state *gallivm) void draw_llvm_destroy(struct draw_llvm *llvm) { - gallivm_remove_garbage_collector_callback( - draw_llvm_garbage_collect_callback, llvm); - /* XXX free other draw_llvm data? 
*/ FREE(llvm); } @@ -435,15 +392,27 @@ draw_llvm_create_variant(struct draw_llvm *llvm, variant->llvm = llvm; + variant->gallivm = gallivm_create(); + + create_jit_types(variant); + memcpy(&variant->key, key, shader->variant_key_size); - vertex_header = create_jit_vertex_header(llvm->gallivm, num_inputs); + vertex_header = create_jit_vertex_header(variant->gallivm, num_inputs); - llvm->vertex_header_ptr_type = LLVMPointerType(vertex_header, 0); + variant->vertex_header_ptr_type = LLVMPointerType(vertex_header, 0); draw_llvm_generate(llvm, variant, FALSE); /* linear */ draw_llvm_generate(llvm, variant, TRUE); /* elts */ + gallivm_compile_module(variant->gallivm); + + variant->jit_func = (draw_jit_vert_func) + gallivm_jit_function(variant->gallivm, variant->function); + + variant->jit_func_elts = (draw_jit_vert_func_elts) + gallivm_jit_function(variant->gallivm, variant->function_elts); + variant->shader = shader; variant->list_item_global.base = variant; variant->list_item_local.base = variant; @@ -455,8 +424,9 @@ draw_llvm_create_variant(struct draw_llvm *llvm, static void -generate_vs(struct draw_llvm *llvm, +generate_vs(struct draw_llvm_variant *variant, LLVMBuilderRef builder, + struct lp_type vs_type, LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS], const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS], const struct lp_bld_tgsi_system_values *system_values, @@ -464,21 +434,11 @@ generate_vs(struct draw_llvm *llvm, struct lp_build_sampler_soa *draw_sampler, boolean clamp_vertex_color) { + struct draw_llvm *llvm = variant->llvm; const struct tgsi_token *tokens = llvm->draw->vs.vertex_shader->state.tokens; - struct lp_type vs_type; - LLVMValueRef consts_ptr = draw_jit_context_vs_constants(llvm->gallivm, context_ptr); + LLVMValueRef consts_ptr = draw_jit_context_vs_constants(variant->gallivm, context_ptr); struct lp_build_sampler_soa *sampler = 0; - memset(&vs_type, 0, sizeof vs_type); - vs_type.floating = TRUE; /* floating point values */ - vs_type.sign = TRUE; /* values are 
signed */ - vs_type.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */ - vs_type.width = 32; /* 32-bit float */ - vs_type.length = 4; /* 4 elements per vector */ -#if 0 - num_vs = 4; /* number of vertices per block */ -#endif - if (gallivm_debug & GALLIVM_DEBUG_IR) { tgsi_dump(tokens, 0); } @@ -486,7 +446,7 @@ generate_vs(struct draw_llvm *llvm, if (llvm->draw->num_sampler_views && llvm->draw->num_samplers) sampler = draw_sampler; - lp_build_tgsi_soa(llvm->gallivm, + lp_build_tgsi_soa(variant->gallivm, tokens, vs_type, NULL /*struct lp_build_mask_context *mask*/, @@ -503,7 +463,7 @@ generate_vs(struct draw_llvm *llvm, unsigned chan, attrib; struct lp_build_context bld; struct tgsi_shader_info* info = &llvm->draw->vs.vertex_shader->info; - lp_build_context_init(&bld, llvm->gallivm, vs_type); + lp_build_context_init(&bld, variant->gallivm, vs_type); for (attrib = 0; attrib < info->num_outputs; ++attrib) { for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) { @@ -531,25 +491,6 @@ generate_vs(struct draw_llvm *llvm, } -#if DEBUG_STORE -static void print_vectorf(LLVMBuilderRef builder, - LLVMValueRef vec) -{ - LLVMValueRef val[4]; - val[0] = LLVMBuildExtractElement(builder, vec, - lp_build_const_int32(gallivm, 0), ""); - val[1] = LLVMBuildExtractElement(builder, vec, - lp_build_const_int32(gallivm, 1), ""); - val[2] = LLVMBuildExtractElement(builder, vec, - lp_build_const_int32(gallivm, 2), ""); - val[3] = LLVMBuildExtractElement(builder, vec, - lp_build_const_int32(gallivm, 3), ""); - lp_build_printf(builder, "vector = [%f, %f, %f, %f]\n", - val[0], val[1], val[2], val[3]); -} -#endif - - static void generate_fetch(struct gallivm_state *gallivm, LLVMValueRef vbuffers_ptr, @@ -559,6 +500,8 @@ generate_fetch(struct gallivm_state *gallivm, LLVMValueRef index, LLVMValueRef instance_id) { + const struct util_format_description *format_desc = util_format_description(velem->src_format); + LLVMValueRef zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)); 
LLVMBuilderRef builder = gallivm->builder; LLVMValueRef indices = LLVMConstInt(LLVMInt64TypeInContext(gallivm->context), @@ -587,118 +530,47 @@ generate_fetch(struct gallivm_state *gallivm, lp_build_const_int32(gallivm, velem->src_offset), ""); - /*lp_build_printf(builder, "vbuf index = %d, stride is %d\n", indices, stride);*/ +/* lp_build_printf(gallivm, "vbuf index = %d, stride is %d\n", indices, stride);*/ vbuffer_ptr = LLVMBuildGEP(builder, vbuffer_ptr, &stride, 1, ""); - *res = draw_llvm_translate_from(gallivm, vbuffer_ptr, velem->src_format); -} - - -static LLVMValueRef -aos_to_soa(struct gallivm_state *gallivm, - LLVMValueRef val0, - LLVMValueRef val1, - LLVMValueRef val2, - LLVMValueRef val3, - LLVMValueRef channel) -{ - LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef ex, res; - - ex = LLVMBuildExtractElement(builder, val0, - channel, ""); - res = LLVMBuildInsertElement(builder, - LLVMConstNull(LLVMTypeOf(val0)), - ex, - lp_build_const_int32(gallivm, 0), - ""); - - ex = LLVMBuildExtractElement(builder, val1, - channel, ""); - res = LLVMBuildInsertElement(builder, - res, ex, - lp_build_const_int32(gallivm, 1), - ""); - - ex = LLVMBuildExtractElement(builder, val2, - channel, ""); - res = LLVMBuildInsertElement(builder, - res, ex, - lp_build_const_int32(gallivm, 2), - ""); - - ex = LLVMBuildExtractElement(builder, val3, - channel, ""); - res = LLVMBuildInsertElement(builder, - res, ex, - lp_build_const_int32(gallivm, 3), - ""); - - return res; + *res = lp_build_fetch_rgba_aos(gallivm, + format_desc, + lp_float32_vec4_type(), + vbuffer_ptr, + zero, zero, zero); } - static void -soa_to_aos(struct gallivm_state *gallivm, - LLVMValueRef soa[TGSI_NUM_CHANNELS], - LLVMValueRef aos[TGSI_NUM_CHANNELS]) +convert_to_soa(struct gallivm_state *gallivm, + LLVMValueRef (*src_aos)[LP_MAX_VECTOR_WIDTH / 32], + LLVMValueRef (*dst_soa)[TGSI_NUM_CHANNELS], + unsigned num_attribs, const struct lp_type soa_type) { - LLVMBuilderRef builder = gallivm->builder; - 
LLVMValueRef comp; - int i = 0; + unsigned i, j, k; + struct lp_type aos_channel_type = soa_type; debug_assert(TGSI_NUM_CHANNELS == 4); + debug_assert((soa_type.length % TGSI_NUM_CHANNELS) == 0); - aos[0] = LLVMConstNull(LLVMTypeOf(soa[0])); - aos[1] = aos[2] = aos[3] = aos[0]; - - for (i = 0; i < TGSI_NUM_CHANNELS; ++i) { - LLVMValueRef channel = lp_build_const_int32(gallivm, i); - - comp = LLVMBuildExtractElement(builder, soa[i], - lp_build_const_int32(gallivm, 0), ""); - aos[0] = LLVMBuildInsertElement(builder, aos[0], comp, channel, ""); - - comp = LLVMBuildExtractElement(builder, soa[i], - lp_build_const_int32(gallivm, 1), ""); - aos[1] = LLVMBuildInsertElement(builder, aos[1], comp, channel, ""); + aos_channel_type.length >>= 1; - comp = LLVMBuildExtractElement(builder, soa[i], - lp_build_const_int32(gallivm, 2), ""); - aos[2] = LLVMBuildInsertElement(builder, aos[2], comp, channel, ""); - - comp = LLVMBuildExtractElement(builder, soa[i], - lp_build_const_int32(gallivm, 3), ""); - aos[3] = LLVMBuildInsertElement(builder, aos[3], comp, channel, ""); + for (i = 0; i < num_attribs; ++i) { + LLVMValueRef aos_channels[TGSI_NUM_CHANNELS]; + unsigned pixels_per_channel = soa_type.length / TGSI_NUM_CHANNELS; - } -} + for (j = 0; j < TGSI_NUM_CHANNELS; ++j) { + LLVMValueRef channel[LP_MAX_VECTOR_LENGTH]; + assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH); -static void -convert_to_soa(struct gallivm_state *gallivm, - LLVMValueRef (*aos)[TGSI_NUM_CHANNELS], - LLVMValueRef (*soa)[TGSI_NUM_CHANNELS], - int num_attribs) -{ - int i; + for (k = 0; k < pixels_per_channel; ++k) { + channel[k] = src_aos[i][j + TGSI_NUM_CHANNELS * k]; + } - debug_assert(TGSI_NUM_CHANNELS == 4); + aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel); + } - for (i = 0; i < num_attribs; ++i) { - LLVMValueRef val0 = aos[i][0]; - LLVMValueRef val1 = aos[i][1]; - LLVMValueRef val2 = aos[i][2]; - LLVMValueRef val3 = aos[i][3]; - - soa[i][0] = aos_to_soa(gallivm, 
val0, val1, val2, val3, - lp_build_const_int32(gallivm, 0)); - soa[i][1] = aos_to_soa(gallivm, val0, val1, val2, val3, - lp_build_const_int32(gallivm, 1)); - soa[i][2] = aos_to_soa(gallivm, val0, val1, val2, val3, - lp_build_const_int32(gallivm, 2)); - soa[i][3] = aos_to_soa(gallivm, val0, val1, val2, val3, - lp_build_const_int32(gallivm, 3)); + lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa[i]); } } @@ -707,89 +579,34 @@ static void store_aos(struct gallivm_state *gallivm, LLVMValueRef io_ptr, LLVMValueRef index, - LLVMValueRef value, - LLVMValueRef clipmask, boolean have_clipdist) + LLVMValueRef value) { + LLVMTypeRef data_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, lp_float32_vec4_type()), 0); LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef id_ptr = draw_jit_header_id(gallivm, io_ptr); LLVMValueRef data_ptr = draw_jit_header_data(gallivm, io_ptr); LLVMValueRef indices[3]; - LLVMValueRef val; - int vertex_id_pad_edgeflag; indices[0] = lp_build_const_int32(gallivm, 0); indices[1] = index; indices[2] = lp_build_const_int32(gallivm, 0); - /* If this assertion fails, it means we need to update the bit twidding - * code here. See struct vertex_header in draw_private.h. 
- */ - assert(DRAW_TOTAL_CLIP_PLANES==14); - /* initialize vertex id:16 = 0xffff, have_clipdist:1 = 0, edgeflag:1 = 1 */ - vertex_id_pad_edgeflag = (0xffff << 16) | (1 << DRAW_TOTAL_CLIP_PLANES); - if (have_clipdist) - vertex_id_pad_edgeflag |= 1 << (DRAW_TOTAL_CLIP_PLANES+1); - val = lp_build_const_int32(gallivm, vertex_id_pad_edgeflag); - /* OR with the clipmask */ - val = LLVMBuildOr(builder, val, clipmask, ""); - - /* store vertex header */ - LLVMBuildStore(builder, val, id_ptr); - - #if DEBUG_STORE - lp_build_printf(builder, " ---- %p storing attribute %d (io = %p)\n", data_ptr, index, io_ptr); -#endif -#if 0 - /*lp_build_printf(builder, " ---- %p storing at %d (%p) ", io_ptr, index, data_ptr); - print_vectorf(builder, value);*/ - data_ptr = LLVMBuildBitCast(builder, data_ptr, - LLVMPointerType(LLVMArrayType(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), 0), 0), - "datavec"); - data_ptr = LLVMBuildGEP(builder, data_ptr, indices, 2, ""); - - LLVMBuildStore(builder, value, data_ptr); -#else - { - LLVMValueRef x, y, z, w; - LLVMValueRef idx0, idx1, idx2, idx3; - LLVMValueRef gep0, gep1, gep2, gep3; - data_ptr = LLVMBuildGEP(builder, data_ptr, indices, 3, ""); - - idx0 = lp_build_const_int32(gallivm, 0); - idx1 = lp_build_const_int32(gallivm, 1); - idx2 = lp_build_const_int32(gallivm, 2); - idx3 = lp_build_const_int32(gallivm, 3); - - x = LLVMBuildExtractElement(builder, value, - idx0, ""); - y = LLVMBuildExtractElement(builder, value, - idx1, ""); - z = LLVMBuildExtractElement(builder, value, - idx2, ""); - w = LLVMBuildExtractElement(builder, value, - idx3, ""); - - gep0 = LLVMBuildGEP(builder, data_ptr, &idx0, 1, ""); - gep1 = LLVMBuildGEP(builder, data_ptr, &idx1, 1, ""); - gep2 = LLVMBuildGEP(builder, data_ptr, &idx2, 1, ""); - gep3 = LLVMBuildGEP(builder, data_ptr, &idx3, 1, ""); - - /*lp_build_printf(builder, "##### x = %f (%p), y = %f (%p), z = %f (%p), w = %f (%p)\n", - x, gep0, y, gep1, z, gep2, w, gep3);*/ - LLVMBuildStore(builder, x, 
gep0); - LLVMBuildStore(builder, y, gep1); - LLVMBuildStore(builder, z, gep2); - LLVMBuildStore(builder, w, gep3); - } + lp_build_printf(gallivm, " ---- %p storing attribute %d (io = %p)\n", data_ptr, index, io_ptr); #endif + + data_ptr = LLVMBuildGEP(builder, data_ptr, indices, 3, ""); + data_ptr = LLVMBuildPointerCast(builder, data_ptr, data_ptr_type, ""); + + /* Unaligned store due to the vertex header */ + lp_set_store_alignment(LLVMBuildStore(builder, value, data_ptr), sizeof(float)); } static void store_aos_array(struct gallivm_state *gallivm, + struct lp_type soa_type, LLVMValueRef io_ptr, - LLVMValueRef aos[TGSI_NUM_CHANNELS], + LLVMValueRef* aos, int attrib, int num_outputs, LLVMValueRef clipmask, @@ -797,42 +614,49 @@ store_aos_array(struct gallivm_state *gallivm, { LLVMBuilderRef builder = gallivm->builder; LLVMValueRef attr_index = lp_build_const_int32(gallivm, attrib); - LLVMValueRef ind0 = lp_build_const_int32(gallivm, 0); - LLVMValueRef ind1 = lp_build_const_int32(gallivm, 1); - LLVMValueRef ind2 = lp_build_const_int32(gallivm, 2); - LLVMValueRef ind3 = lp_build_const_int32(gallivm, 3); - LLVMValueRef io0_ptr, io1_ptr, io2_ptr, io3_ptr; - LLVMValueRef clipmask0, clipmask1, clipmask2, clipmask3; + LLVMValueRef inds[LP_MAX_VECTOR_WIDTH / 32]; + LLVMValueRef io_ptrs[LP_MAX_VECTOR_WIDTH / 32]; + int vector_length = soa_type.length; + int i; debug_assert(TGSI_NUM_CHANNELS == 4); - io0_ptr = LLVMBuildGEP(builder, io_ptr, - &ind0, 1, ""); - io1_ptr = LLVMBuildGEP(builder, io_ptr, - &ind1, 1, ""); - io2_ptr = LLVMBuildGEP(builder, io_ptr, - &ind2, 1, ""); - io3_ptr = LLVMBuildGEP(builder, io_ptr, - &ind3, 1, ""); - - clipmask0 = LLVMBuildExtractElement(builder, clipmask, - ind0, ""); - clipmask1 = LLVMBuildExtractElement(builder, clipmask, - ind1, ""); - clipmask2 = LLVMBuildExtractElement(builder, clipmask, - ind2, ""); - clipmask3 = LLVMBuildExtractElement(builder, clipmask, - ind3, ""); + for (i = 0; i < vector_length; i++) { + inds[i] = 
lp_build_const_int32(gallivm, i); + io_ptrs[i] = LLVMBuildGEP(builder, io_ptr, &inds[i], 1, ""); + } + if (attrib == 0) { + /* store vertex header for each of the n vertices */ + LLVMValueRef val, cliptmp; + int vertex_id_pad_edgeflag; + + /* If this assertion fails, it means we need to update the bit twidding + * code here. See struct vertex_header in draw_private.h. + */ + assert(DRAW_TOTAL_CLIP_PLANES==14); + /* initialize vertex id:16 = 0xffff, have_clipdist:1 = 0, edgeflag:1 = 1 */ + vertex_id_pad_edgeflag = (0xffff << 16) | (1 << DRAW_TOTAL_CLIP_PLANES); + if (have_clipdist) + vertex_id_pad_edgeflag |= 1 << (DRAW_TOTAL_CLIP_PLANES+1); + val = lp_build_const_int_vec(gallivm, lp_int_type(soa_type), vertex_id_pad_edgeflag); + /* OR with the clipmask */ + cliptmp = LLVMBuildOr(builder, val, clipmask, ""); + for (i = 0; i < vector_length; i++) { + LLVMValueRef id_ptr = draw_jit_header_id(gallivm, io_ptrs[i]); + val = LLVMBuildExtractElement(builder, cliptmp, inds[i], ""); + LLVMBuildStore(builder, val, id_ptr); #if DEBUG_STORE - lp_build_printf(builder, "io = %p, indexes[%d, %d, %d, %d]\n, clipmask0 = %x, clipmask1 = %x, clipmask2 = %x, clipmask3 = %x\n", - io_ptr, ind0, ind1, ind2, ind3, clipmask0, clipmask1, clipmask2, clipmask3); + lp_build_printf(gallivm, "io = %p, index %d\n, clipmask = %x\n", + io_ptrs[i], inds[i], val); #endif - /* store for each of the 4 vertices */ - store_aos(gallivm, io0_ptr, attr_index, aos[0], clipmask0, have_clipdist); - store_aos(gallivm, io1_ptr, attr_index, aos[1], clipmask1, have_clipdist); - store_aos(gallivm, io2_ptr, attr_index, aos[2], clipmask2, have_clipdist); - store_aos(gallivm, io3_ptr, attr_index, aos[3], clipmask3, have_clipdist); + } + } + + /* store for each of the n vertices */ + for (i = 0; i < vector_length; i++) { + store_aos(gallivm, io_ptrs[i], attr_index, aos[i]); + } } @@ -842,33 +666,53 @@ convert_to_aos(struct gallivm_state *gallivm, LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS], LLVMValueRef clipmask, int 
num_outputs, - int max_vertices, boolean have_clipdist) + struct lp_type soa_type, + boolean have_clipdist) { LLVMBuilderRef builder = gallivm->builder; - unsigned chan, attrib; + unsigned chan, attrib, i; #if DEBUG_STORE - lp_build_printf(builder, " # storing begin\n"); + lp_build_printf(gallivm, " # storing begin\n"); #endif for (attrib = 0; attrib < num_outputs; ++attrib) { - LLVMValueRef soa[4]; - LLVMValueRef aos[4]; + LLVMValueRef soa[TGSI_NUM_CHANNELS]; + LLVMValueRef aos[LP_MAX_VECTOR_WIDTH / 32]; for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) { if (outputs[attrib][chan]) { LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], ""); lp_build_name(out, "output%u.%c", attrib, "xyzw"[chan]); - /*lp_build_printf(builder, "output %d : %d ", - LLVMConstInt(LLVMInt32Type(), attrib, 0), - LLVMConstInt(LLVMInt32Type(), chan, 0)); - print_vectorf(builder, out);*/ +#if DEBUG_STORE + lp_build_printf(gallivm, "output %d : %d ", + LLVMConstInt(LLVMInt32TypeInContext(gallivm->context), + attrib, 0), + LLVMConstInt(LLVMInt32TypeInContext(gallivm->context), + chan, 0)); + lp_build_print_value(gallivm, "val = ", out); +#endif soa[chan] = out; } else { soa[chan] = 0; } } - soa_to_aos(gallivm, soa, aos); + + + if (soa_type.length == TGSI_NUM_CHANNELS) { + lp_build_transpose_aos(gallivm, soa_type, soa, aos); + } else { + lp_build_transpose_aos(gallivm, soa_type, soa, soa); + + for (i = 0; i < soa_type.length; ++i) { + aos[i] = lp_build_extract_range(gallivm, + soa[i % TGSI_NUM_CHANNELS], + (i / TGSI_NUM_CHANNELS) * TGSI_NUM_CHANNELS, + TGSI_NUM_CHANNELS); + } + } + store_aos_array(gallivm, + soa_type, io, aos, attrib, @@ -876,104 +720,71 @@ convert_to_aos(struct gallivm_state *gallivm, clipmask, have_clipdist); } #if DEBUG_STORE - lp_build_printf(builder, " # storing end\n"); + lp_build_printf(gallivm, " # storing end\n"); #endif } /** * Stores original vertex positions in clip coordinates - * There is probably a more efficient way to do this, 4 floats at once - * 
rather than extracting each element one by one. - * idx is the output to store things too, if pre_clip_pos is set - * we store the pos to the idx, if not we store the clipvertex to it. */ static void store_clip(struct gallivm_state *gallivm, + const struct lp_type vs_type, LLVMValueRef io_ptr, LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS], boolean pre_clip_pos, int idx) { LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef out[4]; + LLVMValueRef soa[4]; + LLVMValueRef aos[LP_MAX_VECTOR_LENGTH]; LLVMValueRef indices[2]; - LLVMValueRef io0_ptr, io1_ptr, io2_ptr, io3_ptr; - LLVMValueRef clip_ptr0, clip_ptr1, clip_ptr2, clip_ptr3; - LLVMValueRef clip0_ptr, clip1_ptr, clip2_ptr, clip3_ptr; - LLVMValueRef out0elem, out1elem, out2elem, out3elem; - int i; + LLVMValueRef io_ptrs[LP_MAX_VECTOR_WIDTH / 32]; + LLVMValueRef inds[LP_MAX_VECTOR_WIDTH / 32]; + LLVMValueRef clip_ptrs[LP_MAX_VECTOR_WIDTH / 32]; + int i, j; - LLVMValueRef ind0 = lp_build_const_int32(gallivm, 0); - LLVMValueRef ind1 = lp_build_const_int32(gallivm, 1); - LLVMValueRef ind2 = lp_build_const_int32(gallivm, 2); - LLVMValueRef ind3 = lp_build_const_int32(gallivm, 3); - indices[0] = indices[1] = lp_build_const_int32(gallivm, 0); - out[0] = LLVMBuildLoad(builder, outputs[idx][0], ""); /*x0 x1 x2 x3*/ - out[1] = LLVMBuildLoad(builder, outputs[idx][1], ""); /*y0 y1 y2 y3*/ - out[2] = LLVMBuildLoad(builder, outputs[idx][2], ""); /*z0 z1 z2 z3*/ - out[3] = LLVMBuildLoad(builder, outputs[idx][3], ""); /*w0 w1 w2 w3*/ + for (i = 0; i < vs_type.length; i++) { + inds[i] = lp_build_const_int32(gallivm, i); + io_ptrs[i] = LLVMBuildGEP(builder, io_ptr, &inds[i], 1, ""); + } - io0_ptr = LLVMBuildGEP(builder, io_ptr, &ind0, 1, ""); - io1_ptr = LLVMBuildGEP(builder, io_ptr, &ind1, 1, ""); - io2_ptr = LLVMBuildGEP(builder, io_ptr, &ind2, 1, ""); - io3_ptr = LLVMBuildGEP(builder, io_ptr, &ind3, 1, ""); + soa[0] = LLVMBuildLoad(builder, outputs[idx][0], ""); /*x0 x1 .. 
xn*/ + soa[1] = LLVMBuildLoad(builder, outputs[idx][1], ""); /*y0 y1 .. yn*/ + soa[2] = LLVMBuildLoad(builder, outputs[idx][2], ""); /*z0 z1 .. zn*/ + soa[3] = LLVMBuildLoad(builder, outputs[idx][3], ""); /*w0 w1 .. wn*/ if (!pre_clip_pos) { - clip_ptr0 = draw_jit_header_clip(gallivm, io0_ptr); - clip_ptr1 = draw_jit_header_clip(gallivm, io1_ptr); - clip_ptr2 = draw_jit_header_clip(gallivm, io2_ptr); - clip_ptr3 = draw_jit_header_clip(gallivm, io3_ptr); + for (i = 0; i < vs_type.length; i++) { + clip_ptrs[i] = draw_jit_header_clip(gallivm, io_ptrs[i]); + } } else { - clip_ptr0 = draw_jit_header_pre_clip_pos(gallivm, io0_ptr); - clip_ptr1 = draw_jit_header_pre_clip_pos(gallivm, io1_ptr); - clip_ptr2 = draw_jit_header_pre_clip_pos(gallivm, io2_ptr); - clip_ptr3 = draw_jit_header_pre_clip_pos(gallivm, io3_ptr); + for (i = 0; i < vs_type.length; i++) { + clip_ptrs[i] = draw_jit_header_pre_clip_pos(gallivm, io_ptrs[i]); + } } - for (i = 0; i<4; i++) { - clip0_ptr = LLVMBuildGEP(builder, clip_ptr0, indices, 2, ""); /* x0 */ - clip1_ptr = LLVMBuildGEP(builder, clip_ptr1, indices, 2, ""); /* x1 */ - clip2_ptr = LLVMBuildGEP(builder, clip_ptr2, indices, 2, ""); /* x2 */ - clip3_ptr = LLVMBuildGEP(builder, clip_ptr3, indices, 2, ""); /* x3 */ - - out0elem = LLVMBuildExtractElement(builder, out[i], ind0, ""); /* x0 */ - out1elem = LLVMBuildExtractElement(builder, out[i], ind1, ""); /* x1 */ - out2elem = LLVMBuildExtractElement(builder, out[i], ind2, ""); /* x2 */ - out3elem = LLVMBuildExtractElement(builder, out[i], ind3, ""); /* x3 */ - - LLVMBuildStore(builder, out0elem, clip0_ptr); - LLVMBuildStore(builder, out1elem, clip1_ptr); - LLVMBuildStore(builder, out2elem, clip2_ptr); - LLVMBuildStore(builder, out3elem, clip3_ptr); - - indices[1]= LLVMBuildAdd(builder, indices[1], ind1, ""); + lp_build_transpose_aos(gallivm, vs_type, soa, soa); + for (i = 0; i < vs_type.length; ++i) { + aos[i] = lp_build_extract_range(gallivm, + soa[i % TGSI_NUM_CHANNELS], + (i / TGSI_NUM_CHANNELS) 
* TGSI_NUM_CHANNELS, + TGSI_NUM_CHANNELS); } -} - + for (j = 0; j < vs_type.length; j++) { + LLVMTypeRef clip_ptr_type = LLVMPointerType(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), 0); + LLVMValueRef clip_ptr; -/** - * Equivalent of _mm_set1_ps(a) - */ -static LLVMValueRef -vec4f_from_scalar(struct gallivm_state *gallivm, - LLVMValueRef a, - const char *name) -{ - LLVMTypeRef float_type = LLVMFloatTypeInContext(gallivm->context); - LLVMValueRef res = LLVMGetUndef(LLVMVectorType(float_type, 4)); - int i; + clip_ptr = LLVMBuildGEP(builder, clip_ptrs[j], indices, 2, "clipo"); + clip_ptr = LLVMBuildPointerCast(builder, clip_ptr, clip_ptr_type, ""); - for (i = 0; i < 4; ++i) { - LLVMValueRef index = lp_build_const_int32(gallivm, i); - res = LLVMBuildInsertElement(gallivm->builder, res, a, - index, i == 3 ? name : ""); + /* Unaligned store */ + lp_set_store_alignment(LLVMBuildStore(builder, aos[j], clip_ptr), sizeof(float)); } - - return res; } @@ -981,15 +792,17 @@ vec4f_from_scalar(struct gallivm_state *gallivm, * Transforms the outputs for viewport mapping */ static void -generate_viewport(struct draw_llvm *llvm, +generate_viewport(struct draw_llvm_variant *variant, LLVMBuilderRef builder, + struct lp_type vs_type, LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS], LLVMValueRef context_ptr) { int i; - struct gallivm_state *gallivm = llvm->gallivm; - struct lp_type f32_type = lp_type_float_vec(32); - LLVMValueRef out3 = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 w2 w3*/ + struct gallivm_state *gallivm = variant->gallivm; + struct lp_type f32_type = vs_type; + LLVMTypeRef vs_type_llvm = lp_build_vec_type(gallivm, vs_type); + LLVMValueRef out3 = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 .. 
wn*/ LLVMValueRef const1 = lp_build_const_vec(gallivm, f32_type, 1.0); /*1.0 1.0 1.0 1.0*/ LLVMValueRef vp_ptr = draw_jit_context_viewport(gallivm, context_ptr); @@ -999,7 +812,7 @@ generate_viewport(struct draw_llvm *llvm, /* Viewport Mapping */ for (i=0; i<3; i++) { - LLVMValueRef out = LLVMBuildLoad(builder, outputs[0][i], ""); /*x0 x1 x2 x3*/ + LLVMValueRef out = LLVMBuildLoad(builder, outputs[0][i], ""); /*x0 x1 .. xn*/ LLVMValueRef scale; LLVMValueRef trans; LLVMValueRef scale_i; @@ -1012,8 +825,10 @@ generate_viewport(struct draw_llvm *llvm, index = lp_build_const_int32(gallivm, i+4); trans_i = LLVMBuildGEP(builder, vp_ptr, &index, 1, ""); - scale = vec4f_from_scalar(gallivm, LLVMBuildLoad(builder, scale_i, ""), "scale"); - trans = vec4f_from_scalar(gallivm, LLVMBuildLoad(builder, trans_i, ""), "trans"); + scale = lp_build_broadcast(gallivm, vs_type_llvm, + LLVMBuildLoad(builder, scale_i, "scale")); + trans = lp_build_broadcast(gallivm, vs_type_llvm, + LLVMBuildLoad(builder, trans_i, "trans")); /* divide by w */ out = LLVMBuildFMul(builder, out, out3, ""); @@ -1030,10 +845,12 @@ generate_viewport(struct draw_llvm *llvm, /** - * Returns clipmask as 4xi32 bitmask for the 4 vertices + * Returns clipmask as nxi32 bitmask for the n vertices */ static LLVMValueRef generate_clipmask(struct draw_llvm *llvm, + struct gallivm_state *gallivm, + struct lp_type vs_type, LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS], boolean clip_xy, boolean clip_z, @@ -1043,15 +860,15 @@ generate_clipmask(struct draw_llvm *llvm, LLVMValueRef context_ptr, boolean *have_clipdist) { - struct gallivm_state *gallivm = llvm->gallivm; LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef mask; /* stores the <4xi32> clipmasks */ + LLVMValueRef mask; /* stores the <nxi32> clipmasks */ LLVMValueRef test, temp; LLVMValueRef zero, shift; LLVMValueRef pos_x, pos_y, pos_z, pos_w; LLVMValueRef cv_x, cv_y, cv_z, cv_w; LLVMValueRef plane1, planes, plane_ptr, sum; - struct lp_type f32_type = 
lp_type_float_vec(32); + struct lp_type f32_type = vs_type; + struct lp_type i32_type = lp_int_type(vs_type); const unsigned pos = draw_current_shader_position_output(llvm->draw); const unsigned cv = draw_current_shader_clipvertex_output(llvm->draw); int num_written_clipdistance = llvm->draw->vs.vertex_shader->info.num_written_clipdistance; @@ -1064,25 +881,25 @@ generate_clipmask(struct draw_llvm *llvm, if (cd[0] != pos || cd[1] != pos) have_cd = true; - mask = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 0); - temp = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 0); - zero = lp_build_const_vec(gallivm, f32_type, 0); /* 0.0f 0.0f 0.0f 0.0f */ - shift = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 1); /* 1 1 1 1 */ + mask = lp_build_const_int_vec(gallivm, i32_type, 0); + temp = lp_build_const_int_vec(gallivm, i32_type, 0); + zero = lp_build_const_vec(gallivm, f32_type, 0); /* 0.0f 0.0f 0.0f 0.0f */ + shift = lp_build_const_int_vec(gallivm, i32_type, 1); /* 1 1 1 1 */ /* * load clipvertex and position from correct locations. * if they are the same just load them once. */ - pos_x = LLVMBuildLoad(builder, outputs[pos][0], ""); /*x0 x1 x2 x3*/ - pos_y = LLVMBuildLoad(builder, outputs[pos][1], ""); /*y0 y1 y2 y3*/ - pos_z = LLVMBuildLoad(builder, outputs[pos][2], ""); /*z0 z1 z2 z3*/ - pos_w = LLVMBuildLoad(builder, outputs[pos][3], ""); /*w0 w1 w2 w3*/ + pos_x = LLVMBuildLoad(builder, outputs[pos][0], ""); /*x0 x1 .. xn */ + pos_y = LLVMBuildLoad(builder, outputs[pos][1], ""); /*y0 y1 .. yn */ + pos_z = LLVMBuildLoad(builder, outputs[pos][2], ""); /*z0 z1 .. zn */ + pos_w = LLVMBuildLoad(builder, outputs[pos][3], ""); /*w0 w1 .. 
wn */ if (clip_user && cv != pos) { - cv_x = LLVMBuildLoad(builder, outputs[cv][0], ""); /*x0 x1 x2 x3*/ - cv_y = LLVMBuildLoad(builder, outputs[cv][1], ""); /*y0 y1 y2 y3*/ - cv_z = LLVMBuildLoad(builder, outputs[cv][2], ""); /*z0 z1 z2 z3*/ - cv_w = LLVMBuildLoad(builder, outputs[cv][3], ""); /*w0 w1 w2 w3*/ + cv_x = LLVMBuildLoad(builder, outputs[cv][0], ""); /*x0 x1 .. xn */ + cv_y = LLVMBuildLoad(builder, outputs[cv][1], ""); /*y0 y1 .. yn */ + cv_z = LLVMBuildLoad(builder, outputs[cv][2], ""); /*z0 z1 .. zn */ + cv_w = LLVMBuildLoad(builder, outputs[cv][3], ""); /*w0 w1 .. wn */ } else { cv_x = pos_x; cv_y = pos_y; @@ -1120,7 +937,7 @@ generate_clipmask(struct draw_llvm *llvm, } if (clip_z) { - temp = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 16); + temp = lp_build_const_int_vec(gallivm, i32_type, 16); if (clip_halfz) { /* plane 5 */ test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GREATER, zero, pos_z); @@ -1163,42 +980,43 @@ generate_clipmask(struct draw_llvm *llvm, clipdist = LLVMBuildLoad(builder, outputs[cd[1]][i-4], ""); } test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GREATER, zero, clipdist); - temp = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 1 << plane_idx); + temp = lp_build_const_int_vec(gallivm, i32_type, 1 << plane_idx); test = LLVMBuildAnd(builder, test, temp, ""); mask = LLVMBuildOr(builder, mask, test, ""); } else { + LLVMTypeRef vs_type_llvm = lp_build_vec_type(gallivm, vs_type); indices[0] = lp_build_const_int32(gallivm, 0); indices[1] = lp_build_const_int32(gallivm, plane_idx); indices[2] = lp_build_const_int32(gallivm, 0); plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, ""); plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_x"); - planes = vec4f_from_scalar(gallivm, plane1, "plane4_x"); + planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1); sum = LLVMBuildFMul(builder, planes, cv_x, ""); indices[2] = lp_build_const_int32(gallivm, 1); plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, 
""); plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_y"); - planes = vec4f_from_scalar(gallivm, plane1, "plane4_y"); + planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1); test = LLVMBuildFMul(builder, planes, cv_y, ""); sum = LLVMBuildFAdd(builder, sum, test, ""); indices[2] = lp_build_const_int32(gallivm, 2); plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, ""); plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_z"); - planes = vec4f_from_scalar(gallivm, plane1, "plane4_z"); + planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1); test = LLVMBuildFMul(builder, planes, cv_z, ""); sum = LLVMBuildFAdd(builder, sum, test, ""); indices[2] = lp_build_const_int32(gallivm, 3); plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, ""); plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_w"); - planes = vec4f_from_scalar(gallivm, plane1, "plane4_w"); + planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1); test = LLVMBuildFMul(builder, planes, cv_w, ""); sum = LLVMBuildFAdd(builder, sum, test, ""); test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GREATER, zero, sum); - temp = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 1 << plane_idx); + temp = lp_build_const_int_vec(gallivm, i32_type, 1 << plane_idx); test = LLVMBuildAnd(builder, test, temp, ""); mask = LLVMBuildOr(builder, mask, test, ""); } @@ -1212,23 +1030,28 @@ generate_clipmask(struct draw_llvm *llvm, * Returns boolean if any clipping has occurred * Used zero/non-zero i32 value to represent boolean */ -static void -clipmask_bool(struct gallivm_state *gallivm, - LLVMValueRef clipmask, - LLVMValueRef ret_ptr) +static LLVMValueRef +clipmask_booli32(struct gallivm_state *gallivm, + const struct lp_type vs_type, + LLVMValueRef clipmask_bool_ptr) { LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef ret = LLVMBuildLoad(builder, ret_ptr, ""); + LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context); + LLVMValueRef clipmask_bool = LLVMBuildLoad(builder, 
clipmask_bool_ptr, ""); + LLVMValueRef ret = LLVMConstNull(int32_type); LLVMValueRef temp; int i; - for (i=0; i<4; i++) { - temp = LLVMBuildExtractElement(builder, clipmask, + /* + * Can do this with log2(vector length) pack instructions and one extract + * (as we don't actually need a or) with sse2 which would be way better. + */ + for (i=0; i < vs_type.length; i++) { + temp = LLVMBuildExtractElement(builder, clipmask_bool, lp_build_const_int32(gallivm, i) , ""); ret = LLVMBuildOr(builder, ret, temp, ""); } - - LLVMBuildStore(builder, ret, ret_ptr); + return ret; } @@ -1236,7 +1059,7 @@ static void draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant, boolean elts) { - struct gallivm_state *gallivm = llvm->gallivm; + struct gallivm_state *gallivm = variant->gallivm; LLVMContextRef context = gallivm->context; LLVMTypeRef int32_type = LLVMInt32TypeInContext(context); LLVMTypeRef arg_types[8]; @@ -1244,6 +1067,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant, LLVMValueRef context_ptr; LLVMBasicBlockRef block; LLVMBuilderRef builder; + struct lp_type vs_type; LLVMValueRef end, start; LLVMValueRef count, fetch_elts, fetch_count; LLVMValueRef stride, step, io_itr; @@ -1255,12 +1079,11 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant, unsigned i, j; struct lp_build_context bld; struct lp_build_loop_state lp_loop; - const int max_vertices = 4; + const int vector_length = lp_native_vector_width / 32; LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; LLVMValueRef fetch_max; - void *code; struct lp_build_sampler_soa *sampler = 0; - LLVMValueRef ret, ret_ptr; + LLVMValueRef ret, clipmask_bool_ptr; const boolean bypass_viewport = variant->key.bypass_viewport; const boolean enable_cliptest = variant->key.clip_xy || variant->key.clip_z || @@ -1273,16 +1096,16 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant, memset(&system_values, 0, 
sizeof(system_values)); - arg_types[0] = get_context_ptr_type(llvm); /* context */ - arg_types[1] = get_vertex_header_ptr_type(llvm); /* vertex_header */ - arg_types[2] = get_buffer_ptr_type(llvm); /* vbuffers */ + arg_types[0] = get_context_ptr_type(variant); /* context */ + arg_types[1] = get_vertex_header_ptr_type(variant); /* vertex_header */ + arg_types[2] = get_buffer_ptr_type(variant); /* vbuffers */ if (elts) arg_types[3] = LLVMPointerType(int32_type, 0);/* fetch_elts * */ else arg_types[3] = int32_type; /* start */ arg_types[4] = int32_type; /* fetch_count / count */ arg_types[5] = int32_type; /* stride */ - arg_types[6] = get_vb_ptr_type(llvm); /* pipe_vertex_buffer's */ + arg_types[6] = get_vb_ptr_type(variant); /* pipe_vertex_buffer's */ arg_types[7] = int32_type; /* instance_id */ func_type = LLVMFunctionType(int32_type, arg_types, Elements(arg_types), 0); @@ -1341,9 +1164,16 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant, lp_build_context_init(&bld, gallivm, lp_type_int(32)); - /* function will return non-zero i32 value if any clipped vertices */ - ret_ptr = lp_build_alloca(gallivm, int32_type, ""); - LLVMBuildStore(builder, zero, ret_ptr); + memset(&vs_type, 0, sizeof vs_type); + vs_type.floating = TRUE; /* floating point values */ + vs_type.sign = TRUE; /* values are signed */ + vs_type.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */ + vs_type.width = 32; /* 32-bit float */ + vs_type.length = vector_length; + + /* hold temporary "bool" clipmask */ + clipmask_bool_ptr = lp_build_alloca(gallivm, lp_build_int_vec_type(gallivm, vs_type), ""); + LLVMBuildStore(builder, lp_build_zero(gallivm, lp_int_type(vs_type)), clipmask_bool_ptr); /* code generated texture sampling */ sampler = draw_llvm_sampler_soa_create( @@ -1358,14 +1188,14 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant, end = lp_build_add(&bld, start, count); } - step = lp_build_const_int32(gallivm, max_vertices); + 
step = lp_build_const_int32(gallivm, vector_length); fetch_max = LLVMBuildSub(builder, end, one, "fetch_max"); lp_build_loop_begin(&lp_loop, gallivm, start); { LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; - LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS] = { { 0 } }; + LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][LP_MAX_VECTOR_WIDTH / 32] = { { 0 } }; LLVMValueRef io; LLVMValueRef clipmask; /* holds the clipmask value */ const LLVMValueRef (*ptr_aos)[TGSI_NUM_CHANNELS]; @@ -1377,11 +1207,11 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant, io = LLVMBuildGEP(builder, io_ptr, &io_itr, 1, ""); #if DEBUG_STORE - lp_build_printf(builder, " --- io %d = %p, loop counter %d\n", + lp_build_printf(gallivm, " --- io %d = %p, loop counter %d\n", io_itr, io, lp_loop.counter); #endif - system_values.vertex_id = lp_build_zero(gallivm, lp_type_uint_vec(32)); - for (i = 0; i < TGSI_NUM_CHANNELS; ++i) { + system_values.vertex_id = lp_build_zero(gallivm, lp_type_uint_vec(32, 32*vector_length)); + for (i = 0; i < vector_length; ++i) { LLVMValueRef true_index = LLVMBuildAdd(builder, lp_loop.counter, @@ -1413,11 +1243,12 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant, } } convert_to_soa(gallivm, aos_attribs, inputs, - draw->pt.nr_vertex_elements); + draw->pt.nr_vertex_elements, vs_type); ptr_aos = (const LLVMValueRef (*)[TGSI_NUM_CHANNELS]) inputs; - generate_vs(llvm, + generate_vs(variant, builder, + vs_type, outputs, ptr_aos, &system_values, @@ -1426,29 +1257,34 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant, variant->key.clamp_vertex_color); /* store original positions in clip before further manipulation */ - store_clip(gallivm, io, outputs, 0, cv); - store_clip(gallivm, io, outputs, 1, pos); + store_clip(gallivm, vs_type, io, outputs, 0, cv); + store_clip(gallivm, vs_type, io, outputs, 1, pos); /* do cliptest */ if (enable_cliptest) { + 
LLVMValueRef temp = LLVMBuildLoad(builder, clipmask_bool_ptr, ""); /* allocate clipmask, assign it integer type */ - clipmask = generate_clipmask(llvm, outputs, + clipmask = generate_clipmask(llvm, + gallivm, + vs_type, + outputs, variant->key.clip_xy, variant->key.clip_z, variant->key.clip_user, variant->key.clip_halfz, variant->key.ucp_enable, context_ptr, &have_clipdist); - /* return clipping boolean value for function */ - clipmask_bool(gallivm, clipmask, ret_ptr); + temp = LLVMBuildOr(builder, clipmask, temp, ""); + /* store temporary clipping boolean value */ + LLVMBuildStore(builder, temp, clipmask_bool_ptr); } else { - clipmask = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 0); + clipmask = lp_build_const_int_vec(gallivm, lp_int_type(vs_type), 0); } /* do viewport mapping */ if (!bypass_viewport) { - generate_viewport(llvm, builder, outputs, context_ptr); + generate_viewport(variant, builder, vs_type, outputs, context_ptr); } /* store clipmask in vertex header, @@ -1456,43 +1292,20 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant, * and transformed positions in data */ convert_to_aos(gallivm, io, outputs, clipmask, - vs_info->num_outputs, max_vertices, have_clipdist); + vs_info->num_outputs, vs_type, + have_clipdist); } lp_build_loop_end_cond(&lp_loop, end, step, LLVMIntUGE); sampler->destroy(sampler); - ret = LLVMBuildLoad(builder, ret_ptr, ""); - LLVMBuildRet(builder, ret); - - /* - * Translate the LLVM IR into machine code. 
- */ -#ifdef DEBUG - if (LLVMVerifyFunction(variant_func, LLVMPrintMessageAction)) { - lp_debug_dump_value(variant_func); - assert(0); - } -#endif - - LLVMRunFunctionPassManager(gallivm->passmgr, variant_func); + /* return clipping boolean value for function */ + ret = clipmask_booli32(gallivm, vs_type, clipmask_bool_ptr); - if (gallivm_debug & GALLIVM_DEBUG_IR) { - lp_debug_dump_value(variant_func); - debug_printf("\n"); - } - - code = LLVMGetPointerToGlobal(gallivm->engine, variant_func); - if (elts) - variant->jit_func_elts = (draw_jit_vert_func_elts) pointer_to_func(code); - else - variant->jit_func = (draw_jit_vert_func) pointer_to_func(code); + LLVMBuildRet(builder, ret); - if (gallivm_debug & GALLIVM_DEBUG_ASM) { - lp_disassemble(code); - } - lp_func_delete_body(variant_func); + gallivm_verify_function(gallivm, variant_func); } @@ -1600,17 +1413,17 @@ draw_llvm_destroy_variant(struct draw_llvm_variant *variant) struct draw_llvm *llvm = variant->llvm; if (variant->function_elts) { - LLVMFreeMachineCodeForFunction(llvm->gallivm->engine, - variant->function_elts); - LLVMDeleteFunction(variant->function_elts); + gallivm_free_function(variant->gallivm, + variant->function_elts, variant->jit_func_elts); } if (variant->function) { - LLVMFreeMachineCodeForFunction(llvm->gallivm->engine, - variant->function); - LLVMDeleteFunction(variant->function); + gallivm_free_function(variant->gallivm, + variant->function, variant->jit_func); } + gallivm_destroy(variant->gallivm); + remove_from_list(&variant->list_item_local); variant->shader->variants_cached--; remove_from_list(&variant->list_item_global); diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h index 31fc2db..39d83cf 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.h +++ b/src/gallium/auxiliary/draw/draw_llvm.h @@ -36,11 +36,6 @@ #include "pipe/p_context.h" #include "util/u_simple_list.h" -#include <llvm-c/Core.h> -#include <llvm-c/Analysis.h> -#include <llvm-c/Target.h> 
-#include <llvm-c/ExecutionEngine.h> - struct draw_llvm; struct llvm_vertex_shader; @@ -220,6 +215,14 @@ struct draw_llvm_variant_list_item struct draw_llvm_variant { + struct gallivm_state *gallivm; + + /* LLVM JIT builder types */ + LLVMTypeRef context_ptr_type; + LLVMTypeRef buffer_ptr_type; + LLVMTypeRef vb_ptr_type; + LLVMTypeRef vertex_header_ptr_type; + LLVMValueRef function; LLVMValueRef function_elts; draw_jit_vert_func jit_func; @@ -249,16 +252,8 @@ struct draw_llvm { struct draw_jit_context jit_context; - struct gallivm_state *gallivm; - struct draw_llvm_variant_list_item vs_variants_list; int nr_variants; - - /* LLVM JIT builder types */ - LLVMTypeRef context_ptr_type; - LLVMTypeRef buffer_ptr_type; - LLVMTypeRef vb_ptr_type; - LLVMTypeRef vertex_header_ptr_type; }; @@ -270,7 +265,7 @@ llvm_vertex_shader(struct draw_vertex_shader *vs) struct draw_llvm * -draw_llvm_create(struct draw_context *draw, struct gallivm_state *gallivm); +draw_llvm_create(struct draw_context *draw); void draw_llvm_destroy(struct draw_llvm *llvm); @@ -286,11 +281,6 @@ draw_llvm_destroy_variant(struct draw_llvm_variant *variant); struct draw_llvm_variant_key * draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store); -LLVMValueRef -draw_llvm_translate_from(struct gallivm_state *gallivm, - LLVMValueRef vbuffer, - enum pipe_format from_format); - struct lp_build_sampler_soa * draw_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state, LLVMValueRef context_ptr); diff --git a/src/gallium/auxiliary/draw/draw_llvm_sample.c b/src/gallium/auxiliary/draw/draw_llvm_sample.c index 0a8b3bc..1dbe5f5 100644 --- a/src/gallium/auxiliary/draw/draw_llvm_sample.c +++ b/src/gallium/auxiliary/draw/draw_llvm_sample.c @@ -173,8 +173,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base, unsigned unit, unsigned num_coords, const LLVMValueRef *coords, - const LLVMValueRef *ddx, - const LLVMValueRef *ddy, + const struct lp_derivatives *derivs, 
LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ LLVMValueRef *texel) @@ -189,7 +188,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base, type, unit, num_coords, coords, - ddx, ddy, + derivs, lod_bias, explicit_lod, texel); } @@ -201,6 +200,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base, static void draw_llvm_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base, struct gallivm_state *gallivm, + struct lp_type type, unsigned unit, LLVMValueRef explicit_lod, /* optional */ LLVMValueRef *sizes_out) @@ -212,6 +212,7 @@ draw_llvm_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base, lp_build_size_query_soa(gallivm, &sampler->dynamic_state.static_state[unit], &sampler->dynamic_state.base, + type, unit, explicit_lod, sizes_out); diff --git a/src/gallium/auxiliary/draw/draw_llvm_translate.c b/src/gallium/auxiliary/draw/draw_llvm_translate.c deleted file mode 100644 index 77d0af7..0000000 --- a/src/gallium/auxiliary/draw/draw_llvm_translate.c +++ /dev/null @@ -1,506 +0,0 @@ -#include "draw_private.h" -#include "draw_context.h" - -#include "draw_llvm.h" - -#include "gallivm/lp_bld_const.h" -#include "gallivm/lp_bld_struct.h" -#include "gallivm/lp_bld_format.h" -#include "gallivm/lp_bld_debug.h" -#include "gallivm/lp_bld_type.h" - -#include "util/u_memory.h" -#include "util/u_format.h" -#include "pipe/p_state.h" - - -#define DRAW_DBG 0 - -static LLVMValueRef -from_64_float(struct gallivm_state *gallivm, LLVMValueRef val) -{ - LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val, - LLVMPointerType(LLVMDoubleTypeInContext(gallivm->context), 0) , ""); - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, ""); - return LLVMBuildFPTrunc(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), ""); -} - -static LLVMValueRef -from_32_float(struct gallivm_state *gallivm, LLVMValueRef val) -{ - LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, 
val, - LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0) , ""); - return LLVMBuildLoad(gallivm->builder, bc, ""); -} - -static INLINE LLVMValueRef -from_8_uscaled(struct gallivm_state *gallivm, LLVMValueRef val) -{ - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, ""); - return LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), ""); -} - -static INLINE LLVMValueRef -from_16_uscaled(struct gallivm_state *gallivm, LLVMValueRef val) -{ - LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val, - LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 16), 0) , ""); - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, ""); - return LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), ""); -} - -static INLINE LLVMValueRef -from_32_uscaled(struct gallivm_state *gallivm, LLVMValueRef val) -{ - LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val, - LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , ""); - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, ""); - return LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), ""); -} - -static INLINE LLVMValueRef -from_8_sscaled(struct gallivm_state *gallivm, LLVMValueRef val) -{ - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, ""); - return LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), ""); -} - -static INLINE LLVMValueRef -from_16_sscaled(struct gallivm_state *gallivm, LLVMValueRef val) -{ - LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val, - LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 16), 0) , ""); - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, ""); - return LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), ""); -} - -static INLINE LLVMValueRef -from_32_sscaled(struct gallivm_state *gallivm, LLVMValueRef val) -{ - LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val, - 
LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , ""); - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, ""); - return LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), ""); -} - - -static INLINE LLVMValueRef -from_8_unorm(struct gallivm_state *gallivm, LLVMValueRef val) -{ - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, ""); - LLVMValueRef uscaled = LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), ""); - return LLVMBuildFDiv(gallivm->builder, uscaled, - lp_build_const_float(gallivm, 255.), ""); -} - -static INLINE LLVMValueRef -from_16_unorm(struct gallivm_state *gallivm, LLVMValueRef val) -{ - LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val, - LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 16), 0) , ""); - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, ""); - LLVMValueRef uscaled = LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), ""); - return LLVMBuildFDiv(gallivm->builder, uscaled, - lp_build_const_float(gallivm, 65535.), ""); -} - -static INLINE LLVMValueRef -from_32_unorm(struct gallivm_state *gallivm, LLVMValueRef val) -{ - LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val, - LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , ""); - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, ""); - LLVMValueRef uscaled = LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), ""); - - return LLVMBuildFDiv(gallivm->builder, uscaled, - lp_build_const_float(gallivm, 4294967295.), ""); -} - -static INLINE LLVMValueRef -from_8_snorm(struct gallivm_state *gallivm, LLVMValueRef val) -{ - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, ""); - LLVMValueRef uscaled = LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), ""); - return LLVMBuildFDiv(gallivm->builder, uscaled, - lp_build_const_float(gallivm, 127.0), ""); -} - -static INLINE LLVMValueRef 
-from_16_snorm(struct gallivm_state *gallivm, LLVMValueRef val) -{ - LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val, - LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 16), 0) , ""); - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, ""); - LLVMValueRef uscaled = LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), ""); - return LLVMBuildFDiv(gallivm->builder, uscaled, - lp_build_const_float(gallivm, 32767.0f), ""); -} - -static INLINE LLVMValueRef -from_32_snorm(struct gallivm_state *gallivm, LLVMValueRef val) -{ - LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val, - LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , ""); - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, ""); - LLVMValueRef uscaled = LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), ""); - - return LLVMBuildFDiv(gallivm->builder, uscaled, - lp_build_const_float(gallivm, 2147483647.0), ""); -} - -static INLINE LLVMValueRef -from_32_fixed(struct gallivm_state *gallivm, LLVMValueRef val) -{ - LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val, - LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , ""); - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, ""); - LLVMValueRef uscaled = LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), ""); - - return LLVMBuildFDiv(gallivm->builder, uscaled, - lp_build_const_float(gallivm, 65536.0), ""); -} - -static LLVMValueRef -to_64_float(struct gallivm_state *gallivm, LLVMValueRef fp) -{ - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, ""); - return LLVMBuildFPExt(gallivm->builder, l, LLVMDoubleTypeInContext(gallivm->context), ""); -} - -static LLVMValueRef -to_32_float(struct gallivm_state *gallivm, LLVMValueRef fp) -{ - return LLVMBuildLoad(gallivm->builder, fp, ""); -} - -static INLINE LLVMValueRef -to_8_uscaled(struct gallivm_state *gallivm, LLVMValueRef fp) -{ - LLVMValueRef l = 
LLVMBuildLoad(gallivm->builder, fp, ""); - return LLVMBuildFPToUI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 8), ""); -} - -static INLINE LLVMValueRef -to_16_uscaled(struct gallivm_state *gallivm, LLVMValueRef fp) -{ - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, ""); - return LLVMBuildFPToUI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 16), ""); -} - -static INLINE LLVMValueRef -to_32_uscaled(struct gallivm_state *gallivm, LLVMValueRef fp) -{ - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, ""); - return LLVMBuildFPToUI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 32), ""); -} - -static INLINE LLVMValueRef -to_8_sscaled(struct gallivm_state *gallivm, LLVMValueRef fp) -{ - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, ""); - return LLVMBuildFPToSI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 8), ""); -} - -static INLINE LLVMValueRef -to_16_sscaled(struct gallivm_state *gallivm, LLVMValueRef fp) -{ - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, ""); - return LLVMBuildFPToSI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 16), ""); -} - -static INLINE LLVMValueRef -to_32_sscaled(struct gallivm_state *gallivm, LLVMValueRef fp) -{ - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, ""); - return LLVMBuildFPToSI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 32), ""); -} - -static INLINE LLVMValueRef -to_8_unorm(struct gallivm_state *gallivm, LLVMValueRef fp) -{ - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, ""); - LLVMValueRef uscaled = LLVMBuildFPToUI(gallivm->builder, l, - LLVMIntTypeInContext(gallivm->context, 8), ""); - return LLVMBuildFMul(gallivm->builder, uscaled, - lp_build_const_float(gallivm, 255.), ""); -} - -static INLINE LLVMValueRef -to_16_unorm(struct gallivm_state *gallivm, LLVMValueRef fp) -{ - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, ""); - LLVMValueRef uscaled = LLVMBuildFPToUI(gallivm->builder, l, - 
LLVMIntTypeInContext(gallivm->context, 32), ""); - return LLVMBuildFMul(gallivm->builder, uscaled, - lp_build_const_float(gallivm, 65535.), ""); -} - -static INLINE LLVMValueRef -to_32_unorm(struct gallivm_state *gallivm, LLVMValueRef fp) -{ - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, ""); - LLVMValueRef uscaled = LLVMBuildFPToUI(gallivm->builder, l, - LLVMIntTypeInContext(gallivm->context, 32), ""); - - return LLVMBuildFMul(gallivm->builder, uscaled, - lp_build_const_float(gallivm, 4294967295.), ""); -} - -static INLINE LLVMValueRef -to_8_snorm(struct gallivm_state *gallivm, LLVMValueRef val) -{ - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, ""); - LLVMValueRef uscaled = LLVMBuildFPToSI(gallivm->builder, l, - LLVMIntTypeInContext(gallivm->context, 8), ""); - return LLVMBuildFMul(gallivm->builder, uscaled, - lp_build_const_float(gallivm, 127.0), ""); -} - -static INLINE LLVMValueRef -to_16_snorm(struct gallivm_state *gallivm, LLVMValueRef fp) -{ - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, ""); - LLVMValueRef uscaled = LLVMBuildFPToSI(gallivm->builder, l, - LLVMIntTypeInContext(gallivm->context, 16), ""); - return LLVMBuildFMul(gallivm->builder, uscaled, - lp_build_const_float(gallivm, 32767.0f), ""); -} - -static INLINE LLVMValueRef -to_32_snorm(struct gallivm_state *gallivm, LLVMValueRef fp) -{ - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, ""); - LLVMValueRef uscaled = LLVMBuildFPToSI(gallivm->builder, l, - LLVMIntTypeInContext(gallivm->context, 32), ""); - - return LLVMBuildFMul(gallivm->builder, uscaled, - lp_build_const_float(gallivm, 2147483647.0), ""); -} - -static INLINE LLVMValueRef -to_32_fixed(struct gallivm_state *gallivm, LLVMValueRef fp) -{ - LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, ""); - LLVMValueRef uscaled = LLVMBuildFPToSI(gallivm->builder, l, - LLVMIntTypeInContext(gallivm->context, 32), ""); - - return LLVMBuildFMul(gallivm->builder, uscaled, - lp_build_const_float(gallivm, 65536.0), ""); -} - 
-typedef LLVMValueRef (*from_func)(struct gallivm_state *, LLVMValueRef); -typedef LLVMValueRef (*to_func)(struct gallivm_state *, LLVMValueRef); - -/* so that underneath can avoid function calls which are prohibited - * for static initialization we need this conversion */ -enum ll_type { - LL_Double, - LL_Float, - LL_Int32, - LL_Int16, - LL_Int8 -}; - -static INLINE LLVMTypeRef -ll_type_to_llvm(struct gallivm_state *gallivm, enum ll_type type) -{ - switch (type) { - case LL_Double: - return LLVMDoubleTypeInContext(gallivm->context); - case LL_Float: - return LLVMFloatTypeInContext(gallivm->context); - case LL_Int32: - return LLVMInt32TypeInContext(gallivm->context); - case LL_Int16: - return LLVMIntTypeInContext(gallivm->context, 16); - case LL_Int8: - return LLVMIntTypeInContext(gallivm->context, 8); - } - return LLVMIntTypeInContext(gallivm->context, 8); -} - -static INLINE int -ll_type_size(enum ll_type type) -{ - switch (type) { - case LL_Double: - return 8; - case LL_Float: - return 4; - case LL_Int32: - return 4; - case LL_Int16: - return 2; - case LL_Int8: - return 1; - } - return 1; -} - -struct draw_llvm_translate { - int format; - from_func from; - to_func to; - enum ll_type type; - int num_components; -} translates[] = -{ - {PIPE_FORMAT_R64_FLOAT, from_64_float, to_64_float, LL_Double, 1}, - {PIPE_FORMAT_R64G64_FLOAT, from_64_float, to_64_float, LL_Double, 2}, - {PIPE_FORMAT_R64G64B64_FLOAT, from_64_float, to_64_float, LL_Double, 3}, - {PIPE_FORMAT_R64G64B64A64_FLOAT, from_64_float, to_64_float, LL_Double, 4}, - {PIPE_FORMAT_R32_FLOAT, from_32_float, to_32_float, LL_Float, 1}, - {PIPE_FORMAT_R32G32_FLOAT, from_32_float, to_32_float, LL_Float, 2}, - {PIPE_FORMAT_R32G32B32_FLOAT, from_32_float, to_32_float, LL_Float, 3}, - {PIPE_FORMAT_R32G32B32A32_FLOAT, from_32_float, to_32_float, LL_Float, 4}, - - {PIPE_FORMAT_R32_UNORM, from_32_unorm, to_32_unorm, LL_Int32, 1}, - {PIPE_FORMAT_R32G32_UNORM, from_32_unorm, to_32_unorm, LL_Int32, 2}, - 
{PIPE_FORMAT_R32G32B32_UNORM, from_32_unorm, to_32_unorm, LL_Int32, 3}, - {PIPE_FORMAT_R32G32B32A32_UNORM, from_32_unorm, to_32_unorm, LL_Int32, 4}, - - {PIPE_FORMAT_R32_USCALED, from_32_uscaled, to_32_uscaled, LL_Int32, 1}, - {PIPE_FORMAT_R32G32_USCALED, from_32_uscaled, to_32_uscaled, LL_Int32, 2}, - {PIPE_FORMAT_R32G32B32_USCALED, from_32_uscaled, to_32_uscaled, LL_Int32, 3}, - {PIPE_FORMAT_R32G32B32A32_USCALED, from_32_uscaled, to_32_uscaled, LL_Int32, 4}, - - {PIPE_FORMAT_R32_SNORM, from_32_snorm, to_32_snorm, LL_Int32, 1}, - {PIPE_FORMAT_R32G32_SNORM, from_32_snorm, to_32_snorm, LL_Int32, 2}, - {PIPE_FORMAT_R32G32B32_SNORM, from_32_snorm, to_32_snorm, LL_Int32, 3}, - {PIPE_FORMAT_R32G32B32A32_SNORM, from_32_snorm, to_32_snorm, LL_Int32, 4}, - - {PIPE_FORMAT_R32_SSCALED, from_32_sscaled, to_32_sscaled, LL_Int32, 1}, - {PIPE_FORMAT_R32G32_SSCALED, from_32_sscaled, to_32_sscaled, LL_Int32, 2}, - {PIPE_FORMAT_R32G32B32_SSCALED, from_32_sscaled, to_32_sscaled, LL_Int32, 3}, - {PIPE_FORMAT_R32G32B32A32_SSCALED, from_32_sscaled, to_32_sscaled, LL_Int32, 4}, - - {PIPE_FORMAT_R16_UNORM, from_16_unorm, to_16_unorm, LL_Int16, 1}, - {PIPE_FORMAT_R16G16_UNORM, from_16_unorm, to_16_unorm, LL_Int16, 2}, - {PIPE_FORMAT_R16G16B16_UNORM, from_16_unorm, to_16_unorm, LL_Int16, 3}, - {PIPE_FORMAT_R16G16B16A16_UNORM, from_16_unorm, to_16_unorm, LL_Int16, 4}, - - {PIPE_FORMAT_R16_USCALED, from_16_uscaled, to_16_uscaled, LL_Int16, 1}, - {PIPE_FORMAT_R16G16_USCALED, from_16_uscaled, to_16_uscaled, LL_Int16, 2}, - {PIPE_FORMAT_R16G16B16_USCALED, from_16_uscaled, to_16_uscaled, LL_Int16, 3}, - {PIPE_FORMAT_R16G16B16A16_USCALED, from_16_uscaled, to_16_uscaled, LL_Int16, 4}, - - {PIPE_FORMAT_R16_SNORM, from_16_snorm, to_16_snorm, LL_Int16, 1}, - {PIPE_FORMAT_R16G16_SNORM, from_16_snorm, to_16_snorm, LL_Int16, 2}, - {PIPE_FORMAT_R16G16B16_SNORM, from_16_snorm, to_16_snorm, LL_Int16, 3}, - {PIPE_FORMAT_R16G16B16A16_SNORM, from_16_snorm, to_16_snorm, LL_Int16, 4}, - - 
{PIPE_FORMAT_R16_SSCALED, from_16_sscaled, to_16_sscaled, LL_Int16, 1}, - {PIPE_FORMAT_R16G16_SSCALED, from_16_sscaled, to_16_sscaled, LL_Int16, 2}, - {PIPE_FORMAT_R16G16B16_SSCALED, from_16_sscaled, to_16_sscaled, LL_Int16, 3}, - {PIPE_FORMAT_R16G16B16A16_SSCALED, from_16_sscaled, to_16_sscaled, LL_Int16, 4}, - - {PIPE_FORMAT_R8_UNORM, from_8_unorm, to_8_unorm, LL_Int8, 1}, - {PIPE_FORMAT_R8G8_UNORM, from_8_unorm, to_8_unorm, LL_Int8, 2}, - {PIPE_FORMAT_R8G8B8_UNORM, from_8_unorm, to_8_unorm, LL_Int8, 3}, - {PIPE_FORMAT_R8G8B8A8_UNORM, from_8_unorm, to_8_unorm, LL_Int8, 4}, - - {PIPE_FORMAT_R8_USCALED, from_8_uscaled, to_8_uscaled, LL_Int8, 1}, - {PIPE_FORMAT_R8G8_USCALED, from_8_uscaled, to_8_uscaled, LL_Int8, 2}, - {PIPE_FORMAT_R8G8B8_USCALED, from_8_uscaled, to_8_uscaled, LL_Int8, 3}, - {PIPE_FORMAT_R8G8B8A8_USCALED, from_8_uscaled, to_8_uscaled, LL_Int8, 4}, - - {PIPE_FORMAT_R8_SNORM, from_8_snorm, to_8_snorm, LL_Int8, 1}, - {PIPE_FORMAT_R8G8_SNORM, from_8_snorm, to_8_snorm, LL_Int8, 2}, - {PIPE_FORMAT_R8G8B8_SNORM, from_8_snorm, to_8_snorm, LL_Int8, 3}, - {PIPE_FORMAT_R8G8B8A8_SNORM, from_8_snorm, to_8_snorm, LL_Int8, 4}, - - {PIPE_FORMAT_R8_SSCALED, from_8_sscaled, to_8_sscaled, LL_Int8, 1}, - {PIPE_FORMAT_R8G8_SSCALED, from_8_sscaled, to_8_sscaled, LL_Int8, 2}, - {PIPE_FORMAT_R8G8B8_SSCALED, from_8_sscaled, to_8_sscaled, LL_Int8, 3}, - {PIPE_FORMAT_R8G8B8A8_SSCALED, from_8_sscaled, to_8_sscaled, LL_Int8, 4}, - - {PIPE_FORMAT_R32_FIXED, from_32_fixed, to_32_fixed, LL_Int32, 1}, - {PIPE_FORMAT_R32G32_FIXED, from_32_fixed, to_32_fixed, LL_Int32, 2}, - {PIPE_FORMAT_R32G32B32_FIXED, from_32_fixed, to_32_fixed, LL_Int32, 3}, - {PIPE_FORMAT_R32G32B32A32_FIXED, from_32_fixed, to_32_fixed, LL_Int32, 4}, -}; - - -static LLVMValueRef -fetch(struct gallivm_state *gallivm, - LLVMValueRef ptr, int val_size, int nr_components, - from_func func) -{ - int i; - int offset = 0; - LLVMValueRef res = - LLVMConstNull(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4)); 
- LLVMValueRef defaults[4]; - - defaults[0] = - defaults[1] = - defaults[2] = lp_build_const_float(gallivm, 0.0); - defaults[3] = lp_build_const_float(gallivm, 1.0); - - for (i = 0; i < nr_components; ++i) { - LLVMValueRef src_index = lp_build_const_int32(gallivm, offset); - LLVMValueRef dst_index = lp_build_const_int32(gallivm, i); - LLVMValueRef src_tmp; - LLVMValueRef component; - - src_tmp = LLVMBuildGEP(gallivm->builder, ptr, &src_index, 1, "src_tmp"); - - /* convert src_tmp to float */ - component = func(gallivm, src_tmp); - - /* vec.comp = component */ - res = LLVMBuildInsertElement(gallivm->builder, - res, - component, - dst_index, ""); - offset += val_size; - } - for (; i < 4; ++i) { - LLVMValueRef dst_index = lp_build_const_int32(gallivm, i); - res = LLVMBuildInsertElement(gallivm->builder, - res, - defaults[i], - dst_index, ""); - } - return res; -} - - -LLVMValueRef -draw_llvm_translate_from(struct gallivm_state *gallivm, - LLVMValueRef vbuffer, - enum pipe_format from_format) -{ - const struct util_format_description *format_desc; - LLVMValueRef zero; - int i; - struct lp_type type = lp_float32_vec4_type(); - - /* - * The above can only cope with straight arrays: no bitfields, - * swizzles, or half floats. - */ - - for (i = 0; i < Elements(translates); ++i) { - if (translates[i].format == from_format) { - /*LLVMTypeRef type = ll_type_to_llvm(translates[i].type);*/ - return fetch(gallivm, - vbuffer, - ll_type_size(translates[i].type), - translates[i].num_components, - translates[i].from); - } - } - - - /* - * This doesn't handle anything bigger than 32bits, or half floats - * yet. - * - * TODO: unify all this code into lp_build_fetch_rgba_aos(). 
- */ - - format_desc = util_format_description(from_format); - zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)); - return lp_build_fetch_rgba_aos(gallivm, format_desc, type, vbuffer, zero, zero, zero); -} diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h index d85deee..9cede21 100644 --- a/src/gallium/auxiliary/draw/draw_private.h +++ b/src/gallium/auxiliary/draw/draw_private.h @@ -47,8 +47,8 @@ #include "tgsi/tgsi_scan.h" #ifdef HAVE_LLVM -#include <llvm-c/ExecutionEngine.h> struct draw_llvm; +struct gallivm_state; #endif @@ -301,7 +301,6 @@ struct draw_context #ifdef HAVE_LLVM struct draw_llvm *llvm; - struct gallivm_state *own_gallivm; #endif struct pipe_sampler_view *sampler_views[PIPE_MAX_VERTEX_SAMPLERS]; diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c index 1e17f80..04b286f 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c @@ -230,7 +230,7 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle, llvm_vert_info.stride = fpme->vertex_size; llvm_vert_info.verts = (struct vertex_header *)MALLOC(fpme->vertex_size * - align(fetch_info->count, 4)); + align(fetch_info->count, lp_native_vector_width / 32)); if (!llvm_vert_info.verts) { assert(0); return; @@ -423,7 +423,7 @@ draw_pt_fetch_pipeline_or_emit_llvm(struct draw_context *draw) { struct llvm_middle_end *fpme = 0; - if (!draw->llvm || !draw->llvm->gallivm->engine) + if (!draw->llvm) return NULL; fpme = CALLOC_STRUCT( llvm_middle_end ); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index 9fc5762..d226dab 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -75,9 +75,9 @@ lp_build_min_simple(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef 
b) { - LLVMBuilderRef builder = bld->gallivm->builder; const struct lp_type type = bld->type; const char *intrinsic = NULL; + unsigned intr_size; LLVMValueRef cond; assert(lp_check_value(type, a)); @@ -85,31 +85,71 @@ lp_build_min_simple(struct lp_build_context *bld, /* TODO: optimize the constant case */ - if(type.width * type.length == 128) { - if(type.floating) { - if(type.width == 32 && util_cpu_caps.has_sse) + if (type.floating && util_cpu_caps.has_sse) { + if (type.width == 32) { + if (type.length == 1) { + intrinsic = "llvm.x86.sse.min.ss"; + intr_size = 128; + } + else if (type.length <= 4 || !util_cpu_caps.has_avx) { intrinsic = "llvm.x86.sse.min.ps"; - if(type.width == 64 && util_cpu_caps.has_sse2) + intr_size = 128; + } + else { + intrinsic = "llvm.x86.avx.min.ps.256"; + intr_size = 256; + } + } + if (type.width == 64 && util_cpu_caps.has_sse2) { + if (type.length == 1) { + intrinsic = "llvm.x86.sse2.min.sd"; + intr_size = 128; + } + else if (type.length == 2 || !util_cpu_caps.has_avx) { intrinsic = "llvm.x86.sse2.min.pd"; + intr_size = 128; + } + else { + intrinsic = "llvm.x86.avx.min.pd.256"; + intr_size = 256; + } } - else { - if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2) - intrinsic = "llvm.x86.sse2.pminu.b"; - if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1) + } + else if (util_cpu_caps.has_sse2 && type.length >= 2) { + intr_size = 128; + if ((type.width == 8 || type.width == 16) && + (type.width * type.length <= 64) && + (gallivm_debug & GALLIVM_DEBUG_PERF)) { + debug_printf("%s: inefficient code, bogus shuffle due to packing\n", + __FUNCTION__); + } + if (type.width == 8 && !type.sign) { + intrinsic = "llvm.x86.sse2.pminu.b"; + } + else if (type.width == 16 && type.sign) { + intrinsic = "llvm.x86.sse2.pmins.w"; + } + if (util_cpu_caps.has_sse4_1) { + if (type.width == 8 && type.sign) { intrinsic = "llvm.x86.sse41.pminsb"; - if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1) + } + if (type.width == 16 && 
!type.sign) { intrinsic = "llvm.x86.sse41.pminuw"; - if(type.width == 16 && type.sign && util_cpu_caps.has_sse2) - intrinsic = "llvm.x86.sse2.pmins.w"; - if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1) + } + if (type.width == 32 && !type.sign) { intrinsic = "llvm.x86.sse41.pminud"; - if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1) + } + if (type.width == 32 && type.sign) { intrinsic = "llvm.x86.sse41.pminsd"; + } } } - if(intrinsic) - return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); + if(intrinsic) { + return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, + type, + intr_size, a, b); + } cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); return lp_build_select(bld, cond, a, b); @@ -125,9 +165,9 @@ lp_build_max_simple(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b) { - LLVMBuilderRef builder = bld->gallivm->builder; const struct lp_type type = bld->type; const char *intrinsic = NULL; + unsigned intr_size; LLVMValueRef cond; assert(lp_check_value(type, a)); @@ -135,31 +175,72 @@ lp_build_max_simple(struct lp_build_context *bld, /* TODO: optimize the constant case */ - if(type.width * type.length == 128) { - if(type.floating) { - if(type.width == 32 && util_cpu_caps.has_sse) + if (type.floating && util_cpu_caps.has_sse) { + if (type.width == 32) { + if (type.length == 1) { + intrinsic = "llvm.x86.sse.max.ss"; + intr_size = 128; + } + else if (type.length <= 4 || !util_cpu_caps.has_avx) { intrinsic = "llvm.x86.sse.max.ps"; - if(type.width == 64 && util_cpu_caps.has_sse2) + intr_size = 128; + } + else { + intrinsic = "llvm.x86.avx.max.ps.256"; + intr_size = 256; + } + } + if (type.width == 64 && util_cpu_caps.has_sse2) { + if (type.length == 1) { + intrinsic = "llvm.x86.sse2.max.sd"; + intr_size = 128; + } + else if (type.length == 2 || !util_cpu_caps.has_avx) { intrinsic = "llvm.x86.sse2.max.pd"; + intr_size = 128; + } + else { + intrinsic = 
"llvm.x86.avx.max.pd.256"; + intr_size = 256; + } } - else { - if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2) - intrinsic = "llvm.x86.sse2.pmaxu.b"; - if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1) + } + else if (util_cpu_caps.has_sse2 && type.length >= 2) { + intr_size = 128; + if ((type.width == 8 || type.width == 16) && + (type.width * type.length <= 64) && + (gallivm_debug & GALLIVM_DEBUG_PERF)) { + debug_printf("%s: inefficient code, bogus shuffle due to packing\n", + __FUNCTION__); + } + if (type.width == 8 && !type.sign) { + intrinsic = "llvm.x86.sse2.pmaxu.b"; + intr_size = 128; + } + else if (type.width == 16 && type.sign) { + intrinsic = "llvm.x86.sse2.pmaxs.w"; + } + if (util_cpu_caps.has_sse4_1) { + if (type.width == 8 && type.sign) { intrinsic = "llvm.x86.sse41.pmaxsb"; - if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1) + } + if (type.width == 16 && !type.sign) { intrinsic = "llvm.x86.sse41.pmaxuw"; - if(type.width == 16 && type.sign && util_cpu_caps.has_sse2) - intrinsic = "llvm.x86.sse2.pmaxs.w"; - if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1) + } + if (type.width == 32 && !type.sign) { intrinsic = "llvm.x86.sse41.pmaxud"; - if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1) + } + if (type.width == 32 && type.sign) { intrinsic = "llvm.x86.sse41.pmaxsd"; + } } } - if(intrinsic) - return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); + if(intrinsic) { + return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, + type, + intr_size, a, b); + } cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); return lp_build_select(bld, cond, a, b); @@ -265,15 +346,20 @@ lp_build_add(struct lp_build_context *bld, } -/** Return the scalar sum of the elements of a */ +/** Return the scalar sum of the elements of a. + * Should avoid this operation whenever possible. 
+ */ LLVMValueRef -lp_build_sum_vector(struct lp_build_context *bld, - LLVMValueRef a) +lp_build_horizontal_add(struct lp_build_context *bld, + LLVMValueRef a) { LLVMBuilderRef builder = bld->gallivm->builder; const struct lp_type type = bld->type; LLVMValueRef index, res; - unsigned i; + unsigned i, length; + LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2]; + LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2]; + LLVMValueRef vecres, elem2; assert(lp_check_value(type, a)); @@ -283,26 +369,191 @@ lp_build_sum_vector(struct lp_build_context *bld, assert(!bld->type.norm); - index = lp_build_const_int32(bld->gallivm, 0); - res = LLVMBuildExtractElement(builder, a, index, ""); + /* + * for byte vectors can do much better with psadbw. + * Using repeated shuffle/adds here. Note with multiple vectors + * this can be done more efficiently as outlined in the intel + * optimization manual. + * Note: could cause data rearrangement if used with smaller element + * sizes. + */ - for (i = 1; i < type.length; i++) { - index = lp_build_const_int32(bld->gallivm, i); - if (type.floating) - res = LLVMBuildFAdd(builder, res, - LLVMBuildExtractElement(builder, - a, index, ""), - ""); - else - res = LLVMBuildAdd(builder, res, - LLVMBuildExtractElement(builder, - a, index, ""), - ""); + vecres = a; + length = type.length / 2; + while (length > 1) { + LLVMValueRef vec1, vec2; + for (i = 0; i < length; i++) { + shuffles1[i] = lp_build_const_int32(bld->gallivm, i); + shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length); + } + vec1 = LLVMBuildShuffleVector(builder, vecres, vecres, + LLVMConstVector(shuffles1, length), ""); + vec2 = LLVMBuildShuffleVector(builder, vecres, vecres, + LLVMConstVector(shuffles2, length), ""); + if (type.floating) { + vecres = LLVMBuildFAdd(builder, vec1, vec2, ""); + } + else { + vecres = LLVMBuildAdd(builder, vec1, vec2, ""); + } + length = length >> 1; } + /* always have vector of size 2 here */ + assert(length == 1); + + index = 
lp_build_const_int32(bld->gallivm, 0); + res = LLVMBuildExtractElement(builder, vecres, index, ""); + index = lp_build_const_int32(bld->gallivm, 1); + elem2 = LLVMBuildExtractElement(builder, vecres, index, ""); + + if (type.floating) + res = LLVMBuildFAdd(builder, res, elem2, ""); + else + res = LLVMBuildAdd(builder, res, elem2, ""); + return res; } +/** + * Return the horizontal sums of 4 float vectors as a float4 vector. + * This uses the technique as outlined in Intel Optimization Manual. + */ +static LLVMValueRef +lp_build_horizontal_add4x4f(struct lp_build_context *bld, + LLVMValueRef src[4]) +{ + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef shuffles[4]; + LLVMValueRef tmp[4]; + LLVMValueRef sumtmp[2], shuftmp[2]; + + /* lower half of regs */ + shuffles[0] = lp_build_const_int32(gallivm, 0); + shuffles[1] = lp_build_const_int32(gallivm, 1); + shuffles[2] = lp_build_const_int32(gallivm, 4); + shuffles[3] = lp_build_const_int32(gallivm, 5); + tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1], + LLVMConstVector(shuffles, 4), ""); + tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3], + LLVMConstVector(shuffles, 4), ""); + + /* upper half of regs */ + shuffles[0] = lp_build_const_int32(gallivm, 2); + shuffles[1] = lp_build_const_int32(gallivm, 3); + shuffles[2] = lp_build_const_int32(gallivm, 6); + shuffles[3] = lp_build_const_int32(gallivm, 7); + tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1], + LLVMConstVector(shuffles, 4), ""); + tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3], + LLVMConstVector(shuffles, 4), ""); + + sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], ""); + sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], ""); + + shuffles[0] = lp_build_const_int32(gallivm, 0); + shuffles[1] = lp_build_const_int32(gallivm, 2); + shuffles[2] = lp_build_const_int32(gallivm, 4); + shuffles[3] = lp_build_const_int32(gallivm, 6); + shuftmp[0] = LLVMBuildShuffleVector(builder, 
sumtmp[0], sumtmp[1], + LLVMConstVector(shuffles, 4), ""); + + shuffles[0] = lp_build_const_int32(gallivm, 1); + shuffles[1] = lp_build_const_int32(gallivm, 3); + shuffles[2] = lp_build_const_int32(gallivm, 5); + shuffles[3] = lp_build_const_int32(gallivm, 7); + shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1], + LLVMConstVector(shuffles, 4), ""); + + return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], ""); +} + + +/* + * partially horizontally add 2-4 float vectors with length nx4, + * i.e. only four adjacent values in each vector will be added, + * assuming values are really grouped in 4 which also determines + * output order. + * + * Return a vector of the same length as the initial vectors, + * with the excess elements (if any) being undefined. + * The element order is independent of number of input vectors. + * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7 + * the output order thus will be + * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef + */ +LLVMValueRef +lp_build_hadd_partial4(struct lp_build_context *bld, + LLVMValueRef vectors[], + unsigned num_vecs) +{ + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef ret_vec; + LLVMValueRef tmp[4]; + const char *intrinsic = NULL; + + assert(num_vecs >= 2 && num_vecs <= 4); + assert(bld->type.floating); + + /* only use this with at least 2 vectors, as it is sort of expensive + * (depending on cpu) and we always need two horizontal adds anyway, + * so a shuffle/add approach might be better. + */ + + tmp[0] = vectors[0]; + tmp[1] = vectors[1]; + + tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0]; + tmp[3] = num_vecs > 3 ? 
vectors[3] : vectors[0]; + + if (util_cpu_caps.has_sse3 && bld->type.width == 32 && + bld->type.length == 4) { + intrinsic = "llvm.x86.sse3.hadd.ps"; + } + else if (util_cpu_caps.has_avx && bld->type.width == 32 && + bld->type.length == 8) { + intrinsic = "llvm.x86.avx.hadd.ps.256"; + } + if (intrinsic) { + tmp[0] = lp_build_intrinsic_binary(builder, intrinsic, + lp_build_vec_type(gallivm, bld->type), + tmp[0], tmp[1]); + if (num_vecs > 2) { + tmp[1] = lp_build_intrinsic_binary(builder, intrinsic, + lp_build_vec_type(gallivm, bld->type), + tmp[2], tmp[3]); + } + else { + tmp[1] = tmp[0]; + } + return lp_build_intrinsic_binary(builder, intrinsic, + lp_build_vec_type(gallivm, bld->type), + tmp[0], tmp[1]); + } + + if (bld->type.length == 4) { + ret_vec = lp_build_horizontal_add4x4f(bld, tmp); + } + else { + LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4]; + unsigned j; + unsigned num_iter = bld->type.length / 4; + struct lp_type parttype = bld->type; + parttype.length = 4; + for (j = 0; j < num_iter; j++) { + LLVMValueRef partsrc[4]; + unsigned i; + for (i = 0; i < 4; i++) { + partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4); + } + partres[j] = lp_build_horizontal_add4x4f(bld, partsrc); + } + ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter); + } + return ret_vec; +} /** * Generate a - b @@ -553,7 +804,7 @@ lp_build_mul_imm(struct lp_build_context *bld, if(bld->type.floating) { #if 0 /* - * Power of two multiplication by directly manipulating the mantissa. + * Power of two multiplication by directly manipulating the exponent. 
* * XXX: This might not be always faster, it will introduce a small error * for multiplication by zero, and it will produce wrong results @@ -612,7 +863,8 @@ lp_build_div(struct lp_build_context *bld, return LLVMConstUDiv(a, b); } - if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4 && + if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || + (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) && type.floating) return lp_build_mul(bld, a, lp_build_rcp(bld, b)); @@ -871,6 +1123,12 @@ lp_build_abs(struct lp_build_context *bld, return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a); } } + else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 && + (gallivm_debug & GALLIVM_DEBUG_PERF) && + (type.width == 8 || type.width == 16 || type.width == 32)) { + debug_printf("%s: inefficient code, should split vectors manually\n", + __FUNCTION__); + } return lp_build_max(bld, a, LLVMBuildNeg(builder, a, "")); } @@ -934,6 +1192,7 @@ lp_build_sgn(struct lp_build_context *bld, else { /* signed int/norm/fixed point */ + /* could use psign with sse3 and appropriate vectors here */ LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0); cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero); res = lp_build_select(bld, cond, bld->one, minus_one); @@ -1000,7 +1259,16 @@ lp_build_int_to_float(struct lp_build_context *bld, return LLVMBuildSIToFP(builder, a, vec_type, ""); } +static boolean +sse41_rounding_available(const struct lp_type type) +{ + if ((util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) || + (util_cpu_caps.has_avx && type.width*type.length == 256)) + return TRUE; + return FALSE; +} enum lp_build_round_sse41_mode { @@ -1065,18 +1333,34 @@ lp_build_round_sse41(struct lp_build_context *bld, res = LLVMBuildExtractElement(builder, res, index0, ""); } else { - assert(type.width*type.length == 128); - - switch(type.width) { - case 32: - intrinsic = 
"llvm.x86.sse41.round.ps"; - break; - case 64: - intrinsic = "llvm.x86.sse41.round.pd"; - break; - default: - assert(0); - return bld->undef; + if (type.width * type.length == 128) { + switch(type.width) { + case 32: + intrinsic = "llvm.x86.sse41.round.ps"; + break; + case 64: + intrinsic = "llvm.x86.sse41.round.pd"; + break; + default: + assert(0); + return bld->undef; + } + } + else { + assert(type.width * type.length == 256); + assert(util_cpu_caps.has_avx); + + switch(type.width) { + case 32: + intrinsic = "llvm.x86.avx.round.ps.256"; + break; + case 64: + intrinsic = "llvm.x86.avx.round.pd.256"; + break; + default: + assert(0); + return bld->undef; + } } res = lp_build_intrinsic_binary(builder, intrinsic, @@ -1125,10 +1409,15 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld, ret_type, arg); } else { - assert(type.width*type.length == 128); - - intrinsic = "llvm.x86.sse2.cvtps2dq"; + if (type.width* type.length == 128) { + intrinsic = "llvm.x86.sse2.cvtps2dq"; + } + else { + assert(type.width*type.length == 256); + assert(util_cpu_caps.has_avx); + intrinsic = "llvm.x86.avx.cvt.ps2dq.256"; + } res = lp_build_intrinsic_unary(builder, intrinsic, ret_type, a); } @@ -1152,8 +1441,7 @@ lp_build_trunc(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE); } else { @@ -1183,8 +1471,7 @@ lp_build_round(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); } else { @@ -1212,8 +1499,7 @@ lp_build_floor(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if 
(util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); } else { @@ -1241,8 +1527,7 @@ lp_build_ceil(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); } else { @@ -1269,6 +1554,34 @@ lp_build_fract(struct lp_build_context *bld, /** + * Prevent returning a fractional part of 1.0 for very small negative values of + * 'a' by clamping against 0.99999(9). + */ +static inline LLVMValueRef +clamp_fract(struct lp_build_context *bld, LLVMValueRef fract) +{ + LLVMValueRef max; + + /* this is the largest number smaller than 1.0 representable as float */ + max = lp_build_const_vec(bld->gallivm, bld->type, + 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1))); + return lp_build_min(bld, fract, max); +} + + +/** + * Same as lp_build_fract, but guarantees that the result is always smaller + * than one. + */ +LLVMValueRef +lp_build_fract_safe(struct lp_build_context *bld, + LLVMValueRef a) +{ + return clamp_fract(bld, lp_build_fract(bld, a)); +} + + +/** * Return the integer part of a float (vector) value (== round toward zero). * The returned value is an integer (vector). 
* Ex: itrunc(-1.5) = -1 @@ -1307,12 +1620,12 @@ lp_build_iround(struct lp_build_context *bld, assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse2 && - ((type.width == 32) && (type.length == 1 || type.length == 4))) { + if ((util_cpu_caps.has_sse2 && + ((type.width == 32) && (type.length == 1 || type.length == 4))) || + (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { return lp_build_iround_nearest_sse2(bld, a); } - else if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); } else { @@ -1362,14 +1675,12 @@ lp_build_ifloor(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { - res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); - } - else { - res = a; - - if (type.sign) { + res = a; + if (type.sign) { + if (sse41_rounding_available(type)) { + res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); + } + else { /* Take the sign bit and add it to 1 constant */ LLVMTypeRef vec_type = bld->vec_type; unsigned mantissa = lp_mantissa(type); @@ -1423,8 +1734,7 @@ lp_build_iceil(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); } else { @@ -1470,7 +1780,7 @@ lp_build_iceil(struct lp_build_context *bld, * Combined ifloor() & fract(). * * Preferred to calling the functions separately, as it will ensure that the - * stratergy (floor() vs ifloor()) that results in less redundant work is used. + * strategy (floor() vs ifloor()) that results in less redundant work is used. 
*/ void lp_build_ifloor_fract(struct lp_build_context *bld, @@ -1485,8 +1795,7 @@ lp_build_ifloor_fract(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { /* * floor() is easier. */ @@ -1507,6 +1816,21 @@ lp_build_ifloor_fract(struct lp_build_context *bld, } +/** + * Same as lp_build_ifloor_fract, but guarantees that the fractional part is + * always smaller than one. + */ +void +lp_build_ifloor_fract_safe(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef *out_ipart, + LLVMValueRef *out_fpart) +{ + lp_build_ifloor_fract(bld, a, out_ipart, out_fpart); + *out_fpart = clamp_fract(bld, *out_fpart); +} + + LLVMValueRef lp_build_sqrt(struct lp_build_context *bld, LLVMValueRef a) @@ -1519,10 +1843,14 @@ lp_build_sqrt(struct lp_build_context *bld, assert(lp_check_value(type, a)); /* TODO: optimize the constant case */ - /* TODO: optimize the constant case */ assert(type.floating); - util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width); + if (type.length == 1) { + util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width); + } + else { + util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width); + } return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); } @@ -1586,19 +1914,28 @@ lp_build_rcp(struct lp_build_context *bld, * - it doesn't even get the reciprocate of 1.0 exactly * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf * - for recent processors the benefit over DIVPS is marginal, a case - * depedent + * dependent * * We could still use it on certain processors if benchmarks show that the * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for * particular uses that require less workarounds. 
*/ - if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || + (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){ const unsigned num_iterations = 0; LLVMValueRef res; unsigned i; + const char *intrinsic = NULL; - res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a); + if (type.length == 4) { + intrinsic = "llvm.x86.sse.rcp.ps"; + } + else { + intrinsic = "llvm.x86.avx.rcp.ps.256"; + } + + res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); for (i = 0; i < num_iterations; ++i) { res = lp_build_rcp_refine(bld, a, res); @@ -1653,12 +1990,22 @@ lp_build_rsqrt(struct lp_build_context *bld, assert(type.floating); - if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || + (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { const unsigned num_iterations = 1; LLVMValueRef res; unsigned i; + const char *intrinsic = NULL; + + if (type.length == 4) { + intrinsic = "llvm.x86.sse.rsqrt.ps"; + } + else { + intrinsic = "llvm.x86.avx.rsqrt.ps.256"; + } + + res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); - res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a); for (i = 0; i < num_iterations; ++i) { res = lp_build_rsqrt_refine(bld, a, res); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h index aeb987f..60b9907 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h @@ -57,8 +57,13 @@ lp_build_add(struct lp_build_context *bld, LLVMValueRef b); LLVMValueRef -lp_build_sum_vector(struct lp_build_context *bld, - LLVMValueRef a); +lp_build_horizontal_add(struct lp_build_context *bld, + LLVMValueRef a); + +LLVMValueRef +lp_build_hadd_partial4(struct lp_build_context *bld, 
+ LLVMValueRef vectors[], + unsigned num_vecs); LLVMValueRef lp_build_sub(struct lp_build_context *bld, @@ -157,6 +162,10 @@ lp_build_fract(struct lp_build_context *bld, LLVMValueRef a); LLVMValueRef +lp_build_fract_safe(struct lp_build_context *bld, + LLVMValueRef a); + +LLVMValueRef lp_build_ifloor(struct lp_build_context *bld, LLVMValueRef a); LLVMValueRef @@ -177,6 +186,12 @@ lp_build_ifloor_fract(struct lp_build_context *bld, LLVMValueRef *out_ipart, LLVMValueRef *out_fpart); +void +lp_build_ifloor_fract_safe(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef *out_ipart, + LLVMValueRef *out_fpart); + LLVMValueRef lp_build_sqrt(struct lp_build_context *bld, LLVMValueRef a); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.c b/src/gallium/auxiliary/gallivm/lp_bld_const.c index 59e8fb2..35799a1 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_const.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_const.c @@ -37,6 +37,7 @@ #include "util/u_debug.h" #include "util/u_math.h" +#include "util/u_half.h" #include "lp_bld_type.h" #include "lp_bld_const.h" @@ -50,10 +51,12 @@ lp_mantissa(struct lp_type type) if(type.floating) { switch(type.width) { + case 16: + return 10; case 32: return 23; case 64: - return 53; + return 52; default: assert(0); return 0; @@ -136,6 +139,8 @@ lp_const_min(struct lp_type type) if (type.floating) { switch(type.width) { + case 16: + return -65504; case 32: return -FLT_MAX; case 64: @@ -169,6 +174,8 @@ lp_const_max(struct lp_type type) if (type.floating) { switch(type.width) { + case 16: + return 65504; case 32: return FLT_MAX; case 64: @@ -196,6 +203,8 @@ lp_const_eps(struct lp_type type) { if (type.floating) { switch(type.width) { + case 16: + return 2E-10; case 32: return FLT_EPSILON; case 64: @@ -247,7 +256,9 @@ lp_build_one(struct gallivm_state *gallivm, struct lp_type type) elem_type = lp_build_elem_type(gallivm, type); - if(type.floating) + if(type.floating && type.width == 16) + elems[0] = LLVMConstInt(elem_type, 
util_float_to_half(1.0f), 0); + else if(type.floating) elems[0] = LLVMConstReal(elem_type, 1.0); else if(type.fixed) elems[0] = LLVMConstInt(elem_type, 1LL << (type.width/2), 0); @@ -292,7 +303,9 @@ lp_build_const_elem(struct gallivm_state *gallivm, LLVMTypeRef elem_type = lp_build_elem_type(gallivm, type); LLVMValueRef elem; - if(type.floating) { + if(type.floating && type.width == 16) { + elem = LLVMConstInt(elem_type, util_float_to_half((float)val), 0); + } else if(type.floating) { elem = LLVMConstReal(elem_type, val); } else { @@ -364,20 +377,10 @@ lp_build_const_aos(struct gallivm_state *gallivm, if(swizzle == NULL) swizzle = default_swizzle; - if(type.floating) { - elems[swizzle[0]] = LLVMConstReal(elem_type, r); - elems[swizzle[1]] = LLVMConstReal(elem_type, g); - elems[swizzle[2]] = LLVMConstReal(elem_type, b); - elems[swizzle[3]] = LLVMConstReal(elem_type, a); - } - else { - double dscale = lp_const_scale(type); - - elems[swizzle[0]] = LLVMConstInt(elem_type, round(r*dscale), 0); - elems[swizzle[1]] = LLVMConstInt(elem_type, round(g*dscale), 0); - elems[swizzle[2]] = LLVMConstInt(elem_type, round(b*dscale), 0); - elems[swizzle[3]] = LLVMConstInt(elem_type, round(a*dscale), 0); - } + elems[swizzle[0]] = lp_build_const_elem(gallivm, type, r); + elems[swizzle[1]] = lp_build_const_elem(gallivm, type, g); + elems[swizzle[2]] = lp_build_const_elem(gallivm, type, b); + elems[swizzle[3]] = lp_build_const_elem(gallivm, type, a); for(i = 4; i < type.length; ++i) elems[i] = elems[i % 4]; @@ -452,7 +455,7 @@ lp_build_const_string(struct gallivm_state *gallivm, /** * Build a callable function pointer. * - * We this casts instead of LLVMAddGlobalMapping() + * We use function pointer constants instead of LLVMAddGlobalMapping() * to work around a bug in LLVM 2.6, and for efficiency/simplicity. 
*/ LLVMValueRef diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c index 0973e1f..0399709 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c @@ -70,6 +70,66 @@ #include "lp_bld_arit.h" #include "lp_bld_pack.h" #include "lp_bld_conv.h" +#include "lp_bld_logic.h" + + +/** + * Converts int16 half-float to float32 + * Note this can be performed in 1 instruction if vcvtph2ps exists (sse5 i think?) + * [llvm.x86.vcvtph2ps / _mm_cvtph_ps] + * + * @param src_type <vector> type of int16 + * @param src value to convert + * + * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/ + */ +LLVMValueRef +lp_build_half_to_float(struct gallivm_state *gallivm, + struct lp_type src_type, + LLVMValueRef src) +{ + struct lp_type f32_type = lp_type_float_vec(32, 32 * src_type.length); + struct lp_type i32_type = lp_type_int_vec(32, 32 * src_type.length); + + LLVMBuilderRef builder = gallivm->builder; + LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type); + LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type); + + /* Constants */ + LLVMValueRef i32_13 = lp_build_const_int_vec(gallivm, i32_type, 13); + LLVMValueRef i32_16 = lp_build_const_int_vec(gallivm, i32_type, 16); + LLVMValueRef i32_mask_nosign = lp_build_const_int_vec(gallivm, i32_type, 0x7fff); + LLVMValueRef i32_was_infnan = lp_build_const_int_vec(gallivm, i32_type, 0x7bff); + LLVMValueRef i32_exp_infnan = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23); + LLVMValueRef f32_magic = LLVMBuildBitCast(builder, + lp_build_const_int_vec(gallivm, i32_type, (254 - 15) << 23), + float_vec_type, ""); + + /* Convert int16 vector to int32 vector by zero ext */ + LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, ""); + + /* Exponent / mantissa bits */ + LLVMValueRef expmant = LLVMBuildAnd(builder, i32_mask_nosign, h, ""); + LLVMValueRef shifted = LLVMBuildBitCast(builder, 
LLVMBuildShl(builder, expmant, i32_13, ""), float_vec_type, ""); + + /* Exponent adjust */ + LLVMValueRef scaled = LLVMBuildBitCast(builder, LLVMBuildFMul(builder, shifted, f32_magic, ""), int_vec_type, ""); + + /* Make sure Inf/NaN survive */ + LLVMValueRef b_wasinfnan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, expmant, i32_was_infnan); + LLVMValueRef infnanexp = LLVMBuildAnd(builder, b_wasinfnan, i32_exp_infnan, ""); + + /* Sign bit */ + LLVMValueRef justsign = LLVMBuildXor(builder, h, expmant, ""); + LLVMValueRef sign = LLVMBuildShl(builder, justsign, i32_16, ""); + + /* Combine result */ + LLVMValueRef sign_inf = LLVMBuildOr(builder, sign, infnanexp, ""); + LLVMValueRef final = LLVMBuildOr(builder, scaled, sign_inf, ""); + + /* Cast from int32 vector to float32 vector */ + return LLVMBuildBitCast(builder, final, float_vec_type, ""); +} /** @@ -334,6 +394,8 @@ lp_build_conv(struct gallivm_state *gallivm, dst_type.width == 8 && dst_type.length == 16 && + 4 * num_dsts == num_srcs && + util_cpu_caps.has_sse2) { struct lp_build_context bld; @@ -371,6 +433,76 @@ lp_build_conv(struct gallivm_state *gallivm, return; } + /* Special case 2x8f --> 1x16ub + */ + else if (src_type.floating == 1 && + src_type.fixed == 0 && + src_type.sign == 1 && + src_type.norm == 0 && + src_type.width == 32 && + src_type.length == 8 && + + dst_type.floating == 0 && + dst_type.fixed == 0 && + dst_type.sign == 0 && + dst_type.norm == 1 && + dst_type.width == 8 && + dst_type.length == 16 && + + 2 * num_dsts == num_srcs && + + util_cpu_caps.has_avx) { + + struct lp_build_context bld; + struct lp_type int16_type = dst_type; + struct lp_type int32_type = dst_type; + LLVMValueRef const_255f; + unsigned i; + + lp_build_context_init(&bld, gallivm, src_type); + + int16_type.width *= 2; + int16_type.length /= 2; + int16_type.sign = 1; + + int32_type.width *= 4; + int32_type.length /= 4; + int32_type.sign = 1; + + const_255f = lp_build_const_vec(gallivm, src_type, 255.0f); + + for (i = 
0; i < num_dsts; ++i, src += 2) { + LLVMValueRef lo, hi, a, b; + + a = LLVMBuildFMul(builder, src[0], const_255f, ""); + b = LLVMBuildFMul(builder, src[1], const_255f, ""); + + a = lp_build_iround(&bld, a); + b = lp_build_iround(&bld, b); + + tmp[0] = lp_build_extract_range(gallivm, a, 0, 4); + tmp[1] = lp_build_extract_range(gallivm, a, 4, 4); + tmp[2] = lp_build_extract_range(gallivm, b, 0, 4); + tmp[3] = lp_build_extract_range(gallivm, b, 4, 4); + + /* relying on clamping behavior of sse2 intrinsics here */ + lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]); + hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]); + dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi); + } + return; + } + + /* Pre convert half-floats to floats + */ + else if (src_type.floating && src_type.width == 16) + { + for(i = 0; i < num_tmps; ++i) + tmp[i] = lp_build_half_to_float(gallivm, src_type, tmp[i]); + + tmp_type.width = 32; + } + /* * Clamp if necessary */ @@ -580,7 +712,7 @@ lp_build_conv(struct gallivm_state *gallivm, * This will convert the integer masks that match the given types. * * The mask values should 0 or -1, i.e., all bits either set to zero or one. - * Any other value will likely cause in unpredictable results. + * Any other value will likely cause unpredictable results. * * This is basically a very trimmed down version of lp_build_conv. */ @@ -591,8 +723,6 @@ lp_build_conv_mask(struct gallivm_state *gallivm, const LLVMValueRef *src, unsigned num_srcs, LLVMValueRef *dst, unsigned num_dsts) { - /* Register width must remain constant */ - assert(src_type.width * src_type.length == dst_type.width * dst_type.length); /* We must not loose or gain channels. 
Only precision */ assert(src_type.length * num_srcs == dst_type.length * num_dsts); @@ -617,16 +747,5 @@ lp_build_conv_mask(struct gallivm_state *gallivm, * Truncate or expand bit width */ - if(src_type.width > dst_type.width) { - assert(num_dsts == 1); - dst[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs); - } - else if(src_type.width < dst_type.width) { - assert(num_srcs == 1); - lp_build_unpack(gallivm, src_type, dst_type, src[0], dst, num_dsts); - } - else { - assert(num_srcs == num_dsts); - memcpy(dst, src, num_dsts * sizeof *dst); - } + lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.h b/src/gallium/auxiliary/gallivm/lp_bld_conv.h index cec6559..c830fbe 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h @@ -42,6 +42,10 @@ struct lp_type; +LLVMValueRef +lp_build_half_to_float(struct gallivm_state *gallivm, + struct lp_type src_type, + LLVMValueRef src); LLVMValueRef lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp index 444b70a..93505f3 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp +++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp @@ -35,10 +35,8 @@ #if HAVE_LLVM >= 0x0300 #include <llvm/Support/TargetRegistry.h> -#include <llvm/Support/TargetSelect.h> #else /* HAVE_LLVM < 0x0300 */ #include <llvm/Target/TargetRegistry.h> -#include <llvm/Target/TargetSelect.h> #endif /* HAVE_LLVM < 0x0300 */ #if HAVE_LLVM >= 0x0209 @@ -183,7 +181,7 @@ lp_disassemble(const void* func) /* * Limit disassembly to this extent */ - const uint64_t extent = 0x10000; + const uint64_t extent = 96 * 1024; uint64_t max_pc = 0; @@ -200,24 +198,6 @@ lp_disassemble(const void* func) std::string Error; const Target *T = TargetRegistry::lookupTarget(Triple, Error); -#if HAVE_LLVM >= 0x0208 
- InitializeNativeTargetAsmPrinter(); -#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) - LLVMInitializeX86AsmPrinter(); -#elif defined(PIPE_ARCH_ARM) - LLVMInitializeARMAsmPrinter(); -#elif defined(PIPE_ARCH_PPC) - LLVMInitializePowerPCAsmPrinter(); -#endif - -#if HAVE_LLVM >= 0x0301 - InitializeNativeTargetDisassembler(); -#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) - LLVMInitializeX86Disassembler(); -#elif defined(PIPE_ARCH_ARM) - LLVMInitializeARMDisassembler(); -#endif - #if HAVE_LLVM >= 0x0300 OwningPtr<const MCAsmInfo> AsmInfo(T->createMCAsmInfo(Triple)); #else diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c index d2b3713..30da44e 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c @@ -131,6 +131,15 @@ lp_build_mask_check(struct lp_build_mask_context *mask) value = lp_build_mask_value(mask); + /* + * XXX this doesn't quite generate the most efficient code possible, if + * the masks are vectors which have all bits set to the same value + * in each element. + * movmskps/pmovmskb would be more efficient to get the required value + * into ordinary reg (certainly with 8 floats). + * Not sure if llvm could figure that out on its own. 
+ */ + /* cond = (mask == 0) */ cond = LLVMBuildICmp(builder, LLVMIntEQ, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h index 04142d9..3608a68 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h @@ -67,6 +67,13 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, LLVMValueRef i, LLVMValueRef j); +LLVMValueRef +lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + struct lp_type type, + LLVMValueRef base_ptr, + LLVMValueRef offset); + /* * SoA diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c index e4b8da6..9591bcf 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c @@ -470,6 +470,11 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, return lp_build_format_swizzle_aos(format_desc, &bld, res); } + /* If all channels are of same type and we are not using half-floats */ + if (util_format_is_array(format_desc)) { + return lp_build_fetch_rgba_aos_array(gallivm, format_desc, type, base_ptr, offset); + } + /* * YUV / subsampled formats */ @@ -601,7 +606,6 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, return res; } - /* * Fallback to util_format_description::fetch_rgba_float(). */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c new file mode 100644 index 0000000..b8ec379 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c @@ -0,0 +1,102 @@ +/************************************************************************** + * + * Copyright 2012 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "lp_bld_const.h" +#include "lp_bld_struct.h" +#include "lp_bld_format.h" +#include "lp_bld_debug.h" +#include "lp_bld_type.h" +#include "lp_bld_conv.h" +#include "lp_bld_pack.h" + +#include "util/u_memory.h" +#include "util/u_format.h" +#include "pipe/p_state.h" + +/** + * @brief lp_build_fetch_rgba_aos_array + * + * \param format_desc describes format of the image we're fetching from + * \param dst_type output type + * \param base_ptr address of the pixel block (or the texel if uncompressed) + * \param offset ptr offset + */ +LLVMValueRef +lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + struct lp_type dst_type, + LLVMValueRef base_ptr, + LLVMValueRef offset) +{ + struct lp_build_context bld; + LLVMBuilderRef builder = gallivm->builder; + LLVMTypeRef src_elem_type, src_vec_type; + LLVMValueRef ptr, res = NULL; + struct lp_type src_type; + + memset(&src_type, 0, sizeof src_type); + src_type.floating = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT; + src_type.fixed = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FIXED; + src_type.sign = format_desc->channel[0].type != UTIL_FORMAT_TYPE_UNSIGNED; + src_type.norm = format_desc->channel[0].normalized; + src_type.width = format_desc->channel[0].size; + src_type.length = format_desc->nr_channels; + + assert(src_type.length <= dst_type.length); + + src_elem_type = lp_build_elem_type(gallivm, src_type); + src_vec_type = lp_build_vec_type(gallivm, src_type); + + /* Read whole vector from memory, unaligned */ + if (!res) { + ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, ""); + ptr = LLVMBuildPointerCast(builder, ptr, LLVMPointerType(src_vec_type, 0), ""); + res = LLVMBuildLoad(builder, ptr, ""); + lp_set_load_alignment(res, src_type.width / 8); + } + + /* Truncate doubles to float */ + if (src_type.floating && src_type.width == 64) { + src_type.width = 
32; + src_vec_type = lp_build_vec_type(gallivm, src_type); + + res = LLVMBuildFPTrunc(builder, res, src_vec_type, ""); + } + + /* Expand to correct length */ + if (src_type.length < dst_type.length) { + res = lp_build_pad_vector(gallivm, res, src_type, dst_type.length); + src_type.length = dst_type.length; + } + + /* Convert to correct format */ + lp_build_conv(gallivm, src_type, dst_type, &res, 1, &res, 1); + + /* Swizzle it */ + lp_build_context_init(&bld, gallivm, dst_type); + return lp_build_format_swizzle_aos(format_desc, &bld, res); +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c index 0a57b3c..afeb340 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c @@ -359,7 +359,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, */ if (util_format_fits_8unorm(format_desc) && - type.floating && type.width == 32 && type.length == 4) { + type.floating && type.width == 32 && + (type.length == 1 || (type.length % 4 == 0))) { struct lp_type tmp_type; LLVMValueRef tmp; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c index ccc8320..f77eb12 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c @@ -84,7 +84,7 @@ uyvy_to_yuv_soa(struct gallivm_state *gallivm, * per element. Didn't measure performance but cuts shader size * by quite a bit (less difference if cpu has no sse4.1 support). */ - if (util_cpu_caps.has_sse2 && n == 4) { + if (util_cpu_caps.has_sse2 && n > 1) { LLVMValueRef sel, tmp, tmp2; struct lp_build_context bld32; @@ -152,7 +152,7 @@ yuyv_to_yuv_soa(struct gallivm_state *gallivm, * per element. Didn't measure performance but cuts shader size * by quite a bit (less difference if cpu has no sse4.1 support). 
*/ - if (util_cpu_caps.has_sse2 && n == 4) { + if (util_cpu_caps.has_sse2 && n > 1) { LLVMValueRef sel, tmp; struct lp_build_context bld32; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c index 768d935..5bf4bcf 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c @@ -26,15 +26,44 @@ **************************************************************************/ +#include "pipe/p_config.h" #include "pipe/p_compiler.h" #include "util/u_cpu_detect.h" #include "util/u_debug.h" #include "util/u_memory.h" #include "util/u_simple_list.h" +#include "lp_bld.h" #include "lp_bld_debug.h" +#include "lp_bld_misc.h" #include "lp_bld_init.h" +#include <llvm-c/Analysis.h> #include <llvm-c/Transforms/Scalar.h> +#include <llvm-c/BitWriter.h> + + +/** + * AVX is supported in: + * - standard JIT from LLVM 3.2 onwards + * - MC-JIT from LLVM 3.1 + * - MC-JIT supports limited OSes (MacOSX and Linux) + * - standard JIT in LLVM 3.1, with backports + */ +#if HAVE_LLVM >= 0x0301 && (defined(PIPE_OS_LINUX) || defined(PIPE_OS_APPLE)) +# define USE_MCJIT 1 +# define HAVE_AVX 1 +#elif HAVE_LLVM >= 0x0302 || (HAVE_LLVM == 0x0301 && defined(HAVE_JIT_AVX_SUPPORT)) +# define USE_MCJIT 0 +# define HAVE_AVX 1 +#else +# define USE_MCJIT 0 +# define HAVE_AVX 0 +#endif + + +#if USE_MCJIT +void LLVMLinkInMCJIT(); +#endif #ifdef DEBUG @@ -57,6 +86,8 @@ DEBUG_GET_ONCE_FLAGS_OPTION(gallivm_debug, "GALLIVM_DEBUG", lp_bld_debug_flags, static boolean gallivm_initialized = FALSE; +unsigned lp_native_vector_width; + /* * Optimization values are: @@ -81,25 +112,13 @@ enum LLVM_CodeGenOpt_Level { }; +#if HAVE_LLVM <= 0x0206 /** - * LLVM 2.6 permits only one ExecutionEngine to be created. This is it. - */ -static LLVMExecutionEngineRef GlobalEngine = NULL; - -/** - * Same gallivm state shared by all contexts. + * LLVM 2.6 permits only one ExecutionEngine to be created. 
So use the + * same gallivm state everywhere. */ static struct gallivm_state *GlobalGallivm = NULL; - - - - -extern void -lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE); - -extern void -lp_set_target_options(void); - +#endif /** @@ -111,6 +130,7 @@ static boolean create_pass_manager(struct gallivm_state *gallivm) { assert(!gallivm->passmgr); + assert(gallivm->target); gallivm->passmgr = LLVMCreateFunctionPassManager(gallivm->provider); if (!gallivm->passmgr) @@ -174,33 +194,37 @@ free_gallivm_state(struct gallivm_state *gallivm) &mod, &error); #endif + if (gallivm->passmgr) { + LLVMDisposePassManager(gallivm->passmgr); + } + #if 0 /* XXX this seems to crash with all versions of LLVM */ if (gallivm->provider) LLVMDisposeModuleProvider(gallivm->provider); #endif - if (gallivm->passmgr) - LLVMDisposePassManager(gallivm->passmgr); - -#if HAVE_LLVM >= 0x207 - if (gallivm->module) - LLVMDisposeModule(gallivm->module); -#endif - -#if 0 - /* Don't free the exec engine, it's a global/singleton */ - if (gallivm->engine) + if (HAVE_LLVM >= 0x207 && gallivm->engine) { + /* This will already destroy any associated module */ LLVMDisposeExecutionEngine(gallivm->engine); -#endif + } else { + LLVMDisposeModule(gallivm->module); + } -#if 0 +#if !USE_MCJIT /* Don't free the TargetData, it's owned by the exec engine */ - LLVMDisposeTargetData(gallivm->target); +#else + if (gallivm->target) { + LLVMDisposeTargetData(gallivm->target); + } #endif + /* Never free the LLVM context. + */ +#if 0 if (gallivm->context) LLVMContextDispose(gallivm->context); +#endif if (gallivm->builder) LLVMDisposeBuilder(gallivm->builder); @@ -215,37 +239,14 @@ free_gallivm_state(struct gallivm_state *gallivm) } -/** - * Allocate gallivm LLVM objects. 
- * \return TRUE for success, FALSE for failure - */ static boolean -init_gallivm_state(struct gallivm_state *gallivm) +init_gallivm_engine(struct gallivm_state *gallivm) { - assert(!gallivm->context); - assert(!gallivm->module); - assert(!gallivm->provider); - - lp_build_init(); - - gallivm->context = LLVMContextCreate(); - if (!gallivm->context) - goto fail; - - gallivm->module = LLVMModuleCreateWithNameInContext("gallivm", - gallivm->context); - if (!gallivm->module) - goto fail; - - gallivm->provider = - LLVMCreateModuleProviderForExistingModule(gallivm->module); - if (!gallivm->provider) - goto fail; - - if (!GlobalEngine) { + if (1) { /* We can only create one LLVMExecutionEngine (w/ LLVM 2.6 anyway) */ enum LLVM_CodeGenOpt_Level optlevel; char *error = NULL; + int ret; if (gallivm_debug & GALLIVM_DEBUG_NO_OPT) { optlevel = None; @@ -254,135 +255,162 @@ init_gallivm_state(struct gallivm_state *gallivm) optlevel = Default; } - if (LLVMCreateJITCompiler(&GlobalEngine, gallivm->provider, - (unsigned) optlevel, &error)) { +#if USE_MCJIT + ret = lp_build_create_mcjit_compiler_for_module(&gallivm->engine, + gallivm->module, + (unsigned) optlevel, + &error); +#else + ret = LLVMCreateJITCompiler(&gallivm->engine, gallivm->provider, + (unsigned) optlevel, &error); +#endif + if (ret) { _debug_printf("%s\n", error); LLVMDisposeMessage(error); goto fail; } #if defined(DEBUG) || defined(PROFILE) - lp_register_oprofile_jit_event_listener(GlobalEngine); + lp_register_oprofile_jit_event_listener(gallivm->engine); #endif } - gallivm->engine = GlobalEngine; - LLVMAddModuleProvider(gallivm->engine, gallivm->provider);//new +#if !USE_MCJIT gallivm->target = LLVMGetExecutionEngineTargetData(gallivm->engine); if (!gallivm->target) goto fail; +#else + if (0) { + /* + * Dump the data layout strings. 
+ */ - if (!create_pass_manager(gallivm)) - goto fail; + LLVMTargetDataRef target = LLVMGetExecutionEngineTargetData(gallivm->engine); + char *data_layout; + char *engine_data_layout; - gallivm->builder = LLVMCreateBuilderInContext(gallivm->context); - if (!gallivm->builder) - goto fail; + data_layout = LLVMCopyStringRepOfTargetData(gallivm->target); + engine_data_layout = LLVMCopyStringRepOfTargetData(target); + + if (1) { + debug_printf("module target data = %s\n", data_layout); + debug_printf("engine target data = %s\n", engine_data_layout); + } + + free(data_layout); + free(engine_data_layout); + } +#endif return TRUE; fail: - free_gallivm_state(gallivm); return FALSE; } -struct callback -{ - garbage_collect_callback_func func; - void *cb_data; - struct callback *prev, *next; -}; - - -/** list of all garbage collector callbacks */ -static struct callback callback_list = {NULL, NULL, NULL, NULL}; +/** + * Singleton + * + * We must never free LLVM contexts, because LLVM has several global caches + * which pointing/derived from objects owned by the context, causing false + * memory leaks and false cache hits when these objects are destroyed. + * + * TODO: For thread safety on multi-threaded OpenGL we should use one LLVM + * context per thread, and put them in a pool when threads are destroyed. + */ +static LLVMContextRef gallivm_context = NULL; /** - * Register a function with gallivm which will be called when we - * do garbage collection. + * Allocate gallivm LLVM objects. 
+ * \return TRUE for success, FALSE for failure */ -void -gallivm_register_garbage_collector_callback(garbage_collect_callback_func func, - void *cb_data) +static boolean +init_gallivm_state(struct gallivm_state *gallivm) { - struct callback *cb; - - if (!callback_list.prev) { - make_empty_list(&callback_list); - } + assert(!gallivm->context); + assert(!gallivm->module); + assert(!gallivm->provider); - /* see if already in list */ - foreach(cb, &callback_list) { - if (cb->func == func && cb->cb_data == cb_data) - return; - } + lp_build_init(); - /* add to list */ - cb = CALLOC_STRUCT(callback); - if (cb) { - cb->func = func; - cb->cb_data = cb_data; - insert_at_head(&callback_list, cb); + if (!gallivm_context) { + gallivm_context = LLVMContextCreate(); } -} + gallivm->context = gallivm_context; + if (!gallivm->context) + goto fail; + gallivm->module = LLVMModuleCreateWithNameInContext("gallivm", + gallivm->context); + if (!gallivm->module) + goto fail; -/** - * Remove a callback. - */ -void -gallivm_remove_garbage_collector_callback(garbage_collect_callback_func func, - void *cb_data) -{ - struct callback *cb; - - /* search list */ - foreach(cb, &callback_list) { - if (cb->func == func && cb->cb_data == cb_data) { - /* found, remove it */ - remove_from_list(cb); - FREE(cb); - return; - } - } -} + gallivm->provider = + LLVMCreateModuleProviderForExistingModule(gallivm->module); + if (!gallivm->provider) + goto fail; + gallivm->builder = LLVMCreateBuilderInContext(gallivm->context); + if (!gallivm->builder) + goto fail; -/** - * Call the callback functions (which are typically in the - * draw module and llvmpipe driver. - */ -static void -call_garbage_collector_callbacks(void) -{ - struct callback *cb; - foreach(cb, &callback_list) { - cb->func(cb->cb_data); + /* FIXME: MC-JIT only allows compiling one module at a time, and it must be + * complete when MC-JIT is created. So defer the MC-JIT engine creation for + * now. 
+ */ +#if !USE_MCJIT + if (!init_gallivm_engine(gallivm)) { + goto fail; } -} +#else + /* + * MC-JIT engine compiles the module immediately on creation, so we can't + * obtain the target data from it. Instead we create a target data layout + * from a string. + * + * The produced layout strings are not precisely the same, but should make + * no difference for the kind of optimization passes we run. + * + * For reference this is the layout string on x64: + * + * e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64 + * + * See also: + * - http://llvm.org/docs/LangRef.html#datalayout + */ + + { + const unsigned pointer_size = 8 * sizeof(void *); + char layout[512]; + util_snprintf(layout, sizeof layout, "%c-p:%u:%u:%u-i64:64:64-a0:0:%u-s0:%u:%u", +#ifdef PIPE_ARCH_LITTLE_ENDIAN + 'e', // little endian +#else + 'E', // big endian +#endif + pointer_size, pointer_size, pointer_size, // pointer size, abi alignment, preferred alignment + pointer_size, // aggregate preferred alignment + pointer_size, pointer_size); // stack objects abi alignment, preferred alignment + gallivm->target = LLVMCreateTargetData(layout); + if (!gallivm->target) { + return FALSE; + } + } +#endif + if (!create_pass_manager(gallivm)) + goto fail; -/** - * Other gallium components using gallivm should call this periodically - * to let us do garbage collection (or at least try to free memory - * accumulated by the LLVM libraries). 
- */ -void -gallivm_garbage_collect(struct gallivm_state *gallivm) -{ - if (gallivm->context) { - if (gallivm_debug & GALLIVM_DEBUG_GC) - debug_printf("***** Doing LLVM garbage collection\n"); + return TRUE; - call_garbage_collector_callbacks(); - free_gallivm_state(gallivm); - init_gallivm_state(gallivm); - } +fail: + free_gallivm_state(gallivm); + return FALSE; } @@ -398,12 +426,27 @@ lp_build_init(void) lp_set_target_options(); - LLVMInitializeNativeTarget(); - +#if USE_MCJIT + LLVMLinkInMCJIT(); +#else LLVMLinkInJIT(); +#endif util_cpu_detect(); + + if (HAVE_AVX && + util_cpu_caps.has_avx) { + lp_native_vector_width = 256; + } else { + /* Leave it at 128, even when no SIMD extensions are available. + * Really needs to be a multiple of 128 so can fit 4 floats. + */ + lp_native_vector_width = 128; + } + lp_native_vector_width = debug_get_num_option("LP_NATIVE_VECTOR_WIDTH", + lp_native_vector_width); + gallivm_initialized = TRUE; #if 0 @@ -423,16 +466,27 @@ lp_build_init(void) struct gallivm_state * gallivm_create(void) { - if (!GlobalGallivm) { - GlobalGallivm = CALLOC_STRUCT(gallivm_state); - if (GlobalGallivm) { - if (!init_gallivm_state(GlobalGallivm)) { - FREE(GlobalGallivm); - GlobalGallivm = NULL; - } + struct gallivm_state *gallivm; + +#if HAVE_LLVM <= 0x206 + if (GlobalGallivm) { + return GlobalGallivm; + } +#endif + + gallivm = CALLOC_STRUCT(gallivm_state); + if (gallivm) { + if (!init_gallivm_state(gallivm)) { + FREE(gallivm); + gallivm = NULL; } } - return GlobalGallivm; + +#if HAVE_LLVM <= 0x206 + GlobalGallivm = gallivm; +#endif + + return gallivm; } @@ -442,6 +496,132 @@ gallivm_create(void) void gallivm_destroy(struct gallivm_state *gallivm) { +#if HAVE_LLVM <= 0x0206 /* No-op: don't destroy the singleton */ (void) gallivm; +#else + free_gallivm_state(gallivm); + FREE(gallivm); +#endif +} + + +/** + * Validate and optimze a function. 
+ */ +static void +gallivm_optimize_function(struct gallivm_state *gallivm, + LLVMValueRef func) +{ + if (0) { + debug_printf("optimizing %s...\n", LLVMGetValueName(func)); + } + + assert(gallivm->passmgr); + + /* Apply optimizations to LLVM IR */ + LLVMRunFunctionPassManager(gallivm->passmgr, func); + + if (0) { + if (gallivm_debug & GALLIVM_DEBUG_IR) { + /* Print the LLVM IR to stderr */ + lp_debug_dump_value(func); + debug_printf("\n"); + } + } +} + + +/** + * Validate a function. + */ +void +gallivm_verify_function(struct gallivm_state *gallivm, + LLVMValueRef func) +{ + /* Verify the LLVM IR. If invalid, dump and abort */ +#ifdef DEBUG + if (LLVMVerifyFunction(func, LLVMPrintMessageAction)) { + lp_debug_dump_value(func); + assert(0); + return; + } +#endif + + gallivm_optimize_function(gallivm, func); + + if (gallivm_debug & GALLIVM_DEBUG_IR) { + /* Print the LLVM IR to stderr */ + lp_debug_dump_value(func); + debug_printf("\n"); + } +} + + +void +gallivm_compile_module(struct gallivm_state *gallivm) +{ +#if HAVE_LLVM > 0x206 + assert(!gallivm->compiled); +#endif + + /* Dump byte code to a file */ + if (0) { + LLVMWriteBitcodeToFile(gallivm->module, "llvmpipe.bc"); + debug_printf("llvmpipe.bc written\n"); + debug_printf("Invoke as \"llc -o - llvmpipe.bc\"\n"); + } + +#if USE_MCJIT + assert(!gallivm->engine); + if (!init_gallivm_engine(gallivm)) { + assert(0); + } +#endif + assert(gallivm->engine); + + ++gallivm->compiled; +} + + +func_pointer +gallivm_jit_function(struct gallivm_state *gallivm, + LLVMValueRef func) +{ + void *code; + func_pointer jit_func; + + assert(gallivm->compiled); + assert(gallivm->engine); + + code = LLVMGetPointerToGlobal(gallivm->engine, func); + assert(code); + jit_func = pointer_to_func(code); + + if (gallivm_debug & GALLIVM_DEBUG_ASM) { + lp_disassemble(code); + } + + /* Free the function body to save memory */ + lp_func_delete_body(func); + + return jit_func; +} + + +/** + * Free the function (and its machine code). 
+ */ +void +gallivm_free_function(struct gallivm_state *gallivm, + LLVMValueRef func, + const void *code) +{ +#if !USE_MCJIT + if (code) { + LLVMFreeMachineCodeForFunction(gallivm->engine, func); + } + + LLVMDeleteFunction(func); +#endif } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h index 5fc0f99..7edea61 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h @@ -31,6 +31,7 @@ #include "pipe/p_compiler.h" +#include "util/u_pointer.h" // for func_pointer #include "lp_bld.h" #include <llvm-c/ExecutionEngine.h> @@ -44,6 +45,7 @@ struct gallivm_state LLVMPassManagerRef passmgr; LLVMContextRef context; LLVMBuilderRef builder; + unsigned compiled; }; @@ -51,35 +53,28 @@ void lp_build_init(void); -extern void -lp_func_delete_body(LLVMValueRef func); - +struct gallivm_state * +gallivm_create(void); void -gallivm_garbage_collect(struct gallivm_state *gallivm); - +gallivm_destroy(struct gallivm_state *gallivm); -typedef void (*garbage_collect_callback_func)(void *cb_data); void -gallivm_register_garbage_collector_callback(garbage_collect_callback_func func, - void *cb_data); +gallivm_verify_function(struct gallivm_state *gallivm, + LLVMValueRef func); void -gallivm_remove_garbage_collector_callback(garbage_collect_callback_func func, - void *cb_data); +gallivm_compile_module(struct gallivm_state *gallivm); - -struct gallivm_state * -gallivm_create(void); +func_pointer +gallivm_jit_function(struct gallivm_state *gallivm, + LLVMValueRef func); void -gallivm_destroy(struct gallivm_state *gallivm); - - -extern LLVMValueRef -lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal, - const char *Name); +gallivm_free_function(struct gallivm_state *gallivm, + LLVMValueRef func, + const void * code); void lp_set_load_alignment(LLVMValueRef Inst, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.c b/src/gallium/auxiliary/gallivm/lp_bld_intr.c index 
2323f12..2bf1211 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_intr.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.c @@ -48,6 +48,8 @@ #include "lp_bld_const.h" #include "lp_bld_intr.h" +#include "lp_bld_type.h" +#include "lp_bld_pack.h" LLVMValueRef @@ -129,6 +131,95 @@ lp_build_intrinsic_binary(LLVMBuilderRef builder, } +/** + * Call intrinsic with arguments adapted to intrinsic vector length. + * + * Split vectors which are too large for the hw, or expand them if they + * are too small, so a caller calling a function which might use intrinsics + * doesn't need to do splitting/expansion on its own. + * This only supports intrinsics where src and dst types match. + */ +LLVMValueRef +lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm, + const char *name, + struct lp_type src_type, + unsigned intr_size, + LLVMValueRef a, + LLVMValueRef b) +{ + unsigned i; + struct lp_type intrin_type = src_type; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); + LLVMValueRef anative, bnative; + unsigned intrin_length = intr_size / src_type.width; + + intrin_type.length = intrin_length; + + if (intrin_length > src_type.length) { + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef constvec, tmp; + + for (i = 0; i < src_type.length; i++) { + elems[i] = lp_build_const_int32(gallivm, i); + } + for (; i < intrin_length; i++) { + elems[i] = i32undef; + } + if (src_type.length == 1) { + LLVMTypeRef elem_type = lp_build_elem_type(gallivm, intrin_type); + a = LLVMBuildBitCast(builder, a, LLVMVectorType(elem_type, 1), ""); + b = LLVMBuildBitCast(builder, b, LLVMVectorType(elem_type, 1), ""); + } + constvec = LLVMConstVector(elems, intrin_length); + anative = LLVMBuildShuffleVector(builder, a, a, constvec, ""); + bnative = LLVMBuildShuffleVector(builder, b, b, constvec, ""); + tmp = lp_build_intrinsic_binary(builder, name, + lp_build_vec_type(gallivm, intrin_type), + anative, bnative); + 
if (src_type.length > 1) { + constvec = LLVMConstVector(elems, src_type.length); + return LLVMBuildShuffleVector(builder, tmp, tmp, constvec, ""); + } + else { + return LLVMBuildExtractElement(builder, tmp, elems[0], ""); + } + } + else if (intrin_length < src_type.length) { + unsigned num_vec = src_type.length / intrin_length; + LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH]; + + /*Â don't support arbitrary size here as this is so yuck */ + if (src_type.length % intrin_length) { + /*Â FIXME: This is something which should be supported + * but there doesn't seem to be any need for it currently + * so crash and burn. + */ + debug_printf("%s: should handle arbitrary vector size\n", + __FUNCTION__); + assert(0); + return NULL; + } + + for (i = 0; i < num_vec; i++) { + anative = lp_build_extract_range(gallivm, a, i*intrin_length, + intrin_length); + bnative = lp_build_extract_range(gallivm, b, i*intrin_length, + intrin_length); + tmp[i] = lp_build_intrinsic_binary(builder, name, + lp_build_vec_type(gallivm, intrin_type), + anative, bnative); + } + return lp_build_concat(gallivm, tmp, intrin_type, num_vec); + } + else { + return lp_build_intrinsic_binary(builder, name, + lp_build_vec_type(gallivm, src_type), + a, b); + } +} + + LLVMValueRef lp_build_intrinsic_map(struct gallivm_state *gallivm, const char *name, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.h b/src/gallium/auxiliary/gallivm/lp_bld_intr.h index b73dd70..38c5c29 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_intr.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.h @@ -78,6 +78,15 @@ lp_build_intrinsic_binary(LLVMBuilderRef builder, LLVMValueRef +lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm, + const char *name, + struct lp_type src_type, + unsigned intr_size, + LLVMValueRef a, + LLVMValueRef b); + + +LLVMValueRef lp_build_intrinsic_map(struct gallivm_state *gallivm, const char *name, LLVMTypeRef ret_type, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c 
b/src/gallium/auxiliary/gallivm/lp_bld_logic.c index 6979614..7a4a5bb 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c @@ -52,8 +52,8 @@ * * select <4 x i1> %C, %A, %B * - * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is not - * supported on any backend. + * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is only + * supported on some backends (x86) starting with llvm 3.1. * * Expanding the boolean vector to full SIMD register width, as in * @@ -485,8 +485,10 @@ lp_build_select(struct lp_build_context *bld, } res = LLVMBuildSelect(builder, mask, a, b, ""); } - else if (util_cpu_caps.has_sse4_1 && - type.width * type.length == 128 && + else if (((util_cpu_caps.has_sse4_1 && + type.width * type.length == 128) || + (util_cpu_caps.has_avx && + type.width * type.length == 256 && type.width >= 32)) && !LLVMIsConstant(a) && !LLVMIsConstant(b) && !LLVMIsConstant(mask)) { @@ -494,8 +496,22 @@ lp_build_select(struct lp_build_context *bld, LLVMTypeRef arg_type; LLVMValueRef args[3]; - if (type.floating && - type.width == 64) { + /* + * There's only float blend in AVX but can just cast i32/i64 + * to float. + */ + if (type.width * type.length == 256) { + if (type.width == 64) { + intrinsic = "llvm.x86.avx.blendv.pd.256"; + arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 4); + } + else { + intrinsic = "llvm.x86.avx.blendv.ps.256"; + arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8); + } + } + else if (type.floating && + type.width == 64) { intrinsic = "llvm.x86.sse41.blendvpd"; arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 2); } else if (type.floating && @@ -591,3 +607,35 @@ lp_build_select_aos(struct lp_build_context *bld, return lp_build_select(bld, mask_vec, a, b); } } + + +/** + * Return (scalar-cast)val ? 
true : false; + */ +LLVMValueRef +lp_build_any_true_range(struct lp_build_context *bld, + unsigned real_length, + LLVMValueRef val) +{ + LLVMBuilderRef builder = bld->gallivm->builder; + LLVMTypeRef scalar_type; + LLVMTypeRef true_type; + + assert(real_length <= bld->type.length); + + true_type = LLVMIntTypeInContext(bld->gallivm->context, + bld->type.width * real_length); + scalar_type = LLVMIntTypeInContext(bld->gallivm->context, + bld->type.width * bld->type.length); + val = LLVMBuildBitCast(builder, val, scalar_type, ""); + /* + * We're using always native types so we can use intrinsics. + * However, if we don't do per-element calculations, we must ensure + * the excess elements aren't used since they may contain garbage. + */ + if (real_length < bld->type.length) { + val = LLVMBuildTrunc(builder, val, true_type, ""); + } + return LLVMBuildICmp(builder, LLVMIntNE, + val, LLVMConstNull(true_type), ""); +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.h b/src/gallium/auxiliary/gallivm/lp_bld_logic.h index ef33a65..64c0a1f 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_logic.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.h @@ -82,4 +82,9 @@ lp_build_select_aos(struct lp_build_context *bld, LLVMValueRef b); +LLVMValueRef +lp_build_any_true_range(struct lp_build_context *bld, + unsigned real_length, + LLVMValueRef val); + #endif /* !LP_BLD_LOGIC_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp index 6c4586c..dd2c612 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp +++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp @@ -26,6 +26,12 @@ **************************************************************************/ +/** + * The purpose of this module is to expose LLVM functionality not available + * through the C++ bindings. 
+ */ + + #ifndef __STDC_LIMIT_MACROS #define __STDC_LIMIT_MACROS #endif @@ -41,11 +47,24 @@ #include <llvm/Target/TargetOptions.h> #include <llvm/ExecutionEngine/ExecutionEngine.h> #include <llvm/ExecutionEngine/JITEventListener.h> +#if HAVE_LLVM >= 0x0301 +#include <llvm/ADT/Triple.h> +#include <llvm/ExecutionEngine/JITMemoryManager.h> +#endif #include <llvm/Support/CommandLine.h> #include <llvm/Support/PrettyStackTrace.h> +#if HAVE_LLVM >= 0x0300 +#include <llvm/Support/TargetSelect.h> +#else /* HAVE_LLVM < 0x0300 */ +#include <llvm/Target/TargetSelect.h> +#endif /* HAVE_LLVM < 0x0300 */ + #include "pipe/p_config.h" #include "util/u_debug.h" +#include "util/u_cpu_detect.h" + +#include "lp_bld_misc.h" /** @@ -99,6 +118,9 @@ lp_set_target_options(void) #if defined(DEBUG) || defined(PROFILE) llvm::NoFramePointerElim = true; +#if HAVE_LLVM >= 0x0208 + llvm::NoFramePointerElimNonLeaf = true; +#endif #endif llvm::NoExcessFPPrecision = false; @@ -146,6 +168,30 @@ lp_set_target_options(void) * shared object where the gallium driver resides. */ llvm::DisablePrettyStackTrace = true; + + // If we have a native target, initialize it to ensure it is linked in and + // usable by the JIT. 
+ llvm::InitializeNativeTarget(); + +#if HAVE_LLVM >= 0x0208 + llvm::InitializeNativeTargetAsmPrinter(); +#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + LLVMInitializeX86AsmPrinter(); +#elif defined(PIPE_ARCH_ARM) + LLVMInitializeARMAsmPrinter(); +#elif defined(PIPE_ARCH_PPC) + LLVMInitializePowerPCAsmPrinter(); +#endif + +#if HAVE_LLVM >= 0x0207 +# if HAVE_LLVM >= 0x0301 + llvm::InitializeNativeTargetDisassembler(); +# elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + LLVMInitializeX86Disassembler(); +# elif defined(PIPE_ARCH_ARM) + LLVMInitializeARMDisassembler(); +# endif +#endif } @@ -165,6 +211,7 @@ lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal, return llvm::wrap(llvm::unwrap(B)->CreateLoad(llvm::unwrap(PointerVal), true, Name)); } + extern "C" void lp_set_load_alignment(LLVMValueRef Inst, @@ -180,3 +227,67 @@ lp_set_store_alignment(LLVMValueRef Inst, { llvm::unwrap<llvm::StoreInst>(Inst)->setAlignment(Align); } + + +#if HAVE_LLVM >= 0x301 + +/** + * Same as LLVMCreateJITCompilerForModule, but using MCJIT and enabling AVX + * feature where available. + * + * See also: + * - llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp + * - llvm/tools/lli/lli.cpp + * - http://markmail.org/message/ttkuhvgj4cxxy2on#query:+page:1+mid:aju2dggerju3ivd3+state:results + */ +extern "C" +LLVMBool +lp_build_create_mcjit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, + LLVMModuleRef M, + unsigned OptLevel, + char **OutError) +{ + using namespace llvm; + + std::string Error; + EngineBuilder builder(unwrap(M)); + builder.setEngineKind(EngineKind::JIT) + .setErrorStr(&Error) + .setOptLevel((CodeGenOpt::Level)OptLevel); + + builder.setUseMCJIT(true); + + llvm::SmallVector<std::string, 1> MAttrs; + if (util_cpu_caps.has_avx) { + /* + * AVX feature is not automatically detected from CPUID by the X86 target + * yet, because the old (yet default) JIT engine is not capable of + * emitting the opcodes. 
But as we're using MCJIT here, it is safe to + * add set this attribute. + */ + MAttrs.push_back("+avx"); + builder.setMAttrs(MAttrs); + } + builder.setJITMemoryManager(JITMemoryManager::CreateDefaultMemManager()); + + ExecutionEngine *JIT; +#if 0 + JIT = builder.create(); +#else + /* + * Workaround http://llvm.org/bugs/show_bug.cgi?id=12833 + */ + StringRef MArch = ""; + StringRef MCPU = ""; + Triple TT(unwrap(M)->getTargetTriple()); + JIT = builder.create(builder.selectTarget(TT, MArch, MCPU, MAttrs)); +#endif + if (JIT) { + *OutJIT = wrap(JIT); + return 0; + } + *OutError = strdup(Error.c_str()); + return 1; +} + +#endif /* HAVE_LLVM >= 0x301 */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.h b/src/gallium/auxiliary/gallivm/lp_bld_misc.h new file mode 100644 index 0000000..4f80b38 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.h @@ -0,0 +1,70 @@ +/************************************************************************** + * + * Copyright 2012 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef LP_BLD_MISC_H +#define LP_BLD_MISC_H + + +#include "lp_bld.h" +#include <llvm-c/ExecutionEngine.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + + +extern void +lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE); + +extern void +lp_set_target_options(void); + + +extern void +lp_func_delete_body(LLVMValueRef func); + + +extern LLVMValueRef +lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal, + const char *Name); + +extern int +lp_build_create_mcjit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, + LLVMModuleRef M, + unsigned OptLevel, + char **OutError); + + +#ifdef __cplusplus +} +#endif + + +#endif /* !LP_BLD_MISC_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c index fde6bb5..b18f784 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c @@ -69,6 +69,7 @@ #include "util/u_debug.h" #include "util/u_math.h" #include "util/u_cpu_detect.h" +#include "util/u_memory.h" #include "lp_bld_type.h" #include "lp_bld_const.h" @@ -76,6 +77,7 @@ #include "lp_bld_intr.h" #include "lp_bld_arit.h" #include "lp_bld_pack.h" +#include "lp_bld_swizzle.h" /** @@ -101,6 +103,30 @@ lp_build_const_unpack_shuffle(struct gallivm_state *gallivm, return LLVMConstVector(elems, n); } +/** + * Similar to lp_build_const_unpack_shuffle but for special AVX 256bit unpack. + * See comment above lp_build_interleave2_half for more details. 
+ */ +static LLVMValueRef +lp_build_const_unpack_shuffle_half(struct gallivm_state *gallivm, + unsigned n, unsigned lo_hi) +{ + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH]; + unsigned i, j; + + assert(n <= LP_MAX_VECTOR_LENGTH); + assert(lo_hi < 2); + + for (i = 0, j = lo_hi*(n/4); i < n; i += 2, ++j) { + if (i == (n / 2)) + j += n / 4; + + elems[i + 0] = lp_build_const_int32(gallivm, 0 + j); + elems[i + 1] = lp_build_const_int32(gallivm, n + j); + } + + return LLVMConstVector(elems, n); +} /** * Build shuffle vectors that match PACKxx instructions. @@ -119,6 +145,71 @@ lp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n) return LLVMConstVector(elems, n); } +/** + * Return a vector with elements src[start:start+size] + * Most useful for getting half the values out of a 256bit sized vector, + * otherwise may cause data rearrangement to happen. + */ +LLVMValueRef +lp_build_extract_range(struct gallivm_state *gallivm, + LLVMValueRef src, + unsigned start, + unsigned size) +{ + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH]; + unsigned i; + + assert(size <= Elements(elems)); + + for (i = 0; i < size; ++i) + elems[i] = lp_build_const_int32(gallivm, i + start); + + if (size == 1) { + return LLVMBuildExtractElement(gallivm->builder, src, elems[0], ""); + } + else { + return LLVMBuildShuffleVector(gallivm->builder, src, src, + LLVMConstVector(elems, size), ""); + } +} + +/** + * Concatenates several (must be a power of 2) vectors (of same type) + * into a larger one. + * Most useful for building up a 256bit sized vector out of two 128bit ones. 
+ */ +LLVMValueRef +lp_build_concat(struct gallivm_state *gallivm, + LLVMValueRef src[], + struct lp_type src_type, + unsigned num_vectors) +{ + unsigned new_length, i; + LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH/2]; + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; + + assert(src_type.length * num_vectors <= Elements(shuffles)); + assert(util_is_power_of_two(num_vectors)); + + new_length = src_type.length; + + for (i = 0; i < num_vectors; i++) + tmp[i] = src[i]; + + while (num_vectors > 1) { + num_vectors >>= 1; + new_length <<= 1; + for (i = 0; i < new_length; i++) { + shuffles[i] = lp_build_const_int32(gallivm, i); + } + for (i = 0; i < num_vectors; i++) { + tmp[i] = LLVMBuildShuffleVector(gallivm->builder, tmp[i*2], tmp[i*2 + 1], + LLVMConstVector(shuffles, new_length), ""); + } + } + + return tmp[0]; +} /** * Interleave vector elements. @@ -139,6 +230,40 @@ lp_build_interleave2(struct gallivm_state *gallivm, return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, ""); } +/** + * Interleave vector elements but with 256 bit, + * treats it as interleave with 2 concatenated 128 bit vectors. + * + * This differs to lp_build_interleave2 as that function would do the following (for lo): + * a0 b0 a1 b1 a2 b2 a3 b3, and this does not compile into an AVX unpack instruction. 
+ * + * + * An example interleave 8x float with 8x float on AVX 256bit unpack: + * a0 a1 a2 a3 a4 a5 a6 a7 <-> b0 b1 b2 b3 b4 b5 b6 b7 + * + * Equivalent to interleaving 2x 128 bit vectors + * a0 a1 a2 a3 <-> b0 b1 b2 b3 concatenated with a4 a5 a6 a7 <-> b4 b5 b6 b7 + * + * So interleave-lo would result in: + * a0 b0 a1 b1 a4 b4 a5 b5 + * + * And interleave-hi would result in: + * a2 b2 a3 b3 a6 b6 a7 b7 + */ +LLVMValueRef +lp_build_interleave2_half(struct gallivm_state *gallivm, + struct lp_type type, + LLVMValueRef a, + LLVMValueRef b, + unsigned lo_hi) +{ + if (type.length * type.width == 256) { + LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi); + return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, ""); + } else { + return lp_build_interleave2(gallivm, type, a, b, lo_hi); + } +} /** * Double the bit width. @@ -237,9 +362,9 @@ lp_build_unpack(struct gallivm_state *gallivm, * Non-interleaved pack. * * This will move values as - * - * lo = __ l0 __ l1 __ l2 __.. __ ln - * hi = __ h0 __ h1 __ h2 __.. __ hn + * (LSB) (MSB) + * lo = l0 __ l1 __ l2 __.. __ ln __ + * hi = h0 __ h1 __ h2 __.. __ hn __ * res = l0 l1 l2 .. ln h0 h1 h2 .. 
hn * * This will only change the number of bits the values are represented, not the @@ -257,12 +382,14 @@ lp_build_pack2(struct gallivm_state *gallivm, LLVMValueRef hi) { LLVMBuilderRef builder = gallivm->builder; -#if HAVE_LLVM < 0x0207 - LLVMTypeRef src_vec_type = lp_build_vec_type(gallivm, src_type); -#endif LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type); LLVMValueRef shuffle; LLVMValueRef res = NULL; + struct lp_type intr_type = dst_type; + +#if HAVE_LLVM < 0x0207 + intr_type = src_type; +#endif assert(!src_type.floating); assert(!dst_type.floating); @@ -270,50 +397,81 @@ lp_build_pack2(struct gallivm_state *gallivm, assert(src_type.length * 2 == dst_type.length); /* Check for special cases first */ - if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) { + if(util_cpu_caps.has_sse2 && src_type.width * src_type.length >= 128) { + const char *intrinsic = NULL; + switch(src_type.width) { case 32: if(dst_type.sign) { -#if HAVE_LLVM >= 0x0207 - res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", dst_vec_type, lo, hi); -#else - res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi); -#endif + intrinsic = "llvm.x86.sse2.packssdw.128"; } else { if (util_cpu_caps.has_sse4_1) { - return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi); - } - else { - /* use generic shuffle below */ - res = NULL; + intrinsic = "llvm.x86.sse41.packusdw"; +#if HAVE_LLVM < 0x0207 + /* llvm < 2.7 has inconsistent signatures except for packusdw */ + intr_type = dst_type; +#endif } } break; - case 16: - if(dst_type.sign) -#if HAVE_LLVM >= 0x0207 - res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", dst_vec_type, lo, hi); -#else - res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi); -#endif - else -#if HAVE_LLVM >= 0x0207 - res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", dst_vec_type, lo, 
hi); -#else - res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi); -#endif - break; - - default: - assert(0); - return LLVMGetUndef(dst_vec_type); + if (dst_type.sign) { + intrinsic = "llvm.x86.sse2.packsswb.128"; + } + else { + intrinsic = "llvm.x86.sse2.packuswb.128"; + } break; + /* default uses generic shuffle below */ } - - if (res) { - res = LLVMBuildBitCast(builder, res, dst_vec_type, ""); + if (intrinsic) { + if (src_type.width * src_type.length == 128) { + LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type); + res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi); + if (dst_vec_type != intr_vec_type) { + res = LLVMBuildBitCast(builder, res, dst_vec_type, ""); + } + } + else { + int num_split = src_type.width * src_type.length / 128; + int i; + int nlen = 128 / src_type.width; + struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128); + struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128); + LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128]; + LLVMValueRef tmplo, tmphi; + LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type); + LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type); + + assert(num_split <= LP_MAX_VECTOR_WIDTH / 128); + + for (i = 0; i < num_split / 2; i++) { + tmplo = lp_build_extract_range(gallivm, + lo, i*nlen*2, nlen); + tmphi = lp_build_extract_range(gallivm, + lo, i*nlen*2 + nlen, nlen); + tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic, + nintr_vec_type, tmplo, tmphi); + if (ndst_vec_type != nintr_vec_type) { + tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, ""); + } + } + for (i = 0; i < num_split / 2; i++) { + tmplo = lp_build_extract_range(gallivm, + hi, i*nlen*2, nlen); + tmphi = lp_build_extract_range(gallivm, + hi, i*nlen*2 + nlen, nlen); + tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic, + nintr_vec_type, + tmplo, tmphi); + if (ndst_vec_type != nintr_vec_type) { + 
tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2], + ndst_vec_type, ""); + } + } + res = lp_build_concat(gallivm, tmpres, ndst_type, num_split); + } return res; } } @@ -357,8 +515,9 @@ lp_build_packs2(struct gallivm_state *gallivm, /* All X86 SSE non-interleaved pack instructions take signed inputs and * saturate them, so no need to clamp for those cases. */ if(util_cpu_caps.has_sse2 && - src_type.width * src_type.length == 128 && - src_type.sign) + src_type.width * src_type.length >= 128 && + src_type.sign && + (src_type.width == 32 || src_type.width == 16)) clamp = FALSE; if(clamp) { @@ -395,7 +554,6 @@ lp_build_pack(struct gallivm_state *gallivm, LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH]; unsigned i; - /* Register width must remain constant */ assert(src_type.width * src_type.length == dst_type.width * dst_type.length); @@ -487,21 +645,44 @@ lp_build_resize(struct gallivm_state *gallivm, /* * Register width remains constant -- use vector packing intrinsics */ - tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs); } else { - /* - * Do it element-wise. - */ - - assert(src_type.length == dst_type.length); - tmp[0] = lp_build_undef(gallivm, dst_type); - for (i = 0; i < dst_type.length; ++i) { - LLVMValueRef index = lp_build_const_int32(gallivm, i); - LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, ""); - val = LLVMBuildTrunc(builder, val, lp_build_elem_type(gallivm, dst_type), ""); - tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, ""); + if (src_type.width / dst_type.width > num_srcs) { + /* + * First change src vectors size (with shuffle) so they have the + * same size as the destination vector, then pack normally. + * Note: cannot use cast/extract because llvm generates atrocious code. 
+ */ + unsigned size_ratio = (src_type.width * src_type.length) / + (dst_type.length * dst_type.width); + unsigned new_length = src_type.length / size_ratio; + + for (i = 0; i < size_ratio * num_srcs; i++) { + unsigned start_index = (i % size_ratio) * new_length; + tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio], + start_index, new_length); + } + num_srcs *= size_ratio; + src_type.length = new_length; + tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs); + } + else { + /* + * Truncate bit width but expand vector size - first pack + * then expand simply because this should be more AVX-friendly + * for the cases we probably hit. + */ + unsigned size_ratio = (dst_type.width * dst_type.length) / + (src_type.length * src_type.width); + unsigned num_pack_srcs = num_srcs / size_ratio; + dst_type.length = dst_type.length / size_ratio; + + for (i = 0; i < size_ratio; i++) { + tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE, + &src[i*num_pack_srcs], num_pack_srcs); + } + tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio); } } } @@ -522,19 +703,24 @@ lp_build_resize(struct gallivm_state *gallivm, /* * Do it element-wise. 
*/ + assert(src_type.length * num_srcs == dst_type.length * num_dsts); + + for (i = 0; i < num_dsts; i++) { + tmp[i] = lp_build_undef(gallivm, dst_type); + } - assert(src_type.length == dst_type.length); - tmp[0] = lp_build_undef(gallivm, dst_type); - for (i = 0; i < dst_type.length; ++i) { - LLVMValueRef index = lp_build_const_int32(gallivm, i); - LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, ""); + for (i = 0; i < src_type.length; ++i) { + unsigned j = i / dst_type.length; + LLVMValueRef srcindex = lp_build_const_int32(gallivm, i); + LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length); + LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, ""); if (src_type.sign && dst_type.sign) { val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), ""); } else { val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), ""); } - tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, ""); + tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, ""); } } } @@ -554,3 +740,38 @@ lp_build_resize(struct gallivm_state *gallivm, } +/** + * Expands src vector from src.length to dst_length + */ +LLVMValueRef +lp_build_pad_vector(struct gallivm_state *gallivm, + LLVMValueRef src, + struct lp_type src_type, + unsigned dst_length) +{ + LLVMValueRef undef = LLVMGetUndef(lp_build_vec_type(gallivm, src_type)); + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH]; + unsigned i; + + assert(dst_length <= Elements(elems)); + assert(dst_length > src_type.length); + + if (src_type.length == dst_length) + return src; + + /* If its a single scalar type, no need to reinvent the wheel */ + if (src_type.length == 1) { + return lp_build_broadcast(gallivm, LLVMVectorType(lp_build_elem_type(gallivm, src_type), dst_length), src); + } + + /* All elements from src vector */ + for (i = 0; i < src_type.length; ++i) + elems[i] = lp_build_const_int32(gallivm, i); + + /* Undef fill remaining space */ + for (i = 
src_type.length; i < dst_length; ++i) + elems[i] = lp_build_const_int32(gallivm, src_type.length); + + /* Combine the two vectors */ + return LLVMBuildShuffleVector(gallivm->builder, src, undef, LLVMConstVector(elems, dst_length), ""); +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h index d58da4f..73f299c 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h @@ -44,6 +44,12 @@ struct lp_type; +LLVMValueRef +lp_build_interleave2_half(struct gallivm_state *gallivm, + struct lp_type type, + LLVMValueRef a, + LLVMValueRef b, + unsigned lo_hi); LLVMValueRef lp_build_interleave2(struct gallivm_state *gallivm, @@ -69,6 +75,17 @@ lp_build_unpack(struct gallivm_state *gallivm, LLVMValueRef src, LLVMValueRef *dst, unsigned num_dsts); +LLVMValueRef +lp_build_extract_range(struct gallivm_state *gallivm, + LLVMValueRef src, + unsigned start, + unsigned size); + +LLVMValueRef +lp_build_concat(struct gallivm_state *gallivm, + LLVMValueRef src[], + struct lp_type src_type, + unsigned num_vectors); LLVMValueRef lp_build_packs2(struct gallivm_state *gallivm, @@ -102,4 +119,10 @@ lp_build_resize(struct gallivm_state *gallivm, LLVMValueRef *dst, unsigned num_dsts); +LLVMValueRef +lp_build_pad_vector(struct gallivm_state *gallivm, + LLVMValueRef src, + struct lp_type src_type, + unsigned dst_length); + #endif /* !LP_BLD_PACK_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c index b0a5bc0..b1ba7c7 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c @@ -26,6 +26,7 @@ **************************************************************************/ +#include "u_cpu_detect.h" #include "lp_bld_type.h" #include "lp_bld_arit.h" #include "lp_bld_const.h" @@ -77,34 +78,82 @@ lp_build_ddy(struct lp_build_context *bld, return lp_build_sub(bld, a_bottom, a_top); } - +/* + * To be 
able to handle multiple quads at once in texture sampling and + * do lod calculations per quad, it is necessary to get the per-quad + * derivatives into the lp_build_rho function. + * For 8-wide vectors the packed derivative values for 3 coords would + * look like this, this scales to a arbitrary (multiple of 4) vector size: + * ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy + * dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ + * The second vector will be unused for 1d and 2d textures. + */ LLVMValueRef -lp_build_scalar_ddx(struct lp_build_context *bld, - LLVMValueRef a) +lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld, + LLVMValueRef a) { - LLVMBuilderRef builder = bld->gallivm->builder; - LLVMValueRef idx_left = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_LEFT); - LLVMValueRef idx_right = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_RIGHT); - LLVMValueRef a_left = LLVMBuildExtractElement(builder, a, idx_left, "left"); - LLVMValueRef a_right = LLVMBuildExtractElement(builder, a, idx_right, "right"); + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef vec1, vec2; + + /* same packing as _twocoord, but can use aos swizzle helper */ + + /* + * XXX could make swizzle1 a noop swizzle by using right top/bottom + * pair for ddy + */ + static const unsigned char swizzle1[] = { + LP_BLD_QUAD_TOP_LEFT, LP_BLD_QUAD_TOP_LEFT, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + static const unsigned char swizzle2[] = { + LP_BLD_QUAD_TOP_RIGHT, LP_BLD_QUAD_BOTTOM_LEFT, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + + vec1 = lp_build_swizzle_aos(bld, a, swizzle1); + vec2 = lp_build_swizzle_aos(bld, a, swizzle2); + if (bld->type.floating) - return LLVMBuildFSub(builder, a_right, a_left, "ddx"); + return LLVMBuildFSub(builder, vec2, vec1, "ddxddy"); else - return LLVMBuildSub(builder, a_right, a_left, "ddx"); + return LLVMBuildSub(builder, vec2, vec1, "ddxddy"); } LLVMValueRef 
-lp_build_scalar_ddy(struct lp_build_context *bld, - LLVMValueRef a) +lp_build_packed_ddx_ddy_twocoord(struct lp_build_context *bld, + LLVMValueRef a, LLVMValueRef b) { - LLVMBuilderRef builder = bld->gallivm->builder; - LLVMValueRef idx_top = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_LEFT); - LLVMValueRef idx_bottom = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_BOTTOM_LEFT); - LLVMValueRef a_top = LLVMBuildExtractElement(builder, a, idx_top, "top"); - LLVMValueRef a_bottom = LLVMBuildExtractElement(builder, a, idx_bottom, "bottom"); + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH/4]; + LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH/4]; + LLVMValueRef vec1, vec2; + unsigned length, num_quads, i; + + /* XXX: do hsub version */ + length = bld->type.length; + num_quads = length / 4; + for (i = 0; i < num_quads; i++) { + unsigned s1 = 4 * i; + unsigned s2 = 4 * i + length; + shuffles1[4*i + 0] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s1); + shuffles1[4*i + 1] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s1); + shuffles1[4*i + 2] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s2); + shuffles1[4*i + 3] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s2); + shuffles2[4*i + 0] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_RIGHT + s1); + shuffles2[4*i + 1] = lp_build_const_int32(gallivm, LP_BLD_QUAD_BOTTOM_LEFT + s1); + shuffles2[4*i + 2] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_RIGHT + s2); + shuffles2[4*i + 3] = lp_build_const_int32(gallivm, LP_BLD_QUAD_BOTTOM_LEFT + s2); + } + vec1 = LLVMBuildShuffleVector(builder, a, b, + LLVMConstVector(shuffles1, length), ""); + vec2 = LLVMBuildShuffleVector(builder, a, b, + LLVMConstVector(shuffles2, length), ""); if (bld->type.floating) - return LLVMBuildFSub(builder, a_bottom, a_top, "ddy"); + return LLVMBuildFSub(builder, vec2, vec1, "ddxddyddxddy"); else - return LLVMBuildSub(builder, 
a_bottom, a_top, "ddy"); + return LLVMBuildSub(builder, vec2, vec1, "ddxddyddxddy"); } + diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.h b/src/gallium/auxiliary/gallivm/lp_bld_quad.h index b799291..be6a1ef 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_quad.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.h @@ -78,19 +78,15 @@ lp_build_ddy(struct lp_build_context *bld, /* - * Scalar derivatives. - * - * Same as getting the first value of above. + * Packed derivatives (one derivative for each direction per quad) */ - LLVMValueRef -lp_build_scalar_ddx(struct lp_build_context *bld, - LLVMValueRef a); - +lp_build_packed_ddx_ddy_twocoord(struct lp_build_context *bld, + LLVMValueRef a, LLVMValueRef b); LLVMValueRef -lp_build_scalar_ddy(struct lp_build_context *bld, - LLVMValueRef a); +lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld, + LLVMValueRef a); #endif /* LP_BLD_QUAD_H_ */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c index d966788..8521116 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c @@ -44,6 +44,8 @@ #include "lp_bld_sample.h" #include "lp_bld_swizzle.h" #include "lp_bld_type.h" +#include "lp_bld_logic.h" +#include "lp_bld_pack.h" /* @@ -175,67 +177,89 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, /** * Generate code to compute coordinate gradient (rho). - * \param ddx partial derivatives of (s, t, r, q) with respect to X - * \param ddy partial derivatives of (s, t, r, q) with respect to Y + * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y * - * XXX: The resulting rho is scalar, so we ignore all but the first element of - * derivatives that are passed by the shader. + * The resulting rho is scalar per quad. 
*/ static LLVMValueRef lp_build_rho(struct lp_build_sample_context *bld, unsigned unit, - const LLVMValueRef ddx[4], - const LLVMValueRef ddy[4]) + const struct lp_derivatives *derivs) { + struct gallivm_state *gallivm = bld->gallivm; struct lp_build_context *int_size_bld = &bld->int_size_bld; struct lp_build_context *float_size_bld = &bld->float_size_bld; struct lp_build_context *float_bld = &bld->float_bld; + struct lp_build_context *coord_bld = &bld->coord_bld; + struct lp_build_context *perquadf_bld = &bld->perquadf_bld; + const LLVMValueRef *ddx_ddy = derivs->ddx_ddy; const unsigned dims = bld->dims; LLVMBuilderRef builder = bld->gallivm->builder; LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context); LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0); LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0); - LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy; - LLVMValueRef rho_x, rho_y; LLVMValueRef rho_vec; LLVMValueRef int_size, float_size; LLVMValueRef rho; LLVMValueRef first_level, first_level_vec; + LLVMValueRef abs_ddx_ddy[2]; + unsigned length = coord_bld->type.length; + unsigned num_quads = length / 4; + unsigned i; + LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); + LLVMValueRef rho_xvec, rho_yvec; + + abs_ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]); + if (dims > 2) { + abs_ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]); + } - dsdx = ddx[0]; - dsdy = ddy[0]; - - if (dims <= 1) { - rho_x = dsdx; - rho_y = dsdy; + if (dims == 1) { + static const unsigned char swizzle1[] = { + 0, LP_BLD_SWIZZLE_DONTCARE, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + static const unsigned char swizzle2[] = { + 1, LP_BLD_SWIZZLE_DONTCARE, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + rho_xvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle1); + rho_yvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle2); + } + else if (dims == 2) { + static const 
unsigned char swizzle1[] = { + 0, 2, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + static const unsigned char swizzle2[] = { + 1, 3, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + rho_xvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle1); + rho_yvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle2); } else { - rho_x = float_size_bld->undef; - rho_y = float_size_bld->undef; - - rho_x = LLVMBuildInsertElement(builder, rho_x, dsdx, index0, ""); - rho_y = LLVMBuildInsertElement(builder, rho_y, dsdy, index0, ""); - - dtdx = ddx[1]; - dtdy = ddy[1]; - - rho_x = LLVMBuildInsertElement(builder, rho_x, dtdx, index1, ""); - rho_y = LLVMBuildInsertElement(builder, rho_y, dtdy, index1, ""); - - if (dims >= 3) { - drdx = ddx[2]; - drdy = ddy[2]; - - rho_x = LLVMBuildInsertElement(builder, rho_x, drdx, index2, ""); - rho_y = LLVMBuildInsertElement(builder, rho_y, drdy, index2, ""); + LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH]; + assert(dims == 3); + for (i = 0; i < num_quads; i++) { + shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i); + shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2); + shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i); + shuffles1[4*i + 3] = i32undef; + shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1); + shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3); + shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 1); + shuffles2[4*i + 3] = i32undef; } + rho_xvec = LLVMBuildShuffleVector(builder, abs_ddx_ddy[0], abs_ddx_ddy[1], + LLVMConstVector(shuffles1, length), ""); + rho_yvec = LLVMBuildShuffleVector(builder, abs_ddx_ddy[0], abs_ddx_ddy[1], + LLVMConstVector(shuffles2, length), ""); } - rho_x = lp_build_abs(float_size_bld, rho_x); - rho_y = lp_build_abs(float_size_bld, rho_y); - - rho_vec = lp_build_max(float_size_bld, rho_x, rho_y); + rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec); first_level = 
bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm, unit); @@ -243,22 +267,77 @@ lp_build_rho(struct lp_build_sample_context *bld, int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec); float_size = lp_build_int_to_float(float_size_bld, int_size); - rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size); + if (bld->coord_type.length > 4) { + /* expand size to each quad */ + if (dims > 1) { + /* could use some broadcast_vector helper for this? */ + int num_quads = bld->coord_type.length / 4; + LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4]; + for (i = 0; i < num_quads; i++) { + src[i] = float_size; + } + float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads); + } + else { + float_size = lp_build_broadcast_scalar(coord_bld, float_size); + } + rho_vec = lp_build_mul(coord_bld, rho_vec, float_size); - if (dims <= 1) { - rho = rho_vec; + if (dims <= 1) { + rho = rho_vec; + } + else { + if (dims >= 2) { + static const unsigned char swizzle1[] = { + 0, LP_BLD_SWIZZLE_DONTCARE, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + static const unsigned char swizzle2[] = { + 1, LP_BLD_SWIZZLE_DONTCARE, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + LLVMValueRef rho_s, rho_t, rho_r; + + rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1); + rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2); + + rho = lp_build_max(coord_bld, rho_s, rho_t); + + if (dims >= 3) { + static const unsigned char swizzle3[] = { + 2, LP_BLD_SWIZZLE_DONTCARE, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle3); + rho = lp_build_max(coord_bld, rho, rho_r); + } + } + } + rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type, + perquadf_bld->type, rho); } else { - if (dims >= 2) { - LLVMValueRef rho_s, rho_t, rho_r; + if (dims <= 1) { + rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, ""); + } + rho_vec = lp_build_mul(float_size_bld, 
rho_vec, float_size); - rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, ""); - rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, ""); + if (dims <= 1) { + rho = rho_vec; + } + else { + if (dims >= 2) { + LLVMValueRef rho_s, rho_t, rho_r; + + rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, ""); + rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, ""); - rho = lp_build_max(float_bld, rho_s, rho_t); - if (dims >= 3) { - rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, ""); - rho = lp_build_max(float_bld, rho, rho_r); + rho = lp_build_max(float_bld, rho_s, rho_t); + + if (dims >= 3) { + rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, ""); + rho = lp_build_max(float_bld, rho, rho_r); + } } } } @@ -396,22 +475,20 @@ lp_build_brilinear_rho(struct lp_build_context *bld, /** * Generate code to compute texture level of detail (lambda). - * \param ddx partial derivatives of (s, t, r, q) with respect to X - * \param ddy partial derivatives of (s, t, r, q) with respect to Y + * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y * \param lod_bias optional float vector with the shader lod bias * \param explicit_lod optional float vector with the explicit lod * \param width scalar int texture width * \param height scalar int texture height * \param depth scalar int texture depth * - * XXX: The resulting lod is scalar, so ignore all but the first element of - * derivatives, lod_bias, etc that are passed by the shader. + * The resulting lod is scalar per quad, so only the first value per quad + * passed in from lod_bias, explicit_lod is used. 
*/ void lp_build_lod_selector(struct lp_build_sample_context *bld, unsigned unit, - const LLVMValueRef ddx[4], - const LLVMValueRef ddy[4], + const struct lp_derivatives *derivs, LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ unsigned mip_filter, @@ -420,11 +497,11 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, { LLVMBuilderRef builder = bld->gallivm->builder; - struct lp_build_context *float_bld = &bld->float_bld; + struct lp_build_context *perquadf_bld = &bld->perquadf_bld; LLVMValueRef lod; - *out_lod_ipart = bld->int_bld.zero; - *out_lod_fpart = bld->float_bld.zero; + *out_lod_ipart = bld->perquadi_bld.zero; + *out_lod_fpart = perquadf_bld->zero; if (bld->static_state->min_max_lod_equal) { /* User is forcing sampling from a particular mipmap level. @@ -433,21 +510,17 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, LLVMValueRef min_lod = bld->dynamic_state->min_lod(bld->dynamic_state, bld->gallivm, unit); - lod = min_lod; + lod = lp_build_broadcast_scalar(perquadf_bld, min_lod); } else { - LLVMValueRef sampler_lod_bias = - bld->dynamic_state->lod_bias(bld->dynamic_state, bld->gallivm, unit); - LLVMValueRef index0 = lp_build_const_int32(bld->gallivm, 0); - if (explicit_lod) { - lod = LLVMBuildExtractElement(builder, explicit_lod, - index0, ""); + lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type, + perquadf_bld->type, explicit_lod); } else { LLVMValueRef rho; - rho = lp_build_rho(bld, unit, ddx, ddy); + rho = lp_build_rho(bld, unit, derivs); /* * Compute lod = log2(rho) @@ -465,66 +538,72 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, if (mip_filter == PIPE_TEX_MIPFILTER_NONE || mip_filter == PIPE_TEX_MIPFILTER_NEAREST) { - *out_lod_ipart = lp_build_ilog2(float_bld, rho); - *out_lod_fpart = bld->float_bld.zero; + *out_lod_ipart = lp_build_ilog2(perquadf_bld, rho); + *out_lod_fpart = perquadf_bld->zero; return; } if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR && 
!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) { - lp_build_brilinear_rho(float_bld, rho, BRILINEAR_FACTOR, + lp_build_brilinear_rho(perquadf_bld, rho, BRILINEAR_FACTOR, out_lod_ipart, out_lod_fpart); return; } } if (0) { - lod = lp_build_log2(float_bld, rho); + lod = lp_build_log2(perquadf_bld, rho); } else { - lod = lp_build_fast_log2(float_bld, rho); + lod = lp_build_fast_log2(perquadf_bld, rho); } /* add shader lod bias */ if (lod_bias) { - lod_bias = LLVMBuildExtractElement(builder, lod_bias, - index0, ""); + lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type, + perquadf_bld->type, lod_bias); lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias"); } } /* add sampler lod bias */ - if (bld->static_state->lod_bias_non_zero) + if (bld->static_state->lod_bias_non_zero) { + LLVMValueRef sampler_lod_bias = + bld->dynamic_state->lod_bias(bld->dynamic_state, bld->gallivm, unit); + sampler_lod_bias = lp_build_broadcast_scalar(perquadf_bld, + sampler_lod_bias); lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias"); - + } /* clamp lod */ if (bld->static_state->apply_max_lod) { LLVMValueRef max_lod = bld->dynamic_state->max_lod(bld->dynamic_state, bld->gallivm, unit); + max_lod = lp_build_broadcast_scalar(perquadf_bld, max_lod); - lod = lp_build_min(float_bld, lod, max_lod); + lod = lp_build_min(perquadf_bld, lod, max_lod); } if (bld->static_state->apply_min_lod) { LLVMValueRef min_lod = bld->dynamic_state->min_lod(bld->dynamic_state, bld->gallivm, unit); + min_lod = lp_build_broadcast_scalar(perquadf_bld, min_lod); - lod = lp_build_max(float_bld, lod, min_lod); + lod = lp_build_max(perquadf_bld, lod, min_lod); } } if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) { - lp_build_brilinear_lod(float_bld, lod, BRILINEAR_FACTOR, + lp_build_brilinear_lod(perquadf_bld, lod, BRILINEAR_FACTOR, out_lod_ipart, out_lod_fpart); } else { - lp_build_ifloor_fract(float_bld, lod, out_lod_ipart, 
out_lod_fpart); + lp_build_ifloor_fract(perquadf_bld, lod, out_lod_ipart, out_lod_fpart); } lp_build_name(*out_lod_fpart, "lod_fpart"); } else { - *out_lod_ipart = lp_build_iround(float_bld, lod); + *out_lod_ipart = lp_build_iround(perquadf_bld, lod); } lp_build_name(*out_lod_ipart, "lod_ipart"); @@ -536,8 +615,8 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, /** * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer * mipmap level index. - * Note: this is all scalar code. - * \param lod scalar float texture level of detail + * Note: this is all scalar per quad code. + * \param lod_ipart int texture level of detail * \param level_out returns integer */ void @@ -546,26 +625,27 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld, LLVMValueRef lod_ipart, LLVMValueRef *level_out) { - struct lp_build_context *int_bld = &bld->int_bld; + struct lp_build_context *perquadi_bld = &bld->perquadi_bld; LLVMValueRef first_level, last_level, level; first_level = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm, unit); last_level = bld->dynamic_state->last_level(bld->dynamic_state, bld->gallivm, unit); + first_level = lp_build_broadcast_scalar(perquadi_bld, first_level); + last_level = lp_build_broadcast_scalar(perquadi_bld, last_level); - /* convert float lod to integer */ - level = lp_build_add(int_bld, lod_ipart, first_level); + level = lp_build_add(perquadi_bld, lod_ipart, first_level); /* clamp level to legal range of levels */ - *level_out = lp_build_clamp(int_bld, level, first_level, last_level); + *level_out = lp_build_clamp(perquadi_bld, level, first_level, last_level); } /** - * For PIPE_TEX_MIPFILTER_LINEAR, convert float LOD to integer to - * two (adjacent) mipmap level indexes. Later, we'll sample from those - * two mipmap levels and interpolate between them. + * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad int LOD(s) to two (per-quad) + * (adjacent) mipmap level indexes, and fix up float lod part accordingly. 
+ * Later, we'll sample from those two mipmap levels and interpolate between them. */ void lp_build_linear_mip_levels(struct lp_build_sample_context *bld, @@ -576,20 +656,21 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld, LLVMValueRef *level1_out) { LLVMBuilderRef builder = bld->gallivm->builder; - struct lp_build_context *int_bld = &bld->int_bld; - struct lp_build_context *float_bld = &bld->float_bld; + struct lp_build_context *perquadi_bld = &bld->perquadi_bld; + struct lp_build_context *perquadf_bld = &bld->perquadf_bld; LLVMValueRef first_level, last_level; LLVMValueRef clamp_min; LLVMValueRef clamp_max; first_level = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm, unit); - - *level0_out = lp_build_add(int_bld, lod_ipart, first_level); - *level1_out = lp_build_add(int_bld, *level0_out, int_bld->one); - last_level = bld->dynamic_state->last_level(bld->dynamic_state, bld->gallivm, unit); + first_level = lp_build_broadcast_scalar(perquadi_bld, first_level); + last_level = lp_build_broadcast_scalar(perquadi_bld, last_level); + + *level0_out = lp_build_add(perquadi_bld, lod_ipart, first_level); + *level1_out = lp_build_add(perquadi_bld, *level0_out, perquadi_bld->one); /* * Clamp both *level0_out and *level1_out to [first_level, last_level], with @@ -597,6 +678,15 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld, * ends in the process. */ + /* + * This code (vector select in particular) only works with llvm 3.1 + * (if there's more than one quad, with x86 backend). Might consider + * converting to our lp_bld_logic helpers. 
+ */ +#if HAVE_LLVM < 0x0301 + assert(perquadi_bld->type.length == 1); +#endif + /* *level0_out < first_level */ clamp_min = LLVMBuildICmp(builder, LLVMIntSLT, *level0_out, first_level, @@ -609,7 +699,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld, first_level, *level1_out, ""); *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min, - float_bld->zero, *lod_fpart_inout, ""); + perquadf_bld->zero, *lod_fpart_inout, ""); /* *level0_out >= last_level */ clamp_max = LLVMBuildICmp(builder, LLVMIntSGE, @@ -623,7 +713,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld, last_level, *level1_out, ""); *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max, - float_bld->zero, *lod_fpart_inout, ""); + perquadf_bld->zero, *lod_fpart_inout, ""); lp_build_name(*level0_out, "sampler%u_miplevel0", unit); lp_build_name(*level1_out, "sampler%u_miplevel1", unit); @@ -651,15 +741,6 @@ lp_build_get_mipmap_level(struct lp_build_sample_context *bld, } -LLVMValueRef -lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld, - int level) -{ - LLVMValueRef lvl = lp_build_const_int32(bld->gallivm, level); - return lp_build_get_mipmap_level(bld, lvl); -} - - /** * Codegen equivalent for u_minify(). * Return max(1, base_size >> level); @@ -748,8 +829,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld, * bld->int_size_type or bld->float_size_type) * @param coord_type type of the texture size vector (either * bld->int_coord_type or bld->coord_type) - * @param int_size vector with the integer texture size (width, height, - * depth) + * @param size vector with the texture size (width, height, depth) */ void lp_build_extract_image_sizes(struct lp_build_sample_context *bld, @@ -788,7 +868,7 @@ lp_build_extract_image_sizes(struct lp_build_sample_context *bld, /** * Unnormalize coords. 
* - * @param int_size vector with the integer texture size (width, height, depth) + * @param flt_size vector with the integer texture size (width, height, depth) */ void lp_build_unnormalized_coords(struct lp_build_sample_context *bld, @@ -823,7 +903,18 @@ lp_build_unnormalized_coords(struct lp_build_sample_context *bld, /** Helper used by lp_build_cube_lookup() */ static LLVMValueRef -lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord) +lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord) +{ + /* ima = +0.5 / abs(coord); */ + LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5); + LLVMValueRef absCoord = lp_build_abs(coord_bld, coord); + LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord); + return ima; +} + +/** Helper used by lp_build_cube_lookup() */ +static LLVMValueRef +lp_build_cube_imaneg(struct lp_build_context *coord_bld, LLVMValueRef coord) { /* ima = -0.5 / abs(coord); */ LLVMValueRef negHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, -0.5); @@ -832,9 +923,12 @@ lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord) return ima; } - /** * Helper used by lp_build_cube_lookup() + * FIXME: the sign here can also be 0. + * Arithmetically this could definitely make a difference. Either + * fix the comment or use other (simpler) sign function, not sure + * which one it should be. 
* \param sign scalar +1 or -1 * \param coord float vector * \param ima float vector @@ -898,58 +992,186 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld, LLVMValueRef *face_s, LLVMValueRef *face_t) { - struct lp_build_context *float_bld = &bld->float_bld; struct lp_build_context *coord_bld = &bld->coord_bld; LLVMBuilderRef builder = bld->gallivm->builder; + struct gallivm_state *gallivm = bld->gallivm; LLVMValueRef rx, ry, rz; - LLVMValueRef arx, ary, arz; - LLVMValueRef c25 = lp_build_const_float(bld->gallivm, 0.25); - LLVMValueRef arx_ge_ary, arx_ge_arz; - LLVMValueRef ary_ge_arx, ary_ge_arz; - LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz; - - assert(bld->coord_bld.type.length == 4); + LLVMValueRef tmp[4], rxyz, arxyz; /* * Use the average of the four pixel's texcoords to choose the face. + * Slight simplification just calculate the sum, skip scaling. */ - rx = lp_build_mul(float_bld, c25, - lp_build_sum_vector(&bld->coord_bld, s)); - ry = lp_build_mul(float_bld, c25, - lp_build_sum_vector(&bld->coord_bld, t)); - rz = lp_build_mul(float_bld, c25, - lp_build_sum_vector(&bld->coord_bld, r)); + tmp[0] = s; + tmp[1] = t; + tmp[2] = r; + rxyz = lp_build_hadd_partial4(&bld->coord_bld, tmp, 3); + arxyz = lp_build_abs(&bld->coord_bld, rxyz); + + if (coord_bld->type.length > 4) { + struct lp_build_context *cint_bld = &bld->int_coord_bld; + struct lp_type intctype = cint_bld->type; + LLVMValueRef signrxs, signrys, signrzs, signrxyz, sign; + LLVMValueRef arxs, arys, arzs; + LLVMValueRef arx_ge_ary, maxarxsarys, arz_ge_arx_ary; + LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz; + LLVMValueRef ryneg, rzneg; + LLVMValueRef ma, ima; + LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5); + LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype, + 1 << (intctype.width - 1)); + LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype, + intctype.width -1); + LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, 
PIPE_TEX_FACE_POS_X); + LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y); + LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z); + + assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1); + assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1); + assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1); + + rx = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), ""); + ry = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), ""); + rz = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), ""); + ryneg = LLVMBuildXor(builder, ry, signmask, ""); + rzneg = LLVMBuildXor(builder, rz, signmask, ""); + + /* the sign bit comes from the averaged vector (per quad), + * as does the decision which face to use */ + signrxyz = LLVMBuildBitCast(builder, rxyz, lp_build_vec_type(gallivm, intctype), ""); + signrxyz = LLVMBuildAnd(builder, signrxyz, signmask, ""); + + arxs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 0); + arys = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 1); + arzs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 2); - arx = lp_build_abs(float_bld, rx); - ary = lp_build_abs(float_bld, ry); - arz = lp_build_abs(float_bld, rz); + /* + * select x if x >= y else select y + * select previous result if y >= max(x,y) else select z + */ + arx_ge_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, arxs, arys); + maxarxsarys = lp_build_max(coord_bld, arxs, arys); + arz_ge_arx_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, maxarxsarys, arzs); - /* - * Compare sign/magnitude of rx,ry,rz to determine face - */ - arx_ge_ary = LLVMBuildFCmp(builder, LLVMRealUGE, arx, ary, ""); - arx_ge_arz = LLVMBuildFCmp(builder, LLVMRealUGE, arx, arz, ""); - ary_ge_arx = LLVMBuildFCmp(builder, LLVMRealUGE, ary, arx, ""); - ary_ge_arz = LLVMBuildFCmp(builder, LLVMRealUGE, ary, arz, ""); + /* + * compute all possible new s/t coords + * snewx = signrx * -rz; + * tnewx = -ry; + * snewy = 
rx; + * tnewy = signry * rz; + * snewz = signrz * rx; + * tnewz = -ry; + */ + signrxs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 0); + snewx = LLVMBuildXor(builder, signrxs, rzneg, ""); + tnewx = ryneg; + + signrys = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 1); + snewy = rx; + tnewy = LLVMBuildXor(builder, signrys, rz, ""); + + signrzs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 2); + snewz = LLVMBuildXor(builder, signrzs, rx, ""); + tnewz = ryneg; + + /* XXX on x86 unclear if we should cast the values back to float + * or not - on some cpus (nehalem) pblendvb has twice the throughput + * of blendvps though on others there just might be domain + * transition penalties when using it (this depends on what llvm + * will chose for the bit ops above so there appears no "right way", + * but given the boatload of selects let's just use the int type). + * + * Unfortunately we also need the sign bit of the summed coords. + */ + *face_s = lp_build_select(cint_bld, arx_ge_ary, snewx, snewy); + *face_t = lp_build_select(cint_bld, arx_ge_ary, tnewx, tnewy); + ma = lp_build_select(coord_bld, arx_ge_ary, s, t); + *face = lp_build_select(cint_bld, arx_ge_ary, facex, facey); + sign = lp_build_select(cint_bld, arx_ge_ary, signrxs, signrys); + + *face_s = lp_build_select(cint_bld, arz_ge_arx_ary, *face_s, snewz); + *face_t = lp_build_select(cint_bld, arz_ge_arx_ary, *face_t, tnewz); + ma = lp_build_select(coord_bld, arz_ge_arx_ary, ma, r); + *face = lp_build_select(cint_bld, arz_ge_arx_ary, *face, facez); + sign = lp_build_select(cint_bld, arz_ge_arx_ary, sign, signrzs); + + *face_s = LLVMBuildBitCast(builder, *face_s, + lp_build_vec_type(gallivm, coord_bld->type), ""); + *face_t = LLVMBuildBitCast(builder, *face_t, + lp_build_vec_type(gallivm, coord_bld->type), ""); + + /* add +1 for neg face */ + /* XXX with AVX probably want to use another select here - + * as long as we ensure vblendvps gets used we can actually + * skip the comparison and just use sign as a 
"mask" directly. + */ + sign = LLVMBuildLShr(builder, sign, signshift, ""); + *face = LLVMBuildOr(builder, *face, sign, "face"); - arx_ge_ary_arz = LLVMBuildAnd(builder, arx_ge_ary, arx_ge_arz, ""); - ary_ge_arx_arz = LLVMBuildAnd(builder, ary_ge_arx, ary_ge_arz, ""); + ima = lp_build_cube_imapos(coord_bld, ma); + + *face_s = lp_build_mul(coord_bld, *face_s, ima); + *face_s = lp_build_add(coord_bld, *face_s, posHalf); + *face_t = lp_build_mul(coord_bld, *face_t, ima); + *face_t = lp_build_add(coord_bld, *face_t, posHalf); + } - { + else { struct lp_build_if_state if_ctx; LLVMValueRef face_s_var; LLVMValueRef face_t_var; LLVMValueRef face_var; - - face_s_var = lp_build_alloca(bld->gallivm, bld->coord_bld.vec_type, "face_s_var"); - face_t_var = lp_build_alloca(bld->gallivm, bld->coord_bld.vec_type, "face_t_var"); - face_var = lp_build_alloca(bld->gallivm, bld->int_bld.vec_type, "face_var"); - - lp_build_if(&if_ctx, bld->gallivm, arx_ge_ary_arz); + LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz; + LLVMValueRef shuffles[4]; + LLVMValueRef arxy_ge_aryx, arxy_ge_arzz, arxy_ge_arxy_arzz; + LLVMValueRef arxyxy, aryxzz, arxyxy_ge_aryxzz; + struct lp_build_context *float_bld = &bld->float_bld; + + assert(bld->coord_bld.type.length == 4); + + shuffles[0] = lp_build_const_int32(gallivm, 0); + shuffles[1] = lp_build_const_int32(gallivm, 1); + shuffles[2] = lp_build_const_int32(gallivm, 0); + shuffles[3] = lp_build_const_int32(gallivm, 1); + arxyxy = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), ""); + shuffles[0] = lp_build_const_int32(gallivm, 1); + shuffles[1] = lp_build_const_int32(gallivm, 0); + shuffles[2] = lp_build_const_int32(gallivm, 2); + shuffles[3] = lp_build_const_int32(gallivm, 2); + aryxzz = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), ""); + arxyxy_ge_aryxzz = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_GEQUAL, arxyxy, aryxzz); + + shuffles[0] = lp_build_const_int32(gallivm, 0); + shuffles[1] = 
lp_build_const_int32(gallivm, 1); + arxy_ge_aryx = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz, + LLVMConstVector(shuffles, 2), ""); + shuffles[0] = lp_build_const_int32(gallivm, 2); + shuffles[1] = lp_build_const_int32(gallivm, 3); + arxy_ge_arzz = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz, + LLVMConstVector(shuffles, 2), ""); + arxy_ge_arxy_arzz = LLVMBuildAnd(builder, arxy_ge_aryx, arxy_ge_arzz, ""); + + arx_ge_ary_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz, + lp_build_const_int32(gallivm, 0), ""); + arx_ge_ary_arz = LLVMBuildICmp(builder, LLVMIntNE, arx_ge_ary_arz, + lp_build_const_int32(gallivm, 0), ""); + ary_ge_arx_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz, + lp_build_const_int32(gallivm, 1), ""); + ary_ge_arx_arz = LLVMBuildICmp(builder, LLVMIntNE, ary_ge_arx_arz, + lp_build_const_int32(gallivm, 0), ""); + face_s_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_s_var"); + face_t_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_t_var"); + face_var = lp_build_alloca(gallivm, bld->int_bld.vec_type, "face_var"); + + lp_build_if(&if_ctx, gallivm, arx_ge_ary_arz); { /* +/- X face */ - LLVMValueRef sign = lp_build_sgn(float_bld, rx); - LLVMValueRef ima = lp_build_cube_ima(coord_bld, s); + LLVMValueRef sign, ima; + rx = LLVMBuildExtractElement(builder, rxyz, + lp_build_const_int32(gallivm, 0), ""); + /* +/- X face */ + sign = lp_build_sgn(float_bld, rx); + ima = lp_build_cube_imaneg(coord_bld, s); *face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima); *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima); *face = lp_build_cube_face(bld, rx, @@ -963,11 +1185,14 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld, { struct lp_build_if_state if_ctx2; - lp_build_if(&if_ctx2, bld->gallivm, ary_ge_arx_arz); + lp_build_if(&if_ctx2, gallivm, ary_ge_arx_arz); { + LLVMValueRef sign, ima; /* +/- Y face */ - LLVMValueRef sign = lp_build_sgn(float_bld, ry); - 
LLVMValueRef ima = lp_build_cube_ima(coord_bld, t); + ry = LLVMBuildExtractElement(builder, rxyz, + lp_build_const_int32(gallivm, 1), ""); + sign = lp_build_sgn(float_bld, ry); + ima = lp_build_cube_imaneg(coord_bld, t); *face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima); *face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima); *face = lp_build_cube_face(bld, ry, @@ -980,8 +1205,11 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld, lp_build_else(&if_ctx2); { /* +/- Z face */ - LLVMValueRef sign = lp_build_sgn(float_bld, rz); - LLVMValueRef ima = lp_build_cube_ima(coord_bld, r); + LLVMValueRef sign, ima; + rz = LLVMBuildExtractElement(builder, rxyz, + lp_build_const_int32(gallivm, 2), ""); + sign = lp_build_sgn(float_bld, rz); + ima = lp_build_cube_imaneg(coord_bld, r); *face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima); *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima); *face = lp_build_cube_face(bld, rz, @@ -999,6 +1227,7 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld, *face_s = LLVMBuildLoad(builder, face_s_var, "face_s"); *face_t = LLVMBuildLoad(builder, face_t_var, "face_t"); *face = LLVMBuildLoad(builder, face_var, "face"); + *face = lp_build_broadcast_scalar(&bld->int_coord_bld, *face); } } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h index dad138a..0f3d8ae 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h @@ -52,6 +52,15 @@ struct lp_build_context; /** + * Helper struct holding all derivatives needed for sampling + */ +struct lp_derivatives +{ + LLVMValueRef ddx_ddy[2]; +}; + + +/** * Sampler static state. 
* * These are the bits of state from pipe_resource and pipe_sampler_state that @@ -192,6 +201,9 @@ struct lp_build_sample_context /* See texture_dims() */ unsigned dims; + /** SIMD vector width */ + unsigned vector_width; + /** regular scalar float type */ struct lp_type float_type; struct lp_build_context float_bld; @@ -199,7 +211,7 @@ struct lp_build_sample_context /** float vector type */ struct lp_build_context float_vec_bld; - /** regular scalar float type */ + /** regular scalar int type */ struct lp_type int_type; struct lp_build_context int_bld; @@ -223,10 +235,15 @@ struct lp_build_sample_context struct lp_type texel_type; struct lp_build_context texel_bld; + /** Float per-quad type */ + struct lp_type perquadf_type; + struct lp_build_context perquadf_bld; + + /** Int per-quad type */ + struct lp_type perquadi_type; + struct lp_build_context perquadi_bld; + /* Common dynamic state values */ - LLVMValueRef width; - LLVMValueRef height; - LLVMValueRef depth; LLVMValueRef row_stride_array; LLVMValueRef img_stride_array; LLVMValueRef data_array; @@ -305,8 +322,7 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, void lp_build_lod_selector(struct lp_build_sample_context *bld, unsigned unit, - const LLVMValueRef ddx[4], - const LLVMValueRef ddy[4], + const struct lp_derivatives *derivs, LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ unsigned mip_filter, @@ -331,10 +347,6 @@ LLVMValueRef lp_build_get_mipmap_level(struct lp_build_sample_context *bld, LLVMValueRef level); -LLVMValueRef -lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld, - int level); - void lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld, @@ -402,22 +414,35 @@ lp_build_sample_soa(struct gallivm_state *gallivm, unsigned unit, unsigned num_coords, const LLVMValueRef *coords, - const LLVMValueRef *ddx, - const LLVMValueRef *ddy, + const struct lp_derivatives *derivs, LLVMValueRef lod_bias, LLVMValueRef explicit_lod, 
LLVMValueRef texel_out[4]); + +void +lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld, + LLVMValueRef coord_f, + LLVMValueRef length_i, + LLVMValueRef length_f, + LLVMValueRef *coord0_i, + LLVMValueRef *weight_f); + + void lp_build_size_query_soa(struct gallivm_state *gallivm, const struct lp_sampler_static_state *static_state, struct lp_sampler_dynamic_state *dynamic_state, + struct lp_type int_type, unsigned unit, LLVMValueRef explicit_lod, LLVMValueRef *sizes_out); void -lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type, +lp_build_sample_nop(struct gallivm_state *gallivm, + struct lp_type type, + unsigned num_coords, + const LLVMValueRef *coords, LLVMValueRef texel_out[4]); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c index 74858bc..ad1b29c 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c @@ -27,7 +27,7 @@ /** * @file - * Texture sampling -- SoA. + * Texture sampling -- AoS. 
* * @author Jose Fonseca <jfonseca@vmware.com> * @author Brian Paul <brianp@vmware.com> @@ -40,6 +40,7 @@ #include "util/u_memory.h" #include "util/u_math.h" #include "util/u_format.h" +#include "util/u_cpu_detect.h" #include "lp_bld_debug.h" #include "lp_bld_type.h" #include "lp_bld_const.h" @@ -75,6 +76,7 @@ static void lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld, unsigned block_length, LLVMValueRef coord, + LLVMValueRef coord_f, LLVMValueRef length, LLVMValueRef stride, boolean is_pot, @@ -93,10 +95,11 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld, if(is_pot) coord = LLVMBuildAnd(builder, coord, length_minus_one, ""); else { - /* Add a bias to the texcoord to handle negative coords */ - LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024); - coord = LLVMBuildAdd(builder, coord, bias, ""); - coord = LLVMBuildURem(builder, coord, length, ""); + struct lp_build_context *coord_bld = &bld->coord_bld; + LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length); + coord = lp_build_fract_safe(coord_bld, coord_f); + coord = lp_build_mul(coord_bld, coord, length_f); + coord = lp_build_itrunc(coord_bld, coord); } break; @@ -121,6 +124,56 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld, /** + * Build LLVM code for texture coord wrapping, for nearest filtering, + * for float texcoords. 
+ * \param coord the incoming texcoord (s,t,r or q) + * \param length the texture size along one dimension + * \param is_pot if TRUE, length is a power of two + * \param wrap_mode one of PIPE_TEX_WRAP_x + * \param icoord the texcoord after wrapping, as int + */ +static void +lp_build_sample_wrap_nearest_float(struct lp_build_sample_context *bld, + LLVMValueRef coord, + LLVMValueRef length, + boolean is_pot, + unsigned wrap_mode, + LLVMValueRef *icoord) +{ + struct lp_build_context *coord_bld = &bld->coord_bld; + LLVMValueRef length_minus_one; + + switch(wrap_mode) { + case PIPE_TEX_WRAP_REPEAT: + /* take fraction, unnormalize */ + coord = lp_build_fract_safe(coord_bld, coord); + coord = lp_build_mul(coord_bld, coord, length); + *icoord = lp_build_itrunc(coord_bld, coord); + break; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one); + if (bld->static_state->normalized_coords) { + /* scale coord to length */ + coord = lp_build_mul(coord_bld, coord, length); + } + coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, + length_minus_one); + *icoord = lp_build_itrunc(coord_bld, coord); + break; + + case PIPE_TEX_WRAP_CLAMP: + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + case PIPE_TEX_WRAP_MIRROR_REPEAT: + case PIPE_TEX_WRAP_MIRROR_CLAMP: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + default: + assert(0); + } +} + + +/** * Build LLVM code for texture coord wrapping, for linear filtering, * for scaled integer texcoords. 
* \param block_length is the length of the pixel block along the @@ -139,6 +192,8 @@ static void lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, unsigned block_length, LLVMValueRef coord0, + LLVMValueRef *weight_i, + LLVMValueRef coord_f, LLVMValueRef length, LLVMValueRef stride, boolean is_pot, @@ -153,58 +208,85 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, LLVMValueRef length_minus_one; LLVMValueRef lmask, umask, mask; - if (block_length != 1) { - /* - * If the pixel block covers more than one pixel then there is no easy - * way to calculate offset1 relative to offset0. Instead, compute them - * independently. - */ - - LLVMValueRef coord1; - - lp_build_sample_wrap_nearest_int(bld, - block_length, - coord0, - length, - stride, - is_pot, - wrap_mode, - offset0, i0); - - coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); + /* + * If the pixel block covers more than one pixel then there is no easy + * way to calculate offset1 relative to offset0. Instead, compute them + * independently. Otherwise, try to compute offset0 and offset1 with + * a single stride multiplication. 
+ */ - lp_build_sample_wrap_nearest_int(bld, - block_length, - coord1, - length, - stride, - is_pot, - wrap_mode, - offset1, i1); + length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one); + if (block_length != 1) { + LLVMValueRef coord1; + switch(wrap_mode) { + case PIPE_TEX_WRAP_REPEAT: + if (is_pot) { + coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); + coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, ""); + coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, ""); + } + else { + LLVMValueRef mask; + LLVMValueRef weight; + LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length); + lp_build_coord_repeat_npot_linear(bld, coord_f, + length, length_f, + &coord0, &weight); + mask = lp_build_compare(bld->gallivm, int_coord_bld->type, + PIPE_FUNC_NOTEQUAL, coord0, length_minus_one); + coord1 = LLVMBuildAnd(builder, + lp_build_add(int_coord_bld, coord0, + int_coord_bld->one), + mask, ""); + weight = lp_build_mul_imm(&bld->coord_bld, weight, 256); + *weight_i = lp_build_itrunc(&bld->coord_bld, weight); + } + break; + + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); + coord0 = lp_build_clamp(int_coord_bld, coord0, int_coord_bld->zero, + length_minus_one); + coord1 = lp_build_clamp(int_coord_bld, coord1, int_coord_bld->zero, + length_minus_one); + break; + + case PIPE_TEX_WRAP_CLAMP: + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + case PIPE_TEX_WRAP_MIRROR_REPEAT: + case PIPE_TEX_WRAP_MIRROR_CLAMP: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + default: + assert(0); + coord0 = int_coord_bld->zero; + coord1 = int_coord_bld->zero; + break; + } + lp_build_sample_partial_offset(int_coord_bld, block_length, coord0, stride, + offset0, i0); + lp_build_sample_partial_offset(int_coord_bld, block_length, coord1, stride, + offset1, i1); return; } - /* - * Scalar pixels -- try to compute offset0 and offset1 with a single stride - * 
multiplication. - */ - *i0 = int_coord_bld->zero; *i1 = int_coord_bld->zero; - length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one); - switch(wrap_mode) { case PIPE_TEX_WRAP_REPEAT: if (is_pot) { coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, ""); } else { - /* Add a bias to the texcoord to handle negative coords */ - LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024); - coord0 = LLVMBuildAdd(builder, coord0, bias, ""); - coord0 = LLVMBuildURem(builder, coord0, length, ""); + LLVMValueRef weight; + LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length); + lp_build_coord_repeat_npot_linear(bld, coord_f, + length, length_f, + &coord0, &weight); + weight = lp_build_mul_imm(&bld->coord_bld, weight, 256); + *weight_i = lp_build_itrunc(&bld->coord_bld, weight); } mask = lp_build_compare(bld->gallivm, int_coord_bld->type, @@ -217,6 +299,11 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, break; case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + /* XXX this might be slower than the separate path + * on some newer cpus. With sse41 this is 8 instructions vs. 7 + * - at least on SNB this is almost certainly slower since + * min/max are cheaper than selects, and the muls aren't bad. + */ lmask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type, PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero); umask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type, @@ -249,6 +336,176 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, /** + * Build LLVM code for texture coord wrapping, for linear filtering, + * for float texcoords. 
+ * \param block_length is the length of the pixel block along the + * coordinate axis + * \param coord the incoming texcoord (s,t,r or q) + * \param length the texture size along one dimension + * \param is_pot if TRUE, length is a power of two + * \param wrap_mode one of PIPE_TEX_WRAP_x + * \param coord0 the first texcoord after wrapping, as int + * \param coord1 the second texcoord after wrapping, as int + * \param weight the filter weight as int (0-255) + * \param force_nearest if this coord actually uses nearest filtering + */ +static void +lp_build_sample_wrap_linear_float(struct lp_build_sample_context *bld, + unsigned block_length, + LLVMValueRef coord, + LLVMValueRef length, + boolean is_pot, + unsigned wrap_mode, + LLVMValueRef *coord0, + LLVMValueRef *coord1, + LLVMValueRef *weight, + unsigned force_nearest) +{ + struct lp_build_context *int_coord_bld = &bld->int_coord_bld; + struct lp_build_context *coord_bld = &bld->coord_bld; + LLVMBuilderRef builder = bld->gallivm->builder; + LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5); + LLVMValueRef length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one); + + switch(wrap_mode) { + case PIPE_TEX_WRAP_REPEAT: + if (is_pot) { + /* mul by size and subtract 0.5 */ + coord = lp_build_mul(coord_bld, coord, length); + if (!force_nearest) + coord = lp_build_sub(coord_bld, coord, half); + *coord1 = lp_build_add(coord_bld, coord, coord_bld->one); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, coord0, weight); + *coord1 = lp_build_ifloor(coord_bld, *coord1); + /* repeat wrap */ + length_minus_one = lp_build_itrunc(coord_bld, length_minus_one); + *coord0 = LLVMBuildAnd(builder, *coord0, length_minus_one, ""); + *coord1 = LLVMBuildAnd(builder, *coord1, length_minus_one, ""); + } + else { + LLVMValueRef mask; + /* wrap with normalized floats is just fract */ + coord = lp_build_fract(coord_bld, coord); + /* unnormalize */ + coord = 
lp_build_mul(coord_bld, coord, length); + /* + * we avoided the 0.5/length division, have to fix up wrong + * edge cases with selects + */ + *coord1 = lp_build_add(coord_bld, coord, half); + coord = lp_build_sub(coord_bld, coord, half); + *weight = lp_build_fract(coord_bld, coord); + mask = lp_build_compare(coord_bld->gallivm, coord_bld->type, + PIPE_FUNC_LESS, coord, coord_bld->zero); + *coord0 = lp_build_select(coord_bld, mask, length_minus_one, coord); + *coord0 = lp_build_itrunc(coord_bld, *coord0); + mask = lp_build_compare(coord_bld->gallivm, coord_bld->type, + PIPE_FUNC_LESS, *coord1, length); + *coord1 = lp_build_select(coord_bld, mask, *coord1, coord_bld->zero); + *coord1 = lp_build_itrunc(coord_bld, *coord1); + } + break; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + if (bld->static_state->normalized_coords) { + /* mul by tex size */ + coord = lp_build_mul(coord_bld, coord, length); + } + /* subtract 0.5 */ + if (!force_nearest) { + coord = lp_build_sub(coord_bld, coord, half); + } + /* clamp to [0, length - 1] */ + coord = lp_build_min(coord_bld, coord, length_minus_one); + coord = lp_build_max(coord_bld, coord, coord_bld->zero); + *coord1 = lp_build_add(coord_bld, coord, coord_bld->one); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, coord0, weight); + /* coord1 = min(coord1, length-1) */ + *coord1 = lp_build_min(coord_bld, *coord1, length_minus_one); + *coord1 = lp_build_itrunc(coord_bld, *coord1); + break; + default: + assert(0); + *coord0 = int_coord_bld->zero; + *coord1 = int_coord_bld->zero; + *weight = coord_bld->zero; + break; + } + *weight = lp_build_mul_imm(coord_bld, *weight, 256); + *weight = lp_build_itrunc(coord_bld, *weight); + return; +} + + +/** + * Fetch texels for image with nearest sampling. + * Return filtered color as two vectors of 16-bit fixed point values. 
+ */ +static void +lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld, + LLVMValueRef data_ptr, + LLVMValueRef offset, + LLVMValueRef x_subcoord, + LLVMValueRef y_subcoord, + LLVMValueRef *colors_lo, + LLVMValueRef *colors_hi) +{ + /* + * Fetch the pixels as 4 x 32bit (rgba order might differ): + * + * rgba0 rgba1 rgba2 rgba3 + * + * bit cast them into 16 x u8 + * + * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 + * + * unpack them into two 8 x i16: + * + * r0 g0 b0 a0 r1 g1 b1 a1 + * r2 g2 b2 a2 r3 g3 b3 a3 + * + * The higher 8 bits of the resulting elements will be zero. + */ + LLVMBuilderRef builder = bld->gallivm->builder; + LLVMValueRef rgba8; + struct lp_build_context h16, u8n; + LLVMTypeRef u8n_vec_type; + + lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width)); + lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width)); + u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type); + + if (util_format_is_rgba8_variant(bld->format_desc)) { + /* + * Given the format is a rgba8, just read the pixels as is, + * without any swizzling. Swizzling will be done later. + */ + rgba8 = lp_build_gather(bld->gallivm, + bld->texel_type.length, + bld->format_desc->block.bits, + bld->texel_type.width, + data_ptr, offset); + + rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, ""); + } + else { + rgba8 = lp_build_fetch_rgba_aos(bld->gallivm, + bld->format_desc, + u8n.type, + data_ptr, offset, + x_subcoord, + y_subcoord); + } + + /* Expand one 4*rgba8 to two 2*rgba16 */ + lp_build_unpack2(bld->gallivm, u8n.type, h16.type, + rgba8, + colors_lo, colors_hi); +} + + +/** * Sample a single texture image with nearest sampling. * If sampling a cube texture, r = cube face in [0,5]. * Return filtered color as two vectors of 16-bit fixed point values. 
@@ -267,21 +524,19 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, { const unsigned dims = bld->dims; LLVMBuilderRef builder = bld->gallivm->builder; - struct lp_build_context i32, h16, u8n; - LLVMTypeRef i32_vec_type, u8n_vec_type; + struct lp_build_context i32; + LLVMTypeRef i32_vec_type; LLVMValueRef i32_c8; LLVMValueRef width_vec, height_vec, depth_vec; LLVMValueRef s_ipart, t_ipart = NULL, r_ipart = NULL; + LLVMValueRef s_float, t_float = NULL, r_float = NULL; LLVMValueRef x_stride; LLVMValueRef x_offset, offset; LLVMValueRef x_subcoord, y_subcoord, z_subcoord; - lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32)); - lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16)); - lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8)); + lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width)); i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type); - u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type); lp_build_extract_image_sizes(bld, bld->int_size_type, @@ -291,6 +546,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, &height_vec, &depth_vec); + s_float = s; t_float = t; r_float = r; + if (bld->static_state->normalized_coords) { LLVMValueRef scaled_size; LLVMValueRef flt_size; @@ -334,7 +591,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, /* Do texcoord wrapping, compute texel offset */ lp_build_sample_wrap_nearest_int(bld, bld->format_desc->block.width, - s_ipart, width_vec, x_stride, + s_ipart, s_float, + width_vec, x_stride, bld->static_state->pot_width, bld->static_state->wrap_s, &x_offset, &x_subcoord); @@ -343,7 +601,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, LLVMValueRef y_offset; lp_build_sample_wrap_nearest_int(bld, bld->format_desc->block.height, - t_ipart, height_vec, row_stride_vec, + t_ipart, t_float, + height_vec, row_stride_vec, bld->static_state->pot_height, bld->static_state->wrap_t, &y_offset, 
&y_subcoord); @@ -352,7 +611,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, LLVMValueRef z_offset; lp_build_sample_wrap_nearest_int(bld, 1, /* block length (depth) */ - r_ipart, depth_vec, img_stride_vec, + r_ipart, r_float, + depth_vec, img_stride_vec, bld->static_state->pot_depth, bld->static_state->wrap_r, &z_offset, &z_subcoord); @@ -366,6 +626,196 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, } } + lp_build_sample_fetch_image_nearest(bld, data_ptr, offset, + x_subcoord, y_subcoord, + colors_lo, colors_hi); +} + + +/** + * Sample a single texture image with nearest sampling. + * If sampling a cube texture, r = cube face in [0,5]. + * Return filtered color as two vectors of 16-bit fixed point values. + * Does address calcs (except offsets) with floats. + * Useful for AVX which has support for 8x32 floats but not 8x32 ints. + */ +static void +lp_build_sample_image_nearest_afloat(struct lp_build_sample_context *bld, + LLVMValueRef int_size, + LLVMValueRef row_stride_vec, + LLVMValueRef img_stride_vec, + LLVMValueRef data_ptr, + LLVMValueRef s, + LLVMValueRef t, + LLVMValueRef r, + LLVMValueRef *colors_lo, + LLVMValueRef *colors_hi) + { + const unsigned dims = bld->dims; + LLVMValueRef width_vec, height_vec, depth_vec; + LLVMValueRef offset; + LLVMValueRef x_subcoord, y_subcoord; + LLVMValueRef x_icoord, y_icoord, z_icoord; + LLVMValueRef flt_size; + + flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size); + + lp_build_extract_image_sizes(bld, + bld->float_size_type, + bld->coord_type, + flt_size, + &width_vec, + &height_vec, + &depth_vec); + + /* Do texcoord wrapping */ + lp_build_sample_wrap_nearest_float(bld, + s, width_vec, + bld->static_state->pot_width, + bld->static_state->wrap_s, + &x_icoord); + + if (dims >= 2) { + lp_build_sample_wrap_nearest_float(bld, + t, height_vec, + bld->static_state->pot_height, + bld->static_state->wrap_t, + &y_icoord); + + if (dims >= 3) { + 
lp_build_sample_wrap_nearest_float(bld, + r, depth_vec, + bld->static_state->pot_depth, + bld->static_state->wrap_r, + &z_icoord); + } + else if (bld->static_state->target == PIPE_TEXTURE_CUBE) { + z_icoord = r; + } + } + + /* + * From here on we deal with ints, and we should split up the 256bit + * vectors manually for better generated code. + */ + + /* + * compute texel offsets - + * cannot do offset calc with floats, difficult for block-based formats, + * and not enough precision anyway. + */ + lp_build_sample_offset(&bld->int_coord_bld, + bld->format_desc, + x_icoord, y_icoord, + z_icoord, + row_stride_vec, img_stride_vec, + &offset, + &x_subcoord, &y_subcoord); + + lp_build_sample_fetch_image_nearest(bld, data_ptr, offset, + x_subcoord, y_subcoord, + colors_lo, colors_hi); +} + + +/** + * Fetch texels for image with linear sampling. + * Return filtered color as two vectors of 16-bit fixed point values. + */ +static void +lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld, + LLVMValueRef data_ptr, + LLVMValueRef offset[2][2][2], + LLVMValueRef x_subcoord[2], + LLVMValueRef y_subcoord[2], + LLVMValueRef s_fpart, + LLVMValueRef t_fpart, + LLVMValueRef r_fpart, + LLVMValueRef *colors_lo, + LLVMValueRef *colors_hi) +{ + const unsigned dims = bld->dims; + LLVMBuilderRef builder = bld->gallivm->builder; + struct lp_build_context h16, u8n; + LLVMTypeRef h16_vec_type, u8n_vec_type; + LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context); + LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef shuffle_lo, shuffle_hi; + LLVMValueRef s_fpart_lo, s_fpart_hi; + LLVMValueRef t_fpart_lo = NULL, t_fpart_hi = NULL; + LLVMValueRef r_fpart_lo = NULL, r_fpart_hi = NULL; + LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */ + LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */ + LLVMValueRef packed_lo, packed_hi; + unsigned i, j, k; + unsigned numj, numk; + + lp_build_context_init(&h16, 
bld->gallivm, lp_type_ufixed(16, bld->vector_width)); + lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width)); + h16_vec_type = lp_build_vec_type(bld->gallivm, h16.type); + u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type); + + /* + * Transform 4 x i32 in + * + * s_fpart = {s0, s1, s2, s3} + * + * into 8 x i16 + * + * s_fpart = {00, s0, 00, s1, 00, s2, 00, s3} + * + * into two 8 x i16 + * + * s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1} + * s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3} + * + * and likewise for t_fpart. There is no risk of loosing precision here + * since the fractional parts only use the lower 8bits. + */ + s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, ""); + if (dims >= 2) + t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, ""); + if (dims >= 3) + r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, ""); + + for (j = 0; j < h16.type.length; j += 4) { +#ifdef PIPE_ARCH_LITTLE_ENDIAN + unsigned subindex = 0; +#else + unsigned subindex = 1; +#endif + LLVMValueRef index; + + index = LLVMConstInt(elem_type, j/2 + subindex, 0); + for (i = 0; i < 4; ++i) + shuffles_lo[j + i] = index; + + index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0); + for (i = 0; i < 4; ++i) + shuffles_hi[j + i] = index; + } + + shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length); + shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length); + + s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, + shuffle_lo, ""); + s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, + shuffle_hi, ""); + if (dims >= 2) { + t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, + shuffle_lo, ""); + t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, + shuffle_hi, ""); + } + if (dims >= 3) { + r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef, + shuffle_lo, ""); + r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef, + shuffle_hi, ""); + } 
+ /* * Fetch the pixels as 4 x 32bit (rgba order might differ): * @@ -382,38 +832,129 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, * * The higher 8 bits of the resulting elements will be zero. */ - { - LLVMValueRef rgba8; + numj = 1 + (dims >= 2); + numk = 1 + (dims >= 3); - if (util_format_is_rgba8_variant(bld->format_desc)) { - /* - * Given the format is a rgba8, just read the pixels as is, - * without any swizzling. Swizzling will be done later. - */ - rgba8 = lp_build_gather(bld->gallivm, - bld->texel_type.length, - bld->format_desc->block.bits, - bld->texel_type.width, - data_ptr, offset); + for (k = 0; k < numk; k++) { + for (j = 0; j < numj; j++) { + for (i = 0; i < 2; i++) { + LLVMValueRef rgba8; + + if (util_format_is_rgba8_variant(bld->format_desc)) { + /* + * Given the format is a rgba8, just read the pixels as is, + * without any swizzling. Swizzling will be done later. + */ + rgba8 = lp_build_gather(bld->gallivm, + bld->texel_type.length, + bld->format_desc->block.bits, + bld->texel_type.width, + data_ptr, offset[k][j][i]); + + rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, ""); + } + else { + rgba8 = lp_build_fetch_rgba_aos(bld->gallivm, + bld->format_desc, + u8n.type, + data_ptr, offset[k][j][i], + x_subcoord[i], + y_subcoord[j]); + } - rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, ""); + /* Expand one 4*rgba8 to two 2*rgba16 */ + lp_build_unpack2(bld->gallivm, u8n.type, h16.type, + rgba8, + &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]); + } } - else { - rgba8 = lp_build_fetch_rgba_aos(bld->gallivm, - bld->format_desc, - u8n.type, - data_ptr, offset, - x_subcoord, - y_subcoord); + } + + /* + * Linear interpolation with 8.8 fixed point. 
+ */ + if (bld->static_state->force_nearest_s) { + /* special case 1-D lerp */ + packed_lo = lp_build_lerp(&h16, + t_fpart_lo, + neighbors_lo[0][0][0], + neighbors_lo[0][0][1]); + + packed_hi = lp_build_lerp(&h16, + t_fpart_hi, + neighbors_hi[0][1][0], + neighbors_hi[0][1][0]); + } + else if (bld->static_state->force_nearest_t) { + /* special case 1-D lerp */ + packed_lo = lp_build_lerp(&h16, + s_fpart_lo, + neighbors_lo[0][0][0], + neighbors_lo[0][0][1]); + + packed_hi = lp_build_lerp(&h16, + s_fpart_hi, + neighbors_hi[0][0][0], + neighbors_hi[0][0][1]); + } + else { + /* general 1/2/3-D lerping */ + if (dims == 1) { + packed_lo = lp_build_lerp(&h16, + s_fpart_lo, + neighbors_lo[0][0][0], + neighbors_lo[0][0][1]); + + packed_hi = lp_build_lerp(&h16, + s_fpart_hi, + neighbors_hi[0][0][0], + neighbors_hi[0][0][1]); } + else { + /* 2-D lerp */ + packed_lo = lp_build_lerp_2d(&h16, + s_fpart_lo, t_fpart_lo, + neighbors_lo[0][0][0], + neighbors_lo[0][0][1], + neighbors_lo[0][1][0], + neighbors_lo[0][1][1]); + + packed_hi = lp_build_lerp_2d(&h16, + s_fpart_hi, t_fpart_hi, + neighbors_hi[0][0][0], + neighbors_hi[0][0][1], + neighbors_hi[0][1][0], + neighbors_hi[0][1][1]); + + if (dims >= 3) { + LLVMValueRef packed_lo2, packed_hi2; + + /* lerp in the second z slice */ + packed_lo2 = lp_build_lerp_2d(&h16, + s_fpart_lo, t_fpart_lo, + neighbors_lo[1][0][0], + neighbors_lo[1][0][1], + neighbors_lo[1][1][0], + neighbors_lo[1][1][1]); - /* Expand one 4*rgba8 to two 2*rgba16 */ - lp_build_unpack2(bld->gallivm, u8n.type, h16.type, - rgba8, - colors_lo, colors_hi); + packed_hi2 = lp_build_lerp_2d(&h16, + s_fpart_hi, t_fpart_hi, + neighbors_hi[1][0][0], + neighbors_hi[1][0][1], + neighbors_hi[1][1][0], + neighbors_hi[1][1][1]); + /* interp between two z slices */ + packed_lo = lp_build_lerp(&h16, r_fpart_lo, + packed_lo, packed_lo2); + packed_hi = lp_build_lerp(&h16, r_fpart_hi, + packed_hi, packed_hi2); + } + } } -} + *colors_lo = packed_lo; + *colors_hi = packed_hi; +} /** * 
Sample a single texture image with (bi-)(tri-)linear sampling. @@ -433,33 +974,24 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, { const unsigned dims = bld->dims; LLVMBuilderRef builder = bld->gallivm->builder; - struct lp_build_context i32, h16, u8n; - LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type; + struct lp_build_context i32; + LLVMTypeRef i32_vec_type; LLVMValueRef i32_c8, i32_c128, i32_c255; LLVMValueRef width_vec, height_vec, depth_vec; - LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi; - LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_fpart_lo = NULL, t_fpart_hi = NULL; - LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_fpart_lo = NULL, r_fpart_hi = NULL; + LLVMValueRef s_ipart, s_fpart, s_float; + LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_float = NULL; + LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_float = NULL; LLVMValueRef x_stride, y_stride, z_stride; LLVMValueRef x_offset0, x_offset1; LLVMValueRef y_offset0, y_offset1; LLVMValueRef z_offset0, z_offset1; LLVMValueRef offset[2][2][2]; /* [z][y][x] */ LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2]; - LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */ - LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */ - LLVMValueRef packed_lo, packed_hi; unsigned x, y, z; - unsigned i, j, k; - unsigned numj, numk; - lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32)); - lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16)); - lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8)); + lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width)); i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type); - h16_vec_type = lp_build_vec_type(bld->gallivm, h16.type); - u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type); lp_build_extract_image_sizes(bld, bld->int_size_type, @@ -469,6 +1001,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, &height_vec, &depth_vec); + s_float = s; t_float = t; r_float = r; + 
if (bld->static_state->normalized_coords) { LLVMValueRef scaled_size; LLVMValueRef flt_size; @@ -533,7 +1067,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, /* do texcoord wrapping and compute texel offsets */ lp_build_sample_wrap_linear_int(bld, bld->format_desc->block.width, - s_ipart, width_vec, x_stride, + s_ipart, &s_fpart, s_float, + width_vec, x_stride, bld->static_state->pot_width, bld->static_state->wrap_s, &x_offset0, &x_offset1, @@ -548,7 +1083,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, if (dims >= 2) { lp_build_sample_wrap_linear_int(bld, bld->format_desc->block.height, - t_ipart, height_vec, y_stride, + t_ipart, &t_fpart, t_float, + height_vec, y_stride, bld->static_state->pot_height, bld->static_state->wrap_t, &y_offset0, &y_offset1, @@ -567,7 +1103,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, if (dims >= 3) { lp_build_sample_wrap_linear_int(bld, bld->format_desc->block.height, - r_ipart, depth_vec, z_stride, + r_ipart, &r_fpart, r_float, + depth_vec, z_stride, bld->static_state->pot_depth, bld->static_state->wrap_r, &z_offset0, &z_offset1, @@ -593,212 +1130,175 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, } } - /* - * Transform 4 x i32 in - * - * s_fpart = {s0, s1, s2, s3} - * - * into 8 x i16 - * - * s_fpart = {00, s0, 00, s1, 00, s2, 00, s3} - * - * into two 8 x i16 - * - * s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1} - * s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3} - * - * and likewise for t_fpart. There is no risk of loosing precision here - * since the fractional parts only use the lower 8bits. 
- */ - s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, ""); - if (dims >= 2) - t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, ""); - if (dims >= 3) - r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, ""); + lp_build_sample_fetch_image_linear(bld, data_ptr, offset, + x_subcoord, y_subcoord, + s_fpart, t_fpart, r_fpart, + colors_lo, colors_hi); +} - { - LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context); - LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH]; - LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH]; - LLVMValueRef shuffle_lo; - LLVMValueRef shuffle_hi; - for (j = 0; j < h16.type.length; j += 4) { -#ifdef PIPE_ARCH_LITTLE_ENDIAN - unsigned subindex = 0; -#else - unsigned subindex = 1; -#endif - LLVMValueRef index; +/** + * Sample a single texture image with (bi-)(tri-)linear sampling. + * Return filtered color as two vectors of 16-bit fixed point values. + * Does address calcs (except offsets) with floats. + * Useful for AVX which has support for 8x32 floats but not 8x32 ints. 
+ */ +static void +lp_build_sample_image_linear_afloat(struct lp_build_sample_context *bld, + LLVMValueRef int_size, + LLVMValueRef row_stride_vec, + LLVMValueRef img_stride_vec, + LLVMValueRef data_ptr, + LLVMValueRef s, + LLVMValueRef t, + LLVMValueRef r, + LLVMValueRef *colors_lo, + LLVMValueRef *colors_hi) +{ + const unsigned dims = bld->dims; + LLVMValueRef width_vec, height_vec, depth_vec; + LLVMValueRef s_fpart; + LLVMValueRef t_fpart = NULL; + LLVMValueRef r_fpart = NULL; + LLVMValueRef x_stride, y_stride, z_stride; + LLVMValueRef x_offset0, x_offset1; + LLVMValueRef y_offset0, y_offset1; + LLVMValueRef z_offset0, z_offset1; + LLVMValueRef offset[2][2][2]; /* [z][y][x] */ + LLVMValueRef x_subcoord[2], y_subcoord[2]; + LLVMValueRef flt_size; + LLVMValueRef x_icoord0, x_icoord1; + LLVMValueRef y_icoord0, y_icoord1; + LLVMValueRef z_icoord0, z_icoord1; + unsigned x, y, z; - index = LLVMConstInt(elem_type, j/2 + subindex, 0); - for (i = 0; i < 4; ++i) - shuffles_lo[j + i] = index; + flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size); - index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0); - for (i = 0; i < 4; ++i) - shuffles_hi[j + i] = index; - } + lp_build_extract_image_sizes(bld, + bld->float_size_type, + bld->coord_type, + flt_size, + &width_vec, + &height_vec, + &depth_vec); - shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length); - shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length); + /* do texcoord wrapping and compute texel offsets */ + lp_build_sample_wrap_linear_float(bld, + bld->format_desc->block.width, + s, width_vec, + bld->static_state->pot_width, + bld->static_state->wrap_s, + &x_icoord0, &x_icoord1, + &s_fpart, + bld->static_state->force_nearest_s); + + if (dims >= 2) { + lp_build_sample_wrap_linear_float(bld, + bld->format_desc->block.height, + t, height_vec, + bld->static_state->pot_height, + bld->static_state->wrap_t, + &y_icoord0, &y_icoord1, + &t_fpart, + bld->static_state->force_nearest_t); - 
s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, - shuffle_lo, ""); - s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, - shuffle_hi, ""); - if (dims >= 2) { - t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, - shuffle_lo, ""); - t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, - shuffle_hi, ""); - } if (dims >= 3) { - r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef, - shuffle_lo, ""); - r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef, - shuffle_hi, ""); + lp_build_sample_wrap_linear_float(bld, + bld->format_desc->block.height, + r, depth_vec, + bld->static_state->pot_depth, + bld->static_state->wrap_r, + &z_icoord0, &z_icoord1, + &r_fpart, 0); } } /* - * Fetch the pixels as 4 x 32bit (rgba order might differ): - * - * rgba0 rgba1 rgba2 rgba3 - * - * bit cast them into 16 x u8 - * - * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 - * - * unpack them into two 8 x i16: - * - * r0 g0 b0 a0 r1 g1 b1 a1 - * r2 g2 b2 a2 r3 g3 b3 a3 - * - * The higher 8 bits of the resulting elements will be zero. + * From here on we deal with ints, and we should split up the 256bit + * vectors manually for better generated code. */ - numj = 1 + (dims >= 2); - numk = 1 + (dims >= 3); - for (k = 0; k < numk; k++) { - for (j = 0; j < numj; j++) { - for (i = 0; i < 2; i++) { - LLVMValueRef rgba8; - - if (util_format_is_rgba8_variant(bld->format_desc)) { - /* - * Given the format is a rgba8, just read the pixels as is, - * without any swizzling. Swizzling will be done later. 
- */ - rgba8 = lp_build_gather(bld->gallivm, - bld->texel_type.length, - bld->format_desc->block.bits, - bld->texel_type.width, - data_ptr, offset[k][j][i]); - - rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, ""); - } - else { - rgba8 = lp_build_fetch_rgba_aos(bld->gallivm, - bld->format_desc, - u8n.type, - data_ptr, offset[k][j][i], - x_subcoord[i], - y_subcoord[j]); - } - - /* Expand one 4*rgba8 to two 2*rgba16 */ - lp_build_unpack2(bld->gallivm, u8n.type, h16.type, - rgba8, - &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]); - } - } - } + /* get pixel, row and image strides */ + x_stride = lp_build_const_vec(bld->gallivm, + bld->int_coord_bld.type, + bld->format_desc->block.bits/8); + y_stride = row_stride_vec; + z_stride = img_stride_vec; /* - * Linear interpolation with 8.8 fixed point. + * compute texel offset - + * cannot do offset calc with floats, difficult for block-based formats, + * and not enough precision anyway. */ - if (bld->static_state->force_nearest_s) { - /* special case 1-D lerp */ - packed_lo = lp_build_lerp(&h16, - t_fpart_lo, - neighbors_lo[0][0][0], - neighbors_lo[0][0][1]); - - packed_hi = lp_build_lerp(&h16, - t_fpart_hi, - neighbors_hi[0][1][0], - neighbors_hi[0][1][0]); + lp_build_sample_partial_offset(&bld->int_coord_bld, + bld->format_desc->block.width, + x_icoord0, x_stride, + &x_offset0, &x_subcoord[0]); + lp_build_sample_partial_offset(&bld->int_coord_bld, + bld->format_desc->block.width, + x_icoord1, x_stride, + &x_offset1, &x_subcoord[1]); + for (z = 0; z < 2; z++) { + for (y = 0; y < 2; y++) { + offset[z][y][0] = x_offset0; + offset[z][y][1] = x_offset1; + } } - else if (bld->static_state->force_nearest_t) { - /* special case 1-D lerp */ - packed_lo = lp_build_lerp(&h16, - s_fpart_lo, - neighbors_lo[0][0][0], - neighbors_lo[0][0][1]); - packed_hi = lp_build_lerp(&h16, - s_fpart_hi, - neighbors_hi[0][0][0], - neighbors_hi[0][0][1]); + if (dims >= 2) { + lp_build_sample_partial_offset(&bld->int_coord_bld, + 
bld->format_desc->block.height, + y_icoord0, y_stride, + &y_offset0, &y_subcoord[0]); + lp_build_sample_partial_offset(&bld->int_coord_bld, + bld->format_desc->block.height, + y_icoord1, y_stride, + &y_offset1, &y_subcoord[1]); + for (z = 0; z < 2; z++) { + for (x = 0; x < 2; x++) { + offset[z][0][x] = lp_build_add(&bld->int_coord_bld, + offset[z][0][x], y_offset0); + offset[z][1][x] = lp_build_add(&bld->int_coord_bld, + offset[z][1][x], y_offset1); + } + } } - else { - /* general 1/2/3-D lerping */ - if (dims == 1) { - packed_lo = lp_build_lerp(&h16, - s_fpart_lo, - neighbors_lo[0][0][0], - neighbors_lo[0][0][1]); - packed_hi = lp_build_lerp(&h16, - s_fpart_hi, - neighbors_hi[0][0][0], - neighbors_hi[0][0][1]); + if (dims >= 3) { + LLVMValueRef z_subcoord[2]; + lp_build_sample_partial_offset(&bld->int_coord_bld, + 1, + z_icoord0, z_stride, + &z_offset0, &z_subcoord[0]); + lp_build_sample_partial_offset(&bld->int_coord_bld, + 1, + z_icoord1, z_stride, + &z_offset1, &z_subcoord[1]); + for (y = 0; y < 2; y++) { + for (x = 0; x < 2; x++) { + offset[0][y][x] = lp_build_add(&bld->int_coord_bld, + offset[0][y][x], z_offset0); + offset[1][y][x] = lp_build_add(&bld->int_coord_bld, + offset[1][y][x], z_offset1); + } } - else { - /* 2-D lerp */ - packed_lo = lp_build_lerp_2d(&h16, - s_fpart_lo, t_fpart_lo, - neighbors_lo[0][0][0], - neighbors_lo[0][0][1], - neighbors_lo[0][1][0], - neighbors_lo[0][1][1]); - - packed_hi = lp_build_lerp_2d(&h16, - s_fpart_hi, t_fpart_hi, - neighbors_hi[0][0][0], - neighbors_hi[0][0][1], - neighbors_hi[0][1][0], - neighbors_hi[0][1][1]); - - if (dims >= 3) { - LLVMValueRef packed_lo2, packed_hi2; - - /* lerp in the second z slice */ - packed_lo2 = lp_build_lerp_2d(&h16, - s_fpart_lo, t_fpart_lo, - neighbors_lo[1][0][0], - neighbors_lo[1][0][1], - neighbors_lo[1][1][0], - neighbors_lo[1][1][1]); - - packed_hi2 = lp_build_lerp_2d(&h16, - s_fpart_hi, t_fpart_hi, - neighbors_hi[1][0][0], - neighbors_hi[1][0][1], - neighbors_hi[1][1][0], - 
neighbors_hi[1][1][1]); - /* interp between two z slices */ - packed_lo = lp_build_lerp(&h16, r_fpart_lo, - packed_lo, packed_lo2); - packed_hi = lp_build_lerp(&h16, r_fpart_hi, - packed_hi, packed_hi2); + } + else if (bld->static_state->target == PIPE_TEXTURE_CUBE) { + LLVMValueRef z_offset; + z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec); + for (y = 0; y < 2; y++) { + for (x = 0; x < 2; x++) { + /* The r coord is the cube face in [0,5] */ + offset[0][y][x] = lp_build_add(&bld->int_coord_bld, + offset[0][y][x], z_offset); } } } - *colors_lo = packed_lo; - *colors_hi = packed_hi; + lp_build_sample_fetch_image_linear(bld, data_ptr, offset, + x_subcoord, y_subcoord, + s_fpart, t_fpart, r_fpart, + colors_lo, colors_hi); } @@ -824,10 +1324,10 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, LLVMBuilderRef builder = bld->gallivm->builder; LLVMValueRef size0; LLVMValueRef size1; - LLVMValueRef row_stride0_vec; - LLVMValueRef row_stride1_vec; - LLVMValueRef img_stride0_vec; - LLVMValueRef img_stride1_vec; + LLVMValueRef row_stride0_vec = NULL; + LLVMValueRef row_stride1_vec = NULL; + LLVMValueRef img_stride0_vec = NULL; + LLVMValueRef img_stride1_vec = NULL; LLVMValueRef data_ptr0; LLVMValueRef data_ptr1; LLVMValueRef colors0_lo, colors0_hi; @@ -838,20 +1338,39 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, &size0, &row_stride0_vec, &img_stride0_vec); data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0); - if (img_filter == PIPE_TEX_FILTER_NEAREST) { - lp_build_sample_image_nearest(bld, - size0, - row_stride0_vec, img_stride0_vec, - data_ptr0, s, t, r, - &colors0_lo, &colors0_hi); + if (util_cpu_caps.has_avx && bld->coord_type.length > 4) { + if (img_filter == PIPE_TEX_FILTER_NEAREST) { + lp_build_sample_image_nearest_afloat(bld, + size0, + row_stride0_vec, img_stride0_vec, + data_ptr0, s, t, r, + &colors0_lo, &colors0_hi); + } + else { + assert(img_filter == PIPE_TEX_FILTER_LINEAR); + lp_build_sample_image_linear_afloat(bld, 
+ size0, + row_stride0_vec, img_stride0_vec, + data_ptr0, s, t, r, + &colors0_lo, &colors0_hi); + } } else { - assert(img_filter == PIPE_TEX_FILTER_LINEAR); - lp_build_sample_image_linear(bld, - size0, - row_stride0_vec, img_stride0_vec, - data_ptr0, s, t, r, - &colors0_lo, &colors0_hi); + if (img_filter == PIPE_TEX_FILTER_NEAREST) { + lp_build_sample_image_nearest(bld, + size0, + row_stride0_vec, img_stride0_vec, + data_ptr0, s, t, r, + &colors0_lo, &colors0_hi); + } + else { + assert(img_filter == PIPE_TEX_FILTER_LINEAR); + lp_build_sample_image_linear(bld, + size0, + row_stride0_vec, img_stride0_vec, + data_ptr0, s, t, r, + &colors0_lo, &colors0_hi); + } } /* Store the first level's colors in the output variables */ @@ -859,74 +1378,138 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, LLVMBuildStore(builder, colors0_hi, colors_hi_var); if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - LLVMValueRef h16_scale = lp_build_const_float(bld->gallivm, 256.0); - LLVMTypeRef i32_type = LLVMIntTypeInContext(bld->gallivm->context, 32); + LLVMValueRef h16vec_scale = lp_build_const_vec(bld->gallivm, + bld->perquadf_bld.type, 256.0); + LLVMTypeRef i32vec_type = lp_build_vec_type(bld->gallivm, bld->perquadi_bld.type); struct lp_build_if_state if_ctx; LLVMValueRef need_lerp; + unsigned num_quads = bld->coord_bld.type.length / 4; + unsigned i; - lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16_scale, ""); - lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32_type, "lod_fpart.fixed16"); + lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16vec_scale, ""); + lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32vec_type, "lod_fpart.fixed16"); /* need_lerp = lod_fpart > 0 */ - need_lerp = LLVMBuildICmp(builder, LLVMIntSGT, - lod_fpart, LLVMConstNull(i32_type), - "need_lerp"); + if (num_quads == 1) { + need_lerp = LLVMBuildICmp(builder, LLVMIntSGT, + lod_fpart, bld->perquadi_bld.zero, + "need_lerp"); + } + else { + /* + * We'll do mip filtering if any of the quads need it. 
+ * It might be better to split the vectors here and only fetch/filter + * quads which need it. + */ + /* + * We need to clamp lod_fpart here since we can get negative + * values which would screw up filtering if not all + * lod_fpart values have same sign. + * We can however then skip the greater than comparison. + */ + lod_fpart = lp_build_max(&bld->perquadi_bld, lod_fpart, + bld->perquadi_bld.zero); + need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, lod_fpart); + } lp_build_if(&if_ctx, bld->gallivm, need_lerp); { struct lp_build_context h16_bld; - lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16)); + lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width)); /* sample the second mipmap level */ lp_build_mipmap_level_sizes(bld, ilevel1, &size1, &row_stride1_vec, &img_stride1_vec); data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1); - if (img_filter == PIPE_TEX_FILTER_NEAREST) { - lp_build_sample_image_nearest(bld, - size1, - row_stride1_vec, img_stride1_vec, - data_ptr1, s, t, r, - &colors1_lo, &colors1_hi); + + if (util_cpu_caps.has_avx && bld->coord_type.length > 4) { + if (img_filter == PIPE_TEX_FILTER_NEAREST) { + lp_build_sample_image_nearest_afloat(bld, + size1, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + &colors1_lo, &colors1_hi); + } + else { + lp_build_sample_image_linear_afloat(bld, + size1, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + &colors1_lo, &colors1_hi); + } } else { - lp_build_sample_image_linear(bld, - size1, - row_stride1_vec, img_stride1_vec, - data_ptr1, s, t, r, - &colors1_lo, &colors1_hi); + if (img_filter == PIPE_TEX_FILTER_NEAREST) { + lp_build_sample_image_nearest(bld, + size1, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + &colors1_lo, &colors1_hi); + } + else { + lp_build_sample_image_linear(bld, + size1, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + &colors1_lo, &colors1_hi); + } } /* interpolate samples from the 
two mipmap levels */ - lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, ""); - lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart); + if (num_quads == 1) { + lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, ""); + lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart); #if HAVE_LLVM == 0x208 - /* This is a work-around for a bug in LLVM 2.8. - * Evidently, something goes wrong in the construction of the - * lod_fpart short[8] vector. Adding this no-effect shuffle seems - * to force the vector to be properly constructed. - * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f). - */ - { - LLVMValueRef shuffles[8], shuffle; - int i; - assert(h16_bld.type.length <= Elements(shuffles)); - for (i = 0; i < h16_bld.type.length; i++) - shuffles[i] = lp_build_const_int32(bld->gallivm, 2 * (i & 1)); - shuffle = LLVMConstVector(shuffles, h16_bld.type.length); - lod_fpart = LLVMBuildShuffleVector(builder, - lod_fpart, lod_fpart, - shuffle, ""); - } + /* This is a work-around for a bug in LLVM 2.8. + * Evidently, something goes wrong in the construction of the + * lod_fpart short[8] vector. Adding this no-effect shuffle seems + * to force the vector to be properly constructed. + * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f). 
+ */ + { + LLVMValueRef shuffles[8], shuffle; + assert(h16_bld.type.length <= Elements(shuffles)); + for (i = 0; i < h16_bld.type.length; i++) + shuffles[i] = lp_build_const_int32(bld->gallivm, 2 * (i & 1)); + shuffle = LLVMConstVector(shuffles, h16_bld.type.length); + lod_fpart = LLVMBuildShuffleVector(builder, + lod_fpart, lod_fpart, + shuffle, ""); + } #endif - colors0_lo = lp_build_lerp(&h16_bld, lod_fpart, - colors0_lo, colors1_lo); - colors0_hi = lp_build_lerp(&h16_bld, lod_fpart, - colors0_hi, colors1_hi); + colors0_lo = lp_build_lerp(&h16_bld, lod_fpart, + colors0_lo, colors1_lo); + colors0_hi = lp_build_lerp(&h16_bld, lod_fpart, + colors0_hi, colors1_hi); + } + else { + LLVMValueRef lod_parts[LP_MAX_VECTOR_LENGTH/16]; + struct lp_type perquadi16_type = bld->perquadi_bld.type; + perquadi16_type.width /= 2; + perquadi16_type.length *= 2; + lod_fpart = LLVMBuildBitCast(builder, lod_fpart, + lp_build_vec_type(bld->gallivm, + perquadi16_type), ""); + /* XXX this only works for exactly 2 quads. 
More quads need shuffle */ + assert(num_quads == 2); + for (i = 0; i < num_quads; i++) { + LLVMValueRef indexi2 = lp_build_const_int32(bld->gallivm, i*2); + lod_parts[i] = lp_build_extract_broadcast(bld->gallivm, + perquadi16_type, + h16_bld.type, + lod_fpart, + indexi2); + } + colors0_lo = lp_build_lerp(&h16_bld, lod_parts[0], + colors0_lo, colors1_lo); + colors0_hi = lp_build_lerp(&h16_bld, lod_parts[1], + colors0_hi, colors1_hi); + } LLVMBuildStore(builder, colors0_lo, colors_lo_var); LLVMBuildStore(builder, colors0_hi, colors_hi_var); @@ -948,10 +1531,10 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, LLVMValueRef s, LLVMValueRef t, LLVMValueRef r, - const LLVMValueRef *ddx, - const LLVMValueRef *ddy, - LLVMValueRef lod_bias, /* optional */ - LLVMValueRef explicit_lod, /* optional */ + LLVMValueRef lod_ipart, + LLVMValueRef lod_fpart, + LLVMValueRef ilevel0, + LLVMValueRef ilevel1, LLVMValueRef texel_out[4]) { struct lp_build_context *int_bld = &bld->int_bld; @@ -960,14 +1543,9 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, const unsigned min_filter = bld->static_state->min_img_filter; const unsigned mag_filter = bld->static_state->mag_img_filter; const unsigned dims = bld->dims; - LLVMValueRef lod_ipart = NULL, lod_fpart = NULL; - LLVMValueRef ilevel0, ilevel1 = NULL; LLVMValueRef packed, packed_lo, packed_hi; LLVMValueRef unswizzled[4]; - LLVMValueRef face_ddx[4], face_ddy[4]; struct lp_build_context h16_bld; - LLVMValueRef first_level; - LLVMValueRef i32t_zero = lp_build_const_int32(bld->gallivm, 0); /* we only support the common/simple wrap modes at this time */ assert(lp_is_simple_wrap_mode(bld->static_state->wrap_s)); @@ -978,81 +1556,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, /* make 16-bit fixed-pt builder context */ - lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16)); - - /* cube face selection, compute pre-face coords, etc. 
*/ - if (bld->static_state->target == PIPE_TEXTURE_CUBE) { - LLVMValueRef face, face_s, face_t; - lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t); - s = face_s; /* vec */ - t = face_t; /* vec */ - /* use 'r' to indicate cube face */ - r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */ - - /* recompute ddx, ddy using the new (s,t) face texcoords */ - face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s); - face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t); - face_ddx[2] = NULL; - face_ddx[3] = NULL; - face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s); - face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t); - face_ddy[2] = NULL; - face_ddy[3] = NULL; - ddx = face_ddx; - ddy = face_ddy; - } - - /* - * Compute the level of detail (float). - */ - if (min_filter != mag_filter || - mip_filter != PIPE_TEX_MIPFILTER_NONE) { - /* Need to compute lod either to choose mipmap levels or to - * distinguish between minification/magnification with one mipmap level. - */ - lp_build_lod_selector(bld, unit, ddx, ddy, - lod_bias, explicit_lod, - mip_filter, - &lod_ipart, &lod_fpart); - } else { - lod_ipart = i32t_zero; - } - - /* - * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1 - */ - switch (mip_filter) { - default: - assert(0 && "bad mip_filter value in lp_build_sample_aos()"); - /* fall-through */ - case PIPE_TEX_MIPFILTER_NONE: - /* always use mip level 0 */ - if (bld->static_state->target == PIPE_TEXTURE_CUBE) { - /* XXX this is a work-around for an apparent bug in LLVM 2.7. - * We should be able to set ilevel0 = const(0) but that causes - * bad x86 code to be emitted. 
- */ - assert(lod_ipart); - lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); - } - else { - first_level = bld->dynamic_state->first_level(bld->dynamic_state, - bld->gallivm, unit); - ilevel0 = first_level; - } - break; - case PIPE_TEX_MIPFILTER_NEAREST: - assert(lod_ipart); - lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); - break; - case PIPE_TEX_MIPFILTER_LINEAR: - assert(lod_ipart); - assert(lod_fpart); - lp_build_linear_mip_levels(bld, unit, - lod_ipart, &lod_fpart, - &ilevel0, &ilevel1); - break; - } + lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width)); /* * Get/interpolate texture colors. @@ -1062,7 +1566,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, packed_hi = lp_build_alloca(bld->gallivm, h16_bld.vec_type, "packed_hi"); if (min_filter == mag_filter) { - /* no need to distinquish between minification and magnification */ + /* no need to distinguish between minification and magnification */ lp_build_sample_mipmap(bld, min_filter, mip_filter, s, t, r, @@ -1106,7 +1610,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, * into 'packed' */ packed = lp_build_pack2(bld->gallivm, - h16_bld.type, lp_type_unorm(8), + h16_bld.type, lp_type_unorm(8, bld->vector_width), LLVMBuildLoad(builder, packed_lo, ""), LLVMBuildLoad(builder, packed_hi, "")); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h index 5d9ecac..55b3bc1 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h @@ -46,10 +46,10 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, LLVMValueRef s, LLVMValueRef t, LLVMValueRef r, - const LLVMValueRef *ddx, - const LLVMValueRef *ddy, - LLVMValueRef lod_bias, /* optional */ - LLVMValueRef explicit_lod, /* optional */ + LLVMValueRef lod_ipart, + LLVMValueRef lod_fpart, + LLVMValueRef ilevel0, + LLVMValueRef ilevel1, LLVMValueRef texel_out[4]); diff 
--git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index 73dc3e7..aaef797 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -41,6 +41,7 @@ #include "util/u_memory.h" #include "util/u_math.h" #include "util/u_format.h" +#include "util/u_cpu_detect.h" #include "lp_bld_debug.h" #include "lp_bld_type.h" #include "lp_bld_const.h" @@ -57,6 +58,7 @@ #include "lp_bld_sample_aos.h" #include "lp_bld_struct.h" #include "lp_bld_quad.h" +#include "lp_bld_pack.h" /** @@ -221,6 +223,41 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld, /** + * Helper to compute the first coord and the weight for + * linear wrap repeat npot textures + */ +void +lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld, + LLVMValueRef coord_f, + LLVMValueRef length_i, + LLVMValueRef length_f, + LLVMValueRef *coord0_i, + LLVMValueRef *weight_f) +{ + struct lp_build_context *coord_bld = &bld->coord_bld; + struct lp_build_context *int_coord_bld = &bld->int_coord_bld; + LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5); + LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i, + int_coord_bld->one); + LLVMValueRef mask; + /* wrap with normalized floats is just fract */ + coord_f = lp_build_fract(coord_bld, coord_f); + /* mul by size and subtract 0.5 */ + coord_f = lp_build_mul(coord_bld, coord_f, length_f); + coord_f = lp_build_sub(coord_bld, coord_f, half); + /* + * we avoided the 0.5/length division before the repeat wrap, + * now need to fix up edge cases with selects + */ + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f); + mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type, + PIPE_FUNC_LESS, *coord0_i, int_coord_bld->zero); + *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i); +} + + +/** * Build LLVM code 
for texture wrap mode for linear filtering. * \param x0_out returns first integer texcoord * \param x1_out returns second integer texcoord @@ -246,28 +283,27 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, switch(wrap_mode) { case PIPE_TEX_WRAP_REPEAT: - /* mul by size and subtract 0.5 */ - coord = lp_build_mul(coord_bld, coord, length_f); - coord = lp_build_sub(coord_bld, coord, half); - /* convert to int, compute lerp weight */ - lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); - /* repeat wrap */ if (is_pot) { + /* mul by size and subtract 0.5 */ + coord = lp_build_mul(coord_bld, coord, length_f); + coord = lp_build_sub(coord_bld, coord, half); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); + /* repeat wrap */ coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, ""); coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, ""); } else { - /* Add a bias to the texcoord to handle negative coords */ - LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024); LLVMValueRef mask; - coord0 = LLVMBuildAdd(builder, coord0, bias, ""); - coord0 = LLVMBuildURem(builder, coord0, length, ""); - mask = lp_build_compare(bld->gallivm, int_coord_bld->type, + lp_build_coord_repeat_npot_linear(bld, coord, + length, length_f, + &coord0, &weight); + mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type, PIPE_FUNC_NOTEQUAL, coord0, length_minus_one); coord1 = LLVMBuildAnd(builder, - lp_build_add(int_coord_bld, coord0, int_coord_bld->one), - mask, ""); + lp_build_add(int_coord_bld, coord0, int_coord_bld->one), + mask, ""); } break; @@ -444,15 +480,16 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, switch(wrap_mode) { case PIPE_TEX_WRAP_REPEAT: - coord = lp_build_mul(coord_bld, coord, length_f); - icoord = lp_build_ifloor(coord_bld, coord); - if (is_pot) + if (is_pot) { + coord = 
lp_build_mul(coord_bld, coord, length_f); + icoord = lp_build_ifloor(coord_bld, coord); icoord = LLVMBuildAnd(builder, icoord, length_minus_one, ""); + } else { - /* Add a bias to the texcoord to handle negative coords */ - LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024); - icoord = LLVMBuildAdd(builder, icoord, bias, ""); - icoord = LLVMBuildURem(builder, icoord, length, ""); + /* take fraction, unnormalize */ + coord = lp_build_fract_safe(coord_bld, coord); + coord = lp_build_mul(coord_bld, coord, length_f); + icoord = lp_build_itrunc(coord_bld, coord); } break; @@ -473,7 +510,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, break; case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - /* Note: this is the same as CLAMP_TO_EDGE, except min = -min */ + /* Note: this is the same as CLAMP_TO_EDGE, except min = -1 */ { LLVMValueRef min, max; @@ -873,12 +910,32 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { struct lp_build_if_state if_ctx; LLVMValueRef need_lerp; + unsigned num_quads = bld->coord_bld.type.length / 4; /* need_lerp = lod_fpart > 0 */ - need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT, - lod_fpart, - bld->float_bld.zero, - "need_lerp"); + if (num_quads == 1) { + need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT, + lod_fpart, bld->perquadf_bld.zero, + "need_lerp"); + } + else { + /* + * We'll do mip filtering if any of the quads need it. + * It might be better to split the vectors here and only fetch/filter + * quads which need it. + */ + /* + * We unfortunately need to clamp lod_fpart here since we can get + * negative values which would screw up filtering if not all + * lod_fpart values have same sign. 
+ */ + lod_fpart = lp_build_max(&bld->perquadf_bld, lod_fpart, + bld->perquadf_bld.zero); + need_lerp = lp_build_compare(bld->gallivm, bld->perquadf_bld.type, + PIPE_FUNC_GREATER, + lod_fpart, bld->perquadf_bld.zero); + need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, need_lerp); + } lp_build_if(&if_ctx, bld->gallivm, need_lerp); { @@ -904,7 +961,10 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, /* interpolate samples from the two mipmap levels */ - lod_fpart = lp_build_broadcast_scalar(&bld->texel_bld, lod_fpart); + lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm, + bld->perquadf_bld.type, + bld->texel_bld.type, + lod_fpart); for (chan = 0; chan < 4; chan++) { colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart, @@ -916,37 +976,28 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, } } - - /** - * General texture sampling codegen. - * This function handles texture sampling for all texture targets (1D, - * 2D, 3D, cube) and all filtering modes. + * Calculate cube face, lod, mip levels. 
*/ static void -lp_build_sample_general(struct lp_build_sample_context *bld, - unsigned unit, - LLVMValueRef s, - LLVMValueRef t, - LLVMValueRef r, - const LLVMValueRef *ddx, - const LLVMValueRef *ddy, - LLVMValueRef lod_bias, /* optional */ - LLVMValueRef explicit_lod, /* optional */ - LLVMValueRef *colors_out) +lp_build_sample_common(struct lp_build_sample_context *bld, + unsigned unit, + LLVMValueRef *s, + LLVMValueRef *t, + LLVMValueRef *r, + const struct lp_derivatives *derivs, + LLVMValueRef lod_bias, /* optional */ + LLVMValueRef explicit_lod, /* optional */ + LLVMValueRef *lod_ipart, + LLVMValueRef *lod_fpart, + LLVMValueRef *ilevel0, + LLVMValueRef *ilevel1) { - struct lp_build_context *int_bld = &bld->int_bld; - LLVMBuilderRef builder = bld->gallivm->builder; const unsigned mip_filter = bld->static_state->min_mip_filter; const unsigned min_filter = bld->static_state->min_img_filter; const unsigned mag_filter = bld->static_state->mag_img_filter; - LLVMValueRef lod_ipart = NULL, lod_fpart = NULL; - LLVMValueRef ilevel0, ilevel1 = NULL; - LLVMValueRef face_ddx[4], face_ddy[4]; - LLVMValueRef texels[4]; LLVMValueRef first_level; - LLVMValueRef i32t_zero = lp_build_const_int32(bld->gallivm, 0); - unsigned chan; + struct lp_derivatives face_derivs; /* printf("%s mip %d min %d mag %d\n", __FUNCTION__, @@ -958,23 +1009,16 @@ lp_build_sample_general(struct lp_build_sample_context *bld, */ if (bld->static_state->target == PIPE_TEXTURE_CUBE) { LLVMValueRef face, face_s, face_t; - lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t); - s = face_s; /* vec */ - t = face_t; /* vec */ + lp_build_cube_lookup(bld, *s, *t, *r, &face, &face_s, &face_t); + *s = face_s; /* vec */ + *t = face_t; /* vec */ /* use 'r' to indicate cube face */ - r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */ + *r = face; /* vec */ /* recompute ddx, ddy using the new (s,t) face texcoords */ - face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s); - face_ddx[1] = 
lp_build_scalar_ddx(&bld->coord_bld, t); - face_ddx[2] = NULL; - face_ddx[3] = NULL; - face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s); - face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t); - face_ddy[2] = NULL; - face_ddy[3] = NULL; - ddx = face_ddx; - ddy = face_ddy; + face_derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(&bld->coord_bld, *s, *t); + face_derivs.ddx_ddy[1] = NULL; + derivs = &face_derivs; } /* @@ -985,12 +1029,12 @@ lp_build_sample_general(struct lp_build_sample_context *bld, /* Need to compute lod either to choose mipmap levels or to * distinguish between minification/magnification with one mipmap level. */ - lp_build_lod_selector(bld, unit, ddx, ddy, + lp_build_lod_selector(bld, unit, derivs, lod_bias, explicit_lod, mip_filter, - &lod_ipart, &lod_fpart); + lod_ipart, lod_fpart); } else { - lod_ipart = i32t_zero; + *lod_ipart = bld->perquadi_bld.zero; } /* @@ -1006,28 +1050,56 @@ lp_build_sample_general(struct lp_build_sample_context *bld, /* XXX this is a work-around for an apparent bug in LLVM 2.7. * We should be able to set ilevel0 = const(0) but that causes * bad x86 code to be emitted. + * XXX should probably disable that on other llvm versions. 
*/ - assert(lod_ipart); - lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); + assert(*lod_ipart); + lp_build_nearest_mip_level(bld, unit, *lod_ipart, ilevel0); } else { first_level = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm, unit); - ilevel0 = first_level; + first_level = lp_build_broadcast_scalar(&bld->perquadi_bld, first_level); + *ilevel0 = first_level; } break; case PIPE_TEX_MIPFILTER_NEAREST: - assert(lod_ipart); - lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); + assert(*lod_ipart); + lp_build_nearest_mip_level(bld, unit, *lod_ipart, ilevel0); break; case PIPE_TEX_MIPFILTER_LINEAR: - assert(lod_ipart); - assert(lod_fpart); + assert(*lod_ipart); + assert(*lod_fpart); lp_build_linear_mip_levels(bld, unit, - lod_ipart, &lod_fpart, - &ilevel0, &ilevel1); + *lod_ipart, lod_fpart, + ilevel0, ilevel1); break; } +} + +/** + * General texture sampling codegen. + * This function handles texture sampling for all texture targets (1D, + * 2D, 3D, cube) and all filtering modes. + */ +static void +lp_build_sample_general(struct lp_build_sample_context *bld, + unsigned unit, + LLVMValueRef s, + LLVMValueRef t, + LLVMValueRef r, + LLVMValueRef lod_ipart, + LLVMValueRef lod_fpart, + LLVMValueRef ilevel0, + LLVMValueRef ilevel1, + LLVMValueRef *colors_out) +{ + struct lp_build_context *int_bld = &bld->int_bld; + LLVMBuilderRef builder = bld->gallivm->builder; + const unsigned mip_filter = bld->static_state->min_mip_filter; + const unsigned min_filter = bld->static_state->min_img_filter; + const unsigned mag_filter = bld->static_state->mag_img_filter; + LLVMValueRef texels[4]; + unsigned chan; /* * Get/interpolate texture colors. 
@@ -1039,7 +1111,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld, } if (min_filter == mag_filter) { - /* no need to distinquish between minification and magnification */ + /* no need to distinguish between minification and magnification */ lp_build_sample_mipmap(bld, unit, min_filter, mip_filter, s, t, r, @@ -1135,7 +1207,10 @@ lp_build_sample_compare(struct lp_build_sample_context *bld, * For debugging. */ void -lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type, +lp_build_sample_nop(struct gallivm_state *gallivm, + struct lp_type type, + unsigned num_coords, + const LLVMValueRef *coords, LLVMValueRef texel_out[4]) { LLVMValueRef one = lp_build_one(gallivm, type); @@ -1152,8 +1227,7 @@ lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type, * 'texel' will return a vector of four LLVMValueRefs corresponding to * R, G, B, A. * \param type vector float type to use for coords, etc. - * \param ddx partial derivatives of (s,t,r,q) with respect to x - * \param ddy partial derivatives of (s,t,r,q) with respect to y + * \param derivs partial derivatives of (s,t,r,q) with respect to x and y */ void lp_build_sample_soa(struct gallivm_state *gallivm, @@ -1163,8 +1237,7 @@ lp_build_sample_soa(struct gallivm_state *gallivm, unsigned unit, unsigned num_coords, const LLVMValueRef *coords, - const LLVMValueRef ddx[4], - const LLVMValueRef ddy[4], + const struct lp_derivatives *derivs, LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ LLVMValueRef texel_out[4]) @@ -1173,10 +1246,10 @@ lp_build_sample_soa(struct gallivm_state *gallivm, struct lp_build_sample_context bld; LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef tex_width, tex_height, tex_depth; LLVMValueRef s; LLVMValueRef t; LLVMValueRef r; - struct lp_type float_vec_type; if (0) { enum pipe_format fmt = static_state->format; @@ -1193,6 +1266,8 @@ lp_build_sample_soa(struct 
gallivm_state *gallivm, bld.format_desc = util_format_description(static_state->format); bld.dims = dims; + bld.vector_width = lp_type_width(type); + bld.float_type = lp_type_float(32); bld.int_type = lp_type_int(32); bld.coord_type = type; @@ -1201,22 +1276,26 @@ lp_build_sample_soa(struct gallivm_state *gallivm, bld.float_size_type.length = dims > 1 ? 4 : 1; bld.int_size_type = lp_int_type(bld.float_size_type); bld.texel_type = type; - - float_vec_type = lp_type_float_vec(32); + bld.perquadf_type = type; + /* we want native vector size to be able to use our intrinsics */ + bld.perquadf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1; + bld.perquadi_type = lp_int_type(bld.perquadf_type); lp_build_context_init(&bld.float_bld, gallivm, bld.float_type); - lp_build_context_init(&bld.float_vec_bld, gallivm, float_vec_type); + lp_build_context_init(&bld.float_vec_bld, gallivm, type); lp_build_context_init(&bld.int_bld, gallivm, bld.int_type); lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type); lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type); lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type); lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type); lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type); + lp_build_context_init(&bld.perquadf_bld, gallivm, bld.perquadf_type); + lp_build_context_init(&bld.perquadi_bld, gallivm, bld.perquadi_type); /* Get the dynamic state */ - bld.width = dynamic_state->width(dynamic_state, gallivm, unit); - bld.height = dynamic_state->height(dynamic_state, gallivm, unit); - bld.depth = dynamic_state->depth(dynamic_state, gallivm, unit); + tex_width = dynamic_state->width(dynamic_state, gallivm, unit); + tex_height = dynamic_state->height(dynamic_state, gallivm, unit); + tex_depth = dynamic_state->depth(dynamic_state, gallivm, unit); bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm, unit); bld.img_stride_array = 
dynamic_state->img_stride(dynamic_state, gallivm, unit); bld.data_array = dynamic_state->data_ptr(dynamic_state, gallivm, unit); @@ -1228,37 +1307,40 @@ lp_build_sample_soa(struct gallivm_state *gallivm, /* width, height, depth as single int vector */ if (dims <= 1) { - bld.int_size = bld.width; + bld.int_size = tex_width; } else { bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_bld.undef, - bld.width, LLVMConstInt(i32t, 0, 0), ""); + tex_width, LLVMConstInt(i32t, 0, 0), ""); if (dims >= 2) { bld.int_size = LLVMBuildInsertElement(builder, bld.int_size, - bld.height, LLVMConstInt(i32t, 1, 0), ""); + tex_height, LLVMConstInt(i32t, 1, 0), ""); if (dims >= 3) { bld.int_size = LLVMBuildInsertElement(builder, bld.int_size, - bld.depth, LLVMConstInt(i32t, 2, 0), ""); + tex_depth, LLVMConstInt(i32t, 2, 0), ""); } } } if (0) { /* For debug: no-op texture sampling */ - lp_build_sample_nop(gallivm, bld.texel_type, texel_out); - } - else if (util_format_fits_8unorm(bld.format_desc) && - lp_is_simple_wrap_mode(static_state->wrap_s) && - lp_is_simple_wrap_mode(static_state->wrap_t)) { - /* do sampling/filtering with fixed pt arithmetic */ - lp_build_sample_aos(&bld, unit, s, t, r, ddx, ddy, - lod_bias, explicit_lod, + lp_build_sample_nop(gallivm, + bld.texel_type, + num_coords, + coords, texel_out); } - else { + LLVMValueRef lod_ipart = NULL, lod_fpart = NULL; + LLVMValueRef ilevel0 = NULL, ilevel1 = NULL; + unsigned num_quads = type.length / 4; + const unsigned mip_filter = bld.static_state->min_mip_filter; + boolean use_aos = util_format_fits_8unorm(bld.format_desc) && + lp_is_simple_wrap_mode(static_state->wrap_s) && + lp_is_simple_wrap_mode(static_state->wrap_t); + if ((gallivm_debug & GALLIVM_DEBUG_PERF) && - util_format_fits_8unorm(bld.format_desc)) { + !use_aos && util_format_fits_8unorm(bld.format_desc)) { debug_printf("%s: using floating point linear filtering for %s\n", __FUNCTION__, bld.format_desc->short_name); debug_printf(" min_img %d mag_img %d mip %d 
wraps %d wrapt %d\n", @@ -1269,9 +1351,203 @@ lp_build_sample_soa(struct gallivm_state *gallivm, static_state->wrap_t); } - lp_build_sample_general(&bld, unit, s, t, r, ddx, ddy, - lod_bias, explicit_lod, - texel_out); + lp_build_sample_common(&bld, unit, + &s, &t, &r, + derivs, lod_bias, explicit_lod, + &lod_ipart, &lod_fpart, + &ilevel0, &ilevel1); + + /* + * we only try 8-wide sampling with soa as it appears to + * be a loss with aos with AVX. + */ + if (num_quads == 1 || (mip_filter == PIPE_TEX_MIPFILTER_NONE && + !use_aos)) { + + if (num_quads > 1) { + LLVMValueRef index0 = lp_build_const_int32(gallivm, 0); + /* These parameters are the same for all quads */ + lod_ipart = LLVMBuildExtractElement(builder, lod_ipart, index0, ""); + ilevel0 = LLVMBuildExtractElement(builder, ilevel0, index0, ""); + } + if (use_aos) { + /* do sampling/filtering with fixed pt arithmetic */ + lp_build_sample_aos(&bld, unit, + s, t, r, + lod_ipart, lod_fpart, + ilevel0, ilevel1, + texel_out); + } + + else { + lp_build_sample_general(&bld, unit, + s, t, r, + lod_ipart, lod_fpart, + ilevel0, ilevel1, + texel_out); + } + } + else { + struct lp_build_if_state if_ctx; + LLVMValueRef notsame_levels, notsame; + LLVMValueRef index0 = lp_build_const_int32(gallivm, 0); + LLVMValueRef texels[4]; + LLVMValueRef texelout[4]; + unsigned j; + + texels[0] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texr"); + texels[1] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texg"); + texels[2] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texb"); + texels[3] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texa"); + + /* only build the if if we MAY split, otherwise always split */ + if (!use_aos) { + notsame = lp_build_extract_broadcast(gallivm, + bld.perquadi_bld.type, + bld.perquadi_bld.type, + ilevel0, index0); + notsame = lp_build_sub(&bld.perquadi_bld, ilevel0, notsame); + notsame_levels = lp_build_any_true_range(&bld.perquadi_bld, num_quads, + notsame); + if (mip_filter == 
PIPE_TEX_MIPFILTER_LINEAR) { + notsame = lp_build_extract_broadcast(gallivm, + bld.perquadi_bld.type, + bld.perquadi_bld.type, + ilevel1, index0); + notsame = lp_build_sub(&bld.perquadi_bld, ilevel1, notsame); + notsame = lp_build_any_true_range(&bld.perquadi_bld, num_quads, notsame); + notsame_levels = LLVMBuildOr(builder, notsame_levels, notsame, ""); + } + lp_build_if(&if_ctx, gallivm, notsame_levels); + } + + { + struct lp_build_sample_context bld4; + struct lp_type type4 = type; + unsigned i; + LLVMValueRef texelout4[4]; + LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16]; + + type4.length = 4; + + /* Setup our build context */ + memset(&bld4, 0, sizeof bld4); + bld4.gallivm = bld.gallivm; + bld4.static_state = bld.static_state; + bld4.dynamic_state = bld.dynamic_state; + bld4.format_desc = bld.format_desc; + bld4.dims = bld.dims; + bld4.row_stride_array = bld.row_stride_array; + bld4.img_stride_array = bld.img_stride_array; + bld4.data_array = bld.data_array; + bld4.int_size = bld.int_size; + + bld4.vector_width = lp_type_width(type4); + + bld4.float_type = lp_type_float(32); + bld4.int_type = lp_type_int(32); + bld4.coord_type = type4; + bld4.int_coord_type = lp_int_type(type4); + bld4.float_size_type = lp_type_float(32); + bld4.float_size_type.length = dims > 1 ? 
4 : 1; + bld4.int_size_type = lp_int_type(bld4.float_size_type); + bld4.texel_type = type4; + bld4.perquadf_type = type4; + /* we want native vector size to be able to use our intrinsics */ + bld4.perquadf_type.length = 1; + bld4.perquadi_type = lp_int_type(bld4.perquadf_type); + + lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type); + lp_build_context_init(&bld4.float_vec_bld, gallivm, type4); + lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type); + lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type); + lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type); + lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type); + lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type); + lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type); + lp_build_context_init(&bld4.perquadf_bld, gallivm, bld4.perquadf_type); + lp_build_context_init(&bld4.perquadi_bld, gallivm, bld4.perquadi_type); + + for (i = 0; i < num_quads; i++) { + LLVMValueRef s4, t4, r4; + LLVMValueRef lod_iparts, lod_fparts = NULL; + LLVMValueRef ilevel0s, ilevel1s = NULL; + LLVMValueRef indexi = lp_build_const_int32(gallivm, i); + + s4 = lp_build_extract_range(gallivm, s, 4*i, 4); + t4 = lp_build_extract_range(gallivm, t, 4*i, 4); + r4 = lp_build_extract_range(gallivm, r, 4*i, 4); + lod_iparts = LLVMBuildExtractElement(builder, lod_ipart, indexi, ""); + ilevel0s = LLVMBuildExtractElement(builder, ilevel0, indexi, ""); + if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { + ilevel1s = LLVMBuildExtractElement(builder, ilevel1, indexi, ""); + lod_fparts = LLVMBuildExtractElement(builder, lod_fpart, indexi, ""); + } + + if (use_aos) { + /* do sampling/filtering with fixed pt arithmetic */ + lp_build_sample_aos(&bld4, unit, + s4, t4, r4, + lod_iparts, lod_fparts, + ilevel0s, ilevel1s, + texelout4); + } + + else { + lp_build_sample_general(&bld4, unit, + s4, t4, r4, + lod_iparts, lod_fparts, + ilevel0s, ilevel1s, + 
texelout4); + } + for (j = 0; j < 4; j++) { + texelouttmp[j][i] = texelout4[j]; + } + } + for (j = 0; j < 4; j++) { + texelout[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads); + LLVMBuildStore(builder, texelout[j], texels[j]); + } + } + if (!use_aos) { + LLVMValueRef ilevel0s, lod_iparts, ilevel1s = NULL; + + lp_build_else(&if_ctx); + + /* These parameters are the same for all quads */ + lod_iparts = LLVMBuildExtractElement(builder, lod_ipart, index0, ""); + ilevel0s = LLVMBuildExtractElement(builder, ilevel0, index0, ""); + if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { + ilevel1s = LLVMBuildExtractElement(builder, ilevel1, index0, ""); + } + + if (use_aos) { + /* do sampling/filtering with fixed pt arithmetic */ + lp_build_sample_aos(&bld, unit, + s, t, r, + lod_iparts, lod_fpart, + ilevel0s, ilevel1s, + texelout); + } + + else { + lp_build_sample_general(&bld, unit, + s, t, r, + lod_iparts, lod_fpart, + ilevel0s, ilevel1s, + texelout); + } + for (j = 0; j < 4; j++) { + LLVMBuildStore(builder, texelout[j], texels[j]); + } + + lp_build_endif(&if_ctx); + } + + for (j = 0; j < 4; j++) { + texel_out[j] = LLVMBuildLoad(builder, texels[j], ""); + } + } } lp_build_sample_compare(&bld, r, texel_out); @@ -1283,6 +1559,7 @@ void lp_build_size_query_soa(struct gallivm_state *gallivm, const struct lp_sampler_static_state *static_state, struct lp_sampler_dynamic_state *dynamic_state, + struct lp_type int_type, unsigned unit, LLVMValueRef explicit_lod, LLVMValueRef *sizes_out) @@ -1311,7 +1588,9 @@ lp_build_size_query_soa(struct gallivm_state *gallivm, return; } - lp_build_context_init(&bld_int_vec, gallivm, lp_type_int_vec(32)); + assert(!int_type.floating); + + lp_build_context_init(&bld_int_vec, gallivm, lp_type_int_vec(32, 128)); if (explicit_lod) { LLVMValueRef first_level; @@ -1345,7 +1624,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm, size = lp_build_minify(&bld_int_vec, size, lod); for (i=0; i < dims; i++) { - sizes_out[i] = 
lp_build_extract_broadcast(gallivm, bld_int_vec.type, bld_int_vec.type, + sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec.type, int_type, size, lp_build_const_int32(gallivm, i)); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c index 5d44068..641c960 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c @@ -40,6 +40,7 @@ #include "lp_bld_init.h" #include "lp_bld_logic.h" #include "lp_bld_swizzle.h" +#include "lp_bld_pack.h" LLVMValueRef @@ -95,7 +96,7 @@ lp_build_broadcast_scalar(struct lp_build_context *bld, /** - * Combined extract and broadcast (or a mere shuffle when the two types match) + * Combined extract and broadcast (mere shuffle in most cases) */ LLVMValueRef lp_build_extract_broadcast(struct gallivm_state *gallivm, @@ -132,9 +133,9 @@ lp_build_extract_broadcast(struct gallivm_state *gallivm, } } else { - if (dst_type.length == src_type.length) { + if (dst_type.length > 1) { /* - * Special shuffle of the same size. + * shuffle - result can be of different length. */ LLVMValueRef shuffle; @@ -142,28 +143,14 @@ lp_build_extract_broadcast(struct gallivm_state *gallivm, LLVMVectorType(i32t, dst_type.length), index); res = LLVMBuildShuffleVector(gallivm->builder, vector, - LLVMGetUndef(lp_build_vec_type(gallivm, dst_type)), + LLVMGetUndef(lp_build_vec_type(gallivm, src_type)), shuffle, ""); } else { - LLVMValueRef scalar; - scalar = LLVMBuildExtractElement(gallivm->builder, vector, index, ""); - if (dst_type.length == 1) { - /* - * Trivial extract scalar from vector. - */ - - res = scalar; - } - else { - /* - * General case of different sized vectors. - */ - - res = lp_build_broadcast(gallivm, - lp_build_vec_type(gallivm, dst_type), - vector); - } + /* + * Trivial extract scalar from vector. 
+ */ + res = LLVMBuildExtractElement(gallivm->builder, vector, index, ""); } } @@ -290,6 +277,8 @@ lp_build_swizzle_aos(struct lp_build_context *bld, return bld->zero; case PIPE_SWIZZLE_ONE: return bld->one; + case LP_BLD_SWIZZLE_DONTCARE: + return bld->undef; default: assert(0); return bld->undef; @@ -319,21 +308,26 @@ lp_build_swizzle_aos(struct lp_build_context *bld, case PIPE_SWIZZLE_BLUE: case PIPE_SWIZZLE_ALPHA: shuffle = j + swizzles[i]; + shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0); break; case PIPE_SWIZZLE_ZERO: shuffle = type.length + 0; + shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0); if (!aux[0]) { aux[0] = lp_build_const_elem(bld->gallivm, type, 0.0); } break; case PIPE_SWIZZLE_ONE: shuffle = type.length + 1; + shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0); if (!aux[1]) { aux[1] = lp_build_const_elem(bld->gallivm, type, 1.0); } break; + case LP_BLD_SWIZZLE_DONTCARE: + shuffles[j + i] = LLVMGetUndef(i32t); + break; } - shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0); } } @@ -508,3 +502,127 @@ lp_build_swizzle_soa_inplace(struct lp_build_context *bld, lp_build_swizzle_soa(bld, unswizzled, swizzles, values); } + + +/** + * Transpose from AOS <-> SOA + * + * @param single_type_lp type of pixels + * @param src the 4 * n pixel input + * @param dst the 4 * n pixel output + */ +void +lp_build_transpose_aos(struct gallivm_state *gallivm, + struct lp_type single_type_lp, + const LLVMValueRef src[4], + LLVMValueRef dst[4]) +{ + struct lp_type double_type_lp = single_type_lp; + LLVMTypeRef single_type; + LLVMTypeRef double_type; + LLVMValueRef t0, t1, t2, t3; + + double_type_lp.length >>= 1; + double_type_lp.width <<= 1; + + double_type = lp_build_vec_type(gallivm, double_type_lp); + single_type = lp_build_vec_type(gallivm, single_type_lp); + + /* Interleave x, y, z, w -> xy and zw */ + t0 = lp_build_interleave2_half(gallivm, single_type_lp, src[0], src[1], 0); + t1 = lp_build_interleave2_half(gallivm, single_type_lp, src[2], src[3], 0); + t2 = 
lp_build_interleave2_half(gallivm, single_type_lp, src[0], src[1], 1); + t3 = lp_build_interleave2_half(gallivm, single_type_lp, src[2], src[3], 1); + + /* Cast to double width type for second interleave */ + t0 = LLVMBuildBitCast(gallivm->builder, t0, double_type, "t0"); + t1 = LLVMBuildBitCast(gallivm->builder, t1, double_type, "t1"); + t2 = LLVMBuildBitCast(gallivm->builder, t2, double_type, "t2"); + t3 = LLVMBuildBitCast(gallivm->builder, t3, double_type, "t3"); + + /* Interleave xy, zw -> xyzw */ + dst[0] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 0); + dst[1] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 1); + dst[2] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 0); + dst[3] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 1); + + /* Cast back to original single width type */ + dst[0] = LLVMBuildBitCast(gallivm->builder, dst[0], single_type, "dst0"); + dst[1] = LLVMBuildBitCast(gallivm->builder, dst[1], single_type, "dst1"); + dst[2] = LLVMBuildBitCast(gallivm->builder, dst[2], single_type, "dst2"); + dst[3] = LLVMBuildBitCast(gallivm->builder, dst[3], single_type, "dst3"); +} + + +/** + * Pack first element of aos values, + * pad out to destination size. + * i.e. 
x1 _ _ _ x2 _ _ _ will become x1 x2 _ _ + */ +LLVMValueRef +lp_build_pack_aos_scalars(struct gallivm_state *gallivm, + struct lp_type src_type, + struct lp_type dst_type, + const LLVMValueRef src) +{ + LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); + LLVMValueRef undef = LLVMGetUndef(i32t); + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; + unsigned num_src = src_type.length / 4; + unsigned num_dst = dst_type.length; + unsigned i; + + assert(num_src <= num_dst); + + for (i = 0; i < num_src; i++) { + shuffles[i] = LLVMConstInt(i32t, i * 4, 0); + } + for (i = num_src; i < num_dst; i++) { + shuffles[i] = undef; + } + + if (num_dst == 1) { + return LLVMBuildExtractElement(gallivm->builder, src, shuffles[0], ""); + } + else { + return LLVMBuildShuffleVector(gallivm->builder, src, src, + LLVMConstVector(shuffles, num_dst), ""); + } +} + + +/** + * Unpack and broadcast packed aos values consisting of only the + * first value, i.e. x1 x2 _ _ will become x1 x1 x1 x1 x2 x2 x2 x2 + */ +LLVMValueRef +lp_build_unpack_broadcast_aos_scalars(struct gallivm_state *gallivm, + struct lp_type src_type, + struct lp_type dst_type, + const LLVMValueRef src) +{ + LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; + unsigned num_dst = dst_type.length; + unsigned num_src = dst_type.length / 4; + unsigned i; + + assert(num_dst / 4 <= src_type.length); + + for (i = 0; i < num_src; i++) { + shuffles[i*4] = LLVMConstInt(i32t, i, 0); + shuffles[i*4+1] = LLVMConstInt(i32t, i, 0); + shuffles[i*4+2] = LLVMConstInt(i32t, i, 0); + shuffles[i*4+3] = LLVMConstInt(i32t, i, 0); + } + + if (num_src == 1) { + return lp_build_extract_broadcast(gallivm, src_type, dst_type, + src, shuffles[0]); + } + else { + return LLVMBuildShuffleVector(gallivm->builder, src, src, + LLVMConstVector(shuffles, num_dst), ""); + } +} + diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h index 
c366a65..0bf4ce9 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h @@ -44,6 +44,9 @@ struct lp_type; struct lp_build_context; +#define LP_BLD_SWIZZLE_DONTCARE 0xFF + + LLVMValueRef lp_build_broadcast(struct gallivm_state *gallivm, LLVMTypeRef vec_type, @@ -103,4 +106,25 @@ lp_build_swizzle_soa_inplace(struct lp_build_context *bld, const unsigned char swizzles[4]); +void +lp_build_transpose_aos(struct gallivm_state *gallivm, + struct lp_type type, + const LLVMValueRef src[4], + LLVMValueRef dst[4]); + + +LLVMValueRef +lp_build_pack_aos_scalars(struct gallivm_state *gallivm, + struct lp_type src_type, + struct lp_type dst_type, + const LLVMValueRef src); + + +LLVMValueRef +lp_build_unpack_broadcast_aos_scalars(struct gallivm_state *gallivm, + struct lp_type src_type, + struct lp_type dst_type, + const LLVMValueRef src); + + #endif /* !LP_BLD_SWIZZLE_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h index 4423bc5..e292420 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h @@ -60,6 +60,7 @@ struct tgsi_token; struct tgsi_shader_info; struct lp_build_mask_context; struct gallivm_state; +struct lp_derivatives; enum lp_build_tex_modifier { @@ -174,8 +175,7 @@ struct lp_build_sampler_soa unsigned unit, unsigned num_coords, const LLVMValueRef *coords, - const LLVMValueRef *ddx, - const LLVMValueRef *ddy, + const struct lp_derivatives *derivs, LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ LLVMValueRef *texel); @@ -183,6 +183,7 @@ struct lp_build_sampler_soa void (*emit_size_query)( const struct lp_build_sampler_soa *sampler, struct gallivm_state *gallivm, + struct lp_type type, unsigned unit, LLVMValueRef explicit_lod, /* optional */ LLVMValueRef *sizes_out); @@ -197,8 +198,7 @@ struct lp_build_sampler_aos unsigned target, /* TGSI_TEXTURE_* */ unsigned unit, 
LLVMValueRef coords, - LLVMValueRef ddx, - LLVMValueRef ddy, + const struct lp_derivatives derivs, enum lp_build_tex_modifier modifier); }; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c index 24bc13a..0666bba 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c @@ -56,6 +56,7 @@ #include "lp_bld_quad.h" #include "lp_bld_tgsi.h" #include "lp_bld_debug.h" +#include "lp_bld_sample.h" /** @@ -363,6 +364,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld, LLVMValueRef coords; LLVMValueRef ddx; LLVMValueRef ddy; + struct lp_derivatives derivs; if (!bld->sampler) { _debug_printf("warning: found texture instruction but no sampler generator supplied\n"); @@ -373,7 +375,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld, coords = lp_build_emit_fetch( &bld->bld_base, inst, 0 , LP_CHAN_ALL); - if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) { + if (0 && modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) { ddx = lp_build_emit_fetch( &bld->bld_base, inst, 1 , LP_CHAN_ALL); ddy = lp_build_emit_fetch( &bld->bld_base, inst, 2 , LP_CHAN_ALL); unit = inst->Src[3].Register.Index; @@ -383,8 +385,8 @@ emit_tex(struct lp_build_tgsi_aos_context *bld, ddy = lp_build_ddy( &bld->bld_base.base, coords ); #else /* TODO */ - ddx = bld->bld_base.base.one; - ddy = bld->bld_base.base.one; + derivs.ddx_ddy[0] = bld->bld_base.base.one; + derivs.ddx_ddy[1] = bld->bld_base.base.one; #endif unit = inst->Src[1].Register.Index; } @@ -392,7 +394,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld, return bld->sampler->emit_fetch_texel(bld->sampler, &bld->bld_base.base, target, unit, - coords, ddx, ddy, + coords, derivs, modifier); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c index d9faaf2..85a4401 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c 
@@ -62,6 +62,7 @@ #include "lp_bld_limits.h" #include "lp_bld_debug.h" #include "lp_bld_printf.h" +#include "lp_bld_sample.h" static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld) @@ -763,7 +764,7 @@ emit_fetch_temporary( else { LLVMValueRef temp_ptr; if (stype != TGSI_TYPE_FLOAT && stype != TGSI_TYPE_UNTYPED) { - LLVMTypeRef itype = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0); + LLVMTypeRef itype = LLVMPointerType(bld->bld_base.int_bld.vec_type, 0); LLVMValueRef tint_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle); temp_ptr = LLVMBuildBitCast(builder, tint_ptr, itype, ""); @@ -1068,7 +1069,7 @@ emit_store_chan( switch (dtype) { case TGSI_TYPE_UNSIGNED: case TGSI_TYPE_SIGNED: { - LLVMTypeRef itype = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4); + LLVMTypeRef itype = bld_base->int_bld.vec_type; LLVMTypeRef ivtype = LLVMPointerType(itype, 0); LLVMValueRef tint_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, chan_index); @@ -1141,13 +1142,14 @@ emit_tex( struct lp_build_tgsi_soa_context *bld, LLVMValueRef *texel) { LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; + struct gallivm_state *gallivm = bld->bld_base.base.gallivm; unsigned unit; LLVMValueRef lod_bias, explicit_lod; LLVMValueRef oow = NULL; LLVMValueRef coords[3]; - LLVMValueRef ddx[3]; - LLVMValueRef ddy[3]; + struct lp_derivatives derivs; unsigned num_coords; + unsigned dims; unsigned i; if (!bld->sampler) { @@ -1158,26 +1160,42 @@ emit_tex( struct lp_build_tgsi_soa_context *bld, return; } + derivs.ddx_ddy[0] = bld->bld_base.base.undef; + derivs.ddx_ddy[1] = bld->bld_base.base.undef; + switch (inst->Texture.Texture) { case TGSI_TEXTURE_1D: num_coords = 1; + dims = 1; break; case TGSI_TEXTURE_1D_ARRAY: + num_coords = 2; + dims = 1; + break; case TGSI_TEXTURE_2D: case TGSI_TEXTURE_RECT: num_coords = 2; + dims = 2; break; case TGSI_TEXTURE_SHADOW1D: case TGSI_TEXTURE_SHADOW1D_ARRAY: + num_coords = 
3; + dims = 1; + break; case TGSI_TEXTURE_SHADOW2D: case TGSI_TEXTURE_SHADOWRECT: case TGSI_TEXTURE_2D_ARRAY: - case TGSI_TEXTURE_3D: case TGSI_TEXTURE_CUBE: num_coords = 3; + dims = 2; + break; + case TGSI_TEXTURE_3D: + num_coords = 3; + dims = 3; break; case TGSI_TEXTURE_SHADOW2D_ARRAY: num_coords = 4; + dims = 2; break; default: assert(0); @@ -1212,31 +1230,66 @@ emit_tex( struct lp_build_tgsi_soa_context *bld, } if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) { - LLVMValueRef index0 = lp_build_const_int32(bld->bld_base.base.gallivm, 0); - for (i = 0; i < num_coords; i++) { - LLVMValueRef src1 = lp_build_emit_fetch( &bld->bld_base, inst, 1, i ); - LLVMValueRef src2 = lp_build_emit_fetch( &bld->bld_base, inst, 2, i ); - ddx[i] = LLVMBuildExtractElement(builder, src1, index0, ""); - ddy[i] = LLVMBuildExtractElement(builder, src2, index0, ""); + LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef ddxdyonec[3]; + unsigned length = bld->bld_base.base.type.length; + unsigned num_quads = length / 4; + unsigned dim; + unsigned quad; + + for (dim = 0; dim < dims; ++dim) { + LLVMValueRef srcx = lp_build_emit_fetch( &bld->bld_base, inst, 1, dim ); + LLVMValueRef srcy = lp_build_emit_fetch( &bld->bld_base, inst, 2, dim ); + for (quad = 0; quad < num_quads; ++quad) { + unsigned s1 = 4*quad; + unsigned s2 = 4*quad + length; + shuffles[4*quad + 0] = lp_build_const_int32(gallivm, s1); + shuffles[4*quad + 1] = lp_build_const_int32(gallivm, s2); + shuffles[4*quad + 2] = i32undef; + shuffles[4*quad + 3] = i32undef; + } + ddxdyonec[dim] = LLVMBuildShuffleVector(builder, srcx, srcy, + LLVMConstVector(shuffles, length), ""); + } + if (dims == 1) { + derivs.ddx_ddy[0] = ddxdyonec[0]; + } + else if (dims >= 2) { + for (quad = 0; quad < num_quads; ++quad) { + unsigned s1 = 4*quad; + unsigned s2 = 4*quad + length; + shuffles[4*quad + 0] = lp_build_const_int32(gallivm, s1); + shuffles[4*quad + 1] 
= lp_build_const_int32(gallivm, s1 + 1); + shuffles[4*quad + 2] = lp_build_const_int32(gallivm, s2); + shuffles[4*quad + 3] = lp_build_const_int32(gallivm, s2 + 1); + } + derivs.ddx_ddy[0] = LLVMBuildShuffleVector(builder, ddxdyonec[0], ddxdyonec[1], + LLVMConstVector(shuffles, length), ""); + if (dims == 3) { + derivs.ddx_ddy[1] = ddxdyonec[2]; + } } unit = inst->Src[3].Register.Index; } else { - for (i = 0; i < num_coords; i++) { - ddx[i] = lp_build_scalar_ddx( &bld->bld_base.base, coords[i] ); - ddy[i] = lp_build_scalar_ddy( &bld->bld_base.base, coords[i] ); + if (dims == 1) { + derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(&bld->bld_base.base, coords[0]); + } + else if (dims >= 2) { + derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(&bld->bld_base.base, + coords[0], coords[1]); + if (dims == 3) { + derivs.ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(&bld->bld_base.base, coords[2]); + } } unit = inst->Src[1].Register.Index; } - for (i = num_coords; i < 3; i++) { - ddx[i] = LLVMGetUndef(bld->bld_base.base.elem_type); - ddy[i] = LLVMGetUndef(bld->bld_base.base.elem_type); - } bld->sampler->emit_fetch_texel(bld->sampler, bld->bld_base.base.gallivm, bld->bld_base.base.type, unit, num_coords, coords, - ddx, ddy, + &derivs, lod_bias, explicit_lod, texel); } @@ -1310,6 +1363,7 @@ emit_txq( struct lp_build_tgsi_soa_context *bld, bld->sampler->emit_size_query(bld->sampler, bld->bld_base.base.gallivm, + bld->bld_base.int_bld.type, inst->Src[1].Register.Index, explicit_lod, sizes_out); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.c b/src/gallium/auxiliary/gallivm/lp_bld_type.c index 413e69b..6c3aa38 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_type.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_type.c @@ -38,6 +38,9 @@ lp_build_elem_type(struct gallivm_state *gallivm, struct lp_type type) { if (type.floating) { switch(type.width) { + case 16: + return LLVMIntTypeInContext(gallivm->context, 16); + break; case 32: return 
LLVMFloatTypeInContext(gallivm->context); break; @@ -85,6 +88,10 @@ lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type) if (type.floating) { switch(type.width) { + case 16: + if(elem_kind != LLVMIntegerTypeKind) + return FALSE; + break; case 32: if(elem_kind != LLVMFloatTypeKind) return FALSE; @@ -168,27 +175,6 @@ lp_build_int_vec_type(struct gallivm_state *gallivm, struct lp_type type) /** - * Build int32[4] vector type - */ -LLVMTypeRef -lp_build_int32_vec4_type(struct gallivm_state *gallivm) -{ - struct lp_type t; - LLVMTypeRef type; - - memset(&t, 0, sizeof(t)); - t.floating = FALSE; /* floating point values */ - t.sign = TRUE; /* values are signed */ - t.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */ - t.width = 32; /* 32-bit int */ - t.length = 4; /* 4 elements per vector */ - - type = lp_build_int_elem_type(gallivm, t); - return LLVMVectorType(type, t.length); -} - - -/** * Create element of vector type */ struct lp_type diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h index f11a190..75310e0 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_type.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h @@ -40,21 +40,35 @@ #include "pipe/p_compiler.h" #include "gallivm/lp_bld.h" +/** + * Native SIMD architecture width available at runtime. + * + * Using this width should give the best performance, + * and it determines the necessary alignment of vector variables. + */ +extern unsigned lp_native_vector_width; +/** + * Maximum supported vector width (not necessarily supported at run-time). + * + * Should only be used when lp_native_vector_width isn't available, + * i.e. sizing/alignment of non-malloced variables. + */ +#define LP_MAX_VECTOR_WIDTH 256 /** - * Native SIMD register width. + * Minimum vector alignment for static variable alignment * - * 128 for all architectures we care about. + * It should always be a constant equal to LP_MAX_VECTOR_WIDTH/8. 
An + * expression is non-portable. */ -#define LP_NATIVE_VECTOR_WIDTH 128 +#define LP_MIN_VECTOR_ALIGN 32 /** * Several functions can only cope with vectors of length up to this value. * You may need to increase that value if you want to represent bigger vectors. */ -#define LP_MAX_VECTOR_LENGTH 16 - +#define LP_MAX_VECTOR_LENGTH (LP_MAX_VECTOR_WIDTH/8) /** * The LLVM type system can't conveniently express all the things we care about @@ -151,6 +165,13 @@ struct lp_build_context }; +static INLINE unsigned +lp_type_width(struct lp_type type) +{ + return type.width * type.length; +} + + /** Create scalar float type */ static INLINE struct lp_type lp_type_float(unsigned width) @@ -169,7 +190,7 @@ lp_type_float(unsigned width) /** Create vector of float type */ static INLINE struct lp_type -lp_type_float_vec(unsigned width) +lp_type_float_vec(unsigned width, unsigned total_width) { struct lp_type res_type; @@ -177,7 +198,7 @@ lp_type_float_vec(unsigned width) res_type.floating = TRUE; res_type.sign = TRUE; res_type.width = width; - res_type.length = LP_NATIVE_VECTOR_WIDTH / width; + res_type.length = total_width / width; return res_type; } @@ -200,14 +221,14 @@ lp_type_int(unsigned width) /** Create vector int type */ static INLINE struct lp_type -lp_type_int_vec(unsigned width) +lp_type_int_vec(unsigned width, unsigned total_width) { struct lp_type res_type; memset(&res_type, 0, sizeof res_type); res_type.sign = TRUE; res_type.width = width; - res_type.length = LP_NATIVE_VECTOR_WIDTH / width; + res_type.length = total_width / width; return res_type; } @@ -229,34 +250,34 @@ lp_type_uint(unsigned width) /** Create vector uint type */ static INLINE struct lp_type -lp_type_uint_vec(unsigned width) +lp_type_uint_vec(unsigned width, unsigned total_width) { struct lp_type res_type; memset(&res_type, 0, sizeof res_type); res_type.width = width; - res_type.length = LP_NATIVE_VECTOR_WIDTH / width; + res_type.length = total_width / width; return res_type; } static INLINE struct 
lp_type -lp_type_unorm(unsigned width) +lp_type_unorm(unsigned width, unsigned total_width) { struct lp_type res_type; memset(&res_type, 0, sizeof res_type); res_type.norm = TRUE; res_type.width = width; - res_type.length = LP_NATIVE_VECTOR_WIDTH / width; + res_type.length = total_width / width; return res_type; } static INLINE struct lp_type -lp_type_fixed(unsigned width) +lp_type_fixed(unsigned width, unsigned total_width) { struct lp_type res_type; @@ -264,21 +285,21 @@ lp_type_fixed(unsigned width) res_type.sign = TRUE; res_type.fixed = TRUE; res_type.width = width; - res_type.length = LP_NATIVE_VECTOR_WIDTH / width; + res_type.length = total_width / width; return res_type; } static INLINE struct lp_type -lp_type_ufixed(unsigned width) +lp_type_ufixed(unsigned width, unsigned total_width) { struct lp_type res_type; memset(&res_type, 0, sizeof res_type); res_type.fixed = TRUE; res_type.width = width; - res_type.length = LP_NATIVE_VECTOR_WIDTH / width; + res_type.length = total_width / width; return res_type; } @@ -312,10 +333,6 @@ LLVMTypeRef lp_build_int_vec_type(struct gallivm_state *gallivm, struct lp_type type); -LLVMTypeRef -lp_build_int32_vec4_type(struct gallivm_state *gallivm); - - static INLINE struct lp_type lp_float32_vec4_type(void) { diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/gallium/auxiliary/util/u_cpu_detect.h index 856e8d7..b44d9d9 100644 --- a/src/gallium/auxiliary/util/u_cpu_detect.h +++ b/src/gallium/auxiliary/util/u_cpu_detect.h @@ -35,9 +35,16 @@ #ifndef _UTIL_CPU_DETECT_H #define _UTIL_CPU_DETECT_H + #include "pipe/p_compiler.h" #include "pipe/p_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + + struct util_cpu_caps { unsigned nr_cpus; @@ -66,4 +73,9 @@ util_cpu_caps; void util_cpu_detect(void); +#ifdef __cplusplus +} +#endif + + #endif /* _UTIL_CPU_DETECT_H */ diff --git a/src/gallium/drivers/llvmpipe/.gitignore b/src/gallium/drivers/llvmpipe/.gitignore index f6973b5..21cd3cf 100644 --- 
a/src/gallium/drivers/llvmpipe/.gitignore +++ b/src/gallium/drivers/llvmpipe/.gitignore @@ -4,4 +4,3 @@ lp_test_blend lp_test_conv lp_test_format lp_test_printf -lp_test_round diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile index 26fbde9..ef16fc7 100644 --- a/src/gallium/drivers/llvmpipe/Makefile +++ b/src/gallium/drivers/llvmpipe/Makefile @@ -55,8 +55,7 @@ PROGS := lp_test_format \ lp_test_arit \ lp_test_blend \ lp_test_conv \ - lp_test_printf \ - lp_test_round + lp_test_printf # Need this for the lp_test_*.o files CLEAN_EXTRA = *.o diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript index 85560a1..cea44a7 100644 --- a/src/gallium/drivers/llvmpipe/SConscript +++ b/src/gallium/drivers/llvmpipe/SConscript @@ -94,7 +94,6 @@ if not env['embedded']: if not env['msvc']: tests.append('arit') - tests.append('round') for test in tests: testname = 'lp_test_' + test diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c index 87a6a27..8efa75c 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c +++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c @@ -59,6 +59,7 @@ #include "pipe/p_state.h" #include "util/u_format.h" +#include "util/u_cpu_detect.h" #include "gallivm/lp_bld_type.h" #include "gallivm/lp_bld_arit.h" @@ -102,7 +103,16 @@ lp_build_stencil_test_single(struct lp_build_context *bld, struct lp_type type = bld->type; LLVMValueRef res; - assert(type.sign); + /* + * SSE2 has intrinsics for signed comparisons, but not unsigned ones. Values + * are between 0..255 so ensure we generate the fastest comparisons for + * wider elements. 
+ */ + if (type.width <= 8) { + assert(!type.sign); + } else { + assert(type.sign); + } assert(stencil->enabled); @@ -424,29 +434,86 @@ lp_build_occlusion_count(struct gallivm_state *gallivm, LLVMBuilderRef builder = gallivm->builder; LLVMContextRef context = gallivm->context; LLVMValueRef countmask = lp_build_const_int_vec(gallivm, type, 1); - LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv"); - LLVMTypeRef i8v16 = LLVMVectorType(LLVMInt8TypeInContext(context), 16); - LLVMValueRef counti = LLVMBuildBitCast(builder, countv, i8v16, "counti"); - LLVMValueRef maskarray[4] = { - lp_build_const_int32(gallivm, 0), - lp_build_const_int32(gallivm, 4), - lp_build_const_int32(gallivm, 8), - lp_build_const_int32(gallivm, 12) - }; - LLVMValueRef shufflemask = LLVMConstVector(maskarray, 4); - LLVMValueRef shufflev = LLVMBuildShuffleVector(builder, counti, LLVMGetUndef(i8v16), shufflemask, "shufflev"); - LLVMValueRef shuffle = LLVMBuildBitCast(builder, shufflev, LLVMInt32TypeInContext(context), "shuffle"); - LLVMValueRef count = lp_build_intrinsic_unary(builder, "llvm.ctpop.i32", LLVMInt32TypeInContext(context), shuffle); - LLVMValueRef orig = LLVMBuildLoad(builder, counter, "orig"); - LLVMValueRef incr = LLVMBuildAdd(builder, orig, count, "incr"); - LLVMBuildStore(builder, incr, counter); + LLVMValueRef count, newcount; + + assert(type.length <= 16); + assert(type.floating); + + if(util_cpu_caps.has_sse && type.length == 4) { + const char *movmskintr = "llvm.x86.sse.movmsk.ps"; + const char *popcntintr = "llvm.ctpop.i32"; + LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue, + lp_build_vec_type(gallivm, type), ""); + bits = lp_build_intrinsic_unary(builder, movmskintr, + LLVMInt32TypeInContext(context), bits); + count = lp_build_intrinsic_unary(builder, popcntintr, + LLVMInt32TypeInContext(context), bits); + } + else if(util_cpu_caps.has_avx && type.length == 8) { + const char *movmskintr = "llvm.x86.avx.movmsk.ps.256"; + const char *popcntintr = 
"llvm.ctpop.i32"; + LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue, + lp_build_vec_type(gallivm, type), ""); + bits = lp_build_intrinsic_unary(builder, movmskintr, + LLVMInt32TypeInContext(context), bits); + count = lp_build_intrinsic_unary(builder, popcntintr, + LLVMInt32TypeInContext(context), bits); + } + else { + unsigned i; + LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv"); + LLVMTypeRef counttype = LLVMIntTypeInContext(context, type.length * 8); + LLVMTypeRef i8vntype = LLVMVectorType(LLVMInt8TypeInContext(context), type.length * 4); + LLVMValueRef shufflev, countd; + LLVMValueRef shuffles[16]; + const char *popcntintr = NULL; + + countv = LLVMBuildBitCast(builder, countv, i8vntype, ""); + + for (i = 0; i < type.length; i++) { + shuffles[i] = lp_build_const_int32(gallivm, 4*i); + } + + shufflev = LLVMConstVector(shuffles, type.length); + countd = LLVMBuildShuffleVector(builder, countv, LLVMGetUndef(i8vntype), shufflev, ""); + countd = LLVMBuildBitCast(builder, countd, counttype, "countd"); + + /* + * XXX FIXME + * this is bad on cpus without popcount (on x86 supported by intel + * nehalem, amd barcelona, and up - not tied to sse42). + * Would be much faster to just sum the 4 elements of the vector with + * some horizontal add (shuffle/add/shuffle/add after the initial and). + */ + switch (type.length) { + case 4: + popcntintr = "llvm.ctpop.i32"; + break; + case 8: + popcntintr = "llvm.ctpop.i64"; + break; + case 16: + popcntintr = "llvm.ctpop.i128"; + break; + default: + assert(0); + } + count = lp_build_intrinsic_unary(builder, popcntintr, counttype, countd); + + if (type.length > 4) { + count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 32), ""); + } + } + newcount = LLVMBuildLoad(builder, counter, "origcount"); + newcount = LLVMBuildAdd(builder, newcount, count, "newcount"); + LLVMBuildStore(builder, newcount, counter); } /** * Generate code for performing depth and/or stencil tests. 
- * We operate on a vector of values (typically a 2x2 quad). + * We operate on a vector of values (typically n 2x2 quads). * * \param depth the depth test state * \param stencil the front/back stencil state @@ -454,9 +521,9 @@ lp_build_occlusion_count(struct gallivm_state *gallivm, * \param format_desc description of the depth/stencil surface * \param mask the alive/dead pixel mask for the quad (vector) * \param stencil_refs the front/back stencil ref values (scalar) - * \param z_src the incoming depth/stencil values (a 2x2 quad, float32) + * \param z_src the incoming depth/stencil values (n 2x2 quad values, float32) * \param zs_dst_ptr pointer to depth/stencil values in framebuffer - * \param facing contains boolean value indicating front/back facing polygon + * \param face contains boolean value indicating front/back facing polygon */ void lp_build_depth_stencil_test(struct gallivm_state *gallivm, @@ -507,6 +574,12 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm, assert(z_type.width == z_src_type.width); assert(z_type.length == z_src_type.length); + /* FIXME: for non-float depth/stencil might generate better code + * if we'd always split it up to use 128bit operations. + * For stencil we'd almost certainly want to pack to 8xi16 values, + * for z just run twice. 
+ */ + /* Sanity checking */ { const unsigned z_swizzle = format_desc->swizzle[0]; @@ -548,7 +621,7 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm, lp_build_context_init(&z_bld, gallivm, z_type); /* Setup build context for stencil vals */ - s_type = lp_type_int_vec(z_type.width); + s_type = lp_int_type(z_type); lp_build_context_init(&s_bld, gallivm, s_type); /* Load current z/stencil value from z/stencil buffer */ diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c index 0d51ccb..d108f35 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c +++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c @@ -61,6 +61,9 @@ * # | # | # * ################# * + * If we iterate over multiple quads at once, quads 01 and 23 are processed + * together. + * * Within each quad, we have four pixels which are represented in SOA * order: * @@ -72,6 +75,10 @@ * * So the green channel (for example) of the four pixels is stored in * a single vector register: {g0, g1, g2, g3}. + * The order stays the same even with multiple quads: + * 0 1 4 5 + * 2 3 6 7 + * is stored as g0..g7 */ @@ -102,8 +109,8 @@ #define PERSPECTIVE_DIVIDE_PER_QUAD 0 -static const unsigned char quad_offset_x[4] = {0, 1, 0, 1}; -static const unsigned char quad_offset_y[4] = {0, 0, 1, 1}; +static const unsigned char quad_offset_x[16] = {0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3}; +static const unsigned char quad_offset_y[16] = {0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3}; static void @@ -115,132 +122,353 @@ attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix lp_build_name(val, "input%u.%c%s", attrib - 1, "xyzw"[chan], suffix); } - -/** - * Initialize the bld->a0, dadx, dady fields. This involves fetching - * those values from the arrays which are passed into the JIT function. 
+/* Much easier, and significantly less instructions in the per-stamp + * part (less than half) but overall more instructions so a loss if + * most quads are active. Might be a win though with larger vectors. + * No ability to do per-quad divide (doable but not implemented) + * Could be made to work with passed in pixel offsets (i.e. active quad merging). */ static void -coeffs_init(struct lp_build_interp_soa_context *bld, - LLVMValueRef a0_ptr, - LLVMValueRef dadx_ptr, - LLVMValueRef dady_ptr) +coeffs_init_simple(struct lp_build_interp_soa_context *bld, + LLVMValueRef a0_ptr, + LLVMValueRef dadx_ptr, + LLVMValueRef dady_ptr) { struct lp_build_context *coeff_bld = &bld->coeff_bld; + struct lp_build_context *setup_bld = &bld->setup_bld; struct gallivm_state *gallivm = coeff_bld->gallivm; LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef zero = LLVMConstNull(coeff_bld->elem_type); - LLVMValueRef one = LLVMConstReal(coeff_bld->elem_type, 1.0); - LLVMValueRef i0 = lp_build_const_int32(gallivm, 0); - LLVMValueRef i1 = lp_build_const_int32(gallivm, 1); - LLVMValueRef i2 = lp_build_const_int32(gallivm, 2); - LLVMValueRef i3 = lp_build_const_int32(gallivm, 3); unsigned attrib; - unsigned chan; - - /* TODO: Use more vector operations */ for (attrib = 0; attrib < bld->num_attribs; ++attrib) { + /* + * always fetch all 4 values for performance/simplicity + * Note: we do that here because it seems to generate better + * code. It generates a lot of moves initially but less + * moves later. As far as I can tell this looks like a + * llvm issue, instead of simply reloading the values from + * the passed in pointers it if it runs out of registers + * it spills/reloads them. Maybe some optimization passes + * would help. + * Might want to investigate this again later. 
+ */ + const unsigned interp = bld->interp[attrib]; + LLVMValueRef index = lp_build_const_int32(gallivm, + attrib * TGSI_NUM_CHANNELS); + LLVMValueRef ptr; + LLVMValueRef dadxaos = setup_bld->zero; + LLVMValueRef dadyaos = setup_bld->zero; + LLVMValueRef a0aos = setup_bld->zero; + + switch (interp) { + case LP_INTERP_PERSPECTIVE: + /* fall-through */ + + case LP_INTERP_LINEAR: + ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""); + ptr = LLVMBuildBitCast(builder, ptr, + LLVMPointerType(setup_bld->vec_type, 0), ""); + dadxaos = LLVMBuildLoad(builder, ptr, ""); + + ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, ""); + ptr = LLVMBuildBitCast(builder, ptr, + LLVMPointerType(setup_bld->vec_type, 0), ""); + dadyaos = LLVMBuildLoad(builder, ptr, ""); + + attrib_name(dadxaos, attrib, 0, ".dadxaos"); + attrib_name(dadyaos, attrib, 0, ".dadyaos"); + /* fall-through */ + + case LP_INTERP_CONSTANT: + case LP_INTERP_FACING: + ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, ""); + ptr = LLVMBuildBitCast(builder, ptr, + LLVMPointerType(setup_bld->vec_type, 0), ""); + a0aos = LLVMBuildLoad(builder, ptr, ""); + attrib_name(a0aos, attrib, 0, ".a0aos"); + break; + + case LP_INTERP_POSITION: + /* Nothing to do as the position coeffs are already setup in slot 0 */ + continue; + + default: + assert(0); + break; + } + bld->a0aos[attrib] = a0aos; + bld->dadxaos[attrib] = dadxaos; + bld->dadyaos[attrib] = dadyaos; + } +} + +/** + * Interpolate the shader input attribute values. + * This is called for each (group of) quad(s). 
+ */ +static void +attribs_update_simple(struct lp_build_interp_soa_context *bld, + struct gallivm_state *gallivm, + int quad_start_index, + int start, + int end) +{ + LLVMBuilderRef builder = gallivm->builder; + struct lp_build_context *coeff_bld = &bld->coeff_bld; + struct lp_build_context *setup_bld = &bld->setup_bld; + LLVMValueRef oow = NULL; + unsigned attrib, i; + LLVMValueRef pixoffx; + LLVMValueRef pixoffy; + unsigned num_pix = coeff_bld->type.length; + + /* could do this with code-generated passed in pixel offsets */ + pixoffx = coeff_bld->undef; + pixoffy = coeff_bld->undef; + for (i = 0; i < coeff_bld->type.length; i++) { + LLVMValueRef nr = lp_build_const_int32(gallivm, i); + LLVMValueRef pixxf = lp_build_const_float(gallivm, quad_offset_x[i % num_pix] + + (quad_start_index & 1) * 2); + LLVMValueRef pixyf = lp_build_const_float(gallivm, quad_offset_y[i % num_pix] + + (quad_start_index & 2)); + pixoffx = LLVMBuildInsertElement(builder, pixoffx, pixxf, nr, ""); + pixoffy = LLVMBuildInsertElement(builder, pixoffy, pixyf, nr, ""); + } + + pixoffx = LLVMBuildFAdd(builder, pixoffx, + lp_build_broadcast_scalar(coeff_bld, bld->x), ""); + pixoffy = LLVMBuildFAdd(builder, pixoffy, + lp_build_broadcast_scalar(coeff_bld, bld->y), ""); + + for (attrib = start; attrib < end; attrib++) { const unsigned mask = bld->mask[attrib]; const unsigned interp = bld->interp[attrib]; - for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) { + unsigned chan; + + for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { if (mask & (1 << chan)) { - LLVMValueRef index = lp_build_const_int32(gallivm, - attrib * TGSI_NUM_CHANNELS + chan); - LLVMValueRef a0 = zero; - LLVMValueRef dadx = zero; - LLVMValueRef dady = zero; - LLVMValueRef dadxy = zero; - LLVMValueRef dadq; - LLVMValueRef dadq2; - LLVMValueRef a; + LLVMValueRef index; + LLVMValueRef dadx = coeff_bld->zero; + LLVMValueRef dady = coeff_bld->zero; + LLVMValueRef a = coeff_bld->zero; + index = lp_build_const_int32(gallivm, chan); switch 
(interp) { case LP_INTERP_PERSPECTIVE: /* fall-through */ case LP_INTERP_LINEAR: if (attrib == 0 && chan == 0) { - dadxy = dadx = one; + dadx = coeff_bld->one; } else if (attrib == 0 && chan == 1) { - dadxy = dady = one; + dady = coeff_bld->one; } else { - dadx = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""), ""); - dady = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dady_ptr, &index, 1, ""), ""); - dadxy = LLVMBuildFAdd(builder, dadx, dady, ""); - attrib_name(dadx, attrib, chan, ".dadx"); - attrib_name(dady, attrib, chan, ".dady"); - attrib_name(dadxy, attrib, chan, ".dadxy"); + dadx = lp_build_extract_broadcast(gallivm, setup_bld->type, + coeff_bld->type, bld->dadxaos[attrib], + index); + dady = lp_build_extract_broadcast(gallivm, setup_bld->type, + coeff_bld->type, bld->dadyaos[attrib], + index); + a = lp_build_extract_broadcast(gallivm, setup_bld->type, + coeff_bld->type, bld->a0aos[attrib], + index); } - /* fall-through */ + /* + * a = a0 + (x * dadx + y * dady) + */ + dadx = LLVMBuildFMul(builder, dadx, pixoffx, ""); + dady = LLVMBuildFMul(builder, dady, pixoffy, ""); + a = LLVMBuildFAdd(builder, a, dadx, ""); + a = LLVMBuildFAdd(builder, a, dady, ""); + + if (interp == LP_INTERP_PERSPECTIVE) { + if (oow == NULL) { + LLVMValueRef w = bld->attribs[0][3]; + assert(attrib != 0); + assert(bld->mask[0] & TGSI_WRITEMASK_W); + oow = lp_build_rcp(coeff_bld, w); + } + a = lp_build_mul(coeff_bld, a, oow); + } + break; case LP_INTERP_CONSTANT: case LP_INTERP_FACING: - a0 = LLVMBuildLoad(builder, LLVMBuildGEP(builder, a0_ptr, &index, 1, ""), ""); - attrib_name(a0, attrib, chan, ".a0"); + a = lp_build_extract_broadcast(gallivm, setup_bld->type, + coeff_bld->type, bld->a0aos[attrib], + index); break; case LP_INTERP_POSITION: - /* Nothing to do as the position coeffs are already setup in slot 0 */ - continue; + assert(attrib > 0); + a = bld->attribs[0][chan]; + break; default: assert(0); break; } - /* - * dadq = {0, dadx, dady, dadx + dady} - */ + if 
((attrib == 0) && (chan == 2)){ + /* FIXME: Depth values can exceed 1.0, due to the fact that + * setup interpolation coefficients refer to (0,0) which causes + * precision loss. So we must clamp to 1.0 here to avoid artifacts + */ + a = lp_build_min(coeff_bld, a, coeff_bld->one); + } + bld->attribs[attrib][chan] = a; + } + } + } +} - dadq = coeff_bld->undef; - dadq = LLVMBuildInsertElement(builder, dadq, zero, i0, ""); - dadq = LLVMBuildInsertElement(builder, dadq, dadx, i1, ""); - dadq = LLVMBuildInsertElement(builder, dadq, dady, i2, ""); - dadq = LLVMBuildInsertElement(builder, dadq, dadxy, i3, ""); +/** + * Initialize the bld->a, dadq fields. This involves fetching + * those values from the arrays which are passed into the JIT function. + */ +static void +coeffs_init(struct lp_build_interp_soa_context *bld, + LLVMValueRef a0_ptr, + LLVMValueRef dadx_ptr, + LLVMValueRef dady_ptr) +{ + struct lp_build_context *coeff_bld = &bld->coeff_bld; + struct lp_build_context *setup_bld = &bld->setup_bld; + struct gallivm_state *gallivm = coeff_bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef pixoffx, pixoffy; + unsigned attrib; + unsigned chan; + unsigned i; + + pixoffx = coeff_bld->undef; + pixoffy = coeff_bld->undef; + for (i = 0; i < coeff_bld->type.length; i++) { + LLVMValueRef nr = lp_build_const_int32(gallivm, i); + LLVMValueRef pixxf = lp_build_const_float(gallivm, quad_offset_x[i]); + LLVMValueRef pixyf = lp_build_const_float(gallivm, quad_offset_y[i]); + pixoffx = LLVMBuildInsertElement(builder, pixoffx, pixxf, nr, ""); + pixoffy = LLVMBuildInsertElement(builder, pixoffy, pixyf, nr, ""); + } - /* - * dadq2 = 2 * dq - */ - dadq2 = LLVMBuildFAdd(builder, dadq, dadq, ""); + for (attrib = 0; attrib < bld->num_attribs; ++attrib) { + const unsigned mask = bld->mask[attrib]; + const unsigned interp = bld->interp[attrib]; + LLVMValueRef index = lp_build_const_int32(gallivm, + attrib * TGSI_NUM_CHANNELS); + LLVMValueRef ptr; + LLVMValueRef dadxaos 
= setup_bld->zero; + LLVMValueRef dadyaos = setup_bld->zero; + LLVMValueRef a0aos = setup_bld->zero; + + /* always fetch all 4 values for performance/simplicity */ + switch (interp) { + case LP_INTERP_PERSPECTIVE: + /* fall-through */ + + case LP_INTERP_LINEAR: + ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""); + ptr = LLVMBuildBitCast(builder, ptr, + LLVMPointerType(setup_bld->vec_type, 0), ""); + dadxaos = LLVMBuildLoad(builder, ptr, ""); + + ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, ""); + ptr = LLVMBuildBitCast(builder, ptr, + LLVMPointerType(setup_bld->vec_type, 0), ""); + dadyaos = LLVMBuildLoad(builder, ptr, ""); + + attrib_name(dadxaos, attrib, 0, ".dadxaos"); + attrib_name(dadyaos, attrib, 0, ".dadyaos"); + /* fall-through */ + + case LP_INTERP_CONSTANT: + case LP_INTERP_FACING: + ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, ""); + ptr = LLVMBuildBitCast(builder, ptr, + LLVMPointerType(setup_bld->vec_type, 0), ""); + a0aos = LLVMBuildLoad(builder, ptr, ""); + attrib_name(a0aos, attrib, 0, ".a0aos"); + break; + + case LP_INTERP_POSITION: + /* Nothing to do as the position coeffs are already setup in slot 0 */ + continue; + + default: + assert(0); + break; + } - /* - * a = a0 + (x * dadx + y * dady) - */ + /* + * a = a0 + (x * dadx + y * dady) + * a0aos is the attrib value at top left corner of stamp + */ + if (interp != LP_INTERP_CONSTANT && + interp != LP_INTERP_FACING) { + LLVMValueRef axaos, ayaos; + axaos = LLVMBuildFMul(builder, lp_build_broadcast_scalar(setup_bld, bld->x), + dadxaos, ""); + ayaos = LLVMBuildFMul(builder, lp_build_broadcast_scalar(setup_bld, bld->y), + dadyaos, ""); + a0aos = LLVMBuildFAdd(builder, a0aos, ayaos, ""); + a0aos = LLVMBuildFAdd(builder, a0aos, axaos, ""); + } + + /* + * dadq = {0, dadx, dady, dadx + dady} + * for two quads (side by side) this is: + * {0, dadx, dady, dadx+dady, 2*dadx, 2*dadx+dady, 3*dadx+dady} + */ + for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) { + /* this generates a CRAPLOAD of 
shuffles... */ + if (mask & (1 << chan)) { + LLVMValueRef dadx, dady; + LLVMValueRef dadq, dadq2; + LLVMValueRef a; + LLVMValueRef chan_index = lp_build_const_int32(gallivm, chan); if (attrib == 0 && chan == 0) { - a = bld->x; + a = lp_build_broadcast_scalar(coeff_bld, bld->x); + dadx = coeff_bld->one; + dady = coeff_bld->zero; } else if (attrib == 0 && chan == 1) { - a = bld->y; + a = lp_build_broadcast_scalar(coeff_bld, bld->y); + dady = coeff_bld->one; + dadx = coeff_bld->zero; } else { - a = a0; - if (interp != LP_INTERP_CONSTANT && - interp != LP_INTERP_FACING) { - LLVMValueRef ax, ay, axy; - ax = LLVMBuildFMul(builder, bld->x, dadx, ""); - ay = LLVMBuildFMul(builder, bld->y, dady, ""); - axy = LLVMBuildFAdd(builder, ax, ay, ""); - a = LLVMBuildFAdd(builder, a, axy, ""); - } - } + dadx = lp_build_extract_broadcast(gallivm, setup_bld->type, + coeff_bld->type, dadxaos, chan_index); + dady = lp_build_extract_broadcast(gallivm, setup_bld->type, + coeff_bld->type, dadyaos, chan_index); - /* - * a = {a, a, a, a} - */ + /* + * a = {a, a, a, a} + */ + a = lp_build_extract_broadcast(gallivm, setup_bld->type, + coeff_bld->type, a0aos, chan_index); + } - a = lp_build_broadcast(gallivm, coeff_bld->vec_type, a); + dadx = LLVMBuildFMul(builder, dadx, pixoffx, ""); + dady = LLVMBuildFMul(builder, dady, pixoffy, ""); + dadq = LLVMBuildFAdd(builder, dadx, dady, ""); /* - * Compute the attrib values on the upper-left corner of each quad. + * Compute the attrib values on the upper-left corner of each + * group of quads. + * Note that if we process 2 quads at once this doesn't + * really exactly to what we want. + * We need to access elem 0 and 2 respectively later if we process + * 2 quads at once. 
*/ if (interp != LP_INTERP_CONSTANT && interp != LP_INTERP_FACING) { + dadq2 = LLVMBuildFAdd(builder, dadq, dadq, ""); a = LLVMBuildFAdd(builder, a, dadq2, ""); } @@ -249,6 +477,12 @@ coeffs_init(struct lp_build_interp_soa_context *bld, * a *= 1 / w */ + /* + * XXX since we're only going to access elements 0,2 out of 8 + * if we have 8-wide vectors we should do the division only 4-wide. + * a is really a 2-elements in a 4-wide vector disguised as 8-wide + * in this case. + */ if (interp == LP_INTERP_PERSPECTIVE) { LLVMValueRef w = bld->a[0][3]; assert(attrib != 0); @@ -279,18 +513,18 @@ coeffs_init(struct lp_build_interp_soa_context *bld, static void attribs_update(struct lp_build_interp_soa_context *bld, struct gallivm_state *gallivm, - int quad_index, + int quad_start_index, int start, int end) { LLVMBuilderRef builder = gallivm->builder; struct lp_build_context *coeff_bld = &bld->coeff_bld; - LLVMValueRef shuffle = lp_build_const_int_vec(gallivm, coeff_bld->type, quad_index); + LLVMValueRef shuffle = lp_build_const_int_vec(gallivm, coeff_bld->type, quad_start_index); LLVMValueRef oow = NULL; unsigned attrib; unsigned chan; - assert(quad_index < 4); + assert(quad_start_index < 4); for(attrib = start; attrib < end; ++attrib) { const unsigned mask = bld->mask[attrib]; @@ -412,6 +646,7 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld, LLVMValueRef y0) { struct lp_type coeff_type; + struct lp_type setup_type; unsigned attrib; unsigned chan; @@ -421,19 +656,26 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld, coeff_type.floating = TRUE; coeff_type.sign = TRUE; coeff_type.width = 32; - coeff_type.length = TGSI_QUAD_SIZE; + coeff_type.length = type.length; + + memset(&setup_type, 0, sizeof setup_type); + setup_type.floating = TRUE; + setup_type.sign = TRUE; + setup_type.width = 32; + setup_type.length = TGSI_NUM_CHANNELS; + /* XXX: we don't support interpolating into any other types */ assert(memcmp(&coeff_type, &type, sizeof 
coeff_type) == 0); lp_build_context_init(&bld->coeff_bld, gallivm, coeff_type); + lp_build_context_init(&bld->setup_bld, gallivm, setup_type); /* For convenience */ bld->pos = bld->attribs[0]; bld->inputs = (const LLVMValueRef (*)[TGSI_NUM_CHANNELS]) bld->attribs[1]; /* Position */ - bld->num_attribs = 1; bld->mask[0] = TGSI_WRITEMASK_XYZW; bld->interp[0] = LP_INTERP_LINEAR; @@ -453,7 +695,12 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld, pos_init(bld, x0, y0); - coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr); + if (coeff_type.length > 4) { + coeffs_init_simple(bld, a0_ptr, dadx_ptr, dady_ptr); + } + else { + coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr); + } } @@ -463,20 +710,30 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld, void lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context *bld, struct gallivm_state *gallivm, - int quad_index) + int quad_start_index) { - assert(quad_index < 4); + assert(quad_start_index < 4); - attribs_update(bld, gallivm, quad_index, 1, bld->num_attribs); + if (bld->coeff_bld.type.length > 4) { + attribs_update_simple(bld, gallivm, quad_start_index, 1, bld->num_attribs); + } + else { + attribs_update(bld, gallivm, quad_start_index, 1, bld->num_attribs); + } } void lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context *bld, struct gallivm_state *gallivm, - int quad_index) + int quad_start_index) { - assert(quad_index < 4); + assert(quad_start_index < 4); - attribs_update(bld, gallivm, quad_index, 0, 1); + if (bld->coeff_bld.type.length > 4) { + attribs_update_simple(bld, gallivm, quad_start_index, 0, 1); + } + else { + attribs_update(bld, gallivm, quad_start_index, 0, 1); + } } diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h index 6970a9b..f293b58 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h +++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h @@ -79,6 +79,7 @@ struct lp_build_interp_soa_context { /* 
TGSI_QUAD_SIZE x float */ struct lp_build_context coeff_bld; + struct lp_build_context setup_bld; unsigned num_attribs; unsigned mask[1 + PIPE_MAX_SHADER_INPUTS]; /**< TGSI_WRITE_MASK_x */ @@ -87,8 +88,11 @@ struct lp_build_interp_soa_context LLVMValueRef x; LLVMValueRef y; - LLVMValueRef a [1 + PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; + LLVMValueRef a[1 + PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; LLVMValueRef dadq[1 + PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; + LLVMValueRef a0aos[1 + PIPE_MAX_SHADER_INPUTS]; + LLVMValueRef dadxaos[1 + PIPE_MAX_SHADER_INPUTS]; + LLVMValueRef dadyaos[1 + PIPE_MAX_SHADER_INPUTS]; LLVMValueRef oow; @@ -118,12 +122,12 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld, void lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context *bld, struct gallivm_state *gallivm, - int quad_index); + int quad_start_index); void lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context *bld, struct gallivm_state *gallivm, - int quad_index); + int quad_start_index); #endif /* LP_BLD_INTERP_H */ diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c index 9e4c7d6..07cea91 100644 --- a/src/gallium/drivers/llvmpipe/lp_context.c +++ b/src/gallium/drivers/llvmpipe/lp_context.c @@ -51,42 +51,6 @@ unsigned llvmpipe_variant_count; -/** - * This function is called by the gallivm "garbage collector" when - * the LLVM global data structures are freed. We must free all LLVM-related - * data. Specifically, all JIT'd shader variants.
- */ -static void -garbage_collect_callback(void *cb_data) -{ - struct llvmpipe_context *lp = (struct llvmpipe_context *) cb_data; - struct lp_fs_variant_list_item *li; - - /* Free all the context's shader variants */ - li = first_elem(&lp->fs_variants_list); - while (!at_end(&lp->fs_variants_list, li)) { - struct lp_fs_variant_list_item *next = next_elem(li); - llvmpipe_remove_shader_variant(lp, li->base); - li = next; - } - - /* Free all the context's primitive setup variants */ - lp_delete_setup_variants(lp); - - /* release references to setup variants, shaders */ - lp_setup_set_setup_variant(lp->setup, NULL); - lp_setup_set_fs_variant(lp->setup, NULL); - lp_setup_reset(lp->setup); - - /* This type will be recreated upon demand */ - lp->jit_context_ptr_type = NULL; - - /* mark all state as dirty to ensure new shaders are jit'd, etc. */ - lp->dirty = ~0; -} - - - static void llvmpipe_destroy( struct pipe_context *pipe ) { struct llvmpipe_context *llvmpipe = llvmpipe_context( pipe ); @@ -94,9 +58,6 @@ static void llvmpipe_destroy( struct pipe_context *pipe ) lp_print_counters(); - gallivm_remove_garbage_collector_callback(garbage_collect_callback, - llvmpipe); - /* This will also destroy llvmpipe->setup: */ if (llvmpipe->draw) @@ -128,8 +89,6 @@ static void llvmpipe_destroy( struct pipe_context *pipe ) lp_delete_setup_variants(llvmpipe); - gallivm_destroy(llvmpipe->gallivm); - align_free( llvmpipe ); } @@ -195,12 +154,10 @@ llvmpipe_create_context( struct pipe_screen *screen, void *priv ) llvmpipe_init_context_resource_funcs( &llvmpipe->pipe ); llvmpipe_init_surface_functions(llvmpipe); - llvmpipe->gallivm = gallivm_create(); - /* * Create drawing context and plug our rendering stage into it. 
*/ - llvmpipe->draw = draw_create_gallivm(&llvmpipe->pipe, llvmpipe->gallivm); + llvmpipe->draw = draw_create(&llvmpipe->pipe); if (!llvmpipe->draw) goto fail; @@ -226,9 +183,6 @@ llvmpipe_create_context( struct pipe_screen *screen, void *priv ) lp_reset_counters(); - gallivm_register_garbage_collector_callback(garbage_collect_callback, - llvmpipe); - return &llvmpipe->pipe; fail: diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h index d475070..d0220e1 100644 --- a/src/gallium/drivers/llvmpipe/lp_context.h +++ b/src/gallium/drivers/llvmpipe/lp_context.h @@ -131,10 +131,6 @@ struct llvmpipe_context { unsigned nr_fs_variants; unsigned nr_fs_instrs; - /** JIT code generation */ - struct gallivm_state *gallivm; - LLVMTypeRef jit_context_ptr_type; - struct lp_setup_variant_list_item setup_variants_list; unsigned nr_setup_variants; diff --git a/src/gallium/drivers/llvmpipe/lp_flush.c b/src/gallium/drivers/llvmpipe/lp_flush.c index 4243055..964b792 100644 --- a/src/gallium/drivers/llvmpipe/lp_flush.c +++ b/src/gallium/drivers/llvmpipe/lp_flush.c @@ -54,13 +54,6 @@ llvmpipe_flush( struct pipe_context *pipe, /* ask the setup module to flush */ lp_setup_flush(llvmpipe->setup, fence, reason); - - if (llvmpipe_variant_count > 1000) { - /* time to do a garbage collection */ - gallivm_garbage_collect(llvmpipe->gallivm); - llvmpipe_variant_count = 0; - } - /* Enable to dump BMPs of the color/depth buffers each frame */ if (0) { static unsigned frame_no = 1; diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c index eb1db84..7a85eab 100644 --- a/src/gallium/drivers/llvmpipe/lp_jit.c +++ b/src/gallium/drivers/llvmpipe/lp_jit.c @@ -41,7 +41,7 @@ static void -lp_jit_create_types(struct llvmpipe_context *lp) +lp_jit_create_types(struct lp_fragment_shader_variant *lp) { struct gallivm_state *gallivm = lp->gallivm; LLVMContextRef lc = gallivm->context; @@ -183,11 +183,9 @@ lp_jit_screen_init(struct 
llvmpipe_screen *screen) } -LLVMTypeRef -lp_jit_get_context_type(struct llvmpipe_context *lp) +void +lp_jit_init_types(struct lp_fragment_shader_variant *lp) { if (!lp->jit_context_ptr_type) lp_jit_create_types(lp); - - return lp->jit_context_ptr_type; } diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h index 04e8dd5..584d2c8 100644 --- a/src/gallium/drivers/llvmpipe/lp_jit.h +++ b/src/gallium/drivers/llvmpipe/lp_jit.h @@ -42,6 +42,7 @@ #include "lp_texture.h" +struct lp_fragment_shader_variant; struct llvmpipe_screen; @@ -164,8 +165,8 @@ void lp_jit_screen_init(struct llvmpipe_screen *screen); -LLVMTypeRef -lp_jit_get_context_type(struct llvmpipe_context *lp); +void +lp_jit_init_types(struct lp_fragment_shader_variant *lp); #endif /* LP_JIT_H */ diff --git a/src/gallium/drivers/llvmpipe/lp_memory.c b/src/gallium/drivers/llvmpipe/lp_memory.c index 0f55d4a..85f73e5 100644 --- a/src/gallium/drivers/llvmpipe/lp_memory.c +++ b/src/gallium/drivers/llvmpipe/lp_memory.c @@ -36,10 +36,12 @@ * number of threads or using a smaller tilesize when multiple * colorbuffers are bound. */ -PIPE_ALIGN_VAR(16) uint8_t lp_swizzled_cbuf[LP_MAX_THREADS][PIPE_MAX_COLOR_BUFS][TILE_SIZE * TILE_SIZE * 4]; +PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN) +uint8_t lp_swizzled_cbuf[LP_MAX_THREADS][PIPE_MAX_COLOR_BUFS][TILE_SIZE * TILE_SIZE * 4]; /* A single dummy tile used in a couple of out-of-memory situations. 
*/ -PIPE_ALIGN_VAR(16) uint8_t lp_dummy_tile[TILE_SIZE * TILE_SIZE * 4]; +PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN) +uint8_t lp_dummy_tile[TILE_SIZE * TILE_SIZE * 4]; diff --git a/src/gallium/drivers/llvmpipe/lp_memory.h b/src/gallium/drivers/llvmpipe/lp_memory.h index f7418f5..5552c29 100644 --- a/src/gallium/drivers/llvmpipe/lp_memory.h +++ b/src/gallium/drivers/llvmpipe/lp_memory.h @@ -32,9 +32,12 @@ #include "pipe/p_compiler.h" #include "pipe/p_state.h" #include "lp_limits.h" +#include "gallivm/lp_bld_type.h" -extern PIPE_ALIGN_VAR(16) uint8_t lp_swizzled_cbuf[LP_MAX_THREADS][PIPE_MAX_COLOR_BUFS][TILE_SIZE * TILE_SIZE * 4]; +extern PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN) +uint8_t lp_swizzled_cbuf[LP_MAX_THREADS][PIPE_MAX_COLOR_BUFS][TILE_SIZE * TILE_SIZE * 4]; -extern PIPE_ALIGN_VAR(16) uint8_t lp_dummy_tile[TILE_SIZE * TILE_SIZE * 4]; +extern PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN) +uint8_t lp_dummy_tile[TILE_SIZE * TILE_SIZE * 4]; #endif /* LP_MEMORY_H */ diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c index 09af027..d743d76 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.c +++ b/src/gallium/drivers/llvmpipe/lp_rast.c @@ -42,6 +42,7 @@ #include "lp_tile_soa.h" #include "gallivm/lp_bld_debug.h" #include "lp_scene.h" +#include "lp_tex_sample.h" #ifdef DEBUG diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index 03d15f6..54f4535 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -97,56 +97,56 @@ #include "lp_state_fs.h" -#include <llvm-c/Analysis.h> -#include <llvm-c/BitWriter.h> - - /** Fragment shader number (for debugging) */ static unsigned fs_no = 0; /** - * Expand the relevent bits of mask_input to a 4-dword mask for the - * four pixels in a 2x2 quad. This will set the four elements of the + * Expand the relevant bits of mask_input to a n*4-dword mask for the + * n*four pixels in n 2x2 quads. 
This will set the n*four elements of the * quad mask vector to 0 or ~0. + * Grouping is 01, 23 for 2 quad mode hence only 0 and 2 are valid + * quad arguments with fs length 8. * - * \param quad which quad of the quad group to test, in [0,3] + * \param first_quad which quad(s) of the quad group to test, in [0,3] * \param mask_input bitwise mask for the whole 4x4 stamp */ static LLVMValueRef generate_quad_mask(struct gallivm_state *gallivm, struct lp_type fs_type, - unsigned quad, + unsigned first_quad, LLVMValueRef mask_input) /* int32 */ { LLVMBuilderRef builder = gallivm->builder; struct lp_type mask_type; LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); - LLVMValueRef bits[4]; + LLVMValueRef bits[16]; LLVMValueRef mask; - int shift; + int shift, i; /* * XXX: We'll need a different path for 16 x u8 */ assert(fs_type.width == 32); - assert(fs_type.length == 4); + assert(fs_type.length <= Elements(bits)); mask_type = lp_int_type(fs_type); /* * mask_input >>= (quad * 4) */ - switch (quad) { + switch (first_quad) { case 0: shift = 0; break; case 1: + assert(fs_type.length == 4); shift = 2; break; case 2: shift = 8; break; case 3: + assert(fs_type.length == 4); shift = 10; break; default: @@ -166,12 +166,14 @@ generate_quad_mask(struct gallivm_state *gallivm, lp_build_vec_type(gallivm, mask_type), mask_input); - bits[0] = LLVMConstInt(i32t, 1 << 0, 0); - bits[1] = LLVMConstInt(i32t, 1 << 1, 0); - bits[2] = LLVMConstInt(i32t, 1 << 4, 0); - bits[3] = LLVMConstInt(i32t, 1 << 5, 0); - - mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, 4), ""); + for (i = 0; i < fs_type.length / 4; i++) { + unsigned j = 2 * (i % 2) + (i / 2) * 8; + bits[4*i + 0] = LLVMConstInt(i32t, 1 << (j + 0), 0); + bits[4*i + 1] = LLVMConstInt(i32t, 1 << (j + 1), 0); + bits[4*i + 2] = LLVMConstInt(i32t, 1 << (j + 4), 0); + bits[4*i + 3] = LLVMConstInt(i32t, 1 << (j + 5), 0); + } + mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, fs_type.length), ""); /* * mask = mask != 0 ? 
~0 : 0 @@ -300,7 +302,7 @@ generate_fs(struct gallivm_state *gallivm, /* do triangle edge testing */ if (partial_mask) { *pmask = generate_quad_mask(gallivm, type, - i, mask_input); + i*type.length/4, mask_input); } else { *pmask = lp_build_const_int_vec(gallivm, type, ~0); @@ -312,7 +314,7 @@ generate_fs(struct gallivm_state *gallivm, if (!(depth_mode & EARLY_DEPTH_TEST) && !simple_shader) lp_build_mask_check(&mask); - lp_build_interp_soa_update_pos(interp, gallivm, i); + lp_build_interp_soa_update_pos(interp, gallivm, i*type.length/4); z = interp->pos[2]; if (depth_mode & EARLY_DEPTH_TEST) { @@ -333,7 +335,7 @@ generate_fs(struct gallivm_state *gallivm, } } - lp_build_interp_soa_update_inputs(interp, gallivm, i); + lp_build_interp_soa_update_inputs(interp, gallivm, i*type.length/4); /* Build the actual shader */ lp_build_tgsi_soa(gallivm, tokens, type, &mask, @@ -515,7 +517,7 @@ generate_fragment(struct llvmpipe_context *lp, struct lp_fragment_shader_variant *variant, unsigned partial_mask) { - struct gallivm_state *gallivm = lp->gallivm; + struct gallivm_state *gallivm = variant->gallivm; const struct lp_fragment_shader_variant_key *key = &variant->key; struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS]; char func_name[256]; @@ -541,8 +543,8 @@ generate_fragment(struct llvmpipe_context *lp, LLVMBuilderRef builder; struct lp_build_sampler_soa *sampler; struct lp_build_interp_soa_context interp; - LLVMValueRef fs_mask[LP_MAX_VECTOR_LENGTH]; - LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][LP_MAX_VECTOR_LENGTH]; + LLVMValueRef fs_mask[16 / 4]; + LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][16 / 4]; LLVMValueRef blend_mask; LLVMValueRef function; LLVMValueRef facing; @@ -553,6 +555,8 @@ generate_fragment(struct llvmpipe_context *lp, unsigned cbuf; boolean cbuf0_write_all; + assert(lp_native_vector_width / 32 >= 4); + /* Adjust color input interpolation according to flatshade state: */ memcpy(inputs, shader->inputs, 
shader->info.base.num_inputs * sizeof inputs[0]); @@ -579,12 +583,12 @@ generate_fragment(struct llvmpipe_context *lp, * characteristics. */ memset(&fs_type, 0, sizeof fs_type); - fs_type.floating = TRUE; /* floating point values */ - fs_type.sign = TRUE; /* values are signed */ - fs_type.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */ - fs_type.width = 32; /* 32-bit float */ - fs_type.length = 4; /* 4 elements per vector */ - num_fs = 4; /* number of quads per block */ + fs_type.floating = TRUE; /* floating point values */ + fs_type.sign = TRUE; /* values are signed */ + fs_type.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */ + fs_type.width = 32; /* 32-bit float */ + fs_type.length = MIN2(lp_native_vector_width / 32, 16); /* n*4 elements per vector */ + num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */ memset(&blend_type, 0, sizeof blend_type); blend_type.floating = FALSE; /* values are integers */ @@ -605,7 +609,7 @@ generate_fragment(struct llvmpipe_context *lp, util_snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s", shader->no, variant->no, partial_mask ? 
"partial" : "whole"); - arg_types[0] = lp_jit_get_context_type(lp); /* context */ + arg_types[0] = variant->jit_context_ptr_type; /* context */ arg_types[1] = int32_type; /* x */ arg_types[2] = int32_type; /* y */ arg_types[3] = int32_type; /* facing */ @@ -738,20 +742,20 @@ generate_fragment(struct llvmpipe_context *lp, LLVMBuildLoad(builder, fs_out_color[cbuf][chan][i], "fs_color_vals"); } - lp_build_conv(gallivm, fs_type, blend_type, + lp_build_conv(gallivm, fs_type, blend_type, fs_color_vals, num_fs, - &blend_in_color[chan], 1); + &blend_in_color[chan], 1); - lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]); + lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]); } if (partial_mask || !variant->opaque) { - lp_build_conv_mask(lp->gallivm, fs_type, blend_type, + lp_build_conv_mask(variant->gallivm, fs_type, blend_type, fs_mask, num_fs, &blend_mask, 1); } else { - blend_mask = lp_build_const_int_vec(lp->gallivm, blend_type, ~0); + blend_mask = lp_build_const_int_vec(variant->gallivm, blend_type, ~0); } color_ptr = LLVMBuildLoad(builder, @@ -772,7 +776,7 @@ generate_fragment(struct llvmpipe_context *lp, !key->alpha.enabled && !shader->info.base.uses_kill); - generate_blend(lp->gallivm, + generate_blend(variant->gallivm, &key->blend, rt, builder, @@ -787,43 +791,9 @@ generate_fragment(struct llvmpipe_context *lp, LLVMBuildRetVoid(builder); - /* Verify the LLVM IR. 
If invalid, dump and abort */ -#ifdef DEBUG - if(LLVMVerifyFunction(function, LLVMPrintMessageAction)) { - if (1) - lp_debug_dump_value(function); - abort(); - } -#endif - - /* Apply optimizations to LLVM IR */ - LLVMRunFunctionPassManager(gallivm->passmgr, function); - - if ((gallivm_debug & GALLIVM_DEBUG_IR) || (LP_DEBUG & DEBUG_FS)) { - /* Print the LLVM IR to stderr */ - lp_debug_dump_value(function); - debug_printf("\n"); - } - - /* Dump byte code to a file */ - if (0) { - LLVMWriteBitcodeToFile(gallivm->module, "llvmpipe.bc"); - } + gallivm_verify_function(gallivm, function); variant->nr_instrs += lp_build_count_instructions(function); - /* - * Translate the LLVM IR into machine code. - */ - { - void *f = LLVMGetPointerToGlobal(gallivm->engine, function); - - variant->jit_function[partial_mask] = (lp_jit_frag_func)pointer_to_func(f); - - if ((gallivm_debug & GALLIVM_DEBUG_ASM) || (LP_DEBUG & DEBUG_FS)) { - lp_disassemble(f); - } - lp_func_delete_body(function); - } } @@ -937,6 +907,12 @@ generate_variant(struct llvmpipe_context *lp, if(!variant) return NULL; + variant->gallivm = gallivm_create(); + if (!variant->gallivm) { + FREE(variant); + return NULL; + } + variant->shader = shader; variant->list_item_global.base = variant; variant->list_item_local.base = variant; @@ -968,12 +944,35 @@ generate_variant(struct llvmpipe_context *lp, lp_debug_fs_variant(variant); } - generate_fragment(lp, shader, variant, RAST_EDGE_TEST); + lp_jit_init_types(variant); + + if (variant->jit_function[RAST_EDGE_TEST] == NULL) + generate_fragment(lp, shader, variant, RAST_EDGE_TEST); + + if (variant->jit_function[RAST_WHOLE] == NULL) { + if (variant->opaque) { + /* Specialized shader, which doesn't need to read the color buffer. 
*/ + generate_fragment(lp, shader, variant, RAST_WHOLE); + } + } + + /* + * Compile everything + */ + + gallivm_compile_module(variant->gallivm); + + if (variant->function[RAST_EDGE_TEST]) { + variant->jit_function[RAST_EDGE_TEST] = (lp_jit_frag_func) + gallivm_jit_function(variant->gallivm, + variant->function[RAST_EDGE_TEST]); + } - if (variant->opaque) { - /* Specialized shader, which doesn't need to read the color buffer. */ - generate_fragment(lp, shader, variant, RAST_WHOLE); - } else { + if (variant->function[RAST_WHOLE]) { + variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func) + gallivm_jit_function(variant->gallivm, + variant->function[RAST_WHOLE]); + } else if (!variant->jit_function[RAST_WHOLE]) { variant->jit_function[RAST_WHOLE] = variant->jit_function[RAST_EDGE_TEST]; } @@ -1116,13 +1115,14 @@ llvmpipe_remove_shader_variant(struct llvmpipe_context *lp, /* free all the variant's JIT'd functions */ for (i = 0; i < Elements(variant->function); i++) { if (variant->function[i]) { - if (variant->jit_function[i]) - LLVMFreeMachineCodeForFunction(lp->gallivm->engine, - variant->function[i]); - LLVMDeleteFunction(variant->function[i]); + gallivm_free_function(variant->gallivm, + variant->function[i], + variant->jit_function[i]); } } + gallivm_destroy(variant->gallivm); + /* remove from shader's list */ remove_from_list(&variant->list_item_local); variant->shader->variants_cached--; diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h index 273d241..306f5f9 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.h +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h @@ -84,6 +84,12 @@ struct lp_fragment_shader_variant boolean opaque; + struct gallivm_state *gallivm; + + LLVMTypeRef jit_context_ptr_type; + LLVMTypeRef jit_thread_data_ptr_type; + LLVMTypeRef jit_linear_context_ptr_type; + LLVMValueRef function[2]; lp_jit_frag_func jit_function[2]; diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.c 
b/src/gallium/drivers/llvmpipe/lp_state_setup.c index 299c1ef..1d5e50b 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_setup.c +++ b/src/gallium/drivers/llvmpipe/lp_state_setup.c @@ -38,7 +38,6 @@ #include "gallivm/lp_bld_intr.h" #include "gallivm/lp_bld_flow.h" #include "gallivm/lp_bld_type.h" -#include <llvm-c/Analysis.h> /* for LLVMVerifyFunction */ #include "lp_perf.h" #include "lp_debug.h" @@ -77,12 +76,6 @@ struct lp_setup_args LLVMValueRef dy01_ooa; LLVMValueRef dx20_ooa; LLVMValueRef dx01_ooa; - - /* Temporary, per-attribute: - */ - LLVMValueRef v0a; - LLVMValueRef v1a; - LLVMValueRef v2a; }; @@ -146,7 +139,7 @@ store_coef(struct gallivm_state *gallivm, { LLVMBuilderRef builder = gallivm->builder; LLVMValueRef idx = lp_build_const_int32(gallivm, slot); - + LLVMBuildStore(builder, a0, LLVMBuildGEP(builder, args->a0, &idx, 1, "")); @@ -210,27 +203,13 @@ vert_attrib(struct gallivm_state *gallivm, return LLVMBuildLoad(b, LLVMBuildGEP(b, vert, idx, 2, ""), name); } -static LLVMValueRef -vert_clamp(LLVMBuilderRef b, - LLVMValueRef x, - LLVMValueRef min, - LLVMValueRef max) -{ - LLVMValueRef min_result = LLVMBuildFCmp(b, LLVMRealUGT, min, x, ""); - LLVMValueRef max_result = LLVMBuildFCmp(b, LLVMRealUGT, x, max, ""); - LLVMValueRef clamp_value; - - clamp_value = LLVMBuildSelect(b, min_result, min, x, ""); - clamp_value = LLVMBuildSelect(b, max_result, max, x, ""); - - return clamp_value; -} static void lp_twoside(struct gallivm_state *gallivm, struct lp_setup_args *args, const struct lp_setup_variant_key *key, - int bcolor_slot) + int bcolor_slot, + LLVMValueRef attribv[3]) { LLVMBuilderRef b = gallivm->builder; LLVMValueRef a0_back, a1_back, a2_back; @@ -248,67 +227,66 @@ lp_twoside(struct gallivm_state *gallivm, * Prefer select to if so we don't have to worry about phis or * allocas. 
*/ - args->v0a = LLVMBuildSelect(b, front_facing, a0_back, args->v0a, ""); - args->v1a = LLVMBuildSelect(b, front_facing, a1_back, args->v1a, ""); - args->v2a = LLVMBuildSelect(b, front_facing, a2_back, args->v2a, ""); + attribv[0] = LLVMBuildSelect(b, front_facing, a0_back, attribv[0], ""); + attribv[1] = LLVMBuildSelect(b, front_facing, a1_back, attribv[1], ""); + attribv[2] = LLVMBuildSelect(b, front_facing, a2_back, attribv[2], ""); } static void lp_do_offset_tri(struct gallivm_state *gallivm, struct lp_setup_args *args, - const struct lp_setup_variant_key *key) + const struct lp_setup_variant_key *key, + LLVMValueRef inv_det, + LLVMValueRef dxyz01, + LLVMValueRef dxyz20, + LLVMValueRef attribv[3]) { LLVMBuilderRef b = gallivm->builder; struct lp_build_context bld; LLVMValueRef zoffset, mult; LLVMValueRef z0_new, z1_new, z2_new; - LLVMValueRef dzdx0, dzdx, dzdy0, dzdy; - LLVMValueRef max, max_value; - - LLVMValueRef one = lp_build_const_float(gallivm, 1.0); - LLVMValueRef zero = lp_build_const_float(gallivm, 0.0); - LLVMValueRef two = lp_build_const_int32(gallivm, 2); - - /* edge vectors: e = v0 - v2, f = v1 - v2 */ - LLVMValueRef v0_x = vert_attrib(gallivm, args->v0, 0, 0, "v0_x"); - LLVMValueRef v1_x = vert_attrib(gallivm, args->v1, 0, 0, "v1_x"); - LLVMValueRef v2_x = vert_attrib(gallivm, args->v2, 0, 0, "v2_x"); - LLVMValueRef v0_y = vert_attrib(gallivm, args->v0, 0, 1, "v0_y"); - LLVMValueRef v1_y = vert_attrib(gallivm, args->v1, 0, 1, "v1_y"); - LLVMValueRef v2_y = vert_attrib(gallivm, args->v2, 0, 1, "v2_y"); - LLVMValueRef v0_z = vert_attrib(gallivm, args->v0, 0, 2, "v0_z"); - LLVMValueRef v1_z = vert_attrib(gallivm, args->v1, 0, 2, "v1_z"); - LLVMValueRef v2_z = vert_attrib(gallivm, args->v2, 0, 2, "v2_z"); - - /* edge vectors: e = v0 - v2, f = v1 - v2 */ - LLVMValueRef dx02 = LLVMBuildFSub(b, v0_x, v2_x, "dx02"); - LLVMValueRef dy02 = LLVMBuildFSub(b, v0_y, v2_y, "dy02"); - LLVMValueRef dz02 = LLVMBuildFSub(b, v0_z, v2_z, "dz02"); - LLVMValueRef dx12 
= LLVMBuildFSub(b, v1_x, v2_x, "dx12"); - LLVMValueRef dy12 = LLVMBuildFSub(b, v1_y, v2_y, "dy12"); - LLVMValueRef dz12 = LLVMBuildFSub(b, v1_z, v2_z, "dz12"); - - /* det = cross(e,f).z */ - LLVMValueRef dx02_dy12 = LLVMBuildFMul(b, dx02, dy12, "dx02_dy12"); - LLVMValueRef dy02_dx12 = LLVMBuildFMul(b, dy02, dx12, "dy02_dx12"); - LLVMValueRef det = LLVMBuildFSub(b, dx02_dy12, dy02_dx12, "det"); - LLVMValueRef inv_det = LLVMBuildFDiv(b, one, det, "inv_det"); - - /* (res1,res2) = cross(e,f).xy */ - LLVMValueRef dy02_dz12 = LLVMBuildFMul(b, dy02, dz12, "dy02_dz12"); - LLVMValueRef dz02_dy12 = LLVMBuildFMul(b, dz02, dy12, "dz02_dy12"); - LLVMValueRef dz02_dx12 = LLVMBuildFMul(b, dz02, dx12, "dz02_dx12"); - LLVMValueRef dx02_dz12 = LLVMBuildFMul(b, dx02, dz12, "dx02_dz12"); - LLVMValueRef res1 = LLVMBuildFSub(b, dy02_dz12, dz02_dy12, "res1"); - LLVMValueRef res2 = LLVMBuildFSub(b, dz02_dx12, dx02_dz12, "res2"); + LLVMValueRef dzdxdzdy, dzdx, dzdy, dzxyz20, dyzzx01, dyzzx01_dzxyz20, dzx01_dyz20; + LLVMValueRef z0z1, z0z1z2; + LLVMValueRef max, max_value, res12; + LLVMValueRef shuffles[4]; + LLVMTypeRef shuf_type = LLVMInt32TypeInContext(gallivm->context); + LLVMValueRef onei = lp_build_const_int32(gallivm, 1); + LLVMValueRef zeroi = lp_build_const_int32(gallivm, 0); + LLVMValueRef twoi = lp_build_const_int32(gallivm, 2); + LLVMValueRef threei = lp_build_const_int32(gallivm, 3); + + /* (res12) = cross(e,f).xy */ + shuffles[0] = twoi; + shuffles[1] = zeroi; + shuffles[2] = onei; + shuffles[3] = twoi; + dzxyz20 = LLVMBuildShuffleVector(b, dxyz20, dxyz20, LLVMConstVector(shuffles, 4), ""); + + shuffles[0] = onei; + shuffles[1] = twoi; + shuffles[2] = twoi; + shuffles[3] = zeroi; + dyzzx01 = LLVMBuildShuffleVector(b, dxyz01, dxyz01, LLVMConstVector(shuffles, 4), ""); + + dyzzx01_dzxyz20 = LLVMBuildFMul(b, dzxyz20, dyzzx01, "dyzzx01_dzxyz20"); + + shuffles[0] = twoi; + shuffles[1] = threei; + shuffles[2] = LLVMGetUndef(shuf_type); + shuffles[3] = LLVMGetUndef(shuf_type); + 
dzx01_dyz20 = LLVMBuildShuffleVector(b, dyzzx01_dzxyz20, dyzzx01_dzxyz20, + LLVMConstVector(shuffles, 4), ""); + + res12 = LLVMBuildFSub(b, dyzzx01_dzxyz20, dzx01_dyz20, "res12"); /* dzdx = fabsf(res1 * inv_det), dydx = fabsf(res2 * inv_det)*/ - lp_build_context_init(&bld, gallivm, lp_type_float(32)); - dzdx0 = LLVMBuildFMul(b, res1, inv_det, "dzdx"); - dzdx = lp_build_abs(&bld, dzdx0); - dzdy0 = LLVMBuildFMul(b, res2, inv_det, "dzdy"); - dzdy = lp_build_abs(&bld, dzdy0); + lp_build_context_init(&bld, gallivm, lp_type_float_vec(32, 128)); + dzdxdzdy = LLVMBuildFMul(b, res12, inv_det, "dzdxdzdy"); + dzdxdzdy = lp_build_abs(&bld, dzdxdzdy); + + dzdx = LLVMBuildExtractElement(b, dzdxdzdy, zeroi, ""); + dzdy = LLVMBuildExtractElement(b, dzdxdzdy, onei, ""); /* zoffset = offset->units + MAX2(dzdx, dzdy) * offset->scale */ max = LLVMBuildFCmp(b, LLVMRealUGT, dzdx, dzdy, ""); @@ -317,45 +295,56 @@ lp_do_offset_tri(struct gallivm_state *gallivm, mult = LLVMBuildFMul(b, max_value, lp_build_const_float(gallivm, key->scale), ""); zoffset = LLVMBuildFAdd(b, lp_build_const_float(gallivm, key->units), mult, "zoffset"); + /* yuck */ + shuffles[0] = twoi; + shuffles[1] = lp_build_const_int32(gallivm, 6); + shuffles[2] = LLVMGetUndef(shuf_type); + shuffles[3] = LLVMGetUndef(shuf_type); + z0z1 = LLVMBuildShuffleVector(b, attribv[0], attribv[1], LLVMConstVector(shuffles, 4), ""); + shuffles[0] = zeroi; + shuffles[1] = onei; + shuffles[2] = lp_build_const_int32(gallivm, 6); + shuffles[3] = LLVMGetUndef(shuf_type); + z0z1z2 = LLVMBuildShuffleVector(b, z0z1, attribv[2], LLVMConstVector(shuffles, 4), ""); + zoffset = vec4f_from_scalar(gallivm, zoffset, ""); + /* clamp and do offset */ - z0_new = vert_clamp(b, LLVMBuildFAdd(b, v0_z, zoffset, ""), zero, one); - z1_new = vert_clamp(b, LLVMBuildFAdd(b, v1_z, zoffset, ""), zero, one); - z2_new = vert_clamp(b, LLVMBuildFAdd(b, v2_z, zoffset, ""), zero, one); + z0z1z2 = lp_build_clamp(&bld, LLVMBuildFAdd(b, z0z1z2, zoffset, ""), bld.zero, 
bld.one); /* insert into args->a0.z, a1.z, a2.z: - */ - args->v0a = LLVMBuildInsertElement(b, args->v0a, z0_new, two, ""); - args->v1a = LLVMBuildInsertElement(b, args->v1a, z1_new, two, ""); - args->v2a = LLVMBuildInsertElement(b, args->v2a, z2_new, two, ""); + */ + z0_new = LLVMBuildExtractElement(b, z0z1z2, zeroi, ""); + z1_new = LLVMBuildExtractElement(b, z0z1z2, onei, ""); + z2_new = LLVMBuildExtractElement(b, z0z1z2, twoi, ""); + attribv[0] = LLVMBuildInsertElement(b, attribv[0], z0_new, twoi, ""); + attribv[1] = LLVMBuildInsertElement(b, attribv[1], z1_new, twoi, ""); + attribv[2] = LLVMBuildInsertElement(b, attribv[2], z2_new, twoi, ""); } static void load_attribute(struct gallivm_state *gallivm, struct lp_setup_args *args, const struct lp_setup_variant_key *key, - unsigned vert_attr) + unsigned vert_attr, + LLVMValueRef attribv[3]) { LLVMBuilderRef b = gallivm->builder; LLVMValueRef idx = lp_build_const_int32(gallivm, vert_attr); /* Load the vertex data */ - args->v0a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v0, &idx, 1, ""), "v0a"); - args->v1a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v1, &idx, 1, ""), "v1a"); - args->v2a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v2, &idx, 1, ""), "v2a"); + attribv[0] = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v0, &idx, 1, ""), "v0a"); + attribv[1] = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v1, &idx, 1, ""), "v1a"); + attribv[2] = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v2, &idx, 1, ""), "v2a"); - /* Potentially modify it according to twoside, offset, etc: + /* Potentially modify it according to twoside, etc: */ - if (vert_attr == 0 && (key->scale != 0.0f || key->units != 0.0f)) { - lp_do_offset_tri(gallivm, args, key); - } - if (key->twoside) { if (vert_attr == key->color_slot && key->bcolor_slot >= 0) - lp_twoside(gallivm, args, key, key->bcolor_slot); + lp_twoside(gallivm, args, key, key->bcolor_slot, attribv); else if (vert_attr == key->spec_slot && key->bspec_slot >= 0) - lp_twoside(gallivm, args, key, key->bspec_slot); + 
lp_twoside(gallivm, args, key, key->bspec_slot, attribv); } } @@ -375,8 +364,6 @@ emit_coef4( struct gallivm_state *gallivm, LLVMValueRef x0_center = args->x0_center; LLVMValueRef y0_center = args->y0_center; - /* XXX: using fsub, fmul on vector types -- does this work?? - */ LLVMValueRef da01 = LLVMBuildFSub(b, a0, a1, "da01"); LLVMValueRef da20 = LLVMBuildFSub(b, a2, a0, "da20"); @@ -406,14 +393,15 @@ emit_coef4( struct gallivm_state *gallivm, static void emit_linear_coef( struct gallivm_state *gallivm, struct lp_setup_args *args, - unsigned slot) + unsigned slot, + LLVMValueRef attribv[3]) { /* nothing to do anymore */ emit_coef4(gallivm, args, slot, - args->v0a, - args->v1a, - args->v2a); + attribv[0], + attribv[1], + attribv[2]); } @@ -426,9 +414,10 @@ emit_linear_coef( struct gallivm_state *gallivm, * divide the interpolated value by the interpolated W at that fragment. */ static void -emit_perspective_coef( struct gallivm_state *gallivm, - struct lp_setup_args *args, - unsigned slot) +apply_perspective_corr( struct gallivm_state *gallivm, + struct lp_setup_args *args, + unsigned slot, + LLVMValueRef attribv[3]) { LLVMBuilderRef b = gallivm->builder; @@ -438,20 +427,19 @@ emit_perspective_coef( struct gallivm_state *gallivm, LLVMValueRef v1_oow = vec4f_from_scalar(gallivm, vert_attrib(gallivm, args->v1, 0, 3, ""), "v1_oow"); LLVMValueRef v2_oow = vec4f_from_scalar(gallivm, vert_attrib(gallivm, args->v2, 0, 3, ""), "v2_oow"); - LLVMValueRef v0_oow_v0a = LLVMBuildFMul(b, args->v0a, v0_oow, "v0_oow_v0a"); - LLVMValueRef v1_oow_v1a = LLVMBuildFMul(b, args->v1a, v1_oow, "v1_oow_v1a"); - LLVMValueRef v2_oow_v2a = LLVMBuildFMul(b, args->v2a, v2_oow, "v2_oow_v2a"); - - emit_coef4(gallivm, args, slot, v0_oow_v0a, v1_oow_v1a, v2_oow_v2a); + attribv[0] = LLVMBuildFMul(b, attribv[0], v0_oow, "v0_oow_v0a"); + attribv[1] = LLVMBuildFMul(b, attribv[1], v1_oow, "v1_oow_v1a"); + attribv[2] = LLVMBuildFMul(b, attribv[2], v2_oow, "v2_oow_v2a"); } static void emit_position_coef( 
struct gallivm_state *gallivm, struct lp_setup_args *args, - int slot ) + int slot, + LLVMValueRef attribv[3]) { - emit_linear_coef(gallivm, args, slot); + emit_linear_coef(gallivm, args, slot, attribv); } @@ -464,7 +452,9 @@ emit_position_coef( struct gallivm_state *gallivm, static void emit_apply_cyl_wrap(struct gallivm_state *gallivm, struct lp_setup_args *args, - uint cyl_wrap) + uint cyl_wrap, + LLVMValueRef attribv[3]) + { LLVMBuilderRef builder = gallivm->builder; struct lp_type type = lp_float32_vec4_type(); @@ -489,43 +479,43 @@ emit_apply_cyl_wrap(struct gallivm_state *gallivm, one = LLVMBuildAnd(builder, one, cyl_mask, ""); /* Edge v0 -> v1 */ - delta = LLVMBuildFSub(builder, args->v1a, args->v0a, ""); + delta = LLVMBuildFSub(builder, attribv[1], attribv[0], ""); - offset = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half); - offset = LLVMBuildAnd(builder, offset, one, ""); - offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); - args->v0a = LLVMBuildFAdd(builder, args->v0a, offset, ""); + offset = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half); + offset = LLVMBuildAnd(builder, offset, one, ""); + offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); + attribv[0] = LLVMBuildFAdd(builder, attribv[0], offset, ""); - offset = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half); - offset = LLVMBuildAnd(builder, offset, one, ""); - offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); - args->v1a = LLVMBuildFAdd(builder, args->v1a, offset, ""); + offset = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half); + offset = LLVMBuildAnd(builder, offset, one, ""); + offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); + attribv[1] = LLVMBuildFAdd(builder, attribv[1], offset, ""); /* Edge v1 -> v2 */ - delta = LLVMBuildFSub(builder, args->v2a, args->v1a, ""); + delta = LLVMBuildFSub(builder, attribv[2], attribv[1], ""); - offset = lp_build_compare(gallivm, type, 
PIPE_FUNC_GREATER, delta, pos_half); - offset = LLVMBuildAnd(builder, offset, one, ""); - offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); - args->v1a = LLVMBuildFAdd(builder, args->v1a, offset, ""); + offset = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half); + offset = LLVMBuildAnd(builder, offset, one, ""); + offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); + attribv[1] = LLVMBuildFAdd(builder, attribv[1], offset, ""); - offset = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half); - offset = LLVMBuildAnd(builder, offset, one, ""); - offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); - args->v2a = LLVMBuildFAdd(builder, args->v2a, offset, ""); + offset = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half); + offset = LLVMBuildAnd(builder, offset, one, ""); + offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); + attribv[2] = LLVMBuildFAdd(builder, attribv[2], offset, ""); /* Edge v2 -> v0 */ - delta = LLVMBuildFSub(builder, args->v0a, args->v2a, ""); + delta = LLVMBuildFSub(builder, attribv[0], attribv[2], ""); - offset = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half); - offset = LLVMBuildAnd(builder, offset, one, ""); - offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); - args->v2a = LLVMBuildFAdd(builder, args->v2a, offset, ""); + offset = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half); + offset = LLVMBuildAnd(builder, offset, one, ""); + offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); + attribv[2] = LLVMBuildFAdd(builder, attribv[2], offset, ""); - offset = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half); - offset = LLVMBuildAnd(builder, offset, one, ""); - offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); - args->v0a = LLVMBuildFAdd(builder, args->v0a, offset, ""); + offset = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half); + offset = 
LLVMBuildAnd(builder, offset, one, ""); + offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); + attribv[0] = LLVMBuildFAdd(builder, attribv[0], offset, ""); } @@ -534,43 +524,38 @@ emit_apply_cyl_wrap(struct gallivm_state *gallivm, */ static void emit_tri_coef( struct gallivm_state *gallivm, - const struct lp_setup_variant_key *key, - struct lp_setup_args *args ) + const struct lp_setup_variant_key *key, + struct lp_setup_args *args) { unsigned slot; - /* The internal position input is in slot zero: - */ - load_attribute(gallivm, args, key, 0); - emit_position_coef(gallivm, args, 0); + LLVMValueRef attribs[3]; - /* setup interpolation for all the remaining attributes: + /* setup interpolation for all the remaining attributes: */ for (slot = 0; slot < key->num_inputs; slot++) { - - if (key->inputs[slot].interp == LP_INTERP_CONSTANT || - key->inputs[slot].interp == LP_INTERP_LINEAR || - key->inputs[slot].interp == LP_INTERP_PERSPECTIVE) - load_attribute(gallivm, args, key, key->inputs[slot].src_index); - switch (key->inputs[slot].interp) { case LP_INTERP_CONSTANT: - if (key->flatshade_first) { - emit_constant_coef4(gallivm, args, slot+1, args->v0a); - } - else { - emit_constant_coef4(gallivm, args, slot+1, args->v2a); - } - break; + load_attribute(gallivm, args, key, key->inputs[slot].src_index, attribs); + if (key->flatshade_first) { + emit_constant_coef4(gallivm, args, slot+1, attribs[0]); + } + else { + emit_constant_coef4(gallivm, args, slot+1, attribs[2]); + } + break; case LP_INTERP_LINEAR: - emit_apply_cyl_wrap(gallivm, args, key->inputs[slot].cyl_wrap); - emit_linear_coef(gallivm, args, slot+1); + load_attribute(gallivm, args, key, key->inputs[slot].src_index, attribs); + emit_apply_cyl_wrap(gallivm, args, key->inputs[slot].cyl_wrap, attribs); + emit_linear_coef(gallivm, args, slot+1, attribs); break; case LP_INTERP_PERSPECTIVE: - emit_apply_cyl_wrap(gallivm, args, key->inputs[slot].cyl_wrap); - emit_perspective_coef(gallivm, args, slot+1); + 
load_attribute(gallivm, args, key, key->inputs[slot].src_index, attribs); + emit_apply_cyl_wrap(gallivm, args, key->inputs[slot].cyl_wrap, attribs); + apply_perspective_corr(gallivm, args, slot+1, attribs); + emit_linear_coef(gallivm, args, slot+1, attribs); break; case LP_INTERP_POSITION: @@ -591,62 +576,6 @@ emit_tri_coef( struct gallivm_state *gallivm, } -/* XXX: This is generic code, share with fs/vs codegen: - */ -static lp_jit_setup_triangle -finalize_function(struct gallivm_state *gallivm, - LLVMBuilderRef builder, - LLVMValueRef function) -{ - void *f; - - /* Verify the LLVM IR. If invalid, dump and abort */ -#ifdef DEBUG - if (LLVMVerifyFunction(function, LLVMPrintMessageAction)) { - if (1) - lp_debug_dump_value(function); - abort(); - } -#endif - - /* Apply optimizations to LLVM IR */ - LLVMRunFunctionPassManager(gallivm->passmgr, function); - - if (gallivm_debug & GALLIVM_DEBUG_IR) - { - /* Print the LLVM IR to stderr */ - lp_debug_dump_value(function); - debug_printf("\n"); - } - - /* - * Translate the LLVM IR into machine code. - */ - f = LLVMGetPointerToGlobal(gallivm->engine, function); - - if (gallivm_debug & GALLIVM_DEBUG_ASM) - { - lp_disassemble(f); - } - - lp_func_delete_body(function); - - return (lp_jit_setup_triangle) pointer_to_func(f); -} - -/* XXX: Generic code: - */ -static void -lp_emit_emms(struct gallivm_state *gallivm) -{ -#ifdef PIPE_ARCH_X86 - /* Avoid corrupting the FPU stack on 32bit OSes. 
*/ - lp_build_intrinsic(gallivm->builder, "llvm.x86.mmx.emms", - LLVMVoidTypeInContext(gallivm->context), NULL, 0); -#endif -} - - /* XXX: generic code: */ static void @@ -664,49 +593,70 @@ set_noalias(LLVMBuilderRef builder, static void init_args(struct gallivm_state *gallivm, - struct lp_setup_args *args, - const struct lp_setup_variant *variant) + const struct lp_setup_variant_key *key, + struct lp_setup_args *args) { LLVMBuilderRef b = gallivm->builder; + LLVMTypeRef shuf_type = LLVMInt32TypeInContext(gallivm->context); + LLVMValueRef onef = lp_build_const_float(gallivm, 1.0); + LLVMValueRef onei = lp_build_const_int32(gallivm, 1); + LLVMValueRef zeroi = lp_build_const_int32(gallivm, 0); + LLVMValueRef pixel_center, xy0_center, dxy01, dxy20, dyx20; + LLVMValueRef e, f, ef, ooa; + LLVMValueRef shuffles[4]; + LLVMValueRef attr_pos[3]; + struct lp_type typef4 = lp_type_float_vec(32, 128); - LLVMValueRef v0_x = vert_attrib(gallivm, args->v0, 0, 0, "v0_x"); - LLVMValueRef v0_y = vert_attrib(gallivm, args->v0, 0, 1, "v0_y"); + /* The internal position input is in slot zero: + */ + load_attribute(gallivm, args, key, 0, attr_pos); - LLVMValueRef v1_x = vert_attrib(gallivm, args->v1, 0, 0, "v1_x"); - LLVMValueRef v1_y = vert_attrib(gallivm, args->v1, 0, 1, "v1_y"); + pixel_center = lp_build_const_vec(gallivm, typef4, + key->pixel_center_half ? 0.5 : 0.0); - LLVMValueRef v2_x = vert_attrib(gallivm, args->v2, 0, 0, "v2_x"); - LLVMValueRef v2_y = vert_attrib(gallivm, args->v2, 0, 1, "v2_y"); + /* + * xy are first two elems in v0a/v1a/v2a but just use vec4 arit + * also offset_tri uses actually xyz in them + */ + xy0_center = LLVMBuildFSub(b, attr_pos[0], pixel_center, "xy0_center" ); - LLVMValueRef pixel_center = lp_build_const_float(gallivm, - variant->key.pixel_center_half ? 
0.5 : 0); + dxy01 = LLVMBuildFSub(b, attr_pos[0], attr_pos[1], "dxy01"); + dxy20 = LLVMBuildFSub(b, attr_pos[2], attr_pos[0], "dxy20"); - LLVMValueRef x0_center = LLVMBuildFSub(b, v0_x, pixel_center, "x0_center" ); - LLVMValueRef y0_center = LLVMBuildFSub(b, v0_y, pixel_center, "y0_center" ); - - LLVMValueRef dx01 = LLVMBuildFSub(b, v0_x, v1_x, "dx01"); - LLVMValueRef dy01 = LLVMBuildFSub(b, v0_y, v1_y, "dy01"); - LLVMValueRef dx20 = LLVMBuildFSub(b, v2_x, v0_x, "dx20"); - LLVMValueRef dy20 = LLVMBuildFSub(b, v2_y, v0_y, "dy20"); + shuffles[0] = onei; + shuffles[1] = zeroi; + shuffles[2] = LLVMGetUndef(shuf_type); + shuffles[3] = LLVMGetUndef(shuf_type); + + dyx20 = LLVMBuildShuffleVector(b, dxy20, dxy20, LLVMConstVector(shuffles, 4), ""); + + ef = LLVMBuildFMul(b, dxy01, dyx20, "ef"); + e = LLVMBuildExtractElement(b, ef, zeroi, ""); + f = LLVMBuildExtractElement(b, ef, onei, ""); - LLVMValueRef one = lp_build_const_float(gallivm, 1.0); - LLVMValueRef e = LLVMBuildFMul(b, dx01, dy20, "e"); - LLVMValueRef f = LLVMBuildFMul(b, dx20, dy01, "f"); - LLVMValueRef ooa = LLVMBuildFDiv(b, one, LLVMBuildFSub(b, e, f, ""), "ooa"); + ooa = LLVMBuildFDiv(b, onef, LLVMBuildFSub(b, e, f, ""), "ooa"); - LLVMValueRef dy20_ooa = LLVMBuildFMul(b, dy20, ooa, "dy20_ooa"); - LLVMValueRef dy01_ooa = LLVMBuildFMul(b, dy01, ooa, "dy01_ooa"); - LLVMValueRef dx20_ooa = LLVMBuildFMul(b, dx20, ooa, "dx20_ooa"); - LLVMValueRef dx01_ooa = LLVMBuildFMul(b, dx01, ooa, "dx01_ooa"); + ooa = vec4f_from_scalar(gallivm, ooa, ""); + + /* tri offset calc shares a lot of arithmetic, do it here */ + if (key->scale != 0.0f || key->units != 0.0f) { + lp_do_offset_tri(gallivm, args, key, ooa, dxy01, dxy20, attr_pos); + } - args->dy20_ooa = vec4f_from_scalar(gallivm, dy20_ooa, "dy20_ooa_4f"); - args->dy01_ooa = vec4f_from_scalar(gallivm, dy01_ooa, "dy01_ooa_4f"); + dxy20 = LLVMBuildFMul(b, dxy20, ooa, ""); + dxy01 = LLVMBuildFMul(b, dxy01, ooa, ""); - args->dx20_ooa = vec4f_from_scalar(gallivm, dx20_ooa, 
"dx20_ooa_4f"); - args->dx01_ooa = vec4f_from_scalar(gallivm, dx01_ooa, "dx01_ooa_4f"); + args->dy20_ooa = lp_build_extract_broadcast(gallivm, typef4, typef4, dxy20, onei); + args->dy01_ooa = lp_build_extract_broadcast(gallivm, typef4, typef4, dxy01, onei); - args->x0_center = vec4f_from_scalar(gallivm, x0_center, "x0_center_4f"); - args->y0_center = vec4f_from_scalar(gallivm, y0_center, "y0_center_4f"); + args->dx20_ooa = lp_build_extract_broadcast(gallivm, typef4, typef4, dxy20, zeroi); + args->dx01_ooa = lp_build_extract_broadcast(gallivm, typef4, typef4, dxy01, zeroi); + + args->x0_center = lp_build_extract_broadcast(gallivm, typef4, typef4, xy0_center, zeroi); + args->y0_center = lp_build_extract_broadcast(gallivm, typef4, typef4, xy0_center, onei); + + /* might want to merge that with other coef emit in the future */ + emit_position_coef(gallivm, args, 0, attr_pos); } /** @@ -714,18 +664,18 @@ init_args(struct gallivm_state *gallivm, * */ static struct lp_setup_variant * -generate_setup_variant(struct gallivm_state *gallivm, - struct lp_setup_variant_key *key, +generate_setup_variant(struct lp_setup_variant_key *key, struct llvmpipe_context *lp) { struct lp_setup_variant *variant = NULL; + struct gallivm_state *gallivm; struct lp_setup_args args; char func_name[256]; LLVMTypeRef vec4f_type; LLVMTypeRef func_type; LLVMTypeRef arg_types[7]; LLVMBasicBlockRef block; - LLVMBuilderRef builder = gallivm->builder; + LLVMBuilderRef builder; int64_t t0 = 0, t1; if (0) @@ -735,6 +685,13 @@ generate_setup_variant(struct gallivm_state *gallivm, if (variant == NULL) goto fail; + variant->gallivm = gallivm = gallivm_create(); + if (!variant->gallivm) { + goto fail; + } + + builder = gallivm->builder; + if (LP_DEBUG & DEBUG_COUNTERS) { t0 = os_time_get(); } @@ -793,14 +750,17 @@ generate_setup_variant(struct gallivm_state *gallivm, LLVMPositionBuilderAtEnd(builder, block); set_noalias(builder, variant->function, arg_types, Elements(arg_types)); - init_args(gallivm, &args, 
variant); + init_args(gallivm, &variant->key, &args); emit_tri_coef(gallivm, &variant->key, &args); - lp_emit_emms(gallivm); LLVMBuildRetVoid(builder); - variant->jit_function = finalize_function(gallivm, builder, - variant->function); + gallivm_verify_function(gallivm, variant->function); + + gallivm_compile_module(gallivm); + + variant->jit_function = (lp_jit_setup_triangle) + gallivm_jit_function(gallivm, variant->function); if (!variant->jit_function) goto fail; @@ -818,10 +778,12 @@ generate_setup_variant(struct gallivm_state *gallivm, fail: if (variant) { if (variant->function) { - if (variant->jit_function) - LLVMFreeMachineCodeForFunction(gallivm->engine, - variant->function); - LLVMDeleteFunction(variant->function); + gallivm_free_function(gallivm, + variant->function, + variant->jit_function); + } + if (variant->gallivm) { + gallivm_destroy(variant->gallivm); } FREE(variant); } @@ -882,10 +844,13 @@ remove_setup_variant(struct llvmpipe_context *lp, } if (variant->function) { - if (variant->jit_function) - LLVMFreeMachineCodeForFunction(lp->gallivm->engine, - variant->function); - LLVMDeleteFunction(variant->function); + gallivm_free_function(variant->gallivm, + variant->function, + variant->jit_function); + } + + if (variant->gallivm) { + gallivm_destroy(variant->gallivm); } remove_from_list(&variant->list_item_global); @@ -954,7 +919,7 @@ llvmpipe_update_setup(struct llvmpipe_context *lp) cull_setup_variants(lp); } - variant = generate_setup_variant(lp->gallivm, key, lp); + variant = generate_setup_variant(key, lp); if (variant) { insert_at_head(&lp->setup_variants_list, &variant->list_item_global); lp->nr_setup_variants++; diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.h b/src/gallium/drivers/llvmpipe/lp_state_setup.h index 609c4f6..e0abe46 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_setup.h +++ b/src/gallium/drivers/llvmpipe/lp_state_setup.h @@ -55,6 +55,8 @@ struct lp_setup_variant { struct lp_setup_variant_list_item list_item_global; 
+ struct gallivm_state *gallivm; + /* XXX: this is a pointer to the LLVM IR. Once jit_function is * generated, we never need to use the IR again - need to find a * way to release this data without destroying the generated @@ -69,15 +71,6 @@ struct lp_setup_variant { unsigned no; }; -void lp_setup_tri_fallback( const float (*v0)[4], - const float (*v1)[4], - const float (*v2)[4], - boolean front_facing, - float (*a0)[4], - float (*dadx)[4], - float (*dady)[4], - const struct lp_setup_variant_key *key ); - void lp_delete_setup_variants(struct llvmpipe_context *lp); void diff --git a/src/gallium/drivers/llvmpipe/lp_test.h b/src/gallium/drivers/llvmpipe/lp_test.h index c64f3e1..4b6c8a7 100644 --- a/src/gallium/drivers/llvmpipe/lp_test.h +++ b/src/gallium/drivers/llvmpipe/lp_test.h @@ -42,11 +42,6 @@ #include <float.h> #include "gallivm/lp_bld.h" -#include <llvm-c/Analysis.h> -#include <llvm-c/ExecutionEngine.h> -#include <llvm-c/Target.h> -#include <llvm-c/BitWriter.h> -#include <llvm-c/Transforms/Scalar.h> #include "pipe/p_state.h" #include "util/u_format.h" @@ -64,14 +59,14 @@ write_tsv_header(FILE *fp); boolean -test_some(struct gallivm_state *gallivm,unsigned verbose, FILE *fp, +test_some(unsigned verbose, FILE *fp, unsigned long n); boolean -test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp); +test_single(unsigned verbose, FILE *fp); boolean -test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp); +test_all(unsigned verbose, FILE *fp); #if defined(PIPE_CC_MSVC) diff --git a/src/gallium/drivers/llvmpipe/lp_test_arit.c b/src/gallium/drivers/llvmpipe/lp_test_arit.c index 45ca32f..6e09f7e 100644 --- a/src/gallium/drivers/llvmpipe/lp_test_arit.c +++ b/src/gallium/drivers/llvmpipe/lp_test_arit.c @@ -53,7 +53,7 @@ write_tsv_header(FILE *fp) } -typedef float (*unary_func_t)(float); +typedef void (*unary_func_t)(float *out, const float *in); /** @@ -180,6 +180,45 @@ const float sincos_values[] = { 5*M_PI/4, }; +const float round_values[] = 
{ + -10.0, -1, 0.0, 12.0, + -1.49, -0.25, 1.25, 2.51, + -0.99, -0.01, 0.01, 0.99, +}; + +static float fractf(float x) +{ + x -= floorf(x); + if (x >= 1.0f) { + // clamp to the largest number smaller than one + x = 1.0f - 0.5f*FLT_EPSILON; + } + return x; +} + + +const float fract_values[] = { + // http://en.wikipedia.org/wiki/IEEE_754-1985#Examples + 0.0f, + -0.0f, + 1.0f, + -1.0f, + 0.5f, + -0.5f, + 1.401298464324817e-45f, // smallest denormal + -1.401298464324817e-45f, + 5.88e-39f, // middle denormal + 1.18e-38f, // largest denormal + -1.18e-38f, + -1.62981451e-08f, + FLT_EPSILON, + -FLT_EPSILON, + 1.0f - 0.5f*FLT_EPSILON, + -1.0f + FLT_EPSILON, + FLT_MAX, + -FLT_MAX +}; + /* * Unary test cases. @@ -196,6 +235,11 @@ unary_tests[] = { {"sin", &lp_build_sin, &sinf, sincos_values, Elements(sincos_values), 20.0 }, {"cos", &lp_build_cos, &cosf, sincos_values, Elements(sincos_values), 20.0 }, {"sgn", &lp_build_sgn, &sgnf, exp2_values, Elements(exp2_values), 20.0 }, + {"round", &lp_build_round, &roundf, round_values, Elements(round_values), 24.0 }, + {"trunc", &lp_build_trunc, &truncf, round_values, Elements(round_values), 24.0 }, + {"floor", &lp_build_floor, &floorf, round_values, Elements(round_values), 24.0 }, + {"ceil", &lp_build_ceil, &ceilf, round_values, Elements(round_values), 24.0 }, + {"fract", &lp_build_fract_safe, &fractf, fract_values, Elements(fract_values), 24.0 }, }; @@ -204,39 +248,40 @@ unary_tests[] = { */ static LLVMValueRef build_unary_test_func(struct gallivm_state *gallivm, - LLVMModuleRef module, - LLVMContextRef context, const struct unary_test_t *test) { - struct lp_type type = lp_type_float_vec(32); - LLVMTypeRef i32t = LLVMInt32TypeInContext(context); - LLVMTypeRef f32t = LLVMFloatTypeInContext(context); + struct lp_type type = lp_type_float_vec(32, lp_native_vector_width); + LLVMContextRef context = gallivm->context; + LLVMModuleRef module = gallivm->module; LLVMTypeRef vf32t = lp_build_vec_type(gallivm, type); - LLVMTypeRef args[1] = { f32t 
}; - LLVMValueRef func = LLVMAddFunction(module, test->name, LLVMFunctionType(f32t, args, Elements(args), 0)); - LLVMValueRef arg1 = LLVMGetParam(func, 0); + LLVMTypeRef args[2] = { LLVMPointerType(vf32t, 0), LLVMPointerType(vf32t, 0) }; + LLVMValueRef func = LLVMAddFunction(module, test->name, + LLVMFunctionType(LLVMVoidTypeInContext(context), + args, Elements(args), 0)); + LLVMValueRef arg0 = LLVMGetParam(func, 0); + LLVMValueRef arg1 = LLVMGetParam(func, 1); LLVMBuilderRef builder = gallivm->builder; LLVMBasicBlockRef block = LLVMAppendBasicBlockInContext(context, func, "entry"); - LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); LLVMValueRef ret; struct lp_build_context bld; - lp_build_context_init(&bld, gallivm, lp_type_float_vec(32)); + lp_build_context_init(&bld, gallivm, type); LLVMSetFunctionCallConv(func, LLVMCCallConv); LLVMPositionBuilderAtEnd(builder, block); - /* scalar to vector */ - arg1 = LLVMBuildInsertElement(builder, LLVMGetUndef(vf32t), arg1, index0, ""); + arg1 = LLVMBuildLoad(builder, arg1, ""); ret = test->builder(&bld, arg1); - /* vector to scalar */ - ret = LLVMBuildExtractElement(builder, ret, index0, ""); + LLVMBuildStore(builder, ret, arg0); + + LLVMBuildRetVoid(builder); + + gallivm_verify_function(gallivm, func); - LLVMBuildRet(builder, ret); return func; } @@ -245,67 +290,86 @@ build_unary_test_func(struct gallivm_state *gallivm, * Test one LLVM unary arithmetic builder function. 
*/ static boolean -test_unary(struct gallivm_state *gallivm, unsigned verbose, FILE *fp, const struct unary_test_t *test) +test_unary(unsigned verbose, FILE *fp, const struct unary_test_t *test) { - LLVMModuleRef module = gallivm->module; + struct gallivm_state *gallivm; LLVMValueRef test_func; - LLVMExecutionEngineRef engine = gallivm->engine; - LLVMContextRef context = gallivm->context; - char *error = NULL; unary_func_t test_func_jit; boolean success = TRUE; - int i; + int i, j; + int length = lp_native_vector_width / 32; + float *in, *out; - test_func = build_unary_test_func(gallivm, module, context, test); + in = align_malloc(length * 4, length * 4); + out = align_malloc(length * 4, length * 4); - if (LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) { - printf("LLVMVerifyModule: %s\n", error); - LLVMDumpModule(module); - abort(); + /* random NaNs or 0s could wreak havoc */ + for (i = 0; i < length; i++) { + in[i] = 1.0; } - LLVMDisposeMessage(error); - test_func_jit = (unary_func_t) pointer_to_func(LLVMGetPointerToGlobal(engine, test_func)); + gallivm = gallivm_create(); - for (i = 0; i < test->num_values; ++i) { - float value = test->values[i]; - float ref = test->ref(value); - float src = test_func_jit(value); + test_func = build_unary_test_func(gallivm, test); - double error = fabs(src - ref); - double precision = error ? -log2(error/fabs(ref)) : FLT_MANT_DIG; + gallivm_compile_module(gallivm); - bool pass = precision >= test->precision; + test_func_jit = (unary_func_t) gallivm_jit_function(gallivm, test_func); - if (isnan(ref)) { - continue; - } + for (j = 0; j < (test->num_values + length - 1) / length; j++) { + int num_vals = ((j + 1) * length <= test->num_values) ? length : + test->num_values % length; - if (!pass || verbose) { - printf("%s(%.9g): ref = %.9g, src = %.9g, precision = %f bits, %s\n", - test->name, value, ref, src, precision, - pass ? 
"PASS" : "FAIL"); + for (i = 0; i < num_vals; ++i) { + in[i] = test->values[i+j*length]; } - if (!pass) { - success = FALSE; + test_func_jit(out, in); + for (i = 0; i < num_vals; ++i) { + float ref = test->ref(in[i]); + double error, precision; + bool pass; + + error = fabs(out[i] - ref); + precision = error ? -log2(error/fabs(ref)) : FLT_MANT_DIG; + + pass = precision >= test->precision; + + if (isnan(ref)) { + continue; + } + + if (!pass || verbose) { + printf("%s(%.9g): ref = %.9g, out = %.9g, precision = %f bits, %s\n", + test->name, in[i], ref, out[i], precision, + pass ? "PASS" : "FAIL"); + } + + if (!pass) { + success = FALSE; + } } } - LLVMFreeMachineCodeForFunction(engine, test_func); + gallivm_free_function(gallivm, test_func, test_func_jit); + + gallivm_destroy(gallivm); + + align_free(in); + align_free(out); return success; } boolean -test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) +test_all(unsigned verbose, FILE *fp) { boolean success = TRUE; int i; for (i = 0; i < Elements(unary_tests); ++i) { - if (!test_unary(gallivm, verbose, fp, &unary_tests[i])) { + if (!test_unary(verbose, fp, &unary_tests[i])) { success = FALSE; } } @@ -315,19 +379,19 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) boolean -test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp, +test_some(unsigned verbose, FILE *fp, unsigned long n) { /* * Not randomly generated test cases, so test all. 
*/ - return test_all(gallivm, verbose, fp); + return test_all(verbose, fp); } boolean -test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) +test_single(unsigned verbose, FILE *fp) { return TRUE; } diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c index 51324cb..37b37fd 100644 --- a/src/gallium/drivers/llvmpipe/lp_test_blend.c +++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c @@ -36,6 +36,7 @@ * @author Brian Paul <brian@vmware.com> */ +#include "util/u_memory.h" #include "gallivm/lp_bld_init.h" #include "gallivm/lp_bld_type.h" @@ -53,19 +54,6 @@ enum vector_mode typedef void (*blend_test_ptr_t)(const void *src, const void *dst, const void *con, void *res); -/** cast wrapper */ -static blend_test_ptr_t -voidptr_to_blend_test_ptr_t(void *p) -{ - union { - void *v; - blend_test_ptr_t f; - } u; - u.v = p; - return u.f; -} - - void write_tsv_header(FILE *fp) @@ -468,50 +456,43 @@ compute_blend_ref(const struct pipe_blend_state *blend, PIPE_ALIGN_STACK static boolean -test_one(struct gallivm_state *gallivm, - unsigned verbose, +test_one(unsigned verbose, FILE *fp, const struct pipe_blend_state *blend, enum vector_mode mode, struct lp_type type) { - LLVMModuleRef module = gallivm->module; + struct gallivm_state *gallivm; LLVMValueRef func = NULL; - LLVMExecutionEngineRef engine = gallivm->engine; - char *error = NULL; blend_test_ptr_t blend_test_ptr; boolean success; const unsigned n = LP_TEST_NUM_SAMPLES; int64_t cycles[LP_TEST_NUM_SAMPLES]; double cycles_avg = 0.0; unsigned i, j; - void *code; + const unsigned stride = lp_type_width(type)/8; if(verbose >= 1) dump_blend_type(stdout, blend, mode, type); - func = add_blend_test(gallivm, blend, mode, type); + gallivm = gallivm_create(); - if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) { - LLVMDumpModule(module); - abort(); - } - LLVMDisposeMessage(error); + func = add_blend_test(gallivm, blend, mode, type); - code = 
LLVMGetPointerToGlobal(engine, func); - blend_test_ptr = voidptr_to_blend_test_ptr_t(code); + gallivm_compile_module(gallivm); - if(verbose >= 2) - lp_disassemble(code); + blend_test_ptr = (blend_test_ptr_t)gallivm_jit_function(gallivm, func); success = TRUE; - for(i = 0; i < n && success; ++i) { - if(mode == AoS) { - PIPE_ALIGN_VAR(16) uint8_t src[LP_NATIVE_VECTOR_WIDTH/8]; - PIPE_ALIGN_VAR(16) uint8_t dst[LP_NATIVE_VECTOR_WIDTH/8]; - PIPE_ALIGN_VAR(16) uint8_t con[LP_NATIVE_VECTOR_WIDTH/8]; - PIPE_ALIGN_VAR(16) uint8_t res[LP_NATIVE_VECTOR_WIDTH/8]; - PIPE_ALIGN_VAR(16) uint8_t ref[LP_NATIVE_VECTOR_WIDTH/8]; + if(mode == AoS) { + uint8_t *src, *dst, *con, *res, *ref; + src = align_malloc(stride, stride); + dst = align_malloc(stride, stride); + con = align_malloc(stride, stride); + res = align_malloc(stride, stride); + ref = align_malloc(stride, stride); + + for(i = 0; i < n && success; ++i) { int64_t start_counter = 0; int64_t end_counter = 0; @@ -569,14 +550,21 @@ test_one(struct gallivm_state *gallivm, fprintf(stderr, "\n"); } } - - if(mode == SoA) { - const unsigned stride = type.length*type.width/8; - PIPE_ALIGN_VAR(16) uint8_t src[4*LP_NATIVE_VECTOR_WIDTH/8]; - PIPE_ALIGN_VAR(16) uint8_t dst[4*LP_NATIVE_VECTOR_WIDTH/8]; - PIPE_ALIGN_VAR(16) uint8_t con[4*LP_NATIVE_VECTOR_WIDTH/8]; - PIPE_ALIGN_VAR(16) uint8_t res[4*LP_NATIVE_VECTOR_WIDTH/8]; - PIPE_ALIGN_VAR(16) uint8_t ref[4*LP_NATIVE_VECTOR_WIDTH/8]; + align_free(src); + align_free(dst); + align_free(con); + align_free(res); + align_free(ref); + } + else if(mode == SoA) { + uint8_t *src, *dst, *con, *res, *ref; + src = align_malloc(4*stride, stride); + dst = align_malloc(4*stride, stride); + con = align_malloc(4*stride, stride); + res = align_malloc(4*stride, stride); + ref = align_malloc(4*stride, stride); + + for(i = 0; i < n && success; ++i) { int64_t start_counter = 0; int64_t end_counter = 0; boolean mismatch; @@ -651,6 +639,11 @@ test_one(struct gallivm_state *gallivm, } } } + align_free(src); + 
align_free(dst); + align_free(con); + align_free(res); + align_free(ref); } /* @@ -687,16 +680,9 @@ test_one(struct gallivm_state *gallivm, if(fp) write_tsv_row(fp, blend, mode, type, cycles_avg, success); - if (!success) { - if(verbose < 2) - LLVMDumpModule(module); - LLVMWriteBitcodeToFile(module, "blend.bc"); - fprintf(stderr, "blend.bc written\n"); - fprintf(stderr, "Invoke as \"llc -o - blend.bc\"\n"); - abort(); - } + gallivm_free_function(gallivm, func, blend_test_ptr); - LLVMFreeMachineCodeForFunction(engine, func); + gallivm_destroy(gallivm); return success; } @@ -753,7 +739,7 @@ const unsigned num_types = sizeof(blend_types)/sizeof(blend_types[0]); boolean -test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) +test_all(unsigned verbose, FILE *fp) { const unsigned *rgb_func; const unsigned *rgb_src_factor; @@ -789,7 +775,7 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) blend.rt[0].alpha_dst_factor = *alpha_dst_factor; blend.rt[0].colormask = PIPE_MASK_RGBA; - if(!test_one(gallivm, verbose, fp, &blend, mode, *type)) + if(!test_one(verbose, fp, &blend, mode, *type)) success = FALSE; } @@ -806,7 +792,7 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) boolean -test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp, +test_some(unsigned verbose, FILE *fp, unsigned long n) { const unsigned *rgb_func; @@ -849,7 +835,7 @@ test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp, blend.rt[0].alpha_dst_factor = *alpha_dst_factor; blend.rt[0].colormask = PIPE_MASK_RGBA; - if(!test_one(gallivm, verbose, fp, &blend, mode, *type)) + if(!test_one(verbose, fp, &blend, mode, *type)) success = FALSE; } @@ -858,7 +844,7 @@ test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp, boolean -test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) +test_single(unsigned verbose, FILE *fp) { printf("no test_single()"); return TRUE; diff --git 
a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c index 0dcb542..71d45bd 100644 --- a/src/gallium/drivers/llvmpipe/lp_test_conv.c +++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c @@ -142,21 +142,21 @@ add_conv_test(struct gallivm_state *gallivm, LLVMBuildRetVoid(builder);; + gallivm_verify_function(gallivm, func); + return func; } PIPE_ALIGN_STACK static boolean -test_one(struct gallivm_state *gallivm, unsigned verbose, +test_one(unsigned verbose, FILE *fp, struct lp_type src_type, struct lp_type dst_type) { - LLVMModuleRef module = gallivm->module; - LLVMExecutionEngineRef engine = gallivm->engine; + struct gallivm_state *gallivm; LLVMValueRef func = NULL; - char *error = NULL; conv_test_ptr_t conv_test_ptr; boolean success; const unsigned n = LP_TEST_NUM_SAMPLES; @@ -166,10 +166,18 @@ test_one(struct gallivm_state *gallivm, unsigned verbose, unsigned num_dsts; double eps; unsigned i, j; - void *code; - if (src_type.width * src_type.length != dst_type.width * dst_type.length && - src_type.length != dst_type.length) { + if ((src_type.width >= dst_type.width && src_type.length > dst_type.length) || + (src_type.width <= dst_type.width && src_type.length < dst_type.length)) { + return TRUE; + } + + /* Known failures + * - fixed point 32 -> float 32 + * - float 32 -> signed normalised integer 32 + */ + if ((src_type.floating && !dst_type.floating && dst_type.sign && dst_type.norm && src_type.width == dst_type.width) || + (!src_type.floating && dst_type.floating && src_type.fixed && src_type.width == dst_type.width)) { return TRUE; } @@ -183,7 +191,7 @@ test_one(struct gallivm_state *gallivm, unsigned verbose, } if(verbose >= 1) - dump_conv_types(stdout, src_type, dst_type); + dump_conv_types(stderr, src_type, dst_type); if (src_type.length > dst_type.length) { num_srcs = 1; @@ -203,29 +211,20 @@ test_one(struct gallivm_state *gallivm, unsigned verbose, eps = MAX2(lp_const_eps(src_type), lp_const_eps(dst_type)); - func = 
add_conv_test(gallivm, src_type, num_srcs, dst_type, num_dsts); + gallivm = gallivm_create(); - if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) { - LLVMDumpModule(module); - abort(); - } - LLVMDisposeMessage(error); - - if(verbose >= 2) - LLVMDumpModule(module); + func = add_conv_test(gallivm, src_type, num_srcs, dst_type, num_dsts); - code = LLVMGetPointerToGlobal(engine, func); - conv_test_ptr = (conv_test_ptr_t)pointer_to_func(code); + gallivm_compile_module(gallivm); - if(verbose >= 2) - lp_disassemble(code); + conv_test_ptr = (conv_test_ptr_t)gallivm_jit_function(gallivm, func); success = TRUE; for(i = 0; i < n && success; ++i) { unsigned src_stride = src_type.length*src_type.width/8; unsigned dst_stride = dst_type.length*dst_type.width/8; - PIPE_ALIGN_VAR(16) uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH]; - PIPE_ALIGN_VAR(16) uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH]; + PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN) uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH]; + PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN) uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH]; double fref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH]; uint8_t ref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH]; int64_t start_counter = 0; @@ -320,20 +319,9 @@ test_one(struct gallivm_state *gallivm, unsigned verbose, if(fp) write_tsv_row(fp, src_type, dst_type, cycles_avg, success); - if (!success) { - static boolean firsttime = TRUE; - if(firsttime) { - if(verbose < 2) - LLVMDumpModule(module); - LLVMWriteBitcodeToFile(module, "conv.bc"); - fprintf(stderr, "conv.bc written\n"); - fprintf(stderr, "Invoke as \"llc -o - conv.bc\"\n"); - firsttime = FALSE; - /* abort(); */ - } - } + gallivm_free_function(gallivm, func, conv_test_ptr); - LLVMFreeMachineCodeForFunction(engine, func); + gallivm_destroy(gallivm); return success; } @@ -348,18 +336,33 @@ const struct lp_type conv_types[] = { { TRUE, FALSE, FALSE, TRUE, 32, 4 }, { TRUE, FALSE, FALSE, FALSE, 32, 4 }, + { TRUE, FALSE, 
TRUE, TRUE, 32, 8 }, + { TRUE, FALSE, TRUE, FALSE, 32, 8 }, + { TRUE, FALSE, FALSE, TRUE, 32, 8 }, + { TRUE, FALSE, FALSE, FALSE, 32, 8 }, + /* Fixed */ { FALSE, TRUE, TRUE, TRUE, 32, 4 }, { FALSE, TRUE, TRUE, FALSE, 32, 4 }, { FALSE, TRUE, FALSE, TRUE, 32, 4 }, { FALSE, TRUE, FALSE, FALSE, 32, 4 }, + { FALSE, TRUE, TRUE, TRUE, 32, 8 }, + { FALSE, TRUE, TRUE, FALSE, 32, 8 }, + { FALSE, TRUE, FALSE, TRUE, 32, 8 }, + { FALSE, TRUE, FALSE, FALSE, 32, 8 }, + /* Integer */ { FALSE, FALSE, TRUE, TRUE, 32, 4 }, { FALSE, FALSE, TRUE, FALSE, 32, 4 }, { FALSE, FALSE, FALSE, TRUE, 32, 4 }, { FALSE, FALSE, FALSE, FALSE, 32, 4 }, + { FALSE, FALSE, TRUE, TRUE, 32, 8 }, + { FALSE, FALSE, TRUE, FALSE, 32, 8 }, + { FALSE, FALSE, FALSE, TRUE, 32, 8 }, + { FALSE, FALSE, FALSE, FALSE, 32, 8 }, + { FALSE, FALSE, TRUE, TRUE, 16, 8 }, { FALSE, FALSE, TRUE, FALSE, 16, 8 }, { FALSE, FALSE, FALSE, TRUE, 16, 8 }, @@ -381,7 +384,7 @@ const unsigned num_types = sizeof(conv_types)/sizeof(conv_types[0]); boolean -test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) +test_all(unsigned verbose, FILE *fp) { const struct lp_type *src_type; const struct lp_type *dst_type; @@ -394,7 +397,7 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) if(src_type == dst_type) continue; - if(!test_one(gallivm, verbose, fp, *src_type, *dst_type)){ + if(!test_one(verbose, fp, *src_type, *dst_type)){ success = FALSE; ++error_count; } @@ -408,7 +411,7 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) boolean -test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp, +test_some(unsigned verbose, FILE *fp, unsigned long n) { const struct lp_type *src_type; @@ -423,7 +426,7 @@ test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp, dst_type = &conv_types[rand() % num_types]; } while (src_type == dst_type || src_type->norm != dst_type->norm); - if(!test_one(gallivm, verbose, fp, *src_type, *dst_type)) + if(!test_one(verbose, fp, *src_type, 
*dst_type)) success = FALSE; } @@ -432,7 +435,7 @@ test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp, boolean -test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) +test_single(unsigned verbose, FILE *fp) { /* float, fixed, sign, norm, width, len */ struct lp_type f32x4_type = @@ -442,7 +445,7 @@ test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) boolean success; - success = test_one(gallivm, verbose, fp, f32x4_type, ub8x4_type); + success = test_one(verbose, fp, f32x4_type, ub8x4_type); return success; } diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c index daf6ded..34cbdbd 100644 --- a/src/gallium/drivers/llvmpipe/lp_test_format.c +++ b/src/gallium/drivers/llvmpipe/lp_test_format.c @@ -83,7 +83,6 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose, LLVMContextRef context = gallivm->context; LLVMModuleRef module = gallivm->module; LLVMBuilderRef builder = gallivm->builder; - LLVMPassManagerRef passmgr = gallivm->passmgr; LLVMTypeRef args[4]; LLVMValueRef func; LLVMValueRef packed_ptr; @@ -120,16 +119,7 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose, LLVMBuildRetVoid(builder); - if (LLVMVerifyFunction(func, LLVMPrintMessageAction)) { - LLVMDumpValue(func); - abort(); - } - - LLVMRunFunctionPassManager(passmgr, func); - - if (verbose >= 1) { - LLVMDumpValue(func); - } + gallivm_verify_function(gallivm, func); return func; } @@ -137,26 +127,24 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose, PIPE_ALIGN_STACK static boolean -test_format_float(struct gallivm_state *gallivm, unsigned verbose, FILE *fp, +test_format_float(unsigned verbose, FILE *fp, const struct util_format_description *desc) { + struct gallivm_state *gallivm; LLVMValueRef fetch = NULL; - LLVMExecutionEngineRef engine = gallivm->engine; fetch_ptr_t fetch_ptr; PIPE_ALIGN_VAR(16) float unpacked[4]; boolean first = TRUE; boolean 
success = TRUE; unsigned i, j, k, l; - void *f; + + gallivm = gallivm_create(); fetch = add_fetch_rgba_test(gallivm, verbose, desc, lp_float32_vec4_type()); - f = LLVMGetPointerToGlobal(engine, fetch); - fetch_ptr = (fetch_ptr_t) pointer_to_func(f); + gallivm_compile_module(gallivm); - if (verbose >= 2) { - lp_disassemble(f); - } + fetch_ptr = (fetch_ptr_t) gallivm_jit_function(gallivm, fetch); for (l = 0; l < util_format_nr_test_cases; ++l) { const struct util_format_test_case *test = &util_format_test_cases[l]; @@ -171,25 +159,35 @@ test_format_float(struct gallivm_state *gallivm, unsigned verbose, FILE *fp, for (i = 0; i < desc->block.height; ++i) { for (j = 0; j < desc->block.width; ++j) { - boolean match; + boolean match = TRUE; memset(unpacked, 0, sizeof unpacked); fetch_ptr(unpacked, test->packed, j, i); - match = TRUE; - for(k = 0; k < 4; ++k) - if (fabs((float)test->unpacked[i][j][k] - unpacked[k]) > FLT_EPSILON) + for(k = 0; k < 4; ++k) { + if (util_double_inf_sign(test->unpacked[i][j][k]) != util_inf_sign(unpacked[k])) { match = FALSE; + } + + if (util_is_double_nan(test->unpacked[i][j][k]) != util_is_nan(unpacked[k])) { + match = FALSE; + } + + if (!util_is_double_inf_or_nan(test->unpacked[i][j][k]) && + fabs((float)test->unpacked[i][j][k] - unpacked[k]) > FLT_EPSILON) { + match = FALSE; + } + } if (!match) { printf("FAILED\n"); printf(" Packed: %02x %02x %02x %02x\n", test->packed[0], test->packed[1], test->packed[2], test->packed[3]); - printf(" Unpacked (%u,%u): %f %f %f %f obtained\n", + printf(" Unpacked (%u,%u): %.9g %.9g %.9g %.9g obtained\n", j, i, unpacked[0], unpacked[1], unpacked[2], unpacked[3]); - printf(" %f %f %f %f expected\n", + printf(" %.9g %.9g %.9g %.9g expected\n", test->unpacked[i][j][0], test->unpacked[i][j][1], test->unpacked[i][j][2], @@ -201,14 +199,9 @@ test_format_float(struct gallivm_state *gallivm, unsigned verbose, FILE *fp, } } - if (!success) { - if (verbose < 1) { - LLVMDumpValue(fetch); - } - } + 
gallivm_free_function(gallivm, fetch, fetch_ptr); - LLVMFreeMachineCodeForFunction(engine, fetch); - LLVMDeleteFunction(fetch); + gallivm_destroy(gallivm); if(fp) write_tsv_row(fp, desc, success); @@ -219,26 +212,24 @@ test_format_float(struct gallivm_state *gallivm, unsigned verbose, FILE *fp, PIPE_ALIGN_STACK static boolean -test_format_unorm8(struct gallivm_state *gallivm, - unsigned verbose, FILE *fp, +test_format_unorm8(unsigned verbose, FILE *fp, const struct util_format_description *desc) { + struct gallivm_state *gallivm; LLVMValueRef fetch = NULL; fetch_ptr_t fetch_ptr; uint8_t unpacked[4]; boolean first = TRUE; boolean success = TRUE; unsigned i, j, k, l; - void *f; + + gallivm = gallivm_create(); fetch = add_fetch_rgba_test(gallivm, verbose, desc, lp_unorm8_vec4_type()); - f = LLVMGetPointerToGlobal(gallivm->engine, fetch); - fetch_ptr = (fetch_ptr_t) pointer_to_func(f); + gallivm_compile_module(gallivm); - if (verbose >= 2) { - lp_disassemble(f); - } + fetch_ptr = (fetch_ptr_t) gallivm_jit_function(gallivm, fetch); for (l = 0; l < util_format_nr_test_cases; ++l) { const struct util_format_test_case *test = &util_format_test_cases[l]; @@ -285,6 +276,7 @@ test_format_unorm8(struct gallivm_state *gallivm, float_to_ubyte(test->unpacked[i][j][1]), float_to_ubyte(test->unpacked[i][j][2]), float_to_ubyte(test->unpacked[i][j][3])); + success = FALSE; } } @@ -292,11 +284,9 @@ test_format_unorm8(struct gallivm_state *gallivm, } } - if (!success) - LLVMDumpValue(fetch); + gallivm_free_function(gallivm, fetch, fetch_ptr); - LLVMFreeMachineCodeForFunction(gallivm->engine, fetch); - LLVMDeleteFunction(fetch); + gallivm_destroy(gallivm); if(fp) write_tsv_row(fp, desc, success); @@ -308,17 +298,16 @@ test_format_unorm8(struct gallivm_state *gallivm, static boolean -test_one(struct gallivm_state *gallivm, - unsigned verbose, FILE *fp, +test_one(unsigned verbose, FILE *fp, const struct util_format_description *format_desc) { boolean success = TRUE; - if 
(!test_format_float(gallivm, verbose, fp, format_desc)) { + if (!test_format_float(verbose, fp, format_desc)) { success = FALSE; } - if (!test_format_unorm8(gallivm, verbose, fp, format_desc)) { + if (!test_format_unorm8(verbose, fp, format_desc)) { success = FALSE; } @@ -327,7 +316,7 @@ test_one(struct gallivm_state *gallivm, boolean -test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) +test_all(unsigned verbose, FILE *fp) { enum pipe_format format; boolean success = TRUE; @@ -359,7 +348,7 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) continue; } - if (!test_one(gallivm, verbose, fp, format_desc)) { + if (!test_one(verbose, fp, format_desc)) { success = FALSE; } } @@ -369,15 +358,15 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) boolean -test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp, +test_some(unsigned verbose, FILE *fp, unsigned long n) { - return test_all(gallivm, verbose, fp); + return test_all(verbose, fp); } boolean -test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) +test_single(unsigned verbose, FILE *fp) { printf("no test_single()"); return TRUE; diff --git a/src/gallium/drivers/llvmpipe/lp_test_main.c b/src/gallium/drivers/llvmpipe/lp_test_main.c index d229c62..4c61092 100644 --- a/src/gallium/drivers/llvmpipe/lp_test_main.c +++ b/src/gallium/drivers/llvmpipe/lp_test_main.c @@ -39,6 +39,7 @@ #include "gallivm/lp_bld_const.h" #include "gallivm/lp_bld_init.h" +#include "gallivm/lp_bld_debug.h" #include "lp_test.h" @@ -369,7 +370,6 @@ int main(int argc, char **argv) unsigned i; boolean success; boolean single = FALSE; - struct gallivm_state *gallivm; for(i = 1; i < argc; ++i) { if(strcmp(argv[i], "-v") == 0) @@ -384,23 +384,28 @@ int main(int argc, char **argv) lp_build_init(); - gallivm = gallivm_create(); +#ifdef DEBUG + if (verbose >= 2) { + gallivm_debug |= GALLIVM_DEBUG_IR; + gallivm_debug |= GALLIVM_DEBUG_ASM; + } +#endif util_cpu_detect(); 
if(fp) { /* Warm up the caches */ - test_some(gallivm, 0, NULL, 100); + test_some(0, NULL, 100); write_tsv_header(fp); } if (single) - success = test_single(gallivm, verbose, fp); + success = test_single(verbose, fp); else if (n) - success = test_some(gallivm, verbose, fp, n); + success = test_some(verbose, fp, n); else - success = test_all(gallivm, verbose, fp); + success = test_all(verbose, fp); if(fp) fclose(fp); diff --git a/src/gallium/drivers/llvmpipe/lp_test_printf.c b/src/gallium/drivers/llvmpipe/lp_test_printf.c index 620cdb5..c483de9 100644 --- a/src/gallium/drivers/llvmpipe/lp_test_printf.c +++ b/src/gallium/drivers/llvmpipe/lp_test_printf.c @@ -78,66 +78,61 @@ add_printf_test(struct gallivm_state *gallivm) LLVMBuildRetVoid(builder); + gallivm_verify_function(gallivm, func); + return func; } PIPE_ALIGN_STACK static boolean -test_printf(struct gallivm_state *gallivm, - unsigned verbose, FILE *fp, +test_printf(unsigned verbose, FILE *fp, const struct printf_test_case *testcase) { - LLVMExecutionEngineRef engine = gallivm->engine; - LLVMModuleRef module = gallivm->module; + struct gallivm_state *gallivm; LLVMValueRef test; - char *error = NULL; test_printf_t test_printf_func; boolean success = TRUE; - void *code; - test = add_printf_test(gallivm); + gallivm = gallivm_create(); - if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) { - LLVMDumpModule(module); - abort(); - } - LLVMDisposeMessage(error); + test = add_printf_test(gallivm); - code = LLVMGetPointerToGlobal(engine, test); - test_printf_func = (test_printf_t) pointer_to_func(code); + gallivm_compile_module(gallivm); - // LLVMDumpModule(module); + test_printf_func = (test_printf_t) gallivm_jit_function(gallivm, test); test_printf_func(0); - LLVMFreeMachineCodeForFunction(engine, test); + gallivm_free_function(gallivm, test, test_printf_func); + + gallivm_destroy(gallivm); return success; } boolean -test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) +test_all(unsigned 
verbose, FILE *fp) { boolean success = TRUE; - test_printf(gallivm, verbose, fp, NULL); + test_printf(verbose, fp, NULL); return success; } boolean -test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp, +test_some(unsigned verbose, FILE *fp, unsigned long n) { - return test_all(gallivm, verbose, fp); + return test_all(verbose, fp); } boolean -test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) +test_single(unsigned verbose, FILE *fp) { printf("no test_single()"); return TRUE; diff --git a/src/gallium/drivers/llvmpipe/lp_test_round.c b/src/gallium/drivers/llvmpipe/lp_test_round.c deleted file mode 100644 index fc3edf3..0000000 --- a/src/gallium/drivers/llvmpipe/lp_test_round.c +++ /dev/null @@ -1,242 +0,0 @@ -/************************************************************************** - * - * Copyright 2010 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - -#include <stdlib.h> -#include <stdio.h> - -#include "util/u_pointer.h" -#include "gallivm/lp_bld.h" -#include "gallivm/lp_bld_init.h" -#include "gallivm/lp_bld_arit.h" - -#include "lp_test.h" - - -void -write_tsv_header(FILE *fp) -{ - fprintf(fp, - "result\t" - "format\n"); - - fflush(fp); -} - - -#ifdef PIPE_ARCH_SSE - -# include <emmintrin.h> - -typedef __m128 (*test_round_t)(__m128); - -typedef LLVMValueRef (*lp_func_t)(struct lp_build_context *, LLVMValueRef); - - -static LLVMValueRef -add_test(struct gallivm_state *gallivm, const char *name, lp_func_t lp_func) -{ - LLVMModuleRef module = gallivm->module; - LLVMContextRef context = gallivm->context; - LLVMBuilderRef builder = gallivm->builder; - - LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatTypeInContext(context), 4); - LLVMTypeRef args[1] = { v4sf }; - LLVMValueRef func = LLVMAddFunction(module, name, LLVMFunctionType(v4sf, args, 1, 0)); - LLVMValueRef arg1 = LLVMGetParam(func, 0); - LLVMBasicBlockRef block = LLVMAppendBasicBlockInContext(context, func, "entry"); - LLVMValueRef ret; - struct lp_build_context bld; - - lp_build_context_init(&bld, gallivm, lp_float32_vec4_type()); - - LLVMSetFunctionCallConv(func, LLVMCCallConv); - - LLVMPositionBuilderAtEnd(builder, block); - - ret = lp_func(&bld, arg1); - - LLVMBuildRet(builder, ret); - - return func; -} - -static void -printv(char* string, __m128 value) -{ - __m128 v = value; - float *f = (float *)&v; - printf("%s: %10f %10f %10f %10f\n", string, - f[0], f[1], f[2], f[3]); -} - -static boolean -compare(__m128 x, __m128 y) -{ - boolean success = TRUE; - float *xp = (float *) &x; - float *yp = (float *) &y; - if 
(xp[0] != yp[0] || - xp[1] != yp[1] || - xp[2] != yp[2] || - xp[3] != yp[3]) { - printf(" Incorrect result! ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ \n"); - success = FALSE; - } - return success; -} - - - -PIPE_ALIGN_STACK -static boolean -test_round(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) -{ - LLVMModuleRef module = gallivm->module; - LLVMValueRef test_round = NULL, test_trunc, test_floor, test_ceil; - LLVMExecutionEngineRef engine = gallivm->engine; - char *error = NULL; - test_round_t round_func, trunc_func, floor_func, ceil_func; - float unpacked[4]; - boolean success = TRUE; - int i; - - test_round = add_test(gallivm, "round", lp_build_round); - test_trunc = add_test(gallivm, "trunc", lp_build_trunc); - test_floor = add_test(gallivm, "floor", lp_build_floor); - test_ceil = add_test(gallivm, "ceil", lp_build_ceil); - - if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) { - printf("LLVMVerifyModule: %s\n", error); - LLVMDumpModule(module); - abort(); - } - LLVMDisposeMessage(error); - - round_func = (test_round_t) pointer_to_func(LLVMGetPointerToGlobal(engine, test_round)); - trunc_func = (test_round_t) pointer_to_func(LLVMGetPointerToGlobal(engine, test_trunc)); - floor_func = (test_round_t) pointer_to_func(LLVMGetPointerToGlobal(engine, test_floor)); - ceil_func = (test_round_t) pointer_to_func(LLVMGetPointerToGlobal(engine, test_ceil)); - - memset(unpacked, 0, sizeof unpacked); - - if (0) - LLVMDumpModule(module); - - for (i = 0; i < 3; i++) { - /* NOTE: There are several acceptable rules for x.5 rounding: ceiling, - * nearest even, etc. So we avoid testing such corner cases here. 
- */ - __m128 xvals[3] = { - {-10.0, -1, 0, 12.0}, - {-1.49, -0.25, 1.25, 2.51}, - {-0.99, -0.01, 0.01, 0.99} - }; - __m128 x = xvals[i]; - __m128 y, ref; - float *xp = (float *) &x; - float *refp = (float *) &ref; - - printf("\n"); - printv("x ", x); - - refp[0] = round(xp[0]); - refp[1] = round(xp[1]); - refp[2] = round(xp[2]); - refp[3] = round(xp[3]); - y = round_func(x); - printv("C round(x) ", ref); - printv("LLVM round(x)", y); - success = success && compare(ref, y); - - refp[0] = trunc(xp[0]); - refp[1] = trunc(xp[1]); - refp[2] = trunc(xp[2]); - refp[3] = trunc(xp[3]); - y = trunc_func(x); - printv("C trunc(x) ", ref); - printv("LLVM trunc(x)", y); - success = success && compare(ref, y); - - refp[0] = floor(xp[0]); - refp[1] = floor(xp[1]); - refp[2] = floor(xp[2]); - refp[3] = floor(xp[3]); - y = floor_func(x); - printv("C floor(x) ", ref); - printv("LLVM floor(x)", y); - success = success && compare(ref, y); - - refp[0] = ceil(xp[0]); - refp[1] = ceil(xp[1]); - refp[2] = ceil(xp[2]); - refp[3] = ceil(xp[3]); - y = ceil_func(x); - printv("C ceil(x) ", ref); - printv("LLVM ceil(x) ", y); - success = success && compare(ref, y); - } - - LLVMFreeMachineCodeForFunction(engine, test_round); - LLVMFreeMachineCodeForFunction(engine, test_trunc); - LLVMFreeMachineCodeForFunction(engine, test_floor); - LLVMFreeMachineCodeForFunction(engine, test_ceil); - - return success; -} - -#else /* !PIPE_ARCH_SSE */ - -static boolean -test_round(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) -{ - return TRUE; -} - -#endif /* !PIPE_ARCH_SSE */ - - -boolean -test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) -{ - return test_round(gallivm, verbose, fp); -} - - -boolean -test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp, - unsigned long n) -{ - return test_all(gallivm, verbose, fp); -} - -boolean -test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp) -{ - printf("no test_single()"); - return TRUE; -} diff --git 
a/src/gallium/drivers/llvmpipe/lp_tex_sample.c b/src/gallium/drivers/llvmpipe/lp_tex_sample.c index daa96f2..9151e42 100644 --- a/src/gallium/drivers/llvmpipe/lp_tex_sample.c +++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.c @@ -178,8 +178,7 @@ lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base, unsigned unit, unsigned num_coords, const LLVMValueRef *coords, - const LLVMValueRef *ddx, - const LLVMValueRef *ddy, + const struct lp_derivatives *derivs, LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ LLVMValueRef *texel) @@ -189,7 +188,7 @@ lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base, assert(unit < PIPE_MAX_SAMPLERS); if (LP_PERF & PERF_NO_TEX) { - lp_build_sample_nop(gallivm, type, texel); + lp_build_sample_nop(gallivm, type, num_coords, coords, texel); return; } @@ -199,7 +198,7 @@ lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base, type, unit, num_coords, coords, - ddx, ddy, + derivs, lod_bias, explicit_lod, texel); } @@ -210,6 +209,7 @@ lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base, static void lp_llvm_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base, struct gallivm_state *gallivm, + struct lp_type type, unsigned unit, LLVMValueRef explicit_lod, /* optional */ LLVMValueRef *sizes_out) @@ -221,6 +221,7 @@ lp_llvm_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base, lp_build_size_query_soa(gallivm, &sampler->dynamic_state.static_state[unit], &sampler->dynamic_state.base, + type, unit, explicit_lod, sizes_out); |