-rw-r--r--   lib/Target/X86/X86ISelLowering.cpp  | 12
-rw-r--r--   lib/Target/X86/X86InstrSSE.td       | 33
-rw-r--r--   test/CodeGen/X86/avx2-vbroadcast.ll | 96
3 files changed, 135 insertions, 6 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index e6a0df7..ba66593 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5047,8 +5047,16 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
 
   SDValue Sc = Op.getOperand(0);
   if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
-      Sc.getOpcode() != ISD::BUILD_VECTOR)
-    return SDValue();
+      Sc.getOpcode() != ISD::BUILD_VECTOR) {
+
+    if (!Subtarget->hasAVX2())
+      return SDValue();
+
+    // Use the register form of the broadcast instruction available on AVX2.
+    if (VT.is256BitVector())
+      Sc = Extract128BitVector(Sc, 0, DAG, dl);
+    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
+  }
 
   Ld = Sc.getOperand(0);
   ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index ad8d15d..5319455 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -7272,8 +7272,8 @@ let ExeDomain = SSEPackedSingle in {
                                     int_x86_avx2_vbroadcast_ss_ps_256>;
 }
 let ExeDomain = SSEPackedDouble in
-def VBROADCASTSDrr  : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
-                                         int_x86_avx2_vbroadcast_sd_pd_256>;
+def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
+                                         int_x86_avx2_vbroadcast_sd_pd_256>;
 
 let Predicates = [HasAVX2] in
 def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem,
@@ -7684,6 +7684,31 @@ let Predicates = [HasAVX2] in {
   def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
             (VPBROADCASTQYrm addr:$src)>;
 
+  def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))),
+            (VPBROADCASTBrr VR128:$src)>;
+  def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))),
+            (VPBROADCASTBYrr VR128:$src)>;
+  def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))),
+            (VPBROADCASTWrr VR128:$src)>;
+  def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))),
+            (VPBROADCASTWYrr VR128:$src)>;
+  def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))),
+            (VPBROADCASTDrr VR128:$src)>;
+  def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))),
+            (VPBROADCASTDYrr VR128:$src)>;
+  def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))),
+            (VPBROADCASTQrr VR128:$src)>;
+  def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))),
+            (VPBROADCASTQYrr VR128:$src)>;
+  def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))),
+            (VBROADCASTSSrr VR128:$src)>;
+  def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))),
+            (VBROADCASTSSYrr VR128:$src)>;
+  def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))),
+            (VPBROADCASTQrr VR128:$src)>;
+  def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
+            (VBROADCASTSDYrr VR128:$src)>;
+
   // Provide fallback in case the load node that is used in the patterns above
   // is used by additional users, which prevents the pattern selection.
 let AddedComplexity = 20 in {
@@ -7694,7 +7719,7 @@ let Predicates = [HasAVX2] in {
             (VBROADCASTSSYrr (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
                                             sub_ss))>;
   def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
-            (VBROADCASTSDrr
+            (VBROADCASTSDYrr
             (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
                            sub_sd))>;
   def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
@@ -7704,7 +7729,7 @@ let Predicates = [HasAVX2] in {
             (VBROADCASTSSYrr (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GR32:$src,
                                             sub_ss))>;
   def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
-            (VBROADCASTSDrr
+            (VBROADCASTSDYrr
             (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), GR64:$src,
                            sub_sd))>;
 }
 }
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 46b41fa..b804233 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -259,3 +259,99 @@ define <4 x double> @_inreg3(double %scalar) nounwind uwtable readnone ssp {
   ret <4 x double> %wide
 }
+
+;CHECK: _inreg8xfloat
+;CHECK: vbroadcastss
+;CHECK: ret
+define <8 x float> @_inreg8xfloat(<8 x float> %a) {
+  %b = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> zeroinitializer
+  ret <8 x float> %b
+}
+
+;CHECK: _inreg4xfloat
+;CHECK: vbroadcastss
+;CHECK: ret
+define <4 x float> @_inreg4xfloat(<4 x float> %a) {
+  %b = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
+  ret <4 x float> %b
+}
+
+;CHECK: _inreg16xi16
+;CHECK: vpbroadcastw
+;CHECK: ret
+define <16 x i16> @_inreg16xi16(<16 x i16> %a) {
+  %b = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> zeroinitializer
+  ret <16 x i16> %b
+}
+
+;CHECK: _inreg8xi16
+;CHECK: vpbroadcastw
+;CHECK: ret
+define <8 x i16> @_inreg8xi16(<8 x i16> %a) {
+  %b = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> zeroinitializer
+  ret <8 x i16> %b
+}
+
+
+;CHECK: _inreg4xi64
+;CHECK: vpbroadcastq
+;CHECK: ret
+define <4 x i64> @_inreg4xi64(<4 x i64> %a) {
+  %b = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> zeroinitializer
+  ret <4 x i64> %b
+}
+
+;CHECK: _inreg2xi64
+;CHECK: vpbroadcastq
+;CHECK: ret
+define <2 x i64> @_inreg2xi64(<2 x i64> %a) {
+  %b = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %b
+}
+
+;CHECK: _inreg4xdouble
+;CHECK: vbroadcastsd
+;CHECK: ret
+define <4 x double> @_inreg4xdouble(<4 x double> %a) {
+  %b = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> zeroinitializer
+  ret <4 x double> %b
+}
+
+;CHECK: _inreg2xdouble
+;CHECK: vpbroadcastq
+;CHECK: ret
+define <2 x double> @_inreg2xdouble(<2 x double> %a) {
+  %b = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> zeroinitializer
+  ret <2 x double> %b
+}
+
+;CHECK: _inreg8xi32
+;CHECK: vpbroadcastd
+;CHECK: ret
+define <8 x i32> @_inreg8xi32(<8 x i32> %a) {
+  %b = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> zeroinitializer
+  ret <8 x i32> %b
+}
+
+;CHECK: _inreg4xi32
+;CHECK: vpbroadcastd
+;CHECK: ret
+define <4 x i32> @_inreg4xi32(<4 x i32> %a) {
+  %b = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %b
+}
+
+;CHECK: _inreg32xi8
+;CHECK: vpbroadcastb
+;CHECK: ret
+define <32 x i8> @_inreg32xi8(<32 x i8> %a) {
+  %b = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer
+  ret <32 x i8> %b
+}
+
+;CHECK: _inreg16xi8
+;CHECK: vpbroadcastb
+;CHECK: ret
+define <16 x i8> @_inreg16xi8(<16 x i8> %a) {
+  %b = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
+  ret <16 x i8> %b
+}
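
Taken together, the three hunks let a splat whose source is already in a vector register use the AVX2 register forms of the broadcast instructions: LowerVectorBroadcast now forms an X86VBroadcast node for such splats (first narrowing a 256-bit source to its low 128 bits, since the register-form broadcasts read an XMM source), and the new TableGen patterns select the matching *rr instructions. A minimal sketch of the case this enables, in the same IR shape as the tests above (the function name and the register in the comment are illustrative, not taken from the patch):

    ; Splat lane 0 of a value that is already live in a vector register.
    ; With AVX2 enabled this should now select a single register-form
    ; broadcast such as vbroadcastss %xmm0, %ymm0, reading the low 128 bits
    ; of %a rather than falling back to generic shuffle lowering.
    define <8 x float> @splat_in_reg(<8 x float> %a) {
      %s = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> zeroinitializer
      ret <8 x float> %s
    }

One asymmetry visible in the patterns: the 128-bit v2f64 splat maps to VPBROADCASTQrr rather than a vbroadcastsd form, because vbroadcastsd only has a 256-bit destination; the _inreg2xdouble test checks for vpbroadcastq accordingly.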