diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8f38c0d0daf58a..b6468a9b18e761 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -428,10 +428,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
   } else {
     setOperationAction(ISD::CTPOP           , MVT::i8   , Custom);
-    setOperationAction(ISD::CTPOP           , MVT::i16  , Expand);
-    setOperationAction(ISD::CTPOP           , MVT::i32  , Expand);
+    setOperationAction(ISD::CTPOP           , MVT::i16  , Custom);
+    setOperationAction(ISD::CTPOP           , MVT::i32  , Custom);
     if (Subtarget.is64Bit())
-      setOperationAction(ISD::CTPOP         , MVT::i64  , Expand);
+      setOperationAction(ISD::CTPOP         , MVT::i64  , Custom);
     else
       setOperationAction(ISD::CTPOP         , MVT::i64  , Custom);
   }
@@ -31030,29 +31030,37 @@ static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
 }
 
-static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
+static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
-  MVT VT = Op.getSimpleValueType();
-  SDLoc DL(Op);
+  MVT VT = N.getSimpleValueType();
+  SDValue Op = N.getOperand(0);
+  SDLoc DL(N);
 
-  // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
-  if (VT == MVT::i8) {
-    SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
-    Op = DAG.getZExtOrTrunc(Op.getOperand(0), DL, MVT::i32);
-    Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
-                     DAG.getConstant(0x08040201U, DL, MVT::i32));
-    Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
-                     DAG.getShiftAmountConstant(3, MVT::i32, DL));
-    Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
-    Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
-    Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
-                     DAG.getShiftAmountConstant(28, MVT::i32, DL));
-    return DAG.getZExtOrTrunc(Op, DL, VT);
+  if (VT.isScalarInteger()) {
+    KnownBits Known = DAG.computeKnownBits(Op);
+    unsigned ActiveBits = Known.countMaxActiveBits();
+
+    // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
+    if (ActiveBits <= 8) {
+      SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
+      Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
+      Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
+                       DAG.getConstant(0x08040201U, DL, MVT::i32));
+      Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
+                       DAG.getShiftAmountConstant(3, MVT::i32, DL));
+      Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
+      Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
+      Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
+                       DAG.getShiftAmountConstant(28, MVT::i32, DL));
+      return DAG.getZExtOrTrunc(Op, DL, VT);
+    }
+
+    return SDValue(); // fallback to generic expansion.
   }
 
   assert(VT.isVector() &&
          "We only do custom lowering for vector population count.");
-  return LowerVectorCTPOP(Op, DL, Subtarget, DAG);
+  return LowerVectorCTPOP(N, DL, Subtarget, DAG);
 }
 
 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll
index dedae2893e2ad0..3187bf6448690e 100644
--- a/llvm/test/CodeGen/X86/masked_compressstore.ll
+++ b/llvm/test/CodeGen/X86/masked_compressstore.ll
@@ -516,23 +516,14 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> %
 ; AVX512F-NEXT:    vpmovsxbd %xmm2, %zmm2
 ; AVX512F-NEXT:    vpslld $31, %zmm2, %zmm2
 ; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
-; AVX512F-NEXT:    kmovw %k1, %eax
-; AVX512F-NEXT:    movzbl %al, %ecx
-; AVX512F-NEXT:    shrl %eax
-; AVX512F-NEXT:    andl $85, %eax
-; AVX512F-NEXT:    subl %eax, %ecx
-; AVX512F-NEXT:    movl %ecx, %eax
-; AVX512F-NEXT:    andl $858993459, %eax ## imm = 0x33333333
-; AVX512F-NEXT:    shrl $2, %ecx
-; AVX512F-NEXT:    andl $858993459, %ecx ## imm = 0x33333333
-; AVX512F-NEXT:    addl %eax, %ecx
-; AVX512F-NEXT:    movl %ecx, %eax
-; AVX512F-NEXT:    shrl $4, %eax
-; AVX512F-NEXT:    addl %ecx, %eax
-; AVX512F-NEXT:    andl $252645135, %eax ## imm = 0xF0F0F0F
-; AVX512F-NEXT:    imull $16843009, %eax, %eax ## imm = 0x1010101
-; AVX512F-NEXT:    shrl $24, %eax
 ; AVX512F-NEXT:    kshiftrw $8, %k1, %k2
+; AVX512F-NEXT:    kmovw %k1, %eax
+; AVX512F-NEXT:    movzbl %al, %eax
+; AVX512F-NEXT:    imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512F-NEXT:    shrl $3, %eax
+; AVX512F-NEXT:    andl $286331153, %eax ## imm = 0x11111111
+; AVX512F-NEXT:    imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512F-NEXT:    shrl $28, %eax
 ; AVX512F-NEXT:    vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
 ; AVX512F-NEXT:    vcompresspd %zmm0, (%rdi) {%k1}
 ; AVX512F-NEXT:    vzeroupper
@@ -543,23 +534,13 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> %
 ; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
 ; AVX512VLDQ-NEXT:    vpslld $31, %zmm2, %zmm2
 ; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k1
-; AVX512VLDQ-NEXT:    kmovb %k1, %eax
-; AVX512VLDQ-NEXT:    movl %eax, %ecx
-; AVX512VLDQ-NEXT:    shrl %ecx
-; AVX512VLDQ-NEXT:    andl $-43, %ecx
-; AVX512VLDQ-NEXT:    subl %ecx, %eax
-; AVX512VLDQ-NEXT:    movl %eax, %ecx
-; AVX512VLDQ-NEXT:    andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLDQ-NEXT:    shrl $2, %eax
-; AVX512VLDQ-NEXT:    andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLDQ-NEXT:    addl %ecx, %eax
-; AVX512VLDQ-NEXT:    movl %eax, %ecx
-; AVX512VLDQ-NEXT:    shrl $4, %ecx
-; AVX512VLDQ-NEXT:    addl %eax, %ecx
-; AVX512VLDQ-NEXT:    andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512VLDQ-NEXT:    imull $16843009, %ecx, %eax ## imm = 0x1010101
-; AVX512VLDQ-NEXT:    shrl $24, %eax
 ; AVX512VLDQ-NEXT:    kshiftrw $8, %k1, %k2
+; AVX512VLDQ-NEXT:    kmovb %k1, %eax
+; AVX512VLDQ-NEXT:    imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512VLDQ-NEXT:    shrl $3, %eax
+; AVX512VLDQ-NEXT:    andl $286331153, %eax ## imm = 0x11111111
+; AVX512VLDQ-NEXT:    imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512VLDQ-NEXT:    shrl $28, %eax
 ; AVX512VLDQ-NEXT:    vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
 ; AVX512VLDQ-NEXT:    vcompresspd %zmm0, (%rdi) {%k1}
 ; AVX512VLDQ-NEXT:    vzeroupper
@@ -569,23 +550,14 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> %
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $7, %xmm2, %xmm2
 ; AVX512VLBW-NEXT:    vpmovb2m %xmm2, %k1
-; AVX512VLBW-NEXT:    kmovd %k1, %eax
-; AVX512VLBW-NEXT:    movzbl %al, %ecx
-; AVX512VLBW-NEXT:    shrl %eax
-; AVX512VLBW-NEXT:    andl $85, %eax
-; AVX512VLBW-NEXT:    subl %eax, %ecx
-; AVX512VLBW-NEXT:    movl %ecx, %eax
-; AVX512VLBW-NEXT:    andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLBW-NEXT:    shrl $2, %ecx
-; AVX512VLBW-NEXT:    andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLBW-NEXT:    addl %eax, %ecx
-; AVX512VLBW-NEXT:    movl %ecx, %eax
-; AVX512VLBW-NEXT:    shrl $4, %eax
-; AVX512VLBW-NEXT:    addl %ecx, %eax
-; AVX512VLBW-NEXT:    andl $252645135, %eax ## imm = 0xF0F0F0F
-; AVX512VLBW-NEXT:    imull $16843009, %eax, %eax ## imm = 0x1010101
-; AVX512VLBW-NEXT:    shrl $24, %eax
 ; AVX512VLBW-NEXT:    kshiftrw $8, %k1, %k2
+; AVX512VLBW-NEXT:    kmovd %k1, %eax
+; AVX512VLBW-NEXT:    movzbl %al, %eax
+; AVX512VLBW-NEXT:    imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512VLBW-NEXT:    shrl $3, %eax
+; AVX512VLBW-NEXT:    andl $286331153, %eax ## imm = 0x11111111
+; AVX512VLBW-NEXT:    imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512VLBW-NEXT:    shrl $28, %eax
 ; AVX512VLBW-NEXT:    vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
 ; AVX512VLBW-NEXT:    vcompresspd %zmm0, (%rdi) {%k1}
 ; AVX512VLBW-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll
index 46b1fa5dd2757e..4c5b67962a58bd 100644
--- a/llvm/test/CodeGen/X86/masked_expandload.ll
+++ b/llvm/test/CodeGen/X86/masked_expandload.ll
@@ -1008,21 +1008,11 @@ define <16 x double> @expandload_v16f64_v16i32(ptr %base, <16 x double> %src0, <
 ; AVX512F-NEXT:    vexpandpd (%rdi), %zmm0 {%k2}
 ; AVX512F-NEXT:    kmovw %k2, %eax
 ; AVX512F-NEXT:    movzbl %al, %eax
-; AVX512F-NEXT:    movl %eax, %ecx
-; AVX512F-NEXT:    shrl %ecx
-; AVX512F-NEXT:    andl $-43, %ecx
-; AVX512F-NEXT:    subl %ecx, %eax
-; AVX512F-NEXT:    movl %eax, %ecx
-; AVX512F-NEXT:    andl $858993459, %ecx ## imm = 0x33333333
-; AVX512F-NEXT:    shrl $2, %eax
-; AVX512F-NEXT:    andl $858993459, %eax ## imm = 0x33333333
-; AVX512F-NEXT:    addl %ecx, %eax
-; AVX512F-NEXT:    movl %eax, %ecx
-; AVX512F-NEXT:    shrl $4, %ecx
-; AVX512F-NEXT:    addl %eax, %ecx
-; AVX512F-NEXT:    andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512F-NEXT:    imull $16843009, %ecx, %eax ## imm = 0x1010101
-; AVX512F-NEXT:    shrl $24, %eax
+; AVX512F-NEXT:    imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512F-NEXT:    shrl $3, %eax
+; AVX512F-NEXT:    andl $286331153, %eax ## imm = 0x11111111
+; AVX512F-NEXT:    imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512F-NEXT:    shrl $28, %eax
 ; AVX512F-NEXT:    vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
 ; AVX512F-NEXT:    retq
 ;
@@ -1032,21 +1022,11 @@ define <16 x double> @expandload_v16f64_v16i32(ptr %base, <16 x double> %src0, <
 ; AVX512VLDQ-NEXT:    vptestnmd %ymm3, %ymm3, %k1
 ; AVX512VLDQ-NEXT:    vptestnmd %ymm2, %ymm2, %k2
 ; AVX512VLDQ-NEXT:    kmovb %k2, %eax
-; AVX512VLDQ-NEXT:    movl %eax, %ecx
-; AVX512VLDQ-NEXT:    shrl %ecx
-; AVX512VLDQ-NEXT:    andl $-43, %ecx
-; AVX512VLDQ-NEXT:    subl %ecx, %eax
-; AVX512VLDQ-NEXT:    movl %eax, %ecx
-; AVX512VLDQ-NEXT:    andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLDQ-NEXT:    shrl $2, %eax
-; AVX512VLDQ-NEXT:    andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLDQ-NEXT:    addl %ecx, %eax
-; AVX512VLDQ-NEXT:    movl %eax, %ecx
-; AVX512VLDQ-NEXT:    shrl $4, %ecx
-; AVX512VLDQ-NEXT:    addl %eax, %ecx
-; AVX512VLDQ-NEXT:    andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512VLDQ-NEXT:    imull $16843009, %ecx, %eax ## imm = 0x1010101
-; AVX512VLDQ-NEXT:    shrl $24, %eax
+; AVX512VLDQ-NEXT:    imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512VLDQ-NEXT:    shrl $3, %eax
+; AVX512VLDQ-NEXT:    andl $286331153, %eax ## imm = 0x11111111
+; AVX512VLDQ-NEXT:    imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512VLDQ-NEXT:    shrl $28, %eax
 ; AVX512VLDQ-NEXT:    vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
 ; AVX512VLDQ-NEXT:    vexpandpd (%rdi), %zmm0 {%k2}
 ; AVX512VLDQ-NEXT:    retq
@@ -1059,21 +1039,11 @@ define <16 x double> @expandload_v16f64_v16i32(ptr %base, <16 x double> %src0, <
 ; AVX512VLBW-NEXT:    vexpandpd (%rdi), %zmm0 {%k2}
 ; AVX512VLBW-NEXT:    kmovd %k2, %eax
 ; AVX512VLBW-NEXT:    movzbl %al, %eax
-; AVX512VLBW-NEXT:    movl %eax, %ecx
-; AVX512VLBW-NEXT:    shrl %ecx
-; AVX512VLBW-NEXT:    andl $-43, %ecx
-; AVX512VLBW-NEXT:    subl %ecx, %eax
-; AVX512VLBW-NEXT:    movl %eax, %ecx
-; AVX512VLBW-NEXT:    andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLBW-NEXT:    shrl $2, %eax
-; AVX512VLBW-NEXT:    andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLBW-NEXT:    addl %ecx, %eax
-; AVX512VLBW-NEXT:    movl %eax, %ecx
-; AVX512VLBW-NEXT:    shrl $4, %ecx
-; AVX512VLBW-NEXT:    addl %eax, %ecx
-; AVX512VLBW-NEXT:    andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512VLBW-NEXT:    imull $16843009, %ecx, %eax ## imm = 0x1010101
-; AVX512VLBW-NEXT:    shrl $24, %eax
+; AVX512VLBW-NEXT:    imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512VLBW-NEXT:    shrl $3, %eax
+; AVX512VLBW-NEXT:    andl $286331153, %eax ## imm = 0x11111111
+; AVX512VLBW-NEXT:    imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512VLBW-NEXT:    shrl $28, %eax
 ; AVX512VLBW-NEXT:    vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
 ; AVX512VLBW-NEXT:    retq
   %mask = icmp eq <16 x i32> %trigger, zeroinitializer
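For reference, below is a minimal standalone C++ sketch (not part of the patch; the popcount8 name and the exhaustive check are illustrative only) of the multiply-mask-multiply population count that the new scalar lowering emits, under the same precondition the KnownBits guard enforces: the input has at most 8 active bits.

#include <cassert>
#include <cstdint>

// Popcount of a value with at most 8 active bits, using two 32-bit multiplies
// (the same 0x08040201 / 0x11111111 constants the lowered code uses).
static unsigned popcount8(uint32_t X) {
  assert(X <= 0xFFu && "precondition: at most 8 active bits");
  X *= 0x08040201u;  // replicate the byte at bit offsets 0, 9, 18 and 27
  X >>= 3;           // realign so each input bit can land in its own nibble
  X &= 0x11111111u;  // keep exactly one input bit per nibble
  X *= 0x11111111u;  // horizontal add: top nibble becomes the sum of all nibbles
  return X >> 28;    // population count, in the range [0, 8]
}

int main() {
  // Exhaustive check against a naive bit loop.
  for (uint32_t V = 0; V <= 0xFFu; ++V) {
    unsigned Naive = 0;
    for (uint32_t B = V; B != 0; B >>= 1)
      Naive += B & 1u;
    assert(popcount8(V) == Naive);
  }
  return 0;
}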