[X86] Generalize i8 CTPOP expansion to work with any input with 8 or less active bits

Extend llvm#79989 slightly to use KnownBits on the CTPOP input - this should make it easier to add additional cases identified in llvm#79823
RKSimon committed Feb 2, 2024
1 parent 1e7d587 commit 275729a
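
As background (an editorial note, not part of the commit message): the expansion relies on a classic multiply-mask-multiply population count. A standalone C++ sketch of the arithmetic, assuming the input has at most 8 active bits (the helper name popcnt8 is hypothetical):

```cpp
#include <cstdint>

// Multiply-mask-multiply popcount for a value with at most 8 active bits,
// mirroring the i32 node sequence built in LowerCTPOP below.
uint32_t popcnt8(uint32_t X) {
  // 0x08040201 = (1<<27)|(1<<18)|(1<<9)|1 lays down four staggered copies
  // of X; after >> 3, each of the 8 input bits lands in a distinct nibble.
  X = (X * 0x08040201u) >> 3;
  X &= 0x11111111u; // keep one bit per nibble
  // Multiplying by 0x11111111 sums the eight nibbles into bits 28..31;
  // the sum is at most 8, so it fits in the top nibble without overflow.
  return (X * 0x11111111u) >> 28;
}
```

Gating the transform on KnownBits rather than on the i8 type means any scalar CTPOP whose operand is known to fit in 8 bits (for example a zero-extended mask register) can use the two-multiply sequence instead of the generic parallel-sum expansion.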
Showing 3 changed files with 63 additions and 113 deletions.
48 changes: 28 additions & 20 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -428,10 +428,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
   } else {
     setOperationAction(ISD::CTPOP , MVT::i8 , Custom);
-    setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
-    setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
+    setOperationAction(ISD::CTPOP , MVT::i16 , Custom);
+    setOperationAction(ISD::CTPOP , MVT::i32 , Custom);
     if (Subtarget.is64Bit())
-      setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
+      setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
     else
       setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
   }
@@ -31030,29 +31030,37 @@ static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
 }
 
-static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
+static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
-  MVT VT = Op.getSimpleValueType();
-  SDLoc DL(Op);
+  MVT VT = N.getSimpleValueType();
+  SDValue Op = N.getOperand(0);
+  SDLoc DL(N);
 
-  // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
-  if (VT == MVT::i8) {
-    SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
-    Op = DAG.getZExtOrTrunc(Op.getOperand(0), DL, MVT::i32);
-    Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
-                     DAG.getConstant(0x08040201U, DL, MVT::i32));
-    Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
-                     DAG.getShiftAmountConstant(3, MVT::i32, DL));
-    Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
-    Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
-    Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
-                     DAG.getShiftAmountConstant(28, MVT::i32, DL));
-    return DAG.getZExtOrTrunc(Op, DL, VT);
+  if (VT.isScalarInteger()) {
+    KnownBits Known = DAG.computeKnownBits(Op);
+    unsigned ActiveBits = Known.countMaxActiveBits();
+
+    // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
+    if (ActiveBits <= 8) {
+      SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
+      Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
+      Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
+                       DAG.getConstant(0x08040201U, DL, MVT::i32));
+      Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
+                       DAG.getShiftAmountConstant(3, MVT::i32, DL));
+      Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
+      Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
+      Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
+                       DAG.getShiftAmountConstant(28, MVT::i32, DL));
+      return DAG.getZExtOrTrunc(Op, DL, VT);
+    }
+
+    return SDValue(); // fallback to generic expansion.
  }
 
   assert(VT.isVector() &&
          "We only do custom lowering for vector population count.");
-  return LowerVectorCTPOP(Op, DL, Subtarget, DAG);
+  return LowerVectorCTPOP(N, DL, Subtarget, DAG);
 }
 
 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
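The scalar path is easy to validate outside of LLVM. A minimal exhaustive check (an editorial sketch, not part of the commit) asserting the sequence against a reference popcount for every 8-bit input:

```cpp
#include <bitset>
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X < 256; ++X) {
    // The exact op order emitted above: mul, srl 3, and, mul, srl 28.
    // Unsigned arithmetic wraps mod 2^32, matching the i32 truncation.
    uint32_t P = (((X * 0x08040201u) >> 3) & 0x11111111u) * 0x11111111u >> 28;
    assert(P == std::bitset<8>(X).count());
  }
  return 0;
}
```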
68 changes: 20 additions & 48 deletions llvm/test/CodeGen/X86/masked_compressstore.ll
@@ -516,23 +516,14 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> %
 ; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
 ; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: movzbl %al, %ecx
-; AVX512F-NEXT: shrl %eax
-; AVX512F-NEXT: andl $85, %eax
-; AVX512F-NEXT: subl %eax, %ecx
-; AVX512F-NEXT: movl %ecx, %eax
-; AVX512F-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512F-NEXT: shrl $2, %ecx
-; AVX512F-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512F-NEXT: addl %eax, %ecx
-; AVX512F-NEXT: movl %ecx, %eax
-; AVX512F-NEXT: shrl $4, %eax
-; AVX512F-NEXT: addl %ecx, %eax
-; AVX512F-NEXT: andl $252645135, %eax ## imm = 0xF0F0F0F
-; AVX512F-NEXT: imull $16843009, %eax, %eax ## imm = 0x1010101
-; AVX512F-NEXT: shrl $24, %eax
 ; AVX512F-NEXT: kshiftrw $8, %k1, %k2
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: movzbl %al, %eax
+; AVX512F-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512F-NEXT: shrl $3, %eax
+; AVX512F-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512F-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512F-NEXT: shrl $28, %eax
 ; AVX512F-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
 ; AVX512F-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
 ; AVX512F-NEXT: vzeroupper
@@ -543,23 +534,13 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> %
 ; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
 ; AVX512VLDQ-NEXT: vpslld $31, %zmm2, %zmm2
 ; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k1
-; AVX512VLDQ-NEXT: kmovb %k1, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: shrl %ecx
-; AVX512VLDQ-NEXT: andl $-43, %ecx
-; AVX512VLDQ-NEXT: subl %ecx, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLDQ-NEXT: shrl $2, %eax
-; AVX512VLDQ-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLDQ-NEXT: addl %ecx, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: shrl $4, %ecx
-; AVX512VLDQ-NEXT: addl %eax, %ecx
-; AVX512VLDQ-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512VLDQ-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101
-; AVX512VLDQ-NEXT: shrl $24, %eax
 ; AVX512VLDQ-NEXT: kshiftrw $8, %k1, %k2
+; AVX512VLDQ-NEXT: kmovb %k1, %eax
+; AVX512VLDQ-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512VLDQ-NEXT: shrl $3, %eax
+; AVX512VLDQ-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512VLDQ-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512VLDQ-NEXT: shrl $28, %eax
 ; AVX512VLDQ-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
 ; AVX512VLDQ-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
 ; AVX512VLDQ-NEXT: vzeroupper
@@ -569,23 +550,14 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> %
 ; AVX512VLBW: ## %bb.0:
 ; AVX512VLBW-NEXT: vpsllw $7, %xmm2, %xmm2
 ; AVX512VLBW-NEXT: vpmovb2m %xmm2, %k1
-; AVX512VLBW-NEXT: kmovd %k1, %eax
-; AVX512VLBW-NEXT: movzbl %al, %ecx
-; AVX512VLBW-NEXT: shrl %eax
-; AVX512VLBW-NEXT: andl $85, %eax
-; AVX512VLBW-NEXT: subl %eax, %ecx
-; AVX512VLBW-NEXT: movl %ecx, %eax
-; AVX512VLBW-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLBW-NEXT: shrl $2, %ecx
-; AVX512VLBW-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLBW-NEXT: addl %eax, %ecx
-; AVX512VLBW-NEXT: movl %ecx, %eax
-; AVX512VLBW-NEXT: shrl $4, %eax
-; AVX512VLBW-NEXT: addl %ecx, %eax
-; AVX512VLBW-NEXT: andl $252645135, %eax ## imm = 0xF0F0F0F
-; AVX512VLBW-NEXT: imull $16843009, %eax, %eax ## imm = 0x1010101
-; AVX512VLBW-NEXT: shrl $24, %eax
 ; AVX512VLBW-NEXT: kshiftrw $8, %k1, %k2
+; AVX512VLBW-NEXT: kmovd %k1, %eax
+; AVX512VLBW-NEXT: movzbl %al, %eax
+; AVX512VLBW-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512VLBW-NEXT: shrl $3, %eax
+; AVX512VLBW-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512VLBW-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512VLBW-NEXT: shrl $28, %eax
 ; AVX512VLBW-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
 ; AVX512VLBW-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
 ; AVX512VLBW-NEXT: vzeroupper
60 changes: 15 additions & 45 deletions llvm/test/CodeGen/X86/masked_expandload.ll
@@ -1008,21 +1008,11 @@ define <16 x double> @expandload_v16f64_v16i32(ptr %base, <16 x double> %src0, <
 ; AVX512F-NEXT: vexpandpd (%rdi), %zmm0 {%k2}
 ; AVX512F-NEXT: kmovw %k2, %eax
 ; AVX512F-NEXT: movzbl %al, %eax
-; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl %ecx
-; AVX512F-NEXT: andl $-43, %ecx
-; AVX512F-NEXT: subl %ecx, %eax
-; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512F-NEXT: shrl $2, %eax
-; AVX512F-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512F-NEXT: addl %ecx, %eax
-; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $4, %ecx
-; AVX512F-NEXT: addl %eax, %ecx
-; AVX512F-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512F-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101
-; AVX512F-NEXT: shrl $24, %eax
+; AVX512F-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512F-NEXT: shrl $3, %eax
+; AVX512F-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512F-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512F-NEXT: shrl $28, %eax
 ; AVX512F-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
 ; AVX512F-NEXT: retq
 ;
@@ -1032,21 +1022,11 @@ define <16 x double> @expandload_v16f64_v16i32(ptr %base, <16 x double> %src0, <
 ; AVX512VLDQ-NEXT: vptestnmd %ymm3, %ymm3, %k1
 ; AVX512VLDQ-NEXT: vptestnmd %ymm2, %ymm2, %k2
 ; AVX512VLDQ-NEXT: kmovb %k2, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: shrl %ecx
-; AVX512VLDQ-NEXT: andl $-43, %ecx
-; AVX512VLDQ-NEXT: subl %ecx, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLDQ-NEXT: shrl $2, %eax
-; AVX512VLDQ-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLDQ-NEXT: addl %ecx, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: shrl $4, %ecx
-; AVX512VLDQ-NEXT: addl %eax, %ecx
-; AVX512VLDQ-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512VLDQ-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101
-; AVX512VLDQ-NEXT: shrl $24, %eax
+; AVX512VLDQ-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512VLDQ-NEXT: shrl $3, %eax
+; AVX512VLDQ-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512VLDQ-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512VLDQ-NEXT: shrl $28, %eax
 ; AVX512VLDQ-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
 ; AVX512VLDQ-NEXT: vexpandpd (%rdi), %zmm0 {%k2}
 ; AVX512VLDQ-NEXT: retq
@@ -1059,21 +1039,11 @@ define <16 x double> @expandload_v16f64_v16i32(ptr %base, <16 x double> %src0, <
 ; AVX512VLBW-NEXT: vexpandpd (%rdi), %zmm0 {%k2}
 ; AVX512VLBW-NEXT: kmovd %k2, %eax
 ; AVX512VLBW-NEXT: movzbl %al, %eax
-; AVX512VLBW-NEXT: movl %eax, %ecx
-; AVX512VLBW-NEXT: shrl %ecx
-; AVX512VLBW-NEXT: andl $-43, %ecx
-; AVX512VLBW-NEXT: subl %ecx, %eax
-; AVX512VLBW-NEXT: movl %eax, %ecx
-; AVX512VLBW-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLBW-NEXT: shrl $2, %eax
-; AVX512VLBW-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLBW-NEXT: addl %ecx, %eax
-; AVX512VLBW-NEXT: movl %eax, %ecx
-; AVX512VLBW-NEXT: shrl $4, %ecx
-; AVX512VLBW-NEXT: addl %eax, %ecx
-; AVX512VLBW-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512VLBW-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101
-; AVX512VLBW-NEXT: shrl $24, %eax
+; AVX512VLBW-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512VLBW-NEXT: shrl $3, %eax
+; AVX512VLBW-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512VLBW-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512VLBW-NEXT: shrl $28, %eax
 ; AVX512VLBW-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
 ; AVX512VLBW-NEXT: retq
   %mask = icmp eq <16 x i32> %trigger, zeroinitializer
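The updated CHECK lines in both test files are the same expansion with decimal immediates. As an illustration (hypothetical helper name, not taken from the tests), the sequence computes the popcount of the low 8 mask bits, which becomes the element offset for the second vexpandpd/vcompresspd:

```cpp
#include <cstdint>

// C rendering of the new CHECK sequence; Mask stands in for the value
// moved out of the k-register by kmov.
uint32_t UpperHalfOffset(uint32_t Mask) {
  uint32_t EAX = Mask & 0xFF; // movzbl %al, %eax (kmovb already zero-extends)
  EAX *= 134480385u;          // imull $134480385, %eax, %eax  ## imm = 0x8040201
  EAX >>= 3;                  // shrl $3, %eax
  EAX &= 286331153u;          // andl $286331153, %eax         ## imm = 0x11111111
  EAX *= 286331153u;          // imull $286331153, %eax, %eax  ## imm = 0x11111111
  return EAX >> 28;           // shrl $28, %eax
}
```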
