[BugFix] Fix BitBLAS Linear with BFloat16 input #164

Merged · 8 commits · Aug 30, 2024
2 changes: 1 addition & 1 deletion 3rdparty/tvm
22 changes: 10 additions & 12 deletions bitblas/gpu/matmul_mma_dequantize.py
@@ -992,18 +992,16 @@ def get_idx():
         sch.compute_at(block_shared_local_local, B_shared_vi, preserve_unit_loops=True)
 
         dequantize_block_local = block_shared_local
-        if ("zeros_mode" in weight_decode_info and
-                weight_decode_info["zeros_mode"] == "quantized"):
-            if ("with_scaling" in weight_decode_info and weight_decode_info["with_scaling"]):
-                block_local_scales = sch.cache_read(dequantize_block_local, b_idx + 1, "local")
-                sch.compute_at(block_local_scales, B_shared_vi, preserve_unit_loops=True)
-                # pop the scale block
-                auto_inline_producers(sch, block_local_scales)
-
-            if ("with_zeros" in weight_decode_info and weight_decode_info["with_zeros"]):
-                block_local_zeros = sch.cache_read(dequantize_block_local, b_idx + 2, "local")
-                sch.compute_at(block_local_zeros, B_shared_vi, preserve_unit_loops=True)
-                auto_inline_producers(sch, block_local_zeros)
+        if ("with_scaling" in weight_decode_info and weight_decode_info["with_scaling"]):
+            block_local_scales = sch.cache_read(dequantize_block_local, b_idx + 1, "local")
+            sch.compute_at(block_local_scales, B_shared_vi, preserve_unit_loops=True)
+            # pop the scale block
+            auto_inline_producers(sch, block_local_scales)
+
+        if ("with_zeros" in weight_decode_info and weight_decode_info["with_zeros"]):
+            block_local_zeros = sch.cache_read(dequantize_block_local, b_idx + 2, "local")
+            sch.compute_at(block_local_zeros, B_shared_vi, preserve_unit_loops=True)
+            auto_inline_producers(sch, block_local_zeros)
 
         for producer in weight_producers:
             with suppress(Exception):
10 changes: 5 additions & 5 deletions bitblas/module/__init__.py
@@ -265,8 +265,6 @@ def warmup(self, topk=20):
         self.bitblas_matmul.hardware_aware_finetune(topk=topk)
 
     def forward(self, A, output=None):
-        if A.dtype != torch.float16:
-            A = A.half()
         A = self.bitblas_matmul.transform_input(A)
         stream = torch.cuda.current_stream()
 
@@ -277,7 +275,9 @@ def forward(self, A, output=None):
         args = [A_void, *self.q_params]
         if output is None:
             output = torch.empty(
-                A.shape[:-1] + (self.out_features,), dtype=A.dtype, device=A.device)
+                A.shape[:-1] + (self.out_features,),
+                dtype=getattr(torch, self.bitblas_matmul.out_dtype),
+                device=A.device)
         args.append(ctypes.c_void_p(output.data_ptr()))
         if self.bitblas_matmul.dynamic_range is not None:
             m = reduce(operator.mul, A.shape[:-1], 1)
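
With the forced `A.half()` cast removed and the output buffer allocated from the kernel's `out_dtype`, a BFloat16 activation now flows through `bitblas.Linear` without being silently downcast. A minimal sketch of the intended behaviour; the constructor keywords shown here are assumptions for illustration, not part of this diff:

```python
# Hedged sketch: BitBLAS Linear with a BFloat16 input after this fix.
# The dtype keywords below are assumptions for illustration only.
import torch
import bitblas

layer = bitblas.Linear(
    in_features=1024,
    out_features=1024,
    bias=False,
    A_dtype="bfloat16",    # assumed: activation dtype
    W_dtype="int4",        # assumed: quantized weight dtype
    out_dtype="bfloat16",  # output buffer now follows this, not A.dtype
).cuda()

x = torch.randn(4, 1024, dtype=torch.bfloat16, device="cuda")
y = layer(x)               # forward() no longer calls A.half()
assert y.dtype == torch.bfloat16
```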
@@ -312,12 +312,12 @@ def load_and_transform_weight(
         if bias is not None:
             self.bias = bias
 
-    def repack_from_gptq(self, gptq_module):
+    def repack_from_gptq(self, gptq_module, device="cuda"):
         # qweight in gptq old quant linear stored with (out_features, in_features), should be transposed.
         qweight = gptq_module.qweight.T.contiguous().view(self.TORCH_STORAGE_DTYPE)
         intweight = unpack_qweight(qweight, self.bits).contiguous()
         if self.bitblas_matmul.weight_transform is not None:
-            qweight = self.bitblas_matmul.weight_transform(intweight.cpu()).cuda()
+            qweight = self.bitblas_matmul.weight_transform(intweight.cpu()).to(device)
         self.qweight = qweight
         # scales in gptq old quant linear stored with (in_features // group_size, out_features), should be transposed.
         scales = gptq_module.scales.T.contiguous().view(self.torch_dtype)
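
`repack_from_gptq` now accepts the target device instead of hard-coding `.cuda()`, so a repacked layer can land on a specific GPU. A hedged usage sketch; `gptq_linear` and `bitblas_linear` are placeholders for an AutoGPTQ-style quant linear and a matching `bitblas.Linear`:

```python
# Hedged sketch: repacking GPTQ weights onto a chosen device.
# `gptq_linear` / `bitblas_linear` are placeholder objects, not defined in this PR.
bitblas_linear.repack_from_gptq(gptq_linear)                   # default target: "cuda"
bitblas_linear.repack_from_gptq(gptq_linear, device="cuda:1")  # explicit secondary GPU
```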
15 changes: 7 additions & 8 deletions bitblas/ops/general_matmul/__init__.py
@@ -603,7 +603,7 @@ def transform_weight(self, weight, scale=None, zeros=None, bias=None):
         weight = weight.contiguous()
         if self.W_dtype == self.A_dtype:
             if self.weight_transform is not None:
-                return self.weight_transform(weight.cpu()).cuda().contiguous()
+                return self.weight_transform(weight.cpu()).to(weight.device).contiguous()
             return weight
 
         source_format, bit = self.source_format, self.bit
@@ -624,7 +624,7 @@ def transform_weight(self, weight, scale=None, zeros=None, bias=None):
 
         # Apply an optional weight transformation if specified
         if self.weight_transform is not None:
-            weight = self.weight_transform(weight.cpu()).cuda().contiguous()
+            weight = self.weight_transform(weight.cpu()).to(weight.device).contiguous()
 
         # Prepare the return list with the transformed weight and optionally include scale, zeros, and bias
         result = [weight]
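
Both `transform_weight` paths now return the transformed weight on the device of the incoming tensor rather than unconditionally moving it to the default CUDA device. A hedged sketch, assuming an already-configured op instance named `matmul` with matching `A_dtype`/`W_dtype`:

```python
# Hedged sketch: the transformed weight follows the input weight's device.
# `matmul` is a placeholder for a configured bitblas Matmul op instance.
import torch

w = torch.randn(4096, 4096, dtype=torch.float16, device="cuda:1")
wq = matmul.transform_weight(w)
assert wq.device == w.device  # the transform path previously landed on the default GPU
```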
@@ -667,15 +667,14 @@ def forward(self, A, W, scale=None, zeros=None, bias=None, output=None) -> Any:
             args.append(bias)
         args.append(output)
 
-        if self.dynamic_range is not None:
-            m = reduce(operator.mul, A.shape[:-1], 1)
-            args.append(m)
-
-        stream = torch.cuda.current_stream()
-
         if self.lib is None:
             self._forward_from_torch_func(*args)
         else:
+            if self.dynamic_range is not None:
+                m = reduce(operator.mul, A.shape[:-1], 1)
+                args.append(m)
+
+            stream = torch.cuda.current_stream(device=A.device)
             self._forward_from_prebuild_lib(*args, stream=stream.cuda_stream)
 
         return output
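
Moving the dynamic-shape and stream setup into the prebuilt-library branch also means the launch stream is taken from `A.device`, so work submitted on a non-default GPU goes to that GPU's current stream. A hedged sketch; `matmul`, `A1`, and `W1` are placeholders for a prebuilt op and inputs that already live on `cuda:1`:

```python
# Hedged sketch: per-device stream selection in forward().
# `matmul`, `A1`, `W1` are placeholders (prebuilt op + tensors on cuda:1).
import torch

s = torch.cuda.Stream(device="cuda:1")
with torch.cuda.stream(s):
    out = matmul.forward(A1, W1)  # launched via current_stream(device=A1.device) == s
s.synchronize()
```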