diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 96cd46719..91dea66b8 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -65,3 +65,19 @@ occamy-full-vsim:
     - make LENGTH=384 sw
     - make bin/occamy_top.vsim
     - ./run.py sw/run-full-occamy.yaml --simulator vsim
+
+##########################
+# Multicast Occamy tests #
+##########################
+
+# Questa
+occamy-mcast-vsim:
+  variables:
+    RISCV_CFLAGS: "-DUSE_MULTICAST"
+    DATA_CFG: "$(PWD)/sw/device/apps/blas/gemm/params.hjson"
+  script:
+    - cd target/sim
+    - make CFG_OVERRIDE=cfg/M-Q8C4.hjson rtl
+    - make LENGTH=1024 sw
+    - make bin/occamy_top.vsim
+    - ./run.py sw/run-full-occamy.yaml -j
diff --git a/Bender.local b/Bender.local
index aa72d87c3..f8ec04e0f 100644
--- a/Bender.local
+++ b/Bender.local
@@ -3,6 +3,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 overrides:
-  axi: { git: https://github.com/pulp-platform/axi.git, version: 0.39.0-beta.4 }
-  common_cells: { git: https://github.com/pulp-platform/common_cells.git, version: 1.31.1 }
+  axi: { git: https://github.com/pulp-platform/axi.git, rev: feature/multicast-xbar }
+  common_cells: { git: https://github.com/pulp-platform/common_cells.git, rev: multicast-xbar }
   register_interface: { git: https://github.com/pulp-platform/register_interface.git, version: 0.4.2 }
diff --git a/Bender.yml b/Bender.yml
index c5699134c..42ff9a51f 100644
--- a/Bender.yml
+++ b/Bender.yml
@@ -29,7 +29,7 @@ dependencies:
   cva6: { path: hw/vendor/openhwgroup_cva6 }
   opentitan_peripherals: { path: hw/vendor/pulp_platform_opentitan_peripherals }
   register_interface: { git: https://github.com/pulp-platform/register_interface.git, version: 0.3.8 }
-  snitch_cluster: { git: https://github.com/pulp-platform/snitch_cluster.git, rev: extend/snRuntime }
+  snitch_cluster: { git: https://github.com/pulp-platform/snitch_cluster.git, rev: experimental/mcast }
   tech_cells_generic: { git: https://github.com/pulp-platform/tech_cells_generic.git, rev: v0.2.11 }
 
 workspace:
diff --git a/hw/occamy/occamy_cva6.sv.tpl b/hw/occamy/occamy_cva6.sv.tpl
index 90c3c1f1d..87b03c0f6 100644
--- a/hw/occamy/occamy_cva6.sv.tpl
+++ b/hw/occamy/occamy_cva6.sv.tpl
@@ -72,7 +72,7 @@ module ${name}_cva6 import ${name}_pkg::*; (
     .AxiAddrWidth (${soc_narrow_xbar.in_cva6.aw}),
     .AxiDataWidth (${soc_narrow_xbar.in_cva6.dw}),
     .AxiIdWidth (${soc_narrow_xbar.in_cva6.iw}),
-    .AxiUserWidth (${max(1, soc_narrow_xbar.in_cva6.uw)}),
+    .AxiUserWidth (${cfg["narrow_xbar_user_width"]}),
     .axi_ar_chan_t(${soc_narrow_xbar.in_cva6.ar_chan_type()}),
     .axi_aw_chan_t(${soc_narrow_xbar.in_cva6.aw_chan_type()}),
     .axi_w_chan_t(${soc_narrow_xbar.in_cva6.w_chan_type()}),
diff --git a/hw/occamy/occamy_quadrant_s1.sv.tpl b/hw/occamy/occamy_quadrant_s1.sv.tpl
index 5226b3cd3..22ee70eb6 100644
--- a/hw/occamy/occamy_quadrant_s1.sv.tpl
+++ b/hw/occamy/occamy_quadrant_s1.sv.tpl
@@ -26,6 +26,7 @@
 %>
 
 `include "axi/typedef.svh"
+`include "axi/assign.svh"
 `include "axi_tlb/typedef.svh"
 
 /// Occamy Stage 1 Quadrant
diff --git a/hw/occamy/occamy_quadrant_s1_ctrl.sv.tpl b/hw/occamy/occamy_quadrant_s1_ctrl.sv.tpl
index 61ef61789..cd5ec0b3d 100644
--- a/hw/occamy/occamy_quadrant_s1_ctrl.sv.tpl
+++ b/hw/occamy/occamy_quadrant_s1_ctrl.sv.tpl
@@ -18,6 +18,8 @@
   wide_tlb_entries = wide_tlb_cfg.get("l1_num_entries", 1)
 %>
 
+`include "axi/assign.svh"
+
 module ${name}_quadrant_s1_ctrl
   import ${name}_pkg::*;
   import ${name}_quadrant_s1_reg_pkg::*;
diff --git a/hw/occamy/occamy_soc.sv.tpl b/hw/occamy/occamy_soc.sv.tpl
index 4d25d5333..1644e0583 100644
--- a/hw/occamy/occamy_soc.sv.tpl
+++ b/hw/occamy/occamy_soc.sv.tpl
@@ -421,6 +421,7 @@ module ${name}_soc
   logic [${wide_in.aw-1}:0] num_bytes;
   axi_pkg::cache_t cache_src, cache_dst;
   axi_pkg::burst_t burst_src, burst_dst;
+  logic [${wide_in.uw-1}:0] user_src, user_dst;
   logic decouple_rw;
   logic deburst;
   logic serialize;
diff --git a/target/sim/cfg/M-Q8C4.hjson b/target/sim/cfg/M-Q8C4.hjson
new file mode 100644
index 000000000..fa1db50d4
--- /dev/null
+++ b/target/sim/cfg/M-Q8C4.hjson
@@ -0,0 +1,409 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// Cluster configuration for Occamy.
+{
+    is_remote_quadrant: false,
+    remote_quadrants: [],
+    enable_multicast: true,
+    quadrant_pre_xbar: {
+        max_slv_trans: 64,
+        max_mst_trans: 64,
+        fall_through: false,
+    },
+    pre_xbar_slv_id_width_no_rocache: 3,
+    wide_xbar: {
+        max_slv_trans: 64,
+        max_mst_trans: 64,
+        fall_through: false,
+    },
+    quadrant_inter_xbar: {
+        max_slv_trans: 64,
+        max_mst_trans: 64,
+        fall_through: false,
+    },
+    hbm_xbar: {
+        max_slv_trans: 128,
+        max_mst_trans: 128,
+        fall_through: false,
+    },
+    narrow_xbar: {
+        max_slv_trans: 32,
+        max_mst_trans: 32,
+        fall_through: false,
+    },
+    cuts: {
+        narrow_to_quad: 3,
+        quad_to_narrow: 3,
+        quad_to_pre: 1,
+        pre_to_inter: 1,
+        inter_to_quad: 3,
+        narrow_to_cva6: 2,
+        narrow_conv_to_spm_narrow_pre: 2,
+        narrow_conv_to_spm_narrow: 1,
+        narrow_and_pcie: 3,
+        narrow_and_wide: 1,
+        wide_conv_to_spm_wide: 3,
+        wide_to_wide_zero_mem: 0,
+        wide_to_hbm: 3,
+        wide_and_inter: 3,
+        wide_and_hbi: 3,
+        narrow_and_hbi: 3,
+        pre_to_hbmx: 3,
+        hbmx_to_hbm: 3,
+        atomic_adapter_narrow: 1,
+        atomic_adapter_narrow_wide: 1,
+        // Give some flexibility in peripheral xbar placement
+        periph_axi_lite_narrow: 2,
+        periph_axi_lite: 2,
+        periph_axi_lite_narrow_hbm_xbar_cfg: 2,
+        // Non-right-side chip peripherals
+        periph_axi_lite_narrow_hbm_cfg: 3,
+        periph_axi_lite_narrow_pcie_cfg: 3,
+        periph_axi_lite_narrow_chip_ctrl_cfg: 3,
+        periph_axi_lite_narrow_hbi_narrow_cfg: 3,
+        periph_axi_lite_narrow_hbi_wide_cfg: 3,
+        periph_axi_lite_narrow_bootrom_cfg: 3,
+        periph_axi_lite_narrow_fll_system_cfg: 3,
+        periph_axi_lite_narrow_fll_periph_cfg: 3,
+        periph_axi_lite_narrow_fll_hbm2e_cfg: 3,
+        // Right-side or latency-invariant chip peripherals
+        periph_axi_lite_narrow_soc_ctrl_cfg: 1,
+        periph_axi_lite_narrow_uart_cfg: 1,
+        periph_axi_lite_narrow_i2c_cfg: 1,
+        periph_axi_lite_narrow_gpio_cfg: 1,
+        periph_axi_lite_narrow_clint_cfg: 1,
+        periph_axi_lite_narrow_plic_cfg: 1,
+        periph_axi_lite_narrow_spim_cfg: 1,
+        periph_axi_lite_narrow_timer_cfg: 1,
+    }
+    txns: {
+        wide_and_inter: 128,
+        wide_to_hbm: 128,
+        narrow_and_wide: 16,
+        rmq: 4,
+    }
+    narrow_xbar_slv_id_width: 4,
+    narrow_xbar_user_width: 5, // clog2(total number of clusters)
+    nr_s1_quadrant: 8,
+    s1_quadrant: {
+        nr_clusters: 4,
+        // number of pending transactions on the narrow/wide network
+        narrow_trans: 32,
+        wide_trans: 32,
+        // Disable for easier flow trials.
+        ro_cache_cfg: {
+            width: 1024,
+            count: 128,
+            sets: 2,
+            max_trans: 32,
+            address_regions: 4,
+        }
+        narrow_tlb_cfg: {
+            max_trans: 32,
+            l1_num_entries: 8,
+            l1_cut_ax: true,
+        }
+        wide_tlb_cfg: {
+            max_trans: 32,
+            l1_num_entries: 8,
+            l1_cut_ax: true,
+        }
+        wide_xbar: {
+            max_slv_trans: 32,
+            max_mst_trans: 32,
+            fall_through: false,
+        },
+        wide_xbar_slv_id_width: 3
+        narrow_xbar: {
+            max_slv_trans: 8,
+            max_mst_trans: 8,
+            fall_through: false,
+        },
+        narrow_xbar_slv_id_width: 4,
+        narrow_xbar_user_width: 5, // clog2(total number of clusters)
+        cfg_base_addr: 184549376, // 0x0b000000
+        cfg_base_offset: 65536 // 0x10000
+    },
+    cluster: {
+        name: "occamy_cluster"
+        boot_addr: 4096, // 0x1000
+        cluster_base_addr: 268435456, // 0x10000000
+        cluster_base_offset: 262144 // 0x40000
+        cluster_base_hartid: 1,
+        addr_width: 48,
+        data_width: 64,
+        user_width: 5, // clog2(total number of clusters)
+        tcdm: {
+            size: 128, // 128 kiB
+            banks: 32,
+        },
+        cluster_periph_size: 64, // kB
+        zero_mem_size: 64, // kB
+        dma_data_width: 512,
+        dma_user_width: 48, // same as addr_width
+        dma_axi_req_fifo_depth: 24,
+        dma_req_fifo_depth: 8,
+        narrow_trans: 4,
+        wide_trans: 32,
+        // We don't need Snitch debugging in Occamy
+        enable_debug: false,
+        // We don't need Snitch (core-internal) virtual memory support
+        vm_support: false,
+        // Memory configuration inputs
+        sram_cfg_expose: true,
+        sram_cfg_fields: {
+            ema: 3,
+            emaw: 2,
+            emas: 1
+        },
+        // Timing parameters
+        timing: {
+            lat_comp_fp32: 2,
+            lat_comp_fp64: 3,
+            lat_comp_fp16: 1,
+            lat_comp_fp16_alt: 1,
+            lat_comp_fp8: 1,
+            lat_comp_fp8_alt: 1,
+            lat_noncomp: 1,
+            lat_conv: 2,
+            lat_sdotp: 3,
+            fpu_pipe_config: "BEFORE"
+            narrow_xbar_latency: "CUT_ALL_PORTS",
+            wide_xbar_latency: "CUT_ALL_PORTS",
+            // Isolate the core.
+            register_core_req: true,
+            register_core_rsp: true,
+            register_offload_req: true,
+            register_offload_rsp: true,
+            register_fpu_req: true,
+            register_ext_narrow: false,
+            register_ext_wide: false
+        },
+        hives: [
+            // Hive 0
+            {
+                icache: {
+                    size: 8, // total instruction cache size in kByte
+                    sets: 2, // number of ways
+                    cacheline: 256 // word size in bits
+                },
+                cores: [
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/dma_core_template" },
+                ]
+            }
+        ],
+    }
+    // Templates.
+    compute_core_template: {
+        isa: "rv32imafd",
+        xssr: true,
+        xfrep: true,
+        xdma: false,
+        xf16: true,
+        xf16alt: true,
+        xf8: true,
+        xf8alt: true,
+        xfdotp: true,
+        xfvec: true,
+        ssr_nr_credits: 4,
+        num_int_outstanding_loads: 1,
+        num_int_outstanding_mem: 4,
+        num_fp_outstanding_loads: 4,
+        num_fp_outstanding_mem: 4,
+        num_sequencer_instructions: 16,
+        num_dtlb_entries: 1,
+        num_itlb_entries: 1,
+        // SSSR configuration below
+        ssr_intersection: true,
+        ssr_intersection_triple: [0, 1, 2],
+        ssrs: [
+            {indirection: true}, // Master 0
+            {indirection: true}, // Master 1
+            {}, // Slave
+        ],
+    },
+    dma_core_template: {
+        isa: "rv32imafd",
+        // Xdiv_sqrt: true,
+        # isa: "rv32ema",
+        xdma: true
+        xssr: false
+        xfrep: false
+        xf16: false,
+        xf16alt: false,
+        xf8: false,
+        xf8alt: false,
+        xfdotp: false,
+        xfvec: false,
+        num_int_outstanding_loads: 1,
+        num_int_outstanding_mem: 4,
+        num_fp_outstanding_loads: 4,
+        num_fp_outstanding_mem: 4,
+        num_sequencer_instructions: 16,
+        num_dtlb_entries: 1,
+        num_itlb_entries: 1,
+    }
+    // peripherals
+    peripherals: {
+        rom: {
+            address: 16777216, // 0x0100_0000
+            length: 131072, // 128 kiB 0x2_0000
+        },
+        clint: {
+            address: 67108864, // 0x0400_0000
+            length: 1048576, // 1 MiB 0x10_0000
+        },
+        axi_lite_peripherals: [
+            {
+                name: "debug",
+                address: 0, // 0x0000_0000
+                length: 4096, // 4 kiB 0x1000
+            }
+        ],
+        axi_lite_narrow_peripherals: [
+            {
+                name: "soc_ctrl",
+                address: 33554432, // 0x0200_0000
+                length: 4096, // 4 kiB 0x1000
+            },
+            {
+                name: "fll_system",
+                address: 33558528, // 0x0200_1000
+                length: 1024, // 1 kiB 0x400
+            },
+            {
+                name: "fll_periph",
+                address: 33559552, // 0x0200_1400
+                length: 1024, // 1 kiB 0x400
+            },
+            {
+                name: "fll_hbm2e",
+                address: 33560576, // 0x0200_1800
+                length: 1024, // 1 kiB 0x400
+            },
+            {
+                name: "uart",
+                address: 33562624, // 0x0200_2000
+                length: 4096, // 4 kiB 0x1000
+            },
+            {
+                name: "gpio",
+                address: 33566720, // 0x0200_3000
+                length: 4096, // 4 kiB 0x1000
+            },
+            {
+                name: "i2c",
+                address: 33570816, // 0x0200_4000
+                length: 4096, // 4 kiB 0x1000
+            },
+            {
+                name: "chip_ctrl",
+                address: 33574912, // 0x0200_5000
+                length: 4096, // 4 kiB 0x1000
+            },
+            {
+                name: "timer",
+                address: 33579008, // 0x0200_6000
+                length: 4096, // 4 kiB 0x1000
+            },
+            {
+                name: "hbm_xbar_cfg",
+                address: 33583104, // 0x0200_7000
+                length: 4096, // 4 kiB 0x1000
+            },
+            {
+                name: "spim",
+                address: 50331648, // 0x0300_0000
+                length: 131072, // 128 kiB 0x2_0000
+            },
+            {
+                name: "pcie_cfg",
+                address: 83886080, // 0x0500_0000
+                length: 131072, // 128 kiB 0x2_0000
+            },
+            {
+                name: "hbi_wide_cfg",
+                address: 100663296, // 0x0600_0000
+                length: 65536, // 64 kiB 0x1_0000
+            },
+            {
+                name: "hbi_narrow_cfg",
+                address: 117440512, // 0x0700_0000
+                length: 65536, // 64 kiB 0x1_0000
+            },
+            {
+                name: "plic",
+                address: 201326592, // 0x0C00_0000
+                length: 67108864, // 64 MiB 0x400_0000
+            },
+        ],
+    },
+    // non-peripheral IPs
+    pcie: {
+        address_io: 536870912, // 0x2000_0000
+        address_mm: 1207959552, // 0x4800_0000
+        length: 671088640, // 640 MiB 0x2800_0000
+    },
+    spm_narrow: {
+        address: 1879048192, // 0x7000_0000
+        length: 524288, // 512 kiB 0x8_0000
+        # An uncached alias address space of the same length
+        uncached_alias: 1879572480, // 0x7008_0000
+    },
+    spm_wide: {
+        address: 1895825408, // 0x7100_0000
+        length: 1048576, // 1 MiB 0x10_0000
+    },
+    wide_zero_mem: {
+        address: 4294967296, // 0x1_0000_0000
+        length: 8589934592, // 8 GiB 0x2_0000_0000
+    },
+    sys_idma_cfg: {
+        address: 285212672, // 0x1100_0000
+        length: 65536, // 64 kiB 0x1_0000
+    },
+    hbi: {
+        address: 1099511627776, // 0x100_0000_0000
+        length: 1099511627776, // 1 TiB 0x100_0000_0000
+    }
+    hbm: {
+        address_0: 2147483648, // 0x8000_0000
+        address_1: 68719476736, // 0x10_0000_0000
+        channel_size: 1073741824, // 1 GiB 0x4000_0000
+        nr_channels_total: 8,
+        nr_channels_address_0: 2,
+        cfg_regions: {
+            top: {
+                address: 134217728, // 0x0800_0000
+                length: 4194304, // 4 MiB 0x40_0000
+            },
+            phy: {
+                address: 150994944 // 0x0900_0000
+                length: 1048576, // 1 MiB 0x10_0000
+            },
+            seq: {
+                address: 167772160, // 0x0A00_0000
+                length: 65536, // 64 kiB 0x1_0000
+            },
+            ctrl: {
+                address: 176160768, // 0x0A80_0000
+                length: 65536, // 64 kiB 0x1_0000
+            }
+        }
+    },
+    // dram corresponds to 'hbm address_0' and 'nr_channels_address_0'
+    dram: {
+        address: 2147483648, // 0x8000_0000
+        length: 2147483648, // 2 GiB 0x8000_0000
+    },
+}
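
The `M-Q8C4` name encodes the topology this configuration enables multicast for: 8 quadrants (Q8) of 4 clusters each (C4), 32 clusters in total. The recurring `// clog2(total number of clusters)` comments can be sanity-checked with a few lines of C; `clog2`, `nr_quadrants` and `nr_clusters_per_quad` below are illustrative stand-ins mirroring the `clog2()` helper added to `occamygen.py` later in this patch, not part of the patch itself:

    #include <stdio.h>

    /* Ceiling of log2, mirroring the clog2() helper added to occamygen.py. */
    static int clog2(unsigned x) {
        int bits = 0;
        for (unsigned v = x - 1; v > 0; v >>= 1) bits++;
        return bits;
    }

    int main(void) {
        const int nr_quadrants = 8;         /* nr_s1_quadrant */
        const int nr_clusters_per_quad = 4; /* s1_quadrant.nr_clusters */
        int total_clusters = nr_quadrants * nr_clusters_per_quad; /* 32 */
        /* clog2(32) = 5, matching narrow_xbar_user_width and cluster.user_width */
        printf("user_width = %d\n", clog2(total_clusters));
        return 0;
    }
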
diff --git a/target/sim/cfg/full.hjson b/target/sim/cfg/full.hjson
index 4c042f913..8766fd184 100644
--- a/target/sim/cfg/full.hjson
+++ b/target/sim/cfg/full.hjson
@@ -6,6 +6,7 @@
 {
     is_remote_quadrant: false,
     remote_quadrants: [],
+    enable_multicast: false,
     quadrant_pre_xbar: {
         max_slv_trans: 64,
         max_mst_trans: 64,
diff --git a/target/sim/cfg/single-cluster.hjson b/target/sim/cfg/single-cluster.hjson
index 5c19ee054..418f2cf0e 100644
--- a/target/sim/cfg/single-cluster.hjson
+++ b/target/sim/cfg/single-cluster.hjson
@@ -6,6 +6,7 @@
 {
     is_remote_quadrant: false,
     remote_quadrants: [],
+    enable_multicast: false,
     quadrant_pre_xbar: {
         max_slv_trans: 64,
         max_mst_trans: 64,
diff --git a/target/sim/sw/device/apps/blas/gemm/params.hjson b/target/sim/sw/device/apps/blas/gemm/params.hjson
new file mode 100644
index 000000000..0f010cd3e
--- /dev/null
+++ b/target/sim/sw/device/apps/blas/gemm/params.hjson
@@ -0,0 +1,16 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// Parameters for a GEMM
+
+{
+    M: 256,
+    N: 16,
+    K: 16,
+    beta: 0,
+    ta: false,
+    tb: true, // must be true for SIMD
+    prec: 64,
+    expand: 0
+}
diff --git a/target/sim/sw/host/apps/offload/src/offload.c b/target/sim/sw/host/apps/offload/src/offload.c
index 2b1fc9751..e504a9f1c 100644
--- a/target/sim/sw/host/apps/offload/src/offload.c
+++ b/target/sim/sw/host/apps/offload/src/offload.c
@@ -20,7 +20,7 @@ int main() {
     asm volatile("" ::: "memory");
 
     // Start Snitches
-    wakeup_snitches_cl();
+    wakeup_snitches();
 
     // Wait for job done and return Snitch exit code
     return wait_snitches_done();
diff --git a/target/sim/sw/host/runtime/host.c b/target/sim/sw/host/runtime/host.c
index 68410bc6a..be81c3953 100644
--- a/target/sim/sw/host/runtime/host.c
+++ b/target/sim/sw/host/runtime/host.c
@@ -198,24 +198,17 @@ static inline void wakeup_cluster(uint32_t cluster_id) {
     *(cluster_clint_set_ptr(cluster_id)) = 511;
 }
 
-/**
- * @brief Wake-up Snitches
- *
- * @detail All Snitches are "parked" in a WFI. A SW interrupt
- *         must be issued to "unpark" every Snitch. This function
- *         sends a SW interrupt to all Snitches.
- */
-void wakeup_snitches() {
-    for (int i = 0; i < N_CLUSTERS; i++) set_sw_interrupt(i);
-}
-
 /**
  * @brief Wake-up Snitches
  *
  * @detail Send a cluster interrupt to all Snitches
  */
-static inline void wakeup_snitches_cl() {
+static inline void wakeup_snitches() {
+#if defined(SUPPORTS_MULTICAST) && defined(USE_MULTICAST)
+    multicast_to_clusters(cluster_clint_set_addr(0), 511);
+#else
     for (int i = 0; i < N_CLUSTERS; i++) wakeup_cluster(i);
+#endif
 }
 
 /**
@@ -615,3 +608,26 @@ void deactivate_interleaved_mode_hbm() {
         OCCAMY_HBM_XBAR_INTERLEAVED_ENA_REG_OFFSET + HBM_XBAR_CFG_BASE_ADDR;
     *((volatile uint32_t*)addr) = 1;
 }
+
+//===============================================================
+// CVA6 extensions
+//===============================================================
+
+static inline void multicast(uint64_t addr, uint64_t mask, uint64_t value) {
+    enable_multicast(mask);
+    uint64_t* p = (uint64_t*)addr;
+    *p = value;
+    disable_multicast();
+}
+
+static inline void enable_multicast(uint64_t mask) {
+    asm volatile("csrw 0x7c0, %[mask]\n" : : [ mask ] "r"(mask) : "memory");
+}
+
+static inline void disable_multicast() {
+    asm volatile("csrw 0x7c0, 0" : : : "memory");
+}
+
+static inline void multicast_to_clusters(uint64_t addr, uint64_t value) {
+    multicast(addr, CLUSTER_BCAST_MASK, value);
+}
diff --git a/target/sim/sw/host/runtime/host.h b/target/sim/sw/host/runtime/host.h
index 55e1623ab..c110ed0e9 100644
--- a/target/sim/sw/host/runtime/host.h
+++ b/target/sim/sw/host/runtime/host.h
@@ -5,12 +5,20 @@
 #include <stdbool.h>
 #include <stdint.h>
 
-static inline void set_sw_interrupt(uint32_t hartid);
+#define CLUSTER_BCAST_MASK ((N_CLUSTERS - 1) << 18)
 
 void delay_ns(uint64_t delay);
 
 static inline volatile uint32_t* get_shared_lock();
 
+static inline void set_sw_interrupt(uint32_t hartid);
+
 static inline void wait_sw_interrupt();
 
-static inline void clear_sw_interrupt(uint32_t hartid);
+static inline void enable_multicast(uint64_t mask);
+
+static inline void disable_multicast();
+
+static inline void multicast(uint64_t addr, uint64_t mask, uint64_t value);
+
+static inline void multicast_to_clusters(uint64_t addr, uint64_t value);
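
`CLUSTER_BCAST_MASK` ties directly to the address layout in the configuration: `cluster_base_offset` is 262144 = 0x40000 = 1 << 18, so clusters occupy consecutive 0x40000-byte apertures and the address bits from bit 18 upward select the cluster. Each set bit of the mask marks an address bit the interconnect may vary when replicating a write. A hedged sketch of the arithmetic, assuming `N_CLUSTERS` = 32 as in M-Q8C4 (the `main` is illustrative only):

    #include <assert.h>
    #include <stdint.h>

    #define N_CLUSTERS 32                               /* 8 quadrants x 4 clusters */
    #define CLUSTER_BCAST_MASK ((N_CLUSTERS - 1) << 18) /* as in host.h */

    int main(void) {
        /* Bits [22:18] enumerate clusters 0..31: (32 - 1) << 18 = 0x007C0000 */
        assert(CLUSTER_BCAST_MASK == 0x7C0000);
        /* The runtime then brackets a single 64-bit store with CSR writes:
         *   csrw 0x7c0, mask   ; enable_multicast(mask)
         *   sd   value, (addr) ; one store, fanned out by the mcast xbars
         *   csrw 0x7c0, 0      ; disable_multicast()
         */
        return 0;
    }
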
diff --git a/target/sim/sw/shared/platform/generated/occamy_cfg.h.tpl b/target/sim/sw/shared/platform/generated/occamy_cfg.h.tpl
index 1787e7110..48b155fd7 100644
--- a/target/sim/sw/shared/platform/generated/occamy_cfg.h.tpl
+++ b/target/sim/sw/shared/platform/generated/occamy_cfg.h.tpl
@@ -4,4 +4,7 @@
 #define N_QUADS ${cfg['nr_s1_quadrant']}
 #define N_CLUSTERS_PER_QUAD ${cfg['s1_quadrant']['nr_clusters']}
-#define N_CORES_PER_CLUSTER ${cfg['cluster']['nr_cores']}
\ No newline at end of file
+#define N_CORES_PER_CLUSTER ${cfg['cluster']['nr_cores']}
+% if cfg['enable_multicast']:
+#define SUPPORTS_MULTICAST
+% endif
diff --git a/util/occamygen/occamy.py b/util/occamygen/occamy.py
index 73a3e748f..2b699f10c 100644
--- a/util/occamygen/occamy.py
+++ b/util/occamygen/occamy.py
@@ -57,6 +57,7 @@ def __init__(self, cfg):
         self.cluster.cfg["boot_addr"] = self.cfg["peripherals"]["rom"]["address"]
 
         self.cluster.cfg['tie_ports'] = False
+        self.cluster.cfg['enable_multicast'] = cfg['enable_multicast']
 
         if "ro_cache_cfg" in self.cfg["s1_quadrant"]:
             ro_cache = self.cfg["s1_quadrant"]["ro_cache_cfg"]
diff --git a/util/occamygen/occamygen.py b/util/occamygen/occamygen.py
index dbb48f147..39016188d 100755
--- a/util/occamygen/occamygen.py
+++ b/util/occamygen/occamygen.py
@@ -27,6 +27,13 @@
 DEFAULT_NAME = "occamy"
 
 
+def clog2(x):
+    """Ceiling of log2"""
+    if x <= 0:
+        raise ValueError("domain error")
+    return (x-1).bit_length()
+
+
 def write_template(tpl_path, outdir, fname=None, **kwargs):
     if tpl_path:
         tpl_path = pathlib.Path(tpl_path).absolute()
@@ -136,6 +143,8 @@ def main():
    nr_s1_quadrants = occamy.cfg["nr_s1_quadrant"]
    nr_s1_clusters = occamy.cfg["s1_quadrant"]["nr_clusters"]
    is_remote_quadrant = occamy.cfg["is_remote_quadrant"]
+   enable_narrow_multicast = occamy.cfg["enable_multicast"]
+   enable_wide_multicast = occamy.cfg["enable_multicast"]
 
    # Iterate over Hives to get the number of cores.
    nr_cluster_cores = len([
        core for hive in occamy.cfg["cluster"]["hives"]
@@ -564,12 +573,15 @@ def main():
            no_loopback=True,
            atop_support=False,
            context="soc",
-           node=am_quadrant_pre_xbar[i])
+           node=am_quadrant_pre_xbar[i],
+           forward_mcast=enable_wide_multicast)
 
        # Default port:
-       quadrant_pre_xbar.add_output_entry("quadrant_inter_xbar", am_quadrant_inter_xbar)
+       quadrant_pre_xbar.add_output_entry("quadrant_inter_xbar",
+                                          am_quadrant_inter_xbar,
+                                          forward_mcast=enable_wide_multicast)
        quadrant_pre_xbar.add_output_entry("hbm_xbar", am_hbm_xbar)
-       quadrant_pre_xbar.add_input("quadrant")
+       quadrant_pre_xbar.add_input("quadrant", is_mcast_master=enable_wide_multicast)
 
        quadrant_pre_xbars.append(quadrant_pre_xbar)
 
@@ -588,17 +600,18 @@ def main():
        no_loopback=True,
        atop_support=False,
        context="soc",
-       node=am_quadrant_inter_xbar)
+       node=am_quadrant_inter_xbar,
+       enable_multicast=enable_wide_multicast)
 
-   # Default port: soc wide xbar
-   quadrant_inter_xbar.add_output_entry("wide_xbar", am_soc_wide_xbar)
-   quadrant_inter_xbar.add_input("wide_xbar")
    for i in range(nr_s1_quadrants):
        # Default route passes HBI through quadrant 0
        # --> mask this route, forcing it through default wide xbar
        quadrant_inter_xbar.add_output_entry("quadrant_{}".format(i),
-                                            am_wide_xbar_quadrant_s1[i])
-       quadrant_inter_xbar.add_input("quadrant_{}".format(i))
+                                            am_wide_xbar_quadrant_s1[i],
+                                            is_mcast_target=enable_wide_multicast,
+                                            forward_mcast=enable_wide_multicast)
+       quadrant_inter_xbar.add_input("quadrant_{}".format(i),
+                                     is_mcast_master=enable_wide_multicast)
    for i, rq in enumerate(occamy.cfg["remote_quadrants"]):
        quadrant_inter_xbar.add_input("rmq_{}".format(i))
        quadrant_inter_xbar.add_output_entry("rmq_{}".format(i), am_remote_quadrants[i])
@@ -607,6 +620,9 @@ def main():
        quadrant_inter_xbar.add_output("remote", [])
        quadrant_inter_xbar.add_input("remote")
 
+   quadrant_inter_xbar.add_output_entry("wide_xbar", am_soc_wide_xbar)
+   quadrant_inter_xbar.add_input("wide_xbar")
+
    hbm_xbar = solder.AxiXbar(
        48,
        512,
@@ -675,17 +691,20 @@ def main():
        fall_through=occamy.cfg["narrow_xbar"]["fall_through"],
        no_loopback=True,
        context="soc",
-       node=am_soc_narrow_xbar)
+       node=am_soc_narrow_xbar,
+       enable_multicast=enable_narrow_multicast)
 
    for i in range(nr_s1_quadrants):
        soc_narrow_xbar.add_output_symbolic_multi("s1_quadrant_{}".format(i),
-                                                 [("s1_quadrant_base_addr",
+                                                 [(f"s1_quadrant_base_addr[{i}]",
                                                    "S1QuadrantAddressSpace"),
-                                                  ("s1_quadrant_cfg_base_addr",
-                                                   "S1QuadrantCfgAddressSpace")])
+                                                  (f"s1_quadrant_cfg_base_addr[{i}]",
+                                                   "S1QuadrantCfgAddressSpace")],
+                                                 is_mcast_target=enable_narrow_multicast,
+                                                 forward_mcast=enable_narrow_multicast)
        soc_narrow_xbar.add_input("s1_quadrant_{}".format(i))
 
-   soc_narrow_xbar.add_input("cva6")
+   soc_narrow_xbar.add_input("cva6", is_mcast_master=enable_narrow_multicast)
    soc_narrow_xbar.add_input("soc_wide")
    soc_narrow_xbar.add_input("periph")
    soc_narrow_xbar.add_input("pcie")
@@ -716,9 +735,10 @@ def main():
    # We need 3 "crossbars", which are really simple muxes and demuxes
    quadrant_s1_ctrl_xbars = dict()
-   for name, (iw, lm) in {
-       'soc_to_quad': (soc_narrow_xbar.iw_out(), "axi_pkg::CUT_SLV_PORTS"),
-       'quad_to_soc': (soc_narrow_xbar.iw, "axi_pkg::CUT_MST_PORTS"),
+   for name, (iw, lm, forward_mcast) in {
+       'soc_to_quad': (soc_narrow_xbar.iw_out(), "axi_pkg::CUT_SLV_PORTS",
+                       enable_narrow_multicast),
+       'quad_to_soc': (soc_narrow_xbar.iw, "axi_pkg::CUT_MST_PORTS", False),
    }.items():
        # Reuse (preserve) narrow Xbar IDs and max transactions
        quadrant_s1_ctrl_xbars[name] = solder.AxiXbar(
@@ -733,13 +753,15 @@ def main():
            max_mst_trans=occamy.cfg["narrow_xbar"]["max_mst_trans"],
            fall_through=occamy.cfg["narrow_xbar"]["fall_through"],
            latency_mode=lm,
-           context="quadrant_s1_ctrl")
+           context="quadrant_s1_ctrl",
+           forward_mcast=forward_mcast,
+           enable_default_mst_port=True,
+           default_mst_port_idx=0)
 
-   for name in ['soc_to_quad', 'quad_to_soc']:
-       quadrant_s1_ctrl_xbars[name].add_output("out", [])
-       quadrant_s1_ctrl_xbars[name].add_input("in")
+       quadrant_s1_ctrl_xbars[name].add_output("out", [], forward_mcast=forward_mcast)
+       quadrant_s1_ctrl_xbars[name].add_input("in", is_mcast_master=forward_mcast)
        quadrant_s1_ctrl_xbars[name].add_output_symbolic("internal",
-                                                        "internal_xbar_base_addr",
+                                                        "internal_xbar_base_addr[0]",
                                                         "S1QuadrantCfgAddressSpace")
 
    # AXI Lite mux to combine register requests
@@ -775,7 +797,10 @@ def main():
        no_loopback=True,
        atop_support=False,
        context="quadrant_s1",
-       node=am_wide_xbar_quadrant_s1[0])
+       node=am_wide_xbar_quadrant_s1[0],
+       enable_multicast=enable_wide_multicast,
+       enable_default_mst_port=True,
+       default_mst_port_idx=nr_s1_clusters)
 
    narrow_xbar_quadrant_s1 = solder.AxiXbar(
        48,
@@ -791,25 +816,37 @@ def main():
        ["max_mst_trans"],
        fall_through=occamy.cfg["s1_quadrant"]["narrow_xbar"]["fall_through"],
        no_loopback=True,
-       context="quadrant_s1")
-
-   wide_xbar_quadrant_s1.add_output("top", [])
-   wide_xbar_quadrant_s1.add_input("top")
-
-   narrow_xbar_quadrant_s1.add_output("top", [])
-   narrow_xbar_quadrant_s1.add_input("top")
+       context="quadrant_s1",
+       enable_multicast=enable_narrow_multicast,
+       enable_default_mst_port=True,
+       default_mst_port_idx=nr_s1_clusters)
 
    for i in range(nr_s1_clusters):
        wide_xbar_quadrant_s1.add_output_symbolic("cluster_{}".format(i),
-                                                 "cluster_base_addr",
-                                                 "ClusterAddressSpace")
+                                                 f"cluster_base_addr[{i}]",
+                                                 "ClusterAddressSpace",
+                                                 is_mcast_target=enable_wide_multicast,
+                                                 forward_mcast=enable_wide_multicast)
+       wide_xbar_quadrant_s1.add_input("cluster_{}".format(i),
+                                       is_mcast_master=enable_wide_multicast)
 
-       wide_xbar_quadrant_s1.add_input("cluster_{}".format(i))
        narrow_xbar_quadrant_s1.add_output_symbolic("cluster_{}".format(i),
-                                                   "cluster_base_addr",
-                                                   "ClusterAddressSpace")
+                                                   f"cluster_base_addr[{i}]",
+                                                   "ClusterAddressSpace",
+                                                   is_mcast_target=enable_narrow_multicast,
+                                                   forward_mcast=False)
        narrow_xbar_quadrant_s1.add_input("cluster_{}".format(i))
 
+   wide_xbar_quadrant_s1.add_input("top", is_mcast_master=enable_wide_multicast)
+   wide_xbar_quadrant_s1.add_output("top", [],
+                                    is_mcast_target=enable_wide_multicast,
+                                    forward_mcast=enable_wide_multicast)
+
+   narrow_xbar_quadrant_s1.add_input("top", is_mcast_master=enable_narrow_multicast)
+   narrow_xbar_quadrant_s1.add_output("top", [],
+                                      is_mcast_target=enable_narrow_multicast,
+                                      forward_mcast=False)
+
    # remote downstream mux
    rmq_mux = [None]*max(nr_remote_quadrants, 1 if is_remote_quadrant else 0)
    rmq_demux = [None]*max(nr_remote_quadrants, 1 if is_remote_quadrant else 0)
@@ -855,6 +892,10 @@ def main():
        dw=soc_axi_lite_narrow_periph_xbar.dw,
        name="apb_hbm_cfg")
 
+   ###########
+   # CodeGen #
+   ###########
+
    kwargs = {
        "solder": solder,
        "util": util,
@@ -902,20 +943,13 @@ def main():
    ###########################
    # SoC (fully synchronous) #
    ###########################
+
    write_template(args.soc_sv,
                   outdir,
                   module=solder.code_module['soc'],
                   soc_periph_xbar=soc_axi_lite_periph_xbar,
                   **kwargs)
 
-   ##########################
-   # S1 Quadrant controller #
-   ##########################
-   write_template(args.quadrant_s1_ctrl,
-                  outdir,
-                  module=solder.code_module['quadrant_s1_ctrl'],
-                  **kwargs)
-
    ###############
    # S1 Quadrant #
    ###############
@@ -934,6 +968,14 @@ def main():
        with open("{}/{}_quadrant_s1.sv".format(outdir, args.name), 'w') as f:
            f.write("// no quadrants in this design")
 
+   ##########################
+   # S1 Quadrant controller #
+   ##########################
+   write_template(args.quadrant_s1_ctrl,
+                  outdir,
+                  module=solder.code_module['quadrant_s1_ctrl'],
+                  **kwargs)
+
    ##################
    # Xilinx Wrapper #
    ##################
diff --git a/util/solder/solder.axi_lite_xbar.sv.tpl b/util/solder/solder.axi_lite_xbar.sv.tpl
index 5d623bdac..f1ec1aca0 100644
--- a/util/solder/solder.axi_lite_xbar.sv.tpl
+++ b/util/solder/solder.axi_lite_xbar.sv.tpl
@@ -44,7 +44,8 @@ localparam axi_pkg::xbar_cfg_t ${cfg_name} = '{
   UniqueIds: 0,
   AxiAddrWidth: ${xbar.aw},
   AxiDataWidth: ${xbar.dw},
-  NoAddrRules: ${xbar.addr_map_len()}
+  NoAddrRules: ${xbar.addr_map_len()},
+  default: '0
 };
 
 // AXI plugs of the `${xbar.name}` crossbar.
diff --git a/util/solder/solder.py b/util/solder/solder.py
index a5bb5997b..f293eeb1d 100644
--- a/util/solder/solder.py
+++ b/util/solder/solder.py
@@ -8,6 +8,8 @@
 import math
 import pathlib
 import logging
+import operator
+from termcolor import cprint
 
 from copy import copy
 from mako.lookup import TemplateLookup
@@ -321,17 +323,33 @@ def __init__(self):
 class AxiStruct:
     configs = dict()
 
-    def emit(aw, dw, iw, uw):
+    def emit(aw, dw, iw, uw, enable_multicast=False):
         global code_package
-        key = (aw, dw, iw, uw)
+        key = (aw, dw, iw, uw, enable_multicast)
+        # Skip emission if struct was already emitted. Ensures the
+        # same type is not defined multiple times
         if key in AxiStruct.configs:
             return AxiStruct.configs[key]
-        name = "axi_a{}_d{}_i{}_u{}".format(*key)
-        code = "// AXI bus with {} bit address, {} bit data, {} bit IDs, \
-            and {} bit user data.\n".format(*key)
+        if enable_multicast:
+            name = "axi_a{}_d{}_i{}_u{}_mcast".format(*key[0:-1])
+        else:
+            name = "axi_a{}_d{}_i{}_u{}".format(*key[0:-1])
+        code = ("// AXI bus with {} bit address, {} bit data, {} bit IDs, "
+                "and {} bit user data.\n").format(*key[0:-1])
         code += "`AXI_TYPEDEF_ALL_CT({}, {}_req_t, {}_resp_t, ".format(name, name, name)
-        code += "logic [{}:0], logic [{}:0], logic [{}:0], logic [{}:0], logic [{}:0])\n".format(
-            aw - 1, iw - 1, dw - 1, (dw + 7) // 8 - 1, max(0, uw - 1))
+        code += f"logic [{aw - 1}:0], "
+        code += f"logic [{iw - 1}:0], "
+        code += f"logic [{dw - 1}:0], "
+        code += f"logic [{(dw + 7) // 8 - 1}:0], "
+        if enable_multicast:
+            user_t = "struct packed {"
+            user_t += f"logic [{max(0, aw - 1)}:0] mcast;"
+            if uw > aw:
+                user_t += f" logic [{max(0, uw - aw - 1)}:0] atomics_id;"
+            user_t += "}"
+        else:
+            user_t = f"logic [{max(0, uw - 1)}:0]"
+        code += f"{user_t})\n"
         code_package += "\n" + code
         AxiStruct.configs[key] = name
         return name
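
When `enable_multicast` is set, the generated user field is no longer a plain `logic` vector but a packed struct carrying the multicast mask plus the original user bits. For `aw` = 48 and a widened user width of 53 (the original `uw` = 5 plus `aw`), the emitted SystemVerilog type is `struct packed { logic [47:0] mcast; logic [4:0] atomics_id; }`. A rough C model of that 53-bit payload follows; C bit-field layout is compiler-dependent, so this is illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    /* Approximate C model of the generated packed user struct. */
    typedef struct {
        uint64_t mcast : 48;    /* multicast address mask */
        uint8_t atomics_id : 5; /* original user bits, e.g. an atomics ID */
    } axi_user_mcast_t;

    int main(void) {
        axi_user_mcast_t u = { .mcast = (31ull << 18), .atomics_id = 3 };
        printf("mcast=0x%llx id=%u\n", (unsigned long long)u.mcast, u.atomics_id);
        return 0;
    }
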
@@ -681,8 +699,8 @@ def change_uw(self, context, target_uw, name, inst_name=None, to=None):
             assgn = "// Change UW\n"
             assgn += "`AXI_ASSIGN_REQ_STRUCT({lhs},{rhs})\n".format(lhs=bus.req_name(),
                                                                     rhs=self.req_name())
-            assgn += "`AXI_ASSIGN_RESP_STRUCT({lhs},{rhs})\n".format(lhs=self.rsp_name(),
-                                                                     rhs=bus.rsp_name())
+            assgn += "`AXI_ASSIGN_RESP_STRUCT({lhs},{rhs})\n\n".format(lhs=self.rsp_name(),
+                                                                       rhs=bus.rsp_name())
             context.write(assgn)
         return bus
@@ -1458,6 +1476,10 @@ def __init__(self,
                  atop_support=True,
                  latency_mode=None,
                  interleaved_ena=False,
+                 enable_multicast=False,
+                 forward_mcast=False,
+                 enable_default_mst_port=False,
+                 default_mst_port_idx=0,
                  **kwargs):
         super().__init__(**kwargs)
         self.aw = aw
@@ -1468,50 +1490,111 @@ def __init__(self,
         self.max_mst_trans = max_mst_trans
         self.fall_through = fall_through
         self.no_loopback = no_loopback
-        self.symbolic_addrmap = list()
-        self.symbolic_addrmap_multi = list()
         self.atop_support = atop_support
         self.interleaved_ena = interleaved_ena
+        self.enable_multicast = enable_multicast
+        if self.enable_multicast:
+            self.forward_mcast = True
+        else:
+            self.forward_mcast = forward_mcast
+        self.enable_default_mst_port = enable_default_mst_port
+        self.default_mst_port_idx = default_mst_port_idx
         self.addrmap = list()
         self.connections = dict()
         self.latency_mode = latency_mode or "axi_pkg::CUT_ALL_PORTS"
 
-    def add_input(self, name, outputs=None):
-        self.inputs.append(name)
+    def add_input(self, name, is_mcast_master=False, outputs=None):
+        self.inputs.append({'name': name, 'is_mcast_master': is_mcast_master})
         if outputs:
             self.connections[name] = outputs
 
-    def add_output(self, name, addrs, default=False):
+    def add_output(self, name, addrs, is_mcast_target=False, forward_mcast=False):
         idx = len(self.outputs)
         for lo, hi in addrs:
             if hi >> self.aw == 1:
                 hi -= 1
-            self.addrmap.append((idx, lo, hi))
-        self.outputs.append(name)
-
-    def add_output_symbolic(self, name, base, length):
+            self.addrmap.append({'idx': idx,
+                                 'is_symbolic': False,
+                                 'is_mcast_rule': is_mcast_target,
+                                 'lo': lo,
+                                 'hi': hi})
+        self.outputs.append({'name': name,
+                             'is_mcast_target': is_mcast_target,
+                             'forward_mcast': forward_mcast})
+
+    def add_output_symbolic(self, name, base, length, is_mcast_target=False, forward_mcast=False):
         idx = len(self.outputs)
-        self.symbolic_addrmap.append((idx, base, length))
-        self.outputs.append(name)
-
-    def add_output_symbolic_multi(self, name, entries):
+        self.addrmap.append({'idx': idx,
+                             'is_symbolic': True,
+                             'is_mcast_rule': is_mcast_target,
+                             'base': base,
+                             'length': length})
+        self.outputs.append({'name': name,
+                             'is_mcast_target': is_mcast_target,
+                             'forward_mcast': forward_mcast})
+
+    def add_output_symbolic_multi(self, name, entries, is_mcast_target=False, forward_mcast=False):
         idx = len(self.outputs)
-        self.symbolic_addrmap_multi.append((idx, entries))
-        self.outputs.append(name)
-
-    def add_output_entry(self, name, entry, range_mask=None):
+        for base, length in entries:
+            self.addrmap.append({'idx': idx,
+                                 'is_symbolic': True,
+                                 'is_mcast_rule': is_mcast_target,
+                                 'base': base,
+                                 'length': length})
+        self.outputs.append({'name': name,
+                             'is_mcast_target': is_mcast_target,
+                             'forward_mcast': forward_mcast})
+
+    def add_output_entry(self, name, entry, range_mask=None, is_mcast_target=False,
+                         forward_mcast=False):
         addrs = [(r.lo, r.hi) for r in self.node.get_routes() if r.port == entry]
         if range_mask is not None:
             addrs = filter(lambda r: r[0] >= range_mask[0] and r[1] < range_mask[1], addrs)
-        self.add_output(name, addrs)
+        self.add_output(name, addrs, is_mcast_target=is_mcast_target, forward_mcast=forward_mcast)
 
     def addr_map_len(self):
-        return len(self.addrmap) + len(self.symbolic_addrmap) + sum(
-            len(am[1]) for am in self.symbolic_addrmap_multi)
+        return len(self.addrmap)
+
+    def num_mcast_rules(self):
+        if self.enable_multicast:
+            return len([rule for rule in self.addrmap if rule['is_mcast_rule']])
+        else:
+            return 0
+
+    def num_mcast_ports(self):
+        if self.enable_multicast:
+            return len([output for output in self.outputs if output['is_mcast_target']])
+        else:
+            return 0
 
     def iw_out(self):
         return self.iw + int(math.ceil(math.log2(max(1, len(self.inputs)))))
+    def union_multicast_rules(self):
+        # Issue warning to manually check rules are ordered and contiguous
+        cprint(f"Warning: please ensure manually that the following rules for the {self.name} XBAR"
+               " are contiguous and ordered (lower addresses first).", color="yellow")
+        for rule in self.addrmap[0:self.num_mcast_rules()]:
+            if rule['is_symbolic']:
+                cprint(f"{self.outputs[rule['idx']]['name']} [{rule['base']},"
+                       f" {rule['base']} + {rule['length']}]", color="yellow")
+            else:
+                cprint(f"{self.outputs[rule['idx']]['name']} [{rule['lo']}, {rule['hi']}]",
+                       color="yellow")
+        # Get start address of the first multicast rule in the addrmap
+        first_rule = self.addrmap[0]
+        if first_rule['is_symbolic']:
+            start_addr = first_rule['base']
+        else:
+            start_addr = first_rule['lo']
+        # Get end address of the last multicast rule in the addrmap
+        last_rule = self.addrmap[self.num_mcast_rules()-1]
+        if last_rule['is_symbolic']:
+            end_addr = last_rule['base'] + ' + ' + last_rule['length']
+        else:
+            end_addr = last_rule['hi']
+        return start_addr, end_addr
+
     def emit(self):
         global code_module
         global code_package
@@ -1522,14 +1605,35 @@ def emit(self):
         # Compute the ID widths.
         iw_in = self.iw
         iw_out = self.iw_out()
+        # Compute the USER widths
+        uw = self.uw
+
+        # Multicast requirements
+        if self.enable_multicast:
+            # Check that multicast-targetable slaves are at lower indices.
+            # If a multicast-targetable slave follows a non multicast-targetable slave
+            # we have a violation of this rule.
+            violations = []
+            for i in range(len(self.outputs) - 1):
+                if not self.outputs[i]['is_mcast_target']:
+                    if self.outputs[i+1]['is_mcast_target']:
+                        violations.append(True)
+            assert (not violations), \
+                f'{self.name}: multicast-targetable slaves must be at lower indices'
+            # Sort address map rules by `is_mcast_rule` to ensure that
+            # multicast rules are at lower indices
+            self.addrmap.sort(key=operator.itemgetter('is_mcast_rule'), reverse=True)
+        if self.forward_mcast:
+            # Add multicast mask to USER signal
+            uw += self.aw
 
         # Emit the input enum into the package.
         input_enum_name = "{}_inputs_e".format(self.name)
         input_enum = "/// Inputs of the `{}` crossbar.\n".format(self.name)
         input_enum += "typedef enum int {\n"
         input_enums = list()
-        for name in self.inputs:
-            x = "{}_in_{}".format(self.name, name).upper()
+        for inp in self.inputs:
+            x = "{}_in_{}".format(self.name, inp['name']).upper()
             input_enums.append(x)
             input_enum += "  {},\n".format(x)
         input_enum += "  {}_NUM_INPUTS\n".format(self.name.upper())
@@ -1541,8 +1645,8 @@ def emit(self):
         output_enum = "/// Outputs of the `{}` crossbar.\n".format(self.name)
         output_enum += "typedef enum int {\n"
         output_enums = list()
-        for name in self.outputs:
-            x = "{}_out_{}".format(self.name, name).upper()
+        for output in self.outputs:
+            x = "{}_out_{}".format(self.name, output['name']).upper()
             output_enums.append(x)
             output_enum += "  {},\n".format(x)
         output_enum += "  {}_NUM_OUTPUTS\n".format(self.name.upper())
@@ -1568,7 +1672,10 @@ def emit(self):
         cfg += "  UniqueIds: {},\n".format(0)
         cfg += "  AxiAddrWidth: {},\n".format(self.aw)
         cfg += "  AxiDataWidth: {},\n".format(self.dw)
-        cfg += "  NoAddrRules: {}\n".format(self.addr_map_len())
+        cfg += "  NoAddrRules: {},\n".format(self.addr_map_len())
+        cfg += "  NoMulticastRules: {},\n".format(self.num_mcast_rules())
+        cfg += "  NoMulticastPorts: {},\n".format(self.num_mcast_ports())
+        cfg += "  default: '0\n"
         cfg += "};\n"
         code_package += "\n" + cfg
 
@@ -1581,36 +1688,68 @@ def emit(self):
                                                     self.addr_map_len() - 1, addrmap_name)
         addrmap += "assign {} = '{{\n".format(addrmap_name)
         addrmap_lines = []
-        for i in range(len(self.addrmap)):
-            addrmap_lines.append(
-                "  '{{ idx: {}, start_addr: {aw}'h{:08x}, end_addr: {aw}'h{:08x} }}".format(
-                    *self.addrmap[i], aw=self.aw))
-        for i, (idx, base, length) in enumerate(self.symbolic_addrmap):
-            addrmap_lines.append(
-                "  '{{ idx: {}, start_addr: {}[{i}], end_addr: {}[{i}] + {} }}".format(
-                    idx, base, base, length, i=i))
-        for i, (idx, entries) in enumerate(self.symbolic_addrmap_multi):
-            for base, length in entries:
-                addrmap_lines.append(
-                    "  '{{ idx: {}, start_addr: {}[{i}], end_addr: {}[{i}] + {} }}".format(
-                        idx, base, base, length, i=i))
+        # Invert order of rules in address map as lower indices come last in SystemVerilog
+        # array initializers
+        for rule in reversed(self.addrmap):
+            if rule['is_symbolic']:
+                line = "  '{{ idx: {}, start_addr: {}, end_addr: {} + {} }}".format(
+                    rule['idx'], rule['base'], rule['base'], rule['length'])
+            else:
+                line = "  '{{ idx: {}, start_addr: {aw}'h{:08x}, end_addr: {aw}'h{:08x} }}".format(
+                    rule['idx'], rule['lo'], rule['hi'], aw=self.aw)
+            addrmap_lines.append(line)
         addrmap += "{}\n}};\n".format(',\n'.join(addrmap_lines))
         code_module[self.context] += "\n" + addrmap
+        # Emit the default port definition
+        en_default_mst_port_i = f"'{int(self.enable_default_mst_port)}"
+        if self.enable_default_mst_port:
+            if self.enable_multicast:
+                union_start_addr, union_end_addr = self.union_multicast_rules()
+                default_port = "/// Default port of the `{}` crossbar.\n".format(self.name)
+                default_port += f"xbar_rule_{self.aw}_t {self.name}_default_port;\n"
+                default_port += f"assign {self.name}_default_port = '{{\n"
+                default_port += f"  idx: {self.default_mst_port_idx},\n"
+                default_port += f"  start_addr: {union_start_addr},\n"
+                default_port += f"  end_addr: {union_end_addr}\n"
+                default_port += "};\n"
+                code_module[self.context] += "\n" + default_port
+                default_mst_port_i = f"{{{len(self.inputs)}{{{self.name}_default_port}}}}"
+            else:
+                default_mst_port_idx_bits = (len(self.outputs)-1).bit_length()
+                default_port = f"{default_mst_port_idx_bits}" + \
+                               f"'b{self.default_mst_port_idx:b}"
+                default_mst_port_i = f"{{{len(self.inputs)}{{{default_port}}}}}"
+        else:
+            default_mst_port_i = "'0"
+
         # Emit the AXI structs into the package.
         self.input_struct = AxiStruct.emit(self.aw, self.dw, iw_in, self.uw)
         self.output_struct = AxiStruct.emit(self.aw, self.dw, iw_out, self.uw)
-
+        if self.forward_mcast:
+            self.input_struct_mcast = AxiStruct.emit(self.aw, self.dw, iw_in, uw,
+                                                     enable_multicast=True)
+            self.output_struct_mcast = AxiStruct.emit(self.aw, self.dw, iw_out, uw,
+                                                      enable_multicast=True)
+
+        # Rename (typedef) the generic AXI structs generated above to unique types
+        # for the input and output signals of this XBAR
         code_package += "\n"
         for tds in [
                 "req", "resp", "aw_chan", "w_chan", "b_chan", "ar_chan", "r_chan"
         ]:
+            if self.forward_mcast:
+                input_struct = self.input_struct_mcast
+                output_struct = self.output_struct_mcast
+            else:
+                input_struct = self.input_struct
+                output_struct = self.output_struct
             code_package += "typedef {}_{tds}_t {}_in_{tds}_t;\n".format(
-                self.input_struct, self.name, tds=tds)
+                input_struct, self.name, tds=tds)
             code_package += "typedef {}_{tds}_t {}_out_{tds}_t;\n".format(
-                self.output_struct, self.name, tds=tds)
+                output_struct, self.name, tds=tds)
 
         # Emit the characteristics of the AXI plugs into the package.
         code_package += "\n"
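
The default-port rule emitted above spans the union of all multicast rules, so any address not matched by an individual cluster rule still resolves to the default master port (index `nr_s1_clusters`, i.e. the `top` port, in the quadrant crossbars). A small sketch of that union, with made-up cluster apertures spaced 0x40000 apart:

    #include <stdint.h>
    #include <stdio.h>

    /* A minimal model of what union_multicast_rules() computes. */
    typedef struct { uint64_t lo, hi; } rule_t;

    int main(void) {
        rule_t rules[] = {
            { 0x10000000, 0x10040000 }, /* cluster_0 */
            { 0x10040000, 0x10080000 }, /* cluster_1 */
            { 0x10080000, 0x100C0000 }, /* cluster_2 */
            { 0x100C0000, 0x10100000 }, /* cluster_3 */
        };
        int n = sizeof(rules) / sizeof(rules[0]);
        /* Union = [first rule's start, last rule's end); anything outside
         * falls through to the default master port. */
        printf("default port rule: [0x%llx, 0x%llx)\n",
               (unsigned long long)rules[0].lo, (unsigned long long)rules[n - 1].hi);
        return 0;
    }
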
@@ -1637,64 +1776,114 @@ def emit(self):
                                                 len(self.outputs) - 1, self.name)
         code_module[self.context] += "\n" + code
 
-        for name, enum in zip(self.inputs, input_enums):
-            bus = AxiBus(
-                self.clk,
-                self.rst,
-                self.aw,
-                self.dw,
-                iw_in,
-                self.uw,
-                "{}_in".format(self.name),
-                "[{}]".format(enum),
-                type_prefix=self.input_struct,
-                declared=True,
-            )
-            self.__dict__["in_" + name] = bus
+        # Generate the buses to connect to every input port
+        # Note: does not generate any code, but may be referenced by the templates
+        code = ""
+        for inp, enum in zip(self.inputs, input_enums):
+            if self.forward_mcast:
+
+                if inp['is_mcast_master']:
+
+                    bus = AxiBus(
+                        self.clk,
+                        self.rst,
+                        self.aw,
+                        self.dw,
+                        iw_in,
+                        uw,
+                        "{}_in".format(self.name),
+                        "[{}]".format(enum),
+                        type_prefix=self.input_struct_mcast,
+                        declared=True,
+                    )
+                    self.__dict__["in_" + inp['name']] = bus
+
+                else:
+
+                    # If the XBAR supports multicast, for all input buses which
+                    # are not multicast masters, we need to change the
+                    # user width of the respective interfaces
+                    input_req = f"{self.name}_in_req[{enum}]"
+                    input_rsp = f"{self.name}_in_rsp[{enum}]"
+
+                    input_uwc = f"{self.name}_in_{inp['name']}_uwc"
+                    input_uwc_req = input_uwc + "_req"
+                    input_uwc_rsp = input_uwc + "_rsp"
+
+                    # Declare the intermediate interfaces first
+                    code += f"// Declare UWC input to {self.name}\n"
+                    code += f"  {self.input_struct + '_req_t'} {input_uwc_req};\n"
+                    code += f"  {self.input_struct + '_resp_t'} {input_uwc_rsp};\n\n"
+
+                    # And then change the user width
+                    code += "// Change UW\n"
+                    code += "`AXI_ASSIGN_REQ_STRUCT({lhs},{rhs})\n".format(lhs=input_req,
+                                                                           rhs=input_uwc_req)
+                    code += "`AXI_ASSIGN_RESP_STRUCT({lhs},{rhs})\n\n".format(lhs=input_uwc_rsp,
+                                                                              rhs=input_rsp)
+
+                    bus = AxiBus(
+                        self.clk,
+                        self.rst,
+                        self.aw,
+                        self.dw,
+                        iw_in,
+                        self.uw,
+                        input_uwc,
+                        type_prefix=self.input_struct,
+                        declared=True,
+                    )
+                    self.__dict__["in_" + inp['name']] = bus
 
-        for name, enum in zip(self.outputs, output_enums):
-            bus = AxiBus(
-                self.clk,
-                self.rst,
-                self.aw,
-                self.dw,
-                iw_out,
-                self.uw,
-                "{}_out".format(self.name),
-                "[{}]".format(enum),
-                type_prefix=self.output_struct,
-                declared=True,
-            )
-            self.__dict__["out_" + name] = bus
+            else:
+                bus = AxiBus(
+                    self.clk,
+                    self.rst,
+                    self.aw,
+                    self.dw,
+                    iw_in,
+                    self.uw,
+                    "{}_in".format(self.name),
+                    "[{}]".format(enum),
+                    type_prefix=self.input_struct,
+                    declared=True,
+                )
+                self.__dict__["in_" + inp['name']] = bus
+
+        code_module[self.context] += "\n" + code
 
         # Emit the crossbar instance itself.
code_package += "\n" @@ -1637,64 +1776,114 @@ def emit(self): len(self.outputs) - 1, self.name) code_module[self.context] += "\n" + code - for name, enum in zip(self.inputs, input_enums): - bus = AxiBus( - self.clk, - self.rst, - self.aw, - self.dw, - iw_in, - self.uw, - "{}_in".format(self.name), - "[{}]".format(enum), - type_prefix=self.input_struct, - declared=True, - ) - self.__dict__["in_" + name] = bus + # Generate the buses to connect to every input port + # Note: does not generate any code, but may be referenced by the templates + code = "" + for inp, enum in zip(self.inputs, input_enums): + if self.forward_mcast: + + if inp['is_mcast_master']: + + bus = AxiBus( + self.clk, + self.rst, + self.aw, + self.dw, + iw_in, + uw, + "{}_in".format(self.name), + "[{}]".format(enum), + type_prefix=self.input_struct_mcast, + declared=True, + ) + self.__dict__["in_" + inp['name']] = bus + + else: + + # If the XBAR supports multicast, for all input buses which + # are not multicast masters, we need to change the + # user width of the respective interfaces + input_req = f"{self.name}_in_req[{enum}]" + input_rsp = f"{self.name}_in_rsp[{enum}]" + + input_uwc = f"{self.name}_in_{inp['name']}_uwc" + input_uwc_req = input_uwc + "_req" + input_uwc_rsp = input_uwc + "_rsp" + + # Declare the intermediate interfaces first + code += f"// Declare UWC input to {self.name}\n" + code += f" {self.input_struct + '_req_t'} {input_uwc_req};\n" + code += f" {self.input_struct + '_resp_t'} {input_uwc_rsp};\n\n" + + # And then change the user width + code += "// Change UW\n" + code += "`AXI_ASSIGN_REQ_STRUCT({lhs},{rhs})\n".format(lhs=input_req, + rhs=input_uwc_req) + code += "`AXI_ASSIGN_RESP_STRUCT({lhs},{rhs})\n\n".format(lhs=input_uwc_rsp, + rhs=input_rsp) + + bus = AxiBus( + self.clk, + self.rst, + self.aw, + self.dw, + iw_in, + self.uw, + input_uwc, + type_prefix=self.input_struct, + declared=True, + ) + self.__dict__["in_" + inp['name']] = bus - for name, enum in zip(self.outputs, output_enums): - bus = AxiBus( - self.clk, - self.rst, - self.aw, - self.dw, - iw_out, - self.uw, - "{}_out".format(self.name), - "[{}]".format(enum), - type_prefix=self.output_struct, - declared=True, - ) - self.__dict__["out_" + name] = bus + else: + bus = AxiBus( + self.clk, + self.rst, + self.aw, + self.dw, + iw_in, + self.uw, + "{}_in".format(self.name), + "[{}]".format(enum), + type_prefix=self.input_struct, + declared=True, + ) + self.__dict__["in_" + inp['name']] = bus + + code_module[self.context] += "\n" + code # Emit the crossbar instance itself. 
+
+        # Generate the buses to connect to every output port
+        # Note: does not generate any code, but may be referenced by the templates
+        code = ""
+        for output, enum in zip(self.outputs, output_enums):
+
+            if self.forward_mcast:
+
+                if output['forward_mcast']:
+
+                    bus = AxiBus(
+                        self.clk,
+                        self.rst,
+                        self.aw,
+                        self.dw,
+                        iw_out,
+                        uw,
+                        "{}_out".format(self.name),
+                        "[{}]".format(enum),
+                        type_prefix=self.output_struct_mcast,
+                        declared=True,
+                    )
+                    self.__dict__["out_" + output['name']] = bus
+                else:
+                    # If the XBAR supports multicast, for all output buses which
+                    # do not forward the multicast signals, we need to change the
+                    # user width of the respective interfaces
+                    output_req = f"{self.name}_out_req[{enum}]"
+                    output_rsp = f"{self.name}_out_rsp[{enum}]"
+
+                    output_uwc = f"{self.name}_out_{output['name']}_uwc"
+                    output_uwc_req = output_uwc + "_req"
+                    output_uwc_rsp = output_uwc + "_rsp"
+
+                    # Declare the intermediate interfaces first
+                    code += f"// Declare UWC output from {self.name}\n"
+                    code += f"  {self.output_struct + '_req_t'} {output_uwc_req};\n"
+                    code += f"  {self.output_struct + '_resp_t'} {output_uwc_rsp};\n\n"
+
+                    # And then change the user width
+                    code += "// Change UW\n"
+                    code += "`AXI_ASSIGN_REQ_STRUCT({lhs},{rhs})\n".format(lhs=output_uwc_req,
+                                                                           rhs=output_req)
+                    code += "`AXI_ASSIGN_RESP_STRUCT({lhs},{rhs})\n\n".format(lhs=output_rsp,
+                                                                              rhs=output_uwc_rsp)
+
+                    bus = AxiBus(
+                        self.clk,
+                        self.rst,
+                        self.aw,
+                        self.dw,
+                        iw_out,
+                        self.uw,
+                        output_uwc,
+                        type_prefix=self.output_struct,
+                        declared=True,
+                    )
+                    self.__dict__["out_" + output['name']] = bus
+
+            else:
+                bus = AxiBus(
+                    self.clk,
+                    self.rst,
+                    self.aw,
+                    self.dw,
+                    iw_out,
+                    self.uw,
+                    "{}_out".format(self.name),
+                    "[{}]".format(enum),
+                    type_prefix=self.output_struct,
+                    declared=True,
+                )
+                self.__dict__["out_" + output['name']] = bus
 
         code_module[self.context] += "\n" + code
= output_uwc + "_rsp" + + # Declare the intermediate interfaces first + code += f"// Declare UWC output from {self.name}\n" + code += f" {self.output_struct + '_req_t'} {output_uwc_req};\n" + code += f" {self.output_struct + '_resp_t'} {output_uwc_rsp};\n\n" + + # And then change the user width + code += "// Change UW\n" + code += "`AXI_ASSIGN_REQ_STRUCT({lhs},{rhs})\n".format(lhs=output_uwc_req, + rhs=output_req) + code += "`AXI_ASSIGN_RESP_STRUCT({lhs},{rhs})\n\n".format(lhs=output_rsp, + rhs=output_uwc_rsp) + + bus = AxiBus( + self.clk, + self.rst, + self.aw, + self.dw, + iw_out, + self.uw, + output_uwc, + type_prefix=self.output_struct, + declared=True, + ) + self.__dict__["out_" + output['name']] = bus + + else: + bus = AxiBus( + self.clk, + self.rst, + self.aw, + self.dw, + iw_out, + self.uw, + "{}_out".format(self.name), + "[{}]".format(enum), + type_prefix=self.output_struct, + declared=True, + ) + self.__dict__["out_" + output['name']] = bus code_module[self.context] += "\n" + code @@ -1726,8 +1989,9 @@ def connectivity(self): for i in self.inputs: for o in self.outputs: # Disable link only if connectivity specified for input or loopback disabled - connectivity += "0" if (((i in self.connections) and (o not in self.connections[i])) - or (self.no_loopback and i == o)) else "1" + connectivity += "0" if (((i['name'] in self.connections) and + (o['name'] not in self.connections[i['name']])) + or (self.no_loopback and i['name'] == o['name'])) else "1" connectivity = "{}'b{}".format(length, connectivity[::-1]) return connectivity @@ -2119,7 +2383,6 @@ def __init__(self, self.max_mst_trans = max_mst_trans self.fall_through = fall_through self.symbolic_addrmap = list() - self.symbolic_addrmap_multi = list() self.addrmap = list() self.latency_mode = latency_mode or "axi_pkg::CUT_ALL_PORTS" @@ -2147,12 +2410,12 @@ def add_output_symbolic(self, name, base, length): def add_output_symbolic_multi(self, name, entries): idx = len(self.outputs) - self.symbolic_addrmap_multi.append((idx, entries)) + for base, length in entries: + self.symbolic_addrmap.append((idx, base, length)) self.outputs.append(name) def addr_map_len(self): - return len(self.addrmap) + len(self.symbolic_addrmap) + sum( - len(am) for am in self.symbolic_addrmap_multi) + return len(self.addrmap) + len(self.symbolic_addrmap) def emit(self): global code_module @@ -2178,11 +2441,6 @@ def emit(self): addrmap_lines.append( " '{{ idx: {}, start_addr: {}[{i}], end_addr: {}[{i}] + {} }}".format( idx, base, base, length, i=i)) - for i, (idx, entries) in enumerate(self.symbolic_addrmap_multi): - for base, length in entries: - addrmap_lines.append( - " '{{ idx: {}, start_addr: {}[{i}], end_addr: {}[{i}] + {} }}".format( - idx, base, base, length, i=i)) addrmap += "{}\n}};\n".format(',\n'.join(addrmap_lines)) code_module[self.context] += "\n" + addrmap