diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 96cd46719..91dea66b8 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -65,3 +65,19 @@ occamy-full-vsim:
     - make LENGTH=384 sw
     - make bin/occamy_top.vsim
     - ./run.py sw/run-full-occamy.yaml --simulator vsim
+
+##########################
+# Multicast Occamy tests #
+##########################
+
+# Questa
+occamy-mcast-vsim:
+  variables:
+    RISCV_CFLAGS: "-DUSE_MULTICAST"
+    DATA_CFG: "$(PWD)/sw/device/apps/blas/gemm/params.hjson"
+  script:
+    - cd target/sim
+    - make CFG_OVERRIDE=cfg/M-Q8C4.hjson rtl
+    - make LENGTH=1024 sw
+    - make bin/occamy_top.vsim
+    - ./run.py sw/run-full-occamy.yaml -j
diff --git a/Bender.local b/Bender.local
index aa72d87c3..f8ec04e0f 100644
--- a/Bender.local
+++ b/Bender.local
@@ -3,6 +3,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 overrides:
-  axi: { git: https://github.com/pulp-platform/axi.git, version: 0.39.0-beta.4 }
-  common_cells: { git: https://github.com/pulp-platform/common_cells.git, version: 1.31.1 }
+  axi: { git: https://github.com/pulp-platform/axi.git, rev: feature/multicast-xbar }
+  common_cells: { git: https://github.com/pulp-platform/common_cells.git, rev: multicast-xbar }
   register_interface: { git: https://github.com/pulp-platform/register_interface.git, version: 0.4.2 }
diff --git a/Bender.yml b/Bender.yml
index c5699134c..42ff9a51f 100644
--- a/Bender.yml
+++ b/Bender.yml
@@ -29,7 +29,7 @@ dependencies:
   cva6: { path: hw/vendor/openhwgroup_cva6 }
   opentitan_peripherals: { path: hw/vendor/pulp_platform_opentitan_peripherals }
   register_interface: { git: https://github.com/pulp-platform/register_interface.git, version: 0.3.8 }
-  snitch_cluster: { git: https://github.com/pulp-platform/snitch_cluster.git, rev: extend/snRuntime }
+  snitch_cluster: { git: https://github.com/pulp-platform/snitch_cluster.git, rev: experimental/mcast }
   tech_cells_generic: { git: https://github.com/pulp-platform/tech_cells_generic.git, rev: v0.2.11 }
 
 workspace:
diff --git a/hw/occamy/occamy_cva6.sv.tpl b/hw/occamy/occamy_cva6.sv.tpl
index 90c3c1f1d..87b03c0f6 100644
--- a/hw/occamy/occamy_cva6.sv.tpl
+++ b/hw/occamy/occamy_cva6.sv.tpl
@@ -72,7 +72,7 @@ module ${name}_cva6 import ${name}_pkg::*; (
     .AxiAddrWidth (${soc_narrow_xbar.in_cva6.aw}),
     .AxiDataWidth (${soc_narrow_xbar.in_cva6.dw}),
     .AxiIdWidth (${soc_narrow_xbar.in_cva6.iw}),
-    .AxiUserWidth (${max(1, soc_narrow_xbar.in_cva6.uw)}),
+    .AxiUserWidth (${cfg["narrow_xbar_user_width"]}),
     .axi_ar_chan_t(${soc_narrow_xbar.in_cva6.ar_chan_type()}),
     .axi_aw_chan_t(${soc_narrow_xbar.in_cva6.aw_chan_type()}),
     .axi_w_chan_t(${soc_narrow_xbar.in_cva6.w_chan_type()}),
diff --git a/hw/occamy/occamy_quadrant_s1.sv.tpl b/hw/occamy/occamy_quadrant_s1.sv.tpl
index 5226b3cd3..22ee70eb6 100644
--- a/hw/occamy/occamy_quadrant_s1.sv.tpl
+++ b/hw/occamy/occamy_quadrant_s1.sv.tpl
@@ -26,6 +26,7 @@
 %>
 
 `include "axi/typedef.svh"
+`include "axi/assign.svh"
 `include "axi_tlb/typedef.svh"
 
 /// Occamy Stage 1 Quadrant
diff --git a/hw/occamy/occamy_quadrant_s1_ctrl.sv.tpl b/hw/occamy/occamy_quadrant_s1_ctrl.sv.tpl
index 61ef61789..cd5ec0b3d 100644
--- a/hw/occamy/occamy_quadrant_s1_ctrl.sv.tpl
+++ b/hw/occamy/occamy_quadrant_s1_ctrl.sv.tpl
@@ -18,6 +18,8 @@
   wide_tlb_entries = wide_tlb_cfg.get("l1_num_entries", 1)
 %>
 
+`include "axi/assign.svh"
+
 module ${name}_quadrant_s1_ctrl
   import ${name}_pkg::*;
   import ${name}_quadrant_s1_reg_pkg::*;
diff --git a/hw/occamy/occamy_soc.sv.tpl b/hw/occamy/occamy_soc.sv.tpl
index 4d25d5333..1644e0583 100644
--- a/hw/occamy/occamy_soc.sv.tpl
+++ b/hw/occamy/occamy_soc.sv.tpl
@@ -421,6 +421,7 @@ module ${name}_soc
   logic [${wide_in.aw-1}:0] num_bytes;
   axi_pkg::cache_t cache_src, cache_dst;
   axi_pkg::burst_t burst_src, burst_dst;
+  logic [${wide_in.uw-1}:0] user_src, user_dst;
   logic decouple_rw;
   logic deburst;
   logic serialize;
diff --git a/target/sim/cfg/M-Q8C4.hjson b/target/sim/cfg/M-Q8C4.hjson
new file mode 100644
index 000000000..fa1db50d4
--- /dev/null
+++ b/target/sim/cfg/M-Q8C4.hjson
@@ -0,0 +1,409 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// Cluster configuration for Occamy.
+{
+    is_remote_quadrant: false,
+    remote_quadrants: [],
+    enable_multicast: true,
+    quadrant_pre_xbar: {
+        max_slv_trans: 64,
+        max_mst_trans: 64,
+        fall_through: false,
+    },
+    pre_xbar_slv_id_width_no_rocache: 3,
+    wide_xbar: {
+        max_slv_trans: 64,
+        max_mst_trans: 64,
+        fall_through: false,
+    },
+    quadrant_inter_xbar: {
+        max_slv_trans: 64,
+        max_mst_trans: 64,
+        fall_through: false,
+    },
+    hbm_xbar: {
+        max_slv_trans: 128,
+        max_mst_trans: 128,
+        fall_through: false,
+    },
+    narrow_xbar: {
+        max_slv_trans: 32,
+        max_mst_trans: 32,
+        fall_through: false,
+    },
+    cuts: {
+        narrow_to_quad: 3,
+        quad_to_narrow: 3,
+        quad_to_pre: 1,
+        pre_to_inter: 1,
+        inter_to_quad: 3,
+        narrow_to_cva6: 2,
+        narrow_conv_to_spm_narrow_pre: 2,
+        narrow_conv_to_spm_narrow: 1,
+        narrow_and_pcie: 3,
+        narrow_and_wide: 1,
+        wide_conv_to_spm_wide: 3,
+        wide_to_wide_zero_mem: 0,
+        wide_to_hbm: 3,
+        wide_and_inter: 3,
+        wide_and_hbi: 3,
+        narrow_and_hbi: 3,
+        pre_to_hbmx: 3,
+        hbmx_to_hbm: 3,
+        atomic_adapter_narrow: 1,
+        atomic_adapter_narrow_wide: 1,
+        // Give some flexibility in peripheral xbar placement
+        periph_axi_lite_narrow: 2,
+        periph_axi_lite: 2,
+        periph_axi_lite_narrow_hbm_xbar_cfg: 2,
+        // Non-right-side chip peripherals
+        periph_axi_lite_narrow_hbm_cfg: 3,
+        periph_axi_lite_narrow_pcie_cfg: 3,
+        periph_axi_lite_narrow_chip_ctrl_cfg: 3,
+        periph_axi_lite_narrow_hbi_narrow_cfg: 3,
+        periph_axi_lite_narrow_hbi_wide_cfg: 3,
+        periph_axi_lite_narrow_bootrom_cfg: 3,
+        periph_axi_lite_narrow_fll_system_cfg: 3,
+        periph_axi_lite_narrow_fll_periph_cfg: 3,
+        periph_axi_lite_narrow_fll_hbm2e_cfg: 3,
+        // Right-side or latency-invariant chip peripherals
+        periph_axi_lite_narrow_soc_ctrl_cfg: 1,
+        periph_axi_lite_narrow_uart_cfg: 1,
+        periph_axi_lite_narrow_i2c_cfg: 1,
+        periph_axi_lite_narrow_gpio_cfg: 1,
+        periph_axi_lite_narrow_clint_cfg: 1,
+        periph_axi_lite_narrow_plic_cfg: 1,
+        periph_axi_lite_narrow_spim_cfg: 1,
+        periph_axi_lite_narrow_timer_cfg: 1,
+    }
+    txns: {
+        wide_and_inter: 128,
+        wide_to_hbm: 128,
+        narrow_and_wide: 16,
+        rmq: 4,
+    }
+    narrow_xbar_slv_id_width: 4,
+    narrow_xbar_user_width: 5, // clog2(total number of clusters)
+    nr_s1_quadrant: 8,
+    s1_quadrant: {
+        nr_clusters: 4,
+        // number of pending transactions on the narrow/wide network
+        narrow_trans: 32,
+        wide_trans: 32,
+        // Disable for easier flow trials.
+        ro_cache_cfg: {
+            width: 1024,
+            count: 128,
+            sets: 2,
+            max_trans: 32,
+            address_regions: 4,
+        }
+        narrow_tlb_cfg: {
+            max_trans: 32,
+            l1_num_entries: 8,
+            l1_cut_ax: true,
+        }
+        wide_tlb_cfg: {
+            max_trans: 32,
+            l1_num_entries: 8,
+            l1_cut_ax: true,
+        }
+        wide_xbar: {
+            max_slv_trans: 32,
+            max_mst_trans: 32,
+            fall_through: false,
+        },
+        wide_xbar_slv_id_width: 3
+        narrow_xbar: {
+            max_slv_trans: 8,
+            max_mst_trans: 8,
+            fall_through: false,
+        },
+        narrow_xbar_slv_id_width: 4,
+        narrow_xbar_user_width: 5, // clog2(total number of clusters)
+        cfg_base_addr: 184549376, // 0x0b000000
+        cfg_base_offset: 65536 // 0x10000
+    },
+    cluster: {
+        name: "occamy_cluster"
+        boot_addr: 4096, // 0x1000
+        cluster_base_addr: 268435456, // 0x10000000
+        cluster_base_offset: 262144 // 0x40000
+        cluster_base_hartid: 1,
+        addr_width: 48,
+        data_width: 64,
+        user_width: 5, // clog2(total number of clusters)
+        tcdm: {
+            size: 128, // 128 kiB
+            banks: 32,
+        },
+        cluster_periph_size: 64, // kB
+        zero_mem_size: 64, // kB
+        dma_data_width: 512,
+        dma_user_width: 48, // same as addr_width
+        dma_axi_req_fifo_depth: 24,
+        dma_req_fifo_depth: 8,
+        narrow_trans: 4,
+        wide_trans: 32,
+        // We don't need Snitch debugging in Occamy
+        enable_debug: false,
+        // We don't need Snitch (core-internal) virtual memory support
+        vm_support: false,
+        // Memory configuration inputs
+        sram_cfg_expose: true,
+        sram_cfg_fields: {
+            ema: 3,
+            emaw: 2,
+            emas: 1
+        },
+        // Timing parameters
+        timing: {
+            lat_comp_fp32: 2,
+            lat_comp_fp64: 3,
+            lat_comp_fp16: 1,
+            lat_comp_fp16_alt: 1,
+            lat_comp_fp8: 1,
+            lat_comp_fp8_alt: 1,
+            lat_noncomp: 1,
+            lat_conv: 2,
+            lat_sdotp: 3,
+            fpu_pipe_config: "BEFORE"
+            narrow_xbar_latency: "CUT_ALL_PORTS",
+            wide_xbar_latency: "CUT_ALL_PORTS",
+            // Isolate the core.
+            register_core_req: true,
+            register_core_rsp: true,
+            register_offload_req: true,
+            register_offload_rsp: true,
+            register_fpu_req: true,
+            register_ext_narrow: false,
+            register_ext_wide: false
+        },
+        hives: [
+            // Hive 0
+            {
+                icache: {
+                    size: 8, // total instruction cache size in kByte
+                    sets: 2, // number of ways
+                    cacheline: 256 // word size in bits
+                },
+                cores: [
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/dma_core_template" },
+                ]
+            }
+        ],
+    }
+    // Templates.
+    compute_core_template: {
+        isa: "rv32imafd",
+        xssr: true,
+        xfrep: true,
+        xdma: false,
+        xf16: true,
+        xf16alt: true,
+        xf8: true,
+        xf8alt: true,
+        xfdotp: true,
+        xfvec: true,
+        ssr_nr_credits: 4,
+        num_int_outstanding_loads: 1,
+        num_int_outstanding_mem: 4,
+        num_fp_outstanding_loads: 4,
+        num_fp_outstanding_mem: 4,
+        num_sequencer_instructions: 16,
+        num_dtlb_entries: 1,
+        num_itlb_entries: 1,
+        // SSSR configuration below
+        ssr_intersection: true,
+        ssr_intersection_triple: [0, 1, 2],
+        ssrs: [
+            {indirection: true}, // Master 0
+            {indirection: true}, // Master 1
+            {}, // Slave
+        ],
+    },
+    dma_core_template: {
+        isa: "rv32imafd",
+        // Xdiv_sqrt: true,
+        # isa: "rv32ema",
+        xdma: true
+        xssr: false
+        xfrep: false
+        xf16: false,
+        xf16alt: false,
+        xf8: false,
+        xf8alt: false,
+        xfdotp: false,
+        xfvec: false,
+        num_int_outstanding_loads: 1,
+        num_int_outstanding_mem: 4,
+        num_fp_outstanding_loads: 4,
+        num_fp_outstanding_mem: 4,
+        num_sequencer_instructions: 16,
+        num_dtlb_entries: 1,
+        num_itlb_entries: 1,
+    }
+    // peripherals
+    peripherals: {
+        rom: {
+            address: 16777216, // 0x0100_0000
+            length: 131072, // 128 kiB 0x2_0000
+        },
+        clint: {
+            address: 67108864, // 0x0400_0000
+            length: 1048576, // 1 MiB 0x10_0000
+        },
+        axi_lite_peripherals: [
+            {
+                name: "debug",
+                address: 0, // 0x0000_0000
+                length: 4096, // 4 kiB 0x1000
+            }
+        ],
+        axi_lite_narrow_peripherals: [
+            {
+                name: "soc_ctrl",
+                address: 33554432, // 0x0200_0000
+                length: 4096, // 4 kiB 0x1000
+            },
+            {
+                name: "fll_system",
+                address: 33558528, // 0x0200_1000
+                length: 1024, // 1 kiB 0x400
+            },
+            {
+                name: "fll_periph",
+                address: 33559552, // 0x0200_1400
+                length: 1024, // 1 kiB 0x400
+            },
+            {
+                name: "fll_hbm2e",
+                address: 33560576, // 0x0200_1800
+                length: 1024, // 1 kiB 0x400
+            },
+            {
+                name: "uart",
+                address: 33562624, // 0x0200_2000
+                length: 4096, // 4 kiB 0x1000
+            },
+            {
+                name: "gpio",
+                address: 33566720, // 0x0200_3000
+                length: 4096, // 4 kiB 0x1000
+            },
+            {
+                name: "i2c",
+                address: 33570816, // 0x0200_4000
+                length: 4096, // 4 kiB 0x1000
+            },
+            {
+                name: "chip_ctrl",
+                address: 33574912, // 0x0200_5000
+                length: 4096, // 4 kiB 0x1000
+            },
+            {
+                name: "timer",
+                address: 33579008, // 0x0200_6000
+                length: 4096, // 4 kiB 0x1000
+            },
+            {
+                name: "hbm_xbar_cfg",
+                address: 33583104, // 0x0200_7000
+                length: 4096, // 4 kiB 0x1000
+            },
+            {
+                name: "spim",
+                address: 50331648, // 0x0300_0000
+                length: 131072, // 128 kiB 0x2_0000
+            },
+            {
+                name: "pcie_cfg",
+                address: 83886080, // 0x0500_0000
+                length: 131072, // 128 kiB 0x2_0000
+            },
+            {
+                name: "hbi_wide_cfg",
+                address: 100663296, // 0x0600_0000
+                length: 65536, // 64 kiB 0x1_0000
+            },
+            {
+                name: "hbi_narrow_cfg",
+                address: 117440512, // 0x0700_0000
+                length: 65536, // 64 kiB 0x1_0000
+            },
+            {
+                name: "plic",
+                address: 201326592, // 0x0C00_0000
+                length: 67108864, // 64 MiB 0x400_0000
+            },
+        ],
+    },
+    // non-peripheral IPs
+    pcie: {
+        address_io: 536870912, // 0x2000_0000
+        address_mm: 1207959552, // 0x4800_0000
+        length: 671088640, // 640 MiB 0x2800_0000
+    },
+    spm_narrow: {
+        address: 1879048192, // 0x7000_0000
+        length: 524288, // 512 kiB 0x8_0000
+        # An uncached alias address space of the same length
+        uncached_alias: 1879572480, // 0x7008_0000
+    },
+    spm_wide: {
+        address: 1895825408, // 0x7100_0000
+        length: 1048576, // 1 MiB 0x10_0000
+    },
+    wide_zero_mem: {
+        address: 4294967296, // 0x1_0000_0000
+        length: 8589934592, // 8 GiB 0x2_0000_0000
+    },
+    sys_idma_cfg: {
+        address: 285212672, // 0x1100_0000
+        length: 65536, // 64 kiB 0x1_0000
+    },
+    hbi: {
+        address: 1099511627776, // 0x100_0000_0000
+        length: 1099511627776, // 1 TiB 0x100_0000_0000
+    }
+    hbm: {
+        address_0: 2147483648, // 0x8000_0000
+        address_1: 68719476736, // 0x10_0000_0000
+        channel_size: 1073741824, // 1 GiB 0x4000_0000
+        nr_channels_total: 8,
+        nr_channels_address_0: 2,
+        cfg_regions: {
+            top: {
+                address: 134217728, // 0x0800_0000
+                length: 4194304, // 4 MiB 0x40_0000
+            },
+            phy: {
+                address: 150994944 // 0x0900_0000
+                length: 1048576, // 1 MiB 0x10_0000
+            },
+            seq: {
+                address: 167772160, // 0x0A00_0000
+                length: 65536, // 64 kiB 0x1_0000
+            },
+            ctrl: {
+                address: 176160768, // 0x0A80_0000
+                length: 65536, // 64 kiB 0x1_0000
+            }
+        }
+    },
+    // dram corresponds to 'hbm address_0' and 'nr_channels_address_0'
+    dram: {
+        address: 2147483648, // 0x8000_0000
+        length: 2147483648, // 2 GiB 0x8000_0000
+    },
+}
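
The `M-Q8C4` name encodes the topology this configuration enables multicast for: 8 quadrants (Q8) of 4 clusters each (C4), 32 clusters in total. The recurring `// clog2(total number of clusters)` comments can be sanity-checked with a few lines of C; `clog2`, `nr_quadrants` and `nr_clusters_per_quad` below are illustrative stand-ins mirroring the `clog2()` helper added to `occamygen.py` later in this patch, not part of the patch itself:

    #include <stdio.h>

    /* Ceiling of log2, mirroring the clog2() helper added to occamygen.py. */
    static int clog2(unsigned x) {
        int bits = 0;
        for (unsigned v = x - 1; v > 0; v >>= 1) bits++;
        return bits;
    }

    int main(void) {
        const int nr_quadrants = 8;         /* nr_s1_quadrant */
        const int nr_clusters_per_quad = 4; /* s1_quadrant.nr_clusters */
        int total_clusters = nr_quadrants * nr_clusters_per_quad; /* 32 */
        /* clog2(32) = 5, matching narrow_xbar_user_width and cluster.user_width */
        printf("user_width = %d\n", clog2(total_clusters));
        return 0;
    }
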
diff --git a/target/sim/cfg/full.hjson b/target/sim/cfg/full.hjson
index 4c042f913..8766fd184 100644
--- a/target/sim/cfg/full.hjson
+++ b/target/sim/cfg/full.hjson
@@ -6,6 +6,7 @@
 {
     is_remote_quadrant: false,
     remote_quadrants: [],
+    enable_multicast: false,
     quadrant_pre_xbar: {
         max_slv_trans: 64,
         max_mst_trans: 64,
diff --git a/target/sim/cfg/single-cluster.hjson b/target/sim/cfg/single-cluster.hjson
index 5c19ee054..418f2cf0e 100644
--- a/target/sim/cfg/single-cluster.hjson
+++ b/target/sim/cfg/single-cluster.hjson
@@ -6,6 +6,7 @@
 {
     is_remote_quadrant: false,
     remote_quadrants: [],
+    enable_multicast: false,
     quadrant_pre_xbar: {
         max_slv_trans: 64,
         max_mst_trans: 64,
diff --git a/target/sim/sw/device/apps/blas/gemm/params.hjson b/target/sim/sw/device/apps/blas/gemm/params.hjson
new file mode 100644
index 000000000..0f010cd3e
--- /dev/null
+++ b/target/sim/sw/device/apps/blas/gemm/params.hjson
@@ -0,0 +1,16 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// Parameters for a GEMM
+
+{
+    M: 256,
+    N: 16,
+    K: 16,
+    beta: 0,
+    ta: false,
+    tb: true, // must be true for SIMD
+    prec: 64,
+    expand: 0
+}
diff --git a/target/sim/sw/host/apps/offload/src/offload.c b/target/sim/sw/host/apps/offload/src/offload.c
index 2b1fc9751..e504a9f1c 100644
--- a/target/sim/sw/host/apps/offload/src/offload.c
+++ b/target/sim/sw/host/apps/offload/src/offload.c
@@ -20,7 +20,7 @@ int main() {
     asm volatile("" ::: "memory");
 
     // Start Snitches
-    wakeup_snitches_cl();
+    wakeup_snitches();
 
     // Wait for job done and return Snitch exit code
     return wait_snitches_done();
diff --git a/target/sim/sw/host/runtime/host.c b/target/sim/sw/host/runtime/host.c
index 68410bc6a..be81c3953 100644
--- a/target/sim/sw/host/runtime/host.c
+++ b/target/sim/sw/host/runtime/host.c
@@ -198,24 +198,17 @@ static inline void wakeup_cluster(uint32_t cluster_id) {
     *(cluster_clint_set_ptr(cluster_id)) = 511;
 }
 
-/**
- * @brief Wake-up Snitches
- *
- * @detail All Snitches are "parked" in a WFI. A SW interrupt
- *         must be issued to "unpark" every Snitch. This function
- *         sends a SW interrupt to all Snitches.
- */
-void wakeup_snitches() {
-    for (int i = 0; i < N_CLUSTERS; i++) set_sw_interrupt(i);
-}
-
 /**
  * @brief Wake-up Snitches
  *
  * @detail Send a cluster interrupt to all Snitches
  */
-static inline void wakeup_snitches_cl() {
+static inline void wakeup_snitches() {
+#if defined(SUPPORTS_MULTICAST) && defined(USE_MULTICAST)
+    multicast_to_clusters(cluster_clint_set_addr(0), 511);
+#else
     for (int i = 0; i < N_CLUSTERS; i++) wakeup_cluster(i);
+#endif
 }
 
 /**
@@ -615,3 +608,26 @@ void deactivate_interleaved_mode_hbm() {
         OCCAMY_HBM_XBAR_INTERLEAVED_ENA_REG_OFFSET + HBM_XBAR_CFG_BASE_ADDR;
     *((volatile uint32_t*)addr) = 1;
 }
+
+//===============================================================
+// CVA6 extensions
+//===============================================================
+
+static inline void multicast(uint64_t addr, uint64_t mask, uint64_t value) {
+    enable_multicast(mask);
+    uint64_t* p = (uint64_t*)addr;
+    *p = value;
+    disable_multicast();
+}
+
+static inline void enable_multicast(uint64_t mask) {
+    asm volatile("csrw 0x7c0, %[mask]\n" : : [ mask ] "r"(mask) : "memory");
+}
+
+static inline void disable_multicast() {
+    asm volatile("csrw 0x7c0, 0" : : : "memory");
+}
+
+static inline void multicast_to_clusters(uint64_t addr, uint64_t value) {
+    multicast(addr, CLUSTER_BCAST_MASK, value);
+}
diff --git a/target/sim/sw/host/runtime/host.h b/target/sim/sw/host/runtime/host.h
index 55e1623ab..c110ed0e9 100644
--- a/target/sim/sw/host/runtime/host.h
+++ b/target/sim/sw/host/runtime/host.h
@@ -5,12 +5,20 @@
 #include <stdbool.h>
 #include <stdint.h>
 
-static inline void set_sw_interrupt(uint32_t hartid);
+#define CLUSTER_BCAST_MASK ((N_CLUSTERS - 1) << 18)
 
 void delay_ns(uint64_t delay);
 
 static inline volatile uint32_t* get_shared_lock();
 
+static inline void set_sw_interrupt(uint32_t hartid);
+
 static inline void wait_sw_interrupt();
 
-static inline void clear_sw_interrupt(uint32_t hartid);
+static inline void enable_multicast(uint64_t mask);
+
+static inline void disable_multicast();
+
+static inline void multicast(uint64_t addr, uint64_t mask, uint64_t value);
+
+static inline void multicast_to_clusters(uint64_t addr, uint64_t value);
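
`CLUSTER_BCAST_MASK` ties directly to the address layout in the configuration: `cluster_base_offset` is 262144 = 0x40000 = 1 << 18, so clusters occupy consecutive 0x40000-byte apertures and the address bits from bit 18 upward select the cluster. Each set bit of the mask marks an address bit the interconnect may vary when replicating a write. A hedged sketch of the arithmetic, assuming `N_CLUSTERS` = 32 as in M-Q8C4 (the `main` is illustrative only):

    #include <assert.h>
    #include <stdint.h>

    #define N_CLUSTERS 32                               /* 8 quadrants x 4 clusters */
    #define CLUSTER_BCAST_MASK ((N_CLUSTERS - 1) << 18) /* as in host.h */

    int main(void) {
        /* Bits [22:18] enumerate clusters 0..31: (32 - 1) << 18 = 0x007C0000 */
        assert(CLUSTER_BCAST_MASK == 0x7C0000);
        /* The runtime then brackets a single 64-bit store with CSR writes:
         *   csrw 0x7c0, mask   ; enable_multicast(mask)
         *   sd   value, (addr) ; one store, fanned out by the mcast xbars
         *   csrw 0x7c0, 0      ; disable_multicast()
         */
        return 0;
    }
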
diff --git a/target/sim/sw/shared/platform/generated/occamy_cfg.h.tpl b/target/sim/sw/shared/platform/generated/occamy_cfg.h.tpl
index 1787e7110..48b155fd7 100644
--- a/target/sim/sw/shared/platform/generated/occamy_cfg.h.tpl
+++ b/target/sim/sw/shared/platform/generated/occamy_cfg.h.tpl
@@ -4,4 +4,7 @@
 #define N_QUADS ${cfg['nr_s1_quadrant']}
 #define N_CLUSTERS_PER_QUAD ${cfg['s1_quadrant']['nr_clusters']}
-#define N_CORES_PER_CLUSTER ${cfg['cluster']['nr_cores']}
\ No newline at end of file
+#define N_CORES_PER_CLUSTER ${cfg['cluster']['nr_cores']}
+% if cfg['enable_multicast']:
+#define SUPPORTS_MULTICAST
+% endif
diff --git a/util/occamygen/occamy.py b/util/occamygen/occamy.py
index 73a3e748f..2b699f10c 100644
--- a/util/occamygen/occamy.py
+++ b/util/occamygen/occamy.py
@@ -57,6 +57,7 @@ def __init__(self, cfg):
         self.cluster.cfg["boot_addr"] = self.cfg["peripherals"]["rom"]["address"]
 
         self.cluster.cfg['tie_ports'] = False
+        self.cluster.cfg['enable_multicast'] = cfg['enable_multicast']
 
         if "ro_cache_cfg" in self.cfg["s1_quadrant"]:
             ro_cache = self.cfg["s1_quadrant"]["ro_cache_cfg"]
diff --git a/util/occamygen/occamygen.py b/util/occamygen/occamygen.py
index dbb48f147..39016188d 100755
--- a/util/occamygen/occamygen.py
+++ b/util/occamygen/occamygen.py
@@ -27,6 +27,13 @@
 DEFAULT_NAME = "occamy"
 
 
+def clog2(x):
+    """Ceiling of log2"""
+    if x <= 0:
+        raise ValueError("domain error")
+    return (x-1).bit_length()
+
+
 def write_template(tpl_path, outdir, fname=None, **kwargs):
     if tpl_path:
         tpl_path = pathlib.Path(tpl_path).absolute()
@@ -136,6 +143,8 @@ def main():
    nr_s1_quadrants = occamy.cfg["nr_s1_quadrant"]
    nr_s1_clusters = occamy.cfg["s1_quadrant"]["nr_clusters"]
    is_remote_quadrant = occamy.cfg["is_remote_quadrant"]
+   enable_narrow_multicast = occamy.cfg["enable_multicast"]
+   enable_wide_multicast = occamy.cfg["enable_multicast"]
 
    # Iterate over Hives to get the number of cores.
    nr_cluster_cores = len([
        core for hive in occamy.cfg["cluster"]["hives"]
@@ -564,12 +573,15 @@ def main():
            no_loopback=True,
            atop_support=False,
            context="soc",
-           node=am_quadrant_pre_xbar[i])
+           node=am_quadrant_pre_xbar[i],
+           forward_mcast=enable_wide_multicast)
 
        # Default port:
-       quadrant_pre_xbar.add_output_entry("quadrant_inter_xbar", am_quadrant_inter_xbar)
+       quadrant_pre_xbar.add_output_entry("quadrant_inter_xbar",
+                                          am_quadrant_inter_xbar,
+                                          forward_mcast=enable_wide_multicast)
        quadrant_pre_xbar.add_output_entry("hbm_xbar", am_hbm_xbar)
-       quadrant_pre_xbar.add_input("quadrant")
+       quadrant_pre_xbar.add_input("quadrant", is_mcast_master=enable_wide_multicast)
 
        quadrant_pre_xbars.append(quadrant_pre_xbar)
 
@@ -588,17 +600,18 @@ def main():
        no_loopback=True,
        atop_support=False,
        context="soc",
-       node=am_quadrant_inter_xbar)
+       node=am_quadrant_inter_xbar,
+       enable_multicast=enable_wide_multicast)
 
-   # Default port: soc wide xbar
-   quadrant_inter_xbar.add_output_entry("wide_xbar", am_soc_wide_xbar)
-   quadrant_inter_xbar.add_input("wide_xbar")
    for i in range(nr_s1_quadrants):
        # Default route passes HBI through quadrant 0
        # --> mask this route, forcing it through default wide xbar
        quadrant_inter_xbar.add_output_entry("quadrant_{}".format(i),
-                                            am_wide_xbar_quadrant_s1[i])
-       quadrant_inter_xbar.add_input("quadrant_{}".format(i))
+                                            am_wide_xbar_quadrant_s1[i],
+                                            is_mcast_target=enable_wide_multicast,
+                                            forward_mcast=enable_wide_multicast)
+       quadrant_inter_xbar.add_input("quadrant_{}".format(i),
+                                     is_mcast_master=enable_wide_multicast)
    for i, rq in enumerate(occamy.cfg["remote_quadrants"]):
        quadrant_inter_xbar.add_input("rmq_{}".format(i))
        quadrant_inter_xbar.add_output_entry("rmq_{}".format(i), am_remote_quadrants[i])
@@ -607,6 +620,9 @@ def main():
        quadrant_inter_xbar.add_output("remote", [])
        quadrant_inter_xbar.add_input("remote")
 
+   quadrant_inter_xbar.add_output_entry("wide_xbar", am_soc_wide_xbar)
+   quadrant_inter_xbar.add_input("wide_xbar")
+
    hbm_xbar = solder.AxiXbar(
        48,
        512,
@@ -675,17 +691,20 @@ def main():
        fall_through=occamy.cfg["narrow_xbar"]["fall_through"],
        no_loopback=True,
        context="soc",
-       node=am_soc_narrow_xbar)
+       node=am_soc_narrow_xbar,
+       enable_multicast=enable_narrow_multicast)
 
    for i in range(nr_s1_quadrants):
        soc_narrow_xbar.add_output_symbolic_multi("s1_quadrant_{}".format(i),
-                                                 [("s1_quadrant_base_addr",
+                                                 [(f"s1_quadrant_base_addr[{i}]",
                                                    "S1QuadrantAddressSpace"),
-                                                  ("s1_quadrant_cfg_base_addr",
-                                                   "S1QuadrantCfgAddressSpace")])
+                                                  (f"s1_quadrant_cfg_base_addr[{i}]",
+                                                   "S1QuadrantCfgAddressSpace")],
+                                                 is_mcast_target=enable_narrow_multicast,
+                                                 forward_mcast=enable_narrow_multicast)
        soc_narrow_xbar.add_input("s1_quadrant_{}".format(i))
 
-   soc_narrow_xbar.add_input("cva6")
+   soc_narrow_xbar.add_input("cva6", is_mcast_master=enable_narrow_multicast)
    soc_narrow_xbar.add_input("soc_wide")
    soc_narrow_xbar.add_input("periph")
    soc_narrow_xbar.add_input("pcie")
@@ -716,9 +735,10 @@ def main():
    # We need 3 "crossbars", which are really simple muxes and demuxes
    quadrant_s1_ctrl_xbars = dict()
-   for name, (iw, lm) in {
-       'soc_to_quad': (soc_narrow_xbar.iw_out(), "axi_pkg::CUT_SLV_PORTS"),
-       'quad_to_soc': (soc_narrow_xbar.iw, "axi_pkg::CUT_MST_PORTS"),
+   for name, (iw, lm, forward_mcast) in {
+       'soc_to_quad': (soc_narrow_xbar.iw_out(), "axi_pkg::CUT_SLV_PORTS",
+                       enable_narrow_multicast),
+       'quad_to_soc': (soc_narrow_xbar.iw, "axi_pkg::CUT_MST_PORTS", False),
    }.items():
        # Reuse (preserve) narrow Xbar IDs and max transactions
        quadrant_s1_ctrl_xbars[name] = solder.AxiXbar(
@@ -733,13 +753,15 @@ def main():
            max_mst_trans=occamy.cfg["narrow_xbar"]["max_mst_trans"],
            fall_through=occamy.cfg["narrow_xbar"]["fall_through"],
            latency_mode=lm,
-           context="quadrant_s1_ctrl")
+           context="quadrant_s1_ctrl",
+           forward_mcast=forward_mcast,
+           enable_default_mst_port=True,
+           default_mst_port_idx=0)
 
-   for name in ['soc_to_quad', 'quad_to_soc']:
-       quadrant_s1_ctrl_xbars[name].add_output("out", [])
-       quadrant_s1_ctrl_xbars[name].add_input("in")
+       quadrant_s1_ctrl_xbars[name].add_output("out", [], forward_mcast=forward_mcast)
+       quadrant_s1_ctrl_xbars[name].add_input("in", is_mcast_master=forward_mcast)
        quadrant_s1_ctrl_xbars[name].add_output_symbolic("internal",
-                                                        "internal_xbar_base_addr",
+                                                        "internal_xbar_base_addr[0]",
                                                         "S1QuadrantCfgAddressSpace")
 
    # AXI Lite mux to combine register requests
@@ -775,7 +797,10 @@ def main():
        no_loopback=True,
        atop_support=False,
        context="quadrant_s1",
-       node=am_wide_xbar_quadrant_s1[0])
+       node=am_wide_xbar_quadrant_s1[0],
+       enable_multicast=enable_wide_multicast,
+       enable_default_mst_port=True,
+       default_mst_port_idx=nr_s1_clusters)
 
    narrow_xbar_quadrant_s1 = solder.AxiXbar(
        48,
@@ -791,25 +816,37 @@ def main():
        ["max_mst_trans"],
        fall_through=occamy.cfg["s1_quadrant"]["narrow_xbar"]["fall_through"],
        no_loopback=True,
-       context="quadrant_s1")
-
-   wide_xbar_quadrant_s1.add_output("top", [])
-   wide_xbar_quadrant_s1.add_input("top")
-
-   narrow_xbar_quadrant_s1.add_output("top", [])
-   narrow_xbar_quadrant_s1.add_input("top")
+       context="quadrant_s1",
+       enable_multicast=enable_narrow_multicast,
+       enable_default_mst_port=True,
+       default_mst_port_idx=nr_s1_clusters)
 
    for i in range(nr_s1_clusters):
        wide_xbar_quadrant_s1.add_output_symbolic("cluster_{}".format(i),
-                                                 "cluster_base_addr",
-                                                 "ClusterAddressSpace")
+                                                 f"cluster_base_addr[{i}]",
+                                                 "ClusterAddressSpace",
+                                                 is_mcast_target=enable_wide_multicast,
+                                                 forward_mcast=enable_wide_multicast)
+       wide_xbar_quadrant_s1.add_input("cluster_{}".format(i),
+                                       is_mcast_master=enable_wide_multicast)
 
-       wide_xbar_quadrant_s1.add_input("cluster_{}".format(i))
        narrow_xbar_quadrant_s1.add_output_symbolic("cluster_{}".format(i),
-                                                   "cluster_base_addr",
-                                                   "ClusterAddressSpace")
+                                                   f"cluster_base_addr[{i}]",
+                                                   "ClusterAddressSpace",
+                                                   is_mcast_target=enable_narrow_multicast,
+                                                   forward_mcast=False)
        narrow_xbar_quadrant_s1.add_input("cluster_{}".format(i))
 
+   wide_xbar_quadrant_s1.add_input("top", is_mcast_master=enable_wide_multicast)
+   wide_xbar_quadrant_s1.add_output("top", [],
+                                    is_mcast_target=enable_wide_multicast,
+                                    forward_mcast=enable_wide_multicast)
+
+   narrow_xbar_quadrant_s1.add_input("top", is_mcast_master=enable_narrow_multicast)
+   narrow_xbar_quadrant_s1.add_output("top", [],
+                                      is_mcast_target=enable_narrow_multicast,
+                                      forward_mcast=False)
+
    # remote downstream mux
    rmq_mux = [None]*max(nr_remote_quadrants, 1 if is_remote_quadrant else 0)
    rmq_demux = [None]*max(nr_remote_quadrants, 1 if is_remote_quadrant else 0)
@@ -855,6 +892,10 @@ def main():
        dw=soc_axi_lite_narrow_periph_xbar.dw,
        name="apb_hbm_cfg")
 
+   ###########
+   # CodeGen #
+   ###########
+
    kwargs = {
        "solder": solder,
        "util": util,
@@ -902,20 +943,13 @@ def main():
    ###########################
    # SoC (fully synchronous) #
    ###########################
+
    write_template(args.soc_sv,
                   outdir,
                   module=solder.code_module['soc'],
                   soc_periph_xbar=soc_axi_lite_periph_xbar,
                   **kwargs)
 
-   ##########################
-   # S1 Quadrant controller #
-   ##########################
-   write_template(args.quadrant_s1_ctrl,
-                  outdir,
-                  module=solder.code_module['quadrant_s1_ctrl'],
-                  **kwargs)
-
    ###############
    # S1 Quadrant #
    ###############
@@ -934,6 +968,14 @@ def main():
        with open("{}/{}_quadrant_s1.sv".format(outdir, args.name), 'w') as f:
            f.write("// no quadrants in this design")
 
+   ##########################
+   # S1 Quadrant controller #
+   ##########################
+   write_template(args.quadrant_s1_ctrl,
+                  outdir,
+                  module=solder.code_module['quadrant_s1_ctrl'],
+                  **kwargs)
+
    ##################
    # Xilinx Wrapper #
    ##################
diff --git a/util/solder/solder.axi_lite_xbar.sv.tpl b/util/solder/solder.axi_lite_xbar.sv.tpl
index 5d623bdac..f1ec1aca0 100644
--- a/util/solder/solder.axi_lite_xbar.sv.tpl
+++ b/util/solder/solder.axi_lite_xbar.sv.tpl
@@ -44,7 +44,8 @@ localparam axi_pkg::xbar_cfg_t ${cfg_name} = '{
   UniqueIds: 0,
   AxiAddrWidth: ${xbar.aw},
   AxiDataWidth: ${xbar.dw},
-  NoAddrRules: ${xbar.addr_map_len()}
+  NoAddrRules: ${xbar.addr_map_len()},
+  default: '0
 };
 
 // AXI plugs of the `${xbar.name}` crossbar.
diff --git a/util/solder/solder.py b/util/solder/solder.py
index a5bb5997b..f293eeb1d 100644
--- a/util/solder/solder.py
+++ b/util/solder/solder.py
@@ -8,6 +8,8 @@
 import math
 import pathlib
 import logging
+import operator
+from termcolor import cprint
 
 from copy import copy
 from mako.lookup import TemplateLookup
@@ -321,17 +323,33 @@ def __init__(self):
 class AxiStruct:
     configs = dict()
 
-    def emit(aw, dw, iw, uw):
+    def emit(aw, dw, iw, uw, enable_multicast=False):
         global code_package
-        key = (aw, dw, iw, uw)
+        key = (aw, dw, iw, uw, enable_multicast)
+        # Skip emission if struct was already emitted. Ensures the
+        # same type is not defined multiple times
         if key in AxiStruct.configs:
             return AxiStruct.configs[key]
-        name = "axi_a{}_d{}_i{}_u{}".format(*key)
-        code = "// AXI bus with {} bit address, {} bit data, {} bit IDs, \
-            and {} bit user data.\n".format(*key)
+        if enable_multicast:
+            name = "axi_a{}_d{}_i{}_u{}_mcast".format(*key[0:-1])
+        else:
+            name = "axi_a{}_d{}_i{}_u{}".format(*key[0:-1])
+        code = ("// AXI bus with {} bit address, {} bit data, {} bit IDs, "
+                "and {} bit user data.\n").format(*key[0:-1])
         code += "`AXI_TYPEDEF_ALL_CT({}, {}_req_t, {}_resp_t, ".format(name, name, name)
-        code += "logic [{}:0], logic [{}:0], logic [{}:0], logic [{}:0], logic [{}:0])\n".format(
-            aw - 1, iw - 1, dw - 1, (dw + 7) // 8 - 1, max(0, uw - 1))
+        code += f"logic [{aw - 1}:0], "
+        code += f"logic [{iw - 1}:0], "
+        code += f"logic [{dw - 1}:0], "
+        code += f"logic [{(dw + 7) // 8 - 1}:0], "
+        if enable_multicast:
+            user_t = "struct packed {"
+            user_t += f"logic [{max(0, aw - 1)}:0] mcast;"
+            if uw > aw:
+                user_t += f" logic [{max(0, uw - aw - 1)}:0] atomics_id;"
+            user_t += "}"
+        else:
+            user_t = f"logic [{max(0, uw - 1)}:0]"
+        code += f"{user_t})\n"
         code_package += "\n" + code
         AxiStruct.configs[key] = name
         return name
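
When `enable_multicast` is set, the generated user field is no longer a plain `logic` vector but a packed struct carrying the multicast mask plus the original user bits. For `aw` = 48 and a widened user width of 53 (the original `uw` = 5 plus `aw`), the emitted SystemVerilog type is `struct packed { logic [47:0] mcast; logic [4:0] atomics_id; }`. A rough C model of that 53-bit payload follows; C bit-field layout is compiler-dependent, so this is illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    /* Approximate C model of the generated packed user struct. */
    typedef struct {
        uint64_t mcast : 48;    /* multicast address mask */
        uint8_t atomics_id : 5; /* original user bits, e.g. an atomics ID */
    } axi_user_mcast_t;

    int main(void) {
        axi_user_mcast_t u = { .mcast = (31ull << 18), .atomics_id = 3 };
        printf("mcast=0x%llx id=%u\n", (unsigned long long)u.mcast, u.atomics_id);
        return 0;
    }
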
@@ -681,8 +699,8 @@ def change_uw(self, context, target_uw, name, inst_name=None, to=None):
             assgn = "// Change UW\n"
             assgn += "`AXI_ASSIGN_REQ_STRUCT({lhs},{rhs})\n".format(lhs=bus.req_name(),
                                                                     rhs=self.req_name())
-            assgn += "`AXI_ASSIGN_RESP_STRUCT({lhs},{rhs})\n".format(lhs=self.rsp_name(),
-                                                                     rhs=bus.rsp_name())
+            assgn += "`AXI_ASSIGN_RESP_STRUCT({lhs},{rhs})\n\n".format(lhs=self.rsp_name(),
+                                                                       rhs=bus.rsp_name())
             context.write(assgn)
         return bus
@@ -1458,6 +1476,10 @@ def __init__(self,
                  atop_support=True,
                  latency_mode=None,
                  interleaved_ena=False,
+                 enable_multicast=False,
+                 forward_mcast=False,
+                 enable_default_mst_port=False,
+                 default_mst_port_idx=0,
                  **kwargs):
         super().__init__(**kwargs)
         self.aw = aw
@@ -1468,50 +1490,111 @@ def __init__(self,
         self.max_mst_trans = max_mst_trans
         self.fall_through = fall_through
         self.no_loopback = no_loopback
-        self.symbolic_addrmap = list()
-        self.symbolic_addrmap_multi = list()
         self.atop_support = atop_support
         self.interleaved_ena = interleaved_ena
+        self.enable_multicast = enable_multicast
+        if self.enable_multicast:
+            self.forward_mcast = True
+        else:
+            self.forward_mcast = forward_mcast
+        self.enable_default_mst_port = enable_default_mst_port
+        self.default_mst_port_idx = default_mst_port_idx
         self.addrmap = list()
         self.connections = dict()
         self.latency_mode = latency_mode or "axi_pkg::CUT_ALL_PORTS"
 
-    def add_input(self, name, outputs=None):
-        self.inputs.append(name)
+    def add_input(self, name, is_mcast_master=False, outputs=None):
+        self.inputs.append({'name': name, 'is_mcast_master': is_mcast_master})
         if outputs:
             self.connections[name] = outputs
 
-    def add_output(self, name, addrs, default=False):
+    def add_output(self, name, addrs, is_mcast_target=False, forward_mcast=False):
         idx = len(self.outputs)
         for lo, hi in addrs:
             if hi >> self.aw == 1:
                 hi -= 1
-            self.addrmap.append((idx, lo, hi))
-        self.outputs.append(name)
-
-    def add_output_symbolic(self, name, base, length):
+            self.addrmap.append({'idx': idx,
+                                 'is_symbolic': False,
+                                 'is_mcast_rule': is_mcast_target,
+                                 'lo': lo,
+                                 'hi': hi})
+        self.outputs.append({'name': name,
+                             'is_mcast_target': is_mcast_target,
+                             'forward_mcast': forward_mcast})
+
+    def add_output_symbolic(self, name, base, length, is_mcast_target=False, forward_mcast=False):
         idx = len(self.outputs)
-        self.symbolic_addrmap.append((idx, base, length))
-        self.outputs.append(name)
-
-    def add_output_symbolic_multi(self, name, entries):
+        self.addrmap.append({'idx': idx,
+                             'is_symbolic': True,
+                             'is_mcast_rule': is_mcast_target,
+                             'base': base,
+                             'length': length})
+        self.outputs.append({'name': name,
+                             'is_mcast_target': is_mcast_target,
+                             'forward_mcast': forward_mcast})
+
+    def add_output_symbolic_multi(self, name, entries, is_mcast_target=False, forward_mcast=False):
         idx = len(self.outputs)
-        self.symbolic_addrmap_multi.append((idx, entries))
-        self.outputs.append(name)
-
-    def add_output_entry(self, name, entry, range_mask=None):
+        for base, length in entries:
+            self.addrmap.append({'idx': idx,
+                                 'is_symbolic': True,
+                                 'is_mcast_rule': is_mcast_target,
+                                 'base': base,
+                                 'length': length})
+        self.outputs.append({'name': name,
+                             'is_mcast_target': is_mcast_target,
+                             'forward_mcast': forward_mcast})
+
+    def add_output_entry(self, name, entry, range_mask=None, is_mcast_target=False,
+                         forward_mcast=False):
         addrs = [(r.lo, r.hi) for r in self.node.get_routes() if r.port == entry]
         if range_mask is not None:
             addrs = filter(lambda r: r[0] >= range_mask[0] and r[1] < range_mask[1], addrs)
-        self.add_output(name, addrs)
+        self.add_output(name, addrs, is_mcast_target=is_mcast_target, forward_mcast=forward_mcast)
 
     def addr_map_len(self):
-        return len(self.addrmap) + len(self.symbolic_addrmap) + sum(
-            len(am[1]) for am in self.symbolic_addrmap_multi)
+        return len(self.addrmap)
+
+    def num_mcast_rules(self):
+        if self.enable_multicast:
+            return len([rule for rule in self.addrmap if rule['is_mcast_rule']])
+        else:
+            return 0
+
+    def num_mcast_ports(self):
+        if self.enable_multicast:
+            return len([output for output in self.outputs if output['is_mcast_target']])
+        else:
+            return 0
 
     def iw_out(self):
         return self.iw + int(math.ceil(math.log2(max(1, len(self.inputs)))))
+    def union_multicast_rules(self):
+        # Issue warning to manually check rules are ordered and contiguous
+        cprint(f"Warning: please ensure manually that the following rules for the {self.name} XBAR"
+               " are contiguous and ordered (lower addresses first).", color="yellow")
+        for rule in self.addrmap[0:self.num_mcast_rules()]:
+            if rule['is_symbolic']:
+                cprint(f"{self.outputs[rule['idx']]['name']} [{rule['base']},"
+                       f" {rule['base']} + {rule['length']}]", color="yellow")
+            else:
+                cprint(f"{self.outputs[rule['idx']]['name']} [{rule['lo']}, {rule['hi']}]",
+                       color="yellow")
+        # Get start address of the first multicast rule in the addrmap
+        first_rule = self.addrmap[0]
+        if first_rule['is_symbolic']:
+            start_addr = first_rule['base']
+        else:
+            start_addr = first_rule['lo']
+        # Get end address of the last multicast rule in the addrmap
+        last_rule = self.addrmap[self.num_mcast_rules()-1]
+        if last_rule['is_symbolic']:
+            end_addr = last_rule['base'] + ' + ' + last_rule['length']
+        else:
+            end_addr = last_rule['hi']
+        return start_addr, end_addr
+
     def emit(self):
         global code_module
         global code_package
@@ -1522,14 +1605,35 @@ def emit(self):
         # Compute the ID widths.
         iw_in = self.iw
         iw_out = self.iw_out()
+        # Compute the USER widths
+        uw = self.uw
+
+        # Multicast requirements
+        if self.enable_multicast:
+            # Check that multicast-targetable slaves are at lower indices.
+            # If a multicast-targetable slave follows a non multicast-targetable slave
+            # we have a violation of this rule.
+            violations = []
+            for i in range(len(self.outputs) - 1):
+                if not self.outputs[i]['is_mcast_target']:
+                    if self.outputs[i+1]['is_mcast_target']:
+                        violations.append(True)
+            assert (not violations), \
+                f'{self.name}: multicast-targetable slaves must be at lower indices'
+            # Sort address map rules by `is_mcast_rule` to ensure that
+            # multicast rules are at lower indices
+            self.addrmap.sort(key=operator.itemgetter('is_mcast_rule'), reverse=True)
+        if self.forward_mcast:
+            # Add multicast mask to USER signal
+            uw += self.aw
 
         # Emit the input enum into the package.
         input_enum_name = "{}_inputs_e".format(self.name)
         input_enum = "/// Inputs of the `{}` crossbar.\n".format(self.name)
         input_enum += "typedef enum int {\n"
         input_enums = list()
-        for name in self.inputs:
-            x = "{}_in_{}".format(self.name, name).upper()
+        for inp in self.inputs:
+            x = "{}_in_{}".format(self.name, inp['name']).upper()
             input_enums.append(x)
             input_enum += "  {},\n".format(x)
         input_enum += "  {}_NUM_INPUTS\n".format(self.name.upper())
@@ -1541,8 +1645,8 @@ def emit(self):
         output_enum = "/// Outputs of the `{}` crossbar.\n".format(self.name)
         output_enum += "typedef enum int {\n"
         output_enums = list()
-        for name in self.outputs:
-            x = "{}_out_{}".format(self.name, name).upper()
+        for output in self.outputs:
+            x = "{}_out_{}".format(self.name, output['name']).upper()
             output_enums.append(x)
             output_enum += "  {},\n".format(x)
         output_enum += "  {}_NUM_OUTPUTS\n".format(self.name.upper())
@@ -1568,7 +1672,10 @@ def emit(self):
         cfg += "  UniqueIds: {},\n".format(0)
         cfg += "  AxiAddrWidth: {},\n".format(self.aw)
         cfg += "  AxiDataWidth: {},\n".format(self.dw)
-        cfg += "  NoAddrRules: {}\n".format(self.addr_map_len())
+        cfg += "  NoAddrRules: {},\n".format(self.addr_map_len())
+        cfg += "  NoMulticastRules: {},\n".format(self.num_mcast_rules())
+        cfg += "  NoMulticastPorts: {},\n".format(self.num_mcast_ports())
+        cfg += "  default: '0\n"
         cfg += "};\n"
         code_package += "\n" + cfg
 
@@ -1581,36 +1688,68 @@ def emit(self):
                                                     self.addr_map_len() - 1, addrmap_name)
         addrmap += "assign {} = '{{\n".format(addrmap_name)
         addrmap_lines = []
-        for i in range(len(self.addrmap)):
-            addrmap_lines.append(
-                "  '{{ idx: {}, start_addr: {aw}'h{:08x}, end_addr: {aw}'h{:08x} }}".format(
-                    *self.addrmap[i], aw=self.aw))
-        for i, (idx, base, length) in enumerate(self.symbolic_addrmap):
-            addrmap_lines.append(
-                "  '{{ idx: {}, start_addr: {}[{i}], end_addr: {}[{i}] + {} }}".format(
-                    idx, base, base, length, i=i))
-        for i, (idx, entries) in enumerate(self.symbolic_addrmap_multi):
-            for base, length in entries:
-                addrmap_lines.append(
-                    "  '{{ idx: {}, start_addr: {}[{i}], end_addr: {}[{i}] + {} }}".format(
-                        idx, base, base, length, i=i))
+        # Invert order of rules in address map as lower indices come last in SystemVerilog
+        # array initializers
+        for rule in reversed(self.addrmap):
+            if rule['is_symbolic']:
+                line = "  '{{ idx: {}, start_addr: {}, end_addr: {} + {} }}".format(
+                    rule['idx'], rule['base'], rule['base'], rule['length'])
+            else:
+                line = "  '{{ idx: {}, start_addr: {aw}'h{:08x}, end_addr: {aw}'h{:08x} }}".format(
+                    rule['idx'], rule['lo'], rule['hi'], aw=self.aw)
+            addrmap_lines.append(line)
         addrmap += "{}\n}};\n".format(',\n'.join(addrmap_lines))
         code_module[self.context] += "\n" + addrmap
+        # Emit the default port definition
+        en_default_mst_port_i = f"'{int(self.enable_default_mst_port)}"
+        if self.enable_default_mst_port:
+            if self.enable_multicast:
+                union_start_addr, union_end_addr = self.union_multicast_rules()
+                default_port = "/// Default port of the `{}` crossbar.\n".format(self.name)
+                default_port += f"xbar_rule_{self.aw}_t {self.name}_default_port;\n"
+                default_port += f"assign {self.name}_default_port = '{{\n"
+                default_port += f"  idx: {self.default_mst_port_idx},\n"
+                default_port += f"  start_addr: {union_start_addr},\n"
+                default_port += f"  end_addr: {union_end_addr}\n"
+                default_port += "};\n"
+                code_module[self.context] += "\n" + default_port
+                default_mst_port_i = f"{{{len(self.inputs)}{{{self.name}_default_port}}}}"
+            else:
+                default_mst_port_idx_bits = (len(self.outputs)-1).bit_length()
+                default_port = f"{default_mst_port_idx_bits}" + \
+                               f"'b{self.default_mst_port_idx:b}"
+                default_mst_port_i = f"{{{len(self.inputs)}{{{default_port}}}}}"
+        else:
+            default_mst_port_i = "'0"
+
         # Emit the AXI structs into the package.
         self.input_struct = AxiStruct.emit(self.aw, self.dw, iw_in, self.uw)
         self.output_struct = AxiStruct.emit(self.aw, self.dw, iw_out, self.uw)
-
+        if self.forward_mcast:
+            self.input_struct_mcast = AxiStruct.emit(self.aw, self.dw, iw_in, uw,
+                                                     enable_multicast=True)
+            self.output_struct_mcast = AxiStruct.emit(self.aw, self.dw, iw_out, uw,
+                                                      enable_multicast=True)
+
+        # Rename (typedef) the generic AXI structs generated above to unique types
+        # for the input and output signals of this XBAR
         code_package += "\n"
         for tds in [
                 "req", "resp", "aw_chan", "w_chan", "b_chan", "ar_chan", "r_chan"
         ]:
+            if self.forward_mcast:
+                input_struct = self.input_struct_mcast
+                output_struct = self.output_struct_mcast
+            else:
+                input_struct = self.input_struct
+                output_struct = self.output_struct
             code_package += "typedef {}_{tds}_t {}_in_{tds}_t;\n".format(
-                self.input_struct, self.name, tds=tds)
+                input_struct, self.name, tds=tds)
             code_package += "typedef {}_{tds}_t {}_out_{tds}_t;\n".format(
-                self.output_struct, self.name, tds=tds)
+                output_struct, self.name, tds=tds)
 
         # Emit the characteristics of the AXI plugs into the package.
         code_package += "\n"
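
The default-port rule emitted above spans the union of all multicast rules, so any address not matched by an individual cluster rule still resolves to the default master port (index `nr_s1_clusters`, i.e. the `top` port, in the quadrant crossbars). A small sketch of that union, with made-up cluster apertures spaced 0x40000 apart:

    #include <stdint.h>
    #include <stdio.h>

    /* A minimal model of what union_multicast_rules() computes. */
    typedef struct { uint64_t lo, hi; } rule_t;

    int main(void) {
        rule_t rules[] = {
            { 0x10000000, 0x10040000 }, /* cluster_0 */
            { 0x10040000, 0x10080000 }, /* cluster_1 */
            { 0x10080000, 0x100C0000 }, /* cluster_2 */
            { 0x100C0000, 0x10100000 }, /* cluster_3 */
        };
        int n = sizeof(rules) / sizeof(rules[0]);
        /* Union = [first rule's start, last rule's end); anything outside
         * falls through to the default master port. */
        printf("default port rule: [0x%llx, 0x%llx)\n",
               (unsigned long long)rules[0].lo, (unsigned long long)rules[n - 1].hi);
        return 0;
    }
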
@@ -1637,64 +1776,114 @@ def emit(self):
                                                 len(self.outputs) - 1, self.name)
         code_module[self.context] += "\n" + code
 
-        for name, enum in zip(self.inputs, input_enums):
-            bus = AxiBus(
-                self.clk,
-                self.rst,
-                self.aw,
-                self.dw,
-                iw_in,
-                self.uw,
-                "{}_in".format(self.name),
-                "[{}]".format(enum),
-                type_prefix=self.input_struct,
-                declared=True,
-            )
-            self.__dict__["in_" + name] = bus
+        # Generate the buses to connect to every input port
+        # Note: does not generate any code, but may be referenced by the templates
+        code = ""
+        for inp, enum in zip(self.inputs, input_enums):
+            if self.forward_mcast:
+
+                if inp['is_mcast_master']:
+
+                    bus = AxiBus(
+                        self.clk,
+                        self.rst,
+                        self.aw,
+                        self.dw,
+                        iw_in,
+                        uw,
+                        "{}_in".format(self.name),
+                        "[{}]".format(enum),
+                        type_prefix=self.input_struct_mcast,
+                        declared=True,
+                    )
+                    self.__dict__["in_" + inp['name']] = bus
+
+                else:
+
+                    # If the XBAR supports multicast, for all input buses which
+                    # are not multicast masters, we need to change the
+                    # user width of the respective interfaces
+                    input_req = f"{self.name}_in_req[{enum}]"
+                    input_rsp = f"{self.name}_in_rsp[{enum}]"
+
+                    input_uwc = f"{self.name}_in_{inp['name']}_uwc"
+                    input_uwc_req = input_uwc + "_req"
+                    input_uwc_rsp = input_uwc + "_rsp"
+
+                    # Declare the intermediate interfaces first
+                    code += f"// Declare UWC input to {self.name}\n"
+                    code += f"  {self.input_struct + '_req_t'} {input_uwc_req};\n"
+                    code += f"  {self.input_struct + '_resp_t'} {input_uwc_rsp};\n\n"
+
+                    # And then change the user width
+                    code += "// Change UW\n"
+                    code += "`AXI_ASSIGN_REQ_STRUCT({lhs},{rhs})\n".format(lhs=input_req,
+                                                                           rhs=input_uwc_req)
+                    code += "`AXI_ASSIGN_RESP_STRUCT({lhs},{rhs})\n\n".format(lhs=input_uwc_rsp,
+                                                                              rhs=input_rsp)
+
+                    bus = AxiBus(
+                        self.clk,
+                        self.rst,
+                        self.aw,
+                        self.dw,
+                        iw_in,
+                        self.uw,
+                        input_uwc,
+                        type_prefix=self.input_struct,
+                        declared=True,
+                    )
+                    self.__dict__["in_" + inp['name']] = bus
 
-        for name, enum in zip(self.outputs, output_enums):
-            bus = AxiBus(
-                self.clk,
-                self.rst,
-                self.aw,
-                self.dw,
-                iw_out,
-                self.uw,
-                "{}_out".format(self.name),
-                "[{}]".format(enum),
-                type_prefix=self.output_struct,
-                declared=True,
-            )
-            self.__dict__["out_" + name] = bus
+            else:
+                bus = AxiBus(
+                    self.clk,
+                    self.rst,
+                    self.aw,
+                    self.dw,
+                    iw_in,
+                    self.uw,
+                    "{}_in".format(self.name),
+                    "[{}]".format(enum),
+                    type_prefix=self.input_struct,
+                    declared=True,
+                )
+                self.__dict__["in_" + inp['name']] = bus
+
+        code_module[self.context] += "\n" + code
 
         # Emit the crossbar instance itself.
code_package += "\n" @@ -1637,64 +1776,114 @@ def emit(self): len(self.outputs) - 1, self.name) code_module[self.context] += "\n" + code - for name, enum in zip(self.inputs, input_enums): - bus = AxiBus( - self.clk, - self.rst, - self.aw, - self.dw, - iw_in, - self.uw, - "{}_in".format(self.name), - "[{}]".format(enum), - type_prefix=self.input_struct, - declared=True, - ) - self.__dict__["in_" + name] = bus + # Generate the buses to connect to every input port + # Note: does not generate any code, but may be referenced by the templates + code = "" + for inp, enum in zip(self.inputs, input_enums): + if self.forward_mcast: + + if inp['is_mcast_master']: + + bus = AxiBus( + self.clk, + self.rst, + self.aw, + self.dw, + iw_in, + uw, + "{}_in".format(self.name), + "[{}]".format(enum), + type_prefix=self.input_struct_mcast, + declared=True, + ) + self.__dict__["in_" + inp['name']] = bus + + else: + + # If the XBAR supports multicast, for all input buses which + # are not multicast masters, we need to change the + # user width of the respective interfaces + input_req = f"{self.name}_in_req[{enum}]" + input_rsp = f"{self.name}_in_rsp[{enum}]" + + input_uwc = f"{self.name}_in_{inp['name']}_uwc" + input_uwc_req = input_uwc + "_req" + input_uwc_rsp = input_uwc + "_rsp" + + # Declare the intermediate interfaces first + code += f"// Declare UWC input to {self.name}\n" + code += f" {self.input_struct + '_req_t'} {input_uwc_req};\n" + code += f" {self.input_struct + '_resp_t'} {input_uwc_rsp};\n\n" + + # And then change the user width + code += "// Change UW\n" + code += "`AXI_ASSIGN_REQ_STRUCT({lhs},{rhs})\n".format(lhs=input_req, + rhs=input_uwc_req) + code += "`AXI_ASSIGN_RESP_STRUCT({lhs},{rhs})\n\n".format(lhs=input_uwc_rsp, + rhs=input_rsp) + + bus = AxiBus( + self.clk, + self.rst, + self.aw, + self.dw, + iw_in, + self.uw, + input_uwc, + type_prefix=self.input_struct, + declared=True, + ) + self.__dict__["in_" + inp['name']] = bus - for name, enum in zip(self.outputs, output_enums): - bus = AxiBus( - self.clk, - self.rst, - self.aw, - self.dw, - iw_out, - self.uw, - "{}_out".format(self.name), - "[{}]".format(enum), - type_prefix=self.output_struct, - declared=True, - ) - self.__dict__["out_" + name] = bus + else: + bus = AxiBus( + self.clk, + self.rst, + self.aw, + self.dw, + iw_in, + self.uw, + "{}_in".format(self.name), + "[{}]".format(enum), + type_prefix=self.input_struct, + declared=True, + ) + self.__dict__["in_" + inp['name']] = bus + + code_module[self.context] += "\n" + code # Emit the crossbar instance itself. 
+
+        # Generate the buses to connect to every output port
+        # Note: does not generate any code, but may be referenced by the templates
+        code = ""
+        for output, enum in zip(self.outputs, output_enums):
+
+            if self.forward_mcast:
+
+                if output['forward_mcast']:
+
+                    bus = AxiBus(
+                        self.clk,
+                        self.rst,
+                        self.aw,
+                        self.dw,
+                        iw_out,
+                        uw,
+                        "{}_out".format(self.name),
+                        "[{}]".format(enum),
+                        type_prefix=self.output_struct_mcast,
+                        declared=True,
+                    )
+                    self.__dict__["out_" + output['name']] = bus
+                else:
+                    # If the XBAR supports multicast, for all output buses which
+                    # do not forward the multicast signals, we need to change the
+                    # user width of the respective interfaces
+                    output_req = f"{self.name}_out_req[{enum}]"
+                    output_rsp = f"{self.name}_out_rsp[{enum}]"
+
+                    output_uwc = f"{self.name}_out_{output['name']}_uwc"
+                    output_uwc_req = output_uwc + "_req"
+                    output_uwc_rsp = output_uwc + "_rsp"
+
+                    # Declare the intermediate interfaces first
+                    code += f"// Declare UWC output from {self.name}\n"
+                    code += f"  {self.output_struct + '_req_t'} {output_uwc_req};\n"
+                    code += f"  {self.output_struct + '_resp_t'} {output_uwc_rsp};\n\n"
+
+                    # And then change the user width
+                    code += "// Change UW\n"
+                    code += "`AXI_ASSIGN_REQ_STRUCT({lhs},{rhs})\n".format(lhs=output_uwc_req,
+                                                                           rhs=output_req)
+                    code += "`AXI_ASSIGN_RESP_STRUCT({lhs},{rhs})\n\n".format(lhs=output_rsp,
+                                                                              rhs=output_uwc_rsp)
+
+                    bus = AxiBus(
+                        self.clk,
+                        self.rst,
+                        self.aw,
+                        self.dw,
+                        iw_out,
+                        self.uw,
+                        output_uwc,
+                        type_prefix=self.output_struct,
+                        declared=True,
+                    )
+                    self.__dict__["out_" + output['name']] = bus
+
+            else:
+                bus = AxiBus(
+                    self.clk,
+                    self.rst,
+                    self.aw,
+                    self.dw,
+                    iw_out,
+                    self.uw,
+                    "{}_out".format(self.name),
+                    "[{}]".format(enum),
+                    type_prefix=self.output_struct,
+                    declared=True,
+                )
+                self.__dict__["out_" + output['name']] = bus
 
         code_module[self.context] += "\n" + code
= output_uwc + "_rsp" + + # Declare the intermediate interfaces first + code += f"// Declare UWC output from {self.name}\n" + code += f" {self.output_struct + '_req_t'} {output_uwc_req};\n" + code += f" {self.output_struct + '_resp_t'} {output_uwc_rsp};\n\n" + + # And then change the user width + code += "// Change UW\n" + code += "`AXI_ASSIGN_REQ_STRUCT({lhs},{rhs})\n".format(lhs=output_uwc_req, + rhs=output_req) + code += "`AXI_ASSIGN_RESP_STRUCT({lhs},{rhs})\n\n".format(lhs=output_rsp, + rhs=output_uwc_rsp) + + bus = AxiBus( + self.clk, + self.rst, + self.aw, + self.dw, + iw_out, + self.uw, + output_uwc, + type_prefix=self.output_struct, + declared=True, + ) + self.__dict__["out_" + output['name']] = bus + + else: + bus = AxiBus( + self.clk, + self.rst, + self.aw, + self.dw, + iw_out, + self.uw, + "{}_out".format(self.name), + "[{}]".format(enum), + type_prefix=self.output_struct, + declared=True, + ) + self.__dict__["out_" + output['name']] = bus code_module[self.context] += "\n" + code @@ -1726,8 +1989,9 @@ def connectivity(self): for i in self.inputs: for o in self.outputs: # Disable link only if connectivity specified for input or loopback disabled - connectivity += "0" if (((i in self.connections) and (o not in self.connections[i])) - or (self.no_loopback and i == o)) else "1" + connectivity += "0" if (((i['name'] in self.connections) and + (o['name'] not in self.connections[i['name']])) + or (self.no_loopback and i['name'] == o['name'])) else "1" connectivity = "{}'b{}".format(length, connectivity[::-1]) return connectivity @@ -2119,7 +2383,6 @@ def __init__(self, self.max_mst_trans = max_mst_trans self.fall_through = fall_through self.symbolic_addrmap = list() - self.symbolic_addrmap_multi = list() self.addrmap = list() self.latency_mode = latency_mode or "axi_pkg::CUT_ALL_PORTS" @@ -2147,12 +2410,12 @@ def add_output_symbolic(self, name, base, length): def add_output_symbolic_multi(self, name, entries): idx = len(self.outputs) - self.symbolic_addrmap_multi.append((idx, entries)) + for base, length in entries: + self.symbolic_addrmap.append((idx, base, length)) self.outputs.append(name) def addr_map_len(self): - return len(self.addrmap) + len(self.symbolic_addrmap) + sum( - len(am) for am in self.symbolic_addrmap_multi) + return len(self.addrmap) + len(self.symbolic_addrmap) def emit(self): global code_module @@ -2178,11 +2441,6 @@ def emit(self): addrmap_lines.append( " '{{ idx: {}, start_addr: {}[{i}], end_addr: {}[{i}] + {} }}".format( idx, base, base, length, i=i)) - for i, (idx, entries) in enumerate(self.symbolic_addrmap_multi): - for base, length in entries: - addrmap_lines.append( - " '{{ idx: {}, start_addr: {}[{i}], end_addr: {}[{i}] + {} }}".format( - idx, base, base, length, i=i)) addrmap += "{}\n}};\n".format(',\n'.join(addrmap_lines)) code_module[self.context] += "\n" + addrmap