OswaldHe123 committed on
Commit
033e60e
·
verified ·
1 Parent(s): e05b93e

Upload Bitstreams

Browse files
Files changed (38) hide show
  1. .gitattributes +10 -0
  2. gpt-2-medium/Makefile +30 -0
  3. gpt-2-medium/README.md +26 -0
  4. gpt-2-medium/bitstreams/opt_kernel_latest.xclbin +3 -0
  5. gpt-2-medium/bitstreams/opt_kernel_latest.xclbin.info +497 -0
  6. gpt-2-medium/bitstreams/opt_kernel_stage_4.xclbin +3 -0
  7. gpt-2-medium/bitstreams/opt_kernel_stage_4.xclbin.info +490 -0
  8. gpt-2-medium/bitstreams/opt_kernel_stage_4_27b.xclbin +3 -0
  9. gpt-2-medium/bitstreams/opt_kernel_stage_4_27b.xclbin.info +502 -0
  10. gpt-2-medium/bitstreams/opt_kernel_vpk180.xsa +3 -0
  11. gpt-2-medium/bitstreams/opt_kernel_vpk180_fixed.xsa +3 -0
  12. gpt-2-medium/bitstreams/opt_kernel_vpk180_full.xsa +3 -0
  13. gpt-2-medium/bitstreams/opt_kernel_vpk180_mask.xsa +3 -0
  14. gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_full.info +485 -0
  15. gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_full.xclbin +3 -0
  16. gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_xdma_201920_3.xclbin +3 -0
  17. gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_xdma_201920_3.xclbin.info +502 -0
  18. gpt-2-medium/export_xo.py +52 -0
  19. gpt-2-medium/generate_bitstream_sample.sh +40 -0
  20. gpt-2-medium/hbm_config.ini +7 -0
  21. gpt-2-medium/host-u280.cpp +172 -0
  22. gpt-2-medium/host-versal.cpp +194 -0
  23. gpt-2-medium/host.cpp +194 -0
  24. gpt-2-medium/host_opencl.cpp +273 -0
  25. gpt-2-medium/host_opencl.h +71 -0
  26. gpt-2-medium/kernel-ultrascale.cpp +2091 -0
  27. gpt-2-medium/kernel-versal.cpp +0 -0
  28. gpt-2-medium/kernel.cpp +1528 -0
  29. gpt-2-medium/link_config_versal.ini +7 -0
  30. gpt-2-medium/opt-versal-rs.py +43 -0
  31. gpt-2-medium/package_sample.sh +38 -0
  32. gpt-2-medium/parse_floorplan.py +223 -0
  33. gpt-2-medium/run_app.sh +8 -0
  34. gpt-2-medium/run_tapa.sh +15 -0
  35. gpt-2-medium/run_tapa_rs.sh +28 -0
  36. gpt-2-medium/xo/constraints.tcl +157 -0
  37. gpt-2-medium/xo/opt_kernel.xo +3 -0
  38. gpt-2-medium/xrt.ini +2 -0
.gitattributes CHANGED
@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ gpt-2-medium/bitstreams/opt_kernel_latest.xclbin filter=lfs diff=lfs merge=lfs -text
37
+ gpt-2-medium/bitstreams/opt_kernel_stage_4_27b.xclbin filter=lfs diff=lfs merge=lfs -text
38
+ gpt-2-medium/bitstreams/opt_kernel_stage_4.xclbin filter=lfs diff=lfs merge=lfs -text
39
+ gpt-2-medium/bitstreams/opt_kernel_vpk180_fixed.xsa filter=lfs diff=lfs merge=lfs -text
40
+ gpt-2-medium/bitstreams/opt_kernel_vpk180_full.xsa filter=lfs diff=lfs merge=lfs -text
41
+ gpt-2-medium/bitstreams/opt_kernel_vpk180_mask.xsa filter=lfs diff=lfs merge=lfs -text
42
+ gpt-2-medium/bitstreams/opt_kernel_vpk180.xsa filter=lfs diff=lfs merge=lfs -text
43
+ gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_full.xclbin filter=lfs diff=lfs merge=lfs -text
44
+ gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_xdma_201920_3.xclbin filter=lfs diff=lfs merge=lfs -text
45
+ gpt-2-medium/xo/opt_kernel.xo filter=lfs diff=lfs merge=lfs -text
gpt-2-medium/Makefile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Build rules for the GPT-2-medium FPGA accelerator hosts.
#  - opt350 / opt350-ultrascale / opt350-versal: x86 TAPA host+kernel binaries
#    (linked against tapa/frt/glog/gflags/OpenCL from system lib dirs).
#  - host-opencl: aarch64 XRT/OpenCL host for the Versal PS, cross-compiled
#    against the custom VPK180 platform sysroot.
# Requires XILINX_VITIS / XILINX_VIVADO / XILINX_HLS to be set in the
# environment (sourced from the Vitis settings scripts).
GCC=g++
ARMGCC=$(XILINX_VITIS)/gnu/aarch64/lin/aarch64-linux/bin/aarch64-linux-gnu-g++
SYSROOT=/home/oswaldhe/vpk180_custom_platform/vpk180_custom_platform.vitis/xilinx-versal-common-v2023.2/sysroots/cortexa72-cortexa53-xilinx-linux

# Optional spack-resolved dependency roots; uncomment when building against
# spack-installed tapa/fpga-runtime/glog/gflags instead of system copies.
# TAPA_ROOT=$(shell spack location -i tapa@2024-05-18)
# FRT_ROOT=$(shell spack location -i fpga-runtime)
# GLOG_ROOT=$(shell spack location -i glog/pqucikz)
# GFLAGS_ROOT=$(shell spack location -i gflags/y2uaz43)
INCLUDE_FLAGS=-I$(TAPA_ROOT)/include -I$(FRT_ROOT)/include -I$(GLOG_ROOT)/include -I$(GFLAGS_ROOT)/include -I$(XILINX_HLS)/include
LDFLAGS=-L$(TAPA_ROOT)/lib -L$(FRT_ROOT)/lib -L$(GLOG_ROOT)/lib -L$(GFLAGS_ROOT)/lib -ltapa -lfrt -lglog -lgflags -lm
# RPATH_FLAGS=-Wl,-rpath,$(TAPA_ROOT)/lib -Wl,-rpath,$(FRT_ROOT)/lib -Wl,-rpath,$(GLOG_ROOT)/lib -Wl,-rpath,$(GFLAGS_ROOT)/lib

# clean is a command, not a file: declare it phony so a stray file named
# "clean" cannot mask it.
.PHONY: clean

opt350: kernel.cpp host.cpp
	$(GCC) -o $@ -O2 $^ -L/lib/x86_64-linux-gnu -L/usr/local/lib -ltapa -lfrt -lgflags -lglog -lm -lOpenCL -I$(XILINX_HLS)/include

opt350-ultrascale: kernel-ultrascale.cpp host-u280.cpp
	$(GCC) -o $@ -O2 $^ -L/lib/x86_64-linux-gnu -L/usr/local/lib -ltapa -lfrt -lgflags -lglog -lm -lOpenCL -I$(XILINX_HLS)/include

# Link step for the cross-compiled Versal PS host.
host-opencl: host_opencl.o
	$(ARMGCC) -o $@ $^ -L$(SYSROOT)/usr/lib/ -lxrt_coreutil -lpthread -lrt -lstdc++ -lgmp -lOpenCL --sysroot=$(SYSROOT)

host_opencl.o: host_opencl.cpp
	$(ARMGCC) -c -D__USE_XOPEN2K8 -I$(SYSROOT)/usr/include/xrt -I$(XILINX_VIVADO)/include -I$(SYSROOT)/usr/include -I$(XILINX_HLS)/include -fmessage-length=0 -std=c++17 --sysroot=$(SYSROOT) -o $@ $<

opt350-versal: kernel-versal.cpp host-versal.cpp
	$(GCC) -o $@ -O2 $^ -L/lib/x86_64-linux-gnu -L/usr/local/lib -ltapa -lfrt -lglog -lgflags -lm -lOpenCL -I$(XILINX_HLS)/include

# Remove every artifact this Makefile can produce. $(RM) = rm -f, so clean
# succeeds even when nothing has been built yet. (The old rule deleted
# "opt-versal", which no rule ever builds — the real target is opt350-versal —
# and omitted the host-opencl artifacts.)
clean:
	$(RM) opt350 opt350-versal opt350-ultrascale host-opencl host_opencl.o
gpt-2-medium/README.md ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Place & Route Instructions
2
+
3
+ ### Generate Vitis Platform
4
+
5
+ Follow this [tutorial](https://docs.amd.com/r/2023.2-English/Vitis-Tutorials-Vitis-Platform-Creation/Versal-Platform-Creation-Quick-Start) to generate the Vitis Platform for VPK180. There are a couple of changes:
6
+
7
+ 1. Step 1-3: Select VPK180 as the device. Generate 3 clocks: 100MHz, 200MHz, 300MHz.
8
+ 2. Step 2-2: git-branch should be `xlnx_rel_v2023.2`. `system-user.dtsi` is on [Vitis Tutorial Github Repo](https://github.com/Xilinx/Vitis-Tutorials/blob/2023.2/Vitis_Platform_Creation/Design_Tutorials/03_Edge_VCK190/ref_files/step2_pfm/system-user.dtsi). Change the name to Xilinx custom-vpk180. Board name is `versal-vpk180-reva`.
9
+
10
+ ### Launch V++ Script for P&R
11
+
12
+ After exporting the xo container, replace the platform path, xo path, and constraint path in `generate_bitstream_sample.sh` and launch the script to start P&R.
13
+
14
+ ### Hardware Emulation Using QEMU
15
+
16
+ After exporting the xo container, replace the platform path, xo path, and constraint path in `generate_bitstream_sample.sh`. Change target to `hw_emu` and turn on debug mode `-g`. After generating the xsa file for hardware emulation, run `package_sample.sh` with the same modifications as `generate_bitstream_sample.sh`, with the files you want to include in the SD card image (including the host binary, launch scripts, and configuration file `xrt.ini`). You will find a script `/package/launch_hw_emu.sh` to start QEMU directly.
17
+
18
+ ## Latency References vs. SoTA (ms)
19
+
20
+ |Seq Length | Allo | DFX | NVIDIA T4 | NVIDIA A100 | AMD MI210 |
21
+ | ---- | ---- | ---- | ---- | ---- | ---- |
22
+ | 64 | 205.46 | 349.1 | 47.26 | 39.8 | 7.776 |
23
+ | 128 | 370.56 | 692.8 | 56.4 | 39.51 | 8.541 |
24
+ | 256 | 740.76 | 1412.5 | 81.0 | 39.82 | 10.12 |
25
+ | 512 | 1333.79 | 2825.1 | 162.91 | 49.06 | 15.52 |
26
+ | 1024 | 3777.4 | 6079 | 360.9 | 49.17 | 33.08 |
gpt-2-medium/bitstreams/opt_kernel_latest.xclbin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:090f0f57d4d3450a0a44c8bc3c50c3271fe5af186e2c7d165d62ec70ac48dbe7
3
+ size 76134932
gpt-2-medium/bitstreams/opt_kernel_latest.xclbin.info ADDED
@@ -0,0 +1,497 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ==============================================================================
3
+ XRT Build Version: 2.14.384 (2022.2)
4
+ Build Date: 2022-12-09 00:55:08
5
+ Hash ID: 090bb050d570d2b668477c3bd0f979dc3a34b9db
6
+ ==============================================================================
7
+ xclbin Information
8
+ ------------------
9
+ Generated by: v++ (2021.2) on 2021-10-14-04:41:01
10
+ Version: 2.14.384
11
+ Kernels: opt_kernel
12
+ Signature:
13
+ Content: Bitstream
14
+ UUID (xclbin): 41b0c8a4-f618-a8f7-0b11-d3c822641412
15
+ Sections: DEBUG_IP_LAYOUT, BITSTREAM, MEM_TOPOLOGY, IP_LAYOUT,
16
+ CONNECTIVITY, CLOCK_FREQ_TOPOLOGY, BUILD_METADATA,
17
+ EMBEDDED_METADATA, SYSTEM_METADATA,
18
+ GROUP_CONNECTIVITY, GROUP_TOPOLOGY
19
+ ==============================================================================
20
+ Hardware Platform (Shell) Information
21
+ -------------------------------------
22
+ Vendor: xilinx
23
+ Board: u280
24
+ Name: xdma
25
+ Version: 201920.3
26
+ Generated Version: Vivado 2019.2 (SW Build: 2742762)
27
+ Created:
28
+ Tue Jan 21 23:21:22 2020 FPGA Device: xcu280
29
+ Board Vendor: xilinx.com
30
+ Board Name: xilinx.com:au280:1.0
31
+ Board Part: xilinx.com:au280:part0:1.0
32
+ Platform VBNV: xilinx_u280_xdma_201920_3
33
+ Static UUID: f2b82d53-372f-45a4-bbe9-3d1c980216da
34
+ Feature ROM TimeStamp: 1579649056
35
+
36
+ Scalable Clocks
37
+ ---------------
38
+ Name: clk_out1_pfm_top_clkwiz_hbm_aclk_0
39
+ Index: 0
40
+ Type: SYSTEM
41
+ Frequency: 450 MHz
42
+
43
+ Name: DATA_CLK
44
+ Index: 1
45
+ Type: DATA
46
+ Frequency: 224 MHz
47
+
48
+ Name: KERNEL_CLK
49
+ Index: 2
50
+ Type: KERNEL
51
+ Frequency: 500 MHz
52
+
53
+ System Clocks
54
+ ------
55
+ Name: _bd_top_clkwiz_kernel2_clk_out1
56
+ Type: SCALABLE
57
+ Default Freq: 500 MHz
58
+ Requested Freq: 500 MHz
59
+ Achieved Freq: 500 MHz
60
+
61
+ Name: _bd_top_clkwiz_kernel_clk_out1
62
+ Type: SCALABLE
63
+ Default Freq: 300 MHz
64
+ Requested Freq: 300 MHz
65
+ Achieved Freq: 224.4 MHz
66
+
67
+ Memory Configuration
68
+ --------------------
69
+ Name: HBM[0]
70
+ Index: 0
71
+ Type: MEM_DDR4
72
+ Base Address: 0x0
73
+ Address Size: 0x10000000
74
+ Bank Used: Yes
75
+
76
+ Name: HBM[1]
77
+ Index: 1
78
+ Type: MEM_DDR4
79
+ Base Address: 0x10000000
80
+ Address Size: 0x10000000
81
+ Bank Used: Yes
82
+
83
+ Name: HBM[2]
84
+ Index: 2
85
+ Type: MEM_DRAM
86
+ Base Address: 0x20000000
87
+ Address Size: 0x10000000
88
+ Bank Used: Yes
89
+
90
+ Name: HBM[3]
91
+ Index: 3
92
+ Type: MEM_DRAM
93
+ Base Address: 0x30000000
94
+ Address Size: 0x10000000
95
+ Bank Used: No
96
+
97
+ Name: HBM[4]
98
+ Index: 4
99
+ Type: MEM_DRAM
100
+ Base Address: 0x40000000
101
+ Address Size: 0x10000000
102
+ Bank Used: No
103
+
104
+ Name: HBM[5]
105
+ Index: 5
106
+ Type: MEM_DRAM
107
+ Base Address: 0x50000000
108
+ Address Size: 0x10000000
109
+ Bank Used: No
110
+
111
+ Name: HBM[6]
112
+ Index: 6
113
+ Type: MEM_DRAM
114
+ Base Address: 0x60000000
115
+ Address Size: 0x10000000
116
+ Bank Used: No
117
+
118
+ Name: HBM[7]
119
+ Index: 7
120
+ Type: MEM_DRAM
121
+ Base Address: 0x70000000
122
+ Address Size: 0x10000000
123
+ Bank Used: No
124
+
125
+ Name: HBM[8]
126
+ Index: 8
127
+ Type: MEM_DRAM
128
+ Base Address: 0x80000000
129
+ Address Size: 0x10000000
130
+ Bank Used: No
131
+
132
+ Name: HBM[9]
133
+ Index: 9
134
+ Type: MEM_DRAM
135
+ Base Address: 0x90000000
136
+ Address Size: 0x10000000
137
+ Bank Used: No
138
+
139
+ Name: HBM[10]
140
+ Index: 10
141
+ Type: MEM_DRAM
142
+ Base Address: 0xa0000000
143
+ Address Size: 0x10000000
144
+ Bank Used: No
145
+
146
+ Name: HBM[11]
147
+ Index: 11
148
+ Type: MEM_DRAM
149
+ Base Address: 0xb0000000
150
+ Address Size: 0x10000000
151
+ Bank Used: No
152
+
153
+ Name: HBM[12]
154
+ Index: 12
155
+ Type: MEM_DRAM
156
+ Base Address: 0xc0000000
157
+ Address Size: 0x10000000
158
+ Bank Used: No
159
+
160
+ Name: HBM[13]
161
+ Index: 13
162
+ Type: MEM_DRAM
163
+ Base Address: 0xd0000000
164
+ Address Size: 0x10000000
165
+ Bank Used: No
166
+
167
+ Name: HBM[14]
168
+ Index: 14
169
+ Type: MEM_DRAM
170
+ Base Address: 0xe0000000
171
+ Address Size: 0x10000000
172
+ Bank Used: No
173
+
174
+ Name: HBM[15]
175
+ Index: 15
176
+ Type: MEM_DRAM
177
+ Base Address: 0xf0000000
178
+ Address Size: 0x10000000
179
+ Bank Used: No
180
+
181
+ Name: HBM[16]
182
+ Index: 16
183
+ Type: MEM_DRAM
184
+ Base Address: 0x100000000
185
+ Address Size: 0x10000000
186
+ Bank Used: Yes
187
+
188
+ Name: HBM[17]
189
+ Index: 17
190
+ Type: MEM_DRAM
191
+ Base Address: 0x110000000
192
+ Address Size: 0x10000000
193
+ Bank Used: Yes
194
+
195
+ Name: HBM[18]
196
+ Index: 18
197
+ Type: MEM_DRAM
198
+ Base Address: 0x120000000
199
+ Address Size: 0x10000000
200
+ Bank Used: Yes
201
+
202
+ Name: HBM[19]
203
+ Index: 19
204
+ Type: MEM_DRAM
205
+ Base Address: 0x130000000
206
+ Address Size: 0x10000000
207
+ Bank Used: Yes
208
+
209
+ Name: HBM[20]
210
+ Index: 20
211
+ Type: MEM_DRAM
212
+ Base Address: 0x140000000
213
+ Address Size: 0x10000000
214
+ Bank Used: No
215
+
216
+ Name: HBM[21]
217
+ Index: 21
218
+ Type: MEM_DRAM
219
+ Base Address: 0x150000000
220
+ Address Size: 0x10000000
221
+ Bank Used: No
222
+
223
+ Name: HBM[22]
224
+ Index: 22
225
+ Type: MEM_DRAM
226
+ Base Address: 0x160000000
227
+ Address Size: 0x10000000
228
+ Bank Used: No
229
+
230
+ Name: HBM[23]
231
+ Index: 23
232
+ Type: MEM_DRAM
233
+ Base Address: 0x170000000
234
+ Address Size: 0x10000000
235
+ Bank Used: No
236
+
237
+ Name: HBM[24]
238
+ Index: 24
239
+ Type: MEM_DRAM
240
+ Base Address: 0x180000000
241
+ Address Size: 0x10000000
242
+ Bank Used: No
243
+
244
+ Name: HBM[25]
245
+ Index: 25
246
+ Type: MEM_DRAM
247
+ Base Address: 0x190000000
248
+ Address Size: 0x10000000
249
+ Bank Used: No
250
+
251
+ Name: HBM[26]
252
+ Index: 26
253
+ Type: MEM_DRAM
254
+ Base Address: 0x1a0000000
255
+ Address Size: 0x10000000
256
+ Bank Used: No
257
+
258
+ Name: HBM[27]
259
+ Index: 27
260
+ Type: MEM_DRAM
261
+ Base Address: 0x1b0000000
262
+ Address Size: 0x10000000
263
+ Bank Used: No
264
+
265
+ Name: HBM[28]
266
+ Index: 28
267
+ Type: MEM_DRAM
268
+ Base Address: 0x1c0000000
269
+ Address Size: 0x10000000
270
+ Bank Used: No
271
+
272
+ Name: HBM[29]
273
+ Index: 29
274
+ Type: MEM_DRAM
275
+ Base Address: 0x1d0000000
276
+ Address Size: 0x10000000
277
+ Bank Used: No
278
+
279
+ Name: HBM[30]
280
+ Index: 30
281
+ Type: MEM_DRAM
282
+ Base Address: 0x1e0000000
283
+ Address Size: 0x10000000
284
+ Bank Used: No
285
+
286
+ Name: HBM[31]
287
+ Index: 31
288
+ Type: MEM_DRAM
289
+ Base Address: 0x1f0000000
290
+ Address Size: 0x10000000
291
+ Bank Used: No
292
+
293
+ Name: DDR[0]
294
+ Index: 32
295
+ Type: MEM_DRAM
296
+ Base Address: 0x0
297
+ Address Size: 0x0
298
+ Bank Used: No
299
+
300
+ Name: DDR[1]
301
+ Index: 33
302
+ Type: MEM_DRAM
303
+ Base Address: 0x0
304
+ Address Size: 0x0
305
+ Bank Used: No
306
+
307
+ Name: PLRAM[0]
308
+ Index: 34
309
+ Type: MEM_DRAM
310
+ Base Address: 0x0
311
+ Address Size: 0x0
312
+ Bank Used: No
313
+
314
+ Name: PLRAM[1]
315
+ Index: 35
316
+ Type: MEM_DRAM
317
+ Base Address: 0x0
318
+ Address Size: 0x0
319
+ Bank Used: No
320
+
321
+ Name: PLRAM[2]
322
+ Index: 36
323
+ Type: MEM_DRAM
324
+ Base Address: 0x0
325
+ Address Size: 0x0
326
+ Bank Used: No
327
+
328
+ Name: PLRAM[3]
329
+ Index: 37
330
+ Type: MEM_DRAM
331
+ Base Address: 0x0
332
+ Address Size: 0x0
333
+ Bank Used: No
334
+
335
+ Name: PLRAM[4]
336
+ Index: 38
337
+ Type: MEM_DRAM
338
+ Base Address: 0x0
339
+ Address Size: 0x0
340
+ Bank Used: No
341
+
342
+ Name: PLRAM[5]
343
+ Index: 39
344
+ Type: MEM_DRAM
345
+ Base Address: 0x0
346
+ Address Size: 0x0
347
+ Bank Used: No
348
+ ==============================================================================
349
+ Kernel: opt_kernel
350
+
351
+ Definition
352
+ ----------
353
+ Signature: opt_kernel (const int L, const int L_out, const int seq_len, ap_uint<512>* X_acc0, ap_uint<512>* X_acc1, ap_uint<512>* W_acc0, ap_uint<512>* W_acc1, ap_uint<64>* acc0_out, ap_uint<64>* acc1_out, int* cycle_count)
354
+
355
+ Ports
356
+ -----
357
+ Port: m_axi_X_acc0
358
+ Mode: master
359
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
360
+ Data Width: 512 bits
361
+ Port Type: addressable
362
+
363
+ Port: m_axi_X_acc1
364
+ Mode: master
365
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
366
+ Data Width: 512 bits
367
+ Port Type: addressable
368
+
369
+ Port: m_axi_W_acc0
370
+ Mode: master
371
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
372
+ Data Width: 512 bits
373
+ Port Type: addressable
374
+
375
+ Port: m_axi_W_acc1
376
+ Mode: master
377
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
378
+ Data Width: 512 bits
379
+ Port Type: addressable
380
+
381
+ Port: m_axi_acc0_out
382
+ Mode: master
383
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
384
+ Data Width: 64 bits
385
+ Port Type: addressable
386
+
387
+ Port: m_axi_acc1_out
388
+ Mode: master
389
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
390
+ Data Width: 64 bits
391
+ Port Type: addressable
392
+
393
+ Port: m_axi_cycle_count
394
+ Mode: master
395
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
396
+ Data Width: 32 bits
397
+ Port Type: addressable
398
+
399
+ Port: s_axi_control
400
+ Mode: slave
401
+ Range (bytes): 0x1000
402
+ Data Width: 32 bits
403
+ Port Type: addressable
404
+
405
+ --------------------------
406
+ Instance: opt_kernel
407
+ Base Address: 0x1800000
408
+
409
+ Argument: L
410
+ Register Offset: 0x10
411
+ Port: s_axi_control
412
+ Memory: <not applicable>
413
+
414
+ Argument: L_out
415
+ Register Offset: 0x18
416
+ Port: s_axi_control
417
+ Memory: <not applicable>
418
+
419
+ Argument: seq_len
420
+ Register Offset: 0x20
421
+ Port: s_axi_control
422
+ Memory: <not applicable>
423
+
424
+ Argument: X_acc0
425
+ Register Offset: 0x28
426
+ Port: m_axi_X_acc0
427
+ Memory: HBM[0] (MEM_DDR4)
428
+
429
+ Argument: X_acc1
430
+ Register Offset: 0x34
431
+ Port: m_axi_X_acc1
432
+ Memory: HBM[16] (MEM_DRAM)
433
+
434
+ Argument: W_acc0
435
+ Register Offset: 0x40
436
+ Port: m_axi_W_acc0
437
+ Memory: HBM[1] (MEM_DDR4)
438
+
439
+ Argument: W_acc1
440
+ Register Offset: 0x4c
441
+ Port: m_axi_W_acc1
442
+ Memory: HBM[17] (MEM_DRAM)
443
+
444
+ Argument: acc0_out
445
+ Register Offset: 0x58
446
+ Port: m_axi_acc0_out
447
+ Memory: HBM[2] (MEM_DRAM)
448
+
449
+ Argument: acc1_out
450
+ Register Offset: 0x64
451
+ Port: m_axi_acc1_out
452
+ Memory: HBM[18] (MEM_DRAM)
453
+
454
+ Argument: cycle_count
455
+ Register Offset: 0x70
456
+ Port: m_axi_cycle_count
457
+ Memory: HBM[19] (MEM_DRAM)
458
+ ==============================================================================
459
+ Generated By
460
+ ------------
461
+ Command: v++
462
+ Version: 2021.2 - 2021-10-14-04:41:01 (SW BUILD: 3363252)
463
+ Command Line: v++ --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-1/run/link_config.ini --connectivity.nk opt_kernel:1:opt_kernel --connectivity.sp opt_kernel.X_acc0:HBM[0] --connectivity.sp opt_kernel.X_acc1:HBM[16] --connectivity.sp opt_kernel.W_acc0:HBM[1] --connectivity.sp opt_kernel.W_acc1:HBM[17] --connectivity.sp opt_kernel.acc0_out:HBM[2] --connectivity.sp opt_kernel.acc1_out:HBM[18] --connectivity.sp opt_kernel.cycle_count:HBM[19] --input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-1/opt.hw.xo --kernel opt_kernel --link --optimize 3 --output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin --platform xilinx_u280_xdma_201920_3 --report_level 2 --save-temps --target hw --temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget -propconst -sweep -shift_register_opt} --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=AltSpreadLogic_high --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-1/opt-floorplan.tcl --vivado.synth.jobs 8
464
+ Options: --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-1/run/link_config.ini
465
+ --connectivity.nk opt_kernel:1:opt_kernel
466
+ --connectivity.sp opt_kernel.X_acc0:HBM[0]
467
+ --connectivity.sp opt_kernel.X_acc1:HBM[16]
468
+ --connectivity.sp opt_kernel.W_acc0:HBM[1]
469
+ --connectivity.sp opt_kernel.W_acc1:HBM[17]
470
+ --connectivity.sp opt_kernel.acc0_out:HBM[2]
471
+ --connectivity.sp opt_kernel.acc1_out:HBM[18]
472
+ --connectivity.sp opt_kernel.cycle_count:HBM[19]
473
+ --input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-1/opt.hw.xo
474
+ --kernel opt_kernel
475
+ --link
476
+ --optimize 3
477
+ --output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin
478
+ --platform xilinx_u280_xdma_201920_3
479
+ --report_level 2
480
+ --save-temps
481
+ --target hw
482
+ --temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp
483
+ --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1
484
+ --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget
485
+ -propconst
486
+ -sweep
487
+ -shift_register_opt}
488
+ --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=AltSpreadLogic_high
489
+ --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore
490
+ --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore
491
+ --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-1/opt-floorplan.tcl
492
+ --vivado.synth.jobs 8
493
+ ==============================================================================
494
+ User Added Key Value Pairs
495
+ --------------------------
496
+ <empty>
497
+ ==============================================================================
gpt-2-medium/bitstreams/opt_kernel_stage_4.xclbin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0413a6d5d20f76bc7b5d5376f088edec7ee574db131b857215b5a2fbd99e6075
3
+ size 76961468
gpt-2-medium/bitstreams/opt_kernel_stage_4.xclbin.info ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ==============================================================================
3
+ XRT Build Version: 2.14.384 (2022.2)
4
+ Build Date: 2022-12-09 00:55:08
5
+ Hash ID: 090bb050d570d2b668477c3bd0f979dc3a34b9db
6
+ ==============================================================================
7
+ xclbin Information
8
+ ------------------
9
+ Generated by: v++ (2021.2) on 2021-10-14-04:41:01
10
+ Version: 2.14.384
11
+ Kernels: opt_kernel
12
+ Signature:
13
+ Content: Bitstream
14
+ UUID (xclbin): 4617f7da-9790-9c63-864e-303bcf47c723
15
+ Sections: DEBUG_IP_LAYOUT, BITSTREAM, MEM_TOPOLOGY, IP_LAYOUT,
16
+ CONNECTIVITY, CLOCK_FREQ_TOPOLOGY, BUILD_METADATA,
17
+ EMBEDDED_METADATA, SYSTEM_METADATA,
18
+ GROUP_CONNECTIVITY, GROUP_TOPOLOGY
19
+ ==============================================================================
20
+ Hardware Platform (Shell) Information
21
+ -------------------------------------
22
+ Vendor: xilinx
23
+ Board: u280
24
+ Name: xdma
25
+ Version: 201920.3
26
+ Generated Version: Vivado 2019.2 (SW Build: 2742762)
27
+ Created:
28
+ Tue Jan 21 23:21:22 2020 FPGA Device: xcu280
29
+ Board Vendor: xilinx.com
30
+ Board Name: xilinx.com:au280:1.0
31
+ Board Part: xilinx.com:au280:part0:1.0
32
+ Platform VBNV: xilinx_u280_xdma_201920_3
33
+ Static UUID: f2b82d53-372f-45a4-bbe9-3d1c980216da
34
+ Feature ROM TimeStamp: 1579649056
35
+
36
+ Scalable Clocks
37
+ ---------------
38
+ Name: clk_out1_pfm_top_clkwiz_hbm_aclk_0
39
+ Index: 0
40
+ Type: SYSTEM
41
+ Frequency: 450 MHz
42
+
43
+ Name: DATA_CLK
44
+ Index: 1
45
+ Type: DATA
46
+ Frequency: 241 MHz
47
+
48
+ Name: KERNEL_CLK
49
+ Index: 2
50
+ Type: KERNEL
51
+ Frequency: 500 MHz
52
+
53
+ System Clocks
54
+ ------
55
+ Name: _bd_top_clkwiz_kernel2_clk_out1
56
+ Type: SCALABLE
57
+ Default Freq: 500 MHz
58
+ Requested Freq: 500 MHz
59
+ Achieved Freq: 500 MHz
60
+
61
+ Name: _bd_top_clkwiz_kernel_clk_out1
62
+ Type: SCALABLE
63
+ Default Freq: 300 MHz
64
+ Requested Freq: 300 MHz
65
+ Achieved Freq: 241.4 MHz
66
+
67
+ Memory Configuration
68
+ --------------------
69
+ Name: HBM[0]
70
+ Index: 0
71
+ Type: MEM_DDR4
72
+ Base Address: 0x0
73
+ Address Size: 0x10000000
74
+ Bank Used: Yes
75
+
76
+ Name: HBM[1]
77
+ Index: 1
78
+ Type: MEM_DDR4
79
+ Base Address: 0x10000000
80
+ Address Size: 0x10000000
81
+ Bank Used: Yes
82
+
83
+ Name: HBM[2]
84
+ Index: 2
85
+ Type: MEM_DRAM
86
+ Base Address: 0x20000000
87
+ Address Size: 0x10000000
88
+ Bank Used: Yes
89
+
90
+ Name: HBM[3]
91
+ Index: 3
92
+ Type: MEM_DRAM
93
+ Base Address: 0x30000000
94
+ Address Size: 0x10000000
95
+ Bank Used: Yes
96
+
97
+ Name: HBM[4]
98
+ Index: 4
99
+ Type: MEM_DRAM
100
+ Base Address: 0x40000000
101
+ Address Size: 0x10000000
102
+ Bank Used: No
103
+
104
+ Name: HBM[5]
105
+ Index: 5
106
+ Type: MEM_DRAM
107
+ Base Address: 0x50000000
108
+ Address Size: 0x10000000
109
+ Bank Used: No
110
+
111
+ Name: HBM[6]
112
+ Index: 6
113
+ Type: MEM_DRAM
114
+ Base Address: 0x60000000
115
+ Address Size: 0x10000000
116
+ Bank Used: No
117
+
118
+ Name: HBM[7]
119
+ Index: 7
120
+ Type: MEM_DRAM
121
+ Base Address: 0x70000000
122
+ Address Size: 0x10000000
123
+ Bank Used: No
124
+
125
+ Name: HBM[8]
126
+ Index: 8
127
+ Type: MEM_DRAM
128
+ Base Address: 0x80000000
129
+ Address Size: 0x10000000
130
+ Bank Used: No
131
+
132
+ Name: HBM[9]
133
+ Index: 9
134
+ Type: MEM_DRAM
135
+ Base Address: 0x90000000
136
+ Address Size: 0x10000000
137
+ Bank Used: No
138
+
139
+ Name: HBM[10]
140
+ Index: 10
141
+ Type: MEM_DRAM
142
+ Base Address: 0xa0000000
143
+ Address Size: 0x10000000
144
+ Bank Used: No
145
+
146
+ Name: HBM[11]
147
+ Index: 11
148
+ Type: MEM_DRAM
149
+ Base Address: 0xb0000000
150
+ Address Size: 0x10000000
151
+ Bank Used: No
152
+
153
+ Name: HBM[12]
154
+ Index: 12
155
+ Type: MEM_DRAM
156
+ Base Address: 0xc0000000
157
+ Address Size: 0x10000000
158
+ Bank Used: No
159
+
160
+ Name: HBM[13]
161
+ Index: 13
162
+ Type: MEM_DRAM
163
+ Base Address: 0xd0000000
164
+ Address Size: 0x10000000
165
+ Bank Used: No
166
+
167
+ Name: HBM[14]
168
+ Index: 14
169
+ Type: MEM_DRAM
170
+ Base Address: 0xe0000000
171
+ Address Size: 0x10000000
172
+ Bank Used: No
173
+
174
+ Name: HBM[15]
175
+ Index: 15
176
+ Type: MEM_DRAM
177
+ Base Address: 0xf0000000
178
+ Address Size: 0x10000000
179
+ Bank Used: No
180
+
181
+ Name: HBM[16]
182
+ Index: 16
183
+ Type: MEM_DRAM
184
+ Base Address: 0x100000000
185
+ Address Size: 0x10000000
186
+ Bank Used: Yes
187
+
188
+ Name: HBM[17]
189
+ Index: 17
190
+ Type: MEM_DRAM
191
+ Base Address: 0x110000000
192
+ Address Size: 0x10000000
193
+ Bank Used: Yes
194
+
195
+ Name: HBM[18]
196
+ Index: 18
197
+ Type: MEM_DRAM
198
+ Base Address: 0x120000000
199
+ Address Size: 0x10000000
200
+ Bank Used: No
201
+
202
+ Name: HBM[19]
203
+ Index: 19
204
+ Type: MEM_DRAM
205
+ Base Address: 0x130000000
206
+ Address Size: 0x10000000
207
+ Bank Used: No
208
+
209
+ Name: HBM[20]
210
+ Index: 20
211
+ Type: MEM_DRAM
212
+ Base Address: 0x140000000
213
+ Address Size: 0x10000000
214
+ Bank Used: No
215
+
216
+ Name: HBM[21]
217
+ Index: 21
218
+ Type: MEM_DRAM
219
+ Base Address: 0x150000000
220
+ Address Size: 0x10000000
221
+ Bank Used: No
222
+
223
+ Name: HBM[22]
224
+ Index: 22
225
+ Type: MEM_DRAM
226
+ Base Address: 0x160000000
227
+ Address Size: 0x10000000
228
+ Bank Used: No
229
+
230
+ Name: HBM[23]
231
+ Index: 23
232
+ Type: MEM_DRAM
233
+ Base Address: 0x170000000
234
+ Address Size: 0x10000000
235
+ Bank Used: No
236
+
237
+ Name: HBM[24]
238
+ Index: 24
239
+ Type: MEM_DRAM
240
+ Base Address: 0x180000000
241
+ Address Size: 0x10000000
242
+ Bank Used: No
243
+
244
+ Name: HBM[25]
245
+ Index: 25
246
+ Type: MEM_DRAM
247
+ Base Address: 0x190000000
248
+ Address Size: 0x10000000
249
+ Bank Used: No
250
+
251
+ Name: HBM[26]
252
+ Index: 26
253
+ Type: MEM_DRAM
254
+ Base Address: 0x1a0000000
255
+ Address Size: 0x10000000
256
+ Bank Used: No
257
+
258
+ Name: HBM[27]
259
+ Index: 27
260
+ Type: MEM_DRAM
261
+ Base Address: 0x1b0000000
262
+ Address Size: 0x10000000
263
+ Bank Used: No
264
+
265
+ Name: HBM[28]
266
+ Index: 28
267
+ Type: MEM_DRAM
268
+ Base Address: 0x1c0000000
269
+ Address Size: 0x10000000
270
+ Bank Used: No
271
+
272
+ Name: HBM[29]
273
+ Index: 29
274
+ Type: MEM_DRAM
275
+ Base Address: 0x1d0000000
276
+ Address Size: 0x10000000
277
+ Bank Used: No
278
+
279
+ Name: HBM[30]
280
+ Index: 30
281
+ Type: MEM_DRAM
282
+ Base Address: 0x1e0000000
283
+ Address Size: 0x10000000
284
+ Bank Used: No
285
+
286
+ Name: HBM[31]
287
+ Index: 31
288
+ Type: MEM_DRAM
289
+ Base Address: 0x1f0000000
290
+ Address Size: 0x10000000
291
+ Bank Used: No
292
+
293
+ Name: DDR[0]
294
+ Index: 32
295
+ Type: MEM_DRAM
296
+ Base Address: 0x0
297
+ Address Size: 0x0
298
+ Bank Used: No
299
+
300
+ Name: DDR[1]
301
+ Index: 33
302
+ Type: MEM_DRAM
303
+ Base Address: 0x0
304
+ Address Size: 0x0
305
+ Bank Used: No
306
+
307
+ Name: PLRAM[0]
308
+ Index: 34
309
+ Type: MEM_DRAM
310
+ Base Address: 0x0
311
+ Address Size: 0x0
312
+ Bank Used: No
313
+
314
+ Name: PLRAM[1]
315
+ Index: 35
316
+ Type: MEM_DRAM
317
+ Base Address: 0x0
318
+ Address Size: 0x0
319
+ Bank Used: No
320
+
321
+ Name: PLRAM[2]
322
+ Index: 36
323
+ Type: MEM_DRAM
324
+ Base Address: 0x0
325
+ Address Size: 0x0
326
+ Bank Used: No
327
+
328
+ Name: PLRAM[3]
329
+ Index: 37
330
+ Type: MEM_DRAM
331
+ Base Address: 0x0
332
+ Address Size: 0x0
333
+ Bank Used: No
334
+
335
+ Name: PLRAM[4]
336
+ Index: 38
337
+ Type: MEM_DRAM
338
+ Base Address: 0x0
339
+ Address Size: 0x0
340
+ Bank Used: No
341
+
342
+ Name: PLRAM[5]
343
+ Index: 39
344
+ Type: MEM_DRAM
345
+ Base Address: 0x0
346
+ Address Size: 0x0
347
+ Bank Used: No
348
+ ==============================================================================
349
+ Kernel: opt_kernel
350
+
351
+ Definition
352
+ ----------
353
+ Signature: opt_kernel (const int L, const int L_out, const int seq_len, const int reload, ap_uint<512>* X_acc0, ap_uint<512>* X_acc1, ap_uint<512>* W_acc0, ap_uint<512>* W_acc1, ap_uint<64>* acc1_out, int* cycle_count)
354
+
355
+ Ports
356
+ -----
357
+ Port: m_axi_X_acc0
358
+ Mode: master
359
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
360
+ Data Width: 512 bits
361
+ Port Type: addressable
362
+
363
+ Port: m_axi_X_acc1
364
+ Mode: master
365
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
366
+ Data Width: 512 bits
367
+ Port Type: addressable
368
+
369
+ Port: m_axi_W_acc0
370
+ Mode: master
371
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
372
+ Data Width: 512 bits
373
+ Port Type: addressable
374
+
375
+ Port: m_axi_W_acc1
376
+ Mode: master
377
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
378
+ Data Width: 512 bits
379
+ Port Type: addressable
380
+
381
+ Port: m_axi_acc1_out
382
+ Mode: master
383
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
384
+ Data Width: 64 bits
385
+ Port Type: addressable
386
+
387
+ Port: m_axi_cycle_count
388
+ Mode: master
389
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
390
+ Data Width: 32 bits
391
+ Port Type: addressable
392
+
393
+ Port: s_axi_control
394
+ Mode: slave
395
+ Range (bytes): 0x1000
396
+ Data Width: 32 bits
397
+ Port Type: addressable
398
+
399
+ --------------------------
400
+ Instance: opt_kernel
401
+ Base Address: 0x1800000
402
+
403
+ Argument: L
404
+ Register Offset: 0x10
405
+ Port: s_axi_control
406
+ Memory: <not applicable>
407
+
408
+ Argument: L_out
409
+ Register Offset: 0x18
410
+ Port: s_axi_control
411
+ Memory: <not applicable>
412
+
413
+ Argument: seq_len
414
+ Register Offset: 0x20
415
+ Port: s_axi_control
416
+ Memory: <not applicable>
417
+
418
+ Argument: reload
419
+ Register Offset: 0x28
420
+ Port: s_axi_control
421
+ Memory: <not applicable>
422
+
423
+ Argument: X_acc0
424
+ Register Offset: 0x30
425
+ Port: m_axi_X_acc0
426
+ Memory: HBM[16] (MEM_DRAM)
427
+
428
+ Argument: X_acc1
429
+ Register Offset: 0x3c
430
+ Port: m_axi_X_acc1
431
+ Memory: HBM[0] (MEM_DDR4)
432
+
433
+ Argument: W_acc0
434
+ Register Offset: 0x48
435
+ Port: m_axi_W_acc0
436
+ Memory: HBM[17] (MEM_DRAM)
437
+
438
+ Argument: W_acc1
439
+ Register Offset: 0x54
440
+ Port: m_axi_W_acc1
441
+ Memory: HBM[1] (MEM_DDR4)
442
+
443
+ Argument: acc1_out
444
+ Register Offset: 0x60
445
+ Port: m_axi_acc1_out
446
+ Memory: HBM[2] (MEM_DRAM)
447
+
448
+ Argument: cycle_count
449
+ Register Offset: 0x6c
450
+ Port: m_axi_cycle_count
451
+ Memory: HBM[3] (MEM_DRAM)
452
+ ==============================================================================
453
+ Generated By
454
+ ------------
455
+ Command: v++
456
+ Version: 2021.2 - 2021-10-14-04:41:01 (SW BUILD: 3363252)
457
+ Command Line: v++ --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4.tapa/run-1/run/link_config.ini --connectivity.nk opt_kernel:1:opt_kernel --connectivity.sp opt_kernel.X_acc0:HBM[16] --connectivity.sp opt_kernel.X_acc1:HBM[0] --connectivity.sp opt_kernel.W_acc0:HBM[17] --connectivity.sp opt_kernel.W_acc1:HBM[1] --connectivity.sp opt_kernel.acc1_out:HBM[2] --connectivity.sp opt_kernel.cycle_count:HBM[3] --input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4.tapa/run-1/opt.hw.xo --kernel opt_kernel --link --optimize 3 --output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin --platform xilinx_u280_xdma_201920_3 --report_level 2 --save-temps --target hw --temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget -propconst -sweep -shift_register_opt} --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=EarlyBlockPlacement --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4.tapa/run-1/opt-floorplan.tcl --vivado.synth.jobs 8
458
+ Options: --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4.tapa/run-1/run/link_config.ini
459
+ --connectivity.nk opt_kernel:1:opt_kernel
460
+ --connectivity.sp opt_kernel.X_acc0:HBM[16]
461
+ --connectivity.sp opt_kernel.X_acc1:HBM[0]
462
+ --connectivity.sp opt_kernel.W_acc0:HBM[17]
463
+ --connectivity.sp opt_kernel.W_acc1:HBM[1]
464
+ --connectivity.sp opt_kernel.acc1_out:HBM[2]
465
+ --connectivity.sp opt_kernel.cycle_count:HBM[3]
466
+ --input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4.tapa/run-1/opt.hw.xo
467
+ --kernel opt_kernel
468
+ --link
469
+ --optimize 3
470
+ --output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin
471
+ --platform xilinx_u280_xdma_201920_3
472
+ --report_level 2
473
+ --save-temps
474
+ --target hw
475
+ --temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp
476
+ --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1
477
+ --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget
478
+ -propconst
479
+ -sweep
480
+ -shift_register_opt}
481
+ --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=EarlyBlockPlacement
482
+ --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore
483
+ --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore
484
+ --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4.tapa/run-1/opt-floorplan.tcl
485
+ --vivado.synth.jobs 8
486
+ ==============================================================================
487
+ User Added Key Value Pairs
488
+ --------------------------
489
+ <empty>
490
+ ==============================================================================
gpt-2-medium/bitstreams/opt_kernel_stage_4_27b.xclbin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30d40b37a38089e3181996d9df02dd4371c8423c93304316bf22637e655992c3
3
+ size 76724924
gpt-2-medium/bitstreams/opt_kernel_stage_4_27b.xclbin.info ADDED
@@ -0,0 +1,502 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ==============================================================================
3
+ XRT Build Version: 2.14.384 (2022.2)
4
+ Build Date: 2022-12-09 00:55:08
5
+ Hash ID: 090bb050d570d2b668477c3bd0f979dc3a34b9db
6
+ ==============================================================================
7
+ xclbin Information
8
+ ------------------
9
+ Generated by: v++ (2021.2) on 2021-10-14-04:41:01
10
+ Version: 2.14.384
11
+ Kernels: opt_kernel
12
+ Signature:
13
+ Content: Bitstream
14
+ UUID (xclbin): cbb0489a-3f5c-066e-845c-af93ba50ad0a
15
+ Sections: DEBUG_IP_LAYOUT, BITSTREAM, MEM_TOPOLOGY, IP_LAYOUT,
16
+ CONNECTIVITY, CLOCK_FREQ_TOPOLOGY, BUILD_METADATA,
17
+ EMBEDDED_METADATA, SYSTEM_METADATA,
18
+ GROUP_CONNECTIVITY, GROUP_TOPOLOGY
19
+ ==============================================================================
20
+ Hardware Platform (Shell) Information
21
+ -------------------------------------
22
+ Vendor: xilinx
23
+ Board: u280
24
+ Name: xdma
25
+ Version: 201920.3
26
+ Generated Version: Vivado 2019.2 (SW Build: 2742762)
27
+ Created:
28
+ Tue Jan 21 23:21:22 2020 FPGA Device: xcu280
29
+ Board Vendor: xilinx.com
30
+ Board Name: xilinx.com:au280:1.0
31
+ Board Part: xilinx.com:au280:part0:1.0
32
+ Platform VBNV: xilinx_u280_xdma_201920_3
33
+ Static UUID: f2b82d53-372f-45a4-bbe9-3d1c980216da
34
+ Feature ROM TimeStamp: 1579649056
35
+
36
+ Scalable Clocks
37
+ ---------------
38
+ Name: clk_out1_pfm_top_clkwiz_hbm_aclk_0
39
+ Index: 0
40
+ Type: SYSTEM
41
+ Frequency: 450 MHz
42
+
43
+ Name: DATA_CLK
44
+ Index: 1
45
+ Type: DATA
46
+ Frequency: 202 MHz
47
+
48
+ Name: KERNEL_CLK
49
+ Index: 2
50
+ Type: KERNEL
51
+ Frequency: 500 MHz
52
+
53
+ System Clocks
54
+ ------
55
+ Name: _bd_top_clkwiz_kernel2_clk_out1
56
+ Type: SCALABLE
57
+ Default Freq: 500 MHz
58
+ Requested Freq: 500 MHz
59
+ Achieved Freq: 500 MHz
60
+
61
+ Name: _bd_top_clkwiz_kernel_clk_out1
62
+ Type: SCALABLE
63
+ Default Freq: 300 MHz
64
+ Requested Freq: 300 MHz
65
+ Achieved Freq: 202.5 MHz
66
+
67
+ Memory Configuration
68
+ --------------------
69
+ Name: HBM[0]
70
+ Index: 0
71
+ Type: MEM_DDR4
72
+ Base Address: 0x0
73
+ Address Size: 0x10000000
74
+ Bank Used: Yes
75
+
76
+ Name: HBM[1]
77
+ Index: 1
78
+ Type: MEM_DDR4
79
+ Base Address: 0x10000000
80
+ Address Size: 0x10000000
81
+ Bank Used: Yes
82
+
83
+ Name: HBM[2]
84
+ Index: 2
85
+ Type: MEM_DRAM
86
+ Base Address: 0x20000000
87
+ Address Size: 0x10000000
88
+ Bank Used: Yes
89
+
90
+ Name: HBM[3]
91
+ Index: 3
92
+ Type: MEM_DRAM
93
+ Base Address: 0x30000000
94
+ Address Size: 0x10000000
95
+ Bank Used: Yes
96
+
97
+ Name: HBM[4]
98
+ Index: 4
99
+ Type: MEM_DRAM
100
+ Base Address: 0x40000000
101
+ Address Size: 0x10000000
102
+ Bank Used: No
103
+
104
+ Name: HBM[5]
105
+ Index: 5
106
+ Type: MEM_DRAM
107
+ Base Address: 0x50000000
108
+ Address Size: 0x10000000
109
+ Bank Used: No
110
+
111
+ Name: HBM[6]
112
+ Index: 6
113
+ Type: MEM_DRAM
114
+ Base Address: 0x60000000
115
+ Address Size: 0x10000000
116
+ Bank Used: No
117
+
118
+ Name: HBM[7]
119
+ Index: 7
120
+ Type: MEM_DRAM
121
+ Base Address: 0x70000000
122
+ Address Size: 0x10000000
123
+ Bank Used: No
124
+
125
+ Name: HBM[8]
126
+ Index: 8
127
+ Type: MEM_DRAM
128
+ Base Address: 0x80000000
129
+ Address Size: 0x10000000
130
+ Bank Used: No
131
+
132
+ Name: HBM[9]
133
+ Index: 9
134
+ Type: MEM_DRAM
135
+ Base Address: 0x90000000
136
+ Address Size: 0x10000000
137
+ Bank Used: No
138
+
139
+ Name: HBM[10]
140
+ Index: 10
141
+ Type: MEM_DRAM
142
+ Base Address: 0xa0000000
143
+ Address Size: 0x10000000
144
+ Bank Used: No
145
+
146
+ Name: HBM[11]
147
+ Index: 11
148
+ Type: MEM_DRAM
149
+ Base Address: 0xb0000000
150
+ Address Size: 0x10000000
151
+ Bank Used: No
152
+
153
+ Name: HBM[12]
154
+ Index: 12
155
+ Type: MEM_DRAM
156
+ Base Address: 0xc0000000
157
+ Address Size: 0x10000000
158
+ Bank Used: No
159
+
160
+ Name: HBM[13]
161
+ Index: 13
162
+ Type: MEM_DRAM
163
+ Base Address: 0xd0000000
164
+ Address Size: 0x10000000
165
+ Bank Used: No
166
+
167
+ Name: HBM[14]
168
+ Index: 14
169
+ Type: MEM_DRAM
170
+ Base Address: 0xe0000000
171
+ Address Size: 0x10000000
172
+ Bank Used: No
173
+
174
+ Name: HBM[15]
175
+ Index: 15
176
+ Type: MEM_DRAM
177
+ Base Address: 0xf0000000
178
+ Address Size: 0x10000000
179
+ Bank Used: No
180
+
181
+ Name: HBM[16]
182
+ Index: 16
183
+ Type: MEM_DRAM
184
+ Base Address: 0x100000000
185
+ Address Size: 0x10000000
186
+ Bank Used: Yes
187
+
188
+ Name: HBM[17]
189
+ Index: 17
190
+ Type: MEM_DRAM
191
+ Base Address: 0x110000000
192
+ Address Size: 0x10000000
193
+ Bank Used: Yes
194
+
195
+ Name: HBM[18]
196
+ Index: 18
197
+ Type: MEM_DRAM
198
+ Base Address: 0x120000000
199
+ Address Size: 0x10000000
200
+ Bank Used: Yes
201
+
202
+ Name: HBM[19]
203
+ Index: 19
204
+ Type: MEM_DRAM
205
+ Base Address: 0x130000000
206
+ Address Size: 0x10000000
207
+ Bank Used: No
208
+
209
+ Name: HBM[20]
210
+ Index: 20
211
+ Type: MEM_DRAM
212
+ Base Address: 0x140000000
213
+ Address Size: 0x10000000
214
+ Bank Used: No
215
+
216
+ Name: HBM[21]
217
+ Index: 21
218
+ Type: MEM_DRAM
219
+ Base Address: 0x150000000
220
+ Address Size: 0x10000000
221
+ Bank Used: No
222
+
223
+ Name: HBM[22]
224
+ Index: 22
225
+ Type: MEM_DRAM
226
+ Base Address: 0x160000000
227
+ Address Size: 0x10000000
228
+ Bank Used: No
229
+
230
+ Name: HBM[23]
231
+ Index: 23
232
+ Type: MEM_DRAM
233
+ Base Address: 0x170000000
234
+ Address Size: 0x10000000
235
+ Bank Used: No
236
+
237
+ Name: HBM[24]
238
+ Index: 24
239
+ Type: MEM_DRAM
240
+ Base Address: 0x180000000
241
+ Address Size: 0x10000000
242
+ Bank Used: No
243
+
244
+ Name: HBM[25]
245
+ Index: 25
246
+ Type: MEM_DRAM
247
+ Base Address: 0x190000000
248
+ Address Size: 0x10000000
249
+ Bank Used: No
250
+
251
+ Name: HBM[26]
252
+ Index: 26
253
+ Type: MEM_DRAM
254
+ Base Address: 0x1a0000000
255
+ Address Size: 0x10000000
256
+ Bank Used: No
257
+
258
+ Name: HBM[27]
259
+ Index: 27
260
+ Type: MEM_DRAM
261
+ Base Address: 0x1b0000000
262
+ Address Size: 0x10000000
263
+ Bank Used: No
264
+
265
+ Name: HBM[28]
266
+ Index: 28
267
+ Type: MEM_DRAM
268
+ Base Address: 0x1c0000000
269
+ Address Size: 0x10000000
270
+ Bank Used: No
271
+
272
+ Name: HBM[29]
273
+ Index: 29
274
+ Type: MEM_DRAM
275
+ Base Address: 0x1d0000000
276
+ Address Size: 0x10000000
277
+ Bank Used: No
278
+
279
+ Name: HBM[30]
280
+ Index: 30
281
+ Type: MEM_DRAM
282
+ Base Address: 0x1e0000000
283
+ Address Size: 0x10000000
284
+ Bank Used: No
285
+
286
+ Name: HBM[31]
287
+ Index: 31
288
+ Type: MEM_DRAM
289
+ Base Address: 0x1f0000000
290
+ Address Size: 0x10000000
291
+ Bank Used: No
292
+
293
+ Name: DDR[0]
294
+ Index: 32
295
+ Type: MEM_DRAM
296
+ Base Address: 0x0
297
+ Address Size: 0x0
298
+ Bank Used: No
299
+
300
+ Name: DDR[1]
301
+ Index: 33
302
+ Type: MEM_DRAM
303
+ Base Address: 0x0
304
+ Address Size: 0x0
305
+ Bank Used: No
306
+
307
+ Name: PLRAM[0]
308
+ Index: 34
309
+ Type: MEM_DRAM
310
+ Base Address: 0x0
311
+ Address Size: 0x0
312
+ Bank Used: No
313
+
314
+ Name: PLRAM[1]
315
+ Index: 35
316
+ Type: MEM_DRAM
317
+ Base Address: 0x0
318
+ Address Size: 0x0
319
+ Bank Used: No
320
+
321
+ Name: PLRAM[2]
322
+ Index: 36
323
+ Type: MEM_DRAM
324
+ Base Address: 0x0
325
+ Address Size: 0x0
326
+ Bank Used: No
327
+
328
+ Name: PLRAM[3]
329
+ Index: 37
330
+ Type: MEM_DRAM
331
+ Base Address: 0x0
332
+ Address Size: 0x0
333
+ Bank Used: No
334
+
335
+ Name: PLRAM[4]
336
+ Index: 38
337
+ Type: MEM_DRAM
338
+ Base Address: 0x0
339
+ Address Size: 0x0
340
+ Bank Used: No
341
+
342
+ Name: PLRAM[5]
343
+ Index: 39
344
+ Type: MEM_DRAM
345
+ Base Address: 0x0
346
+ Address Size: 0x0
347
+ Bank Used: No
348
+ ==============================================================================
349
+ Kernel: opt_kernel
350
+
351
+ Definition
352
+ ----------
353
+ Signature: opt_kernel (const int L, const int L_out, const int seq_len, const int reload, ap_uint<512>* X_acc0, ap_uint<512>* X_acc1, ap_uint<512>* W_acc0, ap_uint<512>* W_acc1, ap_uint<64>* acc0_out, ap_uint<64>* acc1_out, int* cycle_count)
354
+
355
+ Ports
356
+ -----
357
+ Port: m_axi_X_acc0
358
+ Mode: master
359
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
360
+ Data Width: 512 bits
361
+ Port Type: addressable
362
+
363
+ Port: m_axi_X_acc1
364
+ Mode: master
365
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
366
+ Data Width: 512 bits
367
+ Port Type: addressable
368
+
369
+ Port: m_axi_W_acc0
370
+ Mode: master
371
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
372
+ Data Width: 512 bits
373
+ Port Type: addressable
374
+
375
+ Port: m_axi_W_acc1
376
+ Mode: master
377
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
378
+ Data Width: 512 bits
379
+ Port Type: addressable
380
+
381
+ Port: m_axi_acc0_out
382
+ Mode: master
383
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
384
+ Data Width: 64 bits
385
+ Port Type: addressable
386
+
387
+ Port: m_axi_acc1_out
388
+ Mode: master
389
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
390
+ Data Width: 64 bits
391
+ Port Type: addressable
392
+
393
+ Port: m_axi_cycle_count
394
+ Mode: master
395
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
396
+ Data Width: 32 bits
397
+ Port Type: addressable
398
+
399
+ Port: s_axi_control
400
+ Mode: slave
401
+ Range (bytes): 0x1000
402
+ Data Width: 32 bits
403
+ Port Type: addressable
404
+
405
+ --------------------------
406
+ Instance: opt_kernel
407
+ Base Address: 0x1800000
408
+
409
+ Argument: L
410
+ Register Offset: 0x10
411
+ Port: s_axi_control
412
+ Memory: <not applicable>
413
+
414
+ Argument: L_out
415
+ Register Offset: 0x18
416
+ Port: s_axi_control
417
+ Memory: <not applicable>
418
+
419
+ Argument: seq_len
420
+ Register Offset: 0x20
421
+ Port: s_axi_control
422
+ Memory: <not applicable>
423
+
424
+ Argument: reload
425
+ Register Offset: 0x28
426
+ Port: s_axi_control
427
+ Memory: <not applicable>
428
+
429
+ Argument: X_acc0
430
+ Register Offset: 0x30
431
+ Port: m_axi_X_acc0
432
+ Memory: HBM[0] (MEM_DDR4)
433
+
434
+ Argument: X_acc1
435
+ Register Offset: 0x3c
436
+ Port: m_axi_X_acc1
437
+ Memory: HBM[16] (MEM_DRAM)
438
+
439
+ Argument: W_acc0
440
+ Register Offset: 0x48
441
+ Port: m_axi_W_acc0
442
+ Memory: HBM[1] (MEM_DDR4)
443
+
444
+ Argument: W_acc1
445
+ Register Offset: 0x54
446
+ Port: m_axi_W_acc1
447
+ Memory: HBM[17] (MEM_DRAM)
448
+
449
+ Argument: acc0_out
450
+ Register Offset: 0x60
451
+ Port: m_axi_acc0_out
452
+ Memory: HBM[2] (MEM_DRAM)
453
+
454
+ Argument: acc1_out
455
+ Register Offset: 0x6c
456
+ Port: m_axi_acc1_out
457
+ Memory: HBM[18] (MEM_DRAM)
458
+
459
+ Argument: cycle_count
460
+ Register Offset: 0x78
461
+ Port: m_axi_cycle_count
462
+ Memory: HBM[3] (MEM_DRAM)
463
+ ==============================================================================
464
+ Generated By
465
+ ------------
466
+ Command: v++
467
+ Version: 2021.2 - 2021-10-14-04:41:01 (SW BUILD: 3363252)
468
+ Command Line: v++ --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/run/link_config.ini --connectivity.nk opt_kernel:1:opt_kernel --connectivity.sp opt_kernel.X_acc0:HBM[0] --connectivity.sp opt_kernel.X_acc1:HBM[16] --connectivity.sp opt_kernel.W_acc0:HBM[1] --connectivity.sp opt_kernel.W_acc1:HBM[17] --connectivity.sp opt_kernel.acc0_out:HBM[2] --connectivity.sp opt_kernel.acc1_out:HBM[18] --connectivity.sp opt_kernel.cycle_count:HBM[3] --input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt.hw.xo --kernel opt_kernel --link --optimize 3 --output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin --platform xilinx_u280_xdma_201920_3 --report_level 2 --save-temps --target hw --temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget -propconst -sweep -shift_register_opt} --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=EarlyBlockPlacement --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt-floorplan.tcl --vivado.synth.jobs 8
469
+ Options: --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/run/link_config.ini
470
+ --connectivity.nk opt_kernel:1:opt_kernel
471
+ --connectivity.sp opt_kernel.X_acc0:HBM[0]
472
+ --connectivity.sp opt_kernel.X_acc1:HBM[16]
473
+ --connectivity.sp opt_kernel.W_acc0:HBM[1]
474
+ --connectivity.sp opt_kernel.W_acc1:HBM[17]
475
+ --connectivity.sp opt_kernel.acc0_out:HBM[2]
476
+ --connectivity.sp opt_kernel.acc1_out:HBM[18]
477
+ --connectivity.sp opt_kernel.cycle_count:HBM[3]
478
+ --input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt.hw.xo
479
+ --kernel opt_kernel
480
+ --link
481
+ --optimize 3
482
+ --output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin
483
+ --platform xilinx_u280_xdma_201920_3
484
+ --report_level 2
485
+ --save-temps
486
+ --target hw
487
+ --temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp
488
+ --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1
489
+ --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget
490
+ -propconst
491
+ -sweep
492
+ -shift_register_opt}
493
+ --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=EarlyBlockPlacement
494
+ --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore
495
+ --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore
496
+ --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt-floorplan.tcl
497
+ --vivado.synth.jobs 8
498
+ ==============================================================================
499
+ User Added Key Value Pairs
500
+ --------------------------
501
+ <empty>
502
+ ==============================================================================
gpt-2-medium/bitstreams/opt_kernel_vpk180.xsa ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:722a71423e17da2f05587a9fd3c1e9d695f5cee02962744fcbde569aca21242f
3
+ size 70565471
gpt-2-medium/bitstreams/opt_kernel_vpk180_fixed.xsa ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d986bb71631b79c5c2b5c6576e1f78051a671d6d4536e2095c8a39127c456461
3
+ size 86497092
gpt-2-medium/bitstreams/opt_kernel_vpk180_full.xsa ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7c1339d6b78b36c4a35cb09709dfebb321bdf0decf037802e5d617356ad42b6
3
+ size 84081530
gpt-2-medium/bitstreams/opt_kernel_vpk180_mask.xsa ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d258e7884b1e3c2f42bc8fd7a3878ab976b8cd2cc5042bdaaea949b27f506688
3
+ size 82554104
gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_full.info ADDED
@@ -0,0 +1,485 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ==============================================================================
3
+ XRT Build Version: 2.14.384 (2022.2)
4
+ Build Date: 2022-12-09 00:55:08
5
+ Hash ID: 090bb050d570d2b668477c3bd0f979dc3a34b9db
6
+ ==============================================================================
7
+ xclbin Information
8
+ ------------------
9
+ Generated by: v++ (2021.2) on 2021-10-14-04:41:01
10
+ Version: 2.14.384
11
+ Kernels: opt_kernel
12
+ Signature:
13
+ Content: Bitstream
14
+ UUID (xclbin): 06dfa191-ba53-780e-16db-fd0655f01fc3
15
+ Sections: DEBUG_IP_LAYOUT, BITSTREAM, MEM_TOPOLOGY, IP_LAYOUT,
16
+ CONNECTIVITY, CLOCK_FREQ_TOPOLOGY, BUILD_METADATA,
17
+ EMBEDDED_METADATA, SYSTEM_METADATA,
18
+ GROUP_CONNECTIVITY, GROUP_TOPOLOGY
19
+ ==============================================================================
20
+ Hardware Platform (Shell) Information
21
+ -------------------------------------
22
+ Vendor: xilinx
23
+ Board: u280
24
+ Name: xdma
25
+ Version: 201920.3
26
+ Generated Version: Vivado 2019.2 (SW Build: 2742762)
27
+ Created:
28
+ Tue Jan 21 23:21:22 2020 FPGA Device: xcu280
29
+ Board Vendor: xilinx.com
30
+ Board Name: xilinx.com:au280:1.0
31
+ Board Part: xilinx.com:au280:part0:1.0
32
+ Platform VBNV: xilinx_u280_xdma_201920_3
33
+ Static UUID: f2b82d53-372f-45a4-bbe9-3d1c980216da
34
+ Feature ROM TimeStamp: 1579649056
35
+
36
+ Scalable Clocks
37
+ ---------------
38
+ Name: clk_out1_pfm_top_clkwiz_hbm_aclk_0
39
+ Index: 0
40
+ Type: SYSTEM
41
+ Frequency: 450 MHz
42
+
43
+ Name: DATA_CLK
44
+ Index: 1
45
+ Type: DATA
46
+ Frequency: 257 MHz
47
+
48
+ Name: KERNEL_CLK
49
+ Index: 2
50
+ Type: KERNEL
51
+ Frequency: 500 MHz
52
+
53
+ System Clocks
54
+ ------
55
+ Name: _bd_top_clkwiz_kernel2_clk_out1
56
+ Type: SCALABLE
57
+ Default Freq: 500 MHz
58
+ Requested Freq: 500 MHz
59
+ Achieved Freq: 500 MHz
60
+
61
+ Name: _bd_top_clkwiz_kernel_clk_out1
62
+ Type: SCALABLE
63
+ Default Freq: 300 MHz
64
+ Requested Freq: 300 MHz
65
+ Achieved Freq: 257.2 MHz
66
+
67
+ Memory Configuration
68
+ --------------------
69
+ Name: HBM[0]
70
+ Index: 0
71
+ Type: MEM_DDR4
72
+ Base Address: 0x0
73
+ Address Size: 0x10000000
74
+ Bank Used: No
75
+
76
+ Name: HBM[1]
77
+ Index: 1
78
+ Type: MEM_DDR4
79
+ Base Address: 0x10000000
80
+ Address Size: 0x10000000
81
+ Bank Used: Yes
82
+
83
+ Name: HBM[2]
84
+ Index: 2
85
+ Type: MEM_DRAM
86
+ Base Address: 0x20000000
87
+ Address Size: 0x10000000
88
+ Bank Used: Yes
89
+
90
+ Name: HBM[3]
91
+ Index: 3
92
+ Type: MEM_DRAM
93
+ Base Address: 0x30000000
94
+ Address Size: 0x10000000
95
+ Bank Used: Yes
96
+
97
+ Name: HBM[4]
98
+ Index: 4
99
+ Type: MEM_DRAM
100
+ Base Address: 0x40000000
101
+ Address Size: 0x10000000
102
+ Bank Used: Yes
103
+
104
+ Name: HBM[5]
105
+ Index: 5
106
+ Type: MEM_DRAM
107
+ Base Address: 0x50000000
108
+ Address Size: 0x10000000
109
+ Bank Used: No
110
+
111
+ Name: HBM[6]
112
+ Index: 6
113
+ Type: MEM_DRAM
114
+ Base Address: 0x60000000
115
+ Address Size: 0x10000000
116
+ Bank Used: No
117
+
118
+ Name: HBM[7]
119
+ Index: 7
120
+ Type: MEM_DRAM
121
+ Base Address: 0x70000000
122
+ Address Size: 0x10000000
123
+ Bank Used: Yes
124
+
125
+ Name: HBM[8]
126
+ Index: 8
127
+ Type: MEM_DRAM
128
+ Base Address: 0x80000000
129
+ Address Size: 0x10000000
130
+ Bank Used: No
131
+
132
+ Name: HBM[9]
133
+ Index: 9
134
+ Type: MEM_DRAM
135
+ Base Address: 0x90000000
136
+ Address Size: 0x10000000
137
+ Bank Used: Yes
138
+
139
+ Name: HBM[10]
140
+ Index: 10
141
+ Type: MEM_DRAM
142
+ Base Address: 0xa0000000
143
+ Address Size: 0x10000000
144
+ Bank Used: No
145
+
146
+ Name: HBM[11]
147
+ Index: 11
148
+ Type: MEM_DRAM
149
+ Base Address: 0xb0000000
150
+ Address Size: 0x10000000
151
+ Bank Used: No
152
+
153
+ Name: HBM[12]
154
+ Index: 12
155
+ Type: MEM_DRAM
156
+ Base Address: 0xc0000000
157
+ Address Size: 0x10000000
158
+ Bank Used: No
159
+
160
+ Name: HBM[13]
161
+ Index: 13
162
+ Type: MEM_DRAM
163
+ Base Address: 0xd0000000
164
+ Address Size: 0x10000000
165
+ Bank Used: No
166
+
167
+ Name: HBM[14]
168
+ Index: 14
169
+ Type: MEM_DRAM
170
+ Base Address: 0xe0000000
171
+ Address Size: 0x10000000
172
+ Bank Used: No
173
+
174
+ Name: HBM[15]
175
+ Index: 15
176
+ Type: MEM_DRAM
177
+ Base Address: 0xf0000000
178
+ Address Size: 0x10000000
179
+ Bank Used: No
180
+
181
+ Name: HBM[16]
182
+ Index: 16
183
+ Type: MEM_DRAM
184
+ Base Address: 0x100000000
185
+ Address Size: 0x10000000
186
+ Bank Used: No
187
+
188
+ Name: HBM[17]
189
+ Index: 17
190
+ Type: MEM_DRAM
191
+ Base Address: 0x110000000
192
+ Address Size: 0x10000000
193
+ Bank Used: No
194
+
195
+ Name: HBM[18]
196
+ Index: 18
197
+ Type: MEM_DRAM
198
+ Base Address: 0x120000000
199
+ Address Size: 0x10000000
200
+ Bank Used: No
201
+
202
+ Name: HBM[19]
203
+ Index: 19
204
+ Type: MEM_DRAM
205
+ Base Address: 0x130000000
206
+ Address Size: 0x10000000
207
+ Bank Used: No
208
+
209
+ Name: HBM[20]
210
+ Index: 20
211
+ Type: MEM_DRAM
212
+ Base Address: 0x140000000
213
+ Address Size: 0x10000000
214
+ Bank Used: No
215
+
216
+ Name: HBM[21]
217
+ Index: 21
218
+ Type: MEM_DRAM
219
+ Base Address: 0x150000000
220
+ Address Size: 0x10000000
221
+ Bank Used: No
222
+
223
+ Name: HBM[22]
224
+ Index: 22
225
+ Type: MEM_DRAM
226
+ Base Address: 0x160000000
227
+ Address Size: 0x10000000
228
+ Bank Used: No
229
+
230
+ Name: HBM[23]
231
+ Index: 23
232
+ Type: MEM_DRAM
233
+ Base Address: 0x170000000
234
+ Address Size: 0x10000000
235
+ Bank Used: No
236
+
237
+ Name: HBM[24]
238
+ Index: 24
239
+ Type: MEM_DRAM
240
+ Base Address: 0x180000000
241
+ Address Size: 0x10000000
242
+ Bank Used: No
243
+
244
+ Name: HBM[25]
245
+ Index: 25
246
+ Type: MEM_DRAM
247
+ Base Address: 0x190000000
248
+ Address Size: 0x10000000
249
+ Bank Used: No
250
+
251
+ Name: HBM[26]
252
+ Index: 26
253
+ Type: MEM_DRAM
254
+ Base Address: 0x1a0000000
255
+ Address Size: 0x10000000
256
+ Bank Used: No
257
+
258
+ Name: HBM[27]
259
+ Index: 27
260
+ Type: MEM_DRAM
261
+ Base Address: 0x1b0000000
262
+ Address Size: 0x10000000
263
+ Bank Used: No
264
+
265
+ Name: HBM[28]
266
+ Index: 28
267
+ Type: MEM_DRAM
268
+ Base Address: 0x1c0000000
269
+ Address Size: 0x10000000
270
+ Bank Used: No
271
+
272
+ Name: HBM[29]
273
+ Index: 29
274
+ Type: MEM_DRAM
275
+ Base Address: 0x1d0000000
276
+ Address Size: 0x10000000
277
+ Bank Used: No
278
+
279
+ Name: HBM[30]
280
+ Index: 30
281
+ Type: MEM_DRAM
282
+ Base Address: 0x1e0000000
283
+ Address Size: 0x10000000
284
+ Bank Used: No
285
+
286
+ Name: HBM[31]
287
+ Index: 31
288
+ Type: MEM_DRAM
289
+ Base Address: 0x1f0000000
290
+ Address Size: 0x10000000
291
+ Bank Used: No
292
+
293
+ Name: DDR[0]
294
+ Index: 32
295
+ Type: MEM_DRAM
296
+ Base Address: 0x0
297
+ Address Size: 0x0
298
+ Bank Used: No
299
+
300
+ Name: DDR[1]
301
+ Index: 33
302
+ Type: MEM_DRAM
303
+ Base Address: 0x0
304
+ Address Size: 0x0
305
+ Bank Used: No
306
+
307
+ Name: PLRAM[0]
308
+ Index: 34
309
+ Type: MEM_DRAM
310
+ Base Address: 0x0
311
+ Address Size: 0x0
312
+ Bank Used: No
313
+
314
+ Name: PLRAM[1]
315
+ Index: 35
316
+ Type: MEM_DRAM
317
+ Base Address: 0x0
318
+ Address Size: 0x0
319
+ Bank Used: No
320
+
321
+ Name: PLRAM[2]
322
+ Index: 36
323
+ Type: MEM_DRAM
324
+ Base Address: 0x0
325
+ Address Size: 0x0
326
+ Bank Used: No
327
+
328
+ Name: PLRAM[3]
329
+ Index: 37
330
+ Type: MEM_DRAM
331
+ Base Address: 0x0
332
+ Address Size: 0x0
333
+ Bank Used: No
334
+
335
+ Name: PLRAM[4]
336
+ Index: 38
337
+ Type: MEM_DRAM
338
+ Base Address: 0x0
339
+ Address Size: 0x0
340
+ Bank Used: No
341
+
342
+ Name: PLRAM[5]
343
+ Index: 39
344
+ Type: MEM_DRAM
345
+ Base Address: 0x0
346
+ Address Size: 0x0
347
+ Bank Used: No
348
+ ==============================================================================
349
+ Kernel: opt_kernel
350
+
351
+ Definition
352
+ ----------
353
+ Signature: opt_kernel (const int L, const int L_out, const int seq_len, ap_uint<512>* X_acc0, ap_uint<512>* X_acc1, ap_uint<512>* W_acc0, ap_uint<512>* W_acc1, ap_uint<128>* acc0_out, int* cycle_count)
354
+
355
+ Ports
356
+ -----
357
+ Port: m_axi_X_acc0
358
+ Mode: master
359
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
360
+ Data Width: 512 bits
361
+ Port Type: addressable
362
+
363
+ Port: m_axi_X_acc1
364
+ Mode: master
365
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
366
+ Data Width: 512 bits
367
+ Port Type: addressable
368
+
369
+ Port: m_axi_W_acc0
370
+ Mode: master
371
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
372
+ Data Width: 512 bits
373
+ Port Type: addressable
374
+
375
+ Port: m_axi_W_acc1
376
+ Mode: master
377
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
378
+ Data Width: 512 bits
379
+ Port Type: addressable
380
+
381
+ Port: m_axi_acc0_out
382
+ Mode: master
383
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
384
+ Data Width: 128 bits
385
+ Port Type: addressable
386
+
387
+ Port: m_axi_cycle_count
388
+ Mode: master
389
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
390
+ Data Width: 32 bits
391
+ Port Type: addressable
392
+
393
+ Port: s_axi_control
394
+ Mode: slave
395
+ Range (bytes): 0x1000
396
+ Data Width: 32 bits
397
+ Port Type: addressable
398
+
399
+ --------------------------
400
+ Instance: opt_kernel
401
+ Base Address: 0x1800000
402
+
403
+ Argument: L
404
+ Register Offset: 0x10
405
+ Port: s_axi_control
406
+ Memory: <not applicable>
407
+
408
+ Argument: L_out
409
+ Register Offset: 0x18
410
+ Port: s_axi_control
411
+ Memory: <not applicable>
412
+
413
+ Argument: seq_len
414
+ Register Offset: 0x20
415
+ Port: s_axi_control
416
+ Memory: <not applicable>
417
+
418
+ Argument: X_acc0
419
+ Register Offset: 0x28
420
+ Port: m_axi_X_acc0
421
+ Memory: HBM[1] (MEM_DDR4)
422
+
423
+ Argument: X_acc1
424
+ Register Offset: 0x34
425
+ Port: m_axi_X_acc1
426
+ Memory: HBM[2] (MEM_DRAM)
427
+
428
+ Argument: W_acc0
429
+ Register Offset: 0x40
430
+ Port: m_axi_W_acc0
431
+ Memory: HBM[3] (MEM_DRAM)
432
+
433
+ Argument: W_acc1
434
+ Register Offset: 0x4c
435
+ Port: m_axi_W_acc1
436
+ Memory: HBM[4] (MEM_DRAM)
437
+
438
+ Argument: acc0_out
439
+ Register Offset: 0x58
440
+ Port: m_axi_acc0_out
441
+ Memory: HBM[7] (MEM_DRAM)
442
+
443
+ Argument: cycle_count
444
+ Register Offset: 0x64
445
+ Port: m_axi_cycle_count
446
+ Memory: HBM[9] (MEM_DRAM)
447
+ ==============================================================================
448
+ Generated By
449
+ ------------
450
+ Command: v++
451
+ Version: 2021.2 - 2021-10-14-04:41:01 (SW BUILD: 3363252)
452
+ Command Line: v++ --config /scratch/oswaldhe/hbm_config.ini --connectivity.nk opt_kernel:1:opt_kernel --connectivity.sp opt_kernel.X_acc0:HBM[1] --connectivity.sp opt_kernel.X_acc1:HBM[2] --connectivity.sp opt_kernel.W_acc0:HBM[3] --connectivity.sp opt_kernel.W_acc1:HBM[4] --connectivity.sp opt_kernel.acc0_out:HBM[7] --connectivity.sp opt_kernel.cycle_count:HBM[9] --input_files /scratch/oswaldhe/work.out/run-1/design-point.xo --kernel opt_kernel --link --optimize 3 --output /scratch/oswaldhe/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin --platform xilinx_u280_xdma_201920_3 --report_level 2 --save-temps --target hw --temp_dir /scratch/oswaldhe/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget -propconst -sweep -shift_register_opt} --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=SSI_SpreadSLLs --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Default --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Default --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/scratch/oswaldhe/work.out/run-1/constraints.tcl --vivado.synth.jobs 8
453
+ Options: --config /scratch/oswaldhe/hbm_config.ini
454
+ --connectivity.nk opt_kernel:1:opt_kernel
455
+ --connectivity.sp opt_kernel.X_acc0:HBM[1]
456
+ --connectivity.sp opt_kernel.X_acc1:HBM[2]
457
+ --connectivity.sp opt_kernel.W_acc0:HBM[3]
458
+ --connectivity.sp opt_kernel.W_acc1:HBM[4]
459
+ --connectivity.sp opt_kernel.acc0_out:HBM[7]
460
+ --connectivity.sp opt_kernel.cycle_count:HBM[9]
461
+ --input_files /scratch/oswaldhe/work.out/run-1/design-point.xo
462
+ --kernel opt_kernel
463
+ --link
464
+ --optimize 3
465
+ --output /scratch/oswaldhe/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin
466
+ --platform xilinx_u280_xdma_201920_3
467
+ --report_level 2
468
+ --save-temps
469
+ --target hw
470
+ --temp_dir /scratch/oswaldhe/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp
471
+ --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1
472
+ --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget
473
+ -propconst
474
+ -sweep
475
+ -shift_register_opt}
476
+ --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=SSI_SpreadSLLs
477
+ --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Default
478
+ --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Default
479
+ --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/scratch/oswaldhe/work.out/run-1/constraints.tcl
480
+ --vivado.synth.jobs 8
481
+ ==============================================================================
482
+ User Added Key Value Pairs
483
+ --------------------------
484
+ <empty>
485
+ ==============================================================================
gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_full.xclbin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc34da0da50c9058d7705e2529b37d1b88d2da38c315fa4d8ca878255a43b282
3
+ size 68746361
gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_xdma_201920_3.xclbin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c64f06b252dc6400e5a6a4910f803b6c120b828876009ed128b25db1719c05d
3
+ size 76311460
gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_xdma_201920_3.xclbin.info ADDED
@@ -0,0 +1,502 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ==============================================================================
3
+ XRT Build Version: 2.14.384 (2022.2)
4
+ Build Date: 2022-12-09 00:55:08
5
+ Hash ID: 090bb050d570d2b668477c3bd0f979dc3a34b9db
6
+ ==============================================================================
7
+ xclbin Information
8
+ ------------------
9
+ Generated by: v++ (2021.2) on 2021-10-14-04:41:01
10
+ Version: 2.14.384
11
+ Kernels: opt_kernel
12
+ Signature:
13
+ Content: Bitstream
14
+ UUID (xclbin): ce5651b8-ff94-7baf-4833-5b6446d1a345
15
+ Sections: DEBUG_IP_LAYOUT, BITSTREAM, MEM_TOPOLOGY, IP_LAYOUT,
16
+ CONNECTIVITY, CLOCK_FREQ_TOPOLOGY, BUILD_METADATA,
17
+ EMBEDDED_METADATA, SYSTEM_METADATA,
18
+ GROUP_CONNECTIVITY, GROUP_TOPOLOGY
19
+ ==============================================================================
20
+ Hardware Platform (Shell) Information
21
+ -------------------------------------
22
+ Vendor: xilinx
23
+ Board: u280
24
+ Name: xdma
25
+ Version: 201920.3
26
+ Generated Version: Vivado 2019.2 (SW Build: 2742762)
27
+ Created:
28
+ Tue Jan 21 23:21:22 2020 FPGA Device: xcu280
29
+ Board Vendor: xilinx.com
30
+ Board Name: xilinx.com:au280:1.0
31
+ Board Part: xilinx.com:au280:part0:1.0
32
+ Platform VBNV: xilinx_u280_xdma_201920_3
33
+ Static UUID: f2b82d53-372f-45a4-bbe9-3d1c980216da
34
+ Feature ROM TimeStamp: 1579649056
35
+
36
+ Scalable Clocks
37
+ ---------------
38
+ Name: clk_out1_pfm_top_clkwiz_hbm_aclk_0
39
+ Index: 0
40
+ Type: SYSTEM
41
+ Frequency: 450 MHz
42
+
43
+ Name: DATA_CLK
44
+ Index: 1
45
+ Type: DATA
46
+ Frequency: 220 MHz
47
+
48
+ Name: KERNEL_CLK
49
+ Index: 2
50
+ Type: KERNEL
51
+ Frequency: 500 MHz
52
+
53
+ System Clocks
54
+ ------
55
+ Name: _bd_top_clkwiz_kernel2_clk_out1
56
+ Type: SCALABLE
57
+ Default Freq: 500 MHz
58
+ Requested Freq: 500 MHz
59
+ Achieved Freq: 500 MHz
60
+
61
+ Name: _bd_top_clkwiz_kernel_clk_out1
62
+ Type: SCALABLE
63
+ Default Freq: 300 MHz
64
+ Requested Freq: 300 MHz
65
+ Achieved Freq: 220 MHz
66
+
67
+ Memory Configuration
68
+ --------------------
69
+ Name: HBM[0]
70
+ Index: 0
71
+ Type: MEM_DDR4
72
+ Base Address: 0x0
73
+ Address Size: 0x10000000
74
+ Bank Used: Yes
75
+
76
+ Name: HBM[1]
77
+ Index: 1
78
+ Type: MEM_DDR4
79
+ Base Address: 0x10000000
80
+ Address Size: 0x10000000
81
+ Bank Used: Yes
82
+
83
+ Name: HBM[2]
84
+ Index: 2
85
+ Type: MEM_DRAM
86
+ Base Address: 0x20000000
87
+ Address Size: 0x10000000
88
+ Bank Used: Yes
89
+
90
+ Name: HBM[3]
91
+ Index: 3
92
+ Type: MEM_DRAM
93
+ Base Address: 0x30000000
94
+ Address Size: 0x10000000
95
+ Bank Used: Yes
96
+
97
+ Name: HBM[4]
98
+ Index: 4
99
+ Type: MEM_DRAM
100
+ Base Address: 0x40000000
101
+ Address Size: 0x10000000
102
+ Bank Used: No
103
+
104
+ Name: HBM[5]
105
+ Index: 5
106
+ Type: MEM_DRAM
107
+ Base Address: 0x50000000
108
+ Address Size: 0x10000000
109
+ Bank Used: No
110
+
111
+ Name: HBM[6]
112
+ Index: 6
113
+ Type: MEM_DRAM
114
+ Base Address: 0x60000000
115
+ Address Size: 0x10000000
116
+ Bank Used: No
117
+
118
+ Name: HBM[7]
119
+ Index: 7
120
+ Type: MEM_DRAM
121
+ Base Address: 0x70000000
122
+ Address Size: 0x10000000
123
+ Bank Used: No
124
+
125
+ Name: HBM[8]
126
+ Index: 8
127
+ Type: MEM_DRAM
128
+ Base Address: 0x80000000
129
+ Address Size: 0x10000000
130
+ Bank Used: No
131
+
132
+ Name: HBM[9]
133
+ Index: 9
134
+ Type: MEM_DRAM
135
+ Base Address: 0x90000000
136
+ Address Size: 0x10000000
137
+ Bank Used: No
138
+
139
+ Name: HBM[10]
140
+ Index: 10
141
+ Type: MEM_DRAM
142
+ Base Address: 0xa0000000
143
+ Address Size: 0x10000000
144
+ Bank Used: No
145
+
146
+ Name: HBM[11]
147
+ Index: 11
148
+ Type: MEM_DRAM
149
+ Base Address: 0xb0000000
150
+ Address Size: 0x10000000
151
+ Bank Used: No
152
+
153
+ Name: HBM[12]
154
+ Index: 12
155
+ Type: MEM_DRAM
156
+ Base Address: 0xc0000000
157
+ Address Size: 0x10000000
158
+ Bank Used: No
159
+
160
+ Name: HBM[13]
161
+ Index: 13
162
+ Type: MEM_DRAM
163
+ Base Address: 0xd0000000
164
+ Address Size: 0x10000000
165
+ Bank Used: No
166
+
167
+ Name: HBM[14]
168
+ Index: 14
169
+ Type: MEM_DRAM
170
+ Base Address: 0xe0000000
171
+ Address Size: 0x10000000
172
+ Bank Used: No
173
+
174
+ Name: HBM[15]
175
+ Index: 15
176
+ Type: MEM_DRAM
177
+ Base Address: 0xf0000000
178
+ Address Size: 0x10000000
179
+ Bank Used: No
180
+
181
+ Name: HBM[16]
182
+ Index: 16
183
+ Type: MEM_DRAM
184
+ Base Address: 0x100000000
185
+ Address Size: 0x10000000
186
+ Bank Used: Yes
187
+
188
+ Name: HBM[17]
189
+ Index: 17
190
+ Type: MEM_DRAM
191
+ Base Address: 0x110000000
192
+ Address Size: 0x10000000
193
+ Bank Used: Yes
194
+
195
+ Name: HBM[18]
196
+ Index: 18
197
+ Type: MEM_DRAM
198
+ Base Address: 0x120000000
199
+ Address Size: 0x10000000
200
+ Bank Used: Yes
201
+
202
+ Name: HBM[19]
203
+ Index: 19
204
+ Type: MEM_DRAM
205
+ Base Address: 0x130000000
206
+ Address Size: 0x10000000
207
+ Bank Used: No
208
+
209
+ Name: HBM[20]
210
+ Index: 20
211
+ Type: MEM_DRAM
212
+ Base Address: 0x140000000
213
+ Address Size: 0x10000000
214
+ Bank Used: No
215
+
216
+ Name: HBM[21]
217
+ Index: 21
218
+ Type: MEM_DRAM
219
+ Base Address: 0x150000000
220
+ Address Size: 0x10000000
221
+ Bank Used: No
222
+
223
+ Name: HBM[22]
224
+ Index: 22
225
+ Type: MEM_DRAM
226
+ Base Address: 0x160000000
227
+ Address Size: 0x10000000
228
+ Bank Used: No
229
+
230
+ Name: HBM[23]
231
+ Index: 23
232
+ Type: MEM_DRAM
233
+ Base Address: 0x170000000
234
+ Address Size: 0x10000000
235
+ Bank Used: No
236
+
237
+ Name: HBM[24]
238
+ Index: 24
239
+ Type: MEM_DRAM
240
+ Base Address: 0x180000000
241
+ Address Size: 0x10000000
242
+ Bank Used: No
243
+
244
+ Name: HBM[25]
245
+ Index: 25
246
+ Type: MEM_DRAM
247
+ Base Address: 0x190000000
248
+ Address Size: 0x10000000
249
+ Bank Used: No
250
+
251
+ Name: HBM[26]
252
+ Index: 26
253
+ Type: MEM_DRAM
254
+ Base Address: 0x1a0000000
255
+ Address Size: 0x10000000
256
+ Bank Used: No
257
+
258
+ Name: HBM[27]
259
+ Index: 27
260
+ Type: MEM_DRAM
261
+ Base Address: 0x1b0000000
262
+ Address Size: 0x10000000
263
+ Bank Used: No
264
+
265
+ Name: HBM[28]
266
+ Index: 28
267
+ Type: MEM_DRAM
268
+ Base Address: 0x1c0000000
269
+ Address Size: 0x10000000
270
+ Bank Used: No
271
+
272
+ Name: HBM[29]
273
+ Index: 29
274
+ Type: MEM_DRAM
275
+ Base Address: 0x1d0000000
276
+ Address Size: 0x10000000
277
+ Bank Used: No
278
+
279
+ Name: HBM[30]
280
+ Index: 30
281
+ Type: MEM_DRAM
282
+ Base Address: 0x1e0000000
283
+ Address Size: 0x10000000
284
+ Bank Used: No
285
+
286
+ Name: HBM[31]
287
+ Index: 31
288
+ Type: MEM_DRAM
289
+ Base Address: 0x1f0000000
290
+ Address Size: 0x10000000
291
+ Bank Used: No
292
+
293
+ Name: DDR[0]
294
+ Index: 32
295
+ Type: MEM_DRAM
296
+ Base Address: 0x0
297
+ Address Size: 0x0
298
+ Bank Used: No
299
+
300
+ Name: DDR[1]
301
+ Index: 33
302
+ Type: MEM_DRAM
303
+ Base Address: 0x0
304
+ Address Size: 0x0
305
+ Bank Used: No
306
+
307
+ Name: PLRAM[0]
308
+ Index: 34
309
+ Type: MEM_DRAM
310
+ Base Address: 0x0
311
+ Address Size: 0x0
312
+ Bank Used: No
313
+
314
+ Name: PLRAM[1]
315
+ Index: 35
316
+ Type: MEM_DRAM
317
+ Base Address: 0x0
318
+ Address Size: 0x0
319
+ Bank Used: No
320
+
321
+ Name: PLRAM[2]
322
+ Index: 36
323
+ Type: MEM_DRAM
324
+ Base Address: 0x0
325
+ Address Size: 0x0
326
+ Bank Used: No
327
+
328
+ Name: PLRAM[3]
329
+ Index: 37
330
+ Type: MEM_DRAM
331
+ Base Address: 0x0
332
+ Address Size: 0x0
333
+ Bank Used: No
334
+
335
+ Name: PLRAM[4]
336
+ Index: 38
337
+ Type: MEM_DRAM
338
+ Base Address: 0x0
339
+ Address Size: 0x0
340
+ Bank Used: No
341
+
342
+ Name: PLRAM[5]
343
+ Index: 39
344
+ Type: MEM_DRAM
345
+ Base Address: 0x0
346
+ Address Size: 0x0
347
+ Bank Used: No
348
+ ==============================================================================
349
+ Kernel: opt_kernel
350
+
351
+ Definition
352
+ ----------
353
+ Signature: opt_kernel (const int L, const int L_out, const int seq_len, const int reload, ap_uint<512>* X_acc0, ap_uint<512>* X_acc1, ap_uint<512>* W_acc0, ap_uint<512>* W_acc1, ap_uint<64>* acc0_out, ap_uint<64>* acc1_out, int* cycle_count)
354
+
355
+ Ports
356
+ -----
357
+ Port: m_axi_X_acc0
358
+ Mode: master
359
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
360
+ Data Width: 512 bits
361
+ Port Type: addressable
362
+
363
+ Port: m_axi_X_acc1
364
+ Mode: master
365
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
366
+ Data Width: 512 bits
367
+ Port Type: addressable
368
+
369
+ Port: m_axi_W_acc0
370
+ Mode: master
371
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
372
+ Data Width: 512 bits
373
+ Port Type: addressable
374
+
375
+ Port: m_axi_W_acc1
376
+ Mode: master
377
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
378
+ Data Width: 512 bits
379
+ Port Type: addressable
380
+
381
+ Port: m_axi_acc0_out
382
+ Mode: master
383
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
384
+ Data Width: 64 bits
385
+ Port Type: addressable
386
+
387
+ Port: m_axi_acc1_out
388
+ Mode: master
389
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
390
+ Data Width: 64 bits
391
+ Port Type: addressable
392
+
393
+ Port: m_axi_cycle_count
394
+ Mode: master
395
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
396
+ Data Width: 32 bits
397
+ Port Type: addressable
398
+
399
+ Port: s_axi_control
400
+ Mode: slave
401
+ Range (bytes): 0x1000
402
+ Data Width: 32 bits
403
+ Port Type: addressable
404
+
405
+ --------------------------
406
+ Instance: opt_kernel
407
+ Base Address: 0x1800000
408
+
409
+ Argument: L
410
+ Register Offset: 0x10
411
+ Port: s_axi_control
412
+ Memory: <not applicable>
413
+
414
+ Argument: L_out
415
+ Register Offset: 0x18
416
+ Port: s_axi_control
417
+ Memory: <not applicable>
418
+
419
+ Argument: seq_len
420
+ Register Offset: 0x20
421
+ Port: s_axi_control
422
+ Memory: <not applicable>
423
+
424
+ Argument: reload
425
+ Register Offset: 0x28
426
+ Port: s_axi_control
427
+ Memory: <not applicable>
428
+
429
+ Argument: X_acc0
430
+ Register Offset: 0x30
431
+ Port: m_axi_X_acc0
432
+ Memory: HBM[0] (MEM_DDR4)
433
+
434
+ Argument: X_acc1
435
+ Register Offset: 0x3c
436
+ Port: m_axi_X_acc1
437
+ Memory: HBM[16] (MEM_DRAM)
438
+
439
+ Argument: W_acc0
440
+ Register Offset: 0x48
441
+ Port: m_axi_W_acc0
442
+ Memory: HBM[1] (MEM_DDR4)
443
+
444
+ Argument: W_acc1
445
+ Register Offset: 0x54
446
+ Port: m_axi_W_acc1
447
+ Memory: HBM[17] (MEM_DRAM)
448
+
449
+ Argument: acc0_out
450
+ Register Offset: 0x60
451
+ Port: m_axi_acc0_out
452
+ Memory: HBM[2] (MEM_DRAM)
453
+
454
+ Argument: acc1_out
455
+ Register Offset: 0x6c
456
+ Port: m_axi_acc1_out
457
+ Memory: HBM[18] (MEM_DRAM)
458
+
459
+ Argument: cycle_count
460
+ Register Offset: 0x78
461
+ Port: m_axi_cycle_count
462
+ Memory: HBM[3] (MEM_DRAM)
463
+ ==============================================================================
464
+ Generated By
465
+ ------------
466
+ Command: v++
467
+ Version: 2021.2 - 2021-10-14-04:41:01 (SW BUILD: 3363252)
468
+ Command Line: v++ --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/run/link_config.ini --connectivity.nk opt_kernel:1:opt_kernel --connectivity.sp opt_kernel.X_acc0:HBM[0] --connectivity.sp opt_kernel.X_acc1:HBM[16] --connectivity.sp opt_kernel.W_acc0:HBM[1] --connectivity.sp opt_kernel.W_acc1:HBM[17] --connectivity.sp opt_kernel.acc0_out:HBM[2] --connectivity.sp opt_kernel.acc1_out:HBM[18] --connectivity.sp opt_kernel.cycle_count:HBM[3] --input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt.hw.xo --kernel opt_kernel --link --optimize 3 --output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin --platform xilinx_u280_xdma_201920_3 --report_level 2 --save-temps --target hw --temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget -propconst -sweep -shift_register_opt} --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=EarlyBlockPlacement --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt-floorplan.tcl --vivado.synth.jobs 8
469
+ Options: --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/run/link_config.ini
470
+ --connectivity.nk opt_kernel:1:opt_kernel
471
+ --connectivity.sp opt_kernel.X_acc0:HBM[0]
472
+ --connectivity.sp opt_kernel.X_acc1:HBM[16]
473
+ --connectivity.sp opt_kernel.W_acc0:HBM[1]
474
+ --connectivity.sp opt_kernel.W_acc1:HBM[17]
475
+ --connectivity.sp opt_kernel.acc0_out:HBM[2]
476
+ --connectivity.sp opt_kernel.acc1_out:HBM[18]
477
+ --connectivity.sp opt_kernel.cycle_count:HBM[3]
478
+ --input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt.hw.xo
479
+ --kernel opt_kernel
480
+ --link
481
+ --optimize 3
482
+ --output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin
483
+ --platform xilinx_u280_xdma_201920_3
484
+ --report_level 2
485
+ --save-temps
486
+ --target hw
487
+ --temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp
488
+ --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1
489
+ --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget
490
+ -propconst
491
+ -sweep
492
+ -shift_register_opt}
493
+ --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=EarlyBlockPlacement
494
+ --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore
495
+ --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore
496
+ --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt-floorplan.tcl
497
+ --vivado.synth.jobs 8
498
+ ==============================================================================
499
+ User Added Key Value Pairs
500
+ --------------------------
501
+ <empty>
502
+ ==============================================================================
gpt-2-medium/export_xo.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from rapidstream import RapidStreamTAPA, DeviceFactory, get_u250_vitis_device_factory
2
+ from pathlib import Path
3
+ import os
4
+
5
+ CURR_DIR = os.path.dirname(os.path.abspath(__file__))
6
+ BUILD_DIR = "rs_build"
7
+ VITIS_PLATFORM = "~/vpk180_linux_platform/vpk180_pfm_vitis/export/vpk180_pfm_vitis/vpk180_pfm_vitis.xpfm"
8
+
9
+
10
+ rs = RapidStreamTAPA(BUILD_DIR)
11
+
12
+ # factory = get_u250_vitis_device_factory(VITIS_PLATFORM)
13
+ factory = DeviceFactory(
14
+ row=4,
15
+ col=2,
16
+ part_num="xcvp1802-lsvc4072-2MP-e-S",
17
+ board_name="xilinx.com:vpk180:part0:1.1",
18
+ )
19
+
20
+ # Set the pblocks of the device so that each slot contains half of an SLR:
21
+ factory.set_slot_pblock(0, 0, ["-add CLOCKREGION_X0Y1:CLOCKREGION_X4Y4"])
22
+ factory.set_slot_pblock(1, 0, ["-add CLOCKREGION_X5Y1:CLOCKREGION_X9Y4"])
23
+ factory.set_slot_pblock(0, 1, ["-add CLOCKREGION_X0Y5:CLOCKREGION_X4Y7"])
24
+ factory.set_slot_pblock(1, 1, ["-add CLOCKREGION_X5Y5:CLOCKREGION_X9Y7"])
25
+
26
+ factory.set_slot_pblock(0, 2, ["-add CLOCKREGION_X0Y8:CLOCKREGION_X4Y10"])
27
+ factory.set_slot_pblock(1, 2, ["-add CLOCKREGION_X5Y8:CLOCKREGION_X9Y10"])
28
+ factory.set_slot_pblock(0, 3, ["-add CLOCKREGION_X0Y11:CLOCKREGION_X4Y13"])
29
+ factory.set_slot_pblock(1, 3, ["-add CLOCKREGION_X5Y11:CLOCKREGION_X9Y13"])
30
+
31
+ # There are 18870 total SLL nodes for VP1552:
32
+ factory.set_slot_capacity(0, 0, north=9435)
33
+ factory.set_slot_capacity(1, 0, north=9435)
34
+ factory.set_slot_capacity(0, 1, north=9435)
35
+ factory.set_slot_capacity(1, 1, north=9435)
36
+ factory.set_slot_capacity(0, 2, north=9435)
37
+ factory.set_slot_capacity(1, 2, north=9435)
38
+
39
+ # Call factory to extract the slot resources automatically from Vivado:
40
+ factory.extract_slot_resources()
41
+
42
+ rs.set_virtual_device(factory.generate_virtual_device())
43
+
44
+ rs.add_xo_file("./gpt2-sa.tapa/gpt2.xo")
45
+ rs.set_top_module_name("opt_kernel")
46
+ rs.add_clock("ap_clk", period_ns=3.33)
47
+ rs.set_vitis_connectivity_config("link_config_versal.ini")
48
+
49
+ work_dir_to_ir = {Path(f'{CURR_DIR}/{BUILD_DIR}/dse/candidate_5'): Path(f'{CURR_DIR}/{BUILD_DIR}/dse/candidate_5/add_pipeline.json')}
50
+ rs.remote_ip_cache = Path(f"{CURR_DIR}/{BUILD_DIR}")
51
+ rs.set_vitis_platform(VITIS_PLATFORM)
52
+ rs.parallel_export_candidates(work_dir_to_ir)
gpt-2-medium/generate_bitstream_sample.sh ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ TARGET=hw
3
+ # TARGET=hw_emu
4
+ # DEBUG=-g
5
+
6
+ TOP=opt_kernel
7
+ XO='/path/to/opt_kernel.xo'
8
+ CONSTRAINT='/path/to/floorplanning/constraint.tcl'
9
+ >&2 echo "Using the default clock target of the platform."
10
+ PLATFORM="/path/to/vitis/vpk180.xpfm"
11
+ TARGET_FREQUENCY=240000000
12
+ if [ -z $PLATFORM ]; then echo Please edit this file and set a valid PLATFORM= on line "${LINENO}"; exit; fi
13
+
14
+ OUTPUT_DIR="$(pwd)/vitis_run_${TARGET}_ln"
15
+
16
+ MAX_SYNTH_JOBS=16
17
+ STRATEGY="Explore"
18
+ PLACEMENT_STRATEGY="Explore"
19
+
20
+ v++ ${DEBUG} \
21
+ --link \
22
+ --output "${OUTPUT_DIR}/${TOP}_vpk180.xsa" \
23
+ --kernel ${TOP} \
24
+ --platform ${PLATFORM} \
25
+ --target ${TARGET} \
26
+ --report_level 2 \
27
+ --temp_dir "${OUTPUT_DIR}/${TOP}_vpk180.temp" \
28
+ --optimize 3 \
29
+ --connectivity.nk ${TOP}:1:${TOP} \
30
+ --save-temps \
31
+ "${XO}" \
32
+ --vivado.synth.jobs ${MAX_SYNTH_JOBS} \
33
+ --vivado.prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 \
34
+ --vivado.prop=run.impl_1.STEPS.OPT_DESIGN.ARGS.DIRECTIVE=$STRATEGY \
35
+ --vivado.prop=run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE\ OPTIONS}={-debug_log} \
36
+ --vivado.prop=run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=$PLACEMENT_STRATEGY \
37
+ --vivado.prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=$STRATEGY \
38
+ --vivado.prop=run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=$STRATEGY \
39
+ --clock.default_freqhz ${TARGET_FREQUENCY} \
40
+ --vivado.prop=run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=$CONSTRAINT \
gpt-2-medium/hbm_config.ini ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ [connectivity]
2
+ sp=opt_kernel.X_acc0:HBM[0]
3
+ sp=opt_kernel.X_acc1:HBM[16]
4
+ sp=opt_kernel.W_acc0:HBM[1]
5
+ sp=opt_kernel.W_acc1:HBM[17]
6
+ sp=opt_kernel.acc0_out:HBM[2]
7
+ sp=opt_kernel.cycle_count:HBM[19]
gpt-2-medium/host-u280.cpp ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <vector>
2
+ #include <cmath>
3
+ #include <iostream>
4
+ #include <string>
5
+ #include <ctime>
6
+ #include <cmath>
7
+ #include <tapa.h>
8
+ #include <gflags/gflags.h>
9
+ #include <ap_int.h>
10
+
11
+ constexpr int D = 1024;
12
+ constexpr int D_ffn = 5504;
13
+ constexpr int N_head = 16;
14
+ constexpr int MAX_SEQ_LEN = 1024;
15
+ constexpr int NUM_SLR = 3;
16
+ constexpr int NUM_DUM_SLR = 4;
17
+ constexpr int D_head = D / N_head;
18
+ constexpr int FFN_WEIGHT_SIZE = D * D_ffn;
19
+ constexpr int OUT_WEIGHT_SIZE = D * D;
20
+ constexpr int QKV_WEIGHT_SIZE = D * D / N_head * NUM_DUM_SLR * 2; // multi-head attention
21
+
22
+ using std::vector;
23
+ using int_v16 = tapa::vec_t<int, 16>;
24
+ using int4_v128 = tapa::vec_t<ap_int<4>, 128>;
25
+ using int8_v64 = tapa::vec_t<ap_int<8>, 64>;
26
+
27
+ void opt_kernel(
28
+ const int L,
29
+ const int L_out,
30
+ const int seq_len,
31
+ // tapa::mmap<int> inst, // inst[0] = L, inst[1] = reload_weight
32
+ tapa::mmap<ap_uint<512>> X_acc0,
33
+ tapa::mmap<ap_uint<512>> X_acc1,
34
+ tapa::mmap<ap_uint<512>> W_acc0,
35
+ tapa::mmap<ap_uint<512>> W_acc1,
36
+ tapa::mmap<ap_uint<128>> acc0_out,
37
+ // tapa::mmap<ap_uint<64>> acc1_out,
38
+ tapa::mmap<int> cycle_count
39
+ );
40
+
41
+ template <typename T>
42
+ using aligned_vector = std::vector<T, tapa::aligned_allocator<T>>;
43
+
44
+ DEFINE_string(bitstream, "", "path to bitstream file");
45
+
46
+ int main(int argc, char *argv[]){
47
+ gflags::ParseCommandLineFlags(&argc, &argv, true);
48
+
49
+ const int L = argc > 1 ? atoll(argv[1]) : MAX_SEQ_LEN;
50
+
51
+ srand((unsigned)time(nullptr));
52
+
53
+ // data preparation
54
+ aligned_vector<int> inst = {L, 1};
55
+ aligned_vector<ap_int<8>> X_acc0(L * D, 0);
56
+ aligned_vector<ap_int<8>> X_acc1(L * D, 0);
57
+ aligned_vector<ap_int<8>> W_acc0(D * D_head * NUM_DUM_SLR * 10 + D * D_ffn, 0);
58
+ aligned_vector<ap_int<8>> W_acc1(D * D_head * NUM_DUM_SLR * 10 + D * D_ffn, 0);
59
+ aligned_vector<ap_uint<128>> acc0_out(NUM_SLR * L * D / 8);
60
+ // aligned_vector<ap_uint<512>> acc0_out(NUM_SLR, aligned_vector<ap_uint<512>>(L * L / 16));
61
+ aligned_vector<ap_uint<64>> acc1_out(NUM_SLR * L * D / 8);
62
+ aligned_vector<int> cycle_count(1);
63
+
64
+
65
+ vector<int> X_copy(L * D);
66
+ vector<vector<int>> W_acc0_split(NUM_DUM_SLR, vector<int>(D * D_head * 8));
67
+ vector<vector<int>> W_acc1_split(NUM_DUM_SLR, vector<int>(D * D_head * 8));
68
+ vector<vector<int>> W_k_split(NUM_DUM_SLR, vector<int>(D * D_head * 8));
69
+ vector<aligned_vector<int>> q_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
70
+ vector<aligned_vector<int>> k_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
71
+ vector<aligned_vector<int>> attn_golden(NUM_DUM_SLR, aligned_vector<int>(L * L));
72
+ vector<aligned_vector<int>> acc1_out_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
73
+
74
+ // for(int i = 0; i < L * D; i++){
75
+ // int val = (rand() % 8) + 1;
76
+ // ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
77
+ // X_copy[i] = val;
78
+ // X_acc0[i] = ap_int<8>(full(7, 0));
79
+ // X_acc1[i] = ap_int<8>(full(7, 0));
80
+ // }
81
+
82
+ // for(int i = 0; i < D * D_head * NUM_DUM_SLR * 4; i++){
83
+ // int val = (rand() % 6) - 1;
84
+ // ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
85
+ // W_acc0[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
86
+ // W_acc0_split[(i / 32) % 4][(i / 128) * 32 + (i % 32)] = val;
87
+ // }
88
+
89
+ // for(int i = 0; i < D * D_head * NUM_DUM_SLR * 4; i++){
90
+ // int val = (rand() % 6) - 1;
91
+ // ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
92
+ // W_acc1[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
93
+ // W_acc1_split[(i / 32) % 4][(i / 128) * 32 + (i % 32)] = val;
94
+ // }
95
+
96
+ // for(int i = D * D_head * NUM_DUM_SLR * 4; i < D * D_head * NUM_DUM_SLR * 12; i++){
97
+ // int val = (rand() % 6) - 1;
98
+ // int ind = i - D * D_head * NUM_DUM_SLR * 4;
99
+ // ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
100
+ // W_acc0[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
101
+ // W_acc1[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
102
+ // W_k_split[(ind / 32) % 4][(ind / 128) * 32 + (ind % 32)] = val;
103
+ // }
104
+
105
+ // // cpu
106
+ // for(int i = 0; i < NUM_SLR; i++){
107
+ // // WqX
108
+ // for(int j = 0; j < L; j++){
109
+ // for(int k = 0; k < D_head; k++){
110
+ // int acc = 0;
111
+ // for(int l = 0; l < D; l++){
112
+ // acc += X_copy[j*D+l] * W_acc0_split[i][l*D_head + k];
113
+ // }
114
+ // q_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
115
+ // }
116
+ // }
117
+
118
+ // //WvX
119
+ // for(int j = 0; j < L; j++){
120
+ // for(int k = 0; k < D_head; k++){
121
+ // int acc = 0;
122
+ // for(int l = 0; l < D; l++){
123
+ // acc += X_copy[j*D+l] * W_acc1_split[i][l*D_head + k];
124
+ // }
125
+ // acc1_out_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
126
+ // }
127
+ // }
128
+
129
+ // //WkX
130
+ // for(int j = 0; j < L; j++){
131
+ // for(int k = 0; k < D_head; k++){
132
+ // int acc = 0;
133
+ // for(int l = 0; l < D; l++){
134
+ // acc += X_copy[j*D+l] * W_k_split[i][l*D_head + k];
135
+ // }
136
+ // k_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
137
+ // }
138
+ // }
139
+
140
+ // // QK^T
141
+ // for(int j = 0; j < L; j++){
142
+ // for(int k = 0; k < L; k++){
143
+ // int acc = 0;
144
+ // for(int l = 0; l < D_head; l++){
145
+ // acc += q_golden[i][k*D_head+l] * k_golden[i][j*D_head+l];
146
+ // }
147
+ // attn_golden[i][j*D_head+k] = acc;
148
+ // }
149
+ // }
150
+ // }
151
+
152
+
153
+ // invoke the kernel
154
+ int64_t kernel_time_ns = 0;
155
+ for(int i = 0; i < 24; i++){
156
+ kernel_time_ns += tapa::invoke(opt_kernel, FLAGS_bitstream,
157
+ L * D, L * D / 16, L,
158
+ // tapa::read_only_mmap<int>(inst),
159
+ tapa::read_only_mmap<ap_int<8>>(X_acc0).reinterpret<ap_uint<512>>(),
160
+ tapa::read_only_mmap<ap_int<8>>(X_acc1).reinterpret<ap_uint<512>>(),
161
+ tapa::read_only_mmap<ap_int<8>>(W_acc0).reinterpret<ap_uint<512>>(),
162
+ tapa::read_only_mmap<ap_int<8>>(W_acc1).reinterpret<ap_uint<512>>(),
163
+ tapa::write_only_mmap<ap_uint<128>>(acc0_out),
164
+ // tapa::write_only_mmap<ap_uint<64>>(acc1_out),
165
+ tapa::write_only_mmap<int>(cycle_count));
166
+ }
167
+
168
+ std::clog << "cycle time: " << cycle_count[0] << std::endl;
169
+ std::clog << "kernel time: " << kernel_time_ns * 1e-9 << " s" << std::endl;
170
+
171
+ }
172
+
gpt-2-medium/host-versal.cpp ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <vector>
2
+ #include <cmath>
3
+ #include <iostream>
4
+ #include <string>
5
+ #include <ctime>
6
+ #include <cmath>
7
+ #include <tapa.h>
8
+ #include <gflags/gflags.h>
9
+ #include <ap_int.h>
10
+
11
+ constexpr int D = 1024;
12
+ constexpr int D_ffn = 4096;
13
+ constexpr int N_head = 16;
14
+ constexpr int MAX_SEQ_LEN = 1024;
15
+ constexpr int NUM_SLR = 4;
16
+ constexpr int NUM_DUM_SLR = 4;
17
+ constexpr int D_head = D / N_head;
18
+ constexpr int FFN_WEIGHT_SIZE = D * D_ffn;
19
+ constexpr int OUT_WEIGHT_SIZE = D * D;
20
+ constexpr int QKV_WEIGHT_SIZE = D * D / N_head * NUM_DUM_SLR * 2; // multi-head attention
21
+
22
+ using std::vector;
23
+ using int_v16 = tapa::vec_t<int, 16>;
24
+ using int4_v128 = tapa::vec_t<ap_int<4>, 128>;
25
+ using int8_v64 = tapa::vec_t<ap_int<8>, 64>;
26
+
27
+ void opt_kernel(
28
+ const int L,
29
+ const int L_out,
30
+ const int seq_len,
31
+ // tapa::mmap<int> inst, // inst[0] = L, inst[1] = reload_weight
32
+ tapa::mmap<ap_uint<512>> X_acc0,
33
+ tapa::mmap<ap_uint<512>> X_acc1,
34
+ tapa::mmap<ap_uint<512>> W_acc0,
35
+ tapa::mmap<ap_uint<512>> W_acc1,
36
+ tapa::mmap<ap_uint<128>> acc0_out,
37
+ // tapa::mmap<ap_uint<64>> acc1_out,
38
+ tapa::mmap<int> cycle_count
39
+ );
40
+
41
+ template <typename T>
42
+ using aligned_vector = std::vector<T, tapa::aligned_allocator<T>>;
43
+
44
+ DEFINE_string(bitstream, "", "path to bitstream file");
45
+
46
+ int main(int argc, char *argv[]){
47
+ gflags::ParseCommandLineFlags(&argc, &argv, true);
48
+
49
+ const int L = argc > 1 ? atoll(argv[1]) : MAX_SEQ_LEN;
50
+
51
+ srand((unsigned)time(nullptr));
52
+
53
+ // data preparation
54
+ aligned_vector<int> inst = {L, 1};
55
+ aligned_vector<ap_int<8>> X_acc0(L * D);
56
+ aligned_vector<ap_int<8>> X_acc1(L * D);
57
+ aligned_vector<ap_int<8>> W_acc0(D * D_head * NUM_DUM_SLR * 8 + D * D_ffn, 1);
58
+ aligned_vector<ap_int<8>> W_acc1(D * D_head * NUM_DUM_SLR * 8 + D * D_ffn, 1);
59
+ aligned_vector<ap_uint<128>> acc0_out(NUM_SLR * L * D / 8);
60
+ // aligned_vector<ap_uint<512>> acc0_out(NUM_SLR, aligned_vector<ap_uint<512>>(L * L / 16));
61
+ aligned_vector<ap_uint<64>> acc1_out(NUM_SLR * L * D / 8);
62
+ aligned_vector<int> cycle_count(1);
63
+
64
+
65
+ vector<int> X_copy(L * D);
66
+ vector<vector<int>> W_acc0_split(NUM_DUM_SLR, vector<int>(D * D_head * 8));
67
+ vector<vector<int>> W_acc1_split(NUM_DUM_SLR, vector<int>(D * D_head * 8));
68
+ vector<vector<int>> W_k_split(NUM_DUM_SLR, vector<int>(D * D_head * 8));
69
+ vector<aligned_vector<int>> q_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
70
+ vector<aligned_vector<int>> k_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
71
+ vector<aligned_vector<int>> attn_golden(NUM_DUM_SLR, aligned_vector<int>(L * L));
72
+ vector<aligned_vector<int>> acc1_out_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
73
+
74
+ for(int i = 0; i < L * D; i++){
75
+ int val = (rand() % 8) + 1;
76
+ ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
77
+ X_copy[i] = val;
78
+ X_acc0[i] = ap_int<8>(full(7, 0));
79
+ X_acc1[i] = ap_int<8>(full(7, 0));
80
+ }
81
+
82
+ for(int i = 0; i < D * D_head * NUM_DUM_SLR * 4; i++){
83
+ int val = (rand() % 6) - 1;
84
+ ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
85
+ W_acc0[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
86
+ W_acc0_split[(i / 32) % 4][(i / 128) * 32 + (i % 32)] = val;
87
+ }
88
+
89
+ for(int i = 0; i < D * D_head * NUM_DUM_SLR * 4; i++){
90
+ int val = (rand() % 6) - 1;
91
+ ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
92
+ W_acc1[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
93
+ W_acc1_split[(i / 32) % 4][(i / 128) * 32 + (i % 32)] = val;
94
+ }
95
+
96
+ for(int i = D * D_head * NUM_DUM_SLR * 4; i < D * D_head * NUM_DUM_SLR * 12; i++){
97
+ int val = (rand() % 6) - 1;
98
+ int ind = i - D * D_head * NUM_DUM_SLR * 4;
99
+ ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
100
+ W_acc0[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
101
+ W_acc1[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
102
+ W_k_split[(ind / 32) % 4][(ind / 128) * 32 + (ind % 32)] = val;
103
+ }
104
+
105
+ // cpu
106
+ for(int i = 0; i < NUM_SLR; i++){
107
+ // WqX
108
+ for(int j = 0; j < L; j++){
109
+ for(int k = 0; k < D_head; k++){
110
+ int acc = 0;
111
+ for(int l = 0; l < D; l++){
112
+ acc += X_copy[j*D+l] * W_acc0_split[i][l*D_head + k];
113
+ }
114
+ q_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
115
+ }
116
+ }
117
+
118
+ //WvX
119
+ for(int j = 0; j < L; j++){
120
+ for(int k = 0; k < D_head; k++){
121
+ int acc = 0;
122
+ for(int l = 0; l < D; l++){
123
+ acc += X_copy[j*D+l] * W_acc1_split[i][l*D_head + k];
124
+ }
125
+ acc1_out_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
126
+ }
127
+ }
128
+
129
+ //WkX
130
+ for(int j = 0; j < L; j++){
131
+ for(int k = 0; k < D_head; k++){
132
+ int acc = 0;
133
+ for(int l = 0; l < D; l++){
134
+ acc += X_copy[j*D+l] * W_k_split[i][l*D_head + k];
135
+ }
136
+ k_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
137
+ }
138
+ }
139
+
140
+ // QK^T
141
+ for(int j = 0; j < L; j++){
142
+ for(int k = 0; k < L; k++){
143
+ int acc = 0;
144
+ for(int l = 0; l < D_head; l++){
145
+ acc += q_golden[i][k*D_head+l] * k_golden[i][j*D_head+l];
146
+ }
147
+ attn_golden[i][j*D_head+k] = acc;
148
+ }
149
+ }
150
+ }
151
+
152
+
153
+ // invoke the kernel
154
+ int64_t kernel_time_ns = 0;
155
+ for(int i = 0; i < 1; i++){
156
+ kernel_time_ns = tapa::invoke(opt_kernel, FLAGS_bitstream,
157
+ L * D, L * D / 16, L,
158
+ // tapa::read_only_mmap<int>(inst),
159
+ tapa::read_only_mmap<ap_int<8>>(X_acc0).reinterpret<ap_uint<512>>(),
160
+ tapa::read_only_mmap<ap_int<8>>(X_acc1).reinterpret<ap_uint<512>>(),
161
+ tapa::read_only_mmap<ap_int<8>>(W_acc0).reinterpret<ap_uint<512>>(),
162
+ tapa::read_only_mmap<ap_int<8>>(W_acc1).reinterpret<ap_uint<512>>(),
163
+ tapa::write_only_mmap<ap_uint<128>>(acc0_out),
164
+ // tapa::write_only_mmap<ap_uint<64>>(acc1_out),
165
+ tapa::write_only_mmap<int>(cycle_count));
166
+ }
167
+
168
+ std::clog << "cycle time: " << cycle_count[0] << std::endl;
169
+ std::clog << "kernel time: " << kernel_time_ns * 1e-9 << " s" << std::endl;
170
+
171
+ int error = 0;
172
+
173
+ // compare
174
+ // for(int i = 0; i < NUM_SLR; i++){
175
+ // for(int j = 0; j < 4; j++){
176
+ // for(int k = 0; k < 16; k++){
177
+ // if(tapa::bit_cast<int>(ap_int<32>(acc0_out[i][j](k*32+31,k*32)))-attn_golden[i][j*16+k] != 0){
178
+ // std::clog << "slr: " << i << ", index: " << j << ", actual: " << tapa::bit_cast<int>(ap_int<32>(acc0_out[i][j](k*32+31,k*32))) << ", expect: " << attn_golden[i][j*16+k] << std::endl;
179
+ // error++;
180
+ // }
181
+ // }
182
+ // }
183
+ // }
184
+
185
+ if (error == 0) {
186
+ std::clog << "PASSED" << std::endl;
187
+ } else {
188
+ std::clog << "FAILED" << std::endl;
189
+ return 1;
190
+ }
191
+ return 0;
192
+
193
+ }
194
+
gpt-2-medium/host.cpp ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <vector>
2
+ #include <cmath>
3
+ #include <iostream>
4
+ #include <string>
5
+ #include <ctime>
6
+ #include <cmath>
7
+ #include <tapa.h>
8
+ #include <gflags/gflags.h>
9
+ #include <ap_int.h>
10
+
11
+ constexpr int D = 1024;
12
+ constexpr int D_ffn = 4096;
13
+ constexpr int N_head = 16;
14
+ constexpr int MAX_SEQ_LEN = 1024;
15
+ constexpr int NUM_SLR = 3;
16
+ constexpr int NUM_DUM_SLR = 4;
17
+ constexpr int D_head = D / N_head;
18
+ constexpr int FFN_WEIGHT_SIZE = D * D_ffn;
19
+ constexpr int OUT_WEIGHT_SIZE = D * D;
20
+ constexpr int QKV_WEIGHT_SIZE = D * D / N_head * NUM_DUM_SLR * 2; // multi-head attention
21
+
22
+ using std::vector;
23
+ using int_v16 = tapa::vec_t<int, 16>;
24
+ using int4_v128 = tapa::vec_t<ap_int<4>, 128>;
25
+ using int8_v64 = tapa::vec_t<ap_int<8>, 64>;
26
+
27
+ void opt_kernel(
28
+ const int L,
29
+ const int L_out,
30
+ const int seq_len,
31
+ // tapa::mmap<int> inst, // inst[0] = L, inst[1] = reload_weight
32
+ tapa::mmap<ap_uint<512>> X_acc0,
33
+ tapa::mmap<ap_uint<512>> X_acc1,
34
+ tapa::mmap<ap_uint<512>> W_acc0,
35
+ tapa::mmap<ap_uint<512>> W_acc1,
36
+ tapa::mmap<ap_uint<64>> acc0_out,
37
+ tapa::mmap<ap_uint<64>> acc1_out,
38
+ tapa::mmap<int> cycle_count
39
+ );
40
+
41
+ template <typename T>
42
+ using aligned_vector = std::vector<T, tapa::aligned_allocator<T>>;
43
+
44
+ DEFINE_string(bitstream, "", "path to bitstream file");
45
+
46
+ int main(int argc, char *argv[]){
47
+ gflags::ParseCommandLineFlags(&argc, &argv, true);
48
+
49
+ const int L = argc > 1 ? atoll(argv[1]) : MAX_SEQ_LEN;
50
+
51
+ srand((unsigned)time(nullptr));
52
+
53
+ // data preparation
54
+ aligned_vector<int> inst = {L, 1};
55
+ aligned_vector<ap_int<8>> X_acc0(L * D);
56
+ aligned_vector<ap_int<8>> X_acc1(L * D);
57
+ aligned_vector<ap_int<8>> W_acc0(D * D_head * NUM_DUM_SLR * 10);
58
+ aligned_vector<ap_int<8>> W_acc1(D * D_head * NUM_DUM_SLR * 10);
59
+ aligned_vector<ap_uint<64>> acc0_out(NUM_SLR * L * D / 8);
60
+ // aligned_vector<ap_uint<512>> acc0_out(NUM_SLR, aligned_vector<ap_uint<512>>(L * L / 16));
61
+ aligned_vector<ap_uint<64>> acc1_out(NUM_SLR * L * D / 8);
62
+ aligned_vector<int> cycle_count(1);
63
+
64
+
65
+ vector<int> X_copy(L * D);
66
+ vector<vector<int>> W_acc0_split(NUM_DUM_SLR, vector<int>(D * D_head * 10));
67
+ vector<vector<int>> W_acc1_split(NUM_DUM_SLR, vector<int>(D * D_head * 10));
68
+ vector<vector<int>> W_k_split(NUM_DUM_SLR, vector<int>(D * D_head * 10));
69
+ vector<aligned_vector<int>> q_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
70
+ vector<aligned_vector<int>> k_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
71
+ vector<aligned_vector<int>> attn_golden(NUM_DUM_SLR, aligned_vector<int>(L * L));
72
+ vector<aligned_vector<int>> acc1_out_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
73
+
74
+ for(int i = 0; i < L * D; i++){
75
+ int val = (rand() % 8) + 1;
76
+ ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
77
+ X_copy[i] = val;
78
+ X_acc0[i] = ap_int<8>(full(7, 0));
79
+ X_acc1[i] = ap_int<8>(full(7, 0));
80
+ }
81
+
82
+ for(int i = 0; i < D * D_head * NUM_DUM_SLR * 5; i++){
83
+ int val = (rand() % 6) - 1;
84
+ ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
85
+ W_acc0[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
86
+ W_acc0_split[(i / 32) % 4][(i / 128) * 32 + (i % 32)] = val;
87
+ }
88
+
89
+ for(int i = 0; i < D * D_head * NUM_DUM_SLR * 5; i++){
90
+ int val = (rand() % 6) - 1;
91
+ ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
92
+ W_acc1[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
93
+ W_acc1_split[(i / 32) % 4][(i / 128) * 32 + (i % 32)] = val;
94
+ }
95
+
96
+ for(int i = D * D_head * NUM_DUM_SLR * 5; i < D * D_head * NUM_DUM_SLR * 15; i++){
97
+ int val = (rand() % 6) - 1;
98
+ int ind = i - D * D_head * NUM_DUM_SLR * 5;
99
+ ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
100
+ W_acc0[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
101
+ W_acc1[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
102
+ W_k_split[(ind / 32) % 4][(ind / 128) * 32 + (ind % 32)] = val;
103
+ }
104
+
105
+ // cpu
106
+ for(int i = 0; i < NUM_SLR; i++){
107
+ // WqX
108
+ for(int j = 0; j < L; j++){
109
+ for(int k = 0; k < D_head; k++){
110
+ int acc = 0;
111
+ for(int l = 0; l < D; l++){
112
+ acc += X_copy[j*D+l] * W_acc0_split[i][l*D_head + k];
113
+ }
114
+ q_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
115
+ }
116
+ }
117
+
118
+ //WvX
119
+ for(int j = 0; j < L; j++){
120
+ for(int k = 0; k < D_head; k++){
121
+ int acc = 0;
122
+ for(int l = 0; l < D; l++){
123
+ acc += X_copy[j*D+l] * W_acc1_split[i][l*D_head + k];
124
+ }
125
+ acc1_out_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
126
+ }
127
+ }
128
+
129
+ //WkX
130
+ for(int j = 0; j < L; j++){
131
+ for(int k = 0; k < D_head; k++){
132
+ int acc = 0;
133
+ for(int l = 0; l < D; l++){
134
+ acc += X_copy[j*D+l] * W_k_split[i][l*D_head + k];
135
+ }
136
+ k_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
137
+ }
138
+ }
139
+
140
+ // QK^T
141
+ for(int j = 0; j < L; j++){
142
+ for(int k = 0; k < L; k++){
143
+ int acc = 0;
144
+ for(int l = 0; l < D_head; l++){
145
+ acc += q_golden[i][k*D_head+l] * k_golden[i][j*D_head+l];
146
+ }
147
+ attn_golden[i][j*D_head+k] = acc;
148
+ }
149
+ }
150
+ }
151
+
152
+
153
+ // invoke the kernel
154
+ int64_t kernel_time_ns = 0;
155
+ for(int i = 0; i < 24; i++){
156
+ kernel_time_ns += tapa::invoke(opt_kernel, FLAGS_bitstream,
157
+ L * D, L * D / 8, L,
158
+ // tapa::read_only_mmap<int>(inst),
159
+ tapa::read_only_mmap<ap_int<8>>(X_acc0).reinterpret<ap_uint<512>>(),
160
+ tapa::read_only_mmap<ap_int<8>>(X_acc1).reinterpret<ap_uint<512>>(),
161
+ tapa::read_only_mmap<ap_int<8>>(W_acc0).reinterpret<ap_uint<512>>(),
162
+ tapa::read_only_mmap<ap_int<8>>(W_acc1).reinterpret<ap_uint<512>>(),
163
+ tapa::write_only_mmap<ap_uint<64>>(acc0_out),
164
+ tapa::write_only_mmap<ap_uint<64>>(acc1_out),
165
+ tapa::write_only_mmap<int>(cycle_count));
166
+ }
167
+
168
+ // std::clog << "cycle time: " << cycle_count[0] << std::endl;
169
+ std::clog << "kernel time: " << kernel_time_ns * 2e-9 << " s" << std::endl;
170
+
171
+ int error = 0;
172
+
173
+ // compare
174
+ // for(int i = 0; i < NUM_SLR; i++){
175
+ // for(int j = 0; j < 4; j++){
176
+ // for(int k = 0; k < 16; k++){
177
+ // if(tapa::bit_cast<int>(ap_int<32>(acc0_out[i][j](k*32+31,k*32)))-attn_golden[i][j*16+k] != 0){
178
+ // std::clog << "slr: " << i << ", index: " << j << ", actual: " << tapa::bit_cast<int>(ap_int<32>(acc0_out[i][j](k*32+31,k*32))) << ", expect: " << attn_golden[i][j*16+k] << std::endl;
179
+ // error++;
180
+ // }
181
+ // }
182
+ // }
183
+ // }
184
+
185
+ if (error == 0) {
186
+ std::clog << "PASSED" << std::endl;
187
+ } else {
188
+ std::clog << "FAILED" << std::endl;
189
+ return 1;
190
+ }
191
+ return 0;
192
+
193
+ }
194
+
gpt-2-medium/host_opencl.cpp ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*******************************************************************************
2
+ Vendor: Xilinx
3
+ Associated Filename: vadd.cpp
4
+ Purpose: VITIS vector addition
5
+
6
+ *******************************************************************************
7
+ Copyright (C) 2019 XILINX, Inc.
8
+
9
+ This file contains confidential and proprietary information of Xilinx, Inc. and
10
+ is protected under U.S. and international copyright and other intellectual
11
+ property laws.
12
+
13
+ DISCLAIMER
14
+ This disclaimer is not a license and does not grant any rights to the materials
15
+ distributed herewith. Except as otherwise provided in a valid license issued to
16
+ you by Xilinx, and to the maximum extent permitted by applicable law:
17
+ (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND WITH ALL FAULTS, AND XILINX
18
+ HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY,
19
+ INCLUDING BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT, OR
20
+ FITNESS FOR ANY PARTICULAR PURPOSE; and (2) Xilinx shall not be liable (whether
21
+ in contract or tort, including negligence, or under any other theory of
22
+ liability) for any loss or damage of any kind or nature related to, arising under
23
+ or in connection with these materials, including for any direct, or any indirect,
24
+ special, incidental, or consequential loss or damage (including loss of data,
25
+ profits, goodwill, or any type of loss or damage suffered as a result of any
26
+ action brought by a third party) even if such damage or loss was reasonably
27
+ foreseeable or Xilinx had been advised of the possibility of the same.
28
+
29
+ CRITICAL APPLICATIONS
30
+ Xilinx products are not designed or intended to be fail-safe, or for use in any
31
+ application requiring fail-safe performance, such as life-support or safety
32
+ devices or systems, Class III medical devices, nuclear facilities, applications
33
+ related to the deployment of airbags, or any other applications that could lead
34
+ to death, personal injury, or severe property or environmental damage
35
+ (individually and collectively, "Critical Applications"). Customer assumes the
36
+ sole risk and liability of any use of Xilinx products in Critical Applications,
37
+ subject only to applicable laws and regulations governing limitations on product
38
+ liability.
39
+
40
+ THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS PART OF THIS FILE AT
41
+ ALL TIMES.
42
+
43
+ *******************************************************************************/
44
+
45
+ #define OCL_CHECK(error, call) \
46
+ call; \
47
+ if (error != CL_SUCCESS) { \
48
+ printf("%s:%d Error calling " #call ", error code is: %d\n", __FILE__, __LINE__, error); \
49
+ exit(EXIT_FAILURE); \
50
+ }
51
+
52
+ #include "host_opencl.h"
53
+ #include <fstream>
54
+ #include <iostream>
55
+ #include <stdlib.h>
56
+ #include <ap_int.h>
57
+
58
+ static const int DATA_SIZE = 4096;
59
+
60
+ static const std::string error_message =
61
+ "Error: Result mismatch:\n"
62
+ "i = %d CPU result = %d Device result = %d\n";
63
+
64
+ int main(int argc, char* argv[]) {
65
+ // TARGET_DEVICE macro needs to be passed from gcc command line
66
+ if (argc < 2) {
67
+ std::cout << "Usage: " << argv[0] << " <xclbin>" << std::endl;
68
+ return EXIT_FAILURE;
69
+ }
70
+
71
+ std::string xclbinFilename = argv[1];
72
+
73
+ // Compute the size of array in bytes
74
+ size_t size_in_bytes = DATA_SIZE * sizeof(int);
75
+ int L = 64;
76
+ if (argc == 3) {
77
+ L = atoi(argv[2]);
78
+ }
79
+ const int D = 1024;
80
+ const int NUM_DUM_SLR = 4;
81
+ const int NUM_SLR = 4;
82
+ const int D_head = 64;
83
+ const int D_ffn = 4096;
84
+
85
+ // Creates a vector of DATA_SIZE elements with an initial value of 10 and 32
86
+ // using customized allocator for getting buffer alignment to 4k boundary
87
+
88
+ std::vector<cl::Device> devices;
89
+ cl_int err;
90
+ cl::Context context;
91
+ cl::CommandQueue q;
92
+ cl::Kernel krnl_vector_add;
93
+ cl::Program program;
94
+ std::vector<cl::Platform> platforms;
95
+ bool found_device = false;
96
+
97
+ // traversing all Platforms To find Xilinx Platform and targeted
98
+ // Device in Xilinx Platform
99
+ cl::Platform::get(&platforms);
100
+ for (size_t i = 0; (i < platforms.size()) & (found_device == false); i++) {
101
+ cl::Platform platform = platforms[i];
102
+ std::string platformName = platform.getInfo<CL_PLATFORM_NAME>();
103
+ if (platformName == "Xilinx") {
104
+ devices.clear();
105
+ platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices);
106
+ if (devices.size()) {
107
+ found_device = true;
108
+ break;
109
+ }
110
+ }
111
+ }
112
+ if (found_device == false) {
113
+ std::cout << "Error: Unable to find Target Device " << std::endl;
114
+ return EXIT_FAILURE;
115
+ }
116
+
117
+ std::cout << "INFO: Reading " << xclbinFilename << std::endl;
118
+ FILE* fp;
119
+ if ((fp = fopen(xclbinFilename.c_str(), "r")) == nullptr) {
120
+ printf("ERROR: %s xclbin not available please build\n", xclbinFilename.c_str());
121
+ exit(EXIT_FAILURE);
122
+ }
123
+ // Load xclbin
124
+ std::cout << "Loading: '" << xclbinFilename << "'\n";
125
+ std::ifstream bin_file(xclbinFilename, std::ifstream::binary);
126
+ bin_file.seekg(0, bin_file.end);
127
+ unsigned nb = bin_file.tellg();
128
+ bin_file.seekg(0, bin_file.beg);
129
+ char* buf = new char[nb];
130
+ bin_file.read(buf, nb);
131
+
132
+ // Creating Program from Binary File
133
+ cl::Program::Binaries bins;
134
+ bins.push_back({buf, nb});
135
+ bool valid_device = false;
136
+ for (unsigned int i = 0; i < devices.size(); i++) {
137
+ auto device = devices[i];
138
+ // Creating Context and Command Queue for selected Device
139
+ OCL_CHECK(err, context = cl::Context(device, nullptr, nullptr, nullptr, &err));
140
+ OCL_CHECK(err, q = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err));
141
+ std::cout << "Trying to program device[" << i << "]: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
142
+ cl::Program program(context, {device}, bins, nullptr, &err);
143
+ if (err != CL_SUCCESS) {
144
+ std::cout << "Failed to program device[" << i << "] with xclbin file!\n";
145
+ } else {
146
+ std::cout << "Device[" << i << "]: program successful!\n";
147
+ OCL_CHECK(err, krnl_vector_add = cl::Kernel(program, "opt_kernel", &err));
148
+ valid_device = true;
149
+ break; // we break because we found a valid device
150
+ }
151
+ }
152
+ if (!valid_device) {
153
+ std::cout << "Failed to program any device found, exit!\n";
154
+ exit(EXIT_FAILURE);
155
+ }
156
+
157
+ // These commands will allocate memory on the Device. The cl::Buffer objects can
158
+ // be used to reference the memory locations on the device.
159
+ OCL_CHECK(err, cl::Buffer buffer_X_acc0(context, CL_MEM_READ_ONLY, (size_t)(L*D), NULL, &err));
160
+ OCL_CHECK(err, cl::Buffer buffer_X_acc1(context, CL_MEM_READ_ONLY, (size_t)(L*D), NULL, &err));
161
+ OCL_CHECK(err, cl::Buffer buffer_W_acc0(context, CL_MEM_READ_ONLY, (size_t)(D * D_head * NUM_DUM_SLR * 8 + D * D_ffn), NULL, &err));
162
+ OCL_CHECK(err, cl::Buffer buffer_W_acc1(context, CL_MEM_READ_ONLY, (size_t)(D * D_head * NUM_DUM_SLR * 8 + D * D_ffn), NULL, &err));
163
+ OCL_CHECK(err, cl::Buffer buffer_acc0_out(context, CL_MEM_WRITE_ONLY, (size_t)(NUM_SLR * L * D * 8), NULL, &err));
164
+ // OCL_CHECK(err, cl::Buffer buffer_acc1_out(context, CL_MEM_WRITE_ONLY, (size_t)(NUM_SLR * L * D), NULL, &err));
165
+ OCL_CHECK(err, cl::Buffer buffer_cycle(context, CL_MEM_WRITE_ONLY, sizeof(int), NULL, &err));
166
+
167
+ std::cout << "Finish creating buffer\n";
168
+
169
+ // set the kernel Arguments
170
+ int narg = 0;
171
+ OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, L*D));
172
+ OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, L*D/16));
173
+ OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, L));
174
+ OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_X_acc0));
175
+ OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_X_acc1));
176
+ OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_W_acc0));
177
+ OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_W_acc1));
178
+ OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_acc0_out));
179
+ // OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_acc1_out));
180
+ OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_cycle));
181
+
182
+ std::cout << "Finish setArgs\n";
183
+
184
+ // We then need to map our OpenCL buffers to get the pointers
185
+ ap_int<8>* X_acc0;
186
+ ap_int<8>* X_acc1;
187
+ ap_int<8>* W_acc0;
188
+ ap_int<8>* W_acc1;
189
+ ap_uint<128>* acc0_out;
190
+ // ap_uint<64>* acc1_out;
191
+ int* cycle;
192
+ OCL_CHECK(err,
193
+ X_acc0 = (ap_int<8>*)q.enqueueMapBuffer(buffer_X_acc0, CL_TRUE, CL_MAP_WRITE, 0, L*D, NULL, NULL, &err));
194
+ OCL_CHECK(err,
195
+ X_acc1 = (ap_int<8>*)q.enqueueMapBuffer(buffer_X_acc1, CL_TRUE, CL_MAP_WRITE, 0, L*D, NULL, NULL, &err));
196
+ OCL_CHECK(err,
197
+ W_acc0 = (ap_int<8>*)q.enqueueMapBuffer(buffer_W_acc0, CL_TRUE, CL_MAP_WRITE, 0, D * D_head * NUM_DUM_SLR * 8 + D * D_ffn, NULL, NULL, &err));
198
+ OCL_CHECK(err,
199
+ W_acc1 = (ap_int<8>*)q.enqueueMapBuffer(buffer_W_acc1, CL_TRUE, CL_MAP_WRITE, 0, D * D_head * NUM_DUM_SLR * 8 + D * D_ffn, NULL, NULL, &err));
200
+ OCL_CHECK(err, acc0_out = (ap_uint<128>*)q.enqueueMapBuffer(buffer_acc0_out, CL_TRUE, CL_MAP_READ, 0, NUM_SLR * L * D * 2, NULL,
201
+ NULL, &err));
202
+ // OCL_CHECK(err, acc1_out = (ap_uint<64>*)q.enqueueMapBuffer(buffer_acc1_out, CL_TRUE, CL_MAP_READ, 0, NUM_SLR * L * D, NULL,
203
+ // NULL, &err));
204
+ OCL_CHECK(err, cycle = (int*)q.enqueueMapBuffer(buffer_cycle, CL_TRUE, CL_MAP_READ, 0, sizeof(int), NULL,
205
+ NULL, &err));
206
+
207
+ // Initialize the vectors used in the test
208
+ for(int i = 0; i < L * D; i++){
209
+ X_acc0[i] = 1;
210
+ X_acc1[i] = 1;
211
+ }
212
+
213
+ for(int i = 0; i < D * D_head * NUM_DUM_SLR * 8 + D * D_ffn; i++){
214
+ W_acc1[i] = 1;
215
+ }
216
+
217
+ for(int i = 0; i < D * D_head * NUM_DUM_SLR * 8 + D * D_ffn; i++){
218
+ W_acc0[i] = 1;
219
+ }
220
+
221
+ std::cout << "Finish assigning values\n";
222
+
223
+ cl::Event event;
224
+ uint64_t nstimestart, nstimeend;
225
+ uint64_t exe_time = 0;
226
+
227
+ // Data will be migrated to kernel space
228
+ OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_X_acc0, buffer_X_acc1, buffer_W_acc0, buffer_W_acc1}, 0 /* 0 means from host*/));
229
+
230
+ std::cout << "Start kernel\n";
231
+
232
+ // Launch the Kernel
233
+ OCL_CHECK(err, err = q.enqueueTask(krnl_vector_add, nullptr, &event));
234
+
235
+ std::cout << "Finish kernel\n";
236
+
237
+ // The result of the previous kernel execution will need to be retrieved in
238
+ // order to view the results. This call will transfer the data from FPGA to
239
+ // source_results vector
240
+ OCL_CHECK(err, q.enqueueMigrateMemObjects({buffer_acc0_out, buffer_cycle}, CL_MIGRATE_MEM_OBJECT_HOST));
241
+
242
+ std::cout << "Receive data\n";
243
+
244
+ OCL_CHECK(err, q.finish());
245
+ OCL_CHECK(err, err = event.getProfilingInfo<uint64_t>(CL_PROFILING_COMMAND_START, &nstimestart));
246
+ OCL_CHECK(err, err = event.getProfilingInfo<uint64_t>(CL_PROFILING_COMMAND_END, &nstimeend));
247
+ exe_time += nstimeend - nstimestart;
248
+
249
+ // Verify the result
250
+ int match = 0;
251
+ // for (int i = 0; i < DATA_SIZE; i++) {
252
+ // int host_result = ptr_a[i] + ptr_b[i];
253
+ // if (ptr_result[i] != host_result) {
254
+ // printf(error_message.c_str(), i, host_result, ptr_result[i]);
255
+ // match = 1;
256
+ // break;
257
+ // }
258
+ // }
259
+ std::cout << "Cycle count: " << cycle[0] << std::endl;
260
+ std::cout << "Latency: " << exe_time << " ns" << std::endl;
261
+
262
+ OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_X_acc0, X_acc0));
263
+ OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_X_acc1, X_acc1));
264
+ OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_W_acc0, W_acc0));
265
+ OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_W_acc1, W_acc1));
266
+ OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_acc0_out, acc0_out));
267
+ // OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_acc1_out, acc1_out));
268
+ OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_cycle, cycle));
269
+ OCL_CHECK(err, err = q.finish());
270
+
271
+ std::cout << "TEST " << (match ? "FAILED" : "PASSED") << std::endl;
272
+ return (match ? EXIT_FAILURE : EXIT_SUCCESS);
273
+ }
gpt-2-medium/host_opencl.h ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*******************************************************************************
2
+ Vendor: Xilinx
3
+ Associated Filename: vadd.h
4
+ Purpose: VITIS vector addition
5
+ Revision History: January 28, 2016
6
+
7
+ *******************************************************************************
8
+ Copyright (C) 2019 XILINX, Inc.
9
+
10
+ This file contains confidential and proprietary information of Xilinx, Inc. and
11
+ is protected under U.S. and international copyright and other intellectual
12
+ property laws.
13
+
14
+ DISCLAIMER
15
+ This disclaimer is not a license and does not grant any rights to the materials
16
+ distributed herewith. Except as otherwise provided in a valid license issued to
17
+ you by Xilinx, and to the maximum extent permitted by applicable law:
18
+ (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND WITH ALL FAULTS, AND XILINX
19
+ HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY,
20
+ INCLUDING BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT, OR
21
+ FITNESS FOR ANY PARTICULAR PURPOSE; and (2) Xilinx shall not be liable (whether
22
+ in contract or tort, including negligence, or under any other theory of
23
+ liability) for any loss or damage of any kind or nature related to, arising under
24
+ or in connection with these materials, including for any direct, or any indirect,
25
+ special, incidental, or consequential loss or damage (including loss of data,
26
+ profits, goodwill, or any type of loss or damage suffered as a result of any
27
+ action brought by a third party) even if such damage or loss was reasonably
28
+ foreseeable or Xilinx had been advised of the possibility of the same.
29
+
30
+ CRITICAL APPLICATIONS
31
+ Xilinx products are not designed or intended to be fail-safe, or for use in any
32
+ application requiring fail-safe performance, such as life-support or safety
33
+ devices or systems, Class III medical devices, nuclear facilities, applications
34
+ related to the deployment of airbags, or any other applications that could lead
35
+ to death, personal injury, or severe property or environmental damage
36
+ (individually and collectively, "Critical Applications"). Customer assumes the
37
+ sole risk and liability of any use of Xilinx products in Critical Applications,
38
+ subject only to applicable laws and regulations governing limitations on product
39
+ liability.
40
+
41
+ THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS PART OF THIS FILE AT
42
+ ALL TIMES.
43
+
44
+ *******************************************************************************/
45
+
46
+ #pragma once
47
+
48
+ #define CL_HPP_CL_1_2_DEFAULT_BUILD
49
+ #define CL_HPP_TARGET_OPENCL_VERSION 120
50
+ #define CL_HPP_MINIMUM_OPENCL_VERSION 120
51
+ #define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1
52
+
53
+ #include <CL/cl2.hpp>
54
+
55
// Minimal allocator returning 4 KiB-aligned storage, as required for
// zero-copy DMA buffers on Xilinx platforms.
template <typename T>
struct aligned_allocator
{
    using value_type = T;

    // Obtain raw storage for `num` objects on a 4096-byte boundary;
    // throws std::bad_alloc when posix_memalign reports failure.
    T* allocate(std::size_t num)
    {
        void* mem = nullptr;
        if (posix_memalign(&mem, 4096, num * sizeof(T)) != 0)
            throw std::bad_alloc();
        return reinterpret_cast<T*>(mem);
    }

    // Return storage previously handed out by allocate().
    void deallocate(T* p, std::size_t num)
    {
        free(p);
    }
};
gpt-2-medium/kernel-ultrascale.cpp ADDED
@@ -0,0 +1,2091 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <cmath>
2
+ #include <string>
3
+ #include <tapa.h>
4
+ #include <ap_int.h>
5
+ #include <hls_math.h>
6
+
7
// ===== Compile-time geometry of the accelerator ============================
// Base model dimensions (GPT-2 medium: hidden size 1024, 16 heads).
constexpr int D = 1024;                    // hidden / embedding dimension
constexpr int D_div_2 = D / 2;
constexpr int D_div_4 = D / 4;
constexpr int D_ffn = 3072;                // FFN inner dimension constant — NOTE(review): unused in derived sizes below; confirm vs D_ffn_SLR * NUM_DUM_SLR
constexpr int N_head = 16;                 // number of attention heads
constexpr int MAX_SEQ_LEN = 1024;          // maximum sequence length supported on-chip
constexpr int MAX_SEQ_LEN_div_2 = MAX_SEQ_LEN / 2;
constexpr int MAX_SEQ_LEN_div_8 = MAX_SEQ_LEN / 8;
// Physical layout: presumably 3 usable SLRs plus one "dummy" slot — TODO confirm.
constexpr int NUM_SLR = 3;
constexpr int NUM_DUM_SLR = 4;
constexpr int TOTAL_PORT = NUM_SLR * 2;    // two memory ports per SLR
// Per-head dimension (1024 / 16 = 64) and its pre-divided variants, used as
// loop bounds in the compute engines.
constexpr int D_head = D / N_head;
constexpr int D_head_div_32 = D_head / 32;
constexpr int D_head_div_16 = D_head / 16;
constexpr int D_head_div_8 = D_head / 8;
constexpr int D_head_div_4 = D_head / 4;
constexpr int D_head_div_2 = D_head / 2;
constexpr int D_div_8 = D / 8;
constexpr int D_div_16 = D / 16;
// FFN slice handled per SLR and derived weight-blob sizes (element counts;
// the streams below pack these as 4-bit weights, 128 per 512-bit word).
constexpr int D_ffn_SLR = 1376;
constexpr int D_ffn_SLR_div_8 = D_ffn_SLR / 8;
constexpr int D_ffn_SLR_div_2 = D_ffn_SLR / 2;
constexpr int FFN_WEIGHT_SIZE = D * D_ffn_SLR * NUM_DUM_SLR;
constexpr int OUT_WEIGHT_SIZE = D * D_head * NUM_DUM_SLR * 5;
constexpr int WEIGHT_D = D * 2;
constexpr int QKV_WEIGHT_SIZE = D * D_head * NUM_DUM_SLR * 15 / 2; // multi-head attention
constexpr int TOTAL_WEIGHT_SIZE = OUT_WEIGHT_SIZE + QKV_WEIGHT_SIZE + FFN_WEIGHT_SIZE;
constexpr int CONTEXT_D = D_head_div_8 * 5;   // context width streamed in stage 3
constexpr int D_head_mul_4 = D_head * 4;
constexpr int D_write_zero_acc0 = D / 32;     // zero-padding beat counts (see write_zero)
constexpr int D_write_zero_acc1 = D / 32 + D / 16;

// SIMD vector aliases used on kernel stream interfaces.
using int_v16 = tapa::vec_t<int, 16>;
using int4_v128 = tapa::vec_t<ap_int<4>, 128>;   // 128 x 4-bit weights = 512 bits
using int8_v64 = tapa::vec_t<ap_int<8>, 64>;     // 64 x 8-bit activations = 512 bits
42
+
43
// Endless stream drain ("black hole"): non-blockingly pops and discards one
// element per cycle forever so an unused producer FIFO can never back up and
// stall the dataflow region.  Inlined into the typed black_hole_* tasks below.
template <typename data_t>
inline void bh(tapa::istream<data_t> & q) {
#pragma HLS inline
    for (;;) {
#pragma HLS pipeline II=1 style=stp
        // try_read is non-blocking; the value is intentionally dropped.
        data_t tmp; q.try_read(tmp);
    }
}
51
+
52
// One instruction word of the 17-step schedule produced by read_inst.
// A header instruction with stage == 7 carries the sequence length L in
// weight_bound; regular instructions carry loop bounds for one stage.
struct ConfigInst {
    ap_uint<3> stage; // stage 7 -> read L; otherwise 0..4 (see read_inst)
    ap_uint<11> weight_bound; // rows of weight to load (or L for the header)
    ap_uint<7> i_bound;  // outer (row-tile) loop bound
    ap_uint<8> j_bound;  // middle (column-tile) loop bound
    ap_uint<8> k_bound;  // reduction-dimension loop bound
};
59
+
60
// TAPA task: permanently drain an unused int stream (see bh above).
void black_hole_int(tapa::istream<int> & fifo_in) {
    bh(fifo_in);
}
63
+
64
// TAPA task: permanently drain an unused ConfigInst stream.
void black_hole_inst(tapa::istream<ConfigInst> & fifo_in) {
    bh(fifo_in);
}
67
+
68
// TAPA task: permanently drain an unused int_v16 stream.
void black_hole_int_v16(tapa::istream<int_v16> & fifo_in) {
    bh(fifo_in);
}
71
+
72
// TAPA task: permanently drain an unused activation (int8_v64) stream.
void black_hole_x(tapa::istream<int8_v64> & fifo_in) {
    bh(fifo_in);
}
75
+
76
// TAPA task: permanently drain an unused weight (int4_v128) stream.
void black_hole_w(tapa::istream<int4_v128> & fifo_in) {
    bh(fifo_in);
}
79
+
80
// TAPA task: permanently drain an unused 512-bit stream.
void black_hole_ap_uint_512(tapa::istream<ap_uint<512>> & fifo_in) {
    bh(fifo_in);
}
83
+
84
// TAPA task: permanently drain an unused 1024-bit stream.
void black_hole_ap_uint_1024(tapa::istream<ap_uint<1024>> & fifo_in) {
    bh(fifo_in);
}
87
+
88
// Stream the whole packed weight blob from device memory into an on-chip
// FIFO, one 512-bit word per beat.  TOTAL_WEIGHT_SIZE counts weight elements;
// >> 7 converts to 512-bit words (128 packed 4-bit weights per word).
// Address issue and data collection share one II=1 loop so requests overlap
// responses; bitwise & (not &&) keeps the condition a single-cycle gate.
void read_W(
    tapa::async_mmap<ap_uint<512>>& vec,
    tapa::ostream<ap_uint<512>>& fifo_out
){
    for(int i_req = 0, i_resp = 0; i_resp < (TOTAL_WEIGHT_SIZE >> 7);){
#pragma HLS pipeline II=1 style=stp
        // Issue the next read address whenever the request queue has room.
        if((i_req < (TOTAL_WEIGHT_SIZE >> 7)) & !vec.read_addr.full()){
            vec.read_addr.write(i_req);
            i_req++;
        }
        // Drain any data beat that has arrived (non-blocking, so address
        // issue never stalls behind the consumer).
        ap_uint<512> tmp_o;
        bool success = vec.read_data.try_read(tmp_o);
        if(success){
            fifo_out.write(tmp_o);
            i_resp++;
        }
    }
}
106
+
107
// Stream N input-activation elements from device memory into an on-chip
// FIFO, one 512-bit word (N >> 6 words total) per beat.  Same overlapped
// request/response pattern as read_W.
// NOTE(review): N >> 6 implies 64 elements per 512-bit word, i.e. 8-bit
// activations — confirm the caller passes N in elements, not bytes.
void read_X(
    const int N,
    tapa::async_mmap<ap_uint<512>>& vec,
    tapa::ostream<ap_uint<512>>& fifo_out
){
    for(int i_req = 0, i_resp = 0; i_resp < (N >> 6);){
#pragma HLS pipeline II=1 style=stp
        // Issue read addresses as long as the request queue accepts them.
        if((i_req < (N >> 6)) & !vec.read_addr.full()){
            vec.read_addr.write(i_req);
            i_req++;
        }
        // Forward each arriving data beat downstream.
        ap_uint<512> tmp_o;
        bool success = vec.read_data.try_read(tmp_o);
        if(success){
            fifo_out.write(tmp_o);
            i_resp++;
        }
    }
}
126
+
127
// Instruction generator for the two accelerator chains (acc0 / acc1).
//
// Emits one header instruction (stage == 7, weight_bound == L) followed by a
// fixed 17-step schedule:
//   stage_i 0..14 : stages 0,1,2 repeated five times
//                   (0 = Q projection, 1 = K projection fed from the peer
//                    chain, 2 = QK^T score computation — presumably one
//                    repetition per head group; TODO confirm mapping)
//   stage_i 15    : stage 3  (attention-output projection + residual path)
//   stage_i 16    : stage 4  (FFN layers)
// Each instruction carries the weight-load row count and the i/j/k loop
// bounds consumed by temporal_acc0* / temporal_acc1* for that stage.
void read_inst(
    const int L,
    tapa::ostream<ConfigInst>& fifo_out_acc0,
    tapa::ostream<ConfigInst>& fifo_out_acc1
){
    // Header: broadcast the sequence length L (packed into weight_bound).
    ConfigInst len;
    len.stage = 7;
    len.weight_bound = L;

    fifo_out_acc0.write(len);
    fifo_out_acc1.write(len);

    for(int stage_i = 0; stage_i < 17; stage_i++){
#pragma HLS pipeline II=1 style=stp

        ConfigInst inst_acc0;
        ConfigInst inst_acc1;
        // 0,1,2 cycling for the first 15 steps, then 3 and 4.
        const int stage = (stage_i < 15) ? (stage_i % 3) : (stage_i - 12);

        inst_acc0.stage = ap_uint<3>(stage);
        inst_acc1.stage = ap_uint<3>(stage);
        if(stage == 0){
            // Q projection: both chains run identical bounds.
            inst_acc0.weight_bound = D_head_div_4;
            inst_acc0.i_bound = (L >> 4);        // L/16 row tiles
            inst_acc0.j_bound = D_head_div_16;
            inst_acc0.k_bound = D_div_8;         // reduce over D in chunks of 8

            inst_acc1 = inst_acc0;
        } else if (stage == 1){
            // K projection (halved j range; the peer chain supplies the rest).
            inst_acc0.weight_bound = D_head_div_8;
            inst_acc0.i_bound = (L >> 4);
            inst_acc0.j_bound = D_head_div_32;
            inst_acc0.k_bound = D_div_8;

            inst_acc1 = inst_acc0;
        } else if (stage == 2){
            // Attention scores: no weight load (weight_bound = 0).
            // acc0 tiles L x L; acc1 reduces over L instead.
            inst_acc0.weight_bound = 0;
            inst_acc0.i_bound = (L >> 4);
            inst_acc0.j_bound = (L >> 4);
            inst_acc0.k_bound = D_head_div_8;

            inst_acc1.weight_bound = 0;
            inst_acc1.i_bound = (L >> 4);
            inst_acc1.j_bound = D_head_div_16;
            inst_acc1.k_bound = (L >> 3);
        } else if (stage == 3){
            // Output projection over the streamed context (width CONTEXT_D).
            inst_acc0.weight_bound = (CONTEXT_D << 1);
            inst_acc0.i_bound = (L >> 5);
            inst_acc0.j_bound = D_div_16;
            inst_acc0.k_bound = CONTEXT_D;

            inst_acc1 = inst_acc0;
        } else {
            // Stage 4 — FFN: acc0 computes the D -> D_ffn_SLR layer,
            // acc1 the D_ffn_SLR -> D layer.
            inst_acc0.weight_bound = (D_ffn_SLR >> 2);
            inst_acc0.i_bound = (L >> 4);
            inst_acc0.j_bound = (D_ffn_SLR >> 4);
            inst_acc0.k_bound = D_div_8;

            inst_acc1.weight_bound = D_div_4;
            inst_acc1.i_bound = (L >> 4);
            inst_acc1.j_bound = D_div_16;
            inst_acc1.k_bound = D_ffn_SLR_div_8;
        }
        fifo_out_acc0.write(inst_acc0);
        fifo_out_acc1.write(inst_acc1);
    }
}
194
+
195
// One-shot broadcast: forwards the sequence length L from the accelerator
// chain to the two SFU consumers (softmax path and GELU path), then exits.
// NOTE(review): despite the "switch" name, no per-packet routing happens here.
void packet_switch_acc(
    tapa::istream<int>& fifo_inst_in,
    tapa::ostream<int>& fifo_sfu_out,
    tapa::ostream<int>& fifo_sfu_gelu
) {
    const int L = fifo_inst_in.read();
    fifo_sfu_out.write(L);
    fifo_sfu_gelu.write(L);
}
204
+
205
// Write N 128-bit words from the result FIFO to device memory through an
// async_mmap, then signal completion on fifo_fin.
// Address, data, and response channels are serviced in one II=1 loop.
void write_mtx(
    const int N,
    tapa::async_mmap<ap_uint<128>>& output_mtx,
    tapa::istream<ap_uint<128>>& fifo_in,
    tapa::ostream<bool>& fifo_fin
){

    for(int i_req = 0, i_resp = 0; i_resp < N;){
#pragma HLS pipeline II=1 style=stp
        // Issue address + data together only when input is available and
        // both output queues have room (bitwise & keeps it one gate).
        if((i_req < N) & !fifo_in.empty() & !output_mtx.write_addr.full() & !output_mtx.write_data.full()){
            output_mtx.write_addr.try_write(i_req);
            ap_uint<128> tmp; fifo_in.try_read(tmp);
            output_mtx.write_data.try_write(tmp);
            ++i_req;
        }
        // Each write response acknowledges a burst of (resp + 1) writes.
        bool success = false;
        auto resp = output_mtx.write_resp.read(success);
        if(success){
            i_resp += unsigned(resp)+1;
        }
    }
    // All N words acknowledged — tell the host-side watcher we are done.
    fifo_fin.write(true);
}
228
+
229
// Feed L * D zero-valued 512-bit words into fifo_zero (used to seed the
// reduction chain with an all-zero partial-sum stream).
// NOTE(review): the parameter named D shadows the file-scope constexpr D;
// callers pass a beat count here (see D_write_zero_acc0/_acc1), not the
// hidden dimension — consider renaming to avoid confusion.
void write_zero(
    const int L,
    const int D,
    tapa::ostream<ap_uint<512>>& fifo_zero
){
    for(int i = 0; i < L * D;){
        // Non-blocking write: only count a beat once it is accepted.
        if(!fifo_zero.full()){
            ap_uint<512> tmp = 0;
            fifo_zero.try_write(tmp);
            i++;
        }
    }
}
242
+
243
// acc slr0 master node
//
// Head-of-chain compute engine for accumulator group 0 on SLR0.  Executes
// the 17-step schedule from read_inst (header carries L):
//   stage 0 : Q = Wq * X            -> stored to scratchpad_q (8-bit)
//   stage 1 : K = Wk * X (half)     -> merged with acc1's half into scratchpad_k
//   stage 2 : scores = K^T * Q      -> fifo_O_out (causal lower triangle only)
//   stage 3 : Wo * context + reduce-chain partial + residual X -> fifo_res_send
//   stage 4 : FFN layer 1 (X * W1)  -> fifo_ffn_out
// As master it also loads X from memory once (stage_i == 0), forwards
// instructions, weights, and activations to the next engine in the chain.
// Arithmetic packs two 4-bit (or 8-bit) weights into one multiplier via
// w_pack = (op2 << 19) + op1 — presumably the DSP int8/int4 packing trick;
// the two products are split back apart in the `reduction` loop.
void temporal_acc0_slr0(
    tapa::istream<ConfigInst>& fifo_inst_in,   // schedule from read_inst
    tapa::ostream<ConfigInst>& fifo_inst_out,  // forwarded to next acc0 engine
    tapa::ostream<int>& fifo_len_sfu,          // L for the SFU
    tapa::istream<ap_uint<512>>& fifo_X_in,    // activations from read_X
    tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
    tapa::istream<ap_uint<512>>& fifo_W_in,    // weights from read_W
    tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
    tapa::istream<ap_uint<256>>& fifo_from_acc1, // K halves from acc1 (stage 1)
    tapa::ostream<ap_uint<512>>& fifo_O_out,   // raw attention scores -> SFU
    tapa::ostream<ap_uint<512>>& fifo_ffn_out, // FFN1 partial sums -> GELU
    tapa::istream<ap_uint<1024>>& fifo_context,// attention context (stage 3)
    tapa::istream<ap_uint<1024>>& fifo_ffn_in, // GELU output (stage 4)
    tapa::istream<ap_uint<512>>& fifo_reduce_recv, // reduce-chain partials
    tapa::ostream<ap_uint<512>>& fifo_res_send // final sums incl. residual
    // tapa::ostream<ap_uint<64>>& fifo_write,
    // tapa::ostream<bool>& fifo_fin
){

    // Per-head Q and K scratchpads, 8-bit packed, partitioned to feed the
    // 16x16 tile engine at II=1.
    ap_uint<64> scratchpad_q[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
#pragma HLS array_partition variable=scratchpad_q cyclic dim=1 factor=16
#pragma HLS array_partition variable=scratchpad_q cyclic dim=2 factor=2
#pragma HLS bind_storage variable=scratchpad_q type=ram_2p impl=bram

    ap_uint<64> scratchpad_k[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
#pragma HLS array_partition variable=scratchpad_k cyclic dim=1 factor=16
#pragma HLS array_partition variable=scratchpad_k cyclic dim=2 factor=2
#pragma HLS bind_storage variable=scratchpad_k type=ram_2p impl=bram

    // Full input activation matrix, kept on-chip for all 17 stages
    // (also supplies the residual in stage 3).
    ap_uint<64> X[MAX_SEQ_LEN][D_div_8]; // 8 bit
#pragma HLS array_partition variable=X cyclic dim=1 factor=16
#pragma HLS array_partition variable=X cyclic dim=2 factor=2
#pragma HLS bind_storage variable=X type=ram_2p impl=uram

    // Header instruction: stage 7 carries L in weight_bound.
    ConfigInst len = fifo_inst_in.read();
    const int L = len.weight_bound;
    fifo_inst_out.write(len);
    fifo_len_sfu.write(L);

    for(int stage_i = 0; stage_i < 17; stage_i++){

        //TODO: stage send from inst

        // stage 0: WqX
        // stage 1: WkX0 <- acc1
        // stage 2: QK^T

        // Per-stage weight tile, double-pumped (two 64-bit halves per row).
        ap_uint<64> W[D_ffn_SLR_div_2][D_div_8]; // TODO: reduce dimension
#pragma HLS array_partition variable=W cyclic dim=1 factor=8
#pragma HLS bind_storage variable=W type=ram_2p impl=uram

        ConfigInst inst = fifo_inst_in.read();
        fifo_inst_out.write(inst);

        const ap_uint<3> stage = inst.stage;

        // load weights and forward: keep the low 128 bits for this engine,
        // shift and pass the rest down the chain.
        if(stage != 2) { // TODO: 1d array & uniform access
            const int weight_bound = inst.weight_bound;
            for(int i = 0; i < weight_bound; i++){
                load_weight:
                for(int j = 0; j < D_div_8;){
                    if(!fifo_W_in.empty()){
                        ap_uint<512> val; fifo_W_in.try_read(val);

                        for(int k = 0; k < 2; k++){
#pragma HLS unroll
                            W[i*2+k][j] = ap_uint<64>(val(k*64+63, k*64));
                        }
                        val = ap_uint<512>(val >> 128);
                        fifo_W_out.write(val);
                        j++;
                    }
                }
            }
        }

        // stage 1: compute Q
        const int i_bound = inst.i_bound;
        const int j_bound = inst.j_bound;
        const int k_bound = inst.k_bound;

        for(int i = 0; i < i_bound; i++){ // make sure L is multiple of 16
            // One-time load of X, overlapped with the very first stage.
            if(stage_i == 0){
                for(int ii = 0; ii < 2; ii++){ // load only 1 time
                    load_x:
                    for(int jj = 0; jj < D_div_8;){
                        if(!fifo_X_in.empty()){
                            ap_uint<512> val; fifo_X_in.try_read(val);

                            for(int k = 0; k < 8; k++){
#pragma HLS unroll
                                X[i*16+ii*8+k][jj] = ap_uint<64>(val(k*64+63, k*64));
                            }
                            jj++;
                        }
                    }
                }
            }

            // Causal attention: in stage 2 only the lower triangle (j <= i)
            // of score tiles is computed; bitwise &/| keep the guard cheap.
            for(int j = 0; (j < j_bound) & ((stage != 2) | (j <= i)); j++){
#pragma HLS loop_flatten off

                // 38-bit accumulators hold a pair of packed partial products.
                ap_int<38> acc_vec[8][16][8];
#pragma HLS array_partition variable=acc_vec dim=1 complete
#pragma HLS array_partition variable=acc_vec dim=2 complete
#pragma HLS array_partition variable=acc_vec dim=3 complete

                for(int ii = 0; ii < 8; ii++){
#pragma HLS unroll
                    for(int kk = 0; kk < 16; kk++){
#pragma HLS unroll
                        for(int k = 0; k < 8; k++){
#pragma HLS unroll
                            acc_vec[ii][kk][k] = 0;
                        }
                    }
                }

                compute:
                for(int k = 0; k < k_bound; k++){ // reduction dim
#pragma HLS pipeline II=1 style=stp

                    ap_uint<64> op1_mtx[16];
                    ap_uint<64> op2_mtx[16];
#pragma HLS array_partition variable=op1_mtx complete
#pragma HLS array_partition variable=op2_mtx complete

                    ap_uint<1024> recv_pkt;

                    // Stage 3/4 activations arrive over dedicated streams.
                    if(stage == 3) {
                        recv_pkt = fifo_context.read();
                    }else if(stage == 4) {
                        recv_pkt = fifo_ffn_in.read();
                    }

                    // Operand select per stage: weights vs. scratchpads.
                    for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                        if(stage > 2){
                            op1_mtx[ii] = ap_uint<64>(W[j*8+ii/2][k]((ii%2)*32+31, (ii%2)*32));
                            op2_mtx[ii] = recv_pkt(ii*64+63, ii*64);
                        } else if(stage == 2) {
                            op1_mtx[ii] = scratchpad_k[j*16+ii][k];
                            op2_mtx[ii] = scratchpad_q[i*16+ii][k];
                        } else {
                            op1_mtx[ii] = ap_uint<64>(W[j*8+ii/2][k]((ii%2)*32+31, (ii%2)*32));
                            op2_mtx[ii] = X[i*16+ii][k];
                        }
                    }

                    // Forward activations down the chain so the other SLR
                    // engines consume the same operands.
                    if(stage < 2){
                        ap_uint<1024> send_pkt = ap_uint<1024>((
                            op2_mtx[0], op2_mtx[1], op2_mtx[2], op2_mtx[3], op2_mtx[4], op2_mtx[5], op2_mtx[6], op2_mtx[7],
                            op2_mtx[8], op2_mtx[9], op2_mtx[10], op2_mtx[11], op2_mtx[12], op2_mtx[13], op2_mtx[14], op2_mtx[15]
                        ));
                        fifo_X_out.write(send_pkt);
                    } else if (stage == 4) {
                        fifo_X_out.write(recv_pkt);
                    }

                    // Packed MAC: two weights share one multiply; products
                    // are separated later in `reduction`.
                    for(int ii = 0; ii < 8; ii++){
#pragma HLS unroll
                        for(int kk = 0; kk < 16; kk++){
#pragma HLS unroll
                            for(int l = 0; l < 8; l++){
#pragma HLS unroll
                                ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
                                op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
                                if(stage == 2){
                                    op1 = ap_int<8>(op1_mtx[l*2](ii*8+7, ii*8));
                                    op2 = ap_int<8>(op1_mtx[l*2+1](ii*8+7, ii*8));
                                } else {
                                    op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
                                    op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
                                }
                                ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
                                acc_vec[ii][kk][l] += w_pack * op3;
                            }
                        }
                    }
                }

                ap_int<22> acc_final[16][16];
#pragma HLS array_partition variable=acc_final dim=1 complete
#pragma HLS array_partition variable=acc_final dim=2 complete

                for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                    for(int k = 0; k < 16; k++){
#pragma HLS unroll
                        acc_final[ii][k] = 0;
                    }
                }

                // Unpack the paired products: low 19 bits -> even column,
                // high 19 bits (carry-corrected by the low sign bit) -> odd.
                reduction:
                for(int kk = 0; kk < 8; kk++){
                    for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                        for(int k = 0; k < 8; k++){
#pragma HLS unroll
                            ap_int<19> res0; ap_int<19> res1;
                            (res1, res0) = acc_vec[kk][ii][k];
                            res1 = res1 + res0[18];
                            acc_final[ii][k*2] += res0;
                            acc_final[ii][k*2+1] += res1;
                        }
                    }
                }

                if(stage == 0){
                    // Requantize Q to 8 bits (>> 8) into the Q scratchpad.
                    for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                        for(int k = 0; k < 16; k++){
#pragma HLS unroll
                            int offset = k%8;
                            scratchpad_q[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k] >> 8);
                        }
                    }
                } else if (stage == 1){
                    // Merge our K half with acc1's half (interleaved columns).
                    for(int ii = 0; ii < 4; ii++){
                        for(int jj = 0; jj < 2; jj++){
#pragma HLS pipeline II=1 style=stp
                            ap_uint<256> tmp = fifo_from_acc1.read();

                            for(int l = 0; l < 4; l++){
#pragma HLS unroll
                                ap_uint<64> tmp_pack;
                                for(int k = 0; k < 8; k++){
#pragma HLS unroll
                                    tmp_pack(k*8+7, k*8) = ap_int<8>(acc_final[ii*4+l][jj*8+k] >> 8);
                                }
                                scratchpad_k[i*16+ii*4+l][j*4+jj*2] = tmp_pack;
                            }
                            for(int l = 0; l < 4; l++){
#pragma HLS unroll
                                scratchpad_k[i*16+ii*4+l][j*4+jj*2+1] = tmp(l*64+63, l*64);
                            }
                        }
                    }
                } else if(stage == 2 || stage == 4){
                    // Emit score / FFN1 tiles; stage 2 applies the causal
                    // mask by injecting a large negative constant.
                    for(int kk = 0; kk < 16; kk++){
#pragma HLS pipeline II=1 style=stp
                        ap_uint<512> tmp;
                        for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                            if(stage == 2 && (i*16+ii < j*16+kk)){
                                tmp(ii*32+31, ii*32) = ap_int<32>(-1e8); // masking (inefficient)
                            } else {
                                tmp(ii*32+31, ii*32) = tapa::bit_cast<ap_uint<32>>(acc_final[ii][kk]);
                            }
                        }
                        if(stage == 2) fifo_O_out.write(tmp);
                        else fifo_ffn_out.write(tmp);
                    }
                } else {
                    // Stage 3: add the reduce-chain partial and the residual
                    // (8-bit X) before sending the final sums onward.
                    final_acc:
                    for(int ii = 0; ii < 16; ii++){
#pragma HLS pipeline II=1 style=stp
#pragma HLS dependence variable=X type=inter false
                        ap_uint<512> tmp_recv = fifo_reduce_recv.read();
                        ap_uint<512> tmp_send;
                        for(int k = 0; k < 16; k++){
#pragma HLS unroll
                            ap_int<32> tmp = acc_final[ii][k] + ap_int<32>(tmp_recv(k*32+31, k*32));
                            tmp += ap_int<8>(X[i*16+ii][j*2+k/8]((k%8)*8+7, (k%8)*8));
                            tmp_send(k*32+31, k*32) = tmp;
                        }
                        fifo_res_send.write(tmp_send);
                    }
                }
            }
        }
    }
    // fifo_fin.write(true);

    // write:
    // for(int i = 0; i < L; i++){
    //     for(int j = 0; j < D_div_8; j++){
    //         #pragma HLS pipeline II=1 style=stp
    //         fifo_write.write(X[i][j]);
    //     }
    // }
}
527
+
528
// Mid-chain worker for accumulator group 0 (non-master copy of
// temporal_acc0_slr0).  Differences from the master:
//   * activations arrive pre-packed (1024-bit) over fifo_X_in from the
//     upstream engine and are forwarded unchanged to fifo_X_out — there is
//     no on-chip X matrix and no residual add;
//   * stage 2 operand roles are mirrored (op1 = Q, op2 = K);
//   * stage 3 participates in the adder-tree reduction: it accumulates the
//     incoming 24-bit partials into acc_final and forwards the running sums
//     via fifo_reduce_send.
//     NOTE(review): acc_final is ap_int<22> while 24-bit slices are added
//     and stored back — confirm the extra bits can never be set here.
void temporal_acc0(
    tapa::istream<ConfigInst>& fifo_inst_in,   // schedule from upstream engine
    tapa::ostream<ConfigInst>& fifo_inst_out,  // forwarded down the chain
    tapa::ostream<int>& fifo_len_sfu,          // L for the SFU
    tapa::istream<ap_uint<1024>>& fifo_X_in,   // activations from upstream
    tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
    tapa::istream<ap_uint<512>>& fifo_W_in,    // weights from upstream
    tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
    tapa::istream<ap_uint<256>>& fifo_from_acc1, // K halves from paired acc1
    tapa::ostream<ap_uint<512>>& fifo_O_out,   // attention scores -> SFU
    tapa::istream<ap_uint<1024>>& fifo_context,// context (stage 3 input)
    tapa::ostream<ap_uint<512>>& fifo_ffn_out, // FFN1 partial sums
    tapa::istream<ap_uint<512>>& fifo_reduce_recv, // reduce-chain input
    tapa::ostream<ap_uint<512>>& fifo_reduce_send  // reduce-chain output
){

    ap_uint<64> scratchpad_q[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
#pragma HLS array_partition variable=scratchpad_q cyclic dim=1 factor=16
#pragma HLS array_partition variable=scratchpad_q cyclic dim=2 factor=2
#pragma HLS bind_storage variable=scratchpad_q type=ram_2p impl=bram

    ap_uint<64> scratchpad_k[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
#pragma HLS array_partition variable=scratchpad_k cyclic dim=1 factor=16
#pragma HLS array_partition variable=scratchpad_k cyclic dim=2 factor=2
#pragma HLS bind_storage variable=scratchpad_k type=ram_2p impl=bram

    // Header instruction: stage 7 carries L in weight_bound.
    ConfigInst len = fifo_inst_in.read();
    const int L = len.weight_bound;
    fifo_inst_out.write(len);
    fifo_len_sfu.write(L);

    for(int stage_i = 0; stage_i < 17; stage_i++){
#pragma HLS loop_flatten off

        // stage 0: WqX
        // stage 1: WkX0 <- acc1
        // stage 2: QK^T
        // stage 3: WoO

        ap_uint<64> W[D_ffn_SLR_div_2][D_div_8]; // 4 bit
#pragma HLS array_partition variable=W cyclic dim=1 factor=8
#pragma HLS bind_storage variable=W type=ram_2p impl=uram

        ConfigInst inst = fifo_inst_in.read();
        fifo_inst_out.write(inst);

        const ap_uint<3> stage = inst.stage;

        // load weights and forward (keep low 128 bits, pass the rest on).
        if(stage != 2) {
            const int weight_bound = inst.weight_bound;
            for(int i = 0; i < weight_bound; i++){
                load_weight:
                for(int j = 0; j < D_div_8;){
                    if(!fifo_W_in.empty()){
                        ap_uint<512> val; fifo_W_in.try_read(val);

                        for(int k = 0; k < 2; k++){
#pragma HLS unroll
                            W[i*2+k][j] = ap_uint<64>(val(k*64+63, k*64));
                        }
                        val = ap_uint<512>(val >> 128);
                        fifo_W_out.write(val);
                        j++;
                    }
                }
            }
        }

        const int i_bound = inst.i_bound;
        const int j_bound = inst.j_bound;
        const int k_bound = inst.k_bound;

        // stage 1: compute Q
        for(int i = 0; i < i_bound; i++){ // make sure L is multiple of 64

            // Causal guard identical to the master engine.
            for(int j = 0; (j < j_bound) & ((stage != 2) | (j <= i)); j++){
#pragma HLS loop_flatten off

                ap_int<38> acc_vec[8][16][8];
#pragma HLS array_partition variable=acc_vec dim=1 complete
#pragma HLS array_partition variable=acc_vec dim=2 complete
#pragma HLS array_partition variable=acc_vec dim=3 complete

                for(int ii = 0; ii < 8; ii++){
#pragma HLS unroll
                    for(int kk = 0; kk < 16; kk++){
#pragma HLS unroll
                        for(int k = 0; k < 8; k++){
#pragma HLS unroll
                            acc_vec[ii][kk][k] = 0;
                        }
                    }
                }

                compute:
                for(int k = 0; k < k_bound; k++){ // reduction dim
#pragma HLS pipeline II=1 style=stp

                    ap_uint<64> op1_mtx[16];
                    ap_uint<64> op2_mtx[16];
#pragma HLS array_partition variable=op1_mtx complete
#pragma HLS array_partition variable=op2_mtx complete

                    // Non-stage-2 activations come from the chain and are
                    // relayed downstream immediately.
                    ap_uint<1024> recv_pkt;
                    if(stage == 3){
                        recv_pkt = fifo_context.read();
                    } else if(stage != 2) {
                        recv_pkt = fifo_X_in.read();
                        fifo_X_out.write(recv_pkt);
                    }

                    for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                        if(stage == 2) {
                            op1_mtx[ii] = scratchpad_q[i*16+ii][k];
                            op2_mtx[ii] = scratchpad_k[j*16+ii][k];
                        } else {
                            op1_mtx[ii] = ap_uint<64>(W[j*8+ii/2][k]((ii%2)*32+31, (ii%2)*32));
                            op2_mtx[ii] = ap_uint<64>(recv_pkt(ii*64+63, ii*64));
                        }
                    }

                    // Packed dual-weight MAC, same trick as the master engine.
                    for(int ii = 0; ii < 8; ii++){
#pragma HLS unroll
                        for(int kk = 0; kk < 16; kk++){
#pragma HLS unroll
                            for(int l = 0; l < 8; l++){
#pragma HLS unroll
                                ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
                                op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
                                if(stage == 2){
                                    op1 = ap_int<8>(op1_mtx[l*2](ii*8+7, ii*8));
                                    op2 = ap_int<8>(op1_mtx[l*2+1](ii*8+7, ii*8));
                                } else {
                                    op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
                                    op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
                                }
                                ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
                                acc_vec[ii][kk][l] += w_pack * op3;
                            }
                        }
                    }
                }

                ap_int<22> acc_final[16][16];
#pragma HLS array_partition variable=acc_final dim=1 complete
#pragma HLS array_partition variable=acc_final dim=2 complete

                for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                    for(int k = 0; k < 16; k++){
#pragma HLS unroll
                        acc_final[ii][k] = 0;
                    }
                }

                // Separate the packed products (low/high 19-bit halves).
                reduction:
                for(int kk = 0; kk < 8; kk++){
                    for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                        for(int k = 0; k < 8; k++){
#pragma HLS unroll
                            ap_int<19> res0; ap_int<19> res1;
                            (res1, res0) = acc_vec[kk][ii][k];
                            res1 = res1 + res0[18];
                            acc_final[ii][k*2] += res0;
                            acc_final[ii][k*2+1] += res1;
                        }
                    }
                }

                if(stage == 0){
                    // Requantize Q to 8 bits into the Q scratchpad.
                    for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                        for(int k = 0; k < 16; k++){
#pragma HLS unroll
                            int offset = k%8;
                            scratchpad_q[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k] >> 8);
                        }
                    }
                } else if (stage == 1){
                    // Interleave our K half with the paired acc1's half.
                    for(int ii = 0; ii < 4; ii++){
                        for(int jj = 0; jj < 2; jj++){
#pragma HLS pipeline II=1 style=stp
                            ap_uint<256> tmp = fifo_from_acc1.read();

                            for(int l = 0; l < 4; l++){
#pragma HLS unroll
                                ap_uint<64> tmp_pack;
                                for(int k = 0; k < 8; k++){
#pragma HLS unroll
                                    tmp_pack(k*8+7, k*8) = ap_int<8>(acc_final[ii*4+l][jj*8+k] >> 8);
                                }
                                scratchpad_k[i*16+ii*4+l][j*4+jj*2] = tmp_pack;
                            }
                            for(int l = 0; l < 4; l++){
#pragma HLS unroll
                                scratchpad_k[i*16+ii*4+l][j*4+jj*2+1] = tmp(l*64+63, l*64);
                            }
                        }
                    }
                } else if(stage == 2 || stage == 4){
                    // Emit score / FFN tiles; stage 2 applies the causal mask.
                    for(int kk = 0; kk < 16; kk++){
#pragma HLS pipeline II=1 style=stp
                        ap_uint<512> tmp;
                        for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                            if(stage == 2 && (i*16+ii < j*16+kk)){
                                tmp(ii*32+31, ii*32) = ap_int<32>(-1e8); // masking (inefficient)
                            } else {
                                tmp(ii*32+31, ii*32) = tapa::bit_cast<ap_uint<32>>(acc_final[ii][kk]);
                            }
                        }
                        if(stage == 2) fifo_O_out.write(tmp);
                        else fifo_ffn_out.write(tmp);
                    }
                } else {
                    // Stage 3: fold the incoming 24-bit partials into our
                    // sums and pass the running totals down the reduce chain.
                    final_acc:
                    for(int ii = 0; ii < 16; ii++){
#pragma HLS pipeline II=1 style=stp
                        ap_uint<512> tmp_recv = fifo_reduce_recv.read();
                        ap_uint<512> tmp;
                        for(int k = 0; k < 16; k++){
#pragma HLS unroll
                            acc_final[ii][k] += ap_int<24>(tmp_recv(k*32+23, k*32));
                            tmp(k*32+23, k*32) = acc_final[ii][k];
                        }
                        fifo_reduce_send.write(tmp);
                    }
                }
            }
        }
    }
}
763
+
764
+ // acc slr0 master node
765
+ void temporal_acc1_slr0(
766
+ tapa::istream<ConfigInst>& fifo_inst_in,
767
+ tapa::ostream<ConfigInst>& fifo_inst_out,
768
+ tapa::ostream<int>& fifo_len_context,
769
+ tapa::istream<ap_uint<512>>& fifo_X_in,
770
+ tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
771
+ tapa::istream<ap_uint<512>>& fifo_W_in,
772
+ tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
773
+ tapa::ostream<ap_uint<256>>& fifo_to_acc0,
774
+ tapa::istream<ap_uint<128>>& fifo_from_sfu,
775
+ tapa::ostream<ap_uint<1024>>& fifo_O_out,
776
+ tapa::istream<ap_uint<1024>>& fifo_context,
777
+ tapa::istream<ap_uint<512>>& fifo_reduce_recv,
778
+ tapa::ostream<ap_uint<512>>& fifo_res_send,
779
+ tapa::istream<ap_uint<1024>>& fifo_gelu_in,
780
+ tapa::ostream<ap_uint<512>>& fifo_ffn_out
781
+ // tapa::ostream<ap_uint<64>>& fifo_write,
782
+ // tapa::ostream<bool>& fifo_fin
783
+ ){
784
+ ap_uint<64> X[MAX_SEQ_LEN][D_div_8]; // 8 bit
785
+ #pragma HLS array_partition variable=X cyclic dim=1 factor=16
786
+ #pragma HLS array_partition variable=X cyclic dim=2 factor=2
787
+ #pragma HLS bind_storage variable=X type=ram_2p impl=uram
788
+
789
+ ap_uint<64> scratchpad[MAX_SEQ_LEN_div_8][D_head]; // 8 bit
790
+ #pragma HLS array_partition variable=scratchpad cyclic dim=1 factor=2
791
+ #pragma HLS array_partition variable=scratchpad cyclic dim=2 factor=16
792
+ #pragma HLS bind_storage variable=scratchpad type=ram_2p impl=bram
793
+
794
+ // ap_uint<64> scratchpad_out[MAX_SEQ_LEN][D_head_div_8];
795
+ // #pragma HLS array_partition variable=scratchpad_out cyclic dim=1 factor=16
796
+ // #pragma HLS array_partition variable=scratchpad_out cyclic dim=2 factor=2
797
+
798
+ ConfigInst len = fifo_inst_in.read();
799
+ const int L = len.weight_bound;
800
+ fifo_inst_out.write(len);
801
+ fifo_len_context.write(L);
802
+
803
+ for(int stage_i = 0; stage_i < 17; stage_i++){
804
+
805
+ // stage 0: WvX
806
+ // stage 1: WkX1 -> acc0
807
+ // stage 2: Softmax(QK)V <- acc0
808
+ // stage 3: WoO
809
+
810
+ ap_uint<64> W[D_div_2][D_ffn_SLR_div_8]; // 4 bit
811
+ #pragma HLS array_partition variable=W cyclic dim=1 factor=8
812
+ #pragma HLS bind_storage variable=W type=ram_2p impl=uram
813
+
814
+
815
+ ConfigInst inst = fifo_inst_in.read();
816
+ fifo_inst_out.write(inst);
817
+
818
+ const ap_uint<3> stage = inst.stage;
819
+
820
+ // load weights and forward
821
+ if(stage != 2) {
822
+ const int weight_bound = inst.weight_bound;
823
+ int sub_bound = D_div_8;
824
+ if (stage == 4) sub_bound = D_ffn_SLR_div_8;
825
+ for(int i = 0; i < weight_bound; i++){
826
+ load_weight:
827
+ for(int j = 0; j < sub_bound;){
828
+ if(!fifo_W_in.empty()){
829
+ ap_uint<512> val; fifo_W_in.try_read(val);
830
+
831
+ for(int k = 0; k < 2; k++){
832
+ #pragma HLS unroll
833
+ W[i*2+k][j] = ap_uint<64>(val(k*64+63, k*64));
834
+ }
835
+ val = ap_uint<512>(val >> 128);
836
+ fifo_W_out.write(val);
837
+ j++;
838
+ }
839
+ }
840
+ }
841
+ }
842
+
843
+ const int i_bound = inst.i_bound;
844
+ const int j_bound = inst.j_bound;
845
+
846
+ for(int i = 0; i < i_bound; i++){ // make sure L is multiple of 4
847
+
848
+ const int k_bound = (stage == 2) ? ap_uint<8>((i+1)*2) : inst.k_bound;
849
+
850
+ ap_uint<32> cache_attn[MAX_SEQ_LEN_div_8][16];
851
+ #pragma HLS array_partition variable=cache_attn dim=2 complete
852
+ #pragma HLS array_partition variable=cache_attn dim=1 cyclic factor=2
853
+
854
+ if(stage_i == 0){
855
+ for(int ii = 0; ii < 2; ii++){ // load only 1 time
856
+ load_x:
857
+ for(int jj = 0; jj < D_div_8;){
858
+ if(!fifo_X_in.empty()){
859
+ ap_uint<512> val; fifo_X_in.try_read(val);
860
+
861
+ for(int k = 0; k < 8; k++){
862
+ #pragma HLS unroll
863
+ X[i*16+ii*8+k][jj] = ap_uint<64>(val(k*64+63, k*64));
864
+ }
865
+ jj++;
866
+ }
867
+ }
868
+ }
869
+ } else if (stage == 2) {
870
+ for(int ii = 0; ii < ((i+1)*2); ii++){
871
+ ap_uint<32> fuse_reg[16];
872
+ load_attn:
873
+ for(int offset = 0; offset < 8;){
874
+ #pragma HLS pipeline II=1 style=stp
875
+ if(!fifo_from_sfu.empty()){
876
+ ap_uint<128> val; fifo_from_sfu.try_read(val);
877
+ for(int k = 0; k < 16; k++){
878
+ #pragma HLS unroll
879
+ fuse_reg[k](offset*4+3, offset*4) = ap_int<8>(val(k*8+3, k*8));
880
+ }
881
+ offset++;
882
+ }
883
+ }
884
+ for(int k = 0; k < 16; k++){
885
+ #pragma HLS unroll
886
+ cache_attn[ii][k] = fuse_reg[k];
887
+ }
888
+ }
889
+ }
890
+
891
+ for(int j = 0; j < j_bound; j++){
892
+ #pragma HLS loop_flatten off
893
+
894
+ ap_int<38> acc_vec[8][16][8];
895
+ #pragma HLS array_partition variable=acc_vec dim=1 complete
896
+ #pragma HLS array_partition variable=acc_vec dim=2 complete
897
+ #pragma HLS array_partition variable=acc_vec dim=3 complete
898
+
899
+ for(int ii = 0; ii < 8; ii++){
900
+ #pragma HLS unroll
901
+ for(int kk = 0; kk < 16; kk++){
902
+ #pragma HLS unroll
903
+ for(int k = 0; k < 8; k++){
904
+ #pragma HLS unroll
905
+ acc_vec[ii][kk][k] = 0;
906
+ }
907
+ }
908
+ }
909
+
910
+ compute:
911
+ for(int k = 0; k < k_bound; k++){
912
+ #pragma HLS pipeline II=1 style=stp
913
+
914
+ ap_uint<64> op1_mtx[16];
915
+ ap_uint<64> op2_mtx[16];
916
+ #pragma HLS array_partition variable=op1_mtx complete
917
+ #pragma HLS array_partition variable=op2_mtx complete
918
+
919
+ ap_uint<1024> recv_pkt;
920
+
921
+ if(stage == 3) {
922
+ recv_pkt = fifo_context.read();
923
+ } else if(stage == 4) {
924
+ recv_pkt = fifo_gelu_in.read();
925
+ }
926
+
927
+ for(int ii = 0; ii < 16; ii++){
928
+ #pragma HLS unroll
929
+ if(stage == 3){
930
+ op1_mtx[ii] = ap_uint<64>(W[j*8+ii/2][k]((ii%2)*32+31, (ii%2)*32));
931
+ op2_mtx[ii] = recv_pkt(ii*64+63, ii*64);
932
+ } else if(stage != 2) {
933
+ op1_mtx[ii] = ap_uint<64>(W[j*8+ii/2][k]((ii%2)*32+31, (ii%2)*32));
934
+ op2_mtx[ii] = X[i*16+ii][k];
935
+ } else {
936
+ op1_mtx[ii] = ap_uint<64>(cache_attn[k][ii]);
937
+ op2_mtx[ii] = scratchpad[k][j*16+ii];
938
+ }
939
+ }
940
+
941
+ if(stage < 2){
942
+ ap_uint<1024> send_pkt = ap_uint<1024>((
943
+ op2_mtx[0], op2_mtx[1], op2_mtx[2], op2_mtx[3], op2_mtx[4], op2_mtx[5], op2_mtx[6], op2_mtx[7],
944
+ op2_mtx[8], op2_mtx[9], op2_mtx[10], op2_mtx[11], op2_mtx[12], op2_mtx[13], op2_mtx[14], op2_mtx[15]
945
+ ));
946
+ fifo_X_out.write(send_pkt);
947
+ }
948
+
949
+ for(int ii = 0; ii < 8; ii++){
950
+ #pragma HLS unroll
951
+ for(int kk = 0; kk < 16; kk++){
952
+ #pragma HLS unroll
953
+ for(int l = 0; l < 8; l++){
954
+ #pragma HLS unroll
955
+ ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
956
+ op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
957
+ op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
958
+ op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
959
+ ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
960
+ acc_vec[ii][kk][l] += w_pack * op3;
961
+ }
962
+ }
963
+ }
964
+ }
965
+
966
+ ap_int<22> acc_final[16][16];
967
+ #pragma HLS array_partition variable=acc_final dim=1 complete
968
+ #pragma HLS array_partition variable=acc_final dim=2 complete
969
+
970
+ for(int ii = 0; ii < 16; ii++){
971
+ #pragma HLS unroll
972
+ for(int k = 0; k < 16; k++){
973
+ #pragma HLS unroll
974
+ acc_final[ii][k] = 0;
975
+ }
976
+ }
977
+
978
+ reduction:
979
+ for(int kk = 0; kk < 8; kk++){
980
+ for(int ii = 0; ii < 16; ii++){
981
+ #pragma HLS unroll
982
+ for(int k = 0; k < 8; k++){
983
+ #pragma HLS unroll
984
+ ap_int<19> res0; ap_int<19> res1;
985
+ (res1, res0) = acc_vec[kk][ii][k];
986
+ res1 = res1 + res0[18];
987
+ acc_final[ii][k*2] += res0;
988
+ acc_final[ii][k*2+1] += res1;
989
+ }
990
+ }
991
+ }
992
+
993
+ if(stage == 0){
994
+ for(int ii = 0; ii < 16; ii++){
995
+ #pragma HLS unroll
996
+ for(int k = 0; k < 16; k++){
997
+ #pragma HLS unroll
998
+ int offset = ii%8;
999
+ scratchpad[i*2+ii/8][j*16+k](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k] >> 8);
1000
+ }
1001
+ }
1002
+ } else if (stage == 2){
1003
+ for(int ii = 0; ii < 2; ii++){
1004
+ #pragma HLS pipeline II=1 style=stp
1005
+ ap_uint<1024> tmp;
1006
+ for(int jj = 0; jj < 8; jj++){
1007
+ #pragma HLS unroll
1008
+ for(int k = 0; k < 16; k++){
1009
+ #pragma HLS unroll
1010
+ tmp((jj*16+k)*8+7, (jj*16+k)*8) = ap_int<8>(acc_final[ii*8+jj][k] >> 13);
1011
+ }
1012
+ }
1013
+ fifo_O_out.write(tmp);
1014
+ }
1015
+ } else if (stage == 1) {
1016
+ for(int ii = 0; ii < 4; ii++){
1017
+ for(int jj = 0; jj < 2; jj++){
1018
+ ap_uint<256> tmp;
1019
+ for(int k = 0; k < 32; k++){
1020
+ #pragma HLS unroll
1021
+ tmp(k*8+7, k*8) = ap_int<8>(acc_final[ii*4+k/8][jj*8+k%8] >> 8);
1022
+ }
1023
+ fifo_to_acc0.write(tmp);
1024
+ }
1025
+ }
1026
+ } else {
1027
+ final_acc:
1028
+ for(int ii = 0; ii < 16; ii++){
1029
+ #pragma HLS pipeline II=1 style=stp
1030
+ #pragma HLS dependence variable=X type=inter false
1031
+ ap_uint<512> tmp_recv = fifo_reduce_recv.read();
1032
+ ap_uint<512> tmp_send;
1033
+ for(int k = 0; k < 16; k++){
1034
+ #pragma HLS unroll
1035
+ ap_int<32> tmp = acc_final[ii][k] + ap_int<24>(tmp_recv(k*32+23, k*32));
1036
+ if(stage == 3) tmp += ap_int<8>(X[i*16+ii][j*2+k/8]((k%8)*8+7, (k%8)*8));
1037
+ tmp_send(k*32+31, k*32) = tmp;
1038
+ }
1039
+ if(stage == 3) fifo_res_send.write(tmp_send);
1040
+ else fifo_ffn_out.write(tmp_send);
1041
+ }
1042
+ }
1043
+ }
1044
+ }
1045
+ }
1046
+ }
1047
+
1048
+ void residual(
1049
+ const int L,
1050
+ tapa::istream<ap_uint<512>>& fifo_res_in,
1051
+ tapa::ostream<ap_uint<512>>& fifo_res_out
1052
+ ){
1053
+ for(int i = 0; i < (L >> 5); i++){
1054
+ for(int j = 0; j < D_div_16; j++){
1055
+ ap_uint<32> res_buffer[16][16];
1056
+ #pragma HLS array_partition variable=res_buffer complete dim=1
1057
+ #pragma HLS array_partition variable=res_buffer complete dim=2
1058
+
1059
+ read:
1060
+ for(int k = 0; k < 16;){
1061
+ #pragma HLS pipeline II=1 style=stp
1062
+ ap_uint<512> tmp;
1063
+ bool success = fifo_res_in.try_read(tmp);
1064
+ if(success){
1065
+ for(int l = 0; l < 16; l++){
1066
+ #pragma HLS unroll
1067
+ res_buffer[k][l] = ap_uint<32>(tmp(l*32+31, l*32));
1068
+ }
1069
+ k++;
1070
+ }
1071
+ }
1072
+ transpose:
1073
+ for(int k = 0; k < 16; k++){
1074
+ #pragma HLS pipeline II=1 style=stp
1075
+ ap_uint<512> tmp;
1076
+ for(int l = 0; l < 16; l++){
1077
+ #pragma HLS unroll
1078
+ tmp(l*32+31, l*32) = ap_uint<32>(res_buffer[l][k]);
1079
+ }
1080
+ fifo_res_out.write(tmp);
1081
+ }
1082
+ }
1083
+ }
1084
+ }
1085
+
1086
+
1087
// One temporal accelerator lane (acc1 side) of a decoder layer.
// Executes 17 staged passes, each configured by a ConfigInst read from
// fifo_inst_in and forwarded downstream on fifo_inst_out. Stage meaning
// (0-3 per the original comment; 4 inferred from the fifo it reads —
// confirm against the instruction generator):
//   stage 0: Wv*X      -> value rows cached in `scratchpad`
//   stage 1: Wk*X      -> int8 partials forwarded to acc0 (fifo_to_acc0)
//   stage 2: Softmax(QK)*V, 4-bit attention weights from the SFU
//   stage 3: Wo*O, context rows arriving on fifo_context
//   stage 4: FFN matmul consuming GELU activations (fifo_gelu_in)
// Weights are 4-bit packed in URAM; activations are 8-bit packed.
void temporal_acc1(
    tapa::istream<ConfigInst>& fifo_inst_in,
    tapa::ostream<ConfigInst>& fifo_inst_out,
    tapa::ostream<int>& fifo_len_context,   // broadcasts sequence length L
    tapa::istream<ap_uint<1024>>& fifo_X_in,
    tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
    tapa::istream<ap_uint<512>>& fifo_W_in,
    tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
    tapa::ostream<ap_uint<256>>& fifo_to_acc0,
    tapa::istream<ap_uint<128>>& fifo_from_sfu,
    tapa::ostream<ap_uint<1024>>& fifo_O_out,
    tapa::istream<ap_uint<1024>>& fifo_context,
    tapa::istream<ap_uint<512>>& fifo_reduce_recv,
    tapa::ostream<ap_uint<512>>& fifo_reduce_send,
    tapa::istream<ap_uint<1024>>& fifo_gelu_in
){

    // V rows produced in stage 0 and re-read in stage 2 (8-bit elements
    // packed 8-per-word).
    ap_uint<64> scratchpad[MAX_SEQ_LEN_div_8][D_head]; // 8 bit
    #pragma HLS array_partition variable=scratchpad cyclic dim=1 factor=2
    #pragma HLS array_partition variable=scratchpad cyclic dim=2 factor=16
    #pragma HLS bind_storage variable=scratchpad type=ram_2p impl=bram

    // ap_uint<64> scratchpad_out[MAX_SEQ_LEN][D_head_div_8];
    // #pragma HLS array_partition variable=scratchpad_out cyclic dim=1 factor=16
    // #pragma HLS array_partition variable=scratchpad_out cyclic dim=2 factor=2

    // First ConfigInst carries the sequence length in weight_bound.
    ConfigInst len = fifo_inst_in.read();
    const int L = len.weight_bound;
    fifo_inst_out.write(len);
    fifo_len_context.write(L);

    for(int stage_i = 0; stage_i < 17; stage_i++){

        // stage 0: WvX
        // stage 1: WkX1 -> acc0
        // stage 2: Softmax(QK)V <- acc0
        // stage 3: WoO

        // Per-pass weight tile; two 4-bit weights per byte, stored in URAM.
        ap_uint<64> W[D_div_2][D_ffn_SLR_div_8]; // 4 bit
        #pragma HLS array_partition variable=W cyclic dim=1 factor=8
        #pragma HLS bind_storage variable=W type=ram_2p impl=uram

        ConfigInst inst = fifo_inst_in.read();
        fifo_inst_out.write(inst);

        const ap_uint<3> stage = inst.stage;

        // load weights and forward
        if(stage != 2) {
            const int weight_bound = inst.weight_bound;
            int sub_bound = D_div_8;
            // FFN stage uses the wider per-SLR hidden dimension.
            if (stage == 4) sub_bound = D_ffn_SLR_div_8;
            for(int i = 0; i < weight_bound; i++){
                load_weight:
                for(int j = 0; j < sub_bound;){
                    if(!fifo_W_in.empty()){
                        ap_uint<512> val; fifo_W_in.try_read(val);

                        // Keep the low 128 bits locally; shift and pass the
                        // rest to the next accelerator in the chain.
                        for(int k = 0; k < 2; k++){
                            #pragma HLS unroll
                            W[i*2+k][j] = ap_uint<64>(val(k*64+63, k*64));
                        }
                        val = ap_uint<512>(val >> 128);
                        fifo_W_out.write(val);
                        j++;
                    }
                }
            }
        }

        const int i_bound = inst.i_bound;
        const int j_bound = inst.j_bound;

        for(int i = 0; i < i_bound; i++){ // make sure L is multiple of 4

            // Stage 2 is causal attention: only (i+1)*2 key blocks are valid.
            const int k_bound = (stage == 2) ? ap_uint<8>((i+1)*2) : inst.k_bound;

            // Softmax weights for the current query rows, 4-bit packed.
            ap_uint<32> cache_attn[MAX_SEQ_LEN_div_8][16];
            #pragma HLS array_partition variable=cache_attn dim=2 complete
            #pragma HLS array_partition variable=cache_attn dim=1 cyclic factor=2

            if(stage == 2){
                for(int ii = 0; ii < ((i+1)*2); ii++){
                    ap_uint<32> fuse_reg[16];
                    load_attn:
                    for(int offset = 0; offset < 8;){
                        #pragma HLS pipeline II=1 style=stp
                        if(!fifo_from_sfu.empty()){
                            ap_uint<128> val; fifo_from_sfu.try_read(val);
                            // Keep only the low 4 bits of each 8-bit lane.
                            for(int k = 0; k < 16; k++){
                                #pragma HLS unroll
                                fuse_reg[k](offset*4+3, offset*4) = ap_int<8>(val(k*8+3, k*8));
                            }
                            offset++;
                        }
                    }
                    for(int k = 0; k < 16; k++){
                        #pragma HLS unroll
                        cache_attn[ii][k] = fuse_reg[k];
                    }
                }
            }

            for(int j = 0; j < j_bound; j++){
                #pragma HLS loop_flatten off

                // Each 38-bit accumulator packs TWO 19-bit partial products
                // (low: op1*op3, high: op2*op3) — see `w_pack` below.
                ap_int<38> acc_vec[8][16][8];
                #pragma HLS array_partition variable=acc_vec dim=1 complete
                #pragma HLS array_partition variable=acc_vec dim=2 complete
                #pragma HLS array_partition variable=acc_vec dim=3 complete

                for(int ii = 0; ii < 8; ii++){
                    #pragma HLS unroll
                    for(int kk = 0; kk < 16; kk++){
                        #pragma HLS unroll
                        for(int k = 0; k < 8; k++){
                            #pragma HLS unroll
                            acc_vec[ii][kk][k] = 0;
                        }
                    }
                }
                compute:
                for(int k = 0; k < k_bound; k++){
                    #pragma HLS pipeline II=1 style=stp

                    ap_uint<64> op1_mtx[16];
                    ap_uint<64> op2_mtx[16];
                    #pragma HLS array_partition variable=op1_mtx complete
                    #pragma HLS array_partition variable=op2_mtx complete

                    ap_uint<1024> recv_pkt;

                    // Select the activation source by stage; for plain
                    // matmul stages also forward X to the next lane.
                    if(stage == 3) {
                        recv_pkt = fifo_context.read();
                    }else if(stage == 4) {
                        recv_pkt = fifo_gelu_in.read();
                    }else if(stage != 2) {
                        recv_pkt = fifo_X_in.read();
                        fifo_X_out.write(recv_pkt);
                    }

                    for(int ii = 0; ii < 16; ii++){ //TODO: change logic
                        #pragma HLS unroll
                        if (stage != 2) {
                            op1_mtx[ii] = ap_uint<64>(W[j*8+ii/2][k]((ii%2)*32+31, (ii%2)*32));
                            op2_mtx[ii] = recv_pkt(ii*64+63, ii*64);
                        } else {
                            // Attention: weights come from the SFU cache and
                            // operands from the locally stored V rows.
                            op1_mtx[ii] = ap_uint<64>(cache_attn[k][ii]);
                            op2_mtx[ii] = scratchpad[k][j*16+ii];
                        }
                    }

                    for(int ii = 0; ii < 8; ii++){
                        #pragma HLS unroll
                        for(int kk = 0; kk < 16; kk++){
                            #pragma HLS unroll
                            for(int l = 0; l < 8; l++){
                                #pragma HLS unroll
                                ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
                                op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
                                op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
                                op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
                                // Pack two 4-bit weights into one 27-bit
                                // operand (op2 shifted up 19 bits) so one
                                // multiply yields both partial products.
                                ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
                                acc_vec[ii][kk][l] += w_pack * op3;
                            }
                        }
                    }
                }

                ap_int<22> acc_final[16][16];
                #pragma HLS array_partition variable=acc_final dim=1 complete
                #pragma HLS array_partition variable=acc_final dim=2 complete

                for(int ii = 0; ii < 16; ii++){
                    #pragma HLS unroll
                    for(int k = 0; k < 16; k++){
                        #pragma HLS unroll
                        acc_final[ii][k] = 0;
                    }
                }

                reduction:
                for(int kk = 0; kk < 8; kk++){
                    for(int ii = 0; ii < 16; ii++){
                        #pragma HLS unroll
                        for(int k = 0; k < 8; k++){
                            #pragma HLS unroll
                            // Unpack the two 19-bit halves of the packed
                            // accumulator; the high half absorbs the borrow
                            // from the low half's sign bit.
                            ap_int<19> res0; ap_int<19> res1;
                            (res1, res0) = acc_vec[kk][ii][k];
                            res1 = res1 + res0[18];
                            acc_final[ii][k*2] += res0;
                            acc_final[ii][k*2+1] += res1;
                        }
                    }
                }

                if(stage == 0){
                    // Requantize to int8 (>> 8) and store V rows locally.
                    for(int ii = 0; ii < 16; ii++){
                        #pragma HLS unroll
                        for(int k = 0; k < 16; k++){
                            #pragma HLS unroll
                            int offset = ii%8;
                            scratchpad[i*2+ii/8][j*16+k](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k] >> 8);
                        }
                    }
                } else if (stage == 2){
                    // Attention output: requantize (>> 13) and stream out.
                    for(int ii = 0; ii < 2; ii++){
                        #pragma HLS pipeline II=1 style=stp
                        ap_uint<1024> tmp;
                        for(int jj = 0; jj < 8; jj++){
                            #pragma HLS unroll
                            for(int k = 0; k < 16; k++){
                                #pragma HLS unroll
                                tmp((jj*16+k)*8+7, (jj*16+k)*8) = ap_int<8>(acc_final[ii*8+jj][k] >> 13);
                            }
                        }
                        fifo_O_out.write(tmp);
                    }
                } else if (stage == 1){
                    // K projection: requantize (>> 8) and ship to acc0.
                    for(int ii = 0; ii < 4; ii++){
                        for(int jj = 0; jj < 2; jj++){
                            ap_uint<256> tmp;
                            for(int k = 0; k < 32; k++){
                                #pragma HLS unroll
                                tmp(k*8+7, k*8) = ap_int<8>(acc_final[ii*4+k/8][jj*8+k%8] >> 8);
                            }
                            fifo_to_acc0.write(tmp);
                        }
                    }
                } else {
                    // Stages 3/4: cross-lane reduction. Add the partial sums
                    // from the previous lane (22-bit field per 32-bit slot)
                    // and forward the updated partials.
                    final_acc:
                    for(int ii = 0; ii < 16; ii++){
                        #pragma HLS pipeline II=1 style=stp
                        ap_uint<512> tmp_recv = fifo_reduce_recv.read();
                        ap_uint<512> tmp;
                        for(int k = 0; k < 16; k++){
                            #pragma HLS unroll
                            acc_final[ii][k] += ap_int<22>(tmp_recv(k*32+21, k*32));
                            // Only bits [21:0] of each slot are written; the
                            // receiver also reads just 22 bits.
                            tmp(k*32+21, k*32) = acc_final[ii][k];
                        }
                        fifo_reduce_send.write(tmp);
                    }
                }
            }
        }
    }

    // write out for debug
    // write:
    // for(int i = 0; i < L; i++){
    //     for(int j = 0; j < D_head_div_8; j++){
    //         #pragma HLS pipeline II=1 style=stp
    //         fifo_O_out.write(scratchpad_out[i][j]);
    //     }
    // }
}
1343
+
1344
// SFU side buffer: for each block of rows it accumulates a column-wise
// sum of the incoming float words, emits the 16-lane sum first, then
// replays the cached raw data. The "double buffering" in the original
// comment refers to the two sfu_buffer instances selected by l%2 at the
// producer (see sfu_acc_exp) — this function itself holds one buffer.
void sfu_buffer( // double buffering
    tapa::istream<int>& fifo_inst,    // first word: L; then one bound per block
    tapa::istream<ap_uint<512>>& fifo_data_in,
    tapa::ostream<ap_uint<512>>& fifo_data_out
){
    const int L = fifo_inst.read();
    for(int stage = 0; stage < 5; stage++){

        for(int l = 0; l < (L >> 5); l++){
            // 8 partial-sum banks x 16 lanes; banks break the float-add
            // dependence so the acc loop can pipeline at II=1.
            float sum[8][16];
            float cache[MAX_SEQ_LEN][16];
            #pragma HLS array_partition variable=cache dim=2 complete
            #pragma HLS array_partition variable=sum dim=2 complete

            // Number of 512-bit words in this block (sent by sfu_acc_exp).
            const int hidden_bound = fifo_inst.read();

            for(int i = 0; i < 8; i++){
                for(int j = 0; j < 16; j++){
                    #pragma HLS unroll
                    sum[i][j] = 0.0;
                }
            }

            acc:
            for(int i = 0; i < hidden_bound; i++){
                #pragma HLS pipeline II=1 style=stp
                // Real dependence distance is 8 (bank i%8 reused every 8
                // iterations); tell HLS so it ignores the false 1-cycle one.
                #pragma HLS dependence false variable=sum
                #pragma HLS dependence true variable=sum distance=8
                ap_uint<512> tmp = fifo_data_in.read();
                for(int k = 0; k < 16; k++){
                    #pragma HLS unroll
                    float res = tapa::bit_cast<float>(ap_int<32>(tmp(k*32+31, k*32)));
                    sum[i%8][k] += res;
                    cache[i][k] = res;
                }
            }

            // Fold the 8 banks into bank 0 (2 lanes per cycle).
            reduce:
            for(int i = 1; i < 8; i++){
                for(int j = 0; j < 8; j++){
                    #pragma HLS pipeline II=1 style=stp
                    #pragma HLS dependence true variable=sum distance=8
                    for(int k = 0; k < 2; k++){
                        sum[0][j*2+k] += sum[i][j*2+k];
                    }
                }
            }

            // Emit the per-lane totals first...
            ap_uint<512> tmp;
            for(int i = 0; i < 16; i++){
                #pragma HLS unroll
                tmp(i*32+31, i*32) = tapa::bit_cast<ap_uint<32>>(sum[0][i]);
            }
            fifo_data_out.write(tmp);

            // ...then replay the buffered raw values for normalization.
            write:
            for(int i = 0; i < hidden_bound; i++){
                #pragma HLS pipeline II=1 style=stp
                ap_uint<512> tmp;
                for(int j = 0; j < 16; j++){
                    #pragma HLS unroll
                    tmp(j*32+31, j*32) = tapa::bit_cast<ap_uint<32>>(cache[i][j]);
                }
                fifo_data_out.write(tmp);
            }

        }
    }

}
1414
+
1415
// SLR0 variant of sfu_buffer: handles 7 stages. Stages 0-4 take softmax
// data (variable bound from fifo_inst), stage 5 takes layernorm input,
// stage 6 takes FFN output; stages 5-6 use the full hidden size D and
// additionally maintain a second accumulator `var`, emitting both the
// sum word and the var word before the cached replay.
void sfu_buffer_slr0( // double buffering
    tapa::istream<int>& fifo_inst,
    tapa::istream<ap_uint<512>>& fifo_data_in_exp,
    tapa::istream<ap_uint<512>>& fifo_data_in_ln,
    tapa::istream<ap_uint<512>>& fifo_data_in_ffn,
    tapa::ostream<ap_uint<512>>& fifo_data_out
){
    const int L = fifo_inst.read();
    for(int stage = 0; stage < 7; stage++){

        int hidden_bound = D;

        for(int l = 0; l < (L >> 5); l++){
            float sum[8][16];
            float var[8][16];
            float cache[MAX_SEQ_LEN][16];
            #pragma HLS array_partition variable=cache dim=2 complete
            #pragma HLS array_partition variable=sum dim=2 complete
            #pragma HLS array_partition variable=var dim=2 complete

            if(stage < 5) hidden_bound = fifo_inst.read();

            for(int i = 0; i < 8; i++){
                for(int j = 0; j < 16; j++){
                    #pragma HLS unroll
                    sum[i][j] = 0.0;
                    var[i][j] = 0.0;
                }
            }

            acc:
            for(int i = 0; i < hidden_bound; i++){
                #pragma HLS pipeline II=1 style=stp
                #pragma HLS dependence false variable=sum
                #pragma HLS dependence true variable=sum distance=8

                // Input source depends on the stage.
                ap_uint<512> tmp;
                if(stage < 5) {
                    tmp = fifo_data_in_exp.read();
                } else if(stage == 5){
                    tmp = fifo_data_in_ln.read();
                } else {
                    tmp = fifo_data_in_ffn.read();
                }

                for(int k = 0; k < 16; k++){
                    #pragma HLS unroll
                    float res = tapa::bit_cast<float>(ap_int<32>(tmp(k*32+31, k*32)));
                    // NOTE(review): plain assignment here, unlike the `+=`
                    // in sfu_buffer — only the last 8 rows per bank survive
                    // into `reduce`. Confirm this is intentional.
                    sum[i%8][k] = res;
                    if(stage >= 4) var[i%8][k] = res;
                    cache[i][k] = res;
                }
            }

            reduce:
            for(int i = 1; i < 8; i++){
                for(int j = 0; j < 8; j++){
                    #pragma HLS pipeline II=1 style=stp
                    #pragma HLS dependence true variable=sum distance=8
                    #pragma HLS dependence true variable=var distance=8
                    for(int k = 0; k < 2; k++){
                        sum[0][j*2+k] += sum[i][j*2+k];
                        if(stage >= 5) var[0][j*2+k] += var[i][j*2+k];
                    }
                }
            }

            // Emit the reduced sum (and, for stages 5-6, the var word),
            // then replay the cached raw data.
            ap_uint<512> tmp;
            ap_uint<512> tmp_var;
            for(int i = 0; i < 16; i++){
                #pragma HLS unroll
                tmp(i*32+31, i*32) = tapa::bit_cast<ap_uint<32>>(sum[0][i]);
                if(stage >= 5) tmp_var(i*32+31, i*32) = tapa::bit_cast<ap_uint<32>>(var[0][i]);
            }
            fifo_data_out.write(tmp);
            if(stage >= 5) fifo_data_out.write(tmp_var);

            write:
            for(int i = 0; i < hidden_bound; i++){
                #pragma HLS pipeline II=1 style=stp
                ap_uint<512> tmp;
                for(int j = 0; j < 16; j++){
                    #pragma HLS unroll
                    tmp(j*32+31, j*32) = tapa::bit_cast<ap_uint<32>>(cache[i][j]);
                }
                fifo_data_out.write(tmp);
            }
        }
    }
}
1505
+
1506
+
1507
// Softmax front end: exponentiates incoming fixed-point scores and
// round-robins the results between the two sfu_buffer instances (l%2),
// also telling each buffer how many words its block contains.
void sfu_acc_exp(
    tapa::istream<int>& fifo_inst,          // first word: sequence length L
    tapa::istream<ap_uint<512>>& fifo_data_in,
    tapa::ostreams<ap_uint<512>, 2>& fifo_buf,
    tapa::ostreams<int, 2>& fifo_inst_out
) {
    const int L = fifo_inst.read();
    fifo_inst_out[0].write(L);
    fifo_inst_out[1].write(L);

    for(int stage = 0; stage < 5; stage++){

        for(int l = 0; l < (L >> 4); l++){
            // Causal: block l covers (l+1)*16 valid score words.
            fifo_inst_out[l%2].write(((l+1) << 4));
            exp_acc:
            for(int i = 0; i < ((l+1) << 4);){
                #pragma HLS pipeline II=1 style=stp
                if(!fifo_data_in.empty()){
                    ap_uint<512> tmp; fifo_data_in.try_read(tmp);
                    ap_uint<512> tmp_o;
                    for(int k = 0; k < 16; k++){
                        #pragma HLS unroll
                        int res = tapa::bit_cast<int>(ap_int<32>(tmp(k*32+31, k*32)));
                        float res_exp = 0.0;
                        // res >> 10: rescale the integer accumulator before
                        // exp (presumably a 2^-10 fixed-point scale — confirm
                        // against the score quantization).
                        res_exp = hls::exp(ap_int<32>(res >> 10));
                        tmp_o(k*32+31, k*32) = tapa::bit_cast<ap_uint<32>>(res_exp);
                    }
                    fifo_buf[l%2].write(tmp_o);
                    i++;
                }
            }
        }
    }
}
1541
+
1542
// Applies a coarse table-based GELU approximation to FFN activations and
// requantizes each lane to int8 (>> 8).
void sfu_gelu(
    tapa::istream<int>& fifo_inst,      // first word: sequence length L
    tapa::ostream<int>& fifo_inst_out,  // forwards L downstream
    tapa::istream<ap_uint<512>>& fifo_ffn,
    tapa::ostream<ap_uint<128>>& fifo_out
){
    const int L = fifo_inst.read();
    fifo_inst_out.write(L);

    for(int i = 0; i < (L >> 4); i++){
        for(int j = 0; j < D_ffn_SLR;){
            if(!fifo_ffn.empty()){
                ap_uint<512> tmp; fifo_ffn.try_read(tmp);
                ap_uint<128> tmp_out;
                for(int k = 0; k < 16; k++){
                    // table based approximation
                    // NOTE(review): `val` is the raw integer accumulator
                    // value cast to float, yet the table thresholds are
                    // sub-unit constants (-2..0) — verify the intended
                    // fixed-point scaling here.
                    float val = (float) tapa::bit_cast<int>(ap_int<32>(tmp(k*32+31, k*32)));
                    float outp_data = 0.0;
                    if (val < -2 || val == 0)
                        outp_data = 0;
                    else if(val < -1.5)
                        outp_data = -0.09754;
                    else if(val < -1)
                        outp_data = -0.15743;
                    else if(val < -0.5)
                        outp_data = -0.15383;
                    else if(val < 0)
                        outp_data = -0.10153;
                    else
                        outp_data = val;
                    // NOTE(review): (int)(outp_data) truncates the
                    // fractional table values to 0 before the >> 8 —
                    // the negative GELU lobe always quantizes to 0.
                    // Confirm whether this is the intended behavior.
                    tmp_out(k*8+7, k*8) = ap_int<8>((int) (outp_data) >> 8);
                }
                fifo_out.write(tmp_out);
                j++;
            }
        }
    }
}
1580
+
1581
+ void data_packing(
1582
+ tapa::istream<int>& fifo_inst,
1583
+ tapa::istream<ap_uint<128>>& fifo_in,
1584
+ tapa::ostream<ap_uint<1024>>& fifo_out
1585
+ ){
1586
+ const int L = fifo_inst.read();
1587
+
1588
+ for(int i = 0; i < (L >> 4); i++){
1589
+ ap_uint<1024> cache[D_ffn_SLR_div_8];
1590
+
1591
+ for(int j = 0; j < D_ffn_SLR_div_8; j++){
1592
+ ap_uint<64> fuse_reg[16];
1593
+ ap_uint<1024> send_pkt;
1594
+ #pragma HLS array_partition variable=fuse_reg complete
1595
+ for(int k = 0; k < 8;){
1596
+ #pragma HLS pipeline II=1
1597
+ if(!fifo_in.empty()){
1598
+ ap_uint<128> tmp; fifo_in.try_read(tmp);
1599
+ for(int l = 0; l < 16; l++){
1600
+ #pragma HLS unroll
1601
+ fuse_reg[l](k*8+7, k*8) = tmp(l*8+7, l*8);
1602
+ }
1603
+ k++;
1604
+ }
1605
+ }
1606
+ send_pkt = ap_uint<1024>((
1607
+ fuse_reg[0], fuse_reg[1], fuse_reg[2], fuse_reg[3], fuse_reg[4], fuse_reg[5], fuse_reg[6], fuse_reg[7],
1608
+ fuse_reg[8], fuse_reg[9], fuse_reg[10], fuse_reg[11], fuse_reg[12], fuse_reg[13], fuse_reg[14], fuse_reg[15]
1609
+ ));
1610
+ cache[j] = send_pkt;
1611
+ fifo_out.write(send_pkt);
1612
+ }
1613
+
1614
+ for(int iter = 0; iter < D_div_16*2 - 1; iter++){
1615
+ for(int j = 0; j < D_ffn_SLR_div_8; j++){
1616
+ #pragma HLS pipeline II=1
1617
+ fifo_out.write(cache[j]);
1618
+ }
1619
+ }
1620
+ }
1621
+ }
1622
+
1623
// Softmax back end: reads the per-lane exp-sum word from a buffer, forms
// the reciprocal scale 32/sum, then normalizes and quantizes the replayed
// exp values to int8. Alternates between the two buffers via l%2, mirroring
// sfu_acc_exp's distribution.
void sfu_norm(
    tapa::istream<int>& fifo_inst,            // first word: sequence length L
    tapa::istreams<ap_uint<512>, 2>& fifo_buf,
    tapa::ostream<ap_uint<128>>& fifo_data_out
){
    const int L = fifo_inst.read();
    for(int stage = 0; stage < 5; stage++){

        for(int l = 0; l < (L >> 4); l++){
            float sum[16];
            #pragma HLS array_partition variable=sum complete

            // First word of each block is the lane-wise exp-sum.
            ap_uint<512> tmp_in = fifo_buf[l%2].read();

            for(int i = 0; i < 16; i++){
                #pragma HLS unroll factor=8
                // 32/sum: normalization combined with the int8 output scale
                // (presumably 2^5 — confirm against the consumer's shift).
                sum[i] = 32.0 / tapa::bit_cast<float>(ap_uint<32>(tmp_in(i*32+31, i*32)));
            }

            // Normalize the (l+1)*16 replayed exp words of this block.
            for(int i = 0; i < ((l+1) << 4);){
                #pragma HLS pipeline II=1 style=stp
                if(!fifo_buf[l%2].empty()){
                    ap_uint<512> tmp_cache; fifo_buf[l%2].try_read(tmp_cache);
                    ap_uint<128> tmp;
                    for(int j = 0; j < 16; j++){
                        #pragma HLS unroll
                        ap_int<8> res = (int) (tapa::bit_cast<float>(ap_uint<32>(tmp_cache(j*32+31, j*32))) * sum[j]);
                        tmp(j*8 + 7, j*8) = res;
                    }
                    fifo_data_out.write(tmp);
                    i++;
                }
            }
        }
    }
}
1659
+
1660
// SLR0 variant of sfu_norm covering 7 stages. Stages 0-4 behave like the
// softmax path (per-block bound (l+1)*16); stages 5-6 cover the full
// hidden size D and route results to dedicated outputs (stage 5 ->
// fifo_data_off, stage 6 -> fifo_out). Arithmetic here is integer
// add-based rather than the float multiply in sfu_norm.
void sfu_norm_slr0(
    tapa::istream<int>& fifo_inst,            // first word: sequence length L
    tapa::istreams<ap_uint<512>, 2>& fifo_buf,
    tapa::ostream<ap_uint<128>>& fifo_data_out,
    tapa::ostream<ap_uint<128>>& fifo_data_off,
    tapa::ostream<ap_uint<128>>& fifo_out
){
    const int L = fifo_inst.read();

    for(int stage = 0; stage < 7; stage++){

        for(int l = 0; l < (L >> 4); l++){
            int sum[16];
            int var[16];
            #pragma HLS array_partition variable=sum complete
            #pragma HLS array_partition variable=var complete

            const int fifo_idx = l%2;
            const int hidden_bound = (stage < 5) ? ((l+1) << 4) : D;

            ap_uint<512> tmp_in = fifo_buf[fifo_idx].read();
            ap_uint<512> tmp_var;
            // NOTE(review): tmp_var is read below but never used — var[] is
            // loaded from tmp_in instead. Verify which word was intended.
            if(stage >= 5) tmp_var = fifo_buf[fifo_idx].read();

            if(stage >= 5){
                for(int i = 0; i < 16; i++){
                    #pragma HLS unroll
                    var[i] = ap_uint<32>(tmp_in(i*32+31, i*32));
                }
            } else {
                for(int i = 0; i < 16; i++){
                    #pragma HLS unroll
                    sum[i] = ap_uint<32>(tmp_in(i*32+31, i*32)) * 2;
                }
            }

            for(int i = 0; i < hidden_bound;){
                #pragma HLS pipeline II=1 style=stp
                if(!fifo_buf[fifo_idx].empty()){
                    ap_uint<512> tmp_cache; fifo_buf[fifo_idx].try_read(tmp_cache);
                    ap_uint<128> tmp;
                    for(int j = 0; j < 16; j++){
                        #pragma HLS unroll
                        ap_int<8> res;
                        int op1; int op2;
                        // Offset each replayed value by var (stages 5-6) or
                        // 2*sum (stages 0-4), truncating to int8.
                        if(stage >= 5){
                            op1 = ap_uint<32>(tmp_cache(j*32+31, j*32));
                            op2 = var[j];
                        } else {
                            op1 = ap_uint<32>(tmp_cache(j*32+31, j*32));
                            op2 = sum[j];
                        }
                        res = op1 + op2;
                        tmp(j*8 + 7, j*8) = res;
                    }
                    // Route by stage: 5 -> off-path, 6 -> final out,
                    // otherwise the regular softmax output.
                    if(stage == 5) {
                        fifo_data_off.write(tmp);
                    } else if(stage == 6){
                        fifo_out.write(tmp);
                    } else {
                        fifo_data_out.write(tmp);
                    }
                    i++;
                }
            }
        }
    }
}
1728
+
1729
// Collects the attention context produced across 5 stages (one D_head
// slice per stage) into a URAM buffer, then streams it back out to both
// temporal accelerators: rows i*32..i*32+15 to acc0 and rows
// i*32+16..i*32+31 to acc1, each column repeated D_div_16*2 times.
void context_buffer(
    tapa::istream<int>& fifo_inst,            // first word: sequence length L
    tapa::istream<ap_uint<1024>>& fifo_context,
    tapa::ostream<ap_uint<1024>>& fifo_to_acc0,
    tapa::ostream<ap_uint<1024>>& fifo_to_acc1
){
    ap_uint<64> context[MAX_SEQ_LEN][CONTEXT_D];
    #pragma HLS array_partition variable=context cyclic dim=1 factor=32
    #pragma HLS bind_storage variable=context type=ram_2p impl=uram

    const int L = fifo_inst.read();

    // Fill phase: each stage writes its own D_head_div_8-wide column band.
    for(int stage = 0; stage < 5; stage++){
        for(int i = 0; i < (L >> 4); i++){
            for(int j = stage * D_head_div_8; j < (stage + 1) * D_head_div_8;){
                if(!fifo_context.empty()){
                    ap_uint<1024> tmp; fifo_context.try_read(tmp);
                    for(int ii = 0; ii < 16; ii++){
                        #pragma HLS unroll
                        context[i*16+ii][j] = tmp(ii*64+63, ii*64);
                    }
                    j++;
                }
            }
        }
    }

    // NOTE: change it to write to HBM for debugging
    // write ops to acc0 and acc1 in parallel
    for(int i = 0; i < (L >> 5); i++){
        for(int l = 0; l < D_div_16; l++){
            for(int iter = 0; iter < 2; iter++){
                for(int j = 0; j < CONTEXT_D; j++){
                    ap_uint<1024> tmp_acc0;
                    ap_uint<1024> tmp_acc1;
                    for(int k = 0; k < 16; k++){
                        #pragma HLS unroll
                        tmp_acc0(k*64+63, k*64) = context[i*32+k][j];
                        tmp_acc1(k*64+63, k*64) = context[i*32+16+k][j];
                    }
                    fifo_to_acc0.write(tmp_acc0);
                    fifo_to_acc1.write(tmp_acc1);
                }
            }
        }
    }
}
1776
+
1777
// Buffers the int8 layernorm output X in URAM, then replays it:
// each row group is re-sent (D_ffn_SLR >> 4) * 2 times on fifo_ffn_out
// for the FFN matmul, and for the first D_div_16 iterations a 2-column
// slice is also sent on fifo_ffn_res for the residual path.
void ffn_buffer(
    const int L,
    tapa::istream<ap_uint<128>>& fifo_ffn_in,
    tapa::ostream<ap_uint<1024>>& fifo_ffn_out,
    tapa::ostream<ap_uint<1024>>& fifo_ffn_res
){
    ap_uint<64> X[MAX_SEQ_LEN][D_div_8]; // 8 bit
    #pragma HLS array_partition variable=X cyclic dim=1 factor=16
    #pragma HLS bind_storage variable=X type=ram_2p impl=uram

    // Fill phase: fuse 8 x 128-bit words into one 64-bit entry per row.
    for(int i = 0; i < (L >> 4); i++){
        for(int j = 0; j < D_div_8; j++){
            ap_uint<64> fuse_reg[16];
            #pragma HLS array_partition variable=fuse_reg complete

            for(int l = 0; l < 8;){
                #pragma HLS pipeline II=1

                if(!fifo_ffn_in.empty()){
                    ap_uint<128> tmp; fifo_ffn_in.try_read(tmp);
                    for(int k = 0; k < 16; k++){
                        #pragma HLS unroll
                        fuse_reg[k](l*8+7, l*8) = tmp(k*8+7, k*8);
                    }
                    l++;
                }
            }
            for(int k = 0; k < 16; k++){
                #pragma HLS unroll
                X[i*16+k][j] = fuse_reg[k];
            }
        }
    }

    // Replay phase.
    for(int i = 0; i < (L >> 4); i++){
        for(int iter = 0; iter < (D_ffn_SLR >> 4); iter++){
            for(int it = 0; it < 2; it++){
                for(int j = 0; j < D_div_8; j++){
                    ap_uint<1024> tmp;
                    for(int k = 0; k < 16; k++){
                        #pragma HLS unroll
                        tmp(k*64+63, k*64) = X[i*16+k][j];
                    }
                    fifo_ffn_out.write(tmp);
                }
            }

            // Residual slice: columns iter*2 and iter*2+1, first
            // D_div_16 iterations only.
            if(iter < D_div_16){
                for(int j = 0; j < 2; j++){
                    ap_uint<1024> send;
                    for(int k = 0; k < 16; k++){
                        #pragma HLS unroll
                        send(k*64+63, k*64) = X[i*16+k][iter*2+j];
                    }
                    fifo_ffn_res.write(send);
                }
            }
        }
    }
}
1837
+
1838
// Adds the int8 skip connection x (from fifo_x, 1024-bit packed) onto the
// 22-bit FFN partial sums arriving on fifo_in, and distributes the result
// to one of two output streams by row-group parity (i%2).
void ffn_residual(
    const int L,
    tapa::istream<ap_uint<1024>>& fifo_x,
    tapa::istream<ap_uint<512>>& fifo_in,
    tapa::ostreams<ap_uint<512>, 2>& fifo_out
){
    for(int i = 0; i < (L >> 4); i++){
        for(int j = 0; j < D_div_8; j++){
            ap_uint<1024> tmp_x = fifo_x.read();
            // One x word covers 8 incoming 512-bit partial-sum words.
            for(int k = 0; k < 8;){
                if(!fifo_in.empty()){
                    ap_uint<512> tmp; fifo_in.try_read(tmp);
                    ap_uint<512> tmp_o;
                    ap_uint<128> x = tmp_x(k*128+127, k*128);
                    for(int l = 0; l < 16; l++){
                        #pragma HLS unroll
                        // 22-bit accumulator + 8-bit residual, widened into
                        // the low bits of each 32-bit output slot.
                        ap_int<22> a = tmp(l*32+31, l*32);
                        ap_int<8> b = x(l*8+7, l*8);
                        ap_int<22> res = a + b;
                        tmp_o(l*32+31, l*32) = res;
                    }
                    fifo_out[i%2].write(tmp_o);
                    k++;
                }
            }
        }
    }
}
1866
+
1867
+ void measure_cycle(tapa::istream<bool>& fifo_fin, tapa::mmap<int> cycle_count){
1868
+ for(int cycle = 0;;cycle++){
1869
+ if(!fifo_fin.empty()){
1870
+ fifo_fin.read(nullptr);
1871
+ cycle_count[0] = cycle;
1872
+ break;
1873
+ }
1874
+ }
1875
+ }
1876
+
1877
+ void opt_kernel(
1878
+ const int L,
1879
+ const int L_out,
1880
+ const int seq_len,
1881
+ // tapa::mmap<int> inst, // inst[0] = L, inst[1] = reload_weight
1882
+ tapa::mmap<ap_uint<512>> X_acc0,
1883
+ tapa::mmap<ap_uint<512>> X_acc1,
1884
+ tapa::mmap<ap_uint<512>> W_acc0,
1885
+ tapa::mmap<ap_uint<512>> W_acc1,
1886
+ tapa::mmap<ap_uint<128>> acc0_out,
1887
+ // tapa::mmap<ap_uint<64>> acc1_out,
1888
+ tapa::mmap<int> cycle_count
1889
+ ){
1890
+ tapa::streams<ConfigInst, NUM_SLR+1, 4> fifo_inst_acc0("fifo_inst_acc0");
1891
+ tapa::streams<ConfigInst, NUM_SLR+1, 4> fifo_inst_acc1("fifo_inst_acc1");
1892
+ tapa::stream<ap_uint<512>, 16> fifo_X_acc0_slr0("fifo_X_acc0_slr0");
1893
+ tapa::stream<ap_uint<512>, 16> fifo_X_acc1_slr0("fifo_X_acc1_slr0");
1894
+ tapa::streams<ap_uint<1024>, NUM_SLR, 4> fifo_X_acc0("fifo_X_acc0");
1895
+ tapa::streams<ap_uint<1024>, NUM_SLR, 4> fifo_X_acc1("fifo_X_acc1");
1896
+ tapa::streams<ap_uint<512>, NUM_SLR+1, 8> fifo_W_acc0("fifo_W_acc0");
1897
+ tapa::streams<ap_uint<512>, NUM_SLR+1, 8> fifo_W_acc1("fifo_W_acc1");
1898
+ // tapa::streams<ap_uint<512>, NUM_SLR, 4> fifo_acc0_out("fifo_acc0_out");
1899
+ tapa::streams<ap_uint<512>, NUM_SLR, 16> fifo_acc0_to_sfu("fifo_acc0_to_sfu");
1900
+ tapa::streams<ap_uint<512>, NUM_SLR*2> fifo_sfu_buf_in("fifo_sfu_buf_in");
1901
+ tapa::streams<ap_uint<512>, NUM_SLR*2> fifo_sfu_buf_out("fifo_sfu_buf_out");
1902
+ // tapa::streams<ap_uint<64>, NUM_SLR> fifo_acc1_out("fifo_acc1_out");
1903
+ tapa::streams<ap_uint<256>, NUM_SLR, 8> fifo_from_acc1_to_acc0("fifo_from_acc1_to_acc0");
1904
+ tapa::streams<ap_uint<128>, NUM_SLR, 2> fifo_from_sfu_to_acc1("fifo_from_sfu_to_acc1");
1905
+ tapa::stream<bool> fifo_fin("fifo_fin");
1906
+
1907
+ tapa::streams<ap_uint<1024>, NUM_SLR> fifo_context("fifo_context");
1908
+ tapa::streams<ap_uint<1024>, NUM_SLR> fifo_cont_to_acc0("fifo_cont_to_acc0");
1909
+ tapa::streams<ap_uint<1024>, NUM_SLR> fifo_cont_to_acc1("fifo_cont_to_acc1");
1910
+ tapa::streams<ap_uint<512>, NUM_SLR> fifo_reduce_acc0("fifo_reduce_acc0");
1911
+ tapa::streams<ap_uint<512>, NUM_SLR> fifo_reduce_acc1("fifo_reduce_acc1");
1912
+
1913
+ // tapa::stream<ap_uint<128>> fifo_acc0_out("fifo_acc0_out");
1914
+ tapa::stream<ap_uint<128>> fifo_acc1_out("fifo_acc1_out");
1915
+
1916
+ tapa::stream<ap_uint<512>, 16> fifo_res_acc0("fifo_res_acc0");
1917
+ tapa::stream<ap_uint<512>, 16> fifo_res_acc1("fifo_res_acc1");
1918
+ tapa::stream<ap_uint<512>, D> fifo_ln_acc0("fifo_ln_acc0");
1919
+ tapa::stream<ap_uint<512>, D> fifo_ln_acc1("fifo_ln_acc1");
1920
+
1921
+ tapa::stream<ap_uint<128>> fifo_ffn_buffer_in("fifo_ffn_buffer_in");
1922
+ tapa::stream<ap_uint<1024>> fifo_ffn_buffer_out("fifo_ffn_buffer_out");
1923
+
1924
+ tapa::streams<ap_uint<512>, NUM_SLR, 16> fifo_gelu_in("fifo_gelu_in");
1925
+ tapa::streams<ap_uint<128>, NUM_SLR, D> fifo_gelu_out("fifo_gelu_out");
1926
+ tapa::streams<ap_uint<1024>, NUM_SLR> fifo_gelu_full("fifo_gelu_full");
1927
+
1928
+ tapa::stream<ap_uint<512>, 8> fifo_ffn2("fifo_ffn2");
1929
+ tapa::stream<ap_uint<1024>, D_div_8+2> fifo_skip_x("fifo_skip_x");
1930
+ tapa::streams<ap_uint<512>, 2> fifo_res2("fifo_res2");
1931
+
1932
+ tapa::streams<int, NUM_SLR> fifo_inst_switch_acc0("fifo_inst_switch_acc0");
1933
+ tapa::streams<int, NUM_SLR> fifo_inst_switch_acc1("fifo_inst_switch_acc1");
1934
+ tapa::streams<int, NUM_SLR> fifo_inst_switch_sfu("fifo_inst_switch_sfu");
1935
+ tapa::streams<int, NUM_SLR> fifo_inst_switch_context("fifo_inst_switch_context");
1936
+ tapa::streams<int, NUM_SLR> fifo_inst_switch_gelu("fifo_inst_switch_gelu");
1937
+ tapa::streams<int, NUM_SLR*2> fifo_inst_sfu_buffer("fifo_inst_sfu_buffer");
1938
+ tapa::streams<int, NUM_SLR> fifo_inst_data_pack("fifo_inst_data_pack");
1939
+ tapa::streams<int, NUM_SLR> fifo_inst_norm("fifo_inst_norm");
1940
+
1941
+ tapa::task()
1942
+ .invoke<tapa::join>(read_inst, seq_len, fifo_inst_acc0, fifo_inst_acc1)
1943
+ .invoke<tapa::join>(read_W, W_acc0, fifo_W_acc0)
1944
+ .invoke<tapa::join>(read_W, W_acc1, fifo_W_acc1)
1945
+ .invoke<tapa::join>(read_X, L, X_acc0, fifo_X_acc0_slr0)
1946
+ .invoke<tapa::join>(read_X, L, X_acc1, fifo_X_acc1_slr0)
1947
+ .invoke<tapa::join>(
1948
+ temporal_acc0_slr0,
1949
+ fifo_inst_acc0, fifo_inst_acc0,
1950
+ fifo_inst_switch_acc0,
1951
+ fifo_X_acc0_slr0, fifo_X_acc0,
1952
+ fifo_W_acc0, fifo_W_acc0,
1953
+ fifo_from_acc1_to_acc0,
1954
+ fifo_acc0_to_sfu,
1955
+ fifo_gelu_in,
1956
+ fifo_cont_to_acc0,
1957
+ fifo_ffn_buffer_out,
1958
+ fifo_reduce_acc0,
1959
+ fifo_res_acc0
1960
+ // fifo_fin
1961
+ )
1962
+ .invoke<tapa::join>(
1963
+ temporal_acc1_slr0,
1964
+ fifo_inst_acc1, fifo_inst_acc1,
1965
+ fifo_inst_switch_acc1,
1966
+ fifo_X_acc1_slr0, fifo_X_acc1,
1967
+ fifo_W_acc1, fifo_W_acc1,
1968
+ fifo_from_acc1_to_acc0,
1969
+ fifo_from_sfu_to_acc1,
1970
+ fifo_context,
1971
+ fifo_cont_to_acc1,
1972
+ fifo_reduce_acc1,
1973
+ fifo_res_acc1,
1974
+ fifo_gelu_full,
1975
+ fifo_ffn2
1976
+ // fifo_fin
1977
+ )
1978
+ .invoke<tapa::join>(
1979
+ residual, seq_len,
1980
+ fifo_res_acc0,
1981
+ fifo_ln_acc0
1982
+ )
1983
+ .invoke<tapa::join>(
1984
+ residual, seq_len,
1985
+ fifo_res_acc1,
1986
+ fifo_ln_acc1
1987
+ )
1988
+ .invoke<tapa::join, NUM_SLR-1>(
1989
+ temporal_acc0,
1990
+ fifo_inst_acc0, fifo_inst_acc0,
1991
+ fifo_inst_switch_acc0,
1992
+ fifo_X_acc0, fifo_X_acc0,
1993
+ fifo_W_acc0, fifo_W_acc0,
1994
+ fifo_from_acc1_to_acc0,
1995
+ fifo_acc0_to_sfu,
1996
+ fifo_cont_to_acc0,
1997
+ fifo_gelu_in,
1998
+ fifo_reduce_acc0, fifo_reduce_acc0
1999
+ )
2000
+ .invoke<tapa::join, NUM_SLR-1>(
2001
+ temporal_acc1,
2002
+ fifo_inst_acc1, fifo_inst_acc1,
2003
+ fifo_inst_switch_acc1,
2004
+ fifo_X_acc1, fifo_X_acc1,
2005
+ fifo_W_acc1, fifo_W_acc1,
2006
+ fifo_from_acc1_to_acc0,
2007
+ fifo_from_sfu_to_acc1,
2008
+ fifo_context,
2009
+ fifo_cont_to_acc1,
2010
+ fifo_reduce_acc1, fifo_reduce_acc1,
2011
+ fifo_gelu_full
2012
+ )
2013
+ .invoke<tapa::join, NUM_SLR>(packet_switch_acc, fifo_inst_switch_acc0, fifo_inst_switch_sfu, fifo_inst_switch_gelu)
2014
+ .invoke<tapa::join, NUM_SLR>(packet_switch_acc, fifo_inst_switch_acc1, fifo_inst_switch_context, fifo_inst_norm)
2015
+ .invoke<tapa::join>(write_zero, seq_len, D_write_zero_acc0, fifo_reduce_acc0)
2016
+ .invoke<tapa::join>(write_zero, seq_len, D_write_zero_acc1, fifo_reduce_acc1)
2017
+ .invoke<tapa::join, NUM_SLR>(
2018
+ sfu_acc_exp, fifo_inst_switch_sfu,
2019
+ fifo_acc0_to_sfu,
2020
+ fifo_sfu_buf_in,
2021
+ fifo_inst_sfu_buffer
2022
+ )
2023
+ .invoke<tapa::join>(
2024
+ sfu_buffer_slr0, fifo_inst_sfu_buffer,
2025
+ fifo_sfu_buf_in,
2026
+ fifo_ln_acc0,
2027
+ fifo_res2,
2028
+ fifo_sfu_buf_out
2029
+ )
2030
+ .invoke<tapa::join>(
2031
+ sfu_buffer_slr0, fifo_inst_sfu_buffer,
2032
+ fifo_sfu_buf_in,
2033
+ fifo_ln_acc1,
2034
+ fifo_res2,
2035
+ fifo_sfu_buf_out
2036
+ )
2037
+ .invoke<tapa::join, (NUM_SLR-1)*2>(
2038
+ sfu_buffer, fifo_inst_sfu_buffer,
2039
+ fifo_sfu_buf_in,
2040
+ fifo_sfu_buf_out
2041
+ )
2042
+ .invoke<tapa::join>(
2043
+ sfu_norm_slr0, fifo_inst_norm,
2044
+ fifo_sfu_buf_out,
2045
+ fifo_from_sfu_to_acc1,
2046
+ fifo_ffn_buffer_in,
2047
+ fifo_acc1_out
2048
+ )
2049
+ .invoke<tapa::join, NUM_SLR-1>(
2050
+ sfu_norm, fifo_inst_norm,
2051
+ fifo_sfu_buf_out,
2052
+ fifo_from_sfu_to_acc1
2053
+ )
2054
+ .invoke<tapa::join>(
2055
+ ffn_buffer, seq_len,
2056
+ fifo_ffn_buffer_in,
2057
+ fifo_ffn_buffer_out,
2058
+ fifo_skip_x
2059
+ )
2060
+ .invoke<tapa::join>(
2061
+ ffn_residual, seq_len,
2062
+ fifo_skip_x,
2063
+ fifo_ffn2,
2064
+ fifo_res2
2065
+ )
2066
+ .invoke<tapa::join, NUM_SLR>(
2067
+ context_buffer, fifo_inst_switch_context,
2068
+ fifo_context,
2069
+ fifo_cont_to_acc0, fifo_cont_to_acc1
2070
+ )
2071
+ .invoke<tapa::join, NUM_SLR>(
2072
+ sfu_gelu, fifo_inst_switch_gelu, fifo_inst_data_pack,
2073
+ fifo_gelu_in,
2074
+ fifo_gelu_out
2075
+ )
2076
+ .invoke<tapa::join, NUM_SLR>(
2077
+ data_packing, fifo_inst_data_pack,
2078
+ fifo_gelu_out,
2079
+ fifo_gelu_full
2080
+ )
2081
+ // .invoke<tapa::join, NUM_SLR>(write_attention, seq_len, acc0_out, fifo_acc0_out)
2082
+ .invoke<tapa::join>(write_mtx, L_out, acc0_out, fifo_acc1_out, fifo_fin)
2083
+ // .invoke<tapa::join>(write_mtx, L_out, acc1_out, fifo_acc1_out)
2084
+ .invoke<tapa::join>(measure_cycle, fifo_fin, cycle_count)
2085
+ .invoke<tapa::detach>(black_hole_inst, fifo_inst_acc0)
2086
+ .invoke<tapa::detach>(black_hole_inst, fifo_inst_acc1)
2087
+ .invoke<tapa::detach>(black_hole_ap_uint_1024, fifo_X_acc0)
2088
+ .invoke<tapa::detach>(black_hole_ap_uint_1024, fifo_X_acc1)
2089
+ .invoke<tapa::detach>(black_hole_ap_uint_512, fifo_W_acc0)
2090
+ .invoke<tapa::detach>(black_hole_ap_uint_512, fifo_W_acc1);
2091
+ }
gpt-2-medium/kernel-versal.cpp ADDED
The diff for this file is too large to render. See raw diff
 
gpt-2-medium/kernel.cpp ADDED
@@ -0,0 +1,1528 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#include <cmath>
#include <string>
#include <tapa.h>
#include <ap_int.h>
#include <hls_math.h>

// ---- model / tiling geometry ----
constexpr int D = 1024;                                   // hidden (embedding) dimension
constexpr int D_ffn = 4096;                               // FFN inner dimension (4 * D)
constexpr int N_head = 16;                                // attention heads
constexpr int MAX_SEQ_LEN = 1024;                         // scratchpads are sized for this
constexpr int MAX_SEQ_LEN_div_2 = MAX_SEQ_LEN / 2;
constexpr int MAX_SEQ_LEN_div_8 = MAX_SEQ_LEN / 8;
constexpr int NUM_SLR = 3;                                // SLRs carrying compute tasks
constexpr int NUM_DUM_SLR = 4;                            // padded SLR count used in weight sizing — TODO confirm intent
constexpr int TOTAL_PORT = NUM_SLR * 2;
constexpr int D_head = D / N_head;                        // per-head dimension (64)
constexpr int D_head_div_16 = D_head / 16;
constexpr int D_head_div_8 = D_head / 8;
constexpr int D_head_div_4 = D_head / 4;
constexpr int D_head_div_2 = D_head / 2;
constexpr int D_div_8 = D / 8;
constexpr int D_div_16 = D / 16;
constexpr int FFN_WEIGHT_SIZE = D * D_ffn;
constexpr int OUT_WEIGHT_SIZE = D * D_head * NUM_DUM_SLR * 5;
constexpr int WEIGHT_D = D * 2;
constexpr int QKV_WEIGHT_SIZE = D * D_head * NUM_DUM_SLR * 10; // multi-head attention
constexpr int TOTAL_WEIGHT_SIZE = OUT_WEIGHT_SIZE + QKV_WEIGHT_SIZE;
constexpr int CONTEXT_D = D_head_div_8 * 5;
constexpr int D_head_mul_5 = D_head * 5;
constexpr int D_write_zero = D / 32 * 5;                  // zero-words per row seeded into the reduce chain

// SIMD vector aliases used on the mmap/FIFO interfaces
using int_v16 = tapa::vec_t<int, 16>;
using int4_v128 = tapa::vec_t<ap_int<4>, 128>;            // 512b of 4-bit weights
using int8_v64 = tapa::vec_t<ap_int<8>, 64>;              // 512b of 8-bit activations
35
+
36
+ template <typename data_t>
37
+ inline void bh(tapa::istream<data_t> & q) {
38
+ #pragma HLS inline
39
+ for (;;) {
40
+ #pragma HLS pipeline II=1
41
+ data_t tmp; q.try_read(tmp);
42
+ }
43
+ }
44
+
45
// Concrete sink tasks for each stream payload type; each simply detaches
// into the generic drain `bh`. TAPA tasks cannot be templates, hence one
// wrapper per element type.
void black_hole_int(tapa::istream<int> & fifo_in) {
    bh(fifo_in);
}

void black_hole_int_v16(tapa::istream<int_v16> & fifo_in) {
    bh(fifo_in);
}

void black_hole_x(tapa::istream<int8_v64> & fifo_in) {
    bh(fifo_in);
}

void black_hole_w(tapa::istream<int4_v128> & fifo_in) {
    bh(fifo_in);
}

void black_hole_ap_uint_512(tapa::istream<ap_uint<512>> & fifo_in) {
    bh(fifo_in);
}

void black_hole_ap_uint_1024(tapa::istream<ap_uint<1024>> & fifo_in) {
    bh(fifo_in);
}
68
+
69
+ void read_W(
70
+ const int N,
71
+ tapa::async_mmap<ap_uint<512>>& vec,
72
+ tapa::ostream<ap_uint<512>>& fifo_out
73
+ ){
74
+
75
+ for(int i_req = 0, i_resp = 0; i_resp < (N >> 7);){
76
+ #pragma HLS pipeline II=1
77
+ if((i_req < (N >> 7)) & !vec.read_addr.full()){
78
+ vec.read_addr.write(i_req);
79
+ i_req++;
80
+ }
81
+ if(!vec.read_data.empty()){
82
+ ap_uint<512> tmp_o; vec.read_data.try_read(tmp_o);
83
+ fifo_out.write(tmp_o);
84
+ i_resp++;
85
+ }
86
+ }
87
+ }
88
+
89
+ void read_X(
90
+ const int N,
91
+ tapa::async_mmap<ap_uint<512>>& vec,
92
+ tapa::ostream<ap_uint<512>>& fifo_out
93
+ ){
94
+ for(int i_req = 0, i_resp = 0; i_resp < (N >> 6);){
95
+ #pragma HLS pipeline II=1
96
+ if((i_req < (N >> 6)) & !vec.read_addr.full()){
97
+ vec.read_addr.write(i_req);
98
+ i_req++;
99
+ }
100
+ if(!vec.read_data.empty()){
101
+ ap_uint<512> tmp_o; vec.read_data.try_read(tmp_o);
102
+ fifo_out.write(tmp_o);
103
+ i_resp++;
104
+ }
105
+ }
106
+ }
107
+
108
+ void read_inst(
109
+ const int L,
110
+ tapa::ostream<int>& fifo_out_acc0,
111
+ tapa::ostream<int>& fifo_out_acc1
112
+ ){
113
+ for(int stage_i = 0; stage_i < 20; stage_i++){
114
+ #pragma HLS pipeline II=1
115
+
116
+ const int stage = (stage_i < 15) ? (stage_i % 3) : 3;
117
+
118
+ if(stage == 3){
119
+ fifo_out_acc0.write(0);
120
+ fifo_out_acc1.write(0);
121
+
122
+ fifo_out_acc0.write(L/2);
123
+ fifo_out_acc1.write(L/2);
124
+ }
125
+ else if(stage != 1){
126
+ fifo_out_acc0.write(0);
127
+ fifo_out_acc1.write(0);
128
+
129
+ fifo_out_acc0.write(L);
130
+ fifo_out_acc1.write(L);
131
+ } else {
132
+ fifo_out_acc0.write(0);
133
+ fifo_out_acc0.write(L/2);
134
+
135
+ fifo_out_acc1.write(L/2);
136
+ fifo_out_acc1.write(L);
137
+ }
138
+ }
139
+ }
140
+
141
// Drain N 64-bit words from fifo_in and store them at output_mtx[0..N-1].
// Terminates only after all N writes are acknowledged on write_resp.
void write_mtx(
    const int N,
    tapa::async_mmap<ap_uint<64>>& output_mtx,
    tapa::istream<ap_uint<64>>& fifo_in
){

    for(int i_req = 0, i_resp = 0; i_resp < N;){
#pragma HLS pipeline II=1
        // Issue address + data together; bitwise & keeps all four conditions
        // evaluated in parallel hardware (no short-circuit).
        if((i_req < N) & !fifo_in.empty() & !output_mtx.write_addr.full() & !output_mtx.write_data.full()){
            output_mtx.write_addr.try_write(i_req);
            ap_uint<64> tmp; fifo_in.try_read(tmp);
            output_mtx.write_data.try_write(tmp);
            ++i_req;
        }
        if(!output_mtx.write_resp.empty()){
            // each response word acknowledges (value + 1) completed writes
            i_resp += unsigned(output_mtx.write_resp.read(nullptr))+1;
        }
    }
}
160
+
161
+ void write_zero(
162
+ const int L,
163
+ tapa::ostream<ap_uint<512>>& fifo_zero
164
+ ){
165
+ for(int i = 0; i < L * D_write_zero;){
166
+ if(!fifo_zero.full()){
167
+ ap_uint<512> tmp = 0;
168
+ fifo_zero.try_write(tmp);
169
+ i++;
170
+ }
171
+ }
172
+ }
173
+
174
+ // acc slr0 master node
175
+ void temporal_acc0_slr0(
176
+ const int L,
177
+ tapa::istream<int>& fifo_len_in,
178
+ tapa::ostream<int>& fifo_len_out,
179
+ tapa::istream<ap_uint<512>>& fifo_X_in,
180
+ tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
181
+ tapa::istream<ap_uint<512>>& fifo_W_in,
182
+ tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
183
+ tapa::istream<ap_uint<128>>& fifo_from_acc1,
184
+ tapa::ostream<ap_uint<512>>& fifo_O_out,
185
+ tapa::istream<ap_uint<1024>>& fifo_context,
186
+ tapa::istream<ap_uint<512>>& fifo_reduce_recv,
187
+ tapa::ostream<ap_uint<64>>& fifo_write,
188
+ tapa::ostream<bool>& fifo_fin
189
+ ){
190
+
191
+ ap_uint<64> scratchpad_q[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
192
+ #pragma HLS array_partition variable=scratchpad_q cyclic dim=1 factor=16
193
+ #pragma HLS array_partition variable=scratchpad_q cyclic dim=2 factor=2
194
+
195
+ ap_uint<64> scratchpad_k[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
196
+ #pragma HLS array_partition variable=scratchpad_k cyclic dim=1 factor=16
197
+ #pragma HLS array_partition variable=scratchpad_k cyclic dim=2 factor=2
198
+ #pragma HLS bind_storage variable=scratchpad_k type=ram_2p impl=uram
199
+
200
+ ap_uint<64> X[MAX_SEQ_LEN][D_div_8]; // 8 bit
201
+ #pragma HLS array_partition variable=X cyclic dim=1 factor=16
202
+ #pragma HLS array_partition variable=X cyclic dim=2 factor=2
203
+ #pragma HLS bind_storage variable=X type=ram_2p impl=uram
204
+
205
+ for(int stage_i = 0; stage_i < 20; stage_i++){
206
+
207
+ //TODO: stage send from inst
208
+
209
+ // stage 0: WqX
210
+ // stage 1: WkX0 <- acc1
211
+ // stage 2: QK^T
212
+
213
+ ap_uint<32> W[D_head][D_div_8]; // TODO: reduce dimension
214
+ #pragma HLS array_partition variable=W cyclic dim=1 factor=16
215
+
216
+ const int start = fifo_len_in.read();
217
+ const int end = fifo_len_in.read();
218
+ fifo_len_out.write(start);
219
+ fifo_len_out.write(end);
220
+
221
+ const int stage = (stage_i < 15) ? (stage_i % 3) : 3;
222
+
223
+ // load weights and forward
224
+ if(stage != 2) { // TODO: 1d array & uniform access
225
+ for(int i = 0; i < D_head_div_4; i++){
226
+ load_weight:
227
+ for(int j = 0; j < D_div_8;){
228
+ if(!fifo_W_in.empty()){
229
+ ap_uint<512> val; fifo_W_in.try_read(val);
230
+
231
+ for(int k = 0; k < 4; k++){
232
+ #pragma HLS unroll
233
+ W[i*4+k][j] = ap_uint<32>(val(k*32+31, k*32));
234
+ }
235
+ val = ap_uint<512>(val >> 128);
236
+ fifo_W_out.write(val);
237
+ j++;
238
+ }
239
+ }
240
+ }
241
+ }
242
+
243
+ int j_bound = (stage == 2) ? (L >> 4) : D_head_div_16;
244
+ j_bound = (stage == 3) ? D_div_16 : j_bound;
245
+ int k_bound = (stage > 1) ? D_head_div_8 : D_div_8;
246
+
247
+ // stage 1: compute Q
248
+ for(int i = (start >> 4); i < (end >> 4); i++){ // make sure L is multiple of 16
249
+
250
+ if(stage_i == 0){
251
+ for(int ii = 0; ii < 2; ii++){ // load only 1 time
252
+ load_x:
253
+ for(int jj = 0; jj < D_div_8;){
254
+ if(!fifo_X_in.empty()){
255
+ ap_uint<512> val; fifo_X_in.try_read(val);
256
+
257
+ for(int k = 0; k < 8; k++){
258
+ #pragma HLS unroll
259
+ X[i*16+ii*8+k][jj] = ap_uint<64>(val(k*64+63, k*64));
260
+ }
261
+ jj++;
262
+ }
263
+ }
264
+ }
265
+ }
266
+
267
+ for(int j = 0; j < j_bound; j++){
268
+
269
+ ap_int<38> acc_vec[8][16][8];
270
+ #pragma HLS array_partition variable=acc_vec dim=1 complete
271
+ #pragma HLS array_partition variable=acc_vec dim=2 complete
272
+ #pragma HLS array_partition variable=acc_vec dim=3 complete
273
+
274
+ for(int ii = 0; ii < 8; ii++){
275
+ #pragma HLS unroll
276
+ for(int kk = 0; kk < 16; kk++){
277
+ #pragma HLS unroll
278
+ for(int k = 0; k < 8; k++){
279
+ #pragma HLS unroll
280
+ acc_vec[ii][kk][k] = 0;
281
+ }
282
+ }
283
+ }
284
+
285
+ compute:
286
+ for(int k = 0; k < k_bound; k++){ // reduction dim
287
+ #pragma HLS pipeline II=1
288
+
289
+ ap_uint<64> op1_mtx[16];
290
+ ap_uint<64> op2_mtx[16];
291
+ #pragma HLS array_partition variable=op1_mtx complete
292
+ #pragma HLS array_partition variable=op2_mtx complete
293
+
294
+ ap_uint<1024> recv_pkt;
295
+
296
+ if(stage == 3) {
297
+ recv_pkt = fifo_context.read();
298
+ }
299
+
300
+ for(int ii = 0; ii < 16; ii++){
301
+ #pragma HLS unroll
302
+ if(stage == 3){
303
+ op1_mtx[ii] = ap_uint<64>(W[k*8+ii%8][j*2+ii/8]); // change it
304
+ op2_mtx[ii] = ap_uint<64>(recv_pkt(ii*64+63, ii*64));
305
+ } else if(stage == 2) {
306
+ op1_mtx[ii] = scratchpad_q[i*16+ii][k];
307
+ op2_mtx[ii] = scratchpad_k[j*16+ii][k];
308
+ } else {
309
+ op1_mtx[ii] = ap_uint<64>(W[j*16+ii][k]);
310
+ op2_mtx[ii] = X[i*16+ii][k];
311
+ }
312
+ }
313
+
314
+ if(stage < 2){
315
+ ap_uint<1024> send_pkt = ap_uint<1024>((
316
+ op2_mtx[0], op2_mtx[1], op2_mtx[2], op2_mtx[3], op2_mtx[4], op2_mtx[5], op2_mtx[6], op2_mtx[7],
317
+ op2_mtx[8], op2_mtx[9], op2_mtx[10], op2_mtx[11], op2_mtx[12], op2_mtx[13], op2_mtx[14], op2_mtx[15]
318
+ ));
319
+ fifo_X_out.write(send_pkt);
320
+ }
321
+
322
+ for(int ii = 0; ii < 8; ii++){
323
+ #pragma HLS unroll
324
+ for(int kk = 0; kk < 16; kk++){
325
+ #pragma HLS unroll
326
+ for(int l = 0; l < 8; l++){
327
+ #pragma HLS unroll
328
+ ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
329
+ op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
330
+ if(stage == 2){
331
+ op1 = ap_int<8>(op1_mtx[l*2](ii*8+7, ii*8));
332
+ op2 = ap_int<8>(op1_mtx[l*2+1](ii*8+7, ii*8));
333
+ } else {
334
+ op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
335
+ op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
336
+ }
337
+ ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
338
+ acc_vec[ii][kk][l] += w_pack * op3;
339
+ }
340
+ }
341
+ }
342
+ }
343
+
344
+ ap_int<22> acc_final[16][16];
345
+ #pragma HLS array_partition variable=acc_final dim=1 complete
346
+ #pragma HLS array_partition variable=acc_final dim=2 complete
347
+
348
+ for(int ii = 0; ii < 16; ii++){
349
+ #pragma HLS unroll
350
+ for(int k = 0; k < 16; k++){
351
+ #pragma HLS unroll
352
+ acc_final[ii][k] = 0;
353
+ }
354
+ }
355
+
356
+ reduction:
357
+ for(int kk = 0; kk < 8; kk++){
358
+ for(int ii = 0; ii < 16; ii++){
359
+ #pragma HLS unroll
360
+ for(int k = 0; k < 8; k++){
361
+ #pragma HLS unroll
362
+ ap_int<19> res0; ap_int<19> res1;
363
+ (res1, res0) = acc_vec[kk][ii][k];
364
+ res1 = res1 + res0[18];
365
+ acc_final[ii][k*2] += res0;
366
+ acc_final[ii][k*2+1] += res1;
367
+ if(kk == 7 && stage < 2) {
368
+ acc_final[ii][k*2] = acc_final[ii][k*2] >> 8;
369
+ acc_final[ii][k*2+1] = acc_final[ii][k*2] >> 8;
370
+ }
371
+ }
372
+ }
373
+ }
374
+
375
+ if(stage == 0){
376
+ for(int ii = 0; ii < 16; ii++){
377
+ #pragma HLS unroll
378
+ for(int k = 0; k < 16; k++){
379
+ #pragma HLS unroll
380
+ int offset = k%8;
381
+ scratchpad_q[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k]);
382
+ }
383
+ }
384
+ } else if (stage == 1){
385
+ for(int ii = 0; ii < 16; ii++){
386
+ ap_uint<128> tmp = fifo_from_acc1.read();
387
+
388
+ for(int k = 0; k < 16; k++){
389
+ #pragma HLS unroll
390
+ int offset = k%8;
391
+ scratchpad_k[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k]);
392
+ }
393
+ for(int k = 0; k < 2; k++){
394
+ #pragma HLS unroll
395
+ scratchpad_k[end + i*16 + ii][j*2+k] = ap_uint<64>(tmp(k*64+63, k*64));
396
+ }
397
+ }
398
+ } else if(stage == 2){
399
+ for(int ii = 0; ii < 16; ii++){
400
+ #pragma HLS pipeline II=1
401
+ ap_uint<512> tmp;
402
+ for(int kk = 0; kk < 16; kk++){
403
+ #pragma HLS unroll
404
+ tmp(kk*32+31, kk*32) = tapa::bit_cast<ap_uint<32>>(acc_final[ii][kk]);
405
+ }
406
+ fifo_O_out.write(tmp);
407
+ }
408
+ } else {
409
+ final_acc:
410
+ for(int ii = 0; ii < 16;){
411
+ #pragma HLS pipeline II=1
412
+ #pragma HLS dependence variable=X type=inter false
413
+ if(!fifo_reduce_recv.empty()){
414
+ ap_uint<512> tmp_recv; fifo_reduce_recv.try_read(tmp_recv);
415
+ for(int k = 0; k < 16; k++){
416
+ #pragma HLS unroll
417
+ acc_final[ii][k] += ap_int<22>(tmp_recv(k*32+21, k*32));
418
+ X[i*16+ii][j*2+k/8]((k%8)*8+7, (k%8)*8) = ap_int<8>(acc_final[ii][k] >> 8); //TODO: change
419
+ }
420
+
421
+ ii++;
422
+ }
423
+ }
424
+ }
425
+ }
426
+ }
427
+ }
428
+ fifo_fin.write(true);
429
+
430
+ write:
431
+ for(int i = 0; i < L; i++){
432
+ for(int j = 0; j < D_div_8; j++){
433
+ #pragma HLS pipeline II=1
434
+ fifo_write.write(X[i][j]);
435
+ }
436
+ }
437
+ }
438
+
439
// acc0 worker node (SLRs 1..NUM_SLR-1): same 4-stage schedule as
// temporal_acc0_slr0, but activations arrive as 1024-bit packets from the
// previous SLR (and are forwarded onward), and stage-3 partial sums are
// accumulated and passed along via fifo_reduce_send instead of being
// written back to a local X array.
void temporal_acc0(
    const int L,
    tapa::istream<int>& fifo_len_in,
    tapa::ostream<int>& fifo_len_out,
    tapa::istream<ap_uint<1024>>& fifo_X_in,
    tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
    tapa::istream<ap_uint<512>>& fifo_W_in,
    tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
    tapa::istream<ap_uint<128>>& fifo_from_acc1,
    tapa::ostream<ap_uint<512>>& fifo_O_out,
    tapa::istream<ap_uint<1024>>& fifo_context,
    tapa::istream<ap_uint<512>>& fifo_reduce_recv,
    tapa::ostream<ap_uint<512>>& fifo_reduce_send,
    tapa::ostream<bool>& fifo_fin
){

    // Q rows, 8-bit packed.
    ap_uint<64> scratchpad_q[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
#pragma HLS array_partition variable=scratchpad_q cyclic dim=1 factor=16
#pragma HLS array_partition variable=scratchpad_q cyclic dim=2 factor=2

    // K rows (computed + received from acc1), kept in URAM.
    ap_uint<64> scratchpad_k[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
#pragma HLS array_partition variable=scratchpad_k cyclic dim=1 factor=16
#pragma HLS array_partition variable=scratchpad_k cyclic dim=2 factor=2
#pragma HLS bind_storage variable=scratchpad_k type=ram_2p impl=uram

    for(int stage_i = 0; stage_i < 20; stage_i++){
#pragma HLS loop_flatten off

        // stage 0: WqX
        // stage 1: WkX0 <- acc1
        // stage 2: QK^T
        // stage 3: WoO

        // Per-stage weight tile: 4-bit weights, 8 packed per 32-bit word.
        ap_uint<32> W[D_head][D_div_8]; // 4 bit
#pragma HLS array_partition variable=W cyclic dim=1 factor=16

        // Receive this stage's row range and forward it down the chain.
        const int start = fifo_len_in.read();
        const int end = fifo_len_in.read();
        fifo_len_out.write(start);
        fifo_len_out.write(end);

        const int stage = (stage_i < 15) ? (stage_i % 3) : 3;

        // Load this SLR's weight slice (low 128 bits of each beat) and
        // forward the rest to the next SLR. Stage 2 uses no weights.
        if(stage != 2) {
            for(int i = 0; i < D_head_div_4; i++){
                load_weight:
                for(int j = 0; j < D_div_8;){
                    if(!fifo_W_in.empty()){
                        ap_uint<512> val; fifo_W_in.try_read(val);

                        for(int k = 0; k < 4; k++){
#pragma HLS unroll
                            W[i*4+k][j] = ap_uint<32>(val(k*32+31, k*32));
                        }
                        val = ap_uint<512>(val >> 128);
                        fifo_W_out.write(val);
                        j++;
                    }
                }
            }
        }

        // Stage-dependent tile bounds (see temporal_acc0_slr0).
        int j_bound = (stage == 2) ? (L >> 4) : D_head_div_16;
        j_bound = (stage == 3) ? D_div_16 : j_bound;
        int k_bound = (stage > 1) ? D_head_div_8 : D_div_8;

        for(int i = (start >> 4); i < (end >> 4); i++){ // make sure L is multiple of 64
            for(int j = 0; j < j_bound; j++){

                // Partial accumulators: [byte lane][row][packed pair].
                ap_int<38> acc_vec[8][16][8];
#pragma HLS array_partition variable=acc_vec dim=1 complete
#pragma HLS array_partition variable=acc_vec dim=2 complete
#pragma HLS array_partition variable=acc_vec dim=3 complete

                for(int ii = 0; ii < 8; ii++){
#pragma HLS unroll
                    for(int kk = 0; kk < 16; kk++){
#pragma HLS unroll
                        for(int k = 0; k < 8; k++){
#pragma HLS unroll
                            acc_vec[ii][kk][k] = 0;
                        }
                    }
                }

                compute:
                for(int k = 0; k < k_bound; k++){ // reduction dim
#pragma HLS pipeline II=1

                    ap_uint<64> op1_mtx[16];
                    ap_uint<64> op2_mtx[16];
#pragma HLS array_partition variable=op1_mtx complete
#pragma HLS array_partition variable=op2_mtx complete

                    // Activations arrive as packets (stages 0/1) or context (stage 3);
                    // stage 2 reads only the local scratchpads.
                    ap_uint<1024> recv_pkt;
                    if(stage == 3){
                        recv_pkt = fifo_context.read();
                    } else if(stage != 2) {
                        recv_pkt = fifo_X_in.read();
                        fifo_X_out.write(recv_pkt);
                    }

                    for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                        if(stage == 3){
                            op1_mtx[ii] = ap_uint<64>(W[k*8+ii%8][j*2+ii/8]);
                            op2_mtx[ii] = ap_uint<64>(recv_pkt(ii*64+63, ii*64));
                        } else if(stage == 2) {
                            op1_mtx[ii] = scratchpad_q[i*16+ii][k];
                            op2_mtx[ii] = scratchpad_k[j*16+ii][k];
                        } else {
                            op1_mtx[ii] = ap_uint<64>(W[j*16+ii][k]);
                            op2_mtx[ii] = ap_uint<64>(recv_pkt(ii*64+63, ii*64));
                        }
                    }

                    // Packed dual-4-bit MAC (two weights share one multiplier).
                    for(int ii = 0; ii < 8; ii++){
#pragma HLS unroll
                        for(int kk = 0; kk < 16; kk++){
#pragma HLS unroll
                            for(int l = 0; l < 8; l++){
#pragma HLS unroll
                                ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
                                op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
                                if(stage == 2){
                                    op1 = ap_int<8>(op1_mtx[l*2](ii*8+7, ii*8));
                                    op2 = ap_int<8>(op1_mtx[l*2+1](ii*8+7, ii*8));
                                } else {
                                    op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
                                    op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
                                }
                                ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
                                acc_vec[ii][kk][l] += w_pack * op3;
                            }
                        }
                    }
                }

                ap_int<22> acc_final[16][16];
#pragma HLS array_partition variable=acc_final dim=1 complete
#pragma HLS array_partition variable=acc_final dim=2 complete

                for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                    for(int k = 0; k < 16; k++){
#pragma HLS unroll
                        acc_final[ii][k] = 0;
                    }
                }

                // Unpack the two 19-bit lanes of each packed accumulator
                // (carry-correct via res0's sign bit) and reduce over lanes.
                reduction:
                for(int kk = 0; kk < 8; kk++){
                    for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                        for(int k = 0; k < 8; k++){
#pragma HLS unroll
                            ap_int<19> res0; ap_int<19> res1;
                            (res1, res0) = acc_vec[kk][ii][k];
                            res1 = res1 + res0[18];
                            acc_final[ii][k*2] += res0;
                            acc_final[ii][k*2+1] += res1;
                            if(kk == 7 && stage < 2) {
                                acc_final[ii][k*2] = acc_final[ii][k*2] >> 8; // rescale & clamp
                                acc_final[ii][k*2+1] = acc_final[ii][k*2+1] >> 8; // rescale & clamp
                            }
                        }
                    }
                }

                if(stage == 0){
                    // Store quantized Q rows.
                    for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                        for(int k = 0; k < 16; k++){
#pragma HLS unroll
                            int offset = k%8;
                            scratchpad_q[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k]);
                        }
                    }
                } else if (stage == 1){
                    // Store locally-computed K rows plus rows received from acc1.
                    for(int ii = 0; ii < 16; ii++){
                        ap_uint<128> tmp = fifo_from_acc1.read();

                        for(int k = 0; k < 16; k++){
#pragma HLS unroll
                            int offset = k%8;
                            scratchpad_k[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k]);
                        }
                        for(int k = 0; k < 2; k++){
#pragma HLS unroll
                            scratchpad_k[end + i*16 + ii][j*2+k] = ap_uint<64>(tmp(k*64+63, k*64));
                        }
                    }
                } else if(stage == 2){
                    // Stream raw attention scores (22-bit in 32-bit slots).
                    for(int ii = 0; ii < 16; ii++){
#pragma HLS pipeline II=1
                        ap_uint<512> tmp;
                        for(int kk = 0; kk < 16; kk++){
#pragma HLS unroll
                            tmp(kk*32+31, kk*32) = tapa::bit_cast<ap_uint<32>>(acc_final[ii][kk]);
                        }
                        fifo_O_out.write(tmp);
                    }
                } else {
                    // Stage 3: fold in upstream partial sums and forward the
                    // accumulated result to the next node in the reduce chain.
                    final_acc:
                    for(int ii = 0; ii < 16;){
#pragma HLS pipeline II=1
                        if(!fifo_reduce_recv.empty()){
                            ap_uint<512> tmp_recv; fifo_reduce_recv.try_read(tmp_recv);
                            ap_uint<512> tmp;
                            for(int k = 0; k < 16; k++){
#pragma HLS unroll
                                acc_final[ii][k] += ap_int<22>(tmp_recv(k*32+21, k*32));
                                tmp(k*32+21, k*32) = acc_final[ii][k];
                            }
                            fifo_reduce_send.write(tmp);
                            ii++;
                        }
                    }
                }
            }
        }
    }
    fifo_fin.write(true);
}
665
+
666
// acc1 SLR0 master node.
// Runs 20 rounds: rounds 0..14 cycle through stages 0..2, rounds 15..19 are
// stage 3 (see the `stage` computation below):
//   stage 0: Wv*X          (result kept in the local scratchpad)
//   stage 1: Wk*X -> acc0  (quantized tiles sent to the acc0 chain)
//   stage 2: Softmax(QK)*V (4-bit attention weights arrive from the SFU)
//   stage 3: Wo*O          (context from fifo_context; partial sums reduced
//                           across SLRs arrive on fifo_reduce_recv)
// Unlike the generic temporal_acc1 workers, this master node owns the X
// buffer: it loads X from fifo_X_in once (stage_i == 0), broadcasts operand
// packets to the other SLRs via fifo_X_out, and finally streams X out on
// fifo_write for debugging.
void temporal_acc1_slr0(
  const int L,
  tapa::istream<int>& fifo_len_in,
  tapa::ostream<int>& fifo_len_out,
  tapa::istream<ap_uint<512>>& fifo_X_in,
  tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
  tapa::istream<ap_uint<512>>& fifo_W_in,
  tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
  tapa::ostream<ap_uint<128>>& fifo_to_acc0,
  tapa::istream<ap_uint<128>>& fifo_from_sfu,
  tapa::ostream<ap_uint<1024>>& fifo_O_out,
  tapa::istream<ap_uint<1024>>& fifo_context,
  tapa::istream<ap_uint<512>>& fifo_reduce_recv,
  tapa::ostream<ap_uint<64>>& fifo_write,
  tapa::ostream<bool>& fifo_fin
){
  // Activation buffer: eight 8-bit values packed per 64-bit word, in URAM.
  ap_uint<64> X[MAX_SEQ_LEN][D_div_8]; // 8 bit
  #pragma HLS array_partition variable=X cyclic dim=1 factor=16
  #pragma HLS array_partition variable=X cyclic dim=2 factor=2
  #pragma HLS bind_storage variable=X type=ram_2p impl=uram

  // Stage-0 output (V tile), consumed again in stage 2.
  ap_uint<64> scratchpad[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
  #pragma HLS array_partition variable=scratchpad cyclic dim=1 factor=16
  #pragma HLS array_partition variable=scratchpad cyclic dim=2 factor=2
  #pragma HLS bind_storage variable=scratchpad type=ram_2p impl=uram

  // ap_uint<64> scratchpad_out[MAX_SEQ_LEN][D_head_div_8];
  // #pragma HLS array_partition variable=scratchpad_out cyclic dim=1 factor=16
  // #pragma HLS array_partition variable=scratchpad_out cyclic dim=2 factor=2

  for(int stage_i = 0; stage_i < 20; stage_i++){

    // stage 0: WvX
    // stage 1: WkX1 -> acc0
    // stage 2: Softmax(QK)V <- acc0
    // stage 3: WoO

    // Weight tile for this round: eight 4-bit weights per 32-bit word.
    ap_uint<32> W[D_head][D_div_8]; // 4 bit
    #pragma HLS array_partition variable=W cyclic dim=1 factor=16

    // Row range [start, end) assigned to this round; forwarded downstream.
    const int start = fifo_len_in.read();
    const int end = fifo_len_in.read();
    fifo_len_out.write(start);
    fifo_len_out.write(end);

    const int stage = (stage_i < 15) ? (stage_i % 3) : 3;

    // load weights and forward (stage 2 reuses stage-0/1 data, no weights)
    if(stage != 2) {
      for(int i = 0; i < D_head_div_4; i++){
        load_weight:
        for(int j = 0; j < D_div_8;){
          if(!fifo_W_in.empty()){
            ap_uint<512> val; fifo_W_in.try_read(val);

            for(int k = 0; k < 4; k++){
              #pragma HLS unroll
              W[i*4+k][j] = ap_uint<32>(val(k*32+31, k*32));
            }
            // Peel off the low 128 bits for this SLR, shift, and forward the
            // remainder so each downstream SLR consumes its own slice.
            val = ap_uint<512>(val >> 128);
            fifo_W_out.write(val);
            j++;
          }
        }
      }
    }

    // Inner-product trip count and output-tile count depend on the stage.
    int k_bound = (stage == 2) ? (L >> 3) : D_div_8;
    k_bound = (stage == 3) ? D_head_div_8 : k_bound;
    int j_bound = (stage == 3) ? D_div_16 : D_head_div_16;

    for(int i = (start >> 4); i < (end >> 4); i++){ // make sure L is multiple of 4

      // Per-tile 4-bit attention weights gathered from the SFU (stage 2).
      ap_uint<32> cache_attn[MAX_SEQ_LEN_div_8][16];
      #pragma HLS array_partition variable=cache_attn dim=2 complete
      #pragma HLS array_partition variable=cache_attn dim=1 cyclic factor=2

      if(stage_i == 0){
        for(int ii = 0; ii < 2; ii++){ // load only 1 time
          load_x:
          for(int jj = 0; jj < D_div_8;){
            if(!fifo_X_in.empty()){
              ap_uint<512> val; fifo_X_in.try_read(val);

              for(int k = 0; k < 8; k++){
                #pragma HLS unroll
                X[i*16+ii*8+k][jj] = ap_uint<64>(val(k*64+63, k*64));
              }
              jj++;
            }
          }
        }
      } else if (stage == 2) {
        // Fuse eight 4-bit nibbles per lane into one 32-bit register before
        // committing to cache_attn.
        for(int ii = 0; ii < (L >> 3); ii++){
          ap_uint<32> fuse_reg[16];
          load_attn:
          for(int offset = 0; offset < 8;){
            #pragma HLS pipeline II=1
            if(!fifo_from_sfu.empty()){
              ap_uint<128> val; fifo_from_sfu.try_read(val);
              for(int k = 0; k < 16; k++){
                #pragma HLS unroll
                fuse_reg[k](offset*4+3, offset*4) = ap_int<4>(val(k*8+3, k*8));
              }
              offset++;
            }
          }
          for(int k = 0; k < 16; k++){
            #pragma HLS unroll
            cache_attn[ii][k] = fuse_reg[k];
          }
        }
      }

      for(int j = 0; j < j_bound; j++){

        // 38-bit accumulators; each carries TWO packed 19-bit partial sums
        // produced by the packed multiply in the compute loop below.
        ap_int<38> acc_vec[8][16][8];
        #pragma HLS array_partition variable=acc_vec dim=1 complete
        #pragma HLS array_partition variable=acc_vec dim=2 complete
        #pragma HLS array_partition variable=acc_vec dim=3 complete

        for(int ii = 0; ii < 8; ii++){
          #pragma HLS unroll
          for(int kk = 0; kk < 16; kk++){
            #pragma HLS unroll
            for(int k = 0; k < 8; k++){
              #pragma HLS unroll
              acc_vec[ii][kk][k] = 0;
            }
          }
        }

        compute:
        for(int k = 0; k < k_bound; k++){
          #pragma HLS pipeline II=1

          ap_uint<64> op1_mtx[16];
          ap_uint<64> op2_mtx[16];
          #pragma HLS array_partition variable=op1_mtx complete
          #pragma HLS array_partition variable=op2_mtx complete

          ap_uint<1024> recv_pkt;

          if(stage == 3) {
            recv_pkt = fifo_context.read();
          }

          // Operand selection per stage: weights or cached attention on
          // port 1; X, context packet, or scratchpad (V) on port 2.
          for(int ii = 0; ii < 16; ii++){
            #pragma HLS unroll
            if(stage == 3){
              op1_mtx[ii] = ap_uint<64>(W[k*8+ii%8][j*2+ii/8]);
              op2_mtx[ii] = recv_pkt(ii*64+63, ii*64);
            } else if(stage != 2) {
              op1_mtx[ii] = ap_uint<64>(W[j*16+ii][k]);
              op2_mtx[ii] = X[i*16+ii][k];
            } else {
              op1_mtx[ii] = ap_uint<64>(cache_attn[k][ii]);
              op2_mtx[ii] = scratchpad[k*8+ii/2][j*2+(ii%2)];
            }
          }

          // Broadcast this node's activation slice to the worker SLRs.
          if(stage < 2){
            ap_uint<1024> send_pkt = ap_uint<1024>((
              op2_mtx[0], op2_mtx[1], op2_mtx[2], op2_mtx[3], op2_mtx[4], op2_mtx[5], op2_mtx[6], op2_mtx[7],
              op2_mtx[8], op2_mtx[9], op2_mtx[10], op2_mtx[11], op2_mtx[12], op2_mtx[13], op2_mtx[14], op2_mtx[15]
            ));
            fifo_X_out.write(send_pkt);
          }

          // Packed MAC: two signed 4-bit weights share one multiplier by
          // placing op2 19 bits above op1 (w_pack = (op2 << 19) + op1); the
          // 38-bit accumulator then holds both partial products.
          for(int ii = 0; ii < 8; ii++){
            #pragma HLS unroll
            for(int kk = 0; kk < 16; kk++){
              #pragma HLS unroll
              for(int l = 0; l < 8; l++){
                #pragma HLS unroll
                ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
                op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
                op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
                op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
                ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
                acc_vec[ii][kk][l] += w_pack * op3;
              }
            }
          }
        }

        ap_int<22> acc_final[16][16];
        #pragma HLS array_partition variable=acc_final dim=1 complete
        #pragma HLS array_partition variable=acc_final dim=2 complete

        for(int ii = 0; ii < 16; ii++){
          #pragma HLS unroll
          for(int k = 0; k < 16; k++){
            #pragma HLS unroll
            acc_final[ii][k] = 0;
          }
        }

        // Unpack the two 19-bit halves of each accumulator; res1 absorbs a
        // carry correction from res0's sign bit. On the last step, rescale
        // by >> 8 except in stage 3 (stage 3 is rescaled after the
        // cross-SLR reduction in final_acc).
        reduction:
        for(int kk = 0; kk < 8; kk++){
          for(int ii = 0; ii < 16; ii++){
            #pragma HLS unroll
            for(int k = 0; k < 8; k++){
              #pragma HLS unroll
              ap_int<19> res0; ap_int<19> res1;
              (res1, res0) = acc_vec[kk][ii][k];
              res1 = res1 + res0[18];
              acc_final[ii][k*2] += res0;
              acc_final[ii][k*2+1] += res1;
              if(kk == 7 && stage != 3) {
                acc_final[ii][k*2] = acc_final[ii][k*2] >> 8; // rescale & clamp
                acc_final[ii][k*2+1] = acc_final[ii][k*2+1] >> 8; // rescale & clamp
              }
            }
          }
        }

        if(stage == 0){
          // Keep the V tile local for stage 2 (note the transposed write).
          for(int ii = 0; ii < 16; ii++){
            #pragma HLS unroll
            for(int k = 0; k < 16; k++){
              #pragma HLS unroll
              int offset = k%8;
              scratchpad[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[k][ii]);
            }
          }
        } else if (stage == 2){
          // Emit the context tile (128 bytes per 1024-bit packet).
          for(int ii = 0; ii < 2; ii++){
            #pragma HLS pipeline II=1
            ap_uint<1024> tmp;
            for(int jj = 0; jj < 8; jj++){
              #pragma HLS unroll
              for(int k = 0; k < 16; k++){
                #pragma HLS unroll
                tmp((jj*16+k)*8+7, (jj*16+k)*8) = ap_int<8>(acc_final[k][ii*8+jj]);
              }
            }
            fifo_O_out.write(tmp);
          }
        } else if (stage == 1) {
          // Send the K tile to the acc0 chain for the QK product.
          for(int ii = 0; ii < 16; ii++){
            ap_uint<128> tmp;
            for(int k = 0; k < 16; k++){
              #pragma HLS unroll
              tmp(k*8+7, k*8) = ap_int<8>(acc_final[ii][k]);
            }
            fifo_to_acc0.write(tmp);
          }
        } else {
          // stage 3: fold in partial sums reduced over the other SLRs,
          // rescale by >> 8, and write the result back into X in place.
          final_acc:
          for(int ii = 0; ii < 16;){
            #pragma HLS pipeline II=1
            #pragma HLS dependence variable=X type=inter false
            if(!fifo_reduce_recv.empty()){
              ap_uint<512> tmp_recv; fifo_reduce_recv.try_read(tmp_recv);
              for(int k = 0; k < 16; k++){
                #pragma HLS unroll
                acc_final[ii][k] += ap_int<22>(tmp_recv(k*32+21, k*32));
                X[i*16+ii][j*2+k/8]((k%8)*8+7, (k%8)*8) = ap_int<8>(acc_final[ii][k] >> 8); //TODO: change
              }

              ii++;
            }
          }
        }
      }
    }
  }
  fifo_fin.write(true);

  // write out for debug
  write:
  for(int i = 0; i < L; i++){
    for(int j = 0; j < D_div_8; j++){
      #pragma HLS pipeline II=1
      fifo_write.write(X[i][j]);
    }
  }
}
946
+
947
// acc1 worker node (instantiated once per non-master SLR).
// Same 4-stage schedule as temporal_acc1_slr0, but this worker has no local
// X buffer: activation packets arrive pre-broadcast on fifo_X_in and are
// forwarded on fifo_X_out. Stage-3 partial sums are accumulated with the
// value received on fifo_reduce_recv and passed along on fifo_reduce_send
// (the master node terminates the reduction chain).
void temporal_acc1(
  const int L,
  tapa::istream<int>& fifo_len_in,
  tapa::ostream<int>& fifo_len_out,
  tapa::istream<ap_uint<1024>>& fifo_X_in,
  tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
  tapa::istream<ap_uint<512>>& fifo_W_in,
  tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
  tapa::ostream<ap_uint<128>>& fifo_to_acc0,
  tapa::istream<ap_uint<128>>& fifo_from_sfu,
  tapa::ostream<ap_uint<1024>>& fifo_O_out,
  tapa::istream<ap_uint<1024>>& fifo_context,
  tapa::istream<ap_uint<512>>& fifo_reduce_recv,
  tapa::ostream<ap_uint<512>>& fifo_reduce_send,
  tapa::ostream<bool>& fifo_fin
){

  // Stage-0 output (V tile), consumed again in stage 2.
  ap_uint<64> scratchpad[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
  #pragma HLS array_partition variable=scratchpad cyclic dim=1 factor=16
  #pragma HLS array_partition variable=scratchpad cyclic dim=2 factor=2
  #pragma HLS bind_storage variable=scratchpad type=ram_2p impl=uram

  // ap_uint<64> scratchpad_out[MAX_SEQ_LEN][D_head_div_8];
  // #pragma HLS array_partition variable=scratchpad_out cyclic dim=1 factor=16
  // #pragma HLS array_partition variable=scratchpad_out cyclic dim=2 factor=2

  for(int stage_i = 0; stage_i < 20; stage_i++){

    // stage 0: WvX
    // stage 1: WkX1 -> acc0
    // stage 2: Softmax(QK)V <- acc0
    // stage 3: WoO

    // Weight tile for this round: eight 4-bit weights per 32-bit word.
    ap_uint<32> W[D_head][D_div_8]; // 4 bit
    #pragma HLS array_partition variable=W cyclic dim=1 factor=16

    // Row range [start, end) assigned to this round; forwarded downstream.
    const int start = fifo_len_in.read();
    const int end = fifo_len_in.read();
    fifo_len_out.write(start);
    fifo_len_out.write(end);

    const int stage = (stage_i < 15) ? (stage_i % 3) : 3;

    // load weights and forward
    if(stage != 2) {
      for(int i = 0; i < D_head_div_4; i++){
        load_weight:
        for(int j = 0; j < D_div_8;){
          if(!fifo_W_in.empty()){
            ap_uint<512> val; fifo_W_in.try_read(val);

            for(int k = 0; k < 4; k++){
              #pragma HLS unroll
              W[i*4+k][j] = ap_uint<32>(val(k*32+31, k*32));
            }
            // Peel off the low 128 bits, shift, forward the rest downstream.
            val = ap_uint<512>(val >> 128);
            fifo_W_out.write(val);
            j++;
          }
        }
      }
    }

    // Inner-product trip count and output-tile count depend on the stage.
    int k_bound = (stage == 2) ? (L >> 3) : D_div_8;
    k_bound = (stage == 3) ? D_head_div_8 : k_bound;
    int j_bound = (stage == 3) ? D_div_16 : D_head_div_16;

    for(int i = (start >> 4); i < (end >> 4); i++){ // make sure L is multiple of 4

      // Per-tile 4-bit attention weights gathered from the SFU (stage 2).
      ap_uint<32> cache_attn[MAX_SEQ_LEN_div_8][16];
      #pragma HLS array_partition variable=cache_attn dim=2 complete
      #pragma HLS array_partition variable=cache_attn dim=1 cyclic factor=2

      if(stage == 2){
        for(int ii = 0; ii < (L >> 3); ii++){
          ap_uint<32> fuse_reg[16];
          load_attn:
          for(int offset = 0; offset < 8;){
            #pragma HLS pipeline II=1
            if(!fifo_from_sfu.empty()){
              ap_uint<128> val; fifo_from_sfu.try_read(val);
              for(int k = 0; k < 16; k++){
                #pragma HLS unroll
                fuse_reg[k](offset*4+3, offset*4) = ap_int<4>(val(k*8+3, k*8));
              }
              offset++;
            }
          }
          for(int k = 0; k < 16; k++){
            #pragma HLS unroll
            cache_attn[ii][k] = fuse_reg[k];
          }
        }
      }

      for(int j = 0; j < j_bound; j++){

        // 38-bit accumulators; each carries TWO packed 19-bit partial sums.
        ap_int<38> acc_vec[8][16][8];
        #pragma HLS array_partition variable=acc_vec dim=1 complete
        #pragma HLS array_partition variable=acc_vec dim=2 complete
        #pragma HLS array_partition variable=acc_vec dim=3 complete

        for(int ii = 0; ii < 8; ii++){
          #pragma HLS unroll
          for(int kk = 0; kk < 16; kk++){
            #pragma HLS unroll
            for(int k = 0; k < 8; k++){
              #pragma HLS unroll
              acc_vec[ii][kk][k] = 0;
            }
          }
        }

        compute:
        for(int k = 0; k < k_bound; k++){
          #pragma HLS pipeline II=1

          ap_uint<64> op1_mtx[16];
          ap_uint<64> op2_mtx[16];
          #pragma HLS array_partition variable=op1_mtx complete
          #pragma HLS array_partition variable=op2_mtx complete

          ap_uint<1024> recv_pkt;

          if(stage == 3) {
            recv_pkt = fifo_context.read();
          } else if(stage != 2) {
            // Activation packet broadcast by the master; consume and relay.
            recv_pkt = fifo_X_in.read();
            fifo_X_out.write(recv_pkt);
          }

          for(int ii = 0; ii < 16; ii++){ //TODO: change logic
            #pragma HLS unroll
            if(stage == 3){
              op1_mtx[ii] = ap_uint<64>(W[k*8+ii%8][j*2+ii/8]);
              op2_mtx[ii] = recv_pkt(ii*64+63, ii*64);
            } else if(stage != 2) {
              op1_mtx[ii] = ap_uint<64>(W[j*16+ii][k]);
              op2_mtx[ii] = recv_pkt(ii*64+63, ii*64);
            } else {
              op1_mtx[ii] = ap_uint<64>(cache_attn[k][ii]);
              op2_mtx[ii] = scratchpad[k*8+ii/2][j*2+(ii%2)];
            }
          }

          // Packed MAC: two signed 4-bit weights share one multiplier via
          // w_pack = (op2 << 19) + op1; the 38-bit accumulator holds both
          // partial products.
          for(int ii = 0; ii < 8; ii++){
            #pragma HLS unroll
            for(int kk = 0; kk < 16; kk++){
              #pragma HLS unroll
              for(int l = 0; l < 8; l++){
                #pragma HLS unroll
                ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
                op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
                op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
                op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
                ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
                acc_vec[ii][kk][l] += w_pack * op3;
              }
            }
          }
        }

        ap_int<22> acc_final[16][16];
        #pragma HLS array_partition variable=acc_final dim=1 complete
        #pragma HLS array_partition variable=acc_final dim=2 complete

        for(int ii = 0; ii < 16; ii++){
          #pragma HLS unroll
          for(int k = 0; k < 16; k++){
            #pragma HLS unroll
            acc_final[ii][k] = 0;
          }
        }

        // Unpack the two 19-bit halves (res1 gets a carry correction from
        // res0's sign bit); rescale by >> 8 on the last step except in
        // stage 3, which is rescaled after the cross-SLR reduction.
        reduction:
        for(int kk = 0; kk < 8; kk++){
          for(int ii = 0; ii < 16; ii++){
            #pragma HLS unroll
            for(int k = 0; k < 8; k++){
              #pragma HLS unroll
              ap_int<19> res0; ap_int<19> res1;
              (res1, res0) = acc_vec[kk][ii][k];
              res1 = res1 + res0[18];
              acc_final[ii][k*2] += res0;
              acc_final[ii][k*2+1] += res1;
              if(kk == 7 && stage != 3) {
                acc_final[ii][k*2] = acc_final[ii][k*2] >> 8; // rescale & clamp
                acc_final[ii][k*2+1] = acc_final[ii][k*2+1] >> 8; // rescale & clamp
              }
            }
          }
        }

        if(stage == 0){
          // Keep the V tile local for stage 2 (transposed write).
          for(int ii = 0; ii < 16; ii++){
            #pragma HLS unroll
            for(int k = 0; k < 16; k++){
              #pragma HLS unroll
              int offset = k%8;
              scratchpad[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[k][ii]);
            }
          }
        } else if (stage == 2){
          // Emit the context tile (128 bytes per 1024-bit packet).
          for(int ii = 0; ii < 2; ii++){
            #pragma HLS pipeline II=1
            ap_uint<1024> tmp;
            for(int jj = 0; jj < 8; jj++){
              #pragma HLS unroll
              for(int k = 0; k < 16; k++){
                #pragma HLS unroll
                tmp((jj*16+k)*8+7, (jj*16+k)*8) = ap_int<8>(acc_final[k][ii*8+jj]);
              }
            }
            fifo_O_out.write(tmp);
          }
        } else if (stage == 1){
          // Send the K tile to the acc0 chain for the QK product.
          for(int ii = 0; ii < 16; ii++){
            ap_uint<128> tmp;
            for(int k = 0; k < 16; k++){
              #pragma HLS unroll
              tmp(k*8+7, k*8) = ap_int<8>(acc_final[ii][k]);
            }
            fifo_to_acc0.write(tmp);
          }
        } else {
          // stage 3: fold the received partial sums into this node's
          // accumulators and forward the running sums along the chain.
          final_acc:
          for(int ii = 0; ii < 16;){
            #pragma HLS pipeline II=1
            if(!fifo_reduce_recv.empty()){
              ap_uint<512> tmp_recv; fifo_reduce_recv.try_read(tmp_recv);
              ap_uint<512> tmp;
              for(int k = 0; k < 16; k++){
                #pragma HLS unroll
                acc_final[ii][k] += ap_int<22>(tmp_recv(k*32+21, k*32));
                tmp(k*32+21, k*32) = acc_final[ii][k];
              }
              fifo_reduce_send.write(tmp);
              ii++;
            }
          }
        }
      }
    }
  }
  fifo_fin.write(true);

  // write out for debug
  // write:
  // for(int i = 0; i < L; i++){
  //   for(int j = 0; j < D_head_div_8; j++){
  //     #pragma HLS pipeline II=1
  //     fifo_O_out.write(scratchpad_out[i][j]);
  //   }
  // }
}
1202
+
1203
// SFU accumulation buffer (double buffering: two instances per SLR, selected
// upstream/downstream via fifo index l%2).
// Per 32-row tile: caches L incoming float vectors and accumulates their
// per-lane sum (partitioned 8 ways to break the FP-add dependence; see the
// dependence pragmas with distance=8), then emits one packet of lane sums
// followed by the L cached vectors.
void sfu_buffer( // double buffering
  const int L,
  tapa::istream<ap_uint<512>>& fifo_data_in,
  tapa::ostream<ap_uint<512>>& fifo_data_out
){
  for(int stage = 0; stage < 5; stage++){

    for(int l = 0; l < (L >> 5); l++){
      // sum[i%8][k]: 8-way partial sums per lane; cache: raw float replay.
      float sum[8][16];
      float cache[MAX_SEQ_LEN][16];
      #pragma HLS array_partition variable=cache dim=2 complete
      #pragma HLS array_partition variable=sum dim=2 complete

      for(int i = 0; i < 8; i++){
        for(int j = 0; j < 16; j++){
          #pragma HLS unroll
          sum[i][j] = 0.0;
        }
      }

      // Accumulate with a carried dependence of distance 8 (one partial sum
      // per i%8 bank) so the FP adder pipelines at II=1.
      acc:
      for(int i = 0; i < L; i++){
        #pragma HLS pipeline II=1
        #pragma HLS dependence false variable=sum
        #pragma HLS dependence true variable=sum distance=8
        ap_uint<512> tmp = fifo_data_in.read();
        for(int k = 0; k < 16; k++){
          #pragma HLS unroll
          float res = tapa::bit_cast<float>(ap_int<32>(tmp(k*32+31, k*32)));
          sum[i%8][k] += res;
          cache[i][k] = res;
        }
      }

      // Fold the 8 partial-sum banks into sum[0].
      reduce:
      for(int i = 1; i < 8; i++){
        for(int j = 0; j < 8; j++){
          #pragma HLS pipeline II=1
          #pragma HLS dependence true variable=sum distance=8
          for(int k = 0; k < 2; k++){
            sum[0][j*2+k] += sum[i][j*2+k];
          }
        }
      }

      // Emit the 16 lane sums first, then replay the cached vectors.
      ap_uint<512> tmp;
      for(int i = 0; i < 16; i++){
        #pragma HLS unroll
        tmp(i*32+31, i*32) = tapa::bit_cast<ap_uint<32>>(sum[0][i]);
      }
      fifo_data_out.write(tmp);

      write:
      for(int i = 0; i < L; i++){
        #pragma HLS pipeline II=1
        ap_uint<512> tmp;
        for(int j = 0; j < 16; j++){
          #pragma HLS unroll
          tmp(j*32+31, j*32) = tapa::bit_cast<ap_uint<32>>(cache[i][j]);
        }
        fifo_data_out.write(tmp);
      }

    }
  }

}
1270
+
1271
// SFU front end: exponentiates incoming fixed-point scores and alternates
// output tiles between the two sfu_buffer instances (fifo_buf[l%2]).
// Each 512-bit word carries 16 lanes; every lane is right-shifted by 10
// (descale of the fixed-point score) before hls::exp.
void sfu_acc_exp(
  const int L,
  tapa::istream<ap_uint<512>>& fifo_data_in,
  tapa::ostreams<ap_uint<512>, 2>& fifo_buf
) {
  for(int stage = 0; stage < 5; stage++){

    for(int l = 0; l < (L >> 4); l++){
      exp_acc:
      for(int i = 0; i < L;){
        #pragma HLS pipeline II=1
        // Non-blocking read keeps the loop flushable at II=1.
        if(!fifo_data_in.empty()){
          ap_uint<512> tmp; fifo_data_in.try_read(tmp);
          ap_uint<512> tmp_o;
          for(int k = 0; k < 16; k++){
            #pragma HLS unroll
            int res = tapa::bit_cast<int>(ap_int<32>(tmp(k*32+31, k*32)));
            float res_exp = 0.0;
            res_exp = hls::exp(ap_int<32>(res >> 10));
            tmp_o(k*32+31, k*32) = tapa::bit_cast<ap_uint<32>>(res_exp);
          }
          fifo_buf[l%2].write(tmp_o);
          i++;
        }
      }
    }
  }
}
1299
+
1300
// SFU back end: normalizes exponentiated scores into 4-bit-scaled 8-bit
// attention weights. The first packet from sfu_buffer holds the 16 lane
// sums; each weight is computed as exp(x) * (32 / lane_sum) and truncated
// to ap_int<8> (the low nibble is what the accumulators consume).
void sfu_norm(
  const int L,
  tapa::istreams<ap_uint<512>, 2>& fifo_buf,
  tapa::ostream<ap_uint<128>>& fifo_data_out
){
  for(int stage = 0; stage < 5; stage++){

    for(int l = 0; l < (L >> 4); l++){
      float sum[16];
      #pragma HLS array_partition variable=sum complete

      // Header packet: per-lane totals produced by sfu_buffer.
      ap_uint<512> tmp_in = fifo_buf[l%2].read();

      for(int i = 0; i < 16; i++){
        #pragma HLS unroll
        // 32.0 scales the normalized weight into the quantized range.
        sum[i] = 32.0 / tapa::bit_cast<float>(ap_uint<32>(tmp_in(i*32+31, i*32)));
      }

      for(int i = 0; i < L;){
        #pragma HLS pipeline II=1
        if(!fifo_buf[l%2].empty()){
          ap_uint<512> tmp_cache; fifo_buf[l%2].try_read(tmp_cache);
          ap_uint<128> tmp;
          for(int j = 0; j < 16; j++){
            #pragma HLS unroll
            ap_int<8> res = (int) (tapa::bit_cast<float>(ap_uint<32>(tmp_cache(j*32+31, j*32))) * sum[j]);
            tmp(j*8 + 7, j*8) = res;
          }
          fifo_data_out.write(tmp);
          i++;
        }
      }
    }
  }
}
1335
+
1336
// Buffers the per-head context tiles produced by acc1 (stage 2) across all
// 5 rounds, then replays them as 16-row operand packets to both accelerator
// chains: even 16-row halves of each 32-row tile go to acc0, odd halves to
// acc1.
void context_buffer(
  const int L,
  tapa::istream<ap_uint<1024>>& fifo_context,
  tapa::ostream<ap_uint<1024>>& fifo_to_acc0,
  tapa::ostream<ap_uint<1024>>& fifo_to_acc1
){
  ap_uint<64> context[MAX_SEQ_LEN][CONTEXT_D];
  #pragma HLS array_partition variable=context cyclic dim=1 factor=32
  #pragma HLS bind_storage variable=context type=ram_2p impl=uram

  // Fill: round `stage` lands in columns [stage*D_head_div_8, (stage+1)*...).
  for(int stage = 0; stage < 5; stage++){
    for(int i = 0; i < (L >> 4); i++){
      for(int j = stage * D_head_div_8; j < (stage + 1) * D_head_div_8;){
        if(!fifo_context.empty()){
          ap_uint<1024> tmp; fifo_context.try_read(tmp);
          for(int ii = 0; ii < 16; ii++){
            #pragma HLS unroll
            context[i*16+ii][j] = tmp(ii*64+63, ii*64);
          }
          j++;
        }
      }
    }
  }

  // NOTE: change it to write to HBM for debugging
  // write ops to acc0 and acc1 in parallel
  // NOTE(review): the replay below indexes column `j` in [0, D_head_div_8)
  // regardless of `stage`, i.e. only the first round's columns are re-read
  // on every stage iteration — confirm this is the intended replay pattern.
  for(int stage = 0; stage < 5; stage++){
    for(int i = 0; i < (L >> 5); i++){
      for(int l = 0; l < D_div_16; l++){
        for(int j = 0; j < D_head_div_8; j++){
          ap_uint<1024> tmp_acc0;
          ap_uint<1024> tmp_acc1;
          for(int k = 0; k < 16; k++){
            #pragma HLS unroll
            tmp_acc0(k*64+63, k*64) = context[i*32+k][j];
            tmp_acc1(k*64+63, k*64) = context[i*32+16+k][j];
          }
          fifo_to_acc0.write(tmp_acc0);
          fifo_to_acc1.write(tmp_acc1);
        }
      }
    }
  }

}
1382
+
1383
+ void measure_cycle(tapa::istreams<bool, TOTAL_PORT>& fifo_fin, tapa::mmap<int> cycle_count){
1384
+ for(int cycle = 0;;cycle++){
1385
+ bool flag_cont = false;
1386
+ for(int i = 0; i < TOTAL_PORT; i++){
1387
+ flag_cont |= fifo_fin[i].empty();
1388
+ }
1389
+ if(!flag_cont){
1390
+ for(int i = 0; i < TOTAL_PORT; i++){
1391
+ fifo_fin[i].read(nullptr);
1392
+ }
1393
+ cycle_count[0] = cycle;
1394
+ break;
1395
+ }
1396
+ }
1397
+ }
1398
+
1399
// Top-level TAPA task graph for the accelerator.
// Two accelerator chains (acc0/acc1) span NUM_SLR SLRs; the *_slr0 variants
// are the master nodes that own DRAM-facing X buffers and terminate the
// stage-3 reduction chains. The SFU path (sfu_acc_exp -> sfu_buffer x2 ->
// sfu_norm) implements the softmax between the chains, and context_buffer
// stages the attention output for the Wo projection.
void opt_kernel(
  const int L,
  const int L_out,
  const int seq_len,
  // tapa::mmap<int> inst, // inst[0] = L, inst[1] = reload_weight
  tapa::mmap<ap_uint<512>> X_acc0,
  tapa::mmap<ap_uint<512>> X_acc1,
  tapa::mmap<ap_uint<512>> W_acc0,
  tapa::mmap<ap_uint<512>> W_acc1,
  tapa::mmap<ap_uint<64>> acc0_out,
  tapa::mmap<ap_uint<64>> acc1_out,
  tapa::mmap<int> cycle_count
){
  // Daisy-chained instruction/length streams (one extra endpoint for the
  // terminating black_hole detach tasks).
  tapa::streams<int, NUM_SLR+1, 4> fifo_inst_acc0("fifo_inst_acc0");
  tapa::streams<int, NUM_SLR+1, 4> fifo_inst_acc1("fifo_inst_acc1");
  // DRAM -> master-node activation streams.
  tapa::stream<ap_uint<512>, 16> fifo_X_acc0_slr0("fifo_X_acc0_slr0");
  tapa::stream<ap_uint<512>, 16> fifo_X_acc1_slr0("fifo_X_acc1_slr0");
  // Master -> worker activation broadcast chains.
  tapa::streams<ap_uint<1024>, NUM_SLR, 4> fifo_X_acc0("fifo_X_acc0");
  tapa::streams<ap_uint<1024>, NUM_SLR, 4> fifo_X_acc1("fifo_X_acc1");
  // Weight distribution chains (each node peels off its 128-bit slice).
  tapa::streams<ap_uint<512>, NUM_SLR+1, 8> fifo_W_acc0("fifo_W_acc0");
  tapa::streams<ap_uint<512>, NUM_SLR+1, 8> fifo_W_acc1("fifo_W_acc1");
  // tapa::streams<ap_uint<512>, NUM_SLR, 4> fifo_acc0_out("fifo_acc0_out");
  tapa::streams<ap_uint<512>, NUM_SLR> fifo_acc0_to_sfu("fifo_acc0_to_sfu");
  tapa::streams<ap_uint<512>, NUM_SLR*2> fifo_sfu_buf_in("fifo_sfu_buf_in");
  tapa::streams<ap_uint<512>, NUM_SLR*2> fifo_sfu_buf_out("fifo_sfu_buf_out");
  // tapa::streams<ap_uint<64>, NUM_SLR> fifo_acc1_out("fifo_acc1_out");
  tapa::streams<ap_uint<128>, NUM_SLR, 2> fifo_from_acc1_to_acc0("fifo_from_acc1_to_acc0");
  tapa::streams<ap_uint<128>, NUM_SLR, 2> fifo_from_sfu_to_acc1("fifo_from_sfu_to_acc1");
  tapa::streams<bool, NUM_SLR*2> fifo_fin("fifo_fin");

  // Attention-output (context) staging and stage-3 reduction chains.
  tapa::streams<ap_uint<1024>, NUM_SLR> fifo_context("fifo_context");
  tapa::streams<ap_uint<1024>, NUM_SLR> fifo_cont_to_acc0("fifo_cont_to_acc0");
  tapa::streams<ap_uint<1024>, NUM_SLR> fifo_cont_to_acc1("fifo_cont_to_acc1");
  tapa::streams<ap_uint<512>, NUM_SLR> fifo_reduce_acc0("fifo_reduce_acc0");
  tapa::streams<ap_uint<512>, NUM_SLR> fifo_reduce_acc1("fifo_reduce_acc1");

  // Debug write-back streams from the master nodes.
  tapa::stream<ap_uint<64>> fifo_acc0_out("fifo_acc0_out");
  tapa::stream<ap_uint<64>> fifo_acc1_out("fifo_acc1_out");

  tapa::task()
    .invoke<tapa::join>(read_inst, seq_len, fifo_inst_acc0, fifo_inst_acc1)
    .invoke<tapa::join>(read_W, TOTAL_WEIGHT_SIZE, W_acc0, fifo_W_acc0)
    .invoke<tapa::join>(read_W, TOTAL_WEIGHT_SIZE, W_acc1, fifo_W_acc1)
    .invoke<tapa::join>(read_X, L, X_acc0, fifo_X_acc0_slr0)
    .invoke<tapa::join>(read_X, L, X_acc1, fifo_X_acc1_slr0)
    .invoke<tapa::join>(
      temporal_acc0_slr0,
      seq_len,
      fifo_inst_acc0, fifo_inst_acc0,
      fifo_X_acc0_slr0, fifo_X_acc0,
      fifo_W_acc0, fifo_W_acc0,
      fifo_from_acc1_to_acc0,
      fifo_acc0_to_sfu,
      fifo_cont_to_acc0,
      fifo_reduce_acc0,
      fifo_acc0_out,
      fifo_fin
    )
    .invoke<tapa::join>(
      temporal_acc1_slr0,
      seq_len,
      fifo_inst_acc1, fifo_inst_acc1,
      fifo_X_acc1_slr0, fifo_X_acc1,
      fifo_W_acc1, fifo_W_acc1,
      fifo_from_acc1_to_acc0,
      fifo_from_sfu_to_acc1,
      fifo_context,
      fifo_cont_to_acc1,
      fifo_reduce_acc1,
      fifo_acc1_out,
      fifo_fin
    )
    .invoke<tapa::join, NUM_SLR-1>(
      temporal_acc0,
      seq_len,
      fifo_inst_acc0, fifo_inst_acc0,
      fifo_X_acc0, fifo_X_acc0,
      fifo_W_acc0, fifo_W_acc0,
      fifo_from_acc1_to_acc0,
      fifo_acc0_to_sfu,
      fifo_cont_to_acc0,
      fifo_reduce_acc0, fifo_reduce_acc0,
      fifo_fin
    )
    .invoke<tapa::join, NUM_SLR-1>(
      temporal_acc1,
      seq_len,
      fifo_inst_acc1, fifo_inst_acc1,
      fifo_X_acc1, fifo_X_acc1,
      fifo_W_acc1, fifo_W_acc1,
      fifo_from_acc1_to_acc0,
      fifo_from_sfu_to_acc1,
      fifo_context,
      fifo_cont_to_acc1,
      fifo_reduce_acc1, fifo_reduce_acc1,
      fifo_fin
    )
    // Seed the stage-3 reduction chains with zeros.
    .invoke<tapa::join>(write_zero, seq_len, fifo_reduce_acc0)
    .invoke<tapa::join>(write_zero, seq_len, fifo_reduce_acc1)
    .invoke<tapa::join, NUM_SLR>(
      sfu_acc_exp, seq_len,
      fifo_acc0_to_sfu,
      fifo_sfu_buf_in
    )
    .invoke<tapa::join, NUM_SLR*2>(
      sfu_buffer, seq_len,
      fifo_sfu_buf_in,
      fifo_sfu_buf_out
    )
    .invoke<tapa::join, NUM_SLR>(
      sfu_norm, seq_len,
      fifo_sfu_buf_out,
      fifo_from_sfu_to_acc1
    )
    .invoke<tapa::join, NUM_SLR>(
      context_buffer, seq_len,
      fifo_context,
      fifo_cont_to_acc0, fifo_cont_to_acc1
    )
    // .invoke<tapa::join, NUM_SLR>(write_attention, seq_len, acc0_out, fifo_acc0_out)
    .invoke<tapa::join>(write_mtx, L_out, acc0_out, fifo_acc0_out)
    .invoke<tapa::join>(write_mtx, L_out, acc1_out, fifo_acc1_out)
    .invoke<tapa::join>(measure_cycle, fifo_fin, cycle_count)
    // Terminate the open chain tails so nothing backpressures.
    .invoke<tapa::detach>(black_hole_int, fifo_inst_acc0)
    .invoke<tapa::detach>(black_hole_int, fifo_inst_acc1)
    .invoke<tapa::detach>(black_hole_ap_uint_1024, fifo_X_acc0)
    .invoke<tapa::detach>(black_hole_ap_uint_1024, fifo_X_acc1)
    .invoke<tapa::detach>(black_hole_ap_uint_512, fifo_W_acc0)
    .invoke<tapa::detach>(black_hole_ap_uint_512, fifo_W_acc1);
}
gpt-2-medium/link_config_versal.ini ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ [connectivity]
2
+ sp=opt_kernel.X_acc0:DDR
3
+ sp=opt_kernel.X_acc1:DDR
4
+ sp=opt_kernel.W_acc0:DDR
5
+ sp=opt_kernel.W_acc1:DDR
6
+ sp=opt_kernel.acc0_out:DDR
7
+ sp=opt_kernel.cycle_count:DDR
gpt-2-medium/opt-versal-rs.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from rapidstream import RapidStreamTAPA, DeviceFactory

# RapidStream DSE driver for the opt_kernel TAPA design on a Versal part,
# partitioned into a 2-column x 4-row grid of virtual slots (half an SLR
# worth of clock regions per slot).
rs = RapidStreamTAPA("rs_build/")
rs.reset()
factory = DeviceFactory(
    row=4,
    col=2,
    part_num="xcvp1802-lsvc4072-2MP-e-S"
)

# Set the pblocks of the device so that each slot contains half of an SLR:
factory.set_slot_pblock(0, 0, ["-add CLOCKREGION_X0Y1:CLOCKREGION_X4Y4"])
factory.set_slot_pblock(1, 0, ["-add CLOCKREGION_X5Y1:CLOCKREGION_X9Y4"])
factory.set_slot_pblock(0, 1, ["-add CLOCKREGION_X0Y5:CLOCKREGION_X4Y7"])
factory.set_slot_pblock(1, 1, ["-add CLOCKREGION_X5Y5:CLOCKREGION_X9Y7"])

factory.set_slot_pblock(0, 2, ["-add CLOCKREGION_X0Y8:CLOCKREGION_X4Y10"])
factory.set_slot_pblock(1, 2, ["-add CLOCKREGION_X5Y8:CLOCKREGION_X9Y10"])
factory.set_slot_pblock(0, 3, ["-add CLOCKREGION_X0Y11:CLOCKREGION_X4Y13"])
factory.set_slot_pblock(1, 3, ["-add CLOCKREGION_X5Y11:CLOCKREGION_X9Y13"])

# There are 18870 total SLL nodes for VP1552; each slot boundary is given
# half of that budget.
# NOTE(review): the SLL figure is quoted for VP1552, but part_num above is
# xcvp1802 — confirm the per-boundary capacity for this part.
factory.set_slot_crossing_capacity(0, 0, north=9435)
factory.set_slot_crossing_capacity(1, 0, north=9435)
factory.set_slot_crossing_capacity(0, 1, north=9435)
factory.set_slot_crossing_capacity(1, 1, north=9435)
factory.set_slot_crossing_capacity(0, 2, north=9435)
factory.set_slot_crossing_capacity(1, 2, north=9435)

# Call factory to extract the slot resources automatically from Vivado:
factory.extract_slot_resources()

# The device can be supplied as the virtual device for the RapidStream APIs:
device = factory.generate_virtual_device()
rs.set_virtual_device(device)

rs.add_xo_file("./opt-stage4-dot-prod.tapa/opt.hw.xo")
rs.set_top_module_name("opt_kernel")
rs.add_clock("ap_clk", period_ns=3.33)

rs.set_vitis_connectivity_config("link_config_versal.ini")
# Pin all external ports to the bottom row of slots.
rs.assign_port_to_region(".*", "SLOT_X0Y0:SLOT_X1Y0")
rs.run_dse(max_workers=1, max_dse_limit=0.9, min_dse_limit=0.6)
gpt-2-medium/package_sample.sh ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Vitis package step for the opt_kernel vpk180 (Versal) build: wraps the
# linked .xsa together with the emulation config, host binary, and run
# scripts into an SD-card image plus the final .xclbin.
# TARGET=hw
TARGET=hw_emu
DEBUG=-g

TOP=opt_kernel
XO='/path/to/opt_kernel.xo'
CONSTRAINT='/path/to/constraints.tcl'
>&2 echo "Using the default clock target of the platform."
PLATFORM="/path/to/vpk180_pfm_vitis.xpfm"
VERSAL="/path/to/xilinx-versal-common-v2023.2"
TARGET_FREQUENCY=300000000
# Quote the expansion so an empty/unset PLATFORM cannot misparse the test,
# and exit non-zero so callers can detect the configuration failure.
if [ -z "$PLATFORM" ]; then echo Please edit this file and set a valid PLATFORM= on line "${LINENO}"; exit 1; fi

OUTPUT_DIR="$(pwd)/vitis_run_${TARGET}_ln"

# Build knobs shared with the other flow scripts (not all used here).
MAX_SYNTH_JOBS=16
STRATEGY="Default"
PLACEMENT_STRATEGY="Default"

# Emit emconfig.json next to the outputs (required for hw_emu runs).
emconfigutil --platform ${PLATFORM} --od "${OUTPUT_DIR}/"

v++ ${DEBUG}\
  --platform ${PLATFORM} \
  --target ${TARGET} \
  --package \
  "${OUTPUT_DIR}/${TOP}_vpk180.xsa" \
  --temp_dir "${OUTPUT_DIR}/${TOP}_vpk180.temp/package.build" \
  --save-temps \
  --package.out_dir "${OUTPUT_DIR}/package" \
  --package.boot_mode sd \
  --package.rootfs "${VERSAL}/rootfs.ext4" \
  --package.kernel_image "${VERSAL}/Image" \
  --package.sd_file "${OUTPUT_DIR}/emconfig.json" \
  --package.sd_file "./host-opencl" \
  --package.sd_file "./run_app.sh" \
  --package.sd_file "./xrt.ini" \
  -o "${OUTPUT_DIR}/${TOP}_vpk180.xclbin"
gpt-2-medium/parse_floorplan.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
from enum import Enum, auto
from typing import Any
from argparse import ArgumentParser

# CLI: the only input is the RapidStream floorplan/IR JSON file to parse.
parser = ArgumentParser()
parser.add_argument("-f", "--file", dest="filename", type=str,
                    help="input floorplan json file", metavar="FILE")
9
+
10
class IREnum(Enum):
    """Enums to parse Rapidstream NOC IR."""

    # Pipeline module name and its region/geometry parameter names.
    PIPELINE = "__rs_hs_pipeline"
    REGION = "REGION"
    BODY = "BODY"
    HEAD_REGION = "__HEAD_REGION"
    TAIL_REGION = "__TAIL_REGION"
    DATA_WIDTH = "DATA_WIDTH"
    DEPTH = "DEPTH"
    BODY_LEVEL = "BODY_LEVEL"
    # FIFO-style handshake interface port-name fragments (read side / write side).
    IF_DOUT = "if_dout"
    IF_EMPTY_N = "if_empty_n"
    IF_READ = "if_read"
    IF_DIN = "if_din"
    IF_FULL_N = "if_full_n"
    IF_WRITE = "if_write"
    # NoC master/slave unit instance-name prefixes.
    NMU = "nmu_"
    NSU = "nsu_"
    # Clock-converter instance-name suffixes.
    CC_MASTER = "_cc_master"
    CC_RET = "_cc_ret"
    # Routing / floorplan attribute keys in the IR.
    RS_ROUTE = "RS_ROUTE"
    FLOORPLAN_REGION = "floorplan_region"
    PRAGMAS = "pragmas"
    LIT = "lit"
35
+
36
# Maps RapidStream pipeline module names to the short tag used when naming
# the per-register pipeline instances (RS_<tag>_PP_HEAD/TAIL/BODY_i) below.
PIPELINE_MAPPING = {
    "__rs_ap_ctrl_start_ready_pipeline": "AP",
    "__rs_ff_pipeline": "FF",
    "__rs_hs_pipeline": "HS",
}
41
+
42
def parse_top_mod(ir: dict[str, Any]) -> Any:
    """Locate the top module's definition in the Rapidstream IR.

    Returns the matching module-definition dictionary; raises
    AssertionError when no definition carries the recorded top name.

    Example:
    >>> design = {
    ...     "modules": {
    ...         "top_name": "FINDME",
    ...         "module_definitions": [{"name": "FINDME"}],
    ...     }
    ... }
    >>> parse_top_mod(design)
    {'name': 'FINDME'}
    """
    modules = ir["modules"]
    wanted = modules["top_name"]
    hit = next(
        (mod for mod in modules["module_definitions"] if mod["name"] == wanted),
        None,
    )
    if hit is None:
        raise AssertionError()
    return hit
62
+
63
def parse_mod(ir: dict[str, Any], name: str) -> Any:
    """Fetch a module definition by name from the Rapidstream IR.

    Returns the matching definition dictionary, or an empty dict when
    the name is unknown.
    """
    definitions = ir["modules"]["module_definitions"]
    return next((mod for mod in definitions if mod["name"] == name), {})
72
+
73
def find_repr(source: list[dict[str, Any]], key: str) -> str:
    """Return the `repr` of the first expr entry attached to `key`.

    Falls back to "" (after printing a warning) when `key` yields no
    expr entries.
    """
    entries = find_expr(source, key)
    if entries:
        return str(entries[0]["repr"])
    print(f"WARNING: repr for key {key} not found!")
    return ""
82
+
83
+ def find_expr(
84
+ source: list[dict[str, Any | list[dict[str, str]]]], key: str
85
+ ) -> list[dict[str, str]]:
86
+ """Finds the expr value of a key in the Rapidstream list IR.
87
+
88
+ Returns a string.
89
+ """
90
+ for c in source:
91
+ if c["name"] == key:
92
+ return c["expr"]
93
+ print(f"WARNING: expr for key {key} not found!")
94
+ return []
95
+
96
def parse_floorplan(ir: dict[str, Any], grouped_mod_name: str) -> dict[str, list[str]]:
    """Parses the top module and grouped module's floorplan regions.

    Walks the submodules of the top module (and, when present, of the named
    grouped module) and records each instance's slot. Pipeline modules have
    no single region: their HEAD/TAIL/BODY registers are resolved
    individually from the module parameters.

    Return a dictionary where keys are slots and values are submodules.
    """
    combined_mods = {
        # top: instances at the top level keep their bare name (empty prefix)
        "": parse_top_mod(ir)["submodules"],
    }
    # parse_mod returns {} (falsy) when grouped_mod_name is absent, so the
    # grouped module is only added when it actually exists in the IR.
    if grouped_mod_ir := parse_mod(ir, grouped_mod_name):
        # grouped module: its children are prefixed with "<name>_0/"
        combined_mods[f"{grouped_mod_name}_0/"] = grouped_mod_ir["submodules"]

    # instance full name -> slot (floorplan region) string
    insts = {}
    for parent, mods in combined_mods.items():
        for sub_mod in mods:
            sub_mod_name = parent + sub_mod["name"]
            if sub_mod["floorplan_region"] is not None:
                # regular module: region is given directly
                insts[sub_mod_name] = sub_mod["floorplan_region"]
            elif sub_mod["module"] in PIPELINE_MAPPING:
                # pipeline module, needs to extract slot of each reg
                mapped_name = PIPELINE_MAPPING[sub_mod["module"]]
                body_level = find_repr(sub_mod["parameters"], IREnum.BODY_LEVEL.value)
                # region parameter reprs are quoted strings, hence strip('"')
                insts[f"{sub_mod_name}/RS_{mapped_name}_PP_HEAD"] = find_repr(
                    sub_mod["parameters"], IREnum.HEAD_REGION.value
                ).strip('"')
                insts[f"{sub_mod_name}/RS_{mapped_name}_PP_TAIL"] = find_repr(
                    sub_mod["parameters"], IREnum.TAIL_REGION.value
                ).strip('"')
                # one BODY register per pipeline level
                for i in range(int(body_level)):
                    insts[f"{sub_mod_name}/RS_{mapped_name}_PP_BODY_{i}"] = find_repr(
                        sub_mod["parameters"], f"__BODY_{i}_REGION"
                    ).strip('"')

    # convert {instance: slot} to {slot: [instances]}
    floorplan: dict[str, list[str]] = {}
    for sub_mod_name, slot in insts.items():
        assert slot is not None, f"{sub_mod_name} cannot have null slot!"
        if slot not in floorplan:
            floorplan[slot] = []
        floorplan[slot].append(sub_mod_name)
    return floorplan
139
+
140
+
141
def extract_slot_coord(slot_name: str) -> tuple[int, int]:
    """Extracts the x and y coordinates from the slot name.

    Returns a coordinate tuple as (x, y) in int.

    Example:
    >>> extract_slot_coord("SLOT_X0Y1")
    (0, 1)
    """
    x_text = slot_name.split("X")[1].split("Y")[0]
    y_text = slot_name.split("Y")[1]
    return (int(x_text), int(y_text))
151
+
152
def export_constraint(floorplan: dict[str, list[str]], kernel_name: str) -> list[str]:
    """Generates tcl constraints given the floorplan dictionary.

    First emits one create_pblock/resize_pblock pair per slot, then one
    add_cells_to_pblock section per slot that also records regex patterns
    matching no cells, reported at the end of the script.

    Returns a list of tcl commands.
    """
    tcl = [
        """

# Initialize an empty list to store undefined cells
set undefined_cells {}
"""
    ]

    # cr_map[x][y]: clock-region range covered by slot (x, y) on the device.
    cr_map = [
        ["CLOCKREGION_X0Y1:CLOCKREGION_X4Y4", "CLOCKREGION_X0Y5:CLOCKREGION_X4Y7", "CLOCKREGION_X0Y8:CLOCKREGION_X4Y10", "CLOCKREGION_X0Y11:CLOCKREGION_X4Y13"],
        ["CLOCKREGION_X5Y1:CLOCKREGION_X9Y4", "CLOCKREGION_X5Y5:CLOCKREGION_X9Y7", "CLOCKREGION_X5Y8:CLOCKREGION_X9Y10", "CLOCKREGION_X5Y11:CLOCKREGION_X9Y13"]
    ]

    for slot in floorplan.keys():
        # Slot keys look like "SLOT_XaYb_TO_SLOT_XaYb"; only same-slot
        # ranges are supported here.
        slot1, slot2 = slot.split("_TO_")
        assert slot1 == slot2
        x, y = extract_slot_coord(slot1)
        cr = cr_map[x][y]
        tcl += [
            f"""
# begin defining a slot for logic resources
create_pblock {slot}
resize_pblock {slot} -add {cr}
"""
        ]

    for slot, _ in floorplan.items():
        # NOTE(review): this emits ".../inst/<slot>_0/.*" while the
        # checked-in xo/constraints.tcl uses ".../inst/<slot>.*" — confirm
        # which hierarchy the packaged kernel actually produces.
        tcl += [f"set {slot}_cells {{"]
        tcl += [f"    ext_platform_i/VitisRegion/{kernel_name}/inst/{slot}_0/.*"]
        tcl += [
            f"""}}
add_cells_to_pblock [get_pblocks {slot}] [get_cells -regex ${slot}_cells]

# Iterate through each cell in the list
foreach cell ${slot}_cells {{
    set defined [llength [get_cells $cell]]
    if {{ $defined == 0 }} {{
        lappend undefined_cells $cell
    }}
}}
"""
        ]

    tcl += [
        """
if {[llength $undefined_cells] > 0} {
    puts "Undefined cells:"
    foreach cell $undefined_cells {
        puts $cell
    }
}
"""
    ]

    return tcl
212
+
213
+ if __name__ == "__main__":
214
+ args = parser.parse_args()
215
+
216
+ with open(args.filename, "r", encoding="utf-8") as file:
217
+ ir = json.load(file)
218
+
219
+ pipeline_dict = parse_floorplan(ir, "")
220
+ tcl = export_constraint(pipeline_dict, "opt_kernel")
221
+
222
+ with open("constraints.tcl", "w", encoding="utf-8") as file:
223
+ file.write("\n".join(tcl))
gpt-2-medium/run_app.sh ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Launch the host application on the (emulated) Versal target.
# Packaged onto the SD image by package_sample.sh.

# Host binaries/libraries are staged under /mnt and /tmp on the target.
export LD_LIBRARY_PATH=/mnt:/tmp:$LD_LIBRARY_PATH
# Run against the hardware-emulation model, not real silicon.
export XCL_EMULATION_MODE=hw_emu
export XILINX_XRT=/usr
export XILINX_VITIS=/mnt

./host-opencl opt_kernel_vpk180.xclbin
gpt-2-medium/run_tapa.sh ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Compile kernel.cpp with TAPA for the Alveo U280 HBM platform, producing a
# packed XO plus a floorplan tcl. Flags of note:
#   --enable-hbm-binding-adjustment : let tapac re-bind HBM channel mapping
#   --enable-synth-util             : use post-synthesis utilization numbers
#   --run-floorplan-dse             : sweep the area/SLR-width limits below
# (comments cannot be interleaved inside the backslash-continued command)
tapac \
  -o opt.hw.xo \
  --platform xilinx_u280_xdma_201920_3 \
  --top opt_kernel \
  --work-dir opt-stage3.tapa \
  --connectivity hbm_config.ini \
  --enable-hbm-binding-adjustment \
  --enable-synth-util \
  --run-floorplan-dse \
  --min-area-limit 0.55 \
  --min-slr-width-limit 5000 \
  --max-slr-width-limit 19000 \
  --max-parallel-synth-jobs 16 \
  --floorplan-output opt-floorplan.tcl \
  kernel.cpp
gpt-2-medium/run_tapa_rs.sh ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Two-phase TAPA flow for the Versal part (xcvp1802), feeding RapidStream:
# phase 1 runs tapacc/HLS/floorplanning and generates the task + top RTL;
# phase 2 re-invokes tapac on the same work dir to pack the final XO.
ml load xilinx/vivado/2024.1
tapac \
  --work-dir opt-stage4-dot-prod.tapa \
  --top opt_kernel \
  --part-num xcvp1802-lsvc4072-2MP-e-S \
  --clock-period 3.33 \
  -o "opt-stage4-dot-prod.tapa/opt.hw.xo" \
  --connectivity link_config_versal.ini \
  --run-tapacc \
  --run-hls \
  --generate-task-rtl \
  --run-floorplanning \
  --generate-top-rtl \
  kernel-versal.cpp

# NOTE(review): the module is already loaded above; this second load is
# redundant but harmless.
ml load xilinx/vivado/2024.1
tapac \
  --work-dir opt-stage4-dot-prod.tapa \
  --top opt_kernel \
  --part-num xcvp1802-lsvc4072-2MP-e-S \
  --clock-period 3.33 \
  -o "opt-stage4-dot-prod.tapa/opt.hw.xo" \
  --connectivity link_config_versal.ini \
  --pack-xo \
  kernel-versal.cpp
gpt-2-medium/xo/constraints.tcl ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1

# Initialize an empty list to store undefined cells
set undefined_cells {}

# Floorplan data: slot pblock name followed by its clock-region range, kept
# in the original emission order.
set slot_to_region {
    SLOT_X0Y0_TO_SLOT_X0Y0 CLOCKREGION_X0Y1:CLOCKREGION_X4Y4
    SLOT_X0Y2_TO_SLOT_X0Y2 CLOCKREGION_X0Y8:CLOCKREGION_X4Y10
    SLOT_X1Y2_TO_SLOT_X1Y2 CLOCKREGION_X5Y8:CLOCKREGION_X9Y10
    SLOT_X0Y3_TO_SLOT_X0Y3 CLOCKREGION_X0Y11:CLOCKREGION_X4Y13
    SLOT_X1Y3_TO_SLOT_X1Y3 CLOCKREGION_X5Y11:CLOCKREGION_X9Y13
    SLOT_X1Y0_TO_SLOT_X1Y0 CLOCKREGION_X5Y1:CLOCKREGION_X9Y4
    SLOT_X1Y1_TO_SLOT_X1Y1 CLOCKREGION_X5Y5:CLOCKREGION_X9Y7
    SLOT_X0Y1_TO_SLOT_X0Y1 CLOCKREGION_X0Y5:CLOCKREGION_X4Y7
}

# begin defining a slot (pblock) for logic resources, one per slot
foreach {slot region} $slot_to_region {
    create_pblock $slot
    resize_pblock $slot -add $region
}

# Assign each slot's kernel cells to its pblock. Per-slot pattern lists are
# kept in ${slot}_cells variables (as in the expanded form of this script);
# patterns matching no cells are collected for the report below.
foreach {slot region} $slot_to_region {
    set ${slot}_cells [list "ext_platform_i/VitisRegion/opt_kernel/inst/${slot}.*"]
    add_cells_to_pblock [get_pblocks $slot] [get_cells -regex [set ${slot}_cells]]

    # Iterate through each cell pattern in the list
    foreach cell [set ${slot}_cells] {
        set defined [llength [get_cells $cell]]
        if { $defined == 0 } {
            lappend undefined_cells $cell
        }
    }
}

if {[llength $undefined_cells] > 0} {
    puts "Undefined cells:"
    foreach cell $undefined_cells {
        puts $cell
    }
}
gpt-2-medium/xo/opt_kernel.xo ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50ccf71a9ffd437e6e800624723a66b21391130e200641b2c8c7af0875ef73ce
3
+ size 2049244
gpt-2-medium/xrt.ini ADDED
@@ -0,0 +1,2 @@
 
 
 
1
# XRT runtime configuration shipped to the target by package_sample.sh.
[Emulation]
# batch: run hardware emulation non-interactively (no waveform GUI).
debug_mode=batch