Upload Bitstreams
Browse files- .gitattributes +10 -0
- gpt-2-medium/Makefile +30 -0
- gpt-2-medium/README.md +26 -0
- gpt-2-medium/bitstreams/opt_kernel_latest.xclbin +3 -0
- gpt-2-medium/bitstreams/opt_kernel_latest.xclbin.info +497 -0
- gpt-2-medium/bitstreams/opt_kernel_stage_4.xclbin +3 -0
- gpt-2-medium/bitstreams/opt_kernel_stage_4.xclbin.info +490 -0
- gpt-2-medium/bitstreams/opt_kernel_stage_4_27b.xclbin +3 -0
- gpt-2-medium/bitstreams/opt_kernel_stage_4_27b.xclbin.info +502 -0
- gpt-2-medium/bitstreams/opt_kernel_vpk180.xsa +3 -0
- gpt-2-medium/bitstreams/opt_kernel_vpk180_fixed.xsa +3 -0
- gpt-2-medium/bitstreams/opt_kernel_vpk180_full.xsa +3 -0
- gpt-2-medium/bitstreams/opt_kernel_vpk180_mask.xsa +3 -0
- gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_full.info +485 -0
- gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_full.xclbin +3 -0
- gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_xdma_201920_3.xclbin +3 -0
- gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_xdma_201920_3.xclbin.info +502 -0
- gpt-2-medium/export_xo.py +52 -0
- gpt-2-medium/generate_bitstream_sample.sh +40 -0
- gpt-2-medium/hbm_config.ini +7 -0
- gpt-2-medium/host-u280.cpp +172 -0
- gpt-2-medium/host-versal.cpp +194 -0
- gpt-2-medium/host.cpp +194 -0
- gpt-2-medium/host_opencl.cpp +273 -0
- gpt-2-medium/host_opencl.h +71 -0
- gpt-2-medium/kernel-ultrascale.cpp +2091 -0
- gpt-2-medium/kernel-versal.cpp +0 -0
- gpt-2-medium/kernel.cpp +1528 -0
- gpt-2-medium/link_config_versal.ini +7 -0
- gpt-2-medium/opt-versal-rs.py +43 -0
- gpt-2-medium/package_sample.sh +38 -0
- gpt-2-medium/parse_floorplan.py +223 -0
- gpt-2-medium/run_app.sh +8 -0
- gpt-2-medium/run_tapa.sh +15 -0
- gpt-2-medium/run_tapa_rs.sh +28 -0
- gpt-2-medium/xo/constraints.tcl +157 -0
- gpt-2-medium/xo/opt_kernel.xo +3 -0
- gpt-2-medium/xrt.ini +2 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
gpt-2-medium/bitstreams/opt_kernel_latest.xclbin filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
gpt-2-medium/bitstreams/opt_kernel_stage_4_27b.xclbin filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
gpt-2-medium/bitstreams/opt_kernel_stage_4.xclbin filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
gpt-2-medium/bitstreams/opt_kernel_vpk180_fixed.xsa filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
gpt-2-medium/bitstreams/opt_kernel_vpk180_full.xsa filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
gpt-2-medium/bitstreams/opt_kernel_vpk180_mask.xsa filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
gpt-2-medium/bitstreams/opt_kernel_vpk180.xsa filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_full.xclbin filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_xdma_201920_3.xclbin filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
gpt-2-medium/xo/opt_kernel.xo filter=lfs diff=lfs merge=lfs -text
|
gpt-2-medium/Makefile
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# C++ compiler invoked by the opt350, opt350-ultrascale and opt350-versal rules.
GCC=g++
|
| 2 |
+
# aarch64 cross compiler located under $(XILINX_VITIS); invoked by the
# host-opencl and host_opencl.o rules.
ARMGCC=$(XILINX_VITIS)/gnu/aarch64/lin/aarch64-linux/bin/aarch64-linux-gnu-g++
|
| 3 |
+
# Sysroot handed to $(ARMGCC) via --sysroot and used for its -I/-L paths.
# NOTE(review): this is an absolute path inside one developer's home directory;
# override it on the command line (make SYSROOT=/path/to/sysroot) elsewhere.
SYSROOT=/home/oswaldhe/vpk180_custom_platform/vpk180_custom_platform.vitis/xilinx-versal-common-v2023.2/sysroots/cortexa72-cortexa53-xilinx-linux
|
| 4 |
+
# TAPA_ROOT=$(shell spack location -i tapa@2024-05-18)
|
| 5 |
+
# FRT_ROOT=$(shell spack location -i fpga-runtime)
|
| 6 |
+
# GLOG_ROOT=$(shell spack location -i glog/pqucikz)
|
| 7 |
+
# GFLAGS_ROOT=$(shell spack location -i gflags/y2uaz43)
|
| 8 |
+
# Include and link flags derived from TAPA_ROOT, FRT_ROOT, GLOG_ROOT and
# GFLAGS_ROOT (plus XILINX_HLS for headers).
# NOTE(review): in this Makefile the *_ROOT variables are assigned only in
# commented-out lines, so unless they come from the environment or the make
# command line they expand to nothing (giving -I/include, -L/lib). None of the
# recipes in this Makefile references INCLUDE_FLAGS or LDFLAGS; each recipe
# spells out its own flags.
INCLUDE_FLAGS=-I$(TAPA_ROOT)/include -I$(FRT_ROOT)/include -I$(GLOG_ROOT)/include -I$(GFLAGS_ROOT)/include -I$(XILINX_HLS)/include
|
| 9 |
+
LDFLAGS=-L$(TAPA_ROOT)/lib -L$(FRT_ROOT)/lib -L$(GLOG_ROOT)/lib -L$(GFLAGS_ROOT)/lib -ltapa -lfrt -lglog -lgflags -lm
|
| 10 |
+
# RPATH_FLAGS=-Wl,-rpath,$(TAPA_ROOT)/lib -Wl,-rpath,$(FRT_ROOT)/lib -Wl,-rpath,$(GLOG_ROOT)/lib -Wl,-rpath,$(GFLAGS_ROOT)/lib
|
| 11 |
+
#OPT=-I$(shell spack location -i tapa@2023-01-08)/include -I$(shell spack location -i fpga-runtime)/include -I$(shell spack location -i glog/pqucikz)/include -I${shell spack location -i gflags/y2uaz43}/include -ltapa -lfrt -lglog -lgflags -lOpenCL -lm -I${XILINX_HLS}/include
|
| 12 |
+
#RPATH_FLAGS=-Wl,-rpath,$(shell spack location -i tapa@2023-01-08)/lib -Wl,-rpath,$(shell spack location -i fpga-runtime)/lib -Wl,-rpath,$(shell spack location -i glog/pqucikz)/lib -Wl,-rpath,$(shell spack location -i gflags/y2uaz43)/lib
|
| 13 |
+
|
| 14 |
+
# Build the executable `opt350` from kernel.cpp and host.cpp with $(GCC) at -O2.
# Links tapa, frt, gflags, glog, libm and OpenCL, searching
# /lib/x86_64-linux-gnu and /usr/local/lib, with headers from
# $(XILINX_HLS)/include.
opt350: kernel.cpp host.cpp
|
| 15 |
+
$(GCC) -o $@ -O2 $^ -L/lib/x86_64-linux-gnu -L/usr/local/lib -ltapa -lfrt -lgflags -lglog -lm -lOpenCL -I$(XILINX_HLS)/include
|
| 16 |
+
|
| 17 |
+
# Build the executable `opt350-ultrascale` from kernel-ultrascale.cpp and
# host-u280.cpp with $(GCC) at -O2, using the same library and include flags
# as the opt350 rule.
opt350-ultrascale: kernel-ultrascale.cpp host-u280.cpp
|
| 18 |
+
$(GCC) -o $@ -O2 $^ -L/lib/x86_64-linux-gnu -L/usr/local/lib -ltapa -lfrt -lgflags -lglog -lm -lOpenCL -I$(XILINX_HLS)/include
|
| 19 |
+
|
| 20 |
+
# Link the executable `host-opencl` from host_opencl.o with the cross compiler
# $(ARMGCC), against xrt_coreutil, pthread, rt, stdc++, gmp and OpenCL found in
# $(SYSROOT)/usr/lib, with $(SYSROOT) as the sysroot.
host-opencl: host_opencl.o
|
| 21 |
+
$(ARMGCC) -o $@ $^ -L$(SYSROOT)/usr/lib/ -lxrt_coreutil -lpthread -lrt -lstdc++ -lgmp -lOpenCL --sysroot=$(SYSROOT)
|
| 22 |
+
|
| 23 |
+
# Cross-compile host_opencl.cpp to host_opencl.o with $(ARMGCC) as C++17,
# defining __USE_XOPEN2K8 and adding the XRT, Vivado, sysroot and HLS include
# directories.
# NOTE(review): host_opencl.h is not listed as a prerequisite; if
# host_opencl.cpp includes it, edits to the header will not trigger a
# rebuild — confirm.
host_opencl.o: host_opencl.cpp
|
| 24 |
+
$(ARMGCC) -c -D__USE_XOPEN2K8 -I$(SYSROOT)/usr/include/xrt -I$(XILINX_VIVADO)/include -I$(SYSROOT)/usr/include -I$(XILINX_HLS)/include -fmessage-length=0 -std=c++17 --sysroot=$(SYSROOT) -o $@ $^
|
| 25 |
+
|
| 26 |
+
# Build the executable `opt350-versal` from kernel-versal.cpp and
# host-versal.cpp with $(GCC) at -O2. Same flags as the opt350 rule, except
# that -lglog precedes -lgflags here.
# NOTE(review): kernel-versal.cpp is listed with +0 -0 in this commit, i.e. it
# appears to be empty — confirm this target actually links.
opt350-versal: kernel-versal.cpp host-versal.cpp
|
| 27 |
+
$(GCC) -o $@ -O2 $^ -L/lib/x86_64-linux-gnu -L/usr/local/lib -ltapa -lfrt -lglog -lgflags -lm -lOpenCL -I$(XILINX_HLS)/include
|
| 28 |
+
|
| 29 |
+
# Remove every artifact the rules in this Makefile produce: the three opt350*
# executables, the cross-compiled host-opencl binary and its object file.
# The previous recipe named `opt-versal`, which no rule builds, and left
# opt350-versal, host-opencl and host_opencl.o behind. `rm -f` keeps `make clean`
# from failing when some of the files were never built.
# Declared phony so a stray file named `clean` cannot mask the target.
.PHONY: clean
clean:
|
| 30 |
+
rm -f opt350 opt350-versal opt350-ultrascale host-opencl host_opencl.o
|
gpt-2-medium/README.md
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Place & Route Instructions
|
| 2 |
+
|
| 3 |
+
### Generate Vitis Platform
|
| 4 |
+
|
| 5 |
+
Follow this [tutorial](https://docs.amd.com/r/2023.2-English/Vitis-Tutorials-Vitis-Platform-Creation/Versal-Platform-Creation-Quick-Start) to generate the Vitis Platform for VPK180, with the following changes:
|
| 6 |
+
|
| 7 |
+
1. Step 1-3: Select VPK180 as the device. Generate 3 clocks: 100MHz, 200MHz, 300MHz.
|
| 8 |
+
2. Step 2-2: The git branch should be `xlnx_rel_v2023.2`. `system-user.dtsi` is available in the [Vitis Tutorials GitHub repo](https://github.com/Xilinx/Vitis-Tutorials/blob/2023.2/Vitis_Platform_Creation/Design_Tutorials/03_Edge_VCK190/ref_files/step2_pfm/system-user.dtsi). Change the name to Xilinx custom-vpk180. The board name is `versal-vpk180-reva`.
|
| 9 |
+
|
| 10 |
+
### Launch V++ Script for P&R
|
| 11 |
+
|
| 12 |
+
After exporting the xo container, replace the platform path, xo path, and constraint path in `generate_bitstream_sample.sh` and launch the script to start P&R.
|
| 13 |
+
|
| 14 |
+
### Hardware Emulation Using QEMU
|
| 15 |
+
|
| 16 |
+
After exporting the xo container, replace the platform path, xo path, and constraint path in `generate_bitstream_sample.sh`, change the target to `hw_emu`, and turn on debug mode (`-g`). After generating the xsa file for hardware emulation, apply the same modifications to `package_sample.sh`, add the files you want to include in the SD card image (including the host binary, launch scripts, and the configuration file `xrt.ini`), and run it. You will then find a script `/package/launch_hw_emu.sh` that starts QEMU directly.
|
| 17 |
+
|
| 18 |
+
## Latency References vs. SoTA (ms)
|
| 19 |
+
|
| 20 |
+
| Seq Length | Allo | DFX | NVIDIA T4 | NVIDIA A100 | AMD MI210 |
|
| 21 |
+
| ---- | ---- | ---- | ---- | ---- | ---- |
|
| 22 |
+
| 64 | 205.46 | 349.1 | 47.26 | 39.8 | 7.776 |
|
| 23 |
+
| 128 | 370.56 | 692.8 | 56.4 | 39.51 | 8.541 |
|
| 24 |
+
| 256 | 740.76 | 1412.5 | 81.0 | 39.82 | 10.12 |
|
| 25 |
+
| 512 | 1333.79 | 2825.1 | 162.91 | 49.06 | 15.52 |
|
| 26 |
+
| 1024 | 3777.4 | 6079 | 360.9 | 49.17 | 33.08 |
|
gpt-2-medium/bitstreams/opt_kernel_latest.xclbin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:090f0f57d4d3450a0a44c8bc3c50c3271fe5af186e2c7d165d62ec70ac48dbe7
|
| 3 |
+
size 76134932
|
gpt-2-medium/bitstreams/opt_kernel_latest.xclbin.info
ADDED
|
@@ -0,0 +1,497 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
==============================================================================
|
| 3 |
+
XRT Build Version: 2.14.384 (2022.2)
|
| 4 |
+
Build Date: 2022-12-09 00:55:08
|
| 5 |
+
Hash ID: 090bb050d570d2b668477c3bd0f979dc3a34b9db
|
| 6 |
+
==============================================================================
|
| 7 |
+
xclbin Information
|
| 8 |
+
------------------
|
| 9 |
+
Generated by: v++ (2021.2) on 2021-10-14-04:41:01
|
| 10 |
+
Version: 2.14.384
|
| 11 |
+
Kernels: opt_kernel
|
| 12 |
+
Signature:
|
| 13 |
+
Content: Bitstream
|
| 14 |
+
UUID (xclbin): 41b0c8a4-f618-a8f7-0b11-d3c822641412
|
| 15 |
+
Sections: DEBUG_IP_LAYOUT, BITSTREAM, MEM_TOPOLOGY, IP_LAYOUT,
|
| 16 |
+
CONNECTIVITY, CLOCK_FREQ_TOPOLOGY, BUILD_METADATA,
|
| 17 |
+
EMBEDDED_METADATA, SYSTEM_METADATA,
|
| 18 |
+
GROUP_CONNECTIVITY, GROUP_TOPOLOGY
|
| 19 |
+
==============================================================================
|
| 20 |
+
Hardware Platform (Shell) Information
|
| 21 |
+
-------------------------------------
|
| 22 |
+
Vendor: xilinx
|
| 23 |
+
Board: u280
|
| 24 |
+
Name: xdma
|
| 25 |
+
Version: 201920.3
|
| 26 |
+
Generated Version: Vivado 2019.2 (SW Build: 2742762)
|
| 27 |
+
Created:
|
| 28 |
+
Tue Jan 21 23:21:22 2020 FPGA Device: xcu280
|
| 29 |
+
Board Vendor: xilinx.com
|
| 30 |
+
Board Name: xilinx.com:au280:1.0
|
| 31 |
+
Board Part: xilinx.com:au280:part0:1.0
|
| 32 |
+
Platform VBNV: xilinx_u280_xdma_201920_3
|
| 33 |
+
Static UUID: f2b82d53-372f-45a4-bbe9-3d1c980216da
|
| 34 |
+
Feature ROM TimeStamp: 1579649056
|
| 35 |
+
|
| 36 |
+
Scalable Clocks
|
| 37 |
+
---------------
|
| 38 |
+
Name: clk_out1_pfm_top_clkwiz_hbm_aclk_0
|
| 39 |
+
Index: 0
|
| 40 |
+
Type: SYSTEM
|
| 41 |
+
Frequency: 450 MHz
|
| 42 |
+
|
| 43 |
+
Name: DATA_CLK
|
| 44 |
+
Index: 1
|
| 45 |
+
Type: DATA
|
| 46 |
+
Frequency: 224 MHz
|
| 47 |
+
|
| 48 |
+
Name: KERNEL_CLK
|
| 49 |
+
Index: 2
|
| 50 |
+
Type: KERNEL
|
| 51 |
+
Frequency: 500 MHz
|
| 52 |
+
|
| 53 |
+
System Clocks
|
| 54 |
+
------
|
| 55 |
+
Name: _bd_top_clkwiz_kernel2_clk_out1
|
| 56 |
+
Type: SCALABLE
|
| 57 |
+
Default Freq: 500 MHz
|
| 58 |
+
Requested Freq: 500 MHz
|
| 59 |
+
Achieved Freq: 500 MHz
|
| 60 |
+
|
| 61 |
+
Name: _bd_top_clkwiz_kernel_clk_out1
|
| 62 |
+
Type: SCALABLE
|
| 63 |
+
Default Freq: 300 MHz
|
| 64 |
+
Requested Freq: 300 MHz
|
| 65 |
+
Achieved Freq: 224.4 MHz
|
| 66 |
+
|
| 67 |
+
Memory Configuration
|
| 68 |
+
--------------------
|
| 69 |
+
Name: HBM[0]
|
| 70 |
+
Index: 0
|
| 71 |
+
Type: MEM_DDR4
|
| 72 |
+
Base Address: 0x0
|
| 73 |
+
Address Size: 0x10000000
|
| 74 |
+
Bank Used: Yes
|
| 75 |
+
|
| 76 |
+
Name: HBM[1]
|
| 77 |
+
Index: 1
|
| 78 |
+
Type: MEM_DDR4
|
| 79 |
+
Base Address: 0x10000000
|
| 80 |
+
Address Size: 0x10000000
|
| 81 |
+
Bank Used: Yes
|
| 82 |
+
|
| 83 |
+
Name: HBM[2]
|
| 84 |
+
Index: 2
|
| 85 |
+
Type: MEM_DRAM
|
| 86 |
+
Base Address: 0x20000000
|
| 87 |
+
Address Size: 0x10000000
|
| 88 |
+
Bank Used: Yes
|
| 89 |
+
|
| 90 |
+
Name: HBM[3]
|
| 91 |
+
Index: 3
|
| 92 |
+
Type: MEM_DRAM
|
| 93 |
+
Base Address: 0x30000000
|
| 94 |
+
Address Size: 0x10000000
|
| 95 |
+
Bank Used: No
|
| 96 |
+
|
| 97 |
+
Name: HBM[4]
|
| 98 |
+
Index: 4
|
| 99 |
+
Type: MEM_DRAM
|
| 100 |
+
Base Address: 0x40000000
|
| 101 |
+
Address Size: 0x10000000
|
| 102 |
+
Bank Used: No
|
| 103 |
+
|
| 104 |
+
Name: HBM[5]
|
| 105 |
+
Index: 5
|
| 106 |
+
Type: MEM_DRAM
|
| 107 |
+
Base Address: 0x50000000
|
| 108 |
+
Address Size: 0x10000000
|
| 109 |
+
Bank Used: No
|
| 110 |
+
|
| 111 |
+
Name: HBM[6]
|
| 112 |
+
Index: 6
|
| 113 |
+
Type: MEM_DRAM
|
| 114 |
+
Base Address: 0x60000000
|
| 115 |
+
Address Size: 0x10000000
|
| 116 |
+
Bank Used: No
|
| 117 |
+
|
| 118 |
+
Name: HBM[7]
|
| 119 |
+
Index: 7
|
| 120 |
+
Type: MEM_DRAM
|
| 121 |
+
Base Address: 0x70000000
|
| 122 |
+
Address Size: 0x10000000
|
| 123 |
+
Bank Used: No
|
| 124 |
+
|
| 125 |
+
Name: HBM[8]
|
| 126 |
+
Index: 8
|
| 127 |
+
Type: MEM_DRAM
|
| 128 |
+
Base Address: 0x80000000
|
| 129 |
+
Address Size: 0x10000000
|
| 130 |
+
Bank Used: No
|
| 131 |
+
|
| 132 |
+
Name: HBM[9]
|
| 133 |
+
Index: 9
|
| 134 |
+
Type: MEM_DRAM
|
| 135 |
+
Base Address: 0x90000000
|
| 136 |
+
Address Size: 0x10000000
|
| 137 |
+
Bank Used: No
|
| 138 |
+
|
| 139 |
+
Name: HBM[10]
|
| 140 |
+
Index: 10
|
| 141 |
+
Type: MEM_DRAM
|
| 142 |
+
Base Address: 0xa0000000
|
| 143 |
+
Address Size: 0x10000000
|
| 144 |
+
Bank Used: No
|
| 145 |
+
|
| 146 |
+
Name: HBM[11]
|
| 147 |
+
Index: 11
|
| 148 |
+
Type: MEM_DRAM
|
| 149 |
+
Base Address: 0xb0000000
|
| 150 |
+
Address Size: 0x10000000
|
| 151 |
+
Bank Used: No
|
| 152 |
+
|
| 153 |
+
Name: HBM[12]
|
| 154 |
+
Index: 12
|
| 155 |
+
Type: MEM_DRAM
|
| 156 |
+
Base Address: 0xc0000000
|
| 157 |
+
Address Size: 0x10000000
|
| 158 |
+
Bank Used: No
|
| 159 |
+
|
| 160 |
+
Name: HBM[13]
|
| 161 |
+
Index: 13
|
| 162 |
+
Type: MEM_DRAM
|
| 163 |
+
Base Address: 0xd0000000
|
| 164 |
+
Address Size: 0x10000000
|
| 165 |
+
Bank Used: No
|
| 166 |
+
|
| 167 |
+
Name: HBM[14]
|
| 168 |
+
Index: 14
|
| 169 |
+
Type: MEM_DRAM
|
| 170 |
+
Base Address: 0xe0000000
|
| 171 |
+
Address Size: 0x10000000
|
| 172 |
+
Bank Used: No
|
| 173 |
+
|
| 174 |
+
Name: HBM[15]
|
| 175 |
+
Index: 15
|
| 176 |
+
Type: MEM_DRAM
|
| 177 |
+
Base Address: 0xf0000000
|
| 178 |
+
Address Size: 0x10000000
|
| 179 |
+
Bank Used: No
|
| 180 |
+
|
| 181 |
+
Name: HBM[16]
|
| 182 |
+
Index: 16
|
| 183 |
+
Type: MEM_DRAM
|
| 184 |
+
Base Address: 0x100000000
|
| 185 |
+
Address Size: 0x10000000
|
| 186 |
+
Bank Used: Yes
|
| 187 |
+
|
| 188 |
+
Name: HBM[17]
|
| 189 |
+
Index: 17
|
| 190 |
+
Type: MEM_DRAM
|
| 191 |
+
Base Address: 0x110000000
|
| 192 |
+
Address Size: 0x10000000
|
| 193 |
+
Bank Used: Yes
|
| 194 |
+
|
| 195 |
+
Name: HBM[18]
|
| 196 |
+
Index: 18
|
| 197 |
+
Type: MEM_DRAM
|
| 198 |
+
Base Address: 0x120000000
|
| 199 |
+
Address Size: 0x10000000
|
| 200 |
+
Bank Used: Yes
|
| 201 |
+
|
| 202 |
+
Name: HBM[19]
|
| 203 |
+
Index: 19
|
| 204 |
+
Type: MEM_DRAM
|
| 205 |
+
Base Address: 0x130000000
|
| 206 |
+
Address Size: 0x10000000
|
| 207 |
+
Bank Used: Yes
|
| 208 |
+
|
| 209 |
+
Name: HBM[20]
|
| 210 |
+
Index: 20
|
| 211 |
+
Type: MEM_DRAM
|
| 212 |
+
Base Address: 0x140000000
|
| 213 |
+
Address Size: 0x10000000
|
| 214 |
+
Bank Used: No
|
| 215 |
+
|
| 216 |
+
Name: HBM[21]
|
| 217 |
+
Index: 21
|
| 218 |
+
Type: MEM_DRAM
|
| 219 |
+
Base Address: 0x150000000
|
| 220 |
+
Address Size: 0x10000000
|
| 221 |
+
Bank Used: No
|
| 222 |
+
|
| 223 |
+
Name: HBM[22]
|
| 224 |
+
Index: 22
|
| 225 |
+
Type: MEM_DRAM
|
| 226 |
+
Base Address: 0x160000000
|
| 227 |
+
Address Size: 0x10000000
|
| 228 |
+
Bank Used: No
|
| 229 |
+
|
| 230 |
+
Name: HBM[23]
|
| 231 |
+
Index: 23
|
| 232 |
+
Type: MEM_DRAM
|
| 233 |
+
Base Address: 0x170000000
|
| 234 |
+
Address Size: 0x10000000
|
| 235 |
+
Bank Used: No
|
| 236 |
+
|
| 237 |
+
Name: HBM[24]
|
| 238 |
+
Index: 24
|
| 239 |
+
Type: MEM_DRAM
|
| 240 |
+
Base Address: 0x180000000
|
| 241 |
+
Address Size: 0x10000000
|
| 242 |
+
Bank Used: No
|
| 243 |
+
|
| 244 |
+
Name: HBM[25]
|
| 245 |
+
Index: 25
|
| 246 |
+
Type: MEM_DRAM
|
| 247 |
+
Base Address: 0x190000000
|
| 248 |
+
Address Size: 0x10000000
|
| 249 |
+
Bank Used: No
|
| 250 |
+
|
| 251 |
+
Name: HBM[26]
|
| 252 |
+
Index: 26
|
| 253 |
+
Type: MEM_DRAM
|
| 254 |
+
Base Address: 0x1a0000000
|
| 255 |
+
Address Size: 0x10000000
|
| 256 |
+
Bank Used: No
|
| 257 |
+
|
| 258 |
+
Name: HBM[27]
|
| 259 |
+
Index: 27
|
| 260 |
+
Type: MEM_DRAM
|
| 261 |
+
Base Address: 0x1b0000000
|
| 262 |
+
Address Size: 0x10000000
|
| 263 |
+
Bank Used: No
|
| 264 |
+
|
| 265 |
+
Name: HBM[28]
|
| 266 |
+
Index: 28
|
| 267 |
+
Type: MEM_DRAM
|
| 268 |
+
Base Address: 0x1c0000000
|
| 269 |
+
Address Size: 0x10000000
|
| 270 |
+
Bank Used: No
|
| 271 |
+
|
| 272 |
+
Name: HBM[29]
|
| 273 |
+
Index: 29
|
| 274 |
+
Type: MEM_DRAM
|
| 275 |
+
Base Address: 0x1d0000000
|
| 276 |
+
Address Size: 0x10000000
|
| 277 |
+
Bank Used: No
|
| 278 |
+
|
| 279 |
+
Name: HBM[30]
|
| 280 |
+
Index: 30
|
| 281 |
+
Type: MEM_DRAM
|
| 282 |
+
Base Address: 0x1e0000000
|
| 283 |
+
Address Size: 0x10000000
|
| 284 |
+
Bank Used: No
|
| 285 |
+
|
| 286 |
+
Name: HBM[31]
|
| 287 |
+
Index: 31
|
| 288 |
+
Type: MEM_DRAM
|
| 289 |
+
Base Address: 0x1f0000000
|
| 290 |
+
Address Size: 0x10000000
|
| 291 |
+
Bank Used: No
|
| 292 |
+
|
| 293 |
+
Name: DDR[0]
|
| 294 |
+
Index: 32
|
| 295 |
+
Type: MEM_DRAM
|
| 296 |
+
Base Address: 0x0
|
| 297 |
+
Address Size: 0x0
|
| 298 |
+
Bank Used: No
|
| 299 |
+
|
| 300 |
+
Name: DDR[1]
|
| 301 |
+
Index: 33
|
| 302 |
+
Type: MEM_DRAM
|
| 303 |
+
Base Address: 0x0
|
| 304 |
+
Address Size: 0x0
|
| 305 |
+
Bank Used: No
|
| 306 |
+
|
| 307 |
+
Name: PLRAM[0]
|
| 308 |
+
Index: 34
|
| 309 |
+
Type: MEM_DRAM
|
| 310 |
+
Base Address: 0x0
|
| 311 |
+
Address Size: 0x0
|
| 312 |
+
Bank Used: No
|
| 313 |
+
|
| 314 |
+
Name: PLRAM[1]
|
| 315 |
+
Index: 35
|
| 316 |
+
Type: MEM_DRAM
|
| 317 |
+
Base Address: 0x0
|
| 318 |
+
Address Size: 0x0
|
| 319 |
+
Bank Used: No
|
| 320 |
+
|
| 321 |
+
Name: PLRAM[2]
|
| 322 |
+
Index: 36
|
| 323 |
+
Type: MEM_DRAM
|
| 324 |
+
Base Address: 0x0
|
| 325 |
+
Address Size: 0x0
|
| 326 |
+
Bank Used: No
|
| 327 |
+
|
| 328 |
+
Name: PLRAM[3]
|
| 329 |
+
Index: 37
|
| 330 |
+
Type: MEM_DRAM
|
| 331 |
+
Base Address: 0x0
|
| 332 |
+
Address Size: 0x0
|
| 333 |
+
Bank Used: No
|
| 334 |
+
|
| 335 |
+
Name: PLRAM[4]
|
| 336 |
+
Index: 38
|
| 337 |
+
Type: MEM_DRAM
|
| 338 |
+
Base Address: 0x0
|
| 339 |
+
Address Size: 0x0
|
| 340 |
+
Bank Used: No
|
| 341 |
+
|
| 342 |
+
Name: PLRAM[5]
|
| 343 |
+
Index: 39
|
| 344 |
+
Type: MEM_DRAM
|
| 345 |
+
Base Address: 0x0
|
| 346 |
+
Address Size: 0x0
|
| 347 |
+
Bank Used: No
|
| 348 |
+
==============================================================================
|
| 349 |
+
Kernel: opt_kernel
|
| 350 |
+
|
| 351 |
+
Definition
|
| 352 |
+
----------
|
| 353 |
+
Signature: opt_kernel (const int L, const int L_out, const int seq_len, ap_uint<512>* X_acc0, ap_uint<512>* X_acc1, ap_uint<512>* W_acc0, ap_uint<512>* W_acc1, ap_uint<64>* acc0_out, ap_uint<64>* acc1_out, int* cycle_count)
|
| 354 |
+
|
| 355 |
+
Ports
|
| 356 |
+
-----
|
| 357 |
+
Port: m_axi_X_acc0
|
| 358 |
+
Mode: master
|
| 359 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 360 |
+
Data Width: 512 bits
|
| 361 |
+
Port Type: addressable
|
| 362 |
+
|
| 363 |
+
Port: m_axi_X_acc1
|
| 364 |
+
Mode: master
|
| 365 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 366 |
+
Data Width: 512 bits
|
| 367 |
+
Port Type: addressable
|
| 368 |
+
|
| 369 |
+
Port: m_axi_W_acc0
|
| 370 |
+
Mode: master
|
| 371 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 372 |
+
Data Width: 512 bits
|
| 373 |
+
Port Type: addressable
|
| 374 |
+
|
| 375 |
+
Port: m_axi_W_acc1
|
| 376 |
+
Mode: master
|
| 377 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 378 |
+
Data Width: 512 bits
|
| 379 |
+
Port Type: addressable
|
| 380 |
+
|
| 381 |
+
Port: m_axi_acc0_out
|
| 382 |
+
Mode: master
|
| 383 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 384 |
+
Data Width: 64 bits
|
| 385 |
+
Port Type: addressable
|
| 386 |
+
|
| 387 |
+
Port: m_axi_acc1_out
|
| 388 |
+
Mode: master
|
| 389 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 390 |
+
Data Width: 64 bits
|
| 391 |
+
Port Type: addressable
|
| 392 |
+
|
| 393 |
+
Port: m_axi_cycle_count
|
| 394 |
+
Mode: master
|
| 395 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 396 |
+
Data Width: 32 bits
|
| 397 |
+
Port Type: addressable
|
| 398 |
+
|
| 399 |
+
Port: s_axi_control
|
| 400 |
+
Mode: slave
|
| 401 |
+
Range (bytes): 0x1000
|
| 402 |
+
Data Width: 32 bits
|
| 403 |
+
Port Type: addressable
|
| 404 |
+
|
| 405 |
+
--------------------------
|
| 406 |
+
Instance: opt_kernel
|
| 407 |
+
Base Address: 0x1800000
|
| 408 |
+
|
| 409 |
+
Argument: L
|
| 410 |
+
Register Offset: 0x10
|
| 411 |
+
Port: s_axi_control
|
| 412 |
+
Memory: <not applicable>
|
| 413 |
+
|
| 414 |
+
Argument: L_out
|
| 415 |
+
Register Offset: 0x18
|
| 416 |
+
Port: s_axi_control
|
| 417 |
+
Memory: <not applicable>
|
| 418 |
+
|
| 419 |
+
Argument: seq_len
|
| 420 |
+
Register Offset: 0x20
|
| 421 |
+
Port: s_axi_control
|
| 422 |
+
Memory: <not applicable>
|
| 423 |
+
|
| 424 |
+
Argument: X_acc0
|
| 425 |
+
Register Offset: 0x28
|
| 426 |
+
Port: m_axi_X_acc0
|
| 427 |
+
Memory: HBM[0] (MEM_DDR4)
|
| 428 |
+
|
| 429 |
+
Argument: X_acc1
|
| 430 |
+
Register Offset: 0x34
|
| 431 |
+
Port: m_axi_X_acc1
|
| 432 |
+
Memory: HBM[16] (MEM_DRAM)
|
| 433 |
+
|
| 434 |
+
Argument: W_acc0
|
| 435 |
+
Register Offset: 0x40
|
| 436 |
+
Port: m_axi_W_acc0
|
| 437 |
+
Memory: HBM[1] (MEM_DDR4)
|
| 438 |
+
|
| 439 |
+
Argument: W_acc1
|
| 440 |
+
Register Offset: 0x4c
|
| 441 |
+
Port: m_axi_W_acc1
|
| 442 |
+
Memory: HBM[17] (MEM_DRAM)
|
| 443 |
+
|
| 444 |
+
Argument: acc0_out
|
| 445 |
+
Register Offset: 0x58
|
| 446 |
+
Port: m_axi_acc0_out
|
| 447 |
+
Memory: HBM[2] (MEM_DRAM)
|
| 448 |
+
|
| 449 |
+
Argument: acc1_out
|
| 450 |
+
Register Offset: 0x64
|
| 451 |
+
Port: m_axi_acc1_out
|
| 452 |
+
Memory: HBM[18] (MEM_DRAM)
|
| 453 |
+
|
| 454 |
+
Argument: cycle_count
|
| 455 |
+
Register Offset: 0x70
|
| 456 |
+
Port: m_axi_cycle_count
|
| 457 |
+
Memory: HBM[19] (MEM_DRAM)
|
| 458 |
+
==============================================================================
|
| 459 |
+
Generated By
|
| 460 |
+
------------
|
| 461 |
+
Command: v++
|
| 462 |
+
Version: 2021.2 - 2021-10-14-04:41:01 (SW BUILD: 3363252)
|
| 463 |
+
Command Line: v++ --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-1/run/link_config.ini --connectivity.nk opt_kernel:1:opt_kernel --connectivity.sp opt_kernel.X_acc0:HBM[0] --connectivity.sp opt_kernel.X_acc1:HBM[16] --connectivity.sp opt_kernel.W_acc0:HBM[1] --connectivity.sp opt_kernel.W_acc1:HBM[17] --connectivity.sp opt_kernel.acc0_out:HBM[2] --connectivity.sp opt_kernel.acc1_out:HBM[18] --connectivity.sp opt_kernel.cycle_count:HBM[19] --input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-1/opt.hw.xo --kernel opt_kernel --link --optimize 3 --output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin --platform xilinx_u280_xdma_201920_3 --report_level 2 --save-temps --target hw --temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget -propconst -sweep -shift_register_opt} --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=AltSpreadLogic_high --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-1/opt-floorplan.tcl --vivado.synth.jobs 8
|
| 464 |
+
Options: --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-1/run/link_config.ini
|
| 465 |
+
--connectivity.nk opt_kernel:1:opt_kernel
|
| 466 |
+
--connectivity.sp opt_kernel.X_acc0:HBM[0]
|
| 467 |
+
--connectivity.sp opt_kernel.X_acc1:HBM[16]
|
| 468 |
+
--connectivity.sp opt_kernel.W_acc0:HBM[1]
|
| 469 |
+
--connectivity.sp opt_kernel.W_acc1:HBM[17]
|
| 470 |
+
--connectivity.sp opt_kernel.acc0_out:HBM[2]
|
| 471 |
+
--connectivity.sp opt_kernel.acc1_out:HBM[18]
|
| 472 |
+
--connectivity.sp opt_kernel.cycle_count:HBM[19]
|
| 473 |
+
--input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-1/opt.hw.xo
|
| 474 |
+
--kernel opt_kernel
|
| 475 |
+
--link
|
| 476 |
+
--optimize 3
|
| 477 |
+
--output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin
|
| 478 |
+
--platform xilinx_u280_xdma_201920_3
|
| 479 |
+
--report_level 2
|
| 480 |
+
--save-temps
|
| 481 |
+
--target hw
|
| 482 |
+
--temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp
|
| 483 |
+
--vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1
|
| 484 |
+
--vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget
|
| 485 |
+
-propconst
|
| 486 |
+
-sweep
|
| 487 |
+
-shift_register_opt}
|
| 488 |
+
--vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=AltSpreadLogic_high
|
| 489 |
+
--vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore
|
| 490 |
+
--vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore
|
| 491 |
+
--vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-1/opt-floorplan.tcl
|
| 492 |
+
--vivado.synth.jobs 8
|
| 493 |
+
==============================================================================
|
| 494 |
+
User Added Key Value Pairs
|
| 495 |
+
--------------------------
|
| 496 |
+
<empty>
|
| 497 |
+
==============================================================================
|
gpt-2-medium/bitstreams/opt_kernel_stage_4.xclbin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0413a6d5d20f76bc7b5d5376f088edec7ee574db131b857215b5a2fbd99e6075
|
| 3 |
+
size 76961468
|
gpt-2-medium/bitstreams/opt_kernel_stage_4.xclbin.info
ADDED
|
@@ -0,0 +1,490 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
==============================================================================
|
| 3 |
+
XRT Build Version: 2.14.384 (2022.2)
|
| 4 |
+
Build Date: 2022-12-09 00:55:08
|
| 5 |
+
Hash ID: 090bb050d570d2b668477c3bd0f979dc3a34b9db
|
| 6 |
+
==============================================================================
|
| 7 |
+
xclbin Information
|
| 8 |
+
------------------
|
| 9 |
+
Generated by: v++ (2021.2) on 2021-10-14-04:41:01
|
| 10 |
+
Version: 2.14.384
|
| 11 |
+
Kernels: opt_kernel
|
| 12 |
+
Signature:
|
| 13 |
+
Content: Bitstream
|
| 14 |
+
UUID (xclbin): 4617f7da-9790-9c63-864e-303bcf47c723
|
| 15 |
+
Sections: DEBUG_IP_LAYOUT, BITSTREAM, MEM_TOPOLOGY, IP_LAYOUT,
|
| 16 |
+
CONNECTIVITY, CLOCK_FREQ_TOPOLOGY, BUILD_METADATA,
|
| 17 |
+
EMBEDDED_METADATA, SYSTEM_METADATA,
|
| 18 |
+
GROUP_CONNECTIVITY, GROUP_TOPOLOGY
|
| 19 |
+
==============================================================================
|
| 20 |
+
Hardware Platform (Shell) Information
|
| 21 |
+
-------------------------------------
|
| 22 |
+
Vendor: xilinx
|
| 23 |
+
Board: u280
|
| 24 |
+
Name: xdma
|
| 25 |
+
Version: 201920.3
|
| 26 |
+
Generated Version: Vivado 2019.2 (SW Build: 2742762)
|
| 27 |
+
Created:
|
| 28 |
+
Tue Jan 21 23:21:22 2020 FPGA Device: xcu280
|
| 29 |
+
Board Vendor: xilinx.com
|
| 30 |
+
Board Name: xilinx.com:au280:1.0
|
| 31 |
+
Board Part: xilinx.com:au280:part0:1.0
|
| 32 |
+
Platform VBNV: xilinx_u280_xdma_201920_3
|
| 33 |
+
Static UUID: f2b82d53-372f-45a4-bbe9-3d1c980216da
|
| 34 |
+
Feature ROM TimeStamp: 1579649056
|
| 35 |
+
|
| 36 |
+
Scalable Clocks
|
| 37 |
+
---------------
|
| 38 |
+
Name: clk_out1_pfm_top_clkwiz_hbm_aclk_0
|
| 39 |
+
Index: 0
|
| 40 |
+
Type: SYSTEM
|
| 41 |
+
Frequency: 450 MHz
|
| 42 |
+
|
| 43 |
+
Name: DATA_CLK
|
| 44 |
+
Index: 1
|
| 45 |
+
Type: DATA
|
| 46 |
+
Frequency: 241 MHz
|
| 47 |
+
|
| 48 |
+
Name: KERNEL_CLK
|
| 49 |
+
Index: 2
|
| 50 |
+
Type: KERNEL
|
| 51 |
+
Frequency: 500 MHz
|
| 52 |
+
|
| 53 |
+
System Clocks
|
| 54 |
+
------
|
| 55 |
+
Name: _bd_top_clkwiz_kernel2_clk_out1
|
| 56 |
+
Type: SCALABLE
|
| 57 |
+
Default Freq: 500 MHz
|
| 58 |
+
Requested Freq: 500 MHz
|
| 59 |
+
Achieved Freq: 500 MHz
|
| 60 |
+
|
| 61 |
+
Name: _bd_top_clkwiz_kernel_clk_out1
|
| 62 |
+
Type: SCALABLE
|
| 63 |
+
Default Freq: 300 MHz
|
| 64 |
+
Requested Freq: 300 MHz
|
| 65 |
+
Achieved Freq: 241.4 MHz
|
| 66 |
+
|
| 67 |
+
Memory Configuration
|
| 68 |
+
--------------------
|
| 69 |
+
Name: HBM[0]
|
| 70 |
+
Index: 0
|
| 71 |
+
Type: MEM_DDR4
|
| 72 |
+
Base Address: 0x0
|
| 73 |
+
Address Size: 0x10000000
|
| 74 |
+
Bank Used: Yes
|
| 75 |
+
|
| 76 |
+
Name: HBM[1]
|
| 77 |
+
Index: 1
|
| 78 |
+
Type: MEM_DDR4
|
| 79 |
+
Base Address: 0x10000000
|
| 80 |
+
Address Size: 0x10000000
|
| 81 |
+
Bank Used: Yes
|
| 82 |
+
|
| 83 |
+
Name: HBM[2]
|
| 84 |
+
Index: 2
|
| 85 |
+
Type: MEM_DRAM
|
| 86 |
+
Base Address: 0x20000000
|
| 87 |
+
Address Size: 0x10000000
|
| 88 |
+
Bank Used: Yes
|
| 89 |
+
|
| 90 |
+
Name: HBM[3]
|
| 91 |
+
Index: 3
|
| 92 |
+
Type: MEM_DRAM
|
| 93 |
+
Base Address: 0x30000000
|
| 94 |
+
Address Size: 0x10000000
|
| 95 |
+
Bank Used: Yes
|
| 96 |
+
|
| 97 |
+
Name: HBM[4]
|
| 98 |
+
Index: 4
|
| 99 |
+
Type: MEM_DRAM
|
| 100 |
+
Base Address: 0x40000000
|
| 101 |
+
Address Size: 0x10000000
|
| 102 |
+
Bank Used: No
|
| 103 |
+
|
| 104 |
+
Name: HBM[5]
|
| 105 |
+
Index: 5
|
| 106 |
+
Type: MEM_DRAM
|
| 107 |
+
Base Address: 0x50000000
|
| 108 |
+
Address Size: 0x10000000
|
| 109 |
+
Bank Used: No
|
| 110 |
+
|
| 111 |
+
Name: HBM[6]
|
| 112 |
+
Index: 6
|
| 113 |
+
Type: MEM_DRAM
|
| 114 |
+
Base Address: 0x60000000
|
| 115 |
+
Address Size: 0x10000000
|
| 116 |
+
Bank Used: No
|
| 117 |
+
|
| 118 |
+
Name: HBM[7]
|
| 119 |
+
Index: 7
|
| 120 |
+
Type: MEM_DRAM
|
| 121 |
+
Base Address: 0x70000000
|
| 122 |
+
Address Size: 0x10000000
|
| 123 |
+
Bank Used: No
|
| 124 |
+
|
| 125 |
+
Name: HBM[8]
|
| 126 |
+
Index: 8
|
| 127 |
+
Type: MEM_DRAM
|
| 128 |
+
Base Address: 0x80000000
|
| 129 |
+
Address Size: 0x10000000
|
| 130 |
+
Bank Used: No
|
| 131 |
+
|
| 132 |
+
Name: HBM[9]
|
| 133 |
+
Index: 9
|
| 134 |
+
Type: MEM_DRAM
|
| 135 |
+
Base Address: 0x90000000
|
| 136 |
+
Address Size: 0x10000000
|
| 137 |
+
Bank Used: No
|
| 138 |
+
|
| 139 |
+
Name: HBM[10]
|
| 140 |
+
Index: 10
|
| 141 |
+
Type: MEM_DRAM
|
| 142 |
+
Base Address: 0xa0000000
|
| 143 |
+
Address Size: 0x10000000
|
| 144 |
+
Bank Used: No
|
| 145 |
+
|
| 146 |
+
Name: HBM[11]
|
| 147 |
+
Index: 11
|
| 148 |
+
Type: MEM_DRAM
|
| 149 |
+
Base Address: 0xb0000000
|
| 150 |
+
Address Size: 0x10000000
|
| 151 |
+
Bank Used: No
|
| 152 |
+
|
| 153 |
+
Name: HBM[12]
|
| 154 |
+
Index: 12
|
| 155 |
+
Type: MEM_DRAM
|
| 156 |
+
Base Address: 0xc0000000
|
| 157 |
+
Address Size: 0x10000000
|
| 158 |
+
Bank Used: No
|
| 159 |
+
|
| 160 |
+
Name: HBM[13]
|
| 161 |
+
Index: 13
|
| 162 |
+
Type: MEM_DRAM
|
| 163 |
+
Base Address: 0xd0000000
|
| 164 |
+
Address Size: 0x10000000
|
| 165 |
+
Bank Used: No
|
| 166 |
+
|
| 167 |
+
Name: HBM[14]
|
| 168 |
+
Index: 14
|
| 169 |
+
Type: MEM_DRAM
|
| 170 |
+
Base Address: 0xe0000000
|
| 171 |
+
Address Size: 0x10000000
|
| 172 |
+
Bank Used: No
|
| 173 |
+
|
| 174 |
+
Name: HBM[15]
|
| 175 |
+
Index: 15
|
| 176 |
+
Type: MEM_DRAM
|
| 177 |
+
Base Address: 0xf0000000
|
| 178 |
+
Address Size: 0x10000000
|
| 179 |
+
Bank Used: No
|
| 180 |
+
|
| 181 |
+
Name: HBM[16]
|
| 182 |
+
Index: 16
|
| 183 |
+
Type: MEM_DRAM
|
| 184 |
+
Base Address: 0x100000000
|
| 185 |
+
Address Size: 0x10000000
|
| 186 |
+
Bank Used: Yes
|
| 187 |
+
|
| 188 |
+
Name: HBM[17]
|
| 189 |
+
Index: 17
|
| 190 |
+
Type: MEM_DRAM
|
| 191 |
+
Base Address: 0x110000000
|
| 192 |
+
Address Size: 0x10000000
|
| 193 |
+
Bank Used: Yes
|
| 194 |
+
|
| 195 |
+
Name: HBM[18]
|
| 196 |
+
Index: 18
|
| 197 |
+
Type: MEM_DRAM
|
| 198 |
+
Base Address: 0x120000000
|
| 199 |
+
Address Size: 0x10000000
|
| 200 |
+
Bank Used: No
|
| 201 |
+
|
| 202 |
+
Name: HBM[19]
|
| 203 |
+
Index: 19
|
| 204 |
+
Type: MEM_DRAM
|
| 205 |
+
Base Address: 0x130000000
|
| 206 |
+
Address Size: 0x10000000
|
| 207 |
+
Bank Used: No
|
| 208 |
+
|
| 209 |
+
Name: HBM[20]
|
| 210 |
+
Index: 20
|
| 211 |
+
Type: MEM_DRAM
|
| 212 |
+
Base Address: 0x140000000
|
| 213 |
+
Address Size: 0x10000000
|
| 214 |
+
Bank Used: No
|
| 215 |
+
|
| 216 |
+
Name: HBM[21]
|
| 217 |
+
Index: 21
|
| 218 |
+
Type: MEM_DRAM
|
| 219 |
+
Base Address: 0x150000000
|
| 220 |
+
Address Size: 0x10000000
|
| 221 |
+
Bank Used: No
|
| 222 |
+
|
| 223 |
+
Name: HBM[22]
|
| 224 |
+
Index: 22
|
| 225 |
+
Type: MEM_DRAM
|
| 226 |
+
Base Address: 0x160000000
|
| 227 |
+
Address Size: 0x10000000
|
| 228 |
+
Bank Used: No
|
| 229 |
+
|
| 230 |
+
Name: HBM[23]
|
| 231 |
+
Index: 23
|
| 232 |
+
Type: MEM_DRAM
|
| 233 |
+
Base Address: 0x170000000
|
| 234 |
+
Address Size: 0x10000000
|
| 235 |
+
Bank Used: No
|
| 236 |
+
|
| 237 |
+
Name: HBM[24]
|
| 238 |
+
Index: 24
|
| 239 |
+
Type: MEM_DRAM
|
| 240 |
+
Base Address: 0x180000000
|
| 241 |
+
Address Size: 0x10000000
|
| 242 |
+
Bank Used: No
|
| 243 |
+
|
| 244 |
+
Name: HBM[25]
|
| 245 |
+
Index: 25
|
| 246 |
+
Type: MEM_DRAM
|
| 247 |
+
Base Address: 0x190000000
|
| 248 |
+
Address Size: 0x10000000
|
| 249 |
+
Bank Used: No
|
| 250 |
+
|
| 251 |
+
Name: HBM[26]
|
| 252 |
+
Index: 26
|
| 253 |
+
Type: MEM_DRAM
|
| 254 |
+
Base Address: 0x1a0000000
|
| 255 |
+
Address Size: 0x10000000
|
| 256 |
+
Bank Used: No
|
| 257 |
+
|
| 258 |
+
Name: HBM[27]
|
| 259 |
+
Index: 27
|
| 260 |
+
Type: MEM_DRAM
|
| 261 |
+
Base Address: 0x1b0000000
|
| 262 |
+
Address Size: 0x10000000
|
| 263 |
+
Bank Used: No
|
| 264 |
+
|
| 265 |
+
Name: HBM[28]
|
| 266 |
+
Index: 28
|
| 267 |
+
Type: MEM_DRAM
|
| 268 |
+
Base Address: 0x1c0000000
|
| 269 |
+
Address Size: 0x10000000
|
| 270 |
+
Bank Used: No
|
| 271 |
+
|
| 272 |
+
Name: HBM[29]
|
| 273 |
+
Index: 29
|
| 274 |
+
Type: MEM_DRAM
|
| 275 |
+
Base Address: 0x1d0000000
|
| 276 |
+
Address Size: 0x10000000
|
| 277 |
+
Bank Used: No
|
| 278 |
+
|
| 279 |
+
Name: HBM[30]
|
| 280 |
+
Index: 30
|
| 281 |
+
Type: MEM_DRAM
|
| 282 |
+
Base Address: 0x1e0000000
|
| 283 |
+
Address Size: 0x10000000
|
| 284 |
+
Bank Used: No
|
| 285 |
+
|
| 286 |
+
Name: HBM[31]
|
| 287 |
+
Index: 31
|
| 288 |
+
Type: MEM_DRAM
|
| 289 |
+
Base Address: 0x1f0000000
|
| 290 |
+
Address Size: 0x10000000
|
| 291 |
+
Bank Used: No
|
| 292 |
+
|
| 293 |
+
Name: DDR[0]
|
| 294 |
+
Index: 32
|
| 295 |
+
Type: MEM_DRAM
|
| 296 |
+
Base Address: 0x0
|
| 297 |
+
Address Size: 0x0
|
| 298 |
+
Bank Used: No
|
| 299 |
+
|
| 300 |
+
Name: DDR[1]
|
| 301 |
+
Index: 33
|
| 302 |
+
Type: MEM_DRAM
|
| 303 |
+
Base Address: 0x0
|
| 304 |
+
Address Size: 0x0
|
| 305 |
+
Bank Used: No
|
| 306 |
+
|
| 307 |
+
Name: PLRAM[0]
|
| 308 |
+
Index: 34
|
| 309 |
+
Type: MEM_DRAM
|
| 310 |
+
Base Address: 0x0
|
| 311 |
+
Address Size: 0x0
|
| 312 |
+
Bank Used: No
|
| 313 |
+
|
| 314 |
+
Name: PLRAM[1]
|
| 315 |
+
Index: 35
|
| 316 |
+
Type: MEM_DRAM
|
| 317 |
+
Base Address: 0x0
|
| 318 |
+
Address Size: 0x0
|
| 319 |
+
Bank Used: No
|
| 320 |
+
|
| 321 |
+
Name: PLRAM[2]
|
| 322 |
+
Index: 36
|
| 323 |
+
Type: MEM_DRAM
|
| 324 |
+
Base Address: 0x0
|
| 325 |
+
Address Size: 0x0
|
| 326 |
+
Bank Used: No
|
| 327 |
+
|
| 328 |
+
Name: PLRAM[3]
|
| 329 |
+
Index: 37
|
| 330 |
+
Type: MEM_DRAM
|
| 331 |
+
Base Address: 0x0
|
| 332 |
+
Address Size: 0x0
|
| 333 |
+
Bank Used: No
|
| 334 |
+
|
| 335 |
+
Name: PLRAM[4]
|
| 336 |
+
Index: 38
|
| 337 |
+
Type: MEM_DRAM
|
| 338 |
+
Base Address: 0x0
|
| 339 |
+
Address Size: 0x0
|
| 340 |
+
Bank Used: No
|
| 341 |
+
|
| 342 |
+
Name: PLRAM[5]
|
| 343 |
+
Index: 39
|
| 344 |
+
Type: MEM_DRAM
|
| 345 |
+
Base Address: 0x0
|
| 346 |
+
Address Size: 0x0
|
| 347 |
+
Bank Used: No
|
| 348 |
+
==============================================================================
|
| 349 |
+
Kernel: opt_kernel
|
| 350 |
+
|
| 351 |
+
Definition
|
| 352 |
+
----------
|
| 353 |
+
Signature: opt_kernel (const int L, const int L_out, const int seq_len, const int reload, ap_uint<512>* X_acc0, ap_uint<512>* X_acc1, ap_uint<512>* W_acc0, ap_uint<512>* W_acc1, ap_uint<64>* acc1_out, int* cycle_count)
|
| 354 |
+
|
| 355 |
+
Ports
|
| 356 |
+
-----
|
| 357 |
+
Port: m_axi_X_acc0
|
| 358 |
+
Mode: master
|
| 359 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 360 |
+
Data Width: 512 bits
|
| 361 |
+
Port Type: addressable
|
| 362 |
+
|
| 363 |
+
Port: m_axi_X_acc1
|
| 364 |
+
Mode: master
|
| 365 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 366 |
+
Data Width: 512 bits
|
| 367 |
+
Port Type: addressable
|
| 368 |
+
|
| 369 |
+
Port: m_axi_W_acc0
|
| 370 |
+
Mode: master
|
| 371 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 372 |
+
Data Width: 512 bits
|
| 373 |
+
Port Type: addressable
|
| 374 |
+
|
| 375 |
+
Port: m_axi_W_acc1
|
| 376 |
+
Mode: master
|
| 377 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 378 |
+
Data Width: 512 bits
|
| 379 |
+
Port Type: addressable
|
| 380 |
+
|
| 381 |
+
Port: m_axi_acc1_out
|
| 382 |
+
Mode: master
|
| 383 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 384 |
+
Data Width: 64 bits
|
| 385 |
+
Port Type: addressable
|
| 386 |
+
|
| 387 |
+
Port: m_axi_cycle_count
|
| 388 |
+
Mode: master
|
| 389 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 390 |
+
Data Width: 32 bits
|
| 391 |
+
Port Type: addressable
|
| 392 |
+
|
| 393 |
+
Port: s_axi_control
|
| 394 |
+
Mode: slave
|
| 395 |
+
Range (bytes): 0x1000
|
| 396 |
+
Data Width: 32 bits
|
| 397 |
+
Port Type: addressable
|
| 398 |
+
|
| 399 |
+
--------------------------
|
| 400 |
+
Instance: opt_kernel
|
| 401 |
+
Base Address: 0x1800000
|
| 402 |
+
|
| 403 |
+
Argument: L
|
| 404 |
+
Register Offset: 0x10
|
| 405 |
+
Port: s_axi_control
|
| 406 |
+
Memory: <not applicable>
|
| 407 |
+
|
| 408 |
+
Argument: L_out
|
| 409 |
+
Register Offset: 0x18
|
| 410 |
+
Port: s_axi_control
|
| 411 |
+
Memory: <not applicable>
|
| 412 |
+
|
| 413 |
+
Argument: seq_len
|
| 414 |
+
Register Offset: 0x20
|
| 415 |
+
Port: s_axi_control
|
| 416 |
+
Memory: <not applicable>
|
| 417 |
+
|
| 418 |
+
Argument: reload
|
| 419 |
+
Register Offset: 0x28
|
| 420 |
+
Port: s_axi_control
|
| 421 |
+
Memory: <not applicable>
|
| 422 |
+
|
| 423 |
+
Argument: X_acc0
|
| 424 |
+
Register Offset: 0x30
|
| 425 |
+
Port: m_axi_X_acc0
|
| 426 |
+
Memory: HBM[16] (MEM_DRAM)
|
| 427 |
+
|
| 428 |
+
Argument: X_acc1
|
| 429 |
+
Register Offset: 0x3c
|
| 430 |
+
Port: m_axi_X_acc1
|
| 431 |
+
Memory: HBM[0] (MEM_DDR4)
|
| 432 |
+
|
| 433 |
+
Argument: W_acc0
|
| 434 |
+
Register Offset: 0x48
|
| 435 |
+
Port: m_axi_W_acc0
|
| 436 |
+
Memory: HBM[17] (MEM_DRAM)
|
| 437 |
+
|
| 438 |
+
Argument: W_acc1
|
| 439 |
+
Register Offset: 0x54
|
| 440 |
+
Port: m_axi_W_acc1
|
| 441 |
+
Memory: HBM[1] (MEM_DDR4)
|
| 442 |
+
|
| 443 |
+
Argument: acc1_out
|
| 444 |
+
Register Offset: 0x60
|
| 445 |
+
Port: m_axi_acc1_out
|
| 446 |
+
Memory: HBM[2] (MEM_DRAM)
|
| 447 |
+
|
| 448 |
+
Argument: cycle_count
|
| 449 |
+
Register Offset: 0x6c
|
| 450 |
+
Port: m_axi_cycle_count
|
| 451 |
+
Memory: HBM[3] (MEM_DRAM)
|
| 452 |
+
==============================================================================
|
| 453 |
+
Generated By
|
| 454 |
+
------------
|
| 455 |
+
Command: v++
|
| 456 |
+
Version: 2021.2 - 2021-10-14-04:41:01 (SW BUILD: 3363252)
|
| 457 |
+
Command Line: v++ --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4.tapa/run-1/run/link_config.ini --connectivity.nk opt_kernel:1:opt_kernel --connectivity.sp opt_kernel.X_acc0:HBM[16] --connectivity.sp opt_kernel.X_acc1:HBM[0] --connectivity.sp opt_kernel.W_acc0:HBM[17] --connectivity.sp opt_kernel.W_acc1:HBM[1] --connectivity.sp opt_kernel.acc1_out:HBM[2] --connectivity.sp opt_kernel.cycle_count:HBM[3] --input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4.tapa/run-1/opt.hw.xo --kernel opt_kernel --link --optimize 3 --output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin --platform xilinx_u280_xdma_201920_3 --report_level 2 --save-temps --target hw --temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget -propconst -sweep -shift_register_opt} --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=EarlyBlockPlacement --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4.tapa/run-1/opt-floorplan.tcl --vivado.synth.jobs 8
|
| 458 |
+
Options: --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4.tapa/run-1/run/link_config.ini
|
| 459 |
+
--connectivity.nk opt_kernel:1:opt_kernel
|
| 460 |
+
--connectivity.sp opt_kernel.X_acc0:HBM[16]
|
| 461 |
+
--connectivity.sp opt_kernel.X_acc1:HBM[0]
|
| 462 |
+
--connectivity.sp opt_kernel.W_acc0:HBM[17]
|
| 463 |
+
--connectivity.sp opt_kernel.W_acc1:HBM[1]
|
| 464 |
+
--connectivity.sp opt_kernel.acc1_out:HBM[2]
|
| 465 |
+
--connectivity.sp opt_kernel.cycle_count:HBM[3]
|
| 466 |
+
--input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4.tapa/run-1/opt.hw.xo
|
| 467 |
+
--kernel opt_kernel
|
| 468 |
+
--link
|
| 469 |
+
--optimize 3
|
| 470 |
+
--output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin
|
| 471 |
+
--platform xilinx_u280_xdma_201920_3
|
| 472 |
+
--report_level 2
|
| 473 |
+
--save-temps
|
| 474 |
+
--target hw
|
| 475 |
+
--temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp
|
| 476 |
+
--vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1
|
| 477 |
+
--vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget
|
| 478 |
+
-propconst
|
| 479 |
+
-sweep
|
| 480 |
+
-shift_register_opt}
|
| 481 |
+
--vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=EarlyBlockPlacement
|
| 482 |
+
--vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore
|
| 483 |
+
--vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore
|
| 484 |
+
--vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4.tapa/run-1/opt-floorplan.tcl
|
| 485 |
+
--vivado.synth.jobs 8
|
| 486 |
+
==============================================================================
|
| 487 |
+
User Added Key Value Pairs
|
| 488 |
+
--------------------------
|
| 489 |
+
<empty>
|
| 490 |
+
==============================================================================
|
gpt-2-medium/bitstreams/opt_kernel_stage_4_27b.xclbin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:30d40b37a38089e3181996d9df02dd4371c8423c93304316bf22637e655992c3
|
| 3 |
+
size 76724924
|
gpt-2-medium/bitstreams/opt_kernel_stage_4_27b.xclbin.info
ADDED
|
@@ -0,0 +1,502 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
==============================================================================
|
| 3 |
+
XRT Build Version: 2.14.384 (2022.2)
|
| 4 |
+
Build Date: 2022-12-09 00:55:08
|
| 5 |
+
Hash ID: 090bb050d570d2b668477c3bd0f979dc3a34b9db
|
| 6 |
+
==============================================================================
|
| 7 |
+
xclbin Information
|
| 8 |
+
------------------
|
| 9 |
+
Generated by: v++ (2021.2) on 2021-10-14-04:41:01
|
| 10 |
+
Version: 2.14.384
|
| 11 |
+
Kernels: opt_kernel
|
| 12 |
+
Signature:
|
| 13 |
+
Content: Bitstream
|
| 14 |
+
UUID (xclbin): cbb0489a-3f5c-066e-845c-af93ba50ad0a
|
| 15 |
+
Sections: DEBUG_IP_LAYOUT, BITSTREAM, MEM_TOPOLOGY, IP_LAYOUT,
|
| 16 |
+
CONNECTIVITY, CLOCK_FREQ_TOPOLOGY, BUILD_METADATA,
|
| 17 |
+
EMBEDDED_METADATA, SYSTEM_METADATA,
|
| 18 |
+
GROUP_CONNECTIVITY, GROUP_TOPOLOGY
|
| 19 |
+
==============================================================================
|
| 20 |
+
Hardware Platform (Shell) Information
|
| 21 |
+
-------------------------------------
|
| 22 |
+
Vendor: xilinx
|
| 23 |
+
Board: u280
|
| 24 |
+
Name: xdma
|
| 25 |
+
Version: 201920.3
|
| 26 |
+
Generated Version: Vivado 2019.2 (SW Build: 2742762)
|
| 27 |
+
Created:
|
| 28 |
+
Tue Jan 21 23:21:22 2020 FPGA Device: xcu280
|
| 29 |
+
Board Vendor: xilinx.com
|
| 30 |
+
Board Name: xilinx.com:au280:1.0
|
| 31 |
+
Board Part: xilinx.com:au280:part0:1.0
|
| 32 |
+
Platform VBNV: xilinx_u280_xdma_201920_3
|
| 33 |
+
Static UUID: f2b82d53-372f-45a4-bbe9-3d1c980216da
|
| 34 |
+
Feature ROM TimeStamp: 1579649056
|
| 35 |
+
|
| 36 |
+
Scalable Clocks
|
| 37 |
+
---------------
|
| 38 |
+
Name: clk_out1_pfm_top_clkwiz_hbm_aclk_0
|
| 39 |
+
Index: 0
|
| 40 |
+
Type: SYSTEM
|
| 41 |
+
Frequency: 450 MHz
|
| 42 |
+
|
| 43 |
+
Name: DATA_CLK
|
| 44 |
+
Index: 1
|
| 45 |
+
Type: DATA
|
| 46 |
+
Frequency: 202 MHz
|
| 47 |
+
|
| 48 |
+
Name: KERNEL_CLK
|
| 49 |
+
Index: 2
|
| 50 |
+
Type: KERNEL
|
| 51 |
+
Frequency: 500 MHz
|
| 52 |
+
|
| 53 |
+
System Clocks
|
| 54 |
+
------
|
| 55 |
+
Name: _bd_top_clkwiz_kernel2_clk_out1
|
| 56 |
+
Type: SCALABLE
|
| 57 |
+
Default Freq: 500 MHz
|
| 58 |
+
Requested Freq: 500 MHz
|
| 59 |
+
Achieved Freq: 500 MHz
|
| 60 |
+
|
| 61 |
+
Name: _bd_top_clkwiz_kernel_clk_out1
|
| 62 |
+
Type: SCALABLE
|
| 63 |
+
Default Freq: 300 MHz
|
| 64 |
+
Requested Freq: 300 MHz
|
| 65 |
+
Achieved Freq: 202.5 MHz
|
| 66 |
+
|
| 67 |
+
Memory Configuration
|
| 68 |
+
--------------------
|
| 69 |
+
Name: HBM[0]
|
| 70 |
+
Index: 0
|
| 71 |
+
Type: MEM_DDR4
|
| 72 |
+
Base Address: 0x0
|
| 73 |
+
Address Size: 0x10000000
|
| 74 |
+
Bank Used: Yes
|
| 75 |
+
|
| 76 |
+
Name: HBM[1]
|
| 77 |
+
Index: 1
|
| 78 |
+
Type: MEM_DDR4
|
| 79 |
+
Base Address: 0x10000000
|
| 80 |
+
Address Size: 0x10000000
|
| 81 |
+
Bank Used: Yes
|
| 82 |
+
|
| 83 |
+
Name: HBM[2]
|
| 84 |
+
Index: 2
|
| 85 |
+
Type: MEM_DRAM
|
| 86 |
+
Base Address: 0x20000000
|
| 87 |
+
Address Size: 0x10000000
|
| 88 |
+
Bank Used: Yes
|
| 89 |
+
|
| 90 |
+
Name: HBM[3]
|
| 91 |
+
Index: 3
|
| 92 |
+
Type: MEM_DRAM
|
| 93 |
+
Base Address: 0x30000000
|
| 94 |
+
Address Size: 0x10000000
|
| 95 |
+
Bank Used: Yes
|
| 96 |
+
|
| 97 |
+
Name: HBM[4]
|
| 98 |
+
Index: 4
|
| 99 |
+
Type: MEM_DRAM
|
| 100 |
+
Base Address: 0x40000000
|
| 101 |
+
Address Size: 0x10000000
|
| 102 |
+
Bank Used: No
|
| 103 |
+
|
| 104 |
+
Name: HBM[5]
|
| 105 |
+
Index: 5
|
| 106 |
+
Type: MEM_DRAM
|
| 107 |
+
Base Address: 0x50000000
|
| 108 |
+
Address Size: 0x10000000
|
| 109 |
+
Bank Used: No
|
| 110 |
+
|
| 111 |
+
Name: HBM[6]
|
| 112 |
+
Index: 6
|
| 113 |
+
Type: MEM_DRAM
|
| 114 |
+
Base Address: 0x60000000
|
| 115 |
+
Address Size: 0x10000000
|
| 116 |
+
Bank Used: No
|
| 117 |
+
|
| 118 |
+
Name: HBM[7]
|
| 119 |
+
Index: 7
|
| 120 |
+
Type: MEM_DRAM
|
| 121 |
+
Base Address: 0x70000000
|
| 122 |
+
Address Size: 0x10000000
|
| 123 |
+
Bank Used: No
|
| 124 |
+
|
| 125 |
+
Name: HBM[8]
|
| 126 |
+
Index: 8
|
| 127 |
+
Type: MEM_DRAM
|
| 128 |
+
Base Address: 0x80000000
|
| 129 |
+
Address Size: 0x10000000
|
| 130 |
+
Bank Used: No
|
| 131 |
+
|
| 132 |
+
Name: HBM[9]
|
| 133 |
+
Index: 9
|
| 134 |
+
Type: MEM_DRAM
|
| 135 |
+
Base Address: 0x90000000
|
| 136 |
+
Address Size: 0x10000000
|
| 137 |
+
Bank Used: No
|
| 138 |
+
|
| 139 |
+
Name: HBM[10]
|
| 140 |
+
Index: 10
|
| 141 |
+
Type: MEM_DRAM
|
| 142 |
+
Base Address: 0xa0000000
|
| 143 |
+
Address Size: 0x10000000
|
| 144 |
+
Bank Used: No
|
| 145 |
+
|
| 146 |
+
Name: HBM[11]
|
| 147 |
+
Index: 11
|
| 148 |
+
Type: MEM_DRAM
|
| 149 |
+
Base Address: 0xb0000000
|
| 150 |
+
Address Size: 0x10000000
|
| 151 |
+
Bank Used: No
|
| 152 |
+
|
| 153 |
+
Name: HBM[12]
|
| 154 |
+
Index: 12
|
| 155 |
+
Type: MEM_DRAM
|
| 156 |
+
Base Address: 0xc0000000
|
| 157 |
+
Address Size: 0x10000000
|
| 158 |
+
Bank Used: No
|
| 159 |
+
|
| 160 |
+
Name: HBM[13]
|
| 161 |
+
Index: 13
|
| 162 |
+
Type: MEM_DRAM
|
| 163 |
+
Base Address: 0xd0000000
|
| 164 |
+
Address Size: 0x10000000
|
| 165 |
+
Bank Used: No
|
| 166 |
+
|
| 167 |
+
Name: HBM[14]
|
| 168 |
+
Index: 14
|
| 169 |
+
Type: MEM_DRAM
|
| 170 |
+
Base Address: 0xe0000000
|
| 171 |
+
Address Size: 0x10000000
|
| 172 |
+
Bank Used: No
|
| 173 |
+
|
| 174 |
+
Name: HBM[15]
|
| 175 |
+
Index: 15
|
| 176 |
+
Type: MEM_DRAM
|
| 177 |
+
Base Address: 0xf0000000
|
| 178 |
+
Address Size: 0x10000000
|
| 179 |
+
Bank Used: No
|
| 180 |
+
|
| 181 |
+
Name: HBM[16]
|
| 182 |
+
Index: 16
|
| 183 |
+
Type: MEM_DRAM
|
| 184 |
+
Base Address: 0x100000000
|
| 185 |
+
Address Size: 0x10000000
|
| 186 |
+
Bank Used: Yes
|
| 187 |
+
|
| 188 |
+
Name: HBM[17]
|
| 189 |
+
Index: 17
|
| 190 |
+
Type: MEM_DRAM
|
| 191 |
+
Base Address: 0x110000000
|
| 192 |
+
Address Size: 0x10000000
|
| 193 |
+
Bank Used: Yes
|
| 194 |
+
|
| 195 |
+
Name: HBM[18]
|
| 196 |
+
Index: 18
|
| 197 |
+
Type: MEM_DRAM
|
| 198 |
+
Base Address: 0x120000000
|
| 199 |
+
Address Size: 0x10000000
|
| 200 |
+
Bank Used: Yes
|
| 201 |
+
|
| 202 |
+
Name: HBM[19]
|
| 203 |
+
Index: 19
|
| 204 |
+
Type: MEM_DRAM
|
| 205 |
+
Base Address: 0x130000000
|
| 206 |
+
Address Size: 0x10000000
|
| 207 |
+
Bank Used: No
|
| 208 |
+
|
| 209 |
+
Name: HBM[20]
|
| 210 |
+
Index: 20
|
| 211 |
+
Type: MEM_DRAM
|
| 212 |
+
Base Address: 0x140000000
|
| 213 |
+
Address Size: 0x10000000
|
| 214 |
+
Bank Used: No
|
| 215 |
+
|
| 216 |
+
Name: HBM[21]
|
| 217 |
+
Index: 21
|
| 218 |
+
Type: MEM_DRAM
|
| 219 |
+
Base Address: 0x150000000
|
| 220 |
+
Address Size: 0x10000000
|
| 221 |
+
Bank Used: No
|
| 222 |
+
|
| 223 |
+
Name: HBM[22]
|
| 224 |
+
Index: 22
|
| 225 |
+
Type: MEM_DRAM
|
| 226 |
+
Base Address: 0x160000000
|
| 227 |
+
Address Size: 0x10000000
|
| 228 |
+
Bank Used: No
|
| 229 |
+
|
| 230 |
+
Name: HBM[23]
|
| 231 |
+
Index: 23
|
| 232 |
+
Type: MEM_DRAM
|
| 233 |
+
Base Address: 0x170000000
|
| 234 |
+
Address Size: 0x10000000
|
| 235 |
+
Bank Used: No
|
| 236 |
+
|
| 237 |
+
Name: HBM[24]
|
| 238 |
+
Index: 24
|
| 239 |
+
Type: MEM_DRAM
|
| 240 |
+
Base Address: 0x180000000
|
| 241 |
+
Address Size: 0x10000000
|
| 242 |
+
Bank Used: No
|
| 243 |
+
|
| 244 |
+
Name: HBM[25]
|
| 245 |
+
Index: 25
|
| 246 |
+
Type: MEM_DRAM
|
| 247 |
+
Base Address: 0x190000000
|
| 248 |
+
Address Size: 0x10000000
|
| 249 |
+
Bank Used: No
|
| 250 |
+
|
| 251 |
+
Name: HBM[26]
|
| 252 |
+
Index: 26
|
| 253 |
+
Type: MEM_DRAM
|
| 254 |
+
Base Address: 0x1a0000000
|
| 255 |
+
Address Size: 0x10000000
|
| 256 |
+
Bank Used: No
|
| 257 |
+
|
| 258 |
+
Name: HBM[27]
|
| 259 |
+
Index: 27
|
| 260 |
+
Type: MEM_DRAM
|
| 261 |
+
Base Address: 0x1b0000000
|
| 262 |
+
Address Size: 0x10000000
|
| 263 |
+
Bank Used: No
|
| 264 |
+
|
| 265 |
+
Name: HBM[28]
|
| 266 |
+
Index: 28
|
| 267 |
+
Type: MEM_DRAM
|
| 268 |
+
Base Address: 0x1c0000000
|
| 269 |
+
Address Size: 0x10000000
|
| 270 |
+
Bank Used: No
|
| 271 |
+
|
| 272 |
+
Name: HBM[29]
|
| 273 |
+
Index: 29
|
| 274 |
+
Type: MEM_DRAM
|
| 275 |
+
Base Address: 0x1d0000000
|
| 276 |
+
Address Size: 0x10000000
|
| 277 |
+
Bank Used: No
|
| 278 |
+
|
| 279 |
+
Name: HBM[30]
|
| 280 |
+
Index: 30
|
| 281 |
+
Type: MEM_DRAM
|
| 282 |
+
Base Address: 0x1e0000000
|
| 283 |
+
Address Size: 0x10000000
|
| 284 |
+
Bank Used: No
|
| 285 |
+
|
| 286 |
+
Name: HBM[31]
|
| 287 |
+
Index: 31
|
| 288 |
+
Type: MEM_DRAM
|
| 289 |
+
Base Address: 0x1f0000000
|
| 290 |
+
Address Size: 0x10000000
|
| 291 |
+
Bank Used: No
|
| 292 |
+
|
| 293 |
+
Name: DDR[0]
|
| 294 |
+
Index: 32
|
| 295 |
+
Type: MEM_DRAM
|
| 296 |
+
Base Address: 0x0
|
| 297 |
+
Address Size: 0x0
|
| 298 |
+
Bank Used: No
|
| 299 |
+
|
| 300 |
+
Name: DDR[1]
|
| 301 |
+
Index: 33
|
| 302 |
+
Type: MEM_DRAM
|
| 303 |
+
Base Address: 0x0
|
| 304 |
+
Address Size: 0x0
|
| 305 |
+
Bank Used: No
|
| 306 |
+
|
| 307 |
+
Name: PLRAM[0]
|
| 308 |
+
Index: 34
|
| 309 |
+
Type: MEM_DRAM
|
| 310 |
+
Base Address: 0x0
|
| 311 |
+
Address Size: 0x0
|
| 312 |
+
Bank Used: No
|
| 313 |
+
|
| 314 |
+
Name: PLRAM[1]
|
| 315 |
+
Index: 35
|
| 316 |
+
Type: MEM_DRAM
|
| 317 |
+
Base Address: 0x0
|
| 318 |
+
Address Size: 0x0
|
| 319 |
+
Bank Used: No
|
| 320 |
+
|
| 321 |
+
Name: PLRAM[2]
|
| 322 |
+
Index: 36
|
| 323 |
+
Type: MEM_DRAM
|
| 324 |
+
Base Address: 0x0
|
| 325 |
+
Address Size: 0x0
|
| 326 |
+
Bank Used: No
|
| 327 |
+
|
| 328 |
+
Name: PLRAM[3]
|
| 329 |
+
Index: 37
|
| 330 |
+
Type: MEM_DRAM
|
| 331 |
+
Base Address: 0x0
|
| 332 |
+
Address Size: 0x0
|
| 333 |
+
Bank Used: No
|
| 334 |
+
|
| 335 |
+
Name: PLRAM[4]
|
| 336 |
+
Index: 38
|
| 337 |
+
Type: MEM_DRAM
|
| 338 |
+
Base Address: 0x0
|
| 339 |
+
Address Size: 0x0
|
| 340 |
+
Bank Used: No
|
| 341 |
+
|
| 342 |
+
Name: PLRAM[5]
|
| 343 |
+
Index: 39
|
| 344 |
+
Type: MEM_DRAM
|
| 345 |
+
Base Address: 0x0
|
| 346 |
+
Address Size: 0x0
|
| 347 |
+
Bank Used: No
|
| 348 |
+
==============================================================================
|
| 349 |
+
Kernel: opt_kernel
|
| 350 |
+
|
| 351 |
+
Definition
|
| 352 |
+
----------
|
| 353 |
+
Signature: opt_kernel (const int L, const int L_out, const int seq_len, const int reload, ap_uint<512>* X_acc0, ap_uint<512>* X_acc1, ap_uint<512>* W_acc0, ap_uint<512>* W_acc1, ap_uint<64>* acc0_out, ap_uint<64>* acc1_out, int* cycle_count)
|
| 354 |
+
|
| 355 |
+
Ports
|
| 356 |
+
-----
|
| 357 |
+
Port: m_axi_X_acc0
|
| 358 |
+
Mode: master
|
| 359 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 360 |
+
Data Width: 512 bits
|
| 361 |
+
Port Type: addressable
|
| 362 |
+
|
| 363 |
+
Port: m_axi_X_acc1
|
| 364 |
+
Mode: master
|
| 365 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 366 |
+
Data Width: 512 bits
|
| 367 |
+
Port Type: addressable
|
| 368 |
+
|
| 369 |
+
Port: m_axi_W_acc0
|
| 370 |
+
Mode: master
|
| 371 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 372 |
+
Data Width: 512 bits
|
| 373 |
+
Port Type: addressable
|
| 374 |
+
|
| 375 |
+
Port: m_axi_W_acc1
|
| 376 |
+
Mode: master
|
| 377 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 378 |
+
Data Width: 512 bits
|
| 379 |
+
Port Type: addressable
|
| 380 |
+
|
| 381 |
+
Port: m_axi_acc0_out
|
| 382 |
+
Mode: master
|
| 383 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 384 |
+
Data Width: 64 bits
|
| 385 |
+
Port Type: addressable
|
| 386 |
+
|
| 387 |
+
Port: m_axi_acc1_out
|
| 388 |
+
Mode: master
|
| 389 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 390 |
+
Data Width: 64 bits
|
| 391 |
+
Port Type: addressable
|
| 392 |
+
|
| 393 |
+
Port: m_axi_cycle_count
|
| 394 |
+
Mode: master
|
| 395 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 396 |
+
Data Width: 32 bits
|
| 397 |
+
Port Type: addressable
|
| 398 |
+
|
| 399 |
+
Port: s_axi_control
|
| 400 |
+
Mode: slave
|
| 401 |
+
Range (bytes): 0x1000
|
| 402 |
+
Data Width: 32 bits
|
| 403 |
+
Port Type: addressable
|
| 404 |
+
|
| 405 |
+
--------------------------
|
| 406 |
+
Instance: opt_kernel
|
| 407 |
+
Base Address: 0x1800000
|
| 408 |
+
|
| 409 |
+
Argument: L
|
| 410 |
+
Register Offset: 0x10
|
| 411 |
+
Port: s_axi_control
|
| 412 |
+
Memory: <not applicable>
|
| 413 |
+
|
| 414 |
+
Argument: L_out
|
| 415 |
+
Register Offset: 0x18
|
| 416 |
+
Port: s_axi_control
|
| 417 |
+
Memory: <not applicable>
|
| 418 |
+
|
| 419 |
+
Argument: seq_len
|
| 420 |
+
Register Offset: 0x20
|
| 421 |
+
Port: s_axi_control
|
| 422 |
+
Memory: <not applicable>
|
| 423 |
+
|
| 424 |
+
Argument: reload
|
| 425 |
+
Register Offset: 0x28
|
| 426 |
+
Port: s_axi_control
|
| 427 |
+
Memory: <not applicable>
|
| 428 |
+
|
| 429 |
+
Argument: X_acc0
|
| 430 |
+
Register Offset: 0x30
|
| 431 |
+
Port: m_axi_X_acc0
|
| 432 |
+
Memory: HBM[0] (MEM_DDR4)
|
| 433 |
+
|
| 434 |
+
Argument: X_acc1
|
| 435 |
+
Register Offset: 0x3c
|
| 436 |
+
Port: m_axi_X_acc1
|
| 437 |
+
Memory: HBM[16] (MEM_DRAM)
|
| 438 |
+
|
| 439 |
+
Argument: W_acc0
|
| 440 |
+
Register Offset: 0x48
|
| 441 |
+
Port: m_axi_W_acc0
|
| 442 |
+
Memory: HBM[1] (MEM_DDR4)
|
| 443 |
+
|
| 444 |
+
Argument: W_acc1
|
| 445 |
+
Register Offset: 0x54
|
| 446 |
+
Port: m_axi_W_acc1
|
| 447 |
+
Memory: HBM[17] (MEM_DRAM)
|
| 448 |
+
|
| 449 |
+
Argument: acc0_out
|
| 450 |
+
Register Offset: 0x60
|
| 451 |
+
Port: m_axi_acc0_out
|
| 452 |
+
Memory: HBM[2] (MEM_DRAM)
|
| 453 |
+
|
| 454 |
+
Argument: acc1_out
|
| 455 |
+
Register Offset: 0x6c
|
| 456 |
+
Port: m_axi_acc1_out
|
| 457 |
+
Memory: HBM[18] (MEM_DRAM)
|
| 458 |
+
|
| 459 |
+
Argument: cycle_count
|
| 460 |
+
Register Offset: 0x78
|
| 461 |
+
Port: m_axi_cycle_count
|
| 462 |
+
Memory: HBM[3] (MEM_DRAM)
|
| 463 |
+
==============================================================================
|
| 464 |
+
Generated By
|
| 465 |
+
------------
|
| 466 |
+
Command: v++
|
| 467 |
+
Version: 2021.2 - 2021-10-14-04:41:01 (SW BUILD: 3363252)
|
| 468 |
+
Command Line: v++ --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/run/link_config.ini --connectivity.nk opt_kernel:1:opt_kernel --connectivity.sp opt_kernel.X_acc0:HBM[0] --connectivity.sp opt_kernel.X_acc1:HBM[16] --connectivity.sp opt_kernel.W_acc0:HBM[1] --connectivity.sp opt_kernel.W_acc1:HBM[17] --connectivity.sp opt_kernel.acc0_out:HBM[2] --connectivity.sp opt_kernel.acc1_out:HBM[18] --connectivity.sp opt_kernel.cycle_count:HBM[3] --input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt.hw.xo --kernel opt_kernel --link --optimize 3 --output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin --platform xilinx_u280_xdma_201920_3 --report_level 2 --save-temps --target hw --temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget -propconst -sweep -shift_register_opt} --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=EarlyBlockPlacement --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt-floorplan.tcl --vivado.synth.jobs 8
|
| 469 |
+
Options: --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/run/link_config.ini
|
| 470 |
+
--connectivity.nk opt_kernel:1:opt_kernel
|
| 471 |
+
--connectivity.sp opt_kernel.X_acc0:HBM[0]
|
| 472 |
+
--connectivity.sp opt_kernel.X_acc1:HBM[16]
|
| 473 |
+
--connectivity.sp opt_kernel.W_acc0:HBM[1]
|
| 474 |
+
--connectivity.sp opt_kernel.W_acc1:HBM[17]
|
| 475 |
+
--connectivity.sp opt_kernel.acc0_out:HBM[2]
|
| 476 |
+
--connectivity.sp opt_kernel.acc1_out:HBM[18]
|
| 477 |
+
--connectivity.sp opt_kernel.cycle_count:HBM[3]
|
| 478 |
+
--input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt.hw.xo
|
| 479 |
+
--kernel opt_kernel
|
| 480 |
+
--link
|
| 481 |
+
--optimize 3
|
| 482 |
+
--output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin
|
| 483 |
+
--platform xilinx_u280_xdma_201920_3
|
| 484 |
+
--report_level 2
|
| 485 |
+
--save-temps
|
| 486 |
+
--target hw
|
| 487 |
+
--temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp
|
| 488 |
+
--vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1
|
| 489 |
+
--vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget
|
| 490 |
+
-propconst
|
| 491 |
+
-sweep
|
| 492 |
+
-shift_register_opt}
|
| 493 |
+
--vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=EarlyBlockPlacement
|
| 494 |
+
--vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore
|
| 495 |
+
--vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore
|
| 496 |
+
--vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt-floorplan.tcl
|
| 497 |
+
--vivado.synth.jobs 8
|
| 498 |
+
==============================================================================
|
| 499 |
+
User Added Key Value Pairs
|
| 500 |
+
--------------------------
|
| 501 |
+
<empty>
|
| 502 |
+
==============================================================================
|
gpt-2-medium/bitstreams/opt_kernel_vpk180.xsa
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:722a71423e17da2f05587a9fd3c1e9d695f5cee02962744fcbde569aca21242f
|
| 3 |
+
size 70565471
|
gpt-2-medium/bitstreams/opt_kernel_vpk180_fixed.xsa
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d986bb71631b79c5c2b5c6576e1f78051a671d6d4536e2095c8a39127c456461
|
| 3 |
+
size 86497092
|
gpt-2-medium/bitstreams/opt_kernel_vpk180_full.xsa
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7c1339d6b78b36c4a35cb09709dfebb321bdf0decf037802e5d617356ad42b6
|
| 3 |
+
size 84081530
|
gpt-2-medium/bitstreams/opt_kernel_vpk180_mask.xsa
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d258e7884b1e3c2f42bc8fd7a3878ab976b8cd2cc5042bdaaea949b27f506688
|
| 3 |
+
size 82554104
|
gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_full.info
ADDED
|
@@ -0,0 +1,485 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
==============================================================================
|
| 3 |
+
XRT Build Version: 2.14.384 (2022.2)
|
| 4 |
+
Build Date: 2022-12-09 00:55:08
|
| 5 |
+
Hash ID: 090bb050d570d2b668477c3bd0f979dc3a34b9db
|
| 6 |
+
==============================================================================
|
| 7 |
+
xclbin Information
|
| 8 |
+
------------------
|
| 9 |
+
Generated by: v++ (2021.2) on 2021-10-14-04:41:01
|
| 10 |
+
Version: 2.14.384
|
| 11 |
+
Kernels: opt_kernel
|
| 12 |
+
Signature:
|
| 13 |
+
Content: Bitstream
|
| 14 |
+
UUID (xclbin): 06dfa191-ba53-780e-16db-fd0655f01fc3
|
| 15 |
+
Sections: DEBUG_IP_LAYOUT, BITSTREAM, MEM_TOPOLOGY, IP_LAYOUT,
|
| 16 |
+
CONNECTIVITY, CLOCK_FREQ_TOPOLOGY, BUILD_METADATA,
|
| 17 |
+
EMBEDDED_METADATA, SYSTEM_METADATA,
|
| 18 |
+
GROUP_CONNECTIVITY, GROUP_TOPOLOGY
|
| 19 |
+
==============================================================================
|
| 20 |
+
Hardware Platform (Shell) Information
|
| 21 |
+
-------------------------------------
|
| 22 |
+
Vendor: xilinx
|
| 23 |
+
Board: u280
|
| 24 |
+
Name: xdma
|
| 25 |
+
Version: 201920.3
|
| 26 |
+
Generated Version: Vivado 2019.2 (SW Build: 2742762)
|
| 27 |
+
Created:
|
| 28 |
+
Tue Jan 21 23:21:22 2020 FPGA Device: xcu280
|
| 29 |
+
Board Vendor: xilinx.com
|
| 30 |
+
Board Name: xilinx.com:au280:1.0
|
| 31 |
+
Board Part: xilinx.com:au280:part0:1.0
|
| 32 |
+
Platform VBNV: xilinx_u280_xdma_201920_3
|
| 33 |
+
Static UUID: f2b82d53-372f-45a4-bbe9-3d1c980216da
|
| 34 |
+
Feature ROM TimeStamp: 1579649056
|
| 35 |
+
|
| 36 |
+
Scalable Clocks
|
| 37 |
+
---------------
|
| 38 |
+
Name: clk_out1_pfm_top_clkwiz_hbm_aclk_0
|
| 39 |
+
Index: 0
|
| 40 |
+
Type: SYSTEM
|
| 41 |
+
Frequency: 450 MHz
|
| 42 |
+
|
| 43 |
+
Name: DATA_CLK
|
| 44 |
+
Index: 1
|
| 45 |
+
Type: DATA
|
| 46 |
+
Frequency: 257 MHz
|
| 47 |
+
|
| 48 |
+
Name: KERNEL_CLK
|
| 49 |
+
Index: 2
|
| 50 |
+
Type: KERNEL
|
| 51 |
+
Frequency: 500 MHz
|
| 52 |
+
|
| 53 |
+
System Clocks
|
| 54 |
+
------
|
| 55 |
+
Name: _bd_top_clkwiz_kernel2_clk_out1
|
| 56 |
+
Type: SCALABLE
|
| 57 |
+
Default Freq: 500 MHz
|
| 58 |
+
Requested Freq: 500 MHz
|
| 59 |
+
Achieved Freq: 500 MHz
|
| 60 |
+
|
| 61 |
+
Name: _bd_top_clkwiz_kernel_clk_out1
|
| 62 |
+
Type: SCALABLE
|
| 63 |
+
Default Freq: 300 MHz
|
| 64 |
+
Requested Freq: 300 MHz
|
| 65 |
+
Achieved Freq: 257.2 MHz
|
| 66 |
+
|
| 67 |
+
Memory Configuration
|
| 68 |
+
--------------------
|
| 69 |
+
Name: HBM[0]
|
| 70 |
+
Index: 0
|
| 71 |
+
Type: MEM_DDR4
|
| 72 |
+
Base Address: 0x0
|
| 73 |
+
Address Size: 0x10000000
|
| 74 |
+
Bank Used: No
|
| 75 |
+
|
| 76 |
+
Name: HBM[1]
|
| 77 |
+
Index: 1
|
| 78 |
+
Type: MEM_DDR4
|
| 79 |
+
Base Address: 0x10000000
|
| 80 |
+
Address Size: 0x10000000
|
| 81 |
+
Bank Used: Yes
|
| 82 |
+
|
| 83 |
+
Name: HBM[2]
|
| 84 |
+
Index: 2
|
| 85 |
+
Type: MEM_DRAM
|
| 86 |
+
Base Address: 0x20000000
|
| 87 |
+
Address Size: 0x10000000
|
| 88 |
+
Bank Used: Yes
|
| 89 |
+
|
| 90 |
+
Name: HBM[3]
|
| 91 |
+
Index: 3
|
| 92 |
+
Type: MEM_DRAM
|
| 93 |
+
Base Address: 0x30000000
|
| 94 |
+
Address Size: 0x10000000
|
| 95 |
+
Bank Used: Yes
|
| 96 |
+
|
| 97 |
+
Name: HBM[4]
|
| 98 |
+
Index: 4
|
| 99 |
+
Type: MEM_DRAM
|
| 100 |
+
Base Address: 0x40000000
|
| 101 |
+
Address Size: 0x10000000
|
| 102 |
+
Bank Used: Yes
|
| 103 |
+
|
| 104 |
+
Name: HBM[5]
|
| 105 |
+
Index: 5
|
| 106 |
+
Type: MEM_DRAM
|
| 107 |
+
Base Address: 0x50000000
|
| 108 |
+
Address Size: 0x10000000
|
| 109 |
+
Bank Used: No
|
| 110 |
+
|
| 111 |
+
Name: HBM[6]
|
| 112 |
+
Index: 6
|
| 113 |
+
Type: MEM_DRAM
|
| 114 |
+
Base Address: 0x60000000
|
| 115 |
+
Address Size: 0x10000000
|
| 116 |
+
Bank Used: No
|
| 117 |
+
|
| 118 |
+
Name: HBM[7]
|
| 119 |
+
Index: 7
|
| 120 |
+
Type: MEM_DRAM
|
| 121 |
+
Base Address: 0x70000000
|
| 122 |
+
Address Size: 0x10000000
|
| 123 |
+
Bank Used: Yes
|
| 124 |
+
|
| 125 |
+
Name: HBM[8]
|
| 126 |
+
Index: 8
|
| 127 |
+
Type: MEM_DRAM
|
| 128 |
+
Base Address: 0x80000000
|
| 129 |
+
Address Size: 0x10000000
|
| 130 |
+
Bank Used: No
|
| 131 |
+
|
| 132 |
+
Name: HBM[9]
|
| 133 |
+
Index: 9
|
| 134 |
+
Type: MEM_DRAM
|
| 135 |
+
Base Address: 0x90000000
|
| 136 |
+
Address Size: 0x10000000
|
| 137 |
+
Bank Used: Yes
|
| 138 |
+
|
| 139 |
+
Name: HBM[10]
|
| 140 |
+
Index: 10
|
| 141 |
+
Type: MEM_DRAM
|
| 142 |
+
Base Address: 0xa0000000
|
| 143 |
+
Address Size: 0x10000000
|
| 144 |
+
Bank Used: No
|
| 145 |
+
|
| 146 |
+
Name: HBM[11]
|
| 147 |
+
Index: 11
|
| 148 |
+
Type: MEM_DRAM
|
| 149 |
+
Base Address: 0xb0000000
|
| 150 |
+
Address Size: 0x10000000
|
| 151 |
+
Bank Used: No
|
| 152 |
+
|
| 153 |
+
Name: HBM[12]
|
| 154 |
+
Index: 12
|
| 155 |
+
Type: MEM_DRAM
|
| 156 |
+
Base Address: 0xc0000000
|
| 157 |
+
Address Size: 0x10000000
|
| 158 |
+
Bank Used: No
|
| 159 |
+
|
| 160 |
+
Name: HBM[13]
|
| 161 |
+
Index: 13
|
| 162 |
+
Type: MEM_DRAM
|
| 163 |
+
Base Address: 0xd0000000
|
| 164 |
+
Address Size: 0x10000000
|
| 165 |
+
Bank Used: No
|
| 166 |
+
|
| 167 |
+
Name: HBM[14]
|
| 168 |
+
Index: 14
|
| 169 |
+
Type: MEM_DRAM
|
| 170 |
+
Base Address: 0xe0000000
|
| 171 |
+
Address Size: 0x10000000
|
| 172 |
+
Bank Used: No
|
| 173 |
+
|
| 174 |
+
Name: HBM[15]
|
| 175 |
+
Index: 15
|
| 176 |
+
Type: MEM_DRAM
|
| 177 |
+
Base Address: 0xf0000000
|
| 178 |
+
Address Size: 0x10000000
|
| 179 |
+
Bank Used: No
|
| 180 |
+
|
| 181 |
+
Name: HBM[16]
|
| 182 |
+
Index: 16
|
| 183 |
+
Type: MEM_DRAM
|
| 184 |
+
Base Address: 0x100000000
|
| 185 |
+
Address Size: 0x10000000
|
| 186 |
+
Bank Used: No
|
| 187 |
+
|
| 188 |
+
Name: HBM[17]
|
| 189 |
+
Index: 17
|
| 190 |
+
Type: MEM_DRAM
|
| 191 |
+
Base Address: 0x110000000
|
| 192 |
+
Address Size: 0x10000000
|
| 193 |
+
Bank Used: No
|
| 194 |
+
|
| 195 |
+
Name: HBM[18]
|
| 196 |
+
Index: 18
|
| 197 |
+
Type: MEM_DRAM
|
| 198 |
+
Base Address: 0x120000000
|
| 199 |
+
Address Size: 0x10000000
|
| 200 |
+
Bank Used: No
|
| 201 |
+
|
| 202 |
+
Name: HBM[19]
|
| 203 |
+
Index: 19
|
| 204 |
+
Type: MEM_DRAM
|
| 205 |
+
Base Address: 0x130000000
|
| 206 |
+
Address Size: 0x10000000
|
| 207 |
+
Bank Used: No
|
| 208 |
+
|
| 209 |
+
Name: HBM[20]
|
| 210 |
+
Index: 20
|
| 211 |
+
Type: MEM_DRAM
|
| 212 |
+
Base Address: 0x140000000
|
| 213 |
+
Address Size: 0x10000000
|
| 214 |
+
Bank Used: No
|
| 215 |
+
|
| 216 |
+
Name: HBM[21]
|
| 217 |
+
Index: 21
|
| 218 |
+
Type: MEM_DRAM
|
| 219 |
+
Base Address: 0x150000000
|
| 220 |
+
Address Size: 0x10000000
|
| 221 |
+
Bank Used: No
|
| 222 |
+
|
| 223 |
+
Name: HBM[22]
|
| 224 |
+
Index: 22
|
| 225 |
+
Type: MEM_DRAM
|
| 226 |
+
Base Address: 0x160000000
|
| 227 |
+
Address Size: 0x10000000
|
| 228 |
+
Bank Used: No
|
| 229 |
+
|
| 230 |
+
Name: HBM[23]
|
| 231 |
+
Index: 23
|
| 232 |
+
Type: MEM_DRAM
|
| 233 |
+
Base Address: 0x170000000
|
| 234 |
+
Address Size: 0x10000000
|
| 235 |
+
Bank Used: No
|
| 236 |
+
|
| 237 |
+
Name: HBM[24]
|
| 238 |
+
Index: 24
|
| 239 |
+
Type: MEM_DRAM
|
| 240 |
+
Base Address: 0x180000000
|
| 241 |
+
Address Size: 0x10000000
|
| 242 |
+
Bank Used: No
|
| 243 |
+
|
| 244 |
+
Name: HBM[25]
|
| 245 |
+
Index: 25
|
| 246 |
+
Type: MEM_DRAM
|
| 247 |
+
Base Address: 0x190000000
|
| 248 |
+
Address Size: 0x10000000
|
| 249 |
+
Bank Used: No
|
| 250 |
+
|
| 251 |
+
Name: HBM[26]
|
| 252 |
+
Index: 26
|
| 253 |
+
Type: MEM_DRAM
|
| 254 |
+
Base Address: 0x1a0000000
|
| 255 |
+
Address Size: 0x10000000
|
| 256 |
+
Bank Used: No
|
| 257 |
+
|
| 258 |
+
Name: HBM[27]
|
| 259 |
+
Index: 27
|
| 260 |
+
Type: MEM_DRAM
|
| 261 |
+
Base Address: 0x1b0000000
|
| 262 |
+
Address Size: 0x10000000
|
| 263 |
+
Bank Used: No
|
| 264 |
+
|
| 265 |
+
Name: HBM[28]
|
| 266 |
+
Index: 28
|
| 267 |
+
Type: MEM_DRAM
|
| 268 |
+
Base Address: 0x1c0000000
|
| 269 |
+
Address Size: 0x10000000
|
| 270 |
+
Bank Used: No
|
| 271 |
+
|
| 272 |
+
Name: HBM[29]
|
| 273 |
+
Index: 29
|
| 274 |
+
Type: MEM_DRAM
|
| 275 |
+
Base Address: 0x1d0000000
|
| 276 |
+
Address Size: 0x10000000
|
| 277 |
+
Bank Used: No
|
| 278 |
+
|
| 279 |
+
Name: HBM[30]
|
| 280 |
+
Index: 30
|
| 281 |
+
Type: MEM_DRAM
|
| 282 |
+
Base Address: 0x1e0000000
|
| 283 |
+
Address Size: 0x10000000
|
| 284 |
+
Bank Used: No
|
| 285 |
+
|
| 286 |
+
Name: HBM[31]
|
| 287 |
+
Index: 31
|
| 288 |
+
Type: MEM_DRAM
|
| 289 |
+
Base Address: 0x1f0000000
|
| 290 |
+
Address Size: 0x10000000
|
| 291 |
+
Bank Used: No
|
| 292 |
+
|
| 293 |
+
Name: DDR[0]
|
| 294 |
+
Index: 32
|
| 295 |
+
Type: MEM_DRAM
|
| 296 |
+
Base Address: 0x0
|
| 297 |
+
Address Size: 0x0
|
| 298 |
+
Bank Used: No
|
| 299 |
+
|
| 300 |
+
Name: DDR[1]
|
| 301 |
+
Index: 33
|
| 302 |
+
Type: MEM_DRAM
|
| 303 |
+
Base Address: 0x0
|
| 304 |
+
Address Size: 0x0
|
| 305 |
+
Bank Used: No
|
| 306 |
+
|
| 307 |
+
Name: PLRAM[0]
|
| 308 |
+
Index: 34
|
| 309 |
+
Type: MEM_DRAM
|
| 310 |
+
Base Address: 0x0
|
| 311 |
+
Address Size: 0x0
|
| 312 |
+
Bank Used: No
|
| 313 |
+
|
| 314 |
+
Name: PLRAM[1]
|
| 315 |
+
Index: 35
|
| 316 |
+
Type: MEM_DRAM
|
| 317 |
+
Base Address: 0x0
|
| 318 |
+
Address Size: 0x0
|
| 319 |
+
Bank Used: No
|
| 320 |
+
|
| 321 |
+
Name: PLRAM[2]
|
| 322 |
+
Index: 36
|
| 323 |
+
Type: MEM_DRAM
|
| 324 |
+
Base Address: 0x0
|
| 325 |
+
Address Size: 0x0
|
| 326 |
+
Bank Used: No
|
| 327 |
+
|
| 328 |
+
Name: PLRAM[3]
|
| 329 |
+
Index: 37
|
| 330 |
+
Type: MEM_DRAM
|
| 331 |
+
Base Address: 0x0
|
| 332 |
+
Address Size: 0x0
|
| 333 |
+
Bank Used: No
|
| 334 |
+
|
| 335 |
+
Name: PLRAM[4]
|
| 336 |
+
Index: 38
|
| 337 |
+
Type: MEM_DRAM
|
| 338 |
+
Base Address: 0x0
|
| 339 |
+
Address Size: 0x0
|
| 340 |
+
Bank Used: No
|
| 341 |
+
|
| 342 |
+
Name: PLRAM[5]
|
| 343 |
+
Index: 39
|
| 344 |
+
Type: MEM_DRAM
|
| 345 |
+
Base Address: 0x0
|
| 346 |
+
Address Size: 0x0
|
| 347 |
+
Bank Used: No
|
| 348 |
+
==============================================================================
|
| 349 |
+
Kernel: opt_kernel
|
| 350 |
+
|
| 351 |
+
Definition
|
| 352 |
+
----------
|
| 353 |
+
Signature: opt_kernel (const int L, const int L_out, const int seq_len, ap_uint<512>* X_acc0, ap_uint<512>* X_acc1, ap_uint<512>* W_acc0, ap_uint<512>* W_acc1, ap_uint<128>* acc0_out, int* cycle_count)
|
| 354 |
+
|
| 355 |
+
Ports
|
| 356 |
+
-----
|
| 357 |
+
Port: m_axi_X_acc0
|
| 358 |
+
Mode: master
|
| 359 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 360 |
+
Data Width: 512 bits
|
| 361 |
+
Port Type: addressable
|
| 362 |
+
|
| 363 |
+
Port: m_axi_X_acc1
|
| 364 |
+
Mode: master
|
| 365 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 366 |
+
Data Width: 512 bits
|
| 367 |
+
Port Type: addressable
|
| 368 |
+
|
| 369 |
+
Port: m_axi_W_acc0
|
| 370 |
+
Mode: master
|
| 371 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 372 |
+
Data Width: 512 bits
|
| 373 |
+
Port Type: addressable
|
| 374 |
+
|
| 375 |
+
Port: m_axi_W_acc1
|
| 376 |
+
Mode: master
|
| 377 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 378 |
+
Data Width: 512 bits
|
| 379 |
+
Port Type: addressable
|
| 380 |
+
|
| 381 |
+
Port: m_axi_acc0_out
|
| 382 |
+
Mode: master
|
| 383 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 384 |
+
Data Width: 128 bits
|
| 385 |
+
Port Type: addressable
|
| 386 |
+
|
| 387 |
+
Port: m_axi_cycle_count
|
| 388 |
+
Mode: master
|
| 389 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 390 |
+
Data Width: 32 bits
|
| 391 |
+
Port Type: addressable
|
| 392 |
+
|
| 393 |
+
Port: s_axi_control
|
| 394 |
+
Mode: slave
|
| 395 |
+
Range (bytes): 0x1000
|
| 396 |
+
Data Width: 32 bits
|
| 397 |
+
Port Type: addressable
|
| 398 |
+
|
| 399 |
+
--------------------------
|
| 400 |
+
Instance: opt_kernel
|
| 401 |
+
Base Address: 0x1800000
|
| 402 |
+
|
| 403 |
+
Argument: L
|
| 404 |
+
Register Offset: 0x10
|
| 405 |
+
Port: s_axi_control
|
| 406 |
+
Memory: <not applicable>
|
| 407 |
+
|
| 408 |
+
Argument: L_out
|
| 409 |
+
Register Offset: 0x18
|
| 410 |
+
Port: s_axi_control
|
| 411 |
+
Memory: <not applicable>
|
| 412 |
+
|
| 413 |
+
Argument: seq_len
|
| 414 |
+
Register Offset: 0x20
|
| 415 |
+
Port: s_axi_control
|
| 416 |
+
Memory: <not applicable>
|
| 417 |
+
|
| 418 |
+
Argument: X_acc0
|
| 419 |
+
Register Offset: 0x28
|
| 420 |
+
Port: m_axi_X_acc0
|
| 421 |
+
Memory: HBM[1] (MEM_DDR4)
|
| 422 |
+
|
| 423 |
+
Argument: X_acc1
|
| 424 |
+
Register Offset: 0x34
|
| 425 |
+
Port: m_axi_X_acc1
|
| 426 |
+
Memory: HBM[2] (MEM_DRAM)
|
| 427 |
+
|
| 428 |
+
Argument: W_acc0
|
| 429 |
+
Register Offset: 0x40
|
| 430 |
+
Port: m_axi_W_acc0
|
| 431 |
+
Memory: HBM[3] (MEM_DRAM)
|
| 432 |
+
|
| 433 |
+
Argument: W_acc1
|
| 434 |
+
Register Offset: 0x4c
|
| 435 |
+
Port: m_axi_W_acc1
|
| 436 |
+
Memory: HBM[4] (MEM_DRAM)
|
| 437 |
+
|
| 438 |
+
Argument: acc0_out
|
| 439 |
+
Register Offset: 0x58
|
| 440 |
+
Port: m_axi_acc0_out
|
| 441 |
+
Memory: HBM[7] (MEM_DRAM)
|
| 442 |
+
|
| 443 |
+
Argument: cycle_count
|
| 444 |
+
Register Offset: 0x64
|
| 445 |
+
Port: m_axi_cycle_count
|
| 446 |
+
Memory: HBM[9] (MEM_DRAM)
|
| 447 |
+
==============================================================================
|
| 448 |
+
Generated By
|
| 449 |
+
------------
|
| 450 |
+
Command: v++
|
| 451 |
+
Version: 2021.2 - 2021-10-14-04:41:01 (SW BUILD: 3363252)
|
| 452 |
+
Command Line: v++ --config /scratch/oswaldhe/hbm_config.ini --connectivity.nk opt_kernel:1:opt_kernel --connectivity.sp opt_kernel.X_acc0:HBM[1] --connectivity.sp opt_kernel.X_acc1:HBM[2] --connectivity.sp opt_kernel.W_acc0:HBM[3] --connectivity.sp opt_kernel.W_acc1:HBM[4] --connectivity.sp opt_kernel.acc0_out:HBM[7] --connectivity.sp opt_kernel.cycle_count:HBM[9] --input_files /scratch/oswaldhe/work.out/run-1/design-point.xo --kernel opt_kernel --link --optimize 3 --output /scratch/oswaldhe/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin --platform xilinx_u280_xdma_201920_3 --report_level 2 --save-temps --target hw --temp_dir /scratch/oswaldhe/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget -propconst -sweep -shift_register_opt} --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=SSI_SpreadSLLs --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Default --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Default --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/scratch/oswaldhe/work.out/run-1/constraints.tcl --vivado.synth.jobs 8
|
| 453 |
+
Options: --config /scratch/oswaldhe/hbm_config.ini
|
| 454 |
+
--connectivity.nk opt_kernel:1:opt_kernel
|
| 455 |
+
--connectivity.sp opt_kernel.X_acc0:HBM[1]
|
| 456 |
+
--connectivity.sp opt_kernel.X_acc1:HBM[2]
|
| 457 |
+
--connectivity.sp opt_kernel.W_acc0:HBM[3]
|
| 458 |
+
--connectivity.sp opt_kernel.W_acc1:HBM[4]
|
| 459 |
+
--connectivity.sp opt_kernel.acc0_out:HBM[7]
|
| 460 |
+
--connectivity.sp opt_kernel.cycle_count:HBM[9]
|
| 461 |
+
--input_files /scratch/oswaldhe/work.out/run-1/design-point.xo
|
| 462 |
+
--kernel opt_kernel
|
| 463 |
+
--link
|
| 464 |
+
--optimize 3
|
| 465 |
+
--output /scratch/oswaldhe/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin
|
| 466 |
+
--platform xilinx_u280_xdma_201920_3
|
| 467 |
+
--report_level 2
|
| 468 |
+
--save-temps
|
| 469 |
+
--target hw
|
| 470 |
+
--temp_dir /scratch/oswaldhe/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp
|
| 471 |
+
--vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1
|
| 472 |
+
--vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget
|
| 473 |
+
-propconst
|
| 474 |
+
-sweep
|
| 475 |
+
-shift_register_opt}
|
| 476 |
+
--vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=SSI_SpreadSLLs
|
| 477 |
+
--vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Default
|
| 478 |
+
--vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Default
|
| 479 |
+
--vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/scratch/oswaldhe/work.out/run-1/constraints.tcl
|
| 480 |
+
--vivado.synth.jobs 8
|
| 481 |
+
==============================================================================
|
| 482 |
+
User Added Key Value Pairs
|
| 483 |
+
--------------------------
|
| 484 |
+
<empty>
|
| 485 |
+
==============================================================================
|
gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_full.xclbin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bc34da0da50c9058d7705e2529b37d1b88d2da38c315fa4d8ca878255a43b282
|
| 3 |
+
size 68746361
|
gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_xdma_201920_3.xclbin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7c64f06b252dc6400e5a6a4910f803b6c120b828876009ed128b25db1719c05d
|
| 3 |
+
size 76311460
|
gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_xdma_201920_3.xclbin.info
ADDED
|
@@ -0,0 +1,502 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
==============================================================================
|
| 3 |
+
XRT Build Version: 2.14.384 (2022.2)
|
| 4 |
+
Build Date: 2022-12-09 00:55:08
|
| 5 |
+
Hash ID: 090bb050d570d2b668477c3bd0f979dc3a34b9db
|
| 6 |
+
==============================================================================
|
| 7 |
+
xclbin Information
|
| 8 |
+
------------------
|
| 9 |
+
Generated by: v++ (2021.2) on 2021-10-14-04:41:01
|
| 10 |
+
Version: 2.14.384
|
| 11 |
+
Kernels: opt_kernel
|
| 12 |
+
Signature:
|
| 13 |
+
Content: Bitstream
|
| 14 |
+
UUID (xclbin): ce5651b8-ff94-7baf-4833-5b6446d1a345
|
| 15 |
+
Sections: DEBUG_IP_LAYOUT, BITSTREAM, MEM_TOPOLOGY, IP_LAYOUT,
|
| 16 |
+
CONNECTIVITY, CLOCK_FREQ_TOPOLOGY, BUILD_METADATA,
|
| 17 |
+
EMBEDDED_METADATA, SYSTEM_METADATA,
|
| 18 |
+
GROUP_CONNECTIVITY, GROUP_TOPOLOGY
|
| 19 |
+
==============================================================================
|
| 20 |
+
Hardware Platform (Shell) Information
|
| 21 |
+
-------------------------------------
|
| 22 |
+
Vendor: xilinx
|
| 23 |
+
Board: u280
|
| 24 |
+
Name: xdma
|
| 25 |
+
Version: 201920.3
|
| 26 |
+
Generated Version: Vivado 2019.2 (SW Build: 2742762)
|
| 27 |
+
Created:
|
| 28 |
+
Tue Jan 21 23:21:22 2020 FPGA Device: xcu280
|
| 29 |
+
Board Vendor: xilinx.com
|
| 30 |
+
Board Name: xilinx.com:au280:1.0
|
| 31 |
+
Board Part: xilinx.com:au280:part0:1.0
|
| 32 |
+
Platform VBNV: xilinx_u280_xdma_201920_3
|
| 33 |
+
Static UUID: f2b82d53-372f-45a4-bbe9-3d1c980216da
|
| 34 |
+
Feature ROM TimeStamp: 1579649056
|
| 35 |
+
|
| 36 |
+
Scalable Clocks
|
| 37 |
+
---------------
|
| 38 |
+
Name: clk_out1_pfm_top_clkwiz_hbm_aclk_0
|
| 39 |
+
Index: 0
|
| 40 |
+
Type: SYSTEM
|
| 41 |
+
Frequency: 450 MHz
|
| 42 |
+
|
| 43 |
+
Name: DATA_CLK
|
| 44 |
+
Index: 1
|
| 45 |
+
Type: DATA
|
| 46 |
+
Frequency: 220 MHz
|
| 47 |
+
|
| 48 |
+
Name: KERNEL_CLK
|
| 49 |
+
Index: 2
|
| 50 |
+
Type: KERNEL
|
| 51 |
+
Frequency: 500 MHz
|
| 52 |
+
|
| 53 |
+
System Clocks
|
| 54 |
+
------
|
| 55 |
+
Name: _bd_top_clkwiz_kernel2_clk_out1
|
| 56 |
+
Type: SCALABLE
|
| 57 |
+
Default Freq: 500 MHz
|
| 58 |
+
Requested Freq: 500 MHz
|
| 59 |
+
Achieved Freq: 500 MHz
|
| 60 |
+
|
| 61 |
+
Name: _bd_top_clkwiz_kernel_clk_out1
|
| 62 |
+
Type: SCALABLE
|
| 63 |
+
Default Freq: 300 MHz
|
| 64 |
+
Requested Freq: 300 MHz
|
| 65 |
+
Achieved Freq: 220 MHz
|
| 66 |
+
|
| 67 |
+
Memory Configuration
|
| 68 |
+
--------------------
|
| 69 |
+
Name: HBM[0]
|
| 70 |
+
Index: 0
|
| 71 |
+
Type: MEM_DDR4
|
| 72 |
+
Base Address: 0x0
|
| 73 |
+
Address Size: 0x10000000
|
| 74 |
+
Bank Used: Yes
|
| 75 |
+
|
| 76 |
+
Name: HBM[1]
|
| 77 |
+
Index: 1
|
| 78 |
+
Type: MEM_DDR4
|
| 79 |
+
Base Address: 0x10000000
|
| 80 |
+
Address Size: 0x10000000
|
| 81 |
+
Bank Used: Yes
|
| 82 |
+
|
| 83 |
+
Name: HBM[2]
|
| 84 |
+
Index: 2
|
| 85 |
+
Type: MEM_DRAM
|
| 86 |
+
Base Address: 0x20000000
|
| 87 |
+
Address Size: 0x10000000
|
| 88 |
+
Bank Used: Yes
|
| 89 |
+
|
| 90 |
+
Name: HBM[3]
|
| 91 |
+
Index: 3
|
| 92 |
+
Type: MEM_DRAM
|
| 93 |
+
Base Address: 0x30000000
|
| 94 |
+
Address Size: 0x10000000
|
| 95 |
+
Bank Used: Yes
|
| 96 |
+
|
| 97 |
+
Name: HBM[4]
|
| 98 |
+
Index: 4
|
| 99 |
+
Type: MEM_DRAM
|
| 100 |
+
Base Address: 0x40000000
|
| 101 |
+
Address Size: 0x10000000
|
| 102 |
+
Bank Used: No
|
| 103 |
+
|
| 104 |
+
Name: HBM[5]
|
| 105 |
+
Index: 5
|
| 106 |
+
Type: MEM_DRAM
|
| 107 |
+
Base Address: 0x50000000
|
| 108 |
+
Address Size: 0x10000000
|
| 109 |
+
Bank Used: No
|
| 110 |
+
|
| 111 |
+
Name: HBM[6]
|
| 112 |
+
Index: 6
|
| 113 |
+
Type: MEM_DRAM
|
| 114 |
+
Base Address: 0x60000000
|
| 115 |
+
Address Size: 0x10000000
|
| 116 |
+
Bank Used: No
|
| 117 |
+
|
| 118 |
+
Name: HBM[7]
|
| 119 |
+
Index: 7
|
| 120 |
+
Type: MEM_DRAM
|
| 121 |
+
Base Address: 0x70000000
|
| 122 |
+
Address Size: 0x10000000
|
| 123 |
+
Bank Used: No
|
| 124 |
+
|
| 125 |
+
Name: HBM[8]
|
| 126 |
+
Index: 8
|
| 127 |
+
Type: MEM_DRAM
|
| 128 |
+
Base Address: 0x80000000
|
| 129 |
+
Address Size: 0x10000000
|
| 130 |
+
Bank Used: No
|
| 131 |
+
|
| 132 |
+
Name: HBM[9]
|
| 133 |
+
Index: 9
|
| 134 |
+
Type: MEM_DRAM
|
| 135 |
+
Base Address: 0x90000000
|
| 136 |
+
Address Size: 0x10000000
|
| 137 |
+
Bank Used: No
|
| 138 |
+
|
| 139 |
+
Name: HBM[10]
|
| 140 |
+
Index: 10
|
| 141 |
+
Type: MEM_DRAM
|
| 142 |
+
Base Address: 0xa0000000
|
| 143 |
+
Address Size: 0x10000000
|
| 144 |
+
Bank Used: No
|
| 145 |
+
|
| 146 |
+
Name: HBM[11]
|
| 147 |
+
Index: 11
|
| 148 |
+
Type: MEM_DRAM
|
| 149 |
+
Base Address: 0xb0000000
|
| 150 |
+
Address Size: 0x10000000
|
| 151 |
+
Bank Used: No
|
| 152 |
+
|
| 153 |
+
Name: HBM[12]
|
| 154 |
+
Index: 12
|
| 155 |
+
Type: MEM_DRAM
|
| 156 |
+
Base Address: 0xc0000000
|
| 157 |
+
Address Size: 0x10000000
|
| 158 |
+
Bank Used: No
|
| 159 |
+
|
| 160 |
+
Name: HBM[13]
|
| 161 |
+
Index: 13
|
| 162 |
+
Type: MEM_DRAM
|
| 163 |
+
Base Address: 0xd0000000
|
| 164 |
+
Address Size: 0x10000000
|
| 165 |
+
Bank Used: No
|
| 166 |
+
|
| 167 |
+
Name: HBM[14]
|
| 168 |
+
Index: 14
|
| 169 |
+
Type: MEM_DRAM
|
| 170 |
+
Base Address: 0xe0000000
|
| 171 |
+
Address Size: 0x10000000
|
| 172 |
+
Bank Used: No
|
| 173 |
+
|
| 174 |
+
Name: HBM[15]
|
| 175 |
+
Index: 15
|
| 176 |
+
Type: MEM_DRAM
|
| 177 |
+
Base Address: 0xf0000000
|
| 178 |
+
Address Size: 0x10000000
|
| 179 |
+
Bank Used: No
|
| 180 |
+
|
| 181 |
+
Name: HBM[16]
|
| 182 |
+
Index: 16
|
| 183 |
+
Type: MEM_DRAM
|
| 184 |
+
Base Address: 0x100000000
|
| 185 |
+
Address Size: 0x10000000
|
| 186 |
+
Bank Used: Yes
|
| 187 |
+
|
| 188 |
+
Name: HBM[17]
|
| 189 |
+
Index: 17
|
| 190 |
+
Type: MEM_DRAM
|
| 191 |
+
Base Address: 0x110000000
|
| 192 |
+
Address Size: 0x10000000
|
| 193 |
+
Bank Used: Yes
|
| 194 |
+
|
| 195 |
+
Name: HBM[18]
|
| 196 |
+
Index: 18
|
| 197 |
+
Type: MEM_DRAM
|
| 198 |
+
Base Address: 0x120000000
|
| 199 |
+
Address Size: 0x10000000
|
| 200 |
+
Bank Used: Yes
|
| 201 |
+
|
| 202 |
+
Name: HBM[19]
|
| 203 |
+
Index: 19
|
| 204 |
+
Type: MEM_DRAM
|
| 205 |
+
Base Address: 0x130000000
|
| 206 |
+
Address Size: 0x10000000
|
| 207 |
+
Bank Used: No
|
| 208 |
+
|
| 209 |
+
Name: HBM[20]
|
| 210 |
+
Index: 20
|
| 211 |
+
Type: MEM_DRAM
|
| 212 |
+
Base Address: 0x140000000
|
| 213 |
+
Address Size: 0x10000000
|
| 214 |
+
Bank Used: No
|
| 215 |
+
|
| 216 |
+
Name: HBM[21]
|
| 217 |
+
Index: 21
|
| 218 |
+
Type: MEM_DRAM
|
| 219 |
+
Base Address: 0x150000000
|
| 220 |
+
Address Size: 0x10000000
|
| 221 |
+
Bank Used: No
|
| 222 |
+
|
| 223 |
+
Name: HBM[22]
|
| 224 |
+
Index: 22
|
| 225 |
+
Type: MEM_DRAM
|
| 226 |
+
Base Address: 0x160000000
|
| 227 |
+
Address Size: 0x10000000
|
| 228 |
+
Bank Used: No
|
| 229 |
+
|
| 230 |
+
Name: HBM[23]
|
| 231 |
+
Index: 23
|
| 232 |
+
Type: MEM_DRAM
|
| 233 |
+
Base Address: 0x170000000
|
| 234 |
+
Address Size: 0x10000000
|
| 235 |
+
Bank Used: No
|
| 236 |
+
|
| 237 |
+
Name: HBM[24]
|
| 238 |
+
Index: 24
|
| 239 |
+
Type: MEM_DRAM
|
| 240 |
+
Base Address: 0x180000000
|
| 241 |
+
Address Size: 0x10000000
|
| 242 |
+
Bank Used: No
|
| 243 |
+
|
| 244 |
+
Name: HBM[25]
|
| 245 |
+
Index: 25
|
| 246 |
+
Type: MEM_DRAM
|
| 247 |
+
Base Address: 0x190000000
|
| 248 |
+
Address Size: 0x10000000
|
| 249 |
+
Bank Used: No
|
| 250 |
+
|
| 251 |
+
Name: HBM[26]
|
| 252 |
+
Index: 26
|
| 253 |
+
Type: MEM_DRAM
|
| 254 |
+
Base Address: 0x1a0000000
|
| 255 |
+
Address Size: 0x10000000
|
| 256 |
+
Bank Used: No
|
| 257 |
+
|
| 258 |
+
Name: HBM[27]
|
| 259 |
+
Index: 27
|
| 260 |
+
Type: MEM_DRAM
|
| 261 |
+
Base Address: 0x1b0000000
|
| 262 |
+
Address Size: 0x10000000
|
| 263 |
+
Bank Used: No
|
| 264 |
+
|
| 265 |
+
Name: HBM[28]
|
| 266 |
+
Index: 28
|
| 267 |
+
Type: MEM_DRAM
|
| 268 |
+
Base Address: 0x1c0000000
|
| 269 |
+
Address Size: 0x10000000
|
| 270 |
+
Bank Used: No
|
| 271 |
+
|
| 272 |
+
Name: HBM[29]
|
| 273 |
+
Index: 29
|
| 274 |
+
Type: MEM_DRAM
|
| 275 |
+
Base Address: 0x1d0000000
|
| 276 |
+
Address Size: 0x10000000
|
| 277 |
+
Bank Used: No
|
| 278 |
+
|
| 279 |
+
Name: HBM[30]
|
| 280 |
+
Index: 30
|
| 281 |
+
Type: MEM_DRAM
|
| 282 |
+
Base Address: 0x1e0000000
|
| 283 |
+
Address Size: 0x10000000
|
| 284 |
+
Bank Used: No
|
| 285 |
+
|
| 286 |
+
Name: HBM[31]
|
| 287 |
+
Index: 31
|
| 288 |
+
Type: MEM_DRAM
|
| 289 |
+
Base Address: 0x1f0000000
|
| 290 |
+
Address Size: 0x10000000
|
| 291 |
+
Bank Used: No
|
| 292 |
+
|
| 293 |
+
Name: DDR[0]
|
| 294 |
+
Index: 32
|
| 295 |
+
Type: MEM_DRAM
|
| 296 |
+
Base Address: 0x0
|
| 297 |
+
Address Size: 0x0
|
| 298 |
+
Bank Used: No
|
| 299 |
+
|
| 300 |
+
Name: DDR[1]
|
| 301 |
+
Index: 33
|
| 302 |
+
Type: MEM_DRAM
|
| 303 |
+
Base Address: 0x0
|
| 304 |
+
Address Size: 0x0
|
| 305 |
+
Bank Used: No
|
| 306 |
+
|
| 307 |
+
Name: PLRAM[0]
|
| 308 |
+
Index: 34
|
| 309 |
+
Type: MEM_DRAM
|
| 310 |
+
Base Address: 0x0
|
| 311 |
+
Address Size: 0x0
|
| 312 |
+
Bank Used: No
|
| 313 |
+
|
| 314 |
+
Name: PLRAM[1]
|
| 315 |
+
Index: 35
|
| 316 |
+
Type: MEM_DRAM
|
| 317 |
+
Base Address: 0x0
|
| 318 |
+
Address Size: 0x0
|
| 319 |
+
Bank Used: No
|
| 320 |
+
|
| 321 |
+
Name: PLRAM[2]
|
| 322 |
+
Index: 36
|
| 323 |
+
Type: MEM_DRAM
|
| 324 |
+
Base Address: 0x0
|
| 325 |
+
Address Size: 0x0
|
| 326 |
+
Bank Used: No
|
| 327 |
+
|
| 328 |
+
Name: PLRAM[3]
|
| 329 |
+
Index: 37
|
| 330 |
+
Type: MEM_DRAM
|
| 331 |
+
Base Address: 0x0
|
| 332 |
+
Address Size: 0x0
|
| 333 |
+
Bank Used: No
|
| 334 |
+
|
| 335 |
+
Name: PLRAM[4]
|
| 336 |
+
Index: 38
|
| 337 |
+
Type: MEM_DRAM
|
| 338 |
+
Base Address: 0x0
|
| 339 |
+
Address Size: 0x0
|
| 340 |
+
Bank Used: No
|
| 341 |
+
|
| 342 |
+
Name: PLRAM[5]
|
| 343 |
+
Index: 39
|
| 344 |
+
Type: MEM_DRAM
|
| 345 |
+
Base Address: 0x0
|
| 346 |
+
Address Size: 0x0
|
| 347 |
+
Bank Used: No
|
| 348 |
+
==============================================================================
|
| 349 |
+
Kernel: opt_kernel
|
| 350 |
+
|
| 351 |
+
Definition
|
| 352 |
+
----------
|
| 353 |
+
Signature: opt_kernel (const int L, const int L_out, const int seq_len, const int reload, ap_uint<512>* X_acc0, ap_uint<512>* X_acc1, ap_uint<512>* W_acc0, ap_uint<512>* W_acc1, ap_uint<64>* acc0_out, ap_uint<64>* acc1_out, int* cycle_count)
|
| 354 |
+
|
| 355 |
+
Ports
|
| 356 |
+
-----
|
| 357 |
+
Port: m_axi_X_acc0
|
| 358 |
+
Mode: master
|
| 359 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 360 |
+
Data Width: 512 bits
|
| 361 |
+
Port Type: addressable
|
| 362 |
+
|
| 363 |
+
Port: m_axi_X_acc1
|
| 364 |
+
Mode: master
|
| 365 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 366 |
+
Data Width: 512 bits
|
| 367 |
+
Port Type: addressable
|
| 368 |
+
|
| 369 |
+
Port: m_axi_W_acc0
|
| 370 |
+
Mode: master
|
| 371 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 372 |
+
Data Width: 512 bits
|
| 373 |
+
Port Type: addressable
|
| 374 |
+
|
| 375 |
+
Port: m_axi_W_acc1
|
| 376 |
+
Mode: master
|
| 377 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 378 |
+
Data Width: 512 bits
|
| 379 |
+
Port Type: addressable
|
| 380 |
+
|
| 381 |
+
Port: m_axi_acc0_out
|
| 382 |
+
Mode: master
|
| 383 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 384 |
+
Data Width: 64 bits
|
| 385 |
+
Port Type: addressable
|
| 386 |
+
|
| 387 |
+
Port: m_axi_acc1_out
|
| 388 |
+
Mode: master
|
| 389 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 390 |
+
Data Width: 64 bits
|
| 391 |
+
Port Type: addressable
|
| 392 |
+
|
| 393 |
+
Port: m_axi_cycle_count
|
| 394 |
+
Mode: master
|
| 395 |
+
Range (bytes): 0xFFFFFFFFFFFFFFFF
|
| 396 |
+
Data Width: 32 bits
|
| 397 |
+
Port Type: addressable
|
| 398 |
+
|
| 399 |
+
Port: s_axi_control
|
| 400 |
+
Mode: slave
|
| 401 |
+
Range (bytes): 0x1000
|
| 402 |
+
Data Width: 32 bits
|
| 403 |
+
Port Type: addressable
|
| 404 |
+
|
| 405 |
+
--------------------------
|
| 406 |
+
Instance: opt_kernel
|
| 407 |
+
Base Address: 0x1800000
|
| 408 |
+
|
| 409 |
+
Argument: L
|
| 410 |
+
Register Offset: 0x10
|
| 411 |
+
Port: s_axi_control
|
| 412 |
+
Memory: <not applicable>
|
| 413 |
+
|
| 414 |
+
Argument: L_out
|
| 415 |
+
Register Offset: 0x18
|
| 416 |
+
Port: s_axi_control
|
| 417 |
+
Memory: <not applicable>
|
| 418 |
+
|
| 419 |
+
Argument: seq_len
|
| 420 |
+
Register Offset: 0x20
|
| 421 |
+
Port: s_axi_control
|
| 422 |
+
Memory: <not applicable>
|
| 423 |
+
|
| 424 |
+
Argument: reload
|
| 425 |
+
Register Offset: 0x28
|
| 426 |
+
Port: s_axi_control
|
| 427 |
+
Memory: <not applicable>
|
| 428 |
+
|
| 429 |
+
Argument: X_acc0
|
| 430 |
+
Register Offset: 0x30
|
| 431 |
+
Port: m_axi_X_acc0
|
| 432 |
+
Memory: HBM[0] (MEM_DDR4)
|
| 433 |
+
|
| 434 |
+
Argument: X_acc1
|
| 435 |
+
Register Offset: 0x3c
|
| 436 |
+
Port: m_axi_X_acc1
|
| 437 |
+
Memory: HBM[16] (MEM_DRAM)
|
| 438 |
+
|
| 439 |
+
Argument: W_acc0
|
| 440 |
+
Register Offset: 0x48
|
| 441 |
+
Port: m_axi_W_acc0
|
| 442 |
+
Memory: HBM[1] (MEM_DDR4)
|
| 443 |
+
|
| 444 |
+
Argument: W_acc1
|
| 445 |
+
Register Offset: 0x54
|
| 446 |
+
Port: m_axi_W_acc1
|
| 447 |
+
Memory: HBM[17] (MEM_DRAM)
|
| 448 |
+
|
| 449 |
+
Argument: acc0_out
|
| 450 |
+
Register Offset: 0x60
|
| 451 |
+
Port: m_axi_acc0_out
|
| 452 |
+
Memory: HBM[2] (MEM_DRAM)
|
| 453 |
+
|
| 454 |
+
Argument: acc1_out
|
| 455 |
+
Register Offset: 0x6c
|
| 456 |
+
Port: m_axi_acc1_out
|
| 457 |
+
Memory: HBM[18] (MEM_DRAM)
|
| 458 |
+
|
| 459 |
+
Argument: cycle_count
|
| 460 |
+
Register Offset: 0x78
|
| 461 |
+
Port: m_axi_cycle_count
|
| 462 |
+
Memory: HBM[3] (MEM_DRAM)
|
| 463 |
+
==============================================================================
|
| 464 |
+
Generated By
|
| 465 |
+
------------
|
| 466 |
+
Command: v++
|
| 467 |
+
Version: 2021.2 - 2021-10-14-04:41:01 (SW BUILD: 3363252)
|
| 468 |
+
Command Line: v++ --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/run/link_config.ini --connectivity.nk opt_kernel:1:opt_kernel --connectivity.sp opt_kernel.X_acc0:HBM[0] --connectivity.sp opt_kernel.X_acc1:HBM[16] --connectivity.sp opt_kernel.W_acc0:HBM[1] --connectivity.sp opt_kernel.W_acc1:HBM[17] --connectivity.sp opt_kernel.acc0_out:HBM[2] --connectivity.sp opt_kernel.acc1_out:HBM[18] --connectivity.sp opt_kernel.cycle_count:HBM[3] --input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt.hw.xo --kernel opt_kernel --link --optimize 3 --output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin --platform xilinx_u280_xdma_201920_3 --report_level 2 --save-temps --target hw --temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget -propconst -sweep -shift_register_opt} --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=EarlyBlockPlacement --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt-floorplan.tcl --vivado.synth.jobs 8
|
| 469 |
+
Options: --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/run/link_config.ini
|
| 470 |
+
--connectivity.nk opt_kernel:1:opt_kernel
|
| 471 |
+
--connectivity.sp opt_kernel.X_acc0:HBM[0]
|
| 472 |
+
--connectivity.sp opt_kernel.X_acc1:HBM[16]
|
| 473 |
+
--connectivity.sp opt_kernel.W_acc0:HBM[1]
|
| 474 |
+
--connectivity.sp opt_kernel.W_acc1:HBM[17]
|
| 475 |
+
--connectivity.sp opt_kernel.acc0_out:HBM[2]
|
| 476 |
+
--connectivity.sp opt_kernel.acc1_out:HBM[18]
|
| 477 |
+
--connectivity.sp opt_kernel.cycle_count:HBM[3]
|
| 478 |
+
--input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt.hw.xo
|
| 479 |
+
--kernel opt_kernel
|
| 480 |
+
--link
|
| 481 |
+
--optimize 3
|
| 482 |
+
--output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin
|
| 483 |
+
--platform xilinx_u280_xdma_201920_3
|
| 484 |
+
--report_level 2
|
| 485 |
+
--save-temps
|
| 486 |
+
--target hw
|
| 487 |
+
--temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp
|
| 488 |
+
--vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1
|
| 489 |
+
--vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget
|
| 490 |
+
-propconst
|
| 491 |
+
-sweep
|
| 492 |
+
-shift_register_opt}
|
| 493 |
+
--vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=EarlyBlockPlacement
|
| 494 |
+
--vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore
|
| 495 |
+
--vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore
|
| 496 |
+
--vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt-floorplan.tcl
|
| 497 |
+
--vivado.synth.jobs 8
|
| 498 |
+
==============================================================================
|
| 499 |
+
User Added Key Value Pairs
|
| 500 |
+
--------------------------
|
| 501 |
+
<empty>
|
| 502 |
+
==============================================================================
|
gpt-2-medium/export_xo.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rapidstream import RapidStreamTAPA, DeviceFactory, get_u250_vitis_device_factory
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
CURR_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 6 |
+
BUILD_DIR = "rs_build"
|
| 7 |
+
VITIS_PLATFORM = "~/vpk180_linux_platform/vpk180_pfm_vitis/export/vpk180_pfm_vitis/vpk180_pfm_vitis.xpfm"
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
rs = RapidStreamTAPA(BUILD_DIR)
|
| 11 |
+
|
| 12 |
+
# factory = get_u250_vitis_device_factory(VITIS_PLATFORM)
|
| 13 |
+
factory = DeviceFactory(
|
| 14 |
+
row=4,
|
| 15 |
+
col=2,
|
| 16 |
+
part_num="xcvp1802-lsvc4072-2MP-e-S",
|
| 17 |
+
board_name="xilinx.com:vpk180:part0:1.1",
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
# Set the pblocks of the device so that each slot contains half of an SLR:
|
| 21 |
+
factory.set_slot_pblock(0, 0, ["-add CLOCKREGION_X0Y1:CLOCKREGION_X4Y4"])
|
| 22 |
+
factory.set_slot_pblock(1, 0, ["-add CLOCKREGION_X5Y1:CLOCKREGION_X9Y4"])
|
| 23 |
+
factory.set_slot_pblock(0, 1, ["-add CLOCKREGION_X0Y5:CLOCKREGION_X4Y7"])
|
| 24 |
+
factory.set_slot_pblock(1, 1, ["-add CLOCKREGION_X5Y5:CLOCKREGION_X9Y7"])
|
| 25 |
+
|
| 26 |
+
factory.set_slot_pblock(0, 2, ["-add CLOCKREGION_X0Y8:CLOCKREGION_X4Y10"])
|
| 27 |
+
factory.set_slot_pblock(1, 2, ["-add CLOCKREGION_X5Y8:CLOCKREGION_X9Y10"])
|
| 28 |
+
factory.set_slot_pblock(0, 3, ["-add CLOCKREGION_X0Y11:CLOCKREGION_X4Y13"])
|
| 29 |
+
factory.set_slot_pblock(1, 3, ["-add CLOCKREGION_X5Y11:CLOCKREGION_X9Y13"])
|
| 30 |
+
|
| 31 |
+
# There are 18870 total SLL nodes for VP1552 (NOTE: part_num above is xcvp1802 -- TODO confirm this count applies to that part):
|
| 32 |
+
factory.set_slot_capacity(0, 0, north=9435)
|
| 33 |
+
factory.set_slot_capacity(1, 0, north=9435)
|
| 34 |
+
factory.set_slot_capacity(0, 1, north=9435)
|
| 35 |
+
factory.set_slot_capacity(1, 1, north=9435)
|
| 36 |
+
factory.set_slot_capacity(0, 2, north=9435)
|
| 37 |
+
factory.set_slot_capacity(1, 2, north=9435)
|
| 38 |
+
|
| 39 |
+
# Call factory to extract the slot resources automatically from Vivado:
|
| 40 |
+
factory.extract_slot_resources()
|
| 41 |
+
|
| 42 |
+
rs.set_virtual_device(factory.generate_virtual_device())
|
| 43 |
+
|
| 44 |
+
rs.add_xo_file("./gpt2-sa.tapa/gpt2.xo")
|
| 45 |
+
rs.set_top_module_name("opt_kernel")
|
| 46 |
+
rs.add_clock("ap_clk", period_ns=3.33)
|
| 47 |
+
rs.set_vitis_connectivity_config("link_config_versal.ini")
|
| 48 |
+
|
| 49 |
+
work_dir_to_ir = {Path(f'{CURR_DIR}/{BUILD_DIR}/dse/candidate_5'): Path(f'{CURR_DIR}/{BUILD_DIR}/dse/candidate_5/add_pipeline.json')}
|
| 50 |
+
rs.remote_ip_cache = Path(f"{CURR_DIR}/{BUILD_DIR}")
|
| 51 |
+
rs.set_vitis_platform(VITIS_PLATFORM)
|
| 52 |
+
rs.parallel_export_candidates(work_dir_to_ir)
|
gpt-2-medium/generate_bitstream_sample.sh
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
TARGET=hw
|
| 3 |
+
# TARGET=hw_emu
|
| 4 |
+
# DEBUG=-g
|
| 5 |
+
|
| 6 |
+
TOP=opt_kernel
|
| 7 |
+
XO='/path/to/opt_kernel.xo'
|
| 8 |
+
CONSTRAINT='/path/to/floorplanning/constraint.tcl'
|
| 9 |
+
>&2 echo "Using the default clock target of the platform."
|
| 10 |
+
PLATFORM="/path/to/vitis/vpk180.xpfm"
|
| 11 |
+
TARGET_FREQUENCY=240000000
|
| 12 |
+
if [ -z $PLATFORM ]; then echo Please edit this file and set a valid PLATFORM= on line "${LINENO}"; exit; fi
|
| 13 |
+
|
| 14 |
+
OUTPUT_DIR="$(pwd)/vitis_run_${TARGET}_ln"
|
| 15 |
+
|
| 16 |
+
MAX_SYNTH_JOBS=16
|
| 17 |
+
STRATEGY="Explore"
|
| 18 |
+
PLACEMENT_STRATEGY="Explore"
|
| 19 |
+
|
| 20 |
+
v++ ${DEBUG} \
|
| 21 |
+
--link \
|
| 22 |
+
--output "${OUTPUT_DIR}/${TOP}_vpk180.xsa" \
|
| 23 |
+
--kernel ${TOP} \
|
| 24 |
+
--platform ${PLATFORM} \
|
| 25 |
+
--target ${TARGET} \
|
| 26 |
+
--report_level 2 \
|
| 27 |
+
--temp_dir "${OUTPUT_DIR}/${TOP}_vpk180.temp" \
|
| 28 |
+
--optimize 3 \
|
| 29 |
+
--connectivity.nk ${TOP}:1:${TOP} \
|
| 30 |
+
--save-temps \
|
| 31 |
+
"${XO}" \
|
| 32 |
+
--vivado.synth.jobs ${MAX_SYNTH_JOBS} \
|
| 33 |
+
--vivado.prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 \
|
| 34 |
+
--vivado.prop=run.impl_1.STEPS.OPT_DESIGN.ARGS.DIRECTIVE=$STRATEGY \
|
| 35 |
+
--vivado.prop=run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE\ OPTIONS}={-debug_log} \
|
| 36 |
+
--vivado.prop=run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=$PLACEMENT_STRATEGY \
|
| 37 |
+
--vivado.prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=$STRATEGY \
|
| 38 |
+
--vivado.prop=run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=$STRATEGY \
|
| 39 |
+
--clock.default_freqhz ${TARGET_FREQUENCY} \
|
| 40 |
+
--vivado.prop=run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=$CONSTRAINT \
|
gpt-2-medium/hbm_config.ini
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[connectivity]
|
| 2 |
+
sp=opt_kernel.X_acc0:HBM[0]
|
| 3 |
+
sp=opt_kernel.X_acc1:HBM[16]
|
| 4 |
+
sp=opt_kernel.W_acc0:HBM[1]
|
| 5 |
+
sp=opt_kernel.W_acc1:HBM[17]
|
| 6 |
+
sp=opt_kernel.acc0_out:HBM[2]
|
| 7 |
+
sp=opt_kernel.cycle_count:HBM[19]
|
gpt-2-medium/host-u280.cpp
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <vector>
|
| 2 |
+
#include <cmath>
|
| 3 |
+
#include <iostream>
|
| 4 |
+
#include <string>
|
| 5 |
+
#include <ctime>
|
| 6 |
+
#include <cmath>
|
| 7 |
+
#include <tapa.h>
|
| 8 |
+
#include <gflags/gflags.h>
|
| 9 |
+
#include <ap_int.h>
|
| 10 |
+
|
| 11 |
+
constexpr int D = 1024;
|
| 12 |
+
constexpr int D_ffn = 5504;
|
| 13 |
+
constexpr int N_head = 16;
|
| 14 |
+
constexpr int MAX_SEQ_LEN = 1024;
|
| 15 |
+
constexpr int NUM_SLR = 3;
|
| 16 |
+
constexpr int NUM_DUM_SLR = 4;
|
| 17 |
+
constexpr int D_head = D / N_head;
|
| 18 |
+
constexpr int FFN_WEIGHT_SIZE = D * D_ffn;
|
| 19 |
+
constexpr int OUT_WEIGHT_SIZE = D * D;
|
| 20 |
+
constexpr int QKV_WEIGHT_SIZE = D * D / N_head * NUM_DUM_SLR * 2; // multi-head attention
|
| 21 |
+
|
| 22 |
+
using std::vector;
|
| 23 |
+
using int_v16 = tapa::vec_t<int, 16>;
|
| 24 |
+
using int4_v128 = tapa::vec_t<ap_int<4>, 128>;
|
| 25 |
+
using int8_v64 = tapa::vec_t<ap_int<8>, 64>;
|
| 26 |
+
|
| 27 |
+
void opt_kernel(
|
| 28 |
+
const int L,
|
| 29 |
+
const int L_out,
|
| 30 |
+
const int seq_len,
|
| 31 |
+
// tapa::mmap<int> inst, // inst[0] = L, inst[1] = reload_weight
|
| 32 |
+
tapa::mmap<ap_uint<512>> X_acc0,
|
| 33 |
+
tapa::mmap<ap_uint<512>> X_acc1,
|
| 34 |
+
tapa::mmap<ap_uint<512>> W_acc0,
|
| 35 |
+
tapa::mmap<ap_uint<512>> W_acc1,
|
| 36 |
+
tapa::mmap<ap_uint<128>> acc0_out,
|
| 37 |
+
// tapa::mmap<ap_uint<64>> acc1_out,
|
| 38 |
+
tapa::mmap<int> cycle_count
|
| 39 |
+
);
|
| 40 |
+
|
| 41 |
+
template <typename T>
|
| 42 |
+
using aligned_vector = std::vector<T, tapa::aligned_allocator<T>>;
|
| 43 |
+
|
| 44 |
+
DEFINE_string(bitstream, "", "path to bitstream file");
|
| 45 |
+
|
| 46 |
+
int main(int argc, char *argv[]){
|
| 47 |
+
gflags::ParseCommandLineFlags(&argc, &argv, true);
|
| 48 |
+
|
| 49 |
+
const int L = argc > 1 ? atoll(argv[1]) : MAX_SEQ_LEN;
|
| 50 |
+
|
| 51 |
+
srand((unsigned)time(nullptr));
|
| 52 |
+
|
| 53 |
+
// data preparation
|
| 54 |
+
aligned_vector<int> inst = {L, 1};
|
| 55 |
+
aligned_vector<ap_int<8>> X_acc0(L * D, 0);
|
| 56 |
+
aligned_vector<ap_int<8>> X_acc1(L * D, 0);
|
| 57 |
+
aligned_vector<ap_int<8>> W_acc0(D * D_head * NUM_DUM_SLR * 10 + D * D_ffn, 0);
|
| 58 |
+
aligned_vector<ap_int<8>> W_acc1(D * D_head * NUM_DUM_SLR * 10 + D * D_ffn, 0);
|
| 59 |
+
aligned_vector<ap_uint<128>> acc0_out(NUM_SLR * L * D / 8);
|
| 60 |
+
// aligned_vector<ap_uint<512>> acc0_out(NUM_SLR, aligned_vector<ap_uint<512>>(L * L / 16));
|
| 61 |
+
aligned_vector<ap_uint<64>> acc1_out(NUM_SLR * L * D / 8);
|
| 62 |
+
aligned_vector<int> cycle_count(1);
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
vector<int> X_copy(L * D);
|
| 66 |
+
vector<vector<int>> W_acc0_split(NUM_DUM_SLR, vector<int>(D * D_head * 8));
|
| 67 |
+
vector<vector<int>> W_acc1_split(NUM_DUM_SLR, vector<int>(D * D_head * 8));
|
| 68 |
+
vector<vector<int>> W_k_split(NUM_DUM_SLR, vector<int>(D * D_head * 8));
|
| 69 |
+
vector<aligned_vector<int>> q_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
|
| 70 |
+
vector<aligned_vector<int>> k_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
|
| 71 |
+
vector<aligned_vector<int>> attn_golden(NUM_DUM_SLR, aligned_vector<int>(L * L));
|
| 72 |
+
vector<aligned_vector<int>> acc1_out_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
|
| 73 |
+
|
| 74 |
+
// for(int i = 0; i < L * D; i++){
|
| 75 |
+
// int val = (rand() % 8) + 1;
|
| 76 |
+
// ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
|
| 77 |
+
// X_copy[i] = val;
|
| 78 |
+
// X_acc0[i] = ap_int<8>(full(7, 0));
|
| 79 |
+
// X_acc1[i] = ap_int<8>(full(7, 0));
|
| 80 |
+
// }
|
| 81 |
+
|
| 82 |
+
// for(int i = 0; i < D * D_head * NUM_DUM_SLR * 4; i++){
|
| 83 |
+
// int val = (rand() % 6) - 1;
|
| 84 |
+
// ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
|
| 85 |
+
// W_acc0[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
|
| 86 |
+
// W_acc0_split[(i / 32) % 4][(i / 128) * 32 + (i % 32)] = val;
|
| 87 |
+
// }
|
| 88 |
+
|
| 89 |
+
// for(int i = 0; i < D * D_head * NUM_DUM_SLR * 4; i++){
|
| 90 |
+
// int val = (rand() % 6) - 1;
|
| 91 |
+
// ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
|
| 92 |
+
// W_acc1[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
|
| 93 |
+
// W_acc1_split[(i / 32) % 4][(i / 128) * 32 + (i % 32)] = val;
|
| 94 |
+
// }
|
| 95 |
+
|
| 96 |
+
// for(int i = D * D_head * NUM_DUM_SLR * 4; i < D * D_head * NUM_DUM_SLR * 12; i++){
|
| 97 |
+
// int val = (rand() % 6) - 1;
|
| 98 |
+
// int ind = i - D * D_head * NUM_DUM_SLR * 4;
|
| 99 |
+
// ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
|
| 100 |
+
// W_acc0[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
|
| 101 |
+
// W_acc1[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
|
| 102 |
+
// W_k_split[(ind / 32) % 4][(ind / 128) * 32 + (ind % 32)] = val;
|
| 103 |
+
// }
|
| 104 |
+
|
| 105 |
+
// // cpu
|
| 106 |
+
// for(int i = 0; i < NUM_SLR; i++){
|
| 107 |
+
// // WqX
|
| 108 |
+
// for(int j = 0; j < L; j++){
|
| 109 |
+
// for(int k = 0; k < D_head; k++){
|
| 110 |
+
// int acc = 0;
|
| 111 |
+
// for(int l = 0; l < D; l++){
|
| 112 |
+
// acc += X_copy[j*D+l] * W_acc0_split[i][l*D_head + k];
|
| 113 |
+
// }
|
| 114 |
+
// q_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
|
| 115 |
+
// }
|
| 116 |
+
// }
|
| 117 |
+
|
| 118 |
+
// //WvX
|
| 119 |
+
// for(int j = 0; j < L; j++){
|
| 120 |
+
// for(int k = 0; k < D_head; k++){
|
| 121 |
+
// int acc = 0;
|
| 122 |
+
// for(int l = 0; l < D; l++){
|
| 123 |
+
// acc += X_copy[j*D+l] * W_acc1_split[i][l*D_head + k];
|
| 124 |
+
// }
|
| 125 |
+
// acc1_out_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
|
| 126 |
+
// }
|
| 127 |
+
// }
|
| 128 |
+
|
| 129 |
+
// //WkX
|
| 130 |
+
// for(int j = 0; j < L; j++){
|
| 131 |
+
// for(int k = 0; k < D_head; k++){
|
| 132 |
+
// int acc = 0;
|
| 133 |
+
// for(int l = 0; l < D; l++){
|
| 134 |
+
// acc += X_copy[j*D+l] * W_k_split[i][l*D_head + k];
|
| 135 |
+
// }
|
| 136 |
+
// k_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
|
| 137 |
+
// }
|
| 138 |
+
// }
|
| 139 |
+
|
| 140 |
+
// // QK^T
|
| 141 |
+
// for(int j = 0; j < L; j++){
|
| 142 |
+
// for(int k = 0; k < L; k++){
|
| 143 |
+
// int acc = 0;
|
| 144 |
+
// for(int l = 0; l < D_head; l++){
|
| 145 |
+
// acc += q_golden[i][k*D_head+l] * k_golden[i][j*D_head+l];
|
| 146 |
+
// }
|
| 147 |
+
// attn_golden[i][j*D_head+k] = acc;
|
| 148 |
+
// }
|
| 149 |
+
// }
|
| 150 |
+
// }
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
// invoke the kernel
|
| 154 |
+
int64_t kernel_time_ns = 0;
|
| 155 |
+
for(int i = 0; i < 24; i++){
|
| 156 |
+
kernel_time_ns += tapa::invoke(opt_kernel, FLAGS_bitstream,
|
| 157 |
+
L * D, L * D / 16, L,
|
| 158 |
+
// tapa::read_only_mmap<int>(inst),
|
| 159 |
+
tapa::read_only_mmap<ap_int<8>>(X_acc0).reinterpret<ap_uint<512>>(),
|
| 160 |
+
tapa::read_only_mmap<ap_int<8>>(X_acc1).reinterpret<ap_uint<512>>(),
|
| 161 |
+
tapa::read_only_mmap<ap_int<8>>(W_acc0).reinterpret<ap_uint<512>>(),
|
| 162 |
+
tapa::read_only_mmap<ap_int<8>>(W_acc1).reinterpret<ap_uint<512>>(),
|
| 163 |
+
tapa::write_only_mmap<ap_uint<128>>(acc0_out),
|
| 164 |
+
// tapa::write_only_mmap<ap_uint<64>>(acc1_out),
|
| 165 |
+
tapa::write_only_mmap<int>(cycle_count));
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
std::clog << "cycle time: " << cycle_count[0] << std::endl;
|
| 169 |
+
std::clog << "kernel time: " << kernel_time_ns * 1e-9 << " s" << std::endl;
|
| 170 |
+
|
| 171 |
+
}
|
| 172 |
+
|
gpt-2-medium/host-versal.cpp
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <vector>
|
| 2 |
+
#include <cmath>
|
| 3 |
+
#include <iostream>
|
| 4 |
+
#include <string>
|
| 5 |
+
#include <ctime>
|
| 6 |
+
#include <cmath>
|
| 7 |
+
#include <tapa.h>
|
| 8 |
+
#include <gflags/gflags.h>
|
| 9 |
+
#include <ap_int.h>
|
| 10 |
+
|
| 11 |
+
constexpr int D = 1024;
|
| 12 |
+
constexpr int D_ffn = 4096;
|
| 13 |
+
constexpr int N_head = 16;
|
| 14 |
+
constexpr int MAX_SEQ_LEN = 1024;
|
| 15 |
+
constexpr int NUM_SLR = 4;
|
| 16 |
+
constexpr int NUM_DUM_SLR = 4;
|
| 17 |
+
constexpr int D_head = D / N_head;
|
| 18 |
+
constexpr int FFN_WEIGHT_SIZE = D * D_ffn;
|
| 19 |
+
constexpr int OUT_WEIGHT_SIZE = D * D;
|
| 20 |
+
constexpr int QKV_WEIGHT_SIZE = D * D / N_head * NUM_DUM_SLR * 2; // multi-head attention
|
| 21 |
+
|
| 22 |
+
using std::vector;
|
| 23 |
+
using int_v16 = tapa::vec_t<int, 16>;
|
| 24 |
+
using int4_v128 = tapa::vec_t<ap_int<4>, 128>;
|
| 25 |
+
using int8_v64 = tapa::vec_t<ap_int<8>, 64>;
|
| 26 |
+
|
| 27 |
+
void opt_kernel(
|
| 28 |
+
const int L,
|
| 29 |
+
const int L_out,
|
| 30 |
+
const int seq_len,
|
| 31 |
+
// tapa::mmap<int> inst, // inst[0] = L, inst[1] = reload_weight
|
| 32 |
+
tapa::mmap<ap_uint<512>> X_acc0,
|
| 33 |
+
tapa::mmap<ap_uint<512>> X_acc1,
|
| 34 |
+
tapa::mmap<ap_uint<512>> W_acc0,
|
| 35 |
+
tapa::mmap<ap_uint<512>> W_acc1,
|
| 36 |
+
tapa::mmap<ap_uint<128>> acc0_out,
|
| 37 |
+
// tapa::mmap<ap_uint<64>> acc1_out,
|
| 38 |
+
tapa::mmap<int> cycle_count
|
| 39 |
+
);
|
| 40 |
+
|
| 41 |
+
template <typename T>
|
| 42 |
+
using aligned_vector = std::vector<T, tapa::aligned_allocator<T>>;
|
| 43 |
+
|
| 44 |
+
DEFINE_string(bitstream, "", "path to bitstream file");
|
| 45 |
+
|
| 46 |
+
int main(int argc, char *argv[]){
|
| 47 |
+
gflags::ParseCommandLineFlags(&argc, &argv, true);
|
| 48 |
+
|
| 49 |
+
const int L = argc > 1 ? atoll(argv[1]) : MAX_SEQ_LEN;
|
| 50 |
+
|
| 51 |
+
srand((unsigned)time(nullptr));
|
| 52 |
+
|
| 53 |
+
// data preparation
|
| 54 |
+
aligned_vector<int> inst = {L, 1};
|
| 55 |
+
aligned_vector<ap_int<8>> X_acc0(L * D);
|
| 56 |
+
aligned_vector<ap_int<8>> X_acc1(L * D);
|
| 57 |
+
aligned_vector<ap_int<8>> W_acc0(D * D_head * NUM_DUM_SLR * 8 + D * D_ffn, 1);
|
| 58 |
+
aligned_vector<ap_int<8>> W_acc1(D * D_head * NUM_DUM_SLR * 8 + D * D_ffn, 1);
|
| 59 |
+
aligned_vector<ap_uint<128>> acc0_out(NUM_SLR * L * D / 8);
|
| 60 |
+
// aligned_vector<ap_uint<512>> acc0_out(NUM_SLR, aligned_vector<ap_uint<512>>(L * L / 16));
|
| 61 |
+
aligned_vector<ap_uint<64>> acc1_out(NUM_SLR * L * D / 8);
|
| 62 |
+
aligned_vector<int> cycle_count(1);
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
vector<int> X_copy(L * D);
|
| 66 |
+
vector<vector<int>> W_acc0_split(NUM_DUM_SLR, vector<int>(D * D_head * 8));
|
| 67 |
+
vector<vector<int>> W_acc1_split(NUM_DUM_SLR, vector<int>(D * D_head * 8));
|
| 68 |
+
vector<vector<int>> W_k_split(NUM_DUM_SLR, vector<int>(D * D_head * 8));
|
| 69 |
+
vector<aligned_vector<int>> q_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
|
| 70 |
+
vector<aligned_vector<int>> k_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
|
| 71 |
+
vector<aligned_vector<int>> attn_golden(NUM_DUM_SLR, aligned_vector<int>(L * L));
|
| 72 |
+
vector<aligned_vector<int>> acc1_out_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
|
| 73 |
+
|
| 74 |
+
for(int i = 0; i < L * D; i++){
|
| 75 |
+
int val = (rand() % 8) + 1;
|
| 76 |
+
ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
|
| 77 |
+
X_copy[i] = val;
|
| 78 |
+
X_acc0[i] = ap_int<8>(full(7, 0));
|
| 79 |
+
X_acc1[i] = ap_int<8>(full(7, 0));
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
for(int i = 0; i < D * D_head * NUM_DUM_SLR * 4; i++){
|
| 83 |
+
int val = (rand() % 6) - 1;
|
| 84 |
+
ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
|
| 85 |
+
W_acc0[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
|
| 86 |
+
W_acc0_split[(i / 32) % 4][(i / 128) * 32 + (i % 32)] = val;
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
for(int i = 0; i < D * D_head * NUM_DUM_SLR * 4; i++){
|
| 90 |
+
int val = (rand() % 6) - 1;
|
| 91 |
+
ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
|
| 92 |
+
W_acc1[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
|
| 93 |
+
W_acc1_split[(i / 32) % 4][(i / 128) * 32 + (i % 32)] = val;
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
for(int i = D * D_head * NUM_DUM_SLR * 4; i < D * D_head * NUM_DUM_SLR * 12; i++){
|
| 97 |
+
int val = (rand() % 6) - 1;
|
| 98 |
+
int ind = i - D * D_head * NUM_DUM_SLR * 4;
|
| 99 |
+
ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
|
| 100 |
+
W_acc0[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
|
| 101 |
+
W_acc1[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
|
| 102 |
+
W_k_split[(ind / 32) % 4][(ind / 128) * 32 + (ind % 32)] = val;
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
// cpu
|
| 106 |
+
for(int i = 0; i < NUM_SLR; i++){
|
| 107 |
+
// WqX
|
| 108 |
+
for(int j = 0; j < L; j++){
|
| 109 |
+
for(int k = 0; k < D_head; k++){
|
| 110 |
+
int acc = 0;
|
| 111 |
+
for(int l = 0; l < D; l++){
|
| 112 |
+
acc += X_copy[j*D+l] * W_acc0_split[i][l*D_head + k];
|
| 113 |
+
}
|
| 114 |
+
q_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
|
| 115 |
+
}
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
//WvX
|
| 119 |
+
for(int j = 0; j < L; j++){
|
| 120 |
+
for(int k = 0; k < D_head; k++){
|
| 121 |
+
int acc = 0;
|
| 122 |
+
for(int l = 0; l < D; l++){
|
| 123 |
+
acc += X_copy[j*D+l] * W_acc1_split[i][l*D_head + k];
|
| 124 |
+
}
|
| 125 |
+
acc1_out_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
|
| 126 |
+
}
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
//WkX
|
| 130 |
+
for(int j = 0; j < L; j++){
|
| 131 |
+
for(int k = 0; k < D_head; k++){
|
| 132 |
+
int acc = 0;
|
| 133 |
+
for(int l = 0; l < D; l++){
|
| 134 |
+
acc += X_copy[j*D+l] * W_k_split[i][l*D_head + k];
|
| 135 |
+
}
|
| 136 |
+
k_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
|
| 137 |
+
}
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
// QK^T
|
| 141 |
+
for(int j = 0; j < L; j++){
|
| 142 |
+
for(int k = 0; k < L; k++){
|
| 143 |
+
int acc = 0;
|
| 144 |
+
for(int l = 0; l < D_head; l++){
|
| 145 |
+
acc += q_golden[i][k*D_head+l] * k_golden[i][j*D_head+l];
|
| 146 |
+
}
|
| 147 |
+
attn_golden[i][j*D_head+k] = acc;
|
| 148 |
+
}
|
| 149 |
+
}
|
| 150 |
+
}
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
// invoke the kernel
|
| 154 |
+
int64_t kernel_time_ns = 0;
|
| 155 |
+
for(int i = 0; i < 1; i++){
|
| 156 |
+
kernel_time_ns = tapa::invoke(opt_kernel, FLAGS_bitstream,
|
| 157 |
+
L * D, L * D / 16, L,
|
| 158 |
+
// tapa::read_only_mmap<int>(inst),
|
| 159 |
+
tapa::read_only_mmap<ap_int<8>>(X_acc0).reinterpret<ap_uint<512>>(),
|
| 160 |
+
tapa::read_only_mmap<ap_int<8>>(X_acc1).reinterpret<ap_uint<512>>(),
|
| 161 |
+
tapa::read_only_mmap<ap_int<8>>(W_acc0).reinterpret<ap_uint<512>>(),
|
| 162 |
+
tapa::read_only_mmap<ap_int<8>>(W_acc1).reinterpret<ap_uint<512>>(),
|
| 163 |
+
tapa::write_only_mmap<ap_uint<128>>(acc0_out),
|
| 164 |
+
// tapa::write_only_mmap<ap_uint<64>>(acc1_out),
|
| 165 |
+
tapa::write_only_mmap<int>(cycle_count));
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
std::clog << "cycle time: " << cycle_count[0] << std::endl;
|
| 169 |
+
std::clog << "kernel time: " << kernel_time_ns * 1e-9 << " s" << std::endl;
|
| 170 |
+
|
| 171 |
+
int error = 0;
|
| 172 |
+
|
| 173 |
+
// compare
|
| 174 |
+
// for(int i = 0; i < NUM_SLR; i++){
|
| 175 |
+
// for(int j = 0; j < 4; j++){
|
| 176 |
+
// for(int k = 0; k < 16; k++){
|
| 177 |
+
// if(tapa::bit_cast<int>(ap_int<32>(acc0_out[i][j](k*32+31,k*32)))-attn_golden[i][j*16+k] != 0){
|
| 178 |
+
// std::clog << "slr: " << i << ", index: " << j << ", actual: " << tapa::bit_cast<int>(ap_int<32>(acc0_out[i][j](k*32+31,k*32))) << ", expect: " << attn_golden[i][j*16+k] << std::endl;
|
| 179 |
+
// error++;
|
| 180 |
+
// }
|
| 181 |
+
// }
|
| 182 |
+
// }
|
| 183 |
+
// }
|
| 184 |
+
|
| 185 |
+
if (error == 0) {
|
| 186 |
+
std::clog << "PASSED" << std::endl;
|
| 187 |
+
} else {
|
| 188 |
+
std::clog << "FAILED" << std::endl;
|
| 189 |
+
return 1;
|
| 190 |
+
}
|
| 191 |
+
return 0;
|
| 192 |
+
|
| 193 |
+
}
|
| 194 |
+
|
gpt-2-medium/host.cpp
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <vector>
|
| 2 |
+
#include <cmath>
|
| 3 |
+
#include <iostream>
|
| 4 |
+
#include <string>
|
| 5 |
+
#include <ctime>
|
| 6 |
+
#include <cmath>
|
| 7 |
+
#include <tapa.h>
|
| 8 |
+
#include <gflags/gflags.h>
|
| 9 |
+
#include <ap_int.h>
|
| 10 |
+
|
| 11 |
+
constexpr int D = 1024;
|
| 12 |
+
constexpr int D_ffn = 4096;
|
| 13 |
+
constexpr int N_head = 16;
|
| 14 |
+
constexpr int MAX_SEQ_LEN = 1024;
|
| 15 |
+
constexpr int NUM_SLR = 3;
|
| 16 |
+
constexpr int NUM_DUM_SLR = 4;
|
| 17 |
+
constexpr int D_head = D / N_head;
|
| 18 |
+
constexpr int FFN_WEIGHT_SIZE = D * D_ffn;
|
| 19 |
+
constexpr int OUT_WEIGHT_SIZE = D * D;
|
| 20 |
+
constexpr int QKV_WEIGHT_SIZE = D * D / N_head * NUM_DUM_SLR * 2; // multi-head attention
|
| 21 |
+
|
| 22 |
+
using std::vector;
|
| 23 |
+
using int_v16 = tapa::vec_t<int, 16>;
|
| 24 |
+
using int4_v128 = tapa::vec_t<ap_int<4>, 128>;
|
| 25 |
+
using int8_v64 = tapa::vec_t<ap_int<8>, 64>;
|
| 26 |
+
|
| 27 |
+
void opt_kernel(
|
| 28 |
+
const int L,
|
| 29 |
+
const int L_out,
|
| 30 |
+
const int seq_len,
|
| 31 |
+
// tapa::mmap<int> inst, // inst[0] = L, inst[1] = reload_weight
|
| 32 |
+
tapa::mmap<ap_uint<512>> X_acc0,
|
| 33 |
+
tapa::mmap<ap_uint<512>> X_acc1,
|
| 34 |
+
tapa::mmap<ap_uint<512>> W_acc0,
|
| 35 |
+
tapa::mmap<ap_uint<512>> W_acc1,
|
| 36 |
+
tapa::mmap<ap_uint<64>> acc0_out,
|
| 37 |
+
tapa::mmap<ap_uint<64>> acc1_out,
|
| 38 |
+
tapa::mmap<int> cycle_count
|
| 39 |
+
);
|
| 40 |
+
|
| 41 |
+
template <typename T>
|
| 42 |
+
using aligned_vector = std::vector<T, tapa::aligned_allocator<T>>;
|
| 43 |
+
|
| 44 |
+
DEFINE_string(bitstream, "", "path to bitstream file");
|
| 45 |
+
|
| 46 |
+
int main(int argc, char *argv[]){
|
| 47 |
+
gflags::ParseCommandLineFlags(&argc, &argv, true);
|
| 48 |
+
|
| 49 |
+
const int L = argc > 1 ? atoll(argv[1]) : MAX_SEQ_LEN;
|
| 50 |
+
|
| 51 |
+
srand((unsigned)time(nullptr));
|
| 52 |
+
|
| 53 |
+
// data preparation
|
| 54 |
+
aligned_vector<int> inst = {L, 1};
|
| 55 |
+
aligned_vector<ap_int<8>> X_acc0(L * D);
|
| 56 |
+
aligned_vector<ap_int<8>> X_acc1(L * D);
|
| 57 |
+
aligned_vector<ap_int<8>> W_acc0(D * D_head * NUM_DUM_SLR * 10);
|
| 58 |
+
aligned_vector<ap_int<8>> W_acc1(D * D_head * NUM_DUM_SLR * 10);
|
| 59 |
+
aligned_vector<ap_uint<64>> acc0_out(NUM_SLR * L * D / 8);
|
| 60 |
+
// aligned_vector<ap_uint<512>> acc0_out(NUM_SLR, aligned_vector<ap_uint<512>>(L * L / 16));
|
| 61 |
+
aligned_vector<ap_uint<64>> acc1_out(NUM_SLR * L * D / 8);
|
| 62 |
+
aligned_vector<int> cycle_count(1);
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
vector<int> X_copy(L * D);
|
| 66 |
+
vector<vector<int>> W_acc0_split(NUM_DUM_SLR, vector<int>(D * D_head * 10));
|
| 67 |
+
vector<vector<int>> W_acc1_split(NUM_DUM_SLR, vector<int>(D * D_head * 10));
|
| 68 |
+
vector<vector<int>> W_k_split(NUM_DUM_SLR, vector<int>(D * D_head * 10));
|
| 69 |
+
vector<aligned_vector<int>> q_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
|
| 70 |
+
vector<aligned_vector<int>> k_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
|
| 71 |
+
vector<aligned_vector<int>> attn_golden(NUM_DUM_SLR, aligned_vector<int>(L * L));
|
| 72 |
+
vector<aligned_vector<int>> acc1_out_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
|
| 73 |
+
|
| 74 |
+
for(int i = 0; i < L * D; i++){
|
| 75 |
+
int val = (rand() % 8) + 1;
|
| 76 |
+
ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
|
| 77 |
+
X_copy[i] = val;
|
| 78 |
+
X_acc0[i] = ap_int<8>(full(7, 0));
|
| 79 |
+
X_acc1[i] = ap_int<8>(full(7, 0));
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
for(int i = 0; i < D * D_head * NUM_DUM_SLR * 5; i++){
|
| 83 |
+
int val = (rand() % 6) - 1;
|
| 84 |
+
ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
|
| 85 |
+
W_acc0[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
|
| 86 |
+
W_acc0_split[(i / 32) % 4][(i / 128) * 32 + (i % 32)] = val;
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
for(int i = 0; i < D * D_head * NUM_DUM_SLR * 5; i++){
|
| 90 |
+
int val = (rand() % 6) - 1;
|
| 91 |
+
ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
|
| 92 |
+
W_acc1[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
|
| 93 |
+
W_acc1_split[(i / 32) % 4][(i / 128) * 32 + (i % 32)] = val;
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
for(int i = D * D_head * NUM_DUM_SLR * 5; i < D * D_head * NUM_DUM_SLR * 15; i++){
|
| 97 |
+
int val = (rand() % 6) - 1;
|
| 98 |
+
int ind = i - D * D_head * NUM_DUM_SLR * 5;
|
| 99 |
+
ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
|
| 100 |
+
W_acc0[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
|
| 101 |
+
W_acc1[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
|
| 102 |
+
W_k_split[(ind / 32) % 4][(ind / 128) * 32 + (ind % 32)] = val;
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
// cpu
|
| 106 |
+
for(int i = 0; i < NUM_SLR; i++){
|
| 107 |
+
// WqX
|
| 108 |
+
for(int j = 0; j < L; j++){
|
| 109 |
+
for(int k = 0; k < D_head; k++){
|
| 110 |
+
int acc = 0;
|
| 111 |
+
for(int l = 0; l < D; l++){
|
| 112 |
+
acc += X_copy[j*D+l] * W_acc0_split[i][l*D_head + k];
|
| 113 |
+
}
|
| 114 |
+
q_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
|
| 115 |
+
}
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
//WvX
|
| 119 |
+
for(int j = 0; j < L; j++){
|
| 120 |
+
for(int k = 0; k < D_head; k++){
|
| 121 |
+
int acc = 0;
|
| 122 |
+
for(int l = 0; l < D; l++){
|
| 123 |
+
acc += X_copy[j*D+l] * W_acc1_split[i][l*D_head + k];
|
| 124 |
+
}
|
| 125 |
+
acc1_out_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
|
| 126 |
+
}
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
//WkX
|
| 130 |
+
for(int j = 0; j < L; j++){
|
| 131 |
+
for(int k = 0; k < D_head; k++){
|
| 132 |
+
int acc = 0;
|
| 133 |
+
for(int l = 0; l < D; l++){
|
| 134 |
+
acc += X_copy[j*D+l] * W_k_split[i][l*D_head + k];
|
| 135 |
+
}
|
| 136 |
+
k_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
|
| 137 |
+
}
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
// QK^T
|
| 141 |
+
for(int j = 0; j < L; j++){
|
| 142 |
+
for(int k = 0; k < L; k++){
|
| 143 |
+
int acc = 0;
|
| 144 |
+
for(int l = 0; l < D_head; l++){
|
| 145 |
+
acc += q_golden[i][k*D_head+l] * k_golden[i][j*D_head+l];
|
| 146 |
+
}
|
| 147 |
+
attn_golden[i][j*D_head+k] = acc;
|
| 148 |
+
}
|
| 149 |
+
}
|
| 150 |
+
}
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
// invoke the kernel
|
| 154 |
+
int64_t kernel_time_ns = 0;
|
| 155 |
+
for(int i = 0; i < 24; i++){
|
| 156 |
+
kernel_time_ns += tapa::invoke(opt_kernel, FLAGS_bitstream,
|
| 157 |
+
L * D, L * D / 8, L,
|
| 158 |
+
// tapa::read_only_mmap<int>(inst),
|
| 159 |
+
tapa::read_only_mmap<ap_int<8>>(X_acc0).reinterpret<ap_uint<512>>(),
|
| 160 |
+
tapa::read_only_mmap<ap_int<8>>(X_acc1).reinterpret<ap_uint<512>>(),
|
| 161 |
+
tapa::read_only_mmap<ap_int<8>>(W_acc0).reinterpret<ap_uint<512>>(),
|
| 162 |
+
tapa::read_only_mmap<ap_int<8>>(W_acc1).reinterpret<ap_uint<512>>(),
|
| 163 |
+
tapa::write_only_mmap<ap_uint<64>>(acc0_out),
|
| 164 |
+
tapa::write_only_mmap<ap_uint<64>>(acc1_out),
|
| 165 |
+
tapa::write_only_mmap<int>(cycle_count));
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
// std::clog << "cycle time: " << cycle_count[0] << std::endl;
|
| 169 |
+
std::clog << "kernel time: " << kernel_time_ns * 2e-9 << " s" << std::endl;
|
| 170 |
+
|
| 171 |
+
int error = 0;
|
| 172 |
+
|
| 173 |
+
// compare
|
| 174 |
+
// for(int i = 0; i < NUM_SLR; i++){
|
| 175 |
+
// for(int j = 0; j < 4; j++){
|
| 176 |
+
// for(int k = 0; k < 16; k++){
|
| 177 |
+
// if(tapa::bit_cast<int>(ap_int<32>(acc0_out[i][j](k*32+31,k*32)))-attn_golden[i][j*16+k] != 0){
|
| 178 |
+
// std::clog << "slr: " << i << ", index: " << j << ", actual: " << tapa::bit_cast<int>(ap_int<32>(acc0_out[i][j](k*32+31,k*32))) << ", expect: " << attn_golden[i][j*16+k] << std::endl;
|
| 179 |
+
// error++;
|
| 180 |
+
// }
|
| 181 |
+
// }
|
| 182 |
+
// }
|
| 183 |
+
// }
|
| 184 |
+
|
| 185 |
+
if (error == 0) {
|
| 186 |
+
std::clog << "PASSED" << std::endl;
|
| 187 |
+
} else {
|
| 188 |
+
std::clog << "FAILED" << std::endl;
|
| 189 |
+
return 1;
|
| 190 |
+
}
|
| 191 |
+
return 0;
|
| 192 |
+
|
| 193 |
+
}
|
| 194 |
+
|
gpt-2-medium/host_opencl.cpp
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*******************************************************************************
|
| 2 |
+
Vendor: Xilinx
|
| 3 |
+
Associated Filename: vadd.cpp
|
| 4 |
+
Purpose: VITIS vector addition
|
| 5 |
+
|
| 6 |
+
*******************************************************************************
|
| 7 |
+
Copyright (C) 2019 XILINX, Inc.
|
| 8 |
+
|
| 9 |
+
This file contains confidential and proprietary information of Xilinx, Inc. and
|
| 10 |
+
is protected under U.S. and international copyright and other intellectual
|
| 11 |
+
property laws.
|
| 12 |
+
|
| 13 |
+
DISCLAIMER
|
| 14 |
+
This disclaimer is not a license and does not grant any rights to the materials
|
| 15 |
+
distributed herewith. Except as otherwise provided in a valid license issued to
|
| 16 |
+
you by Xilinx, and to the maximum extent permitted by applicable law:
|
| 17 |
+
(1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND WITH ALL FAULTS, AND XILINX
|
| 18 |
+
HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY,
|
| 19 |
+
INCLUDING BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT, OR
|
| 20 |
+
FITNESS FOR ANY PARTICULAR PURPOSE; and (2) Xilinx shall not be liable (whether
|
| 21 |
+
in contract or tort, including negligence, or under any other theory of
|
| 22 |
+
liability) for any loss or damage of any kind or nature related to, arising under
|
| 23 |
+
or in connection with these materials, including for any direct, or any indirect,
|
| 24 |
+
special, incidental, or consequential loss or damage (including loss of data,
|
| 25 |
+
profits, goodwill, or any type of loss or damage suffered as a result of any
|
| 26 |
+
action brought by a third party) even if such damage or loss was reasonably
|
| 27 |
+
foreseeable or Xilinx had been advised of the possibility of the same.
|
| 28 |
+
|
| 29 |
+
CRITICAL APPLICATIONS
|
| 30 |
+
Xilinx products are not designed or intended to be fail-safe, or for use in any
|
| 31 |
+
application requiring fail-safe performance, such as life-support or safety
|
| 32 |
+
devices or systems, Class III medical devices, nuclear facilities, applications
|
| 33 |
+
related to the deployment of airbags, or any other applications that could lead
|
| 34 |
+
to death, personal injury, or severe property or environmental damage
|
| 35 |
+
(individually and collectively, "Critical Applications"). Customer assumes the
|
| 36 |
+
sole risk and liability of any use of Xilinx products in Critical Applications,
|
| 37 |
+
subject only to applicable laws and regulations governing limitations on product
|
| 38 |
+
liability.
|
| 39 |
+
|
| 40 |
+
THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS PART OF THIS FILE AT
|
| 41 |
+
ALL TIMES.
|
| 42 |
+
|
| 43 |
+
*******************************************************************************/
|
| 44 |
+
|
| 45 |
+
// Evaluates `call`, then prints a diagnostic and exits if `error` is not CL_SUCCESS.
// `call` is expected to store its status into `error`.
// The expansion is intentionally not wrapped in do { } while (0): some call
// sites pass a declaration (e.g. a cl::Buffer definition) as `call`, and that
// object must remain visible in the caller's scope.
#define OCL_CHECK(error, call)                                                                   \
    call;                                                                                        \
    if ((error) != CL_SUCCESS) {                                                                 \
        printf("%s:%d Error calling " #call ", error code is: %d\n", __FILE__, __LINE__, error); \
        exit(EXIT_FAILURE);                                                                      \
    }
|
| 51 |
+
|
| 52 |
+
#include "host_opencl.h"
|
| 53 |
+
#include <fstream>
|
| 54 |
+
#include <iostream>
|
| 55 |
+
#include <stdlib.h>
|
| 56 |
+
#include <ap_int.h>
|
| 57 |
+
|
| 58 |
+
// Element count inherited from the vector-add template this host was derived from.
static const int DATA_SIZE = 4096;

// printf-style format for a result mismatch (index, CPU value, device value).
static const std::string error_message(
    "Error: Result mismatch:\n"
    "i = %d CPU result = %d Device result = %d\n");
|
| 63 |
+
|
| 64 |
+
int main(int argc, char* argv[]) {
|
| 65 |
+
// TARGET_DEVICE macro needs to be passed from gcc command line
|
| 66 |
+
if (argc < 2) {
|
| 67 |
+
std::cout << "Usage: " << argv[0] << " <xclbin>" << std::endl;
|
| 68 |
+
return EXIT_FAILURE;
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
std::string xclbinFilename = argv[1];
|
| 72 |
+
|
| 73 |
+
// Compute the size of array in bytes
|
| 74 |
+
size_t size_in_bytes = DATA_SIZE * sizeof(int);
|
| 75 |
+
int L = 64;
|
| 76 |
+
if (argc == 3) {
|
| 77 |
+
L = atoi(argv[2]);
|
| 78 |
+
}
|
| 79 |
+
const int D = 1024;
|
| 80 |
+
const int NUM_DUM_SLR = 4;
|
| 81 |
+
const int NUM_SLR = 4;
|
| 82 |
+
const int D_head = 64;
|
| 83 |
+
const int D_ffn = 4096;
|
| 84 |
+
|
| 85 |
+
// Creates a vector of DATA_SIZE elements with an initial value of 10 and 32
|
| 86 |
+
// using customized allocator for getting buffer alignment to 4k boundary
|
| 87 |
+
|
| 88 |
+
std::vector<cl::Device> devices;
|
| 89 |
+
cl_int err;
|
| 90 |
+
cl::Context context;
|
| 91 |
+
cl::CommandQueue q;
|
| 92 |
+
cl::Kernel krnl_vector_add;
|
| 93 |
+
cl::Program program;
|
| 94 |
+
std::vector<cl::Platform> platforms;
|
| 95 |
+
bool found_device = false;
|
| 96 |
+
|
| 97 |
+
// traversing all Platforms To find Xilinx Platform and targeted
|
| 98 |
+
// Device in Xilinx Platform
|
| 99 |
+
cl::Platform::get(&platforms);
|
| 100 |
+
for (size_t i = 0; (i < platforms.size()) & (found_device == false); i++) {
|
| 101 |
+
cl::Platform platform = platforms[i];
|
| 102 |
+
std::string platformName = platform.getInfo<CL_PLATFORM_NAME>();
|
| 103 |
+
if (platformName == "Xilinx") {
|
| 104 |
+
devices.clear();
|
| 105 |
+
platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices);
|
| 106 |
+
if (devices.size()) {
|
| 107 |
+
found_device = true;
|
| 108 |
+
break;
|
| 109 |
+
}
|
| 110 |
+
}
|
| 111 |
+
}
|
| 112 |
+
if (found_device == false) {
|
| 113 |
+
std::cout << "Error: Unable to find Target Device " << std::endl;
|
| 114 |
+
return EXIT_FAILURE;
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
std::cout << "INFO: Reading " << xclbinFilename << std::endl;
|
| 118 |
+
FILE* fp;
|
| 119 |
+
if ((fp = fopen(xclbinFilename.c_str(), "r")) == nullptr) {
|
| 120 |
+
printf("ERROR: %s xclbin not available please build\n", xclbinFilename.c_str());
|
| 121 |
+
exit(EXIT_FAILURE);
|
| 122 |
+
}
|
| 123 |
+
// Load xclbin
|
| 124 |
+
std::cout << "Loading: '" << xclbinFilename << "'\n";
|
| 125 |
+
std::ifstream bin_file(xclbinFilename, std::ifstream::binary);
|
| 126 |
+
bin_file.seekg(0, bin_file.end);
|
| 127 |
+
unsigned nb = bin_file.tellg();
|
| 128 |
+
bin_file.seekg(0, bin_file.beg);
|
| 129 |
+
char* buf = new char[nb];
|
| 130 |
+
bin_file.read(buf, nb);
|
| 131 |
+
|
| 132 |
+
// Creating Program from Binary File
|
| 133 |
+
cl::Program::Binaries bins;
|
| 134 |
+
bins.push_back({buf, nb});
|
| 135 |
+
bool valid_device = false;
|
| 136 |
+
for (unsigned int i = 0; i < devices.size(); i++) {
|
| 137 |
+
auto device = devices[i];
|
| 138 |
+
// Creating Context and Command Queue for selected Device
|
| 139 |
+
OCL_CHECK(err, context = cl::Context(device, nullptr, nullptr, nullptr, &err));
|
| 140 |
+
OCL_CHECK(err, q = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err));
|
| 141 |
+
std::cout << "Trying to program device[" << i << "]: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
|
| 142 |
+
cl::Program program(context, {device}, bins, nullptr, &err);
|
| 143 |
+
if (err != CL_SUCCESS) {
|
| 144 |
+
std::cout << "Failed to program device[" << i << "] with xclbin file!\n";
|
| 145 |
+
} else {
|
| 146 |
+
std::cout << "Device[" << i << "]: program successful!\n";
|
| 147 |
+
OCL_CHECK(err, krnl_vector_add = cl::Kernel(program, "opt_kernel", &err));
|
| 148 |
+
valid_device = true;
|
| 149 |
+
break; // we break because we found a valid device
|
| 150 |
+
}
|
| 151 |
+
}
|
| 152 |
+
if (!valid_device) {
|
| 153 |
+
std::cout << "Failed to program any device found, exit!\n";
|
| 154 |
+
exit(EXIT_FAILURE);
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
// These commands will allocate memory on the Device. The cl::Buffer objects can
|
| 158 |
+
// be used to reference the memory locations on the device.
|
| 159 |
+
OCL_CHECK(err, cl::Buffer buffer_X_acc0(context, CL_MEM_READ_ONLY, (size_t)(L*D), NULL, &err));
|
| 160 |
+
OCL_CHECK(err, cl::Buffer buffer_X_acc1(context, CL_MEM_READ_ONLY, (size_t)(L*D), NULL, &err));
|
| 161 |
+
OCL_CHECK(err, cl::Buffer buffer_W_acc0(context, CL_MEM_READ_ONLY, (size_t)(D * D_head * NUM_DUM_SLR * 8 + D * D_ffn), NULL, &err));
|
| 162 |
+
OCL_CHECK(err, cl::Buffer buffer_W_acc1(context, CL_MEM_READ_ONLY, (size_t)(D * D_head * NUM_DUM_SLR * 8 + D * D_ffn), NULL, &err));
|
| 163 |
+
OCL_CHECK(err, cl::Buffer buffer_acc0_out(context, CL_MEM_WRITE_ONLY, (size_t)(NUM_SLR * L * D * 8), NULL, &err));
|
| 164 |
+
// OCL_CHECK(err, cl::Buffer buffer_acc1_out(context, CL_MEM_WRITE_ONLY, (size_t)(NUM_SLR * L * D), NULL, &err));
|
| 165 |
+
OCL_CHECK(err, cl::Buffer buffer_cycle(context, CL_MEM_WRITE_ONLY, sizeof(int), NULL, &err));
|
| 166 |
+
|
| 167 |
+
std::cout << "Finish creating buffer\n";
|
| 168 |
+
|
| 169 |
+
// set the kernel Arguments
|
| 170 |
+
int narg = 0;
|
| 171 |
+
OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, L*D));
|
| 172 |
+
OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, L*D/16));
|
| 173 |
+
OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, L));
|
| 174 |
+
OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_X_acc0));
|
| 175 |
+
OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_X_acc1));
|
| 176 |
+
OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_W_acc0));
|
| 177 |
+
OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_W_acc1));
|
| 178 |
+
OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_acc0_out));
|
| 179 |
+
// OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_acc1_out));
|
| 180 |
+
OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_cycle));
|
| 181 |
+
|
| 182 |
+
std::cout << "Finish setArgs\n";
|
| 183 |
+
|
| 184 |
+
// We then need to map our OpenCL buffers to get the pointers
|
| 185 |
+
ap_int<8>* X_acc0;
|
| 186 |
+
ap_int<8>* X_acc1;
|
| 187 |
+
ap_int<8>* W_acc0;
|
| 188 |
+
ap_int<8>* W_acc1;
|
| 189 |
+
ap_uint<128>* acc0_out;
|
| 190 |
+
// ap_uint<64>* acc1_out;
|
| 191 |
+
int* cycle;
|
| 192 |
+
OCL_CHECK(err,
|
| 193 |
+
X_acc0 = (ap_int<8>*)q.enqueueMapBuffer(buffer_X_acc0, CL_TRUE, CL_MAP_WRITE, 0, L*D, NULL, NULL, &err));
|
| 194 |
+
OCL_CHECK(err,
|
| 195 |
+
X_acc1 = (ap_int<8>*)q.enqueueMapBuffer(buffer_X_acc1, CL_TRUE, CL_MAP_WRITE, 0, L*D, NULL, NULL, &err));
|
| 196 |
+
OCL_CHECK(err,
|
| 197 |
+
W_acc0 = (ap_int<8>*)q.enqueueMapBuffer(buffer_W_acc0, CL_TRUE, CL_MAP_WRITE, 0, D * D_head * NUM_DUM_SLR * 8 + D * D_ffn, NULL, NULL, &err));
|
| 198 |
+
OCL_CHECK(err,
|
| 199 |
+
W_acc1 = (ap_int<8>*)q.enqueueMapBuffer(buffer_W_acc1, CL_TRUE, CL_MAP_WRITE, 0, D * D_head * NUM_DUM_SLR * 8 + D * D_ffn, NULL, NULL, &err));
|
| 200 |
+
OCL_CHECK(err, acc0_out = (ap_uint<128>*)q.enqueueMapBuffer(buffer_acc0_out, CL_TRUE, CL_MAP_READ, 0, NUM_SLR * L * D * 2, NULL,
|
| 201 |
+
NULL, &err));
|
| 202 |
+
// OCL_CHECK(err, acc1_out = (ap_uint<64>*)q.enqueueMapBuffer(buffer_acc1_out, CL_TRUE, CL_MAP_READ, 0, NUM_SLR * L * D, NULL,
|
| 203 |
+
// NULL, &err));
|
| 204 |
+
OCL_CHECK(err, cycle = (int*)q.enqueueMapBuffer(buffer_cycle, CL_TRUE, CL_MAP_READ, 0, sizeof(int), NULL,
|
| 205 |
+
NULL, &err));
|
| 206 |
+
|
| 207 |
+
// Initialize the vectors used in the test
|
| 208 |
+
for(int i = 0; i < L * D; i++){
|
| 209 |
+
X_acc0[i] = 1;
|
| 210 |
+
X_acc1[i] = 1;
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
for(int i = 0; i < D * D_head * NUM_DUM_SLR * 8 + D * D_ffn; i++){
|
| 214 |
+
W_acc1[i] = 1;
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
for(int i = 0; i < D * D_head * NUM_DUM_SLR * 8 + D * D_ffn; i++){
|
| 218 |
+
W_acc0[i] = 1;
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
std::cout << "Finish assigning values\n";
|
| 222 |
+
|
| 223 |
+
cl::Event event;
|
| 224 |
+
uint64_t nstimestart, nstimeend;
|
| 225 |
+
uint64_t exe_time = 0;
|
| 226 |
+
|
| 227 |
+
// Data will be migrated to kernel space
|
| 228 |
+
OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_X_acc0, buffer_X_acc1, buffer_W_acc0, buffer_W_acc1}, 0 /* 0 means from host*/));
|
| 229 |
+
|
| 230 |
+
std::cout << "Start kernel\n";
|
| 231 |
+
|
| 232 |
+
// Launch the Kernel
|
| 233 |
+
OCL_CHECK(err, err = q.enqueueTask(krnl_vector_add, nullptr, &event));
|
| 234 |
+
|
| 235 |
+
std::cout << "Finish kernel\n";
|
| 236 |
+
|
| 237 |
+
// The result of the previous kernel execution will need to be retrieved in
|
| 238 |
+
// order to view the results. This call will transfer the data from FPGA to
|
| 239 |
+
// source_results vector
|
| 240 |
+
OCL_CHECK(err, q.enqueueMigrateMemObjects({buffer_acc0_out, buffer_cycle}, CL_MIGRATE_MEM_OBJECT_HOST));
|
| 241 |
+
|
| 242 |
+
std::cout << "Receive data\n";
|
| 243 |
+
|
| 244 |
+
OCL_CHECK(err, q.finish());
|
| 245 |
+
OCL_CHECK(err, err = event.getProfilingInfo<uint64_t>(CL_PROFILING_COMMAND_START, &nstimestart));
|
| 246 |
+
OCL_CHECK(err, err = event.getProfilingInfo<uint64_t>(CL_PROFILING_COMMAND_END, &nstimeend));
|
| 247 |
+
exe_time += nstimeend - nstimestart;
|
| 248 |
+
|
| 249 |
+
// Verify the result
|
| 250 |
+
int match = 0;
|
| 251 |
+
// for (int i = 0; i < DATA_SIZE; i++) {
|
| 252 |
+
// int host_result = ptr_a[i] + ptr_b[i];
|
| 253 |
+
// if (ptr_result[i] != host_result) {
|
| 254 |
+
// printf(error_message.c_str(), i, host_result, ptr_result[i]);
|
| 255 |
+
// match = 1;
|
| 256 |
+
// break;
|
| 257 |
+
// }
|
| 258 |
+
// }
|
| 259 |
+
std::cout << "Cycle count: " << cycle[0] << std::endl;
|
| 260 |
+
std::cout << "Latency: " << exe_time << " ns" << std::endl;
|
| 261 |
+
|
| 262 |
+
OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_X_acc0, X_acc0));
|
| 263 |
+
OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_X_acc1, X_acc1));
|
| 264 |
+
OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_W_acc0, W_acc0));
|
| 265 |
+
OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_W_acc1, W_acc1));
|
| 266 |
+
OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_acc0_out, acc0_out));
|
| 267 |
+
// OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_acc1_out, acc1_out));
|
| 268 |
+
OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_cycle, cycle));
|
| 269 |
+
OCL_CHECK(err, err = q.finish());
|
| 270 |
+
|
| 271 |
+
std::cout << "TEST " << (match ? "FAILED" : "PASSED") << std::endl;
|
| 272 |
+
return (match ? EXIT_FAILURE : EXIT_SUCCESS);
|
| 273 |
+
}
|
gpt-2-medium/host_opencl.h
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*******************************************************************************
|
| 2 |
+
Vendor: Xilinx
|
| 3 |
+
Associated Filename: vadd.h
|
| 4 |
+
Purpose: VITIS vector addition
|
| 5 |
+
Revision History: January 28, 2016
|
| 6 |
+
|
| 7 |
+
*******************************************************************************
|
| 8 |
+
Copyright (C) 2019 XILINX, Inc.
|
| 9 |
+
|
| 10 |
+
This file contains confidential and proprietary information of Xilinx, Inc. and
|
| 11 |
+
is protected under U.S. and international copyright and other intellectual
|
| 12 |
+
property laws.
|
| 13 |
+
|
| 14 |
+
DISCLAIMER
|
| 15 |
+
This disclaimer is not a license and does not grant any rights to the materials
|
| 16 |
+
distributed herewith. Except as otherwise provided in a valid license issued to
|
| 17 |
+
you by Xilinx, and to the maximum extent permitted by applicable law:
|
| 18 |
+
(1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND WITH ALL FAULTS, AND XILINX
|
| 19 |
+
HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY,
|
| 20 |
+
INCLUDING BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT, OR
|
| 21 |
+
FITNESS FOR ANY PARTICULAR PURPOSE; and (2) Xilinx shall not be liable (whether
|
| 22 |
+
in contract or tort, including negligence, or under any other theory of
|
| 23 |
+
liability) for any loss or damage of any kind or nature related to, arising under
|
| 24 |
+
or in connection with these materials, including for any direct, or any indirect,
|
| 25 |
+
special, incidental, or consequential loss or damage (including loss of data,
|
| 26 |
+
profits, goodwill, or any type of loss or damage suffered as a result of any
|
| 27 |
+
action brought by a third party) even if such damage or loss was reasonably
|
| 28 |
+
foreseeable or Xilinx had been advised of the possibility of the same.
|
| 29 |
+
|
| 30 |
+
CRITICAL APPLICATIONS
|
| 31 |
+
Xilinx products are not designed or intended to be fail-safe, or for use in any
|
| 32 |
+
application requiring fail-safe performance, such as life-support or safety
|
| 33 |
+
devices or systems, Class III medical devices, nuclear facilities, applications
|
| 34 |
+
related to the deployment of airbags, or any other applications that could lead
|
| 35 |
+
to death, personal injury, or severe property or environmental damage
|
| 36 |
+
(individually and collectively, "Critical Applications"). Customer assumes the
|
| 37 |
+
sole risk and liability of any use of Xilinx products in Critical Applications,
|
| 38 |
+
subject only to applicable laws and regulations governing limitations on product
|
| 39 |
+
liability.
|
| 40 |
+
|
| 41 |
+
THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS PART OF THIS FILE AT
|
| 42 |
+
ALL TIMES.
|
| 43 |
+
|
| 44 |
+
*******************************************************************************/
|
| 45 |
+
|
| 46 |
+
#pragma once
|
| 47 |
+
|
| 48 |
+
#define CL_HPP_CL_1_2_DEFAULT_BUILD
|
| 49 |
+
#define CL_HPP_TARGET_OPENCL_VERSION 120
|
| 50 |
+
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
|
| 51 |
+
#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1
|
| 52 |
+
|
| 53 |
+
#include <CL/cl2.hpp>
|
| 54 |
+
|
| 55 |
+
// Customized buffer allocation for 4K boundary alignment.
//
// Minimal allocator that hands out storage aligned to 4096 bytes via
// posix_memalign(). It is stateless: every instance can release memory
// obtained from any other instance, which is what the equality operators
// below report.
template <typename T>
struct aligned_allocator
{
    using value_type = T;

    aligned_allocator() noexcept = default;

    // Converting constructor so that containers can rebind the allocator to
    // their internal node/element types.
    template <typename U>
    aligned_allocator(const aligned_allocator<U>&) noexcept {}

    // Allocates uninitialised storage for `num` objects of type T.
    // Throws std::bad_alloc if the byte count would overflow std::size_t or
    // if posix_memalign() reports failure.
    T* allocate(std::size_t num)
    {
        // Guard the multiplication: a wrapped byte count would make
        // posix_memalign() succeed with a buffer that is far too small.
        if (num > static_cast<std::size_t>(-1) / sizeof(T))
            throw std::bad_alloc();

        void* ptr = nullptr;
        if (posix_memalign(&ptr, 4096, num * sizeof(T)))
            throw std::bad_alloc();
        return static_cast<T*>(ptr);
    }

    // Releases storage previously returned by allocate(). The element count
    // is not needed by free() and is therefore ignored.
    void deallocate(T* p, std::size_t /*num*/) noexcept
    {
        free(p);
    }
};

// All aligned_allocator instances are interchangeable.
template <typename T, typename U>
bool operator==(const aligned_allocator<T>&, const aligned_allocator<U>&) noexcept
{
    return true;
}

template <typename T, typename U>
bool operator!=(const aligned_allocator<T>&, const aligned_allocator<U>&) noexcept
{
    return false;
}
|
gpt-2-medium/kernel-ultrascale.cpp
ADDED
|
@@ -0,0 +1,2091 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <cmath>
|
| 2 |
+
#include <string>
|
| 3 |
+
#include <tapa.h>
|
| 4 |
+
#include <ap_int.h>
|
| 5 |
+
#include <hls_math.h>
|
| 6 |
+
|
| 7 |
+
// ---- Base dimensions --------------------------------------------------------
constexpr int D           = 1024;
constexpr int D_ffn       = 3072;
constexpr int N_head      = 16;
constexpr int MAX_SEQ_LEN = 1024;
constexpr int D_ffn_SLR   = 1376;

// ---- Placement / port counts ------------------------------------------------
constexpr int NUM_SLR     = 3;
constexpr int NUM_DUM_SLR = 4;
constexpr int TOTAL_PORT  = NUM_SLR * 2;

// ---- Fractions of D ---------------------------------------------------------
constexpr int D_div_2  = D / 2;
constexpr int D_div_4  = D / 4;
constexpr int D_div_8  = D / 8;
constexpr int D_div_16 = D / 16;
constexpr int WEIGHT_D = D * 2;

// ---- Fractions of MAX_SEQ_LEN -----------------------------------------------
constexpr int MAX_SEQ_LEN_div_2 = MAX_SEQ_LEN / 2;
constexpr int MAX_SEQ_LEN_div_8 = MAX_SEQ_LEN / 8;

// ---- Per-head dimension and its fractions / multiples -----------------------
constexpr int D_head        = D / N_head;
constexpr int D_head_div_2  = D_head / 2;
constexpr int D_head_div_4  = D_head / 4;
constexpr int D_head_div_8  = D_head / 8;
constexpr int D_head_div_16 = D_head / 16;
constexpr int D_head_div_32 = D_head / 32;
constexpr int D_head_mul_4  = D_head * 4;
constexpr int CONTEXT_D     = D_head_div_8 * 5;

// ---- Fractions of D_ffn_SLR -------------------------------------------------
constexpr int D_ffn_SLR_div_2 = D_ffn_SLR / 2;
constexpr int D_ffn_SLR_div_8 = D_ffn_SLR / 8;

// ---- Weight buffer sizes ----------------------------------------------------
constexpr int FFN_WEIGHT_SIZE   = D * D_ffn_SLR * NUM_DUM_SLR;
constexpr int OUT_WEIGHT_SIZE   = D * D_head * NUM_DUM_SLR * 5;
constexpr int QKV_WEIGHT_SIZE   = D * D_head * NUM_DUM_SLR * 15 / 2; // multi-head attention
constexpr int TOTAL_WEIGHT_SIZE = OUT_WEIGHT_SIZE + QKV_WEIGHT_SIZE + FFN_WEIGHT_SIZE;

// ---- Widths passed to write_zero --------------------------------------------
constexpr int D_write_zero_acc0 = D / 32;
constexpr int D_write_zero_acc1 = D / 32 + D / 16;
|
| 38 |
+
|
| 39 |
+
// Fixed-width vector packets carried on TAPA streams.
using int_v16   = tapa::vec_t<int, 16>;          // 16 x int
using int4_v128 = tapa::vec_t<ap_int<4>, 128>;   // 128 x 4-bit signed
using int8_v64  = tapa::vec_t<ap_int<8>, 64>;    // 64 x 8-bit signed
|
| 42 |
+
|
| 43 |
+
template <typename data_t>
|
| 44 |
+
inline void bh(tapa::istream<data_t> & q) {
|
| 45 |
+
#pragma HLS inline
|
| 46 |
+
for (;;) {
|
| 47 |
+
#pragma HLS pipeline II=1 style=stp
|
| 48 |
+
data_t tmp; q.try_read(tmp);
|
| 49 |
+
}
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
struct ConfigInst {
|
| 53 |
+
ap_uint<3> stage; // stage 7 -> read L
|
| 54 |
+
ap_uint<11> weight_bound;
|
| 55 |
+
ap_uint<7> i_bound;
|
| 56 |
+
ap_uint<8> j_bound;
|
| 57 |
+
ap_uint<8> k_bound;
|
| 58 |
+
};
|
| 59 |
+
|
| 60 |
+
// Sink task: consumes and discards every int arriving on fifo_in.
void black_hole_int(tapa::istream<int> & fifo_in) {
    bh<int>(fifo_in);
}
|
| 63 |
+
|
| 64 |
+
// Sink task: consumes and discards every ConfigInst arriving on fifo_in.
void black_hole_inst(tapa::istream<ConfigInst> & fifo_in) {
    bh<ConfigInst>(fifo_in);
}
|
| 67 |
+
|
| 68 |
+
// Sink task: consumes and discards every int_v16 packet arriving on fifo_in.
void black_hole_int_v16(tapa::istream<int_v16> & fifo_in) {
    bh<int_v16>(fifo_in);
}
|
| 71 |
+
|
| 72 |
+
// Sink task: consumes and discards every int8_v64 packet arriving on fifo_in.
void black_hole_x(tapa::istream<int8_v64> & fifo_in) {
    bh<int8_v64>(fifo_in);
}
|
| 75 |
+
|
| 76 |
+
// Sink task: consumes and discards every int4_v128 packet arriving on fifo_in.
void black_hole_w(tapa::istream<int4_v128> & fifo_in) {
    bh<int4_v128>(fifo_in);
}
|
| 79 |
+
|
| 80 |
+
// Sink task: consumes and discards every 512-bit word arriving on fifo_in.
void black_hole_ap_uint_512(tapa::istream<ap_uint<512>> & fifo_in) {
    bh<ap_uint<512>>(fifo_in);
}
|
| 83 |
+
|
| 84 |
+
// Sink task: consumes and discards every 1024-bit word arriving on fifo_in.
void black_hole_ap_uint_1024(tapa::istream<ap_uint<1024>> & fifo_in) {
    bh<ap_uint<1024>>(fifo_in);
}
|
| 87 |
+
|
| 88 |
+
// Streams the weight buffer from external memory into fifo_out.
//
// Issues read requests for word indices 0 .. (TOTAL_WEIGHT_SIZE >> 7) - 1 and
// forwards every returned 512-bit word, in arrival order, until that many
// responses have been received.
// NOTE(review): the >> 7 presumably reflects 128 four-bit weights per 512-bit
// word - confirm against the host-side weight packing.
void read_W(
    tapa::async_mmap<ap_uint<512>>& vec,
    tapa::ostream<ap_uint<512>>& fifo_out
){
    constexpr int num_words = TOTAL_WEIGHT_SIZE >> 7;

    int issued = 0;    // addresses pushed so far
    int received = 0;  // data words forwarded so far
    while (received < num_words) {
#pragma HLS pipeline II=1 style=stp
        // Request side: push the next address whenever there is room.
        // Bitwise '&' is kept so both operands are always evaluated.
        if ((issued < num_words) & !vec.read_addr.full()) {
            vec.read_addr.write(issued);
            issued++;
        }

        // Response side: forward a word if one is available this cycle.
        ap_uint<512> word;
        if (vec.read_data.try_read(word)) {
            fifo_out.write(word);
            received++;
        }
    }
}
|
| 106 |
+
|
| 107 |
+
void read_X(
|
| 108 |
+
const int N,
|
| 109 |
+
tapa::async_mmap<ap_uint<512>>& vec,
|
| 110 |
+
tapa::ostream<ap_uint<512>>& fifo_out
|
| 111 |
+
){
|
| 112 |
+
for(int i_req = 0, i_resp = 0; i_resp < (N >> 6);){
|
| 113 |
+
#pragma HLS pipeline II=1 style=stp
|
| 114 |
+
if((i_req < (N >> 6)) & !vec.read_addr.full()){
|
| 115 |
+
vec.read_addr.write(i_req);
|
| 116 |
+
i_req++;
|
| 117 |
+
}
|
| 118 |
+
ap_uint<512> tmp_o;
|
| 119 |
+
bool success = vec.read_data.try_read(tmp_o);
|
| 120 |
+
if(success){
|
| 121 |
+
fifo_out.write(tmp_o);
|
| 122 |
+
i_resp++;
|
| 123 |
+
}
|
| 124 |
+
}
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
void read_inst(
|
| 128 |
+
const int L,
|
| 129 |
+
tapa::ostream<ConfigInst>& fifo_out_acc0,
|
| 130 |
+
tapa::ostream<ConfigInst>& fifo_out_acc1
|
| 131 |
+
){
|
| 132 |
+
ConfigInst len;
|
| 133 |
+
len.stage = 7;
|
| 134 |
+
len.weight_bound = L;
|
| 135 |
+
|
| 136 |
+
fifo_out_acc0.write(len);
|
| 137 |
+
fifo_out_acc1.write(len);
|
| 138 |
+
|
| 139 |
+
for(int stage_i = 0; stage_i < 17; stage_i++){
|
| 140 |
+
#pragma HLS pipeline II=1 style=stp
|
| 141 |
+
|
| 142 |
+
ConfigInst inst_acc0;
|
| 143 |
+
ConfigInst inst_acc1;
|
| 144 |
+
const int stage = (stage_i < 15) ? (stage_i % 3) : (stage_i - 12);
|
| 145 |
+
|
| 146 |
+
inst_acc0.stage = ap_uint<3>(stage);
|
| 147 |
+
inst_acc1.stage = ap_uint<3>(stage);
|
| 148 |
+
if(stage == 0){
|
| 149 |
+
inst_acc0.weight_bound = D_head_div_4;
|
| 150 |
+
inst_acc0.i_bound = (L >> 4);
|
| 151 |
+
inst_acc0.j_bound = D_head_div_16;
|
| 152 |
+
inst_acc0.k_bound = D_div_8;
|
| 153 |
+
|
| 154 |
+
inst_acc1 = inst_acc0;
|
| 155 |
+
} else if (stage == 1){
|
| 156 |
+
inst_acc0.weight_bound = D_head_div_8;
|
| 157 |
+
inst_acc0.i_bound = (L >> 4);
|
| 158 |
+
inst_acc0.j_bound = D_head_div_32;
|
| 159 |
+
inst_acc0.k_bound = D_div_8;
|
| 160 |
+
|
| 161 |
+
inst_acc1 = inst_acc0;
|
| 162 |
+
} else if (stage == 2){
|
| 163 |
+
inst_acc0.weight_bound = 0;
|
| 164 |
+
inst_acc0.i_bound = (L >> 4);
|
| 165 |
+
inst_acc0.j_bound = (L >> 4);
|
| 166 |
+
inst_acc0.k_bound = D_head_div_8;
|
| 167 |
+
|
| 168 |
+
inst_acc1.weight_bound = 0;
|
| 169 |
+
inst_acc1.i_bound = (L >> 4);
|
| 170 |
+
inst_acc1.j_bound = D_head_div_16;
|
| 171 |
+
inst_acc1.k_bound = (L >> 3);
|
| 172 |
+
} else if (stage == 3){
|
| 173 |
+
inst_acc0.weight_bound = (CONTEXT_D << 1);
|
| 174 |
+
inst_acc0.i_bound = (L >> 5);
|
| 175 |
+
inst_acc0.j_bound = D_div_16;
|
| 176 |
+
inst_acc0.k_bound = CONTEXT_D;
|
| 177 |
+
|
| 178 |
+
inst_acc1 = inst_acc0;
|
| 179 |
+
} else {
|
| 180 |
+
inst_acc0.weight_bound = (D_ffn_SLR >> 2);
|
| 181 |
+
inst_acc0.i_bound = (L >> 4);
|
| 182 |
+
inst_acc0.j_bound = (D_ffn_SLR >> 4);
|
| 183 |
+
inst_acc0.k_bound = D_div_8;
|
| 184 |
+
|
| 185 |
+
inst_acc1.weight_bound = D_div_4;
|
| 186 |
+
inst_acc1.i_bound = (L >> 4);
|
| 187 |
+
inst_acc1.j_bound = D_div_16;
|
| 188 |
+
inst_acc1.k_bound = D_ffn_SLR_div_8;
|
| 189 |
+
}
|
| 190 |
+
fifo_out_acc0.write(inst_acc0);
|
| 191 |
+
fifo_out_acc1.write(inst_acc1);
|
| 192 |
+
}
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
// Reads a single int from fifo_inst_in and duplicates it onto both output
// streams (fifo_sfu_out first, then fifo_sfu_gelu).
void packet_switch_acc(
    tapa::istream<int>& fifo_inst_in,
    tapa::ostream<int>& fifo_sfu_out,
    tapa::ostream<int>& fifo_sfu_gelu
) {
    const int seq_len = fifo_inst_in.read();
    fifo_sfu_out.write(seq_len);
    fifo_sfu_gelu.write(seq_len);
}
|
| 204 |
+
|
| 205 |
+
void write_mtx(
|
| 206 |
+
const int N,
|
| 207 |
+
tapa::async_mmap<ap_uint<128>>& output_mtx,
|
| 208 |
+
tapa::istream<ap_uint<128>>& fifo_in,
|
| 209 |
+
tapa::ostream<bool>& fifo_fin
|
| 210 |
+
){
|
| 211 |
+
|
| 212 |
+
for(int i_req = 0, i_resp = 0; i_resp < N;){
|
| 213 |
+
#pragma HLS pipeline II=1 style=stp
|
| 214 |
+
if((i_req < N) & !fifo_in.empty() & !output_mtx.write_addr.full() & !output_mtx.write_data.full()){
|
| 215 |
+
output_mtx.write_addr.try_write(i_req);
|
| 216 |
+
ap_uint<128> tmp; fifo_in.try_read(tmp);
|
| 217 |
+
output_mtx.write_data.try_write(tmp);
|
| 218 |
+
++i_req;
|
| 219 |
+
}
|
| 220 |
+
bool success = false;
|
| 221 |
+
auto resp = output_mtx.write_resp.read(success);
|
| 222 |
+
if(success){
|
| 223 |
+
i_resp += unsigned(resp)+1;
|
| 224 |
+
}
|
| 225 |
+
}
|
| 226 |
+
fifo_fin.write(true);
|
| 227 |
+
}
|
| 228 |
+
|
| 229 |
+
void write_zero(
|
| 230 |
+
const int L,
|
| 231 |
+
const int D,
|
| 232 |
+
tapa::ostream<ap_uint<512>>& fifo_zero
|
| 233 |
+
){
|
| 234 |
+
for(int i = 0; i < L * D;){
|
| 235 |
+
if(!fifo_zero.full()){
|
| 236 |
+
ap_uint<512> tmp = 0;
|
| 237 |
+
fifo_zero.try_write(tmp);
|
| 238 |
+
i++;
|
| 239 |
+
}
|
| 240 |
+
}
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
// acc slr0 master node
|
| 244 |
+
void temporal_acc0_slr0(
|
| 245 |
+
tapa::istream<ConfigInst>& fifo_inst_in,
|
| 246 |
+
tapa::ostream<ConfigInst>& fifo_inst_out,
|
| 247 |
+
tapa::ostream<int>& fifo_len_sfu,
|
| 248 |
+
tapa::istream<ap_uint<512>>& fifo_X_in,
|
| 249 |
+
tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
|
| 250 |
+
tapa::istream<ap_uint<512>>& fifo_W_in,
|
| 251 |
+
tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
|
| 252 |
+
tapa::istream<ap_uint<256>>& fifo_from_acc1,
|
| 253 |
+
tapa::ostream<ap_uint<512>>& fifo_O_out,
|
| 254 |
+
tapa::ostream<ap_uint<512>>& fifo_ffn_out,
|
| 255 |
+
tapa::istream<ap_uint<1024>>& fifo_context,
|
| 256 |
+
tapa::istream<ap_uint<1024>>& fifo_ffn_in,
|
| 257 |
+
tapa::istream<ap_uint<512>>& fifo_reduce_recv,
|
| 258 |
+
tapa::ostream<ap_uint<512>>& fifo_res_send
|
| 259 |
+
// tapa::ostream<ap_uint<64>>& fifo_write,
|
| 260 |
+
// tapa::ostream<bool>& fifo_fin
|
| 261 |
+
){
|
| 262 |
+
|
| 263 |
+
ap_uint<64> scratchpad_q[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
|
| 264 |
+
#pragma HLS array_partition variable=scratchpad_q cyclic dim=1 factor=16
|
| 265 |
+
#pragma HLS array_partition variable=scratchpad_q cyclic dim=2 factor=2
|
| 266 |
+
#pragma HLS bind_storage variable=scratchpad_q type=ram_2p impl=bram
|
| 267 |
+
|
| 268 |
+
ap_uint<64> scratchpad_k[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
|
| 269 |
+
#pragma HLS array_partition variable=scratchpad_k cyclic dim=1 factor=16
|
| 270 |
+
#pragma HLS array_partition variable=scratchpad_k cyclic dim=2 factor=2
|
| 271 |
+
#pragma HLS bind_storage variable=scratchpad_k type=ram_2p impl=bram
|
| 272 |
+
|
| 273 |
+
ap_uint<64> X[MAX_SEQ_LEN][D_div_8]; // 8 bit
|
| 274 |
+
#pragma HLS array_partition variable=X cyclic dim=1 factor=16
|
| 275 |
+
#pragma HLS array_partition variable=X cyclic dim=2 factor=2
|
| 276 |
+
#pragma HLS bind_storage variable=X type=ram_2p impl=uram
|
| 277 |
+
|
| 278 |
+
ConfigInst len = fifo_inst_in.read();
|
| 279 |
+
const int L = len.weight_bound;
|
| 280 |
+
fifo_inst_out.write(len);
|
| 281 |
+
fifo_len_sfu.write(L);
|
| 282 |
+
|
| 283 |
+
for(int stage_i = 0; stage_i < 17; stage_i++){
|
| 284 |
+
|
| 285 |
+
//TODO: stage send from inst
|
| 286 |
+
|
| 287 |
+
// stage 0: WqX
|
| 288 |
+
// stage 1: WkX0 <- acc1
|
| 289 |
+
// stage 2: QK^T
|
| 290 |
+
|
| 291 |
+
ap_uint<64> W[D_ffn_SLR_div_2][D_div_8]; // TODO: reduce dimension
|
| 292 |
+
#pragma HLS array_partition variable=W cyclic dim=1 factor=8
|
| 293 |
+
#pragma HLS bind_storage variable=W type=ram_2p impl=uram
|
| 294 |
+
|
| 295 |
+
ConfigInst inst = fifo_inst_in.read();
|
| 296 |
+
fifo_inst_out.write(inst);
|
| 297 |
+
|
| 298 |
+
const ap_uint<3> stage = inst.stage;
|
| 299 |
+
|
| 300 |
+
// load weights and forward
|
| 301 |
+
if(stage != 2) { // TODO: 1d array & uniform access
|
| 302 |
+
const int weight_bound = inst.weight_bound;
|
| 303 |
+
for(int i = 0; i < weight_bound; i++){
|
| 304 |
+
load_weight:
|
| 305 |
+
for(int j = 0; j < D_div_8;){
|
| 306 |
+
if(!fifo_W_in.empty()){
|
| 307 |
+
ap_uint<512> val; fifo_W_in.try_read(val);
|
| 308 |
+
|
| 309 |
+
for(int k = 0; k < 2; k++){
|
| 310 |
+
#pragma HLS unroll
|
| 311 |
+
W[i*2+k][j] = ap_uint<64>(val(k*64+63, k*64));
|
| 312 |
+
}
|
| 313 |
+
val = ap_uint<512>(val >> 128);
|
| 314 |
+
fifo_W_out.write(val);
|
| 315 |
+
j++;
|
| 316 |
+
}
|
| 317 |
+
}
|
| 318 |
+
}
|
| 319 |
+
}
|
| 320 |
+
|
| 321 |
+
// stage 1: compute Q
|
| 322 |
+
const int i_bound = inst.i_bound;
|
| 323 |
+
const int j_bound = inst.j_bound;
|
| 324 |
+
const int k_bound = inst.k_bound;
|
| 325 |
+
|
| 326 |
+
for(int i = 0; i < i_bound; i++){ // make sure L is multiple of 16
|
| 327 |
+
if(stage_i == 0){
|
| 328 |
+
for(int ii = 0; ii < 2; ii++){ // load only 1 time
|
| 329 |
+
load_x:
|
| 330 |
+
for(int jj = 0; jj < D_div_8;){
|
| 331 |
+
if(!fifo_X_in.empty()){
|
| 332 |
+
ap_uint<512> val; fifo_X_in.try_read(val);
|
| 333 |
+
|
| 334 |
+
for(int k = 0; k < 8; k++){
|
| 335 |
+
#pragma HLS unroll
|
| 336 |
+
X[i*16+ii*8+k][jj] = ap_uint<64>(val(k*64+63, k*64));
|
| 337 |
+
}
|
| 338 |
+
jj++;
|
| 339 |
+
}
|
| 340 |
+
}
|
| 341 |
+
}
|
| 342 |
+
}
|
| 343 |
+
|
| 344 |
+
for(int j = 0; (j < j_bound) & ((stage != 2) | (j <= i)); j++){
|
| 345 |
+
#pragma HLS loop_flatten off
|
| 346 |
+
|
| 347 |
+
ap_int<38> acc_vec[8][16][8];
|
| 348 |
+
#pragma HLS array_partition variable=acc_vec dim=1 complete
|
| 349 |
+
#pragma HLS array_partition variable=acc_vec dim=2 complete
|
| 350 |
+
#pragma HLS array_partition variable=acc_vec dim=3 complete
|
| 351 |
+
|
| 352 |
+
for(int ii = 0; ii < 8; ii++){
|
| 353 |
+
#pragma HLS unroll
|
| 354 |
+
for(int kk = 0; kk < 16; kk++){
|
| 355 |
+
#pragma HLS unroll
|
| 356 |
+
for(int k = 0; k < 8; k++){
|
| 357 |
+
#pragma HLS unroll
|
| 358 |
+
acc_vec[ii][kk][k] = 0;
|
| 359 |
+
}
|
| 360 |
+
}
|
| 361 |
+
}
|
| 362 |
+
|
| 363 |
+
compute:
|
| 364 |
+
for(int k = 0; k < k_bound; k++){ // reduction dim
|
| 365 |
+
#pragma HLS pipeline II=1 style=stp
|
| 366 |
+
|
| 367 |
+
ap_uint<64> op1_mtx[16];
|
| 368 |
+
ap_uint<64> op2_mtx[16];
|
| 369 |
+
#pragma HLS array_partition variable=op1_mtx complete
|
| 370 |
+
#pragma HLS array_partition variable=op2_mtx complete
|
| 371 |
+
|
| 372 |
+
ap_uint<1024> recv_pkt;
|
| 373 |
+
|
| 374 |
+
if(stage == 3) {
|
| 375 |
+
recv_pkt = fifo_context.read();
|
| 376 |
+
}else if(stage == 4) {
|
| 377 |
+
recv_pkt = fifo_ffn_in.read();
|
| 378 |
+
}
|
| 379 |
+
|
| 380 |
+
for(int ii = 0; ii < 16; ii++){
|
| 381 |
+
#pragma HLS unroll
|
| 382 |
+
if(stage > 2){
|
| 383 |
+
op1_mtx[ii] = ap_uint<64>(W[j*8+ii/2][k]((ii%2)*32+31, (ii%2)*32));
|
| 384 |
+
op2_mtx[ii] = recv_pkt(ii*64+63, ii*64);
|
| 385 |
+
} else if(stage == 2) {
|
| 386 |
+
op1_mtx[ii] = scratchpad_k[j*16+ii][k];
|
| 387 |
+
op2_mtx[ii] = scratchpad_q[i*16+ii][k];
|
| 388 |
+
} else {
|
| 389 |
+
op1_mtx[ii] = ap_uint<64>(W[j*8+ii/2][k]((ii%2)*32+31, (ii%2)*32));
|
| 390 |
+
op2_mtx[ii] = X[i*16+ii][k];
|
| 391 |
+
}
|
| 392 |
+
}
|
| 393 |
+
|
| 394 |
+
if(stage < 2){
|
| 395 |
+
ap_uint<1024> send_pkt = ap_uint<1024>((
|
| 396 |
+
op2_mtx[0], op2_mtx[1], op2_mtx[2], op2_mtx[3], op2_mtx[4], op2_mtx[5], op2_mtx[6], op2_mtx[7],
|
| 397 |
+
op2_mtx[8], op2_mtx[9], op2_mtx[10], op2_mtx[11], op2_mtx[12], op2_mtx[13], op2_mtx[14], op2_mtx[15]
|
| 398 |
+
));
|
| 399 |
+
fifo_X_out.write(send_pkt);
|
| 400 |
+
} else if (stage == 4) {
|
| 401 |
+
fifo_X_out.write(recv_pkt);
|
| 402 |
+
}
|
| 403 |
+
|
| 404 |
+
for(int ii = 0; ii < 8; ii++){
|
| 405 |
+
#pragma HLS unroll
|
| 406 |
+
for(int kk = 0; kk < 16; kk++){
|
| 407 |
+
#pragma HLS unroll
|
| 408 |
+
for(int l = 0; l < 8; l++){
|
| 409 |
+
#pragma HLS unroll
|
| 410 |
+
ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
|
| 411 |
+
op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
|
| 412 |
+
if(stage == 2){
|
| 413 |
+
op1 = ap_int<8>(op1_mtx[l*2](ii*8+7, ii*8));
|
| 414 |
+
op2 = ap_int<8>(op1_mtx[l*2+1](ii*8+7, ii*8));
|
| 415 |
+
} else {
|
| 416 |
+
op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
|
| 417 |
+
op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
|
| 418 |
+
}
|
| 419 |
+
ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
|
| 420 |
+
acc_vec[ii][kk][l] += w_pack * op3;
|
| 421 |
+
}
|
| 422 |
+
}
|
| 423 |
+
}
|
| 424 |
+
}
|
| 425 |
+
|
| 426 |
+
ap_int<22> acc_final[16][16];
|
| 427 |
+
#pragma HLS array_partition variable=acc_final dim=1 complete
|
| 428 |
+
#pragma HLS array_partition variable=acc_final dim=2 complete
|
| 429 |
+
|
| 430 |
+
for(int ii = 0; ii < 16; ii++){
|
| 431 |
+
#pragma HLS unroll
|
| 432 |
+
for(int k = 0; k < 16; k++){
|
| 433 |
+
#pragma HLS unroll
|
| 434 |
+
acc_final[ii][k] = 0;
|
| 435 |
+
}
|
| 436 |
+
}
|
| 437 |
+
|
| 438 |
+
reduction:
|
| 439 |
+
for(int kk = 0; kk < 8; kk++){
|
| 440 |
+
for(int ii = 0; ii < 16; ii++){
|
| 441 |
+
#pragma HLS unroll
|
| 442 |
+
for(int k = 0; k < 8; k++){
|
| 443 |
+
#pragma HLS unroll
|
| 444 |
+
ap_int<19> res0; ap_int<19> res1;
|
| 445 |
+
(res1, res0) = acc_vec[kk][ii][k];
|
| 446 |
+
res1 = res1 + res0[18];
|
| 447 |
+
acc_final[ii][k*2] += res0;
|
| 448 |
+
acc_final[ii][k*2+1] += res1;
|
| 449 |
+
}
|
| 450 |
+
}
|
| 451 |
+
}
|
| 452 |
+
|
| 453 |
+
if(stage == 0){
|
| 454 |
+
for(int ii = 0; ii < 16; ii++){
|
| 455 |
+
#pragma HLS unroll
|
| 456 |
+
for(int k = 0; k < 16; k++){
|
| 457 |
+
#pragma HLS unroll
|
| 458 |
+
int offset = k%8;
|
| 459 |
+
scratchpad_q[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k] >> 8);
|
| 460 |
+
}
|
| 461 |
+
}
|
| 462 |
+
} else if (stage == 1){
|
| 463 |
+
for(int ii = 0; ii < 4; ii++){
|
| 464 |
+
for(int jj = 0; jj < 2; jj++){
|
| 465 |
+
#pragma HLS pipeline II=1 style=stp
|
| 466 |
+
ap_uint<256> tmp = fifo_from_acc1.read();
|
| 467 |
+
|
| 468 |
+
for(int l = 0; l < 4; l++){
|
| 469 |
+
#pragma HLS unroll
|
| 470 |
+
ap_uint<64> tmp_pack;
|
| 471 |
+
for(int k = 0; k < 8; k++){
|
| 472 |
+
#pragma HLS unroll
|
| 473 |
+
tmp_pack(k*8+7, k*8) = ap_int<8>(acc_final[ii*4+l][jj*8+k] >> 8);
|
| 474 |
+
}
|
| 475 |
+
scratchpad_k[i*16+ii*4+l][j*4+jj*2] = tmp_pack;
|
| 476 |
+
}
|
| 477 |
+
for(int l = 0; l < 4; l++){
|
| 478 |
+
#pragma HLS unroll
|
| 479 |
+
scratchpad_k[i*16+ii*4+l][j*4+jj*2+1] = tmp(l*64+63, l*64);
|
| 480 |
+
}
|
| 481 |
+
}
|
| 482 |
+
}
|
| 483 |
+
} else if(stage == 2 || stage == 4){
|
| 484 |
+
for(int kk = 0; kk < 16; kk++){
|
| 485 |
+
#pragma HLS pipeline II=1 style=stp
|
| 486 |
+
ap_uint<512> tmp;
|
| 487 |
+
for(int ii = 0; ii < 16; ii++){
|
| 488 |
+
#pragma HLS unroll
|
| 489 |
+
if(stage == 2 && (i*16+ii < j*16+kk)){
|
| 490 |
+
tmp(ii*32+31, ii*32) = ap_int<32>(-1e8); // masking (inefficient)
|
| 491 |
+
} else {
|
| 492 |
+
tmp(ii*32+31, ii*32) = tapa::bit_cast<ap_uint<32>>(acc_final[ii][kk]);
|
| 493 |
+
}
|
| 494 |
+
}
|
| 495 |
+
if(stage == 2) fifo_O_out.write(tmp);
|
| 496 |
+
else fifo_ffn_out.write(tmp);
|
| 497 |
+
}
|
| 498 |
+
} else {
|
| 499 |
+
final_acc:
|
| 500 |
+
for(int ii = 0; ii < 16; ii++){
|
| 501 |
+
#pragma HLS pipeline II=1 style=stp
|
| 502 |
+
#pragma HLS dependence variable=X type=inter false
|
| 503 |
+
ap_uint<512> tmp_recv = fifo_reduce_recv.read();
|
| 504 |
+
ap_uint<512> tmp_send;
|
| 505 |
+
for(int k = 0; k < 16; k++){
|
| 506 |
+
#pragma HLS unroll
|
| 507 |
+
ap_int<32> tmp = acc_final[ii][k] + ap_int<32>(tmp_recv(k*32+31, k*32));
|
| 508 |
+
tmp += ap_int<8>(X[i*16+ii][j*2+k/8]((k%8)*8+7, (k%8)*8));
|
| 509 |
+
tmp_send(k*32+31, k*32) = tmp;
|
| 510 |
+
}
|
| 511 |
+
fifo_res_send.write(tmp_send);
|
| 512 |
+
}
|
| 513 |
+
}
|
| 514 |
+
}
|
| 515 |
+
}
|
| 516 |
+
}
|
| 517 |
+
// fifo_fin.write(true);
|
| 518 |
+
|
| 519 |
+
// write:
|
| 520 |
+
// for(int i = 0; i < L; i++){
|
| 521 |
+
// for(int j = 0; j < D_div_8; j++){
|
| 522 |
+
// #pragma HLS pipeline II=1 style=stp
|
| 523 |
+
// fifo_write.write(X[i][j]);
|
| 524 |
+
// }
|
| 525 |
+
// }
|
| 526 |
+
}
|
| 527 |
+
|
| 528 |
+
// temporal_acc0: streaming tile matrix-multiply task.
//
// Protocol, as implemented below:
//   * The first ConfigInst carries the length in its `weight_bound` field; it is
//     forwarded on fifo_inst_out and the length is sent on fifo_len_sfu.
//   * Then 17 ConfigInsts are consumed (each forwarded on fifo_inst_out). The
//     `stage` field of each selects the data path:
//       - stage != 2 : weights are first streamed in from fifo_W_in. The low
//                      128 bits of every word are kept locally in W; the word
//                      shifted right by 128 bits is forwarded on fifo_W_out.
//       - stage == 2 : operands come from scratchpad_q / scratchpad_k.
//       - stage == 3 : activations come from fifo_context.
//       - otherwise  : activations come from fifo_X_in and are forwarded on
//                      fifo_X_out.
//   * Every (i, j) tile accumulates a 16x16 block and then:
//       - stage 0 : stores (acc >> 8) as int8 in scratchpad_q,
//       - stage 1 : stores (acc >> 8) as int8 in the even columns of
//                   scratchpad_k and words read from fifo_from_acc1 in the odd
//                   columns,
//       - stage 2 : writes the tile to fifo_O_out, replacing entries whose row
//                   index (i*16+ii) is smaller than the column index (j*16+kk)
//                   by the integer constant -1e8,
//       - stage 4 : writes the tile to fifo_ffn_out,
//       - others  : adds the partial sums from fifo_reduce_recv and sends the
//                   result on fifo_reduce_send.
//
// Two products share one multiplier: w_pack = (op2 << 19) + op1 is multiplied
// by the activation and accumulated in 38 bits; the reduction step splits the
// accumulator into two 19-bit halves and adds the low half's sign bit back
// into the high half.
void temporal_acc0(
    tapa::istream<ConfigInst>& fifo_inst_in,
    tapa::ostream<ConfigInst>& fifo_inst_out,
    tapa::ostream<int>& fifo_len_sfu,
    tapa::istream<ap_uint<1024>>& fifo_X_in,
    tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
    tapa::istream<ap_uint<512>>& fifo_W_in,
    tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
    tapa::istream<ap_uint<256>>& fifo_from_acc1,
    tapa::ostream<ap_uint<512>>& fifo_O_out,
    tapa::istream<ap_uint<1024>>& fifo_context,
    tapa::ostream<ap_uint<512>>& fifo_ffn_out,
    tapa::istream<ap_uint<512>>& fifo_reduce_recv,
    tapa::ostream<ap_uint<512>>& fifo_reduce_send
){

    // Stage-0 results (eight int8 values per 64-bit word).
    ap_uint<64> scratchpad_q[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
    #pragma HLS array_partition variable=scratchpad_q cyclic dim=1 factor=16
    #pragma HLS array_partition variable=scratchpad_q cyclic dim=2 factor=2
    #pragma HLS bind_storage variable=scratchpad_q type=ram_2p impl=bram

    // Stage-1 results, interleaved with the words received from fifo_from_acc1.
    ap_uint<64> scratchpad_k[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
    #pragma HLS array_partition variable=scratchpad_k cyclic dim=1 factor=16
    #pragma HLS array_partition variable=scratchpad_k cyclic dim=2 factor=2
    #pragma HLS bind_storage variable=scratchpad_k type=ram_2p impl=bram

    // The first instruction only carries the length (in weight_bound).
    ConfigInst len = fifo_inst_in.read();
    const int L = len.weight_bound;
    fifo_inst_out.write(len);
    fifo_len_sfu.write(L);

    for(int stage_i = 0; stage_i < 17; stage_i++){
        #pragma HLS loop_flatten off

        // Author's stage notes:
        // stage 0: WqX
        // stage 1: WkX0 <- acc1
        // stage 2: QK^T
        // stage 3: WoO

        ap_uint<64> W[D_ffn_SLR_div_2][D_div_8]; // 4 bit
        #pragma HLS array_partition variable=W cyclic dim=1 factor=8
        #pragma HLS bind_storage variable=W type=ram_2p impl=uram

        ConfigInst inst = fifo_inst_in.read();
        fifo_inst_out.write(inst);

        const ap_uint<3> stage = inst.stage;

        // Load this task's share of the weights and forward the rest.
        if(stage != 2) {
            const int weight_bound = inst.weight_bound;
            for(int i = 0; i < weight_bound; i++){
                load_weight:
                for(int j = 0; j < D_div_8;){
                    if(!fifo_W_in.empty()){
                        ap_uint<512> val; fifo_W_in.try_read(val);

                        // Keep bits [127:0] as two 64-bit rows of W.
                        for(int k = 0; k < 2; k++){
                            #pragma HLS unroll
                            W[i*2+k][j] = ap_uint<64>(val(k*64+63, k*64));
                        }
                        // Pass the remaining bits on to the next task.
                        val = ap_uint<512>(val >> 128);
                        fifo_W_out.write(val);
                        j++;
                    }
                }
            }
        }

        const int i_bound = inst.i_bound;
        const int j_bound = inst.j_bound;
        const int k_bound = inst.k_bound;

        // Tile loops: one 16x16 output tile per (i, j).
        for(int i = 0; i < i_bound; i++){ // author's note: make sure L is multiple of 64

            // In stage 2 only tiles with j <= i are computed.
            for(int j = 0; (j < j_bound) & ((stage != 2) | (j <= i)); j++){
                #pragma HLS loop_flatten off

                ap_int<38> acc_vec[8][16][8];
                #pragma HLS array_partition variable=acc_vec dim=1 complete
                #pragma HLS array_partition variable=acc_vec dim=2 complete
                #pragma HLS array_partition variable=acc_vec dim=3 complete

                for(int ii = 0; ii < 8; ii++){
                    #pragma HLS unroll
                    for(int kk = 0; kk < 16; kk++){
                        #pragma HLS unroll
                        for(int k = 0; k < 8; k++){
                            #pragma HLS unroll
                            acc_vec[ii][kk][k] = 0;
                        }
                    }
                }

                compute:
                for(int k = 0; k < k_bound; k++){ // reduction dim
                    #pragma HLS pipeline II=1 style=stp

                    ap_uint<64> op1_mtx[16];
                    ap_uint<64> op2_mtx[16];
                    #pragma HLS array_partition variable=op1_mtx complete
                    #pragma HLS array_partition variable=op2_mtx complete

                    // recv_pkt is left unset in stage 2, where it is not read.
                    ap_uint<1024> recv_pkt;
                    if(stage == 3){
                        recv_pkt = fifo_context.read();
                    } else if(stage != 2) {
                        recv_pkt = fifo_X_in.read();
                        fifo_X_out.write(recv_pkt);
                    }

                    for(int ii = 0; ii < 16; ii++){
                        #pragma HLS unroll
                        if(stage == 2) {
                            op1_mtx[ii] = scratchpad_q[i*16+ii][k];
                            op2_mtx[ii] = scratchpad_k[j*16+ii][k];
                        } else {
                            op1_mtx[ii] = ap_uint<64>(W[j*8+ii/2][k]((ii%2)*32+31, (ii%2)*32));
                            op2_mtx[ii] = ap_uint<64>(recv_pkt(ii*64+63, ii*64));
                        }
                    }

                    for(int ii = 0; ii < 8; ii++){
                        #pragma HLS unroll
                        for(int kk = 0; kk < 16; kk++){
                            #pragma HLS unroll
                            for(int l = 0; l < 8; l++){
                                #pragma HLS unroll
                                ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
                                op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
                                if(stage == 2){
                                    // 8-bit x 8-bit operands
                                    op1 = ap_int<8>(op1_mtx[l*2](ii*8+7, ii*8));
                                    op2 = ap_int<8>(op1_mtx[l*2+1](ii*8+7, ii*8));
                                } else {
                                    // 4-bit weights, sign-extended to 8 bits
                                    op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
                                    op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
                                }
                                // Pack two operands so one multiply yields two products.
                                ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
                                acc_vec[ii][kk][l] += w_pack * op3;
                            }
                        }
                    }
                }

                ap_int<22> acc_final[16][16];
                #pragma HLS array_partition variable=acc_final dim=1 complete
                #pragma HLS array_partition variable=acc_final dim=2 complete

                for(int ii = 0; ii < 16; ii++){
                    #pragma HLS unroll
                    for(int k = 0; k < 16; k++){
                        #pragma HLS unroll
                        acc_final[ii][k] = 0;
                    }
                }

                reduction:
                for(int kk = 0; kk < 8; kk++){
                    for(int ii = 0; ii < 16; ii++){
                        #pragma HLS unroll
                        for(int k = 0; k < 8; k++){
                            #pragma HLS unroll
                            ap_int<19> res0; ap_int<19> res1;
                            // Split the packed accumulator into its two products.
                            (res1, res0) = acc_vec[kk][ii][k];
                            // Undo the borrow taken by a negative low half.
                            res1 = res1 + res0[18];
                            acc_final[ii][k*2] += res0;
                            acc_final[ii][k*2+1] += res1;
                        }
                    }
                }

                if(stage == 0){
                    for(int ii = 0; ii < 16; ii++){
                        #pragma HLS unroll
                        for(int k = 0; k < 16; k++){
                            #pragma HLS unroll
                            int offset = k%8;
                            scratchpad_q[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k] >> 8);
                        }
                    }
                } else if (stage == 1){
                    for(int ii = 0; ii < 4; ii++){
                        for(int jj = 0; jj < 2; jj++){
                            #pragma HLS pipeline II=1 style=stp
                            ap_uint<256> tmp = fifo_from_acc1.read();

                            // Locally computed values go to the even column ...
                            for(int l = 0; l < 4; l++){
                                #pragma HLS unroll
                                ap_uint<64> tmp_pack;
                                for(int k = 0; k < 8; k++){
                                    #pragma HLS unroll
                                    tmp_pack(k*8+7, k*8) = ap_int<8>(acc_final[ii*4+l][jj*8+k] >> 8);
                                }
                                scratchpad_k[i*16+ii*4+l][j*4+jj*2] = tmp_pack;
                            }
                            // ... and the words received from acc1 to the odd column.
                            for(int l = 0; l < 4; l++){
                                #pragma HLS unroll
                                scratchpad_k[i*16+ii*4+l][j*4+jj*2+1] = tmp(l*64+63, l*64);
                            }
                        }
                    }
                } else if(stage == 2 || stage == 4){
                    for(int kk = 0; kk < 16; kk++){
                        #pragma HLS pipeline II=1 style=stp
                        ap_uint<512> tmp;
                        for(int ii = 0; ii < 16; ii++){
                            #pragma HLS unroll
                            if(stage == 2 && (i*16+ii < j*16+kk)){
                                tmp(ii*32+31, ii*32) = ap_int<32>(-1e8); // masking (inefficient)
                            } else {
                                tmp(ii*32+31, ii*32) = tapa::bit_cast<ap_uint<32>>(acc_final[ii][kk]);
                            }
                        }
                        if(stage == 2) fifo_O_out.write(tmp);
                        else fifo_ffn_out.write(tmp);
                    }
                } else {
                    final_acc:
                    for(int ii = 0; ii < 16; ii++){
                        #pragma HLS pipeline II=1 style=stp
                        ap_uint<512> tmp_recv = fifo_reduce_recv.read();
                        ap_uint<512> tmp;
                        for(int k = 0; k < 16; k++){
                            #pragma HLS unroll
                            acc_final[ii][k] += ap_int<24>(tmp_recv(k*32+23, k*32));
                            // Write the whole 32-bit lane, sign-extended. Previously only
                            // bits [23:0] were written and the upper bits of the
                            // default-constructed word were sent undefined; the low bits
                            // are unchanged, so receivers reading 24 or 32 bits both work.
                            tmp(k*32+31, k*32) = ap_int<32>(acc_final[ii][k]);
                        }
                        fifo_reduce_send.write(tmp);
                    }
                }
            }
        }
    }
}
|
| 763 |
+
|
| 764 |
+
// acc slr0 master node
|
| 765 |
+
void temporal_acc1_slr0(
|
| 766 |
+
tapa::istream<ConfigInst>& fifo_inst_in,
|
| 767 |
+
tapa::ostream<ConfigInst>& fifo_inst_out,
|
| 768 |
+
tapa::ostream<int>& fifo_len_context,
|
| 769 |
+
tapa::istream<ap_uint<512>>& fifo_X_in,
|
| 770 |
+
tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
|
| 771 |
+
tapa::istream<ap_uint<512>>& fifo_W_in,
|
| 772 |
+
tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
|
| 773 |
+
tapa::ostream<ap_uint<256>>& fifo_to_acc0,
|
| 774 |
+
tapa::istream<ap_uint<128>>& fifo_from_sfu,
|
| 775 |
+
tapa::ostream<ap_uint<1024>>& fifo_O_out,
|
| 776 |
+
tapa::istream<ap_uint<1024>>& fifo_context,
|
| 777 |
+
tapa::istream<ap_uint<512>>& fifo_reduce_recv,
|
| 778 |
+
tapa::ostream<ap_uint<512>>& fifo_res_send,
|
| 779 |
+
tapa::istream<ap_uint<1024>>& fifo_gelu_in,
|
| 780 |
+
tapa::ostream<ap_uint<512>>& fifo_ffn_out
|
| 781 |
+
// tapa::ostream<ap_uint<64>>& fifo_write,
|
| 782 |
+
// tapa::ostream<bool>& fifo_fin
|
| 783 |
+
){
|
| 784 |
+
ap_uint<64> X[MAX_SEQ_LEN][D_div_8]; // 8 bit
|
| 785 |
+
#pragma HLS array_partition variable=X cyclic dim=1 factor=16
|
| 786 |
+
#pragma HLS array_partition variable=X cyclic dim=2 factor=2
|
| 787 |
+
#pragma HLS bind_storage variable=X type=ram_2p impl=uram
|
| 788 |
+
|
| 789 |
+
ap_uint<64> scratchpad[MAX_SEQ_LEN_div_8][D_head]; // 8 bit
|
| 790 |
+
#pragma HLS array_partition variable=scratchpad cyclic dim=1 factor=2
|
| 791 |
+
#pragma HLS array_partition variable=scratchpad cyclic dim=2 factor=16
|
| 792 |
+
#pragma HLS bind_storage variable=scratchpad type=ram_2p impl=bram
|
| 793 |
+
|
| 794 |
+
// ap_uint<64> scratchpad_out[MAX_SEQ_LEN][D_head_div_8];
|
| 795 |
+
// #pragma HLS array_partition variable=scratchpad_out cyclic dim=1 factor=16
|
| 796 |
+
// #pragma HLS array_partition variable=scratchpad_out cyclic dim=2 factor=2
|
| 797 |
+
|
| 798 |
+
ConfigInst len = fifo_inst_in.read();
|
| 799 |
+
const int L = len.weight_bound;
|
| 800 |
+
fifo_inst_out.write(len);
|
| 801 |
+
fifo_len_context.write(L);
|
| 802 |
+
|
| 803 |
+
for(int stage_i = 0; stage_i < 17; stage_i++){
|
| 804 |
+
|
| 805 |
+
// stage 0: WvX
|
| 806 |
+
// stage 1: WkX1 -> acc0
|
| 807 |
+
// stage 2: Softmax(QK)V <- acc0
|
| 808 |
+
// stage 3: WoO
|
| 809 |
+
|
| 810 |
+
ap_uint<64> W[D_div_2][D_ffn_SLR_div_8]; // 4 bit
|
| 811 |
+
#pragma HLS array_partition variable=W cyclic dim=1 factor=8
|
| 812 |
+
#pragma HLS bind_storage variable=W type=ram_2p impl=uram
|
| 813 |
+
|
| 814 |
+
|
| 815 |
+
ConfigInst inst = fifo_inst_in.read();
|
| 816 |
+
fifo_inst_out.write(inst);
|
| 817 |
+
|
| 818 |
+
const ap_uint<3> stage = inst.stage;
|
| 819 |
+
|
| 820 |
+
// load weights and forward
|
| 821 |
+
if(stage != 2) {
|
| 822 |
+
const int weight_bound = inst.weight_bound;
|
| 823 |
+
int sub_bound = D_div_8;
|
| 824 |
+
if (stage == 4) sub_bound = D_ffn_SLR_div_8;
|
| 825 |
+
for(int i = 0; i < weight_bound; i++){
|
| 826 |
+
load_weight:
|
| 827 |
+
for(int j = 0; j < sub_bound;){
|
| 828 |
+
if(!fifo_W_in.empty()){
|
| 829 |
+
ap_uint<512> val; fifo_W_in.try_read(val);
|
| 830 |
+
|
| 831 |
+
for(int k = 0; k < 2; k++){
|
| 832 |
+
#pragma HLS unroll
|
| 833 |
+
W[i*2+k][j] = ap_uint<64>(val(k*64+63, k*64));
|
| 834 |
+
}
|
| 835 |
+
val = ap_uint<512>(val >> 128);
|
| 836 |
+
fifo_W_out.write(val);
|
| 837 |
+
j++;
|
| 838 |
+
}
|
| 839 |
+
}
|
| 840 |
+
}
|
| 841 |
+
}
|
| 842 |
+
|
| 843 |
+
const int i_bound = inst.i_bound;
|
| 844 |
+
const int j_bound = inst.j_bound;
|
| 845 |
+
|
| 846 |
+
for(int i = 0; i < i_bound; i++){ // make sure L is multiple of 4
|
| 847 |
+
|
| 848 |
+
const int k_bound = (stage == 2) ? ap_uint<8>((i+1)*2) : inst.k_bound;
|
| 849 |
+
|
| 850 |
+
ap_uint<32> cache_attn[MAX_SEQ_LEN_div_8][16];
|
| 851 |
+
#pragma HLS array_partition variable=cache_attn dim=2 complete
|
| 852 |
+
#pragma HLS array_partition variable=cache_attn dim=1 cyclic factor=2
|
| 853 |
+
|
| 854 |
+
if(stage_i == 0){
|
| 855 |
+
for(int ii = 0; ii < 2; ii++){ // load only 1 time
|
| 856 |
+
load_x:
|
| 857 |
+
for(int jj = 0; jj < D_div_8;){
|
| 858 |
+
if(!fifo_X_in.empty()){
|
| 859 |
+
ap_uint<512> val; fifo_X_in.try_read(val);
|
| 860 |
+
|
| 861 |
+
for(int k = 0; k < 8; k++){
|
| 862 |
+
#pragma HLS unroll
|
| 863 |
+
X[i*16+ii*8+k][jj] = ap_uint<64>(val(k*64+63, k*64));
|
| 864 |
+
}
|
| 865 |
+
jj++;
|
| 866 |
+
}
|
| 867 |
+
}
|
| 868 |
+
}
|
| 869 |
+
} else if (stage == 2) {
|
| 870 |
+
for(int ii = 0; ii < ((i+1)*2); ii++){
|
| 871 |
+
ap_uint<32> fuse_reg[16];
|
| 872 |
+
load_attn:
|
| 873 |
+
for(int offset = 0; offset < 8;){
|
| 874 |
+
#pragma HLS pipeline II=1 style=stp
|
| 875 |
+
if(!fifo_from_sfu.empty()){
|
| 876 |
+
ap_uint<128> val; fifo_from_sfu.try_read(val);
|
| 877 |
+
for(int k = 0; k < 16; k++){
|
| 878 |
+
#pragma HLS unroll
|
| 879 |
+
fuse_reg[k](offset*4+3, offset*4) = ap_int<8>(val(k*8+3, k*8));
|
| 880 |
+
}
|
| 881 |
+
offset++;
|
| 882 |
+
}
|
| 883 |
+
}
|
| 884 |
+
for(int k = 0; k < 16; k++){
|
| 885 |
+
#pragma HLS unroll
|
| 886 |
+
cache_attn[ii][k] = fuse_reg[k];
|
| 887 |
+
}
|
| 888 |
+
}
|
| 889 |
+
}
|
| 890 |
+
|
| 891 |
+
for(int j = 0; j < j_bound; j++){
|
| 892 |
+
#pragma HLS loop_flatten off
|
| 893 |
+
|
| 894 |
+
ap_int<38> acc_vec[8][16][8];
|
| 895 |
+
#pragma HLS array_partition variable=acc_vec dim=1 complete
|
| 896 |
+
#pragma HLS array_partition variable=acc_vec dim=2 complete
|
| 897 |
+
#pragma HLS array_partition variable=acc_vec dim=3 complete
|
| 898 |
+
|
| 899 |
+
for(int ii = 0; ii < 8; ii++){
|
| 900 |
+
#pragma HLS unroll
|
| 901 |
+
for(int kk = 0; kk < 16; kk++){
|
| 902 |
+
#pragma HLS unroll
|
| 903 |
+
for(int k = 0; k < 8; k++){
|
| 904 |
+
#pragma HLS unroll
|
| 905 |
+
acc_vec[ii][kk][k] = 0;
|
| 906 |
+
}
|
| 907 |
+
}
|
| 908 |
+
}
|
| 909 |
+
|
| 910 |
+
compute:
|
| 911 |
+
for(int k = 0; k < k_bound; k++){
|
| 912 |
+
#pragma HLS pipeline II=1 style=stp
|
| 913 |
+
|
| 914 |
+
ap_uint<64> op1_mtx[16];
|
| 915 |
+
ap_uint<64> op2_mtx[16];
|
| 916 |
+
#pragma HLS array_partition variable=op1_mtx complete
|
| 917 |
+
#pragma HLS array_partition variable=op2_mtx complete
|
| 918 |
+
|
| 919 |
+
ap_uint<1024> recv_pkt;
|
| 920 |
+
|
| 921 |
+
if(stage == 3) {
|
| 922 |
+
recv_pkt = fifo_context.read();
|
| 923 |
+
} else if(stage == 4) {
|
| 924 |
+
recv_pkt = fifo_gelu_in.read();
|
| 925 |
+
}
|
| 926 |
+
|
| 927 |
+
for(int ii = 0; ii < 16; ii++){
|
| 928 |
+
#pragma HLS unroll
|
| 929 |
+
if(stage == 3){
|
| 930 |
+
op1_mtx[ii] = ap_uint<64>(W[j*8+ii/2][k]((ii%2)*32+31, (ii%2)*32));
|
| 931 |
+
op2_mtx[ii] = recv_pkt(ii*64+63, ii*64);
|
| 932 |
+
} else if(stage != 2) {
|
| 933 |
+
op1_mtx[ii] = ap_uint<64>(W[j*8+ii/2][k]((ii%2)*32+31, (ii%2)*32));
|
| 934 |
+
op2_mtx[ii] = X[i*16+ii][k];
|
| 935 |
+
} else {
|
| 936 |
+
op1_mtx[ii] = ap_uint<64>(cache_attn[k][ii]);
|
| 937 |
+
op2_mtx[ii] = scratchpad[k][j*16+ii];
|
| 938 |
+
}
|
| 939 |
+
}
|
| 940 |
+
|
| 941 |
+
if(stage < 2){
|
| 942 |
+
ap_uint<1024> send_pkt = ap_uint<1024>((
|
| 943 |
+
op2_mtx[0], op2_mtx[1], op2_mtx[2], op2_mtx[3], op2_mtx[4], op2_mtx[5], op2_mtx[6], op2_mtx[7],
|
| 944 |
+
op2_mtx[8], op2_mtx[9], op2_mtx[10], op2_mtx[11], op2_mtx[12], op2_mtx[13], op2_mtx[14], op2_mtx[15]
|
| 945 |
+
));
|
| 946 |
+
fifo_X_out.write(send_pkt);
|
| 947 |
+
}
|
| 948 |
+
|
| 949 |
+
for(int ii = 0; ii < 8; ii++){
|
| 950 |
+
#pragma HLS unroll
|
| 951 |
+
for(int kk = 0; kk < 16; kk++){
|
| 952 |
+
#pragma HLS unroll
|
| 953 |
+
for(int l = 0; l < 8; l++){
|
| 954 |
+
#pragma HLS unroll
|
| 955 |
+
ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
|
| 956 |
+
op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
|
| 957 |
+
op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
|
| 958 |
+
op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
|
| 959 |
+
ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
|
| 960 |
+
acc_vec[ii][kk][l] += w_pack * op3;
|
| 961 |
+
}
|
| 962 |
+
}
|
| 963 |
+
}
|
| 964 |
+
}
|
| 965 |
+
|
| 966 |
+
ap_int<22> acc_final[16][16];
|
| 967 |
+
#pragma HLS array_partition variable=acc_final dim=1 complete
|
| 968 |
+
#pragma HLS array_partition variable=acc_final dim=2 complete
|
| 969 |
+
|
| 970 |
+
for(int ii = 0; ii < 16; ii++){
|
| 971 |
+
#pragma HLS unroll
|
| 972 |
+
for(int k = 0; k < 16; k++){
|
| 973 |
+
#pragma HLS unroll
|
| 974 |
+
acc_final[ii][k] = 0;
|
| 975 |
+
}
|
| 976 |
+
}
|
| 977 |
+
|
| 978 |
+
reduction:
|
| 979 |
+
for(int kk = 0; kk < 8; kk++){
|
| 980 |
+
for(int ii = 0; ii < 16; ii++){
|
| 981 |
+
#pragma HLS unroll
|
| 982 |
+
for(int k = 0; k < 8; k++){
|
| 983 |
+
#pragma HLS unroll
|
| 984 |
+
ap_int<19> res0; ap_int<19> res1;
|
| 985 |
+
(res1, res0) = acc_vec[kk][ii][k];
|
| 986 |
+
res1 = res1 + res0[18];
|
| 987 |
+
acc_final[ii][k*2] += res0;
|
| 988 |
+
acc_final[ii][k*2+1] += res1;
|
| 989 |
+
}
|
| 990 |
+
}
|
| 991 |
+
}
|
| 992 |
+
|
| 993 |
+
if(stage == 0){
|
| 994 |
+
for(int ii = 0; ii < 16; ii++){
|
| 995 |
+
#pragma HLS unroll
|
| 996 |
+
for(int k = 0; k < 16; k++){
|
| 997 |
+
#pragma HLS unroll
|
| 998 |
+
int offset = ii%8;
|
| 999 |
+
scratchpad[i*2+ii/8][j*16+k](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k] >> 8);
|
| 1000 |
+
}
|
| 1001 |
+
}
|
| 1002 |
+
} else if (stage == 2){
|
| 1003 |
+
for(int ii = 0; ii < 2; ii++){
|
| 1004 |
+
#pragma HLS pipeline II=1 style=stp
|
| 1005 |
+
ap_uint<1024> tmp;
|
| 1006 |
+
for(int jj = 0; jj < 8; jj++){
|
| 1007 |
+
#pragma HLS unroll
|
| 1008 |
+
for(int k = 0; k < 16; k++){
|
| 1009 |
+
#pragma HLS unroll
|
| 1010 |
+
tmp((jj*16+k)*8+7, (jj*16+k)*8) = ap_int<8>(acc_final[ii*8+jj][k] >> 13);
|
| 1011 |
+
}
|
| 1012 |
+
}
|
| 1013 |
+
fifo_O_out.write(tmp);
|
| 1014 |
+
}
|
| 1015 |
+
} else if (stage == 1) {
|
| 1016 |
+
for(int ii = 0; ii < 4; ii++){
|
| 1017 |
+
for(int jj = 0; jj < 2; jj++){
|
| 1018 |
+
ap_uint<256> tmp;
|
| 1019 |
+
for(int k = 0; k < 32; k++){
|
| 1020 |
+
#pragma HLS unroll
|
| 1021 |
+
tmp(k*8+7, k*8) = ap_int<8>(acc_final[ii*4+k/8][jj*8+k%8] >> 8);
|
| 1022 |
+
}
|
| 1023 |
+
fifo_to_acc0.write(tmp);
|
| 1024 |
+
}
|
| 1025 |
+
}
|
| 1026 |
+
} else {
|
| 1027 |
+
final_acc:
|
| 1028 |
+
for(int ii = 0; ii < 16; ii++){
|
| 1029 |
+
#pragma HLS pipeline II=1 style=stp
|
| 1030 |
+
#pragma HLS dependence variable=X type=inter false
|
| 1031 |
+
ap_uint<512> tmp_recv = fifo_reduce_recv.read();
|
| 1032 |
+
ap_uint<512> tmp_send;
|
| 1033 |
+
for(int k = 0; k < 16; k++){
|
| 1034 |
+
#pragma HLS unroll
|
| 1035 |
+
ap_int<32> tmp = acc_final[ii][k] + ap_int<24>(tmp_recv(k*32+23, k*32));
|
| 1036 |
+
if(stage == 3) tmp += ap_int<8>(X[i*16+ii][j*2+k/8]((k%8)*8+7, (k%8)*8));
|
| 1037 |
+
tmp_send(k*32+31, k*32) = tmp;
|
| 1038 |
+
}
|
| 1039 |
+
if(stage == 3) fifo_res_send.write(tmp_send);
|
| 1040 |
+
else fifo_ffn_out.write(tmp_send);
|
| 1041 |
+
}
|
| 1042 |
+
}
|
| 1043 |
+
}
|
| 1044 |
+
}
|
| 1045 |
+
}
|
| 1046 |
+
}
|
| 1047 |
+
|
| 1048 |
+
void residual(
|
| 1049 |
+
const int L,
|
| 1050 |
+
tapa::istream<ap_uint<512>>& fifo_res_in,
|
| 1051 |
+
tapa::ostream<ap_uint<512>>& fifo_res_out
|
| 1052 |
+
){
|
| 1053 |
+
for(int i = 0; i < (L >> 5); i++){
|
| 1054 |
+
for(int j = 0; j < D_div_16; j++){
|
| 1055 |
+
ap_uint<32> res_buffer[16][16];
|
| 1056 |
+
#pragma HLS array_partition variable=res_buffer complete dim=1
|
| 1057 |
+
#pragma HLS array_partition variable=res_buffer complete dim=2
|
| 1058 |
+
|
| 1059 |
+
read:
|
| 1060 |
+
for(int k = 0; k < 16;){
|
| 1061 |
+
#pragma HLS pipeline II=1 style=stp
|
| 1062 |
+
ap_uint<512> tmp;
|
| 1063 |
+
bool success = fifo_res_in.try_read(tmp);
|
| 1064 |
+
if(success){
|
| 1065 |
+
for(int l = 0; l < 16; l++){
|
| 1066 |
+
#pragma HLS unroll
|
| 1067 |
+
res_buffer[k][l] = ap_uint<32>(tmp(l*32+31, l*32));
|
| 1068 |
+
}
|
| 1069 |
+
k++;
|
| 1070 |
+
}
|
| 1071 |
+
}
|
| 1072 |
+
transpose:
|
| 1073 |
+
for(int k = 0; k < 16; k++){
|
| 1074 |
+
#pragma HLS pipeline II=1 style=stp
|
| 1075 |
+
ap_uint<512> tmp;
|
| 1076 |
+
for(int l = 0; l < 16; l++){
|
| 1077 |
+
#pragma HLS unroll
|
| 1078 |
+
tmp(l*32+31, l*32) = ap_uint<32>(res_buffer[l][k]);
|
| 1079 |
+
}
|
| 1080 |
+
fifo_res_out.write(tmp);
|
| 1081 |
+
}
|
| 1082 |
+
}
|
| 1083 |
+
}
|
| 1084 |
+
}
|
| 1085 |
+
|
| 1086 |
+
|
| 1087 |
+
void temporal_acc1(
|
| 1088 |
+
tapa::istream<ConfigInst>& fifo_inst_in,
|
| 1089 |
+
tapa::ostream<ConfigInst>& fifo_inst_out,
|
| 1090 |
+
tapa::ostream<int>& fifo_len_context,
|
| 1091 |
+
tapa::istream<ap_uint<1024>>& fifo_X_in,
|
| 1092 |
+
tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
|
| 1093 |
+
tapa::istream<ap_uint<512>>& fifo_W_in,
|
| 1094 |
+
tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
|
| 1095 |
+
tapa::ostream<ap_uint<256>>& fifo_to_acc0,
|
| 1096 |
+
tapa::istream<ap_uint<128>>& fifo_from_sfu,
|
| 1097 |
+
tapa::ostream<ap_uint<1024>>& fifo_O_out,
|
| 1098 |
+
tapa::istream<ap_uint<1024>>& fifo_context,
|
| 1099 |
+
tapa::istream<ap_uint<512>>& fifo_reduce_recv,
|
| 1100 |
+
tapa::ostream<ap_uint<512>>& fifo_reduce_send,
|
| 1101 |
+
tapa::istream<ap_uint<1024>>& fifo_gelu_in
|
| 1102 |
+
){
|
| 1103 |
+
|
| 1104 |
+
ap_uint<64> scratchpad[MAX_SEQ_LEN_div_8][D_head]; // 8 bit
|
| 1105 |
+
#pragma HLS array_partition variable=scratchpad cyclic dim=1 factor=2
|
| 1106 |
+
#pragma HLS array_partition variable=scratchpad cyclic dim=2 factor=16
|
| 1107 |
+
#pragma HLS bind_storage variable=scratchpad type=ram_2p impl=bram
|
| 1108 |
+
|
| 1109 |
+
// ap_uint<64> scratchpad_out[MAX_SEQ_LEN][D_head_div_8];
|
| 1110 |
+
// #pragma HLS array_partition variable=scratchpad_out cyclic dim=1 factor=16
|
| 1111 |
+
// #pragma HLS array_partition variable=scratchpad_out cyclic dim=2 factor=2
|
| 1112 |
+
|
| 1113 |
+
ConfigInst len = fifo_inst_in.read();
|
| 1114 |
+
const int L = len.weight_bound;
|
| 1115 |
+
fifo_inst_out.write(len);
|
| 1116 |
+
fifo_len_context.write(L);
|
| 1117 |
+
|
| 1118 |
+
for(int stage_i = 0; stage_i < 17; stage_i++){
|
| 1119 |
+
|
| 1120 |
+
// stage 0: WvX
|
| 1121 |
+
// stage 1: WkX1 -> acc0
|
| 1122 |
+
// stage 2: Softmax(QK)V <- acc0
|
| 1123 |
+
// stage 3: WoO
|
| 1124 |
+
|
| 1125 |
+
ap_uint<64> W[D_div_2][D_ffn_SLR_div_8]; // 4 bit
|
| 1126 |
+
#pragma HLS array_partition variable=W cyclic dim=1 factor=8
|
| 1127 |
+
#pragma HLS bind_storage variable=W type=ram_2p impl=uram
|
| 1128 |
+
|
| 1129 |
+
ConfigInst inst = fifo_inst_in.read();
|
| 1130 |
+
fifo_inst_out.write(inst);
|
| 1131 |
+
|
| 1132 |
+
const ap_uint<3> stage = inst.stage;
|
| 1133 |
+
|
| 1134 |
+
// load weights and forward
|
| 1135 |
+
if(stage != 2) {
|
| 1136 |
+
const int weight_bound = inst.weight_bound;
|
| 1137 |
+
int sub_bound = D_div_8;
|
| 1138 |
+
if (stage == 4) sub_bound = D_ffn_SLR_div_8;
|
| 1139 |
+
for(int i = 0; i < weight_bound; i++){
|
| 1140 |
+
load_weight:
|
| 1141 |
+
for(int j = 0; j < sub_bound;){
|
| 1142 |
+
if(!fifo_W_in.empty()){
|
| 1143 |
+
ap_uint<512> val; fifo_W_in.try_read(val);
|
| 1144 |
+
|
| 1145 |
+
for(int k = 0; k < 2; k++){
|
| 1146 |
+
#pragma HLS unroll
|
| 1147 |
+
W[i*2+k][j] = ap_uint<64>(val(k*64+63, k*64));
|
| 1148 |
+
}
|
| 1149 |
+
val = ap_uint<512>(val >> 128);
|
| 1150 |
+
fifo_W_out.write(val);
|
| 1151 |
+
j++;
|
| 1152 |
+
}
|
| 1153 |
+
}
|
| 1154 |
+
}
|
| 1155 |
+
}
|
| 1156 |
+
|
| 1157 |
+
const int i_bound = inst.i_bound;
|
| 1158 |
+
const int j_bound = inst.j_bound;
|
| 1159 |
+
|
| 1160 |
+
for(int i = 0; i < i_bound; i++){ // make sure L is multiple of 4
|
| 1161 |
+
|
| 1162 |
+
const int k_bound = (stage == 2) ? ap_uint<8>((i+1)*2) : inst.k_bound;
|
| 1163 |
+
|
| 1164 |
+
ap_uint<32> cache_attn[MAX_SEQ_LEN_div_8][16];
|
| 1165 |
+
#pragma HLS array_partition variable=cache_attn dim=2 complete
|
| 1166 |
+
#pragma HLS array_partition variable=cache_attn dim=1 cyclic factor=2
|
| 1167 |
+
|
| 1168 |
+
if(stage == 2){
|
| 1169 |
+
for(int ii = 0; ii < ((i+1)*2); ii++){
|
| 1170 |
+
ap_uint<32> fuse_reg[16];
|
| 1171 |
+
load_attn:
|
| 1172 |
+
for(int offset = 0; offset < 8;){
|
| 1173 |
+
#pragma HLS pipeline II=1 style=stp
|
| 1174 |
+
if(!fifo_from_sfu.empty()){
|
| 1175 |
+
ap_uint<128> val; fifo_from_sfu.try_read(val);
|
| 1176 |
+
for(int k = 0; k < 16; k++){
|
| 1177 |
+
#pragma HLS unroll
|
| 1178 |
+
fuse_reg[k](offset*4+3, offset*4) = ap_int<8>(val(k*8+3, k*8));
|
| 1179 |
+
}
|
| 1180 |
+
offset++;
|
| 1181 |
+
}
|
| 1182 |
+
}
|
| 1183 |
+
for(int k = 0; k < 16; k++){
|
| 1184 |
+
#pragma HLS unroll
|
| 1185 |
+
cache_attn[ii][k] = fuse_reg[k];
|
| 1186 |
+
}
|
| 1187 |
+
}
|
| 1188 |
+
}
|
| 1189 |
+
|
| 1190 |
+
for(int j = 0; j < j_bound; j++){
|
| 1191 |
+
#pragma HLS loop_flatten off
|
| 1192 |
+
|
| 1193 |
+
ap_int<38> acc_vec[8][16][8];
|
| 1194 |
+
#pragma HLS array_partition variable=acc_vec dim=1 complete
|
| 1195 |
+
#pragma HLS array_partition variable=acc_vec dim=2 complete
|
| 1196 |
+
#pragma HLS array_partition variable=acc_vec dim=3 complete
|
| 1197 |
+
|
| 1198 |
+
for(int ii = 0; ii < 8; ii++){
|
| 1199 |
+
#pragma HLS unroll
|
| 1200 |
+
for(int kk = 0; kk < 16; kk++){
|
| 1201 |
+
#pragma HLS unroll
|
| 1202 |
+
for(int k = 0; k < 8; k++){
|
| 1203 |
+
#pragma HLS unroll
|
| 1204 |
+
acc_vec[ii][kk][k] = 0;
|
| 1205 |
+
}
|
| 1206 |
+
}
|
| 1207 |
+
}
|
| 1208 |
+
compute:
|
| 1209 |
+
for(int k = 0; k < k_bound; k++){
|
| 1210 |
+
#pragma HLS pipeline II=1 style=stp
|
| 1211 |
+
|
| 1212 |
+
ap_uint<64> op1_mtx[16];
|
| 1213 |
+
ap_uint<64> op2_mtx[16];
|
| 1214 |
+
#pragma HLS array_partition variable=op1_mtx complete
|
| 1215 |
+
#pragma HLS array_partition variable=op2_mtx complete
|
| 1216 |
+
|
| 1217 |
+
ap_uint<1024> recv_pkt;
|
| 1218 |
+
|
| 1219 |
+
if(stage == 3) {
|
| 1220 |
+
recv_pkt = fifo_context.read();
|
| 1221 |
+
}else if(stage == 4) {
|
| 1222 |
+
recv_pkt = fifo_gelu_in.read();
|
| 1223 |
+
}else if(stage != 2) {
|
| 1224 |
+
recv_pkt = fifo_X_in.read();
|
| 1225 |
+
fifo_X_out.write(recv_pkt);
|
| 1226 |
+
}
|
| 1227 |
+
|
| 1228 |
+
for(int ii = 0; ii < 16; ii++){ //TODO: change logic
|
| 1229 |
+
#pragma HLS unroll
|
| 1230 |
+
if (stage != 2) {
|
| 1231 |
+
op1_mtx[ii] = ap_uint<64>(W[j*8+ii/2][k]((ii%2)*32+31, (ii%2)*32));
|
| 1232 |
+
op2_mtx[ii] = recv_pkt(ii*64+63, ii*64);
|
| 1233 |
+
} else {
|
| 1234 |
+
op1_mtx[ii] = ap_uint<64>(cache_attn[k][ii]);
|
| 1235 |
+
op2_mtx[ii] = scratchpad[k][j*16+ii];
|
| 1236 |
+
}
|
| 1237 |
+
}
|
| 1238 |
+
|
| 1239 |
+
for(int ii = 0; ii < 8; ii++){
|
| 1240 |
+
#pragma HLS unroll
|
| 1241 |
+
for(int kk = 0; kk < 16; kk++){
|
| 1242 |
+
#pragma HLS unroll
|
| 1243 |
+
for(int l = 0; l < 8; l++){
|
| 1244 |
+
#pragma HLS unroll
|
| 1245 |
+
ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
|
| 1246 |
+
op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
|
| 1247 |
+
op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
|
| 1248 |
+
op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
|
| 1249 |
+
ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
|
| 1250 |
+
acc_vec[ii][kk][l] += w_pack * op3;
|
| 1251 |
+
}
|
| 1252 |
+
}
|
| 1253 |
+
}
|
| 1254 |
+
}
|
| 1255 |
+
|
| 1256 |
+
ap_int<22> acc_final[16][16];
|
| 1257 |
+
#pragma HLS array_partition variable=acc_final dim=1 complete
|
| 1258 |
+
#pragma HLS array_partition variable=acc_final dim=2 complete
|
| 1259 |
+
|
| 1260 |
+
for(int ii = 0; ii < 16; ii++){
|
| 1261 |
+
#pragma HLS unroll
|
| 1262 |
+
for(int k = 0; k < 16; k++){
|
| 1263 |
+
#pragma HLS unroll
|
| 1264 |
+
acc_final[ii][k] = 0;
|
| 1265 |
+
}
|
| 1266 |
+
}
|
| 1267 |
+
|
| 1268 |
+
reduction:
|
| 1269 |
+
for(int kk = 0; kk < 8; kk++){
|
| 1270 |
+
for(int ii = 0; ii < 16; ii++){
|
| 1271 |
+
#pragma HLS unroll
|
| 1272 |
+
for(int k = 0; k < 8; k++){
|
| 1273 |
+
#pragma HLS unroll
|
| 1274 |
+
ap_int<19> res0; ap_int<19> res1;
|
| 1275 |
+
(res1, res0) = acc_vec[kk][ii][k];
|
| 1276 |
+
res1 = res1 + res0[18];
|
| 1277 |
+
acc_final[ii][k*2] += res0;
|
| 1278 |
+
acc_final[ii][k*2+1] += res1;
|
| 1279 |
+
}
|
| 1280 |
+
}
|
| 1281 |
+
}
|
| 1282 |
+
|
| 1283 |
+
if(stage == 0){
|
| 1284 |
+
for(int ii = 0; ii < 16; ii++){
|
| 1285 |
+
#pragma HLS unroll
|
| 1286 |
+
for(int k = 0; k < 16; k++){
|
| 1287 |
+
#pragma HLS unroll
|
| 1288 |
+
int offset = ii%8;
|
| 1289 |
+
scratchpad[i*2+ii/8][j*16+k](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k] >> 8);
|
| 1290 |
+
}
|
| 1291 |
+
}
|
| 1292 |
+
} else if (stage == 2){
|
| 1293 |
+
for(int ii = 0; ii < 2; ii++){
|
| 1294 |
+
#pragma HLS pipeline II=1 style=stp
|
| 1295 |
+
ap_uint<1024> tmp;
|
| 1296 |
+
for(int jj = 0; jj < 8; jj++){
|
| 1297 |
+
#pragma HLS unroll
|
| 1298 |
+
for(int k = 0; k < 16; k++){
|
| 1299 |
+
#pragma HLS unroll
|
| 1300 |
+
tmp((jj*16+k)*8+7, (jj*16+k)*8) = ap_int<8>(acc_final[ii*8+jj][k] >> 13);
|
| 1301 |
+
}
|
| 1302 |
+
}
|
| 1303 |
+
fifo_O_out.write(tmp);
|
| 1304 |
+
}
|
| 1305 |
+
} else if (stage == 1){
|
| 1306 |
+
for(int ii = 0; ii < 4; ii++){
|
| 1307 |
+
for(int jj = 0; jj < 2; jj++){
|
| 1308 |
+
ap_uint<256> tmp;
|
| 1309 |
+
for(int k = 0; k < 32; k++){
|
| 1310 |
+
#pragma HLS unroll
|
| 1311 |
+
tmp(k*8+7, k*8) = ap_int<8>(acc_final[ii*4+k/8][jj*8+k%8] >> 8);
|
| 1312 |
+
}
|
| 1313 |
+
fifo_to_acc0.write(tmp);
|
| 1314 |
+
}
|
| 1315 |
+
}
|
| 1316 |
+
} else {
|
| 1317 |
+
final_acc:
|
| 1318 |
+
for(int ii = 0; ii < 16; ii++){
|
| 1319 |
+
#pragma HLS pipeline II=1 style=stp
|
| 1320 |
+
ap_uint<512> tmp_recv = fifo_reduce_recv.read();
|
| 1321 |
+
ap_uint<512> tmp;
|
| 1322 |
+
for(int k = 0; k < 16; k++){
|
| 1323 |
+
#pragma HLS unroll
|
| 1324 |
+
acc_final[ii][k] += ap_int<22>(tmp_recv(k*32+21, k*32));
|
| 1325 |
+
tmp(k*32+21, k*32) = acc_final[ii][k];
|
| 1326 |
+
}
|
| 1327 |
+
fifo_reduce_send.write(tmp);
|
| 1328 |
+
}
|
| 1329 |
+
}
|
| 1330 |
+
}
|
| 1331 |
+
}
|
| 1332 |
+
}
|
| 1333 |
+
|
| 1334 |
+
// write out for debug
|
| 1335 |
+
// write:
|
| 1336 |
+
// for(int i = 0; i < L; i++){
|
| 1337 |
+
// for(int j = 0; j < D_head_div_8; j++){
|
| 1338 |
+
// #pragma HLS pipeline II=1 style=stp
|
| 1339 |
+
// fifo_O_out.write(scratchpad_out[i][j]);
|
| 1340 |
+
// }
|
| 1341 |
+
// }
|
| 1342 |
+
}
|
| 1343 |
+
|
| 1344 |
+
// Buffers one block of rows, emits their per-column sums, then replays the rows.
//
// fifo_inst     : first word is L; afterwards one row count (hidden_bound) is
//                 read per block.
// fifo_data_in  : 512-bit words, each unpacked as 16 lanes of 32-bit float bits.
// fifo_data_out : per block, one word holding the 16 column sums followed by
//                 the hidden_bound buffered rows in arrival order.
//
// Runs 5 stages of (L >> 5) blocks each.
// NOTE(review): cache has MAX_SEQ_LEN rows; assumes hidden_bound <= MAX_SEQ_LEN
// -- confirm against the producer of fifo_inst.
void sfu_buffer( // double buffering
    tapa::istream<int>& fifo_inst,
    tapa::istream<ap_uint<512>>& fifo_data_in,
    tapa::ostream<ap_uint<512>>& fifo_data_out
){
    const int L = fifo_inst.read();

    for(int stage = 0; stage < 5; stage++){
        for(int blk = 0; blk < (L >> 5); blk++){
            float sum[8][16];
            float cache[MAX_SEQ_LEN][16];
            #pragma HLS array_partition variable=cache dim=2 complete
            #pragma HLS array_partition variable=sum dim=2 complete

            const int hidden_bound = fifo_inst.read();

            // Clear the 8 interleaved partial-sum lanes.
            for(int lane = 0; lane < 8; lane++){
                for(int col = 0; col < 16; col++){
                #pragma HLS unroll
                    sum[lane][col] = 0.0;
                }
            }

            // Row r accumulates into lane r % 8, so consecutive iterations hit
            // different accumulators (dependence distance 8, see pragmas).
        acc:
            for(int row = 0; row < hidden_bound; row++){
            #pragma HLS pipeline II=1 style=stp
            #pragma HLS dependence false variable=sum
            #pragma HLS dependence true variable=sum distance=8
                ap_uint<512> in_word = fifo_data_in.read();
                const int lane = row % 8;
                for(int col = 0; col < 16; col++){
                #pragma HLS unroll
                    float val = tapa::bit_cast<float>(ap_int<32>(in_word(col*32+31, col*32)));
                    sum[lane][col] += val;
                    cache[row][col] = val;
                }
            }

            // Fold lanes 1..7 into lane 0, two columns per pipelined iteration.
        reduce:
            for(int lane = 1; lane < 8; lane++){
                for(int pair = 0; pair < 8; pair++){
                #pragma HLS pipeline II=1 style=stp
                #pragma HLS dependence true variable=sum distance=8
                    for(int half = 0; half < 2; half++){
                        const int col = pair*2 + half;
                        sum[0][col] += sum[lane][col];
                    }
                }
            }

            // The column sums go out first ...
            ap_uint<512> sum_word;
            for(int col = 0; col < 16; col++){
            #pragma HLS unroll
                sum_word(col*32+31, col*32) = tapa::bit_cast<ap_uint<32>>(sum[0][col]);
            }
            fifo_data_out.write(sum_word);

            // ... followed by the buffered rows.
        write:
            for(int row = 0; row < hidden_bound; row++){
            #pragma HLS pipeline II=1 style=stp
                ap_uint<512> row_word;
                for(int col = 0; col < 16; col++){
                #pragma HLS unroll
                    row_word(col*32+31, col*32) = tapa::bit_cast<ap_uint<32>>(cache[row][col]);
                }
                fifo_data_out.write(row_word);
            }
        }
    }
}
|
| 1414 |
+
|
| 1415 |
+
// Row buffer with three selectable inputs; runs 7 stages of (L >> 5) blocks.
//
// fifo_inst        : first word is L; for stages 0-4 one row count
//                    (hidden_bound) is read per block. Stages 5-6 use D rows.
// fifo_data_in_exp : input for stages 0-4.
// fifo_data_in_ln  : input for stage 5.
// fifo_data_in_ffn : input for stage 6.
// fifo_data_out    : per block, the 16 column sums; in stages 5-6 a second
//                    word with the `var` accumulators; then the buffered rows.
//
// Every input word is unpacked as 16 lanes of 32-bit float bits.
//
// Fix over the previous revision: the accumulate loop assigned (`=`) into
// sum/var instead of accumulating (`+=`), so only the last 8 rows contributed
// to the emitted totals. It now accumulates like sfu_buffer does, and `var` is
// only touched in the stages (>= 5) where it is reduced and emitted.
//
// NOTE(review): `var` accumulates the same raw values as `sum`; if a sum of
// squares is intended for a variance, confirm with the consumer.
// NOTE(review): cache has MAX_SEQ_LEN rows but is indexed up to D in stages
// 5-6; assumes D <= MAX_SEQ_LEN -- confirm.
void sfu_buffer_slr0( // double buffering
    tapa::istream<int>& fifo_inst,
    tapa::istream<ap_uint<512>>& fifo_data_in_exp,
    tapa::istream<ap_uint<512>>& fifo_data_in_ln,
    tapa::istream<ap_uint<512>>& fifo_data_in_ffn,
    tapa::ostream<ap_uint<512>>& fifo_data_out
){
    const int L = fifo_inst.read();
    for(int stage = 0; stage < 7; stage++){

        int hidden_bound = D;

        for(int l = 0; l < (L >> 5); l++){
            float sum[8][16];
            float var[8][16];
            float cache[MAX_SEQ_LEN][16];
            #pragma HLS array_partition variable=cache dim=2 complete
            #pragma HLS array_partition variable=sum dim=2 complete
            #pragma HLS array_partition variable=var dim=2 complete

            // Only stages 0-4 receive a per-block row count.
            if(stage < 5) hidden_bound = fifo_inst.read();

            for(int i = 0; i < 8; i++){
                for(int j = 0; j < 16; j++){
                #pragma HLS unroll
                    sum[i][j] = 0.0;
                    var[i][j] = 0.0;
                }
            }

            // Row i accumulates into lane i % 8, so the same accumulator is
            // revisited every 8 iterations (dependence distance 8).
        acc:
            for(int i = 0; i < hidden_bound; i++){
            #pragma HLS pipeline II=1 style=stp
            #pragma HLS dependence false variable=sum
            #pragma HLS dependence true variable=sum distance=8
            #pragma HLS dependence false variable=var
            #pragma HLS dependence true variable=var distance=8

                ap_uint<512> tmp;
                if(stage < 5) {
                    tmp = fifo_data_in_exp.read();
                } else if(stage == 5){
                    tmp = fifo_data_in_ln.read();
                } else {
                    tmp = fifo_data_in_ffn.read();
                }

                for(int k = 0; k < 16; k++){
                #pragma HLS unroll
                    float res = tapa::bit_cast<float>(ap_int<32>(tmp(k*32+31, k*32)));
                    sum[i%8][k] += res;
                    if(stage >= 5) var[i%8][k] += res;
                    cache[i][k] = res;
                }
            }

            // Fold lanes 1..7 into lane 0, two columns per pipelined iteration.
        reduce:
            for(int i = 1; i < 8; i++){
                for(int j = 0; j < 8; j++){
                #pragma HLS pipeline II=1 style=stp
                #pragma HLS dependence true variable=sum distance=8
                #pragma HLS dependence true variable=var distance=8
                    for(int k = 0; k < 2; k++){
                        sum[0][j*2+k] += sum[i][j*2+k];
                        if(stage >= 5) var[0][j*2+k] += var[i][j*2+k];
                    }
                }
            }

            ap_uint<512> tmp;
            ap_uint<512> tmp_var;
            for(int i = 0; i < 16; i++){
            #pragma HLS unroll
                tmp(i*32+31, i*32) = tapa::bit_cast<ap_uint<32>>(sum[0][i]);
                if(stage >= 5) tmp_var(i*32+31, i*32) = tapa::bit_cast<ap_uint<32>>(var[0][i]);
            }
            fifo_data_out.write(tmp);
            if(stage >= 5) fifo_data_out.write(tmp_var);

            // Replay the buffered rows after the summary word(s).
        write:
            for(int i = 0; i < hidden_bound; i++){
            #pragma HLS pipeline II=1 style=stp
                ap_uint<512> tmp;
                for(int j = 0; j < 16; j++){
                #pragma HLS unroll
                    tmp(j*32+31, j*32) = tapa::bit_cast<ap_uint<32>>(cache[i][j]);
                }
                fifo_data_out.write(tmp);
            }
        }
    }
}
|
| 1505 |
+
|
| 1506 |
+
|
| 1507 |
+
// Applies hls::exp lane-wise and distributes blocks over two output buffers.
//
// fifo_inst     : supplies L, which is forwarded to both fifo_inst_out ports.
// fifo_data_in  : 512-bit words of 16 x 32-bit integers.
// fifo_buf      : block b's results go to port b % 2.
// fifo_inst_out : before block b, its row count (b + 1) << 4 is sent to
//                 port b % 2.
//
// Runs 5 stages of (L >> 4) blocks. Each lane is shifted right by 10 before
// the exp, and the float result is stored back as raw bits.
void sfu_acc_exp(
    tapa::istream<int>& fifo_inst,
    tapa::istream<ap_uint<512>>& fifo_data_in,
    tapa::ostreams<ap_uint<512>, 2>& fifo_buf,
    tapa::ostreams<int, 2>& fifo_inst_out
) {
    const int L = fifo_inst.read();
    fifo_inst_out[0].write(L);
    fifo_inst_out[1].write(L);

    for(int stage = 0; stage < 5; stage++){
        for(int blk = 0; blk < (L >> 4); blk++){
            fifo_inst_out[blk%2].write(((blk+1) << 4));

            // Non-blocking consume: the counter advances only when a word
            // was actually available.
        exp_acc:
            for(int done = 0; done < ((blk+1) << 4);){
            #pragma HLS pipeline II=1 style=stp
                if(!fifo_data_in.empty()){
                    ap_uint<512> in_word; fifo_data_in.try_read(in_word);
                    ap_uint<512> out_word;
                    for(int lane = 0; lane < 16; lane++){
                    #pragma HLS unroll
                        int raw = tapa::bit_cast<int>(ap_int<32>(in_word(lane*32+31, lane*32)));
                        float exp_val = 0.0;
                        exp_val = hls::exp(ap_int<32>(raw >> 10));
                        out_word(lane*32+31, lane*32) = tapa::bit_cast<ap_uint<32>>(exp_val);
                    }
                    fifo_buf[blk%2].write(out_word);
                    done++;
                }
            }
        }
    }
}
|
| 1541 |
+
|
| 1542 |
+
// Table-based GELU approximation on 16 lanes, narrowing each lane to 8 bits.
//
// fifo_inst     : supplies L, which is forwarded on fifo_inst_out.
// fifo_ffn      : 512-bit words of 16 x 32-bit integers.
// fifo_out      : 128-bit words of 16 x 8-bit results.
//
// Consumes D_ffn_SLR words for each of the (L >> 4) blocks.
void sfu_gelu(
    tapa::istream<int>& fifo_inst,
    tapa::ostream<int>& fifo_inst_out,
    tapa::istream<ap_uint<512>>& fifo_ffn,
    tapa::ostream<ap_uint<128>>& fifo_out
){
    const int L = fifo_inst.read();
    fifo_inst_out.write(L);

    for(int blk = 0; blk < (L >> 4); blk++){
        // Non-blocking consume: the counter advances only on a successful read.
        for(int done = 0; done < D_ffn_SLR;){
            if(!fifo_ffn.empty()){
                ap_uint<512> in_word; fifo_ffn.try_read(in_word);
                ap_uint<128> out_word;
                for(int lane = 0; lane < 16; lane++){
                    // table based approximation
                    float val = (float) tapa::bit_cast<int>(ap_int<32>(in_word(lane*32+31, lane*32)));
                    float outp_data = 0.0;
                    // Same buckets as an ascending "<" chain, walked from the
                    // top: [0, inf) passes through, below -2 maps to 0.
                    if(val >= 0)
                        outp_data = val;
                    else if(val >= -0.5)
                        outp_data = -0.10153;
                    else if(val >= -1)
                        outp_data = -0.15383;
                    else if(val >= -1.5)
                        outp_data = -0.15743;
                    else if(val >= -2)
                        outp_data = -0.09754;
                    else
                        outp_data = 0;
                    // Truncate to int, drop the low 8 bits, keep 8 bits.
                    out_word(lane*8+7, lane*8) = ap_int<8>((int) (outp_data) >> 8);
                }
                fifo_out.write(out_word);
                done++;
            }
        }
    }
}
|
| 1580 |
+
|
| 1581 |
+
// Packs eight 128-bit words into one 1024-bit packet and replays each block.
//
// fifo_inst : supplies L.
// fifo_in   : 128-bit words of 16 x 8-bit lanes.
// fifo_out  : 1024-bit packets.
//
// For each of the (L >> 4) blocks, D_ffn_SLR_div_8 packets are assembled and
// sent once, then the whole block is re-sent D_div_16*2 - 1 more times from a
// local cache.
void data_packing(
    tapa::istream<int>& fifo_inst,
    tapa::istream<ap_uint<128>>& fifo_in,
    tapa::ostream<ap_uint<1024>>& fifo_out
){
    const int L = fifo_inst.read();

    for(int blk = 0; blk < (L >> 4); blk++){
        ap_uint<1024> cache[D_ffn_SLR_div_8];

        for(int pkt = 0; pkt < D_ffn_SLR_div_8; pkt++){
            ap_uint<64> fuse_reg[16];
            ap_uint<1024> send_pkt;
            #pragma HLS array_partition variable=fuse_reg complete

            // Byte `lane` of the beat-th input word lands in byte `beat` of
            // fuse_reg[lane]. The counter advances only on a successful read.
            for(int beat = 0; beat < 8;){
            #pragma HLS pipeline II=1
                if(!fifo_in.empty()){
                    ap_uint<128> in_word; fifo_in.try_read(in_word);
                    for(int lane = 0; lane < 16; lane++){
                    #pragma HLS unroll
                        fuse_reg[lane](beat*8+7, beat*8) = in_word(lane*8+7, lane*8);
                    }
                    beat++;
                }
            }

            // Concatenate the 16 registers into one packet.
            send_pkt = ap_uint<1024>((
                fuse_reg[0], fuse_reg[1], fuse_reg[2], fuse_reg[3], fuse_reg[4], fuse_reg[5], fuse_reg[6], fuse_reg[7],
                fuse_reg[8], fuse_reg[9], fuse_reg[10], fuse_reg[11], fuse_reg[12], fuse_reg[13], fuse_reg[14], fuse_reg[15]
            ));
            fifo_out.write(send_pkt);
            cache[pkt] = send_pkt;
        }

        // Replay the cached block for the remaining passes.
        for(int pass = 0; pass < D_div_16*2 - 1; pass++){
            for(int pkt = 0; pkt < D_ffn_SLR_div_8; pkt++){
            #pragma HLS pipeline II=1
                fifo_out.write(cache[pkt]);
            }
        }
    }
}
|
| 1622 |
+
|
| 1623 |
+
// Scales buffered rows by 32 / column-sum and narrows each lane to 8 bits.
//
// fifo_inst     : supplies L.
// fifo_buf      : block b is read from port b % 2. Its first word holds 16
//                 float column sums, followed by (b + 1) << 4 rows of 16
//                 floats.
// fifo_data_out : 128-bit words of 16 x 8-bit results.
//
// Runs 5 stages of (L >> 4) blocks.
void sfu_norm(
    tapa::istream<int>& fifo_inst,
    tapa::istreams<ap_uint<512>, 2>& fifo_buf,
    tapa::ostream<ap_uint<128>>& fifo_data_out
){
    const int L = fifo_inst.read();

    for(int stage = 0; stage < 5; stage++){
        for(int blk = 0; blk < (L >> 4); blk++){
            float scale[16];
            #pragma HLS array_partition variable=scale complete

            // The first word of a block carries the per-column sums.
            ap_uint<512> sum_word = fifo_buf[blk%2].read();

            for(int col = 0; col < 16; col++){
            #pragma HLS unroll factor=8
                scale[col] = 32.0 / tapa::bit_cast<float>(ap_uint<32>(sum_word(col*32+31, col*32)));
            }

            // Non-blocking consume: the counter advances only on a
            // successful read.
            for(int done = 0; done < ((blk+1) << 4);){
            #pragma HLS pipeline II=1 style=stp
                if(!fifo_buf[blk%2].empty()){
                    ap_uint<512> row_word; fifo_buf[blk%2].try_read(row_word);
                    ap_uint<128> out_word;
                    for(int col = 0; col < 16; col++){
                    #pragma HLS unroll
                        ap_int<8> q = (int) (tapa::bit_cast<float>(ap_uint<32>(row_word(col*32+31, col*32))) * scale[col]);
                        out_word(col*8 + 7, col*8) = q;
                    }
                    fifo_data_out.write(out_word);
                    done++;
                }
            }
        }
    }
}
|
| 1659 |
+
|
| 1660 |
+
// Post-buffer stage with three outputs; runs 7 stages of (L >> 4) blocks.
//
// fifo_inst     : supplies L.
// fifo_buf      : block b is read from port b % 2. Each block starts with one
//                 summary word (two in stages 5-6), followed by the rows.
// fifo_data_out : results of stages 0-4.
// fifo_data_off : results of stage 5.
// fifo_out      : results of stage 6.
//
// Rows per block: (b + 1) << 4 in stages 0-4, D in stages 5-6.
// Each lane's 32 bits are taken as an integer, added to a per-column offset,
// and truncated to 8 bits.
//
// NOTE(review): in stages 5-6 the second summary word (tmp_var) is read but
// never used, and var[] is loaded from the first word -- confirm that this is
// intentional.
void sfu_norm_slr0(
    tapa::istream<int>& fifo_inst,
    tapa::istreams<ap_uint<512>, 2>& fifo_buf,
    tapa::ostream<ap_uint<128>>& fifo_data_out,
    tapa::ostream<ap_uint<128>>& fifo_data_off,
    tapa::ostream<ap_uint<128>>& fifo_out
){
    const int L = fifo_inst.read();

    for(int stage = 0; stage < 7; stage++){
        const bool late_stage = (stage >= 5);

        for(int blk = 0; blk < (L >> 4); blk++){
            int sum[16];
            int var[16];
            #pragma HLS array_partition variable=sum complete
            #pragma HLS array_partition variable=var complete

            const int fifo_idx = blk%2;
            const int hidden_bound = late_stage ? D : ((blk+1) << 4);

            ap_uint<512> tmp_in = fifo_buf[fifo_idx].read();
            ap_uint<512> tmp_var;
            // Must still be consumed to keep the stream aligned.
            if(late_stage) tmp_var = fifo_buf[fifo_idx].read();

            for(int col = 0; col < 16; col++){
            #pragma HLS unroll
                if(late_stage){
                    var[col] = ap_uint<32>(tmp_in(col*32+31, col*32));
                } else {
                    sum[col] = ap_uint<32>(tmp_in(col*32+31, col*32)) * 2;
                }
            }

            // Non-blocking consume: the counter advances only on a
            // successful read.
            for(int done = 0; done < hidden_bound;){
            #pragma HLS pipeline II=1 style=stp
                if(!fifo_buf[fifo_idx].empty()){
                    ap_uint<512> row_word; fifo_buf[fifo_idx].try_read(row_word);
                    ap_uint<128> out_word;
                    for(int col = 0; col < 16; col++){
                    #pragma HLS unroll
                        int op1 = ap_uint<32>(row_word(col*32+31, col*32));
                        int op2;
                        if(late_stage){
                            op2 = var[col];
                        } else {
                            op2 = sum[col];
                        }
                        ap_int<8> res;
                        res = op1 + op2;
                        out_word(col*8 + 7, col*8) = res;
                    }
                    switch(stage){
                        case 5:
                            fifo_data_off.write(out_word);
                            break;
                        case 6:
                            fifo_out.write(out_word);
                            break;
                        default:
                            fifo_data_out.write(out_word);
                            break;
                    }
                    done++;
                }
            }
        }
    }
}
|
| 1728 |
+
|
| 1729 |
+
// Collects context packets into an on-chip table, then streams it to both
// accumulators.
//
// fifo_inst    : supplies L.
// fifo_context : 1024-bit packets of 16 x 64-bit entries.
// fifo_to_acc0 : replay of rows i*32 .. i*32+15 of each 32-row group.
// fifo_to_acc1 : replay of rows i*32+16 .. i*32+31 of each 32-row group.
//
// Fill: 5 stages; stage s writes columns s*D_head_div_8 .. (s+1)*D_head_div_8-1
// for every 16-row group.
// Replay: for each 32-row group the full CONTEXT_D columns are sent
// D_div_16 * 2 times.
void context_buffer(
    tapa::istream<int>& fifo_inst,
    tapa::istream<ap_uint<1024>>& fifo_context,
    tapa::ostream<ap_uint<1024>>& fifo_to_acc0,
    tapa::ostream<ap_uint<1024>>& fifo_to_acc1
){
    ap_uint<64> context[MAX_SEQ_LEN][CONTEXT_D];
    #pragma HLS array_partition variable=context cyclic dim=1 factor=32
    #pragma HLS bind_storage variable=context type=ram_2p impl=uram

    const int L = fifo_inst.read();

    // ---- fill phase ----
    for(int stage = 0; stage < 5; stage++){
        const int col_begin = stage * D_head_div_8;
        const int col_end = (stage + 1) * D_head_div_8;
        for(int grp = 0; grp < (L >> 4); grp++){
            // Non-blocking consume: the column advances only on a
            // successful read.
            for(int col = col_begin; col < col_end;){
                if(!fifo_context.empty()){
                    ap_uint<1024> pkt; fifo_context.try_read(pkt);
                    for(int r = 0; r < 16; r++){
                    #pragma HLS unroll
                        context[grp*16+r][col] = pkt(r*64+63, r*64);
                    }
                    col++;
                }
            }
        }
    }

    // NOTE: change it to write to HBM for debugging
    // ---- replay phase: write ops to acc0 and acc1 in parallel ----
    for(int grp = 0; grp < (L >> 5); grp++){
        for(int rep = 0; rep < D_div_16; rep++){
            for(int pass = 0; pass < 2; pass++){
                for(int col = 0; col < CONTEXT_D; col++){
                    ap_uint<1024> pkt_acc0;
                    ap_uint<1024> pkt_acc1;
                    for(int r = 0; r < 16; r++){
                    #pragma HLS unroll
                        pkt_acc0(r*64+63, r*64) = context[grp*32+r][col];
                        pkt_acc1(r*64+63, r*64) = context[grp*32+16+r][col];
                    }
                    fifo_to_acc0.write(pkt_acc0);
                    fifo_to_acc1.write(pkt_acc1);
                }
            }
        }
    }
}
|
| 1776 |
+
|
| 1777 |
+
void ffn_buffer(
|
| 1778 |
+
const int L,
|
| 1779 |
+
tapa::istream<ap_uint<128>>& fifo_ffn_in,
|
| 1780 |
+
tapa::ostream<ap_uint<1024>>& fifo_ffn_out,
|
| 1781 |
+
tapa::ostream<ap_uint<1024>>& fifo_ffn_res
|
| 1782 |
+
){
|
| 1783 |
+
ap_uint<64> X[MAX_SEQ_LEN][D_div_8]; // 8 bit
|
| 1784 |
+
#pragma HLS array_partition variable=X cyclic dim=1 factor=16
|
| 1785 |
+
#pragma HLS bind_storage variable=X type=ram_2p impl=uram
|
| 1786 |
+
|
| 1787 |
+
for(int i = 0; i < (L >> 4); i++){
|
| 1788 |
+
for(int j = 0; j < D_div_8; j++){
|
| 1789 |
+
ap_uint<64> fuse_reg[16];
|
| 1790 |
+
#pragma HLS array_partition variable=fuse_reg complete
|
| 1791 |
+
|
| 1792 |
+
for(int l = 0; l < 8;){
|
| 1793 |
+
#pragma HLS pipeline II=1
|
| 1794 |
+
|
| 1795 |
+
if(!fifo_ffn_in.empty()){
|
| 1796 |
+
ap_uint<128> tmp; fifo_ffn_in.try_read(tmp);
|
| 1797 |
+
for(int k = 0; k < 16; k++){
|
| 1798 |
+
#pragma HLS unroll
|
| 1799 |
+
fuse_reg[k](l*8+7, l*8) = tmp(k*8+7, k*8);
|
| 1800 |
+
}
|
| 1801 |
+
l++;
|
| 1802 |
+
}
|
| 1803 |
+
}
|
| 1804 |
+
for(int k = 0; k < 16; k++){
|
| 1805 |
+
#pragma HLS unroll
|
| 1806 |
+
X[i*16+k][j] = fuse_reg[k];
|
| 1807 |
+
}
|
| 1808 |
+
}
|
| 1809 |
+
}
|
| 1810 |
+
|
| 1811 |
+
for(int i = 0; i < (L >> 4); i++){
|
| 1812 |
+
for(int iter = 0; iter < (D_ffn_SLR >> 4); iter++){
|
| 1813 |
+
for(int it = 0; it < 2; it++){
|
| 1814 |
+
for(int j = 0; j < D_div_8; j++){
|
| 1815 |
+
ap_uint<1024> tmp;
|
| 1816 |
+
for(int k = 0; k < 16; k++){
|
| 1817 |
+
#pragma HLS unroll
|
| 1818 |
+
tmp(k*64+63, k*64) = X[i*16+k][j];
|
| 1819 |
+
}
|
| 1820 |
+
fifo_ffn_out.write(tmp);
|
| 1821 |
+
}
|
| 1822 |
+
}
|
| 1823 |
+
|
| 1824 |
+
if(iter < D_div_16){
|
| 1825 |
+
for(int j = 0; j < 2; j++){
|
| 1826 |
+
ap_uint<1024> send;
|
| 1827 |
+
for(int k = 0; k < 16; k++){
|
| 1828 |
+
#pragma HLS unroll
|
| 1829 |
+
send(k*64+63, k*64) = X[i*16+k][iter*2+j];
|
| 1830 |
+
}
|
| 1831 |
+
fifo_ffn_res.write(send);
|
| 1832 |
+
}
|
| 1833 |
+
}
|
| 1834 |
+
}
|
| 1835 |
+
}
|
| 1836 |
+
}
|
| 1837 |
+
|
| 1838 |
+
void ffn_residual(
|
| 1839 |
+
const int L,
|
| 1840 |
+
tapa::istream<ap_uint<1024>>& fifo_x,
|
| 1841 |
+
tapa::istream<ap_uint<512>>& fifo_in,
|
| 1842 |
+
tapa::ostreams<ap_uint<512>, 2>& fifo_out
|
| 1843 |
+
){
|
| 1844 |
+
for(int i = 0; i < (L >> 4); i++){
|
| 1845 |
+
for(int j = 0; j < D_div_8; j++){
|
| 1846 |
+
ap_uint<1024> tmp_x = fifo_x.read();
|
| 1847 |
+
for(int k = 0; k < 8;){
|
| 1848 |
+
if(!fifo_in.empty()){
|
| 1849 |
+
ap_uint<512> tmp; fifo_in.try_read(tmp);
|
| 1850 |
+
ap_uint<512> tmp_o;
|
| 1851 |
+
ap_uint<128> x = tmp_x(k*128+127, k*128);
|
| 1852 |
+
for(int l = 0; l < 16; l++){
|
| 1853 |
+
#pragma HLS unroll
|
| 1854 |
+
ap_int<22> a = tmp(l*32+31, l*32);
|
| 1855 |
+
ap_int<8> b = x(l*8+7, l*8);
|
| 1856 |
+
ap_int<22> res = a + b;
|
| 1857 |
+
tmp_o(l*32+31, l*32) = res;
|
| 1858 |
+
}
|
| 1859 |
+
fifo_out[i%2].write(tmp_o);
|
| 1860 |
+
k++;
|
| 1861 |
+
}
|
| 1862 |
+
}
|
| 1863 |
+
}
|
| 1864 |
+
}
|
| 1865 |
+
}
|
| 1866 |
+
|
| 1867 |
+
// Counts loop iterations until a token arrives on fifo_fin, then stores the
// count in cycle_count[0].
// NOTE(review): the count equals clock cycles only if the loop is scheduled
// at one iteration per cycle -- confirm in the HLS report.
void measure_cycle(tapa::istream<bool>& fifo_fin, tapa::mmap<int> cycle_count){
    for(int elapsed = 0;;elapsed++){
        // Keep counting while the finish token has not arrived.
        if(fifo_fin.empty()) continue;

        fifo_fin.read(nullptr);   // consume the token; its value is unused
        cycle_count[0] = elapsed;
        break;
    }
}
|
| 1876 |
+
|
| 1877 |
+
void opt_kernel(
|
| 1878 |
+
const int L,
|
| 1879 |
+
const int L_out,
|
| 1880 |
+
const int seq_len,
|
| 1881 |
+
// tapa::mmap<int> inst, // inst[0] = L, inst[1] = reload_weight
|
| 1882 |
+
tapa::mmap<ap_uint<512>> X_acc0,
|
| 1883 |
+
tapa::mmap<ap_uint<512>> X_acc1,
|
| 1884 |
+
tapa::mmap<ap_uint<512>> W_acc0,
|
| 1885 |
+
tapa::mmap<ap_uint<512>> W_acc1,
|
| 1886 |
+
tapa::mmap<ap_uint<128>> acc0_out,
|
| 1887 |
+
// tapa::mmap<ap_uint<64>> acc1_out,
|
| 1888 |
+
tapa::mmap<int> cycle_count
|
| 1889 |
+
){
|
| 1890 |
+
tapa::streams<ConfigInst, NUM_SLR+1, 4> fifo_inst_acc0("fifo_inst_acc0");
|
| 1891 |
+
tapa::streams<ConfigInst, NUM_SLR+1, 4> fifo_inst_acc1("fifo_inst_acc1");
|
| 1892 |
+
tapa::stream<ap_uint<512>, 16> fifo_X_acc0_slr0("fifo_X_acc0_slr0");
|
| 1893 |
+
tapa::stream<ap_uint<512>, 16> fifo_X_acc1_slr0("fifo_X_acc1_slr0");
|
| 1894 |
+
tapa::streams<ap_uint<1024>, NUM_SLR, 4> fifo_X_acc0("fifo_X_acc0");
|
| 1895 |
+
tapa::streams<ap_uint<1024>, NUM_SLR, 4> fifo_X_acc1("fifo_X_acc1");
|
| 1896 |
+
tapa::streams<ap_uint<512>, NUM_SLR+1, 8> fifo_W_acc0("fifo_W_acc0");
|
| 1897 |
+
tapa::streams<ap_uint<512>, NUM_SLR+1, 8> fifo_W_acc1("fifo_W_acc1");
|
| 1898 |
+
// tapa::streams<ap_uint<512>, NUM_SLR, 4> fifo_acc0_out("fifo_acc0_out");
|
| 1899 |
+
tapa::streams<ap_uint<512>, NUM_SLR, 16> fifo_acc0_to_sfu("fifo_acc0_to_sfu");
|
| 1900 |
+
tapa::streams<ap_uint<512>, NUM_SLR*2> fifo_sfu_buf_in("fifo_sfu_buf_in");
|
| 1901 |
+
tapa::streams<ap_uint<512>, NUM_SLR*2> fifo_sfu_buf_out("fifo_sfu_buf_out");
|
| 1902 |
+
// tapa::streams<ap_uint<64>, NUM_SLR> fifo_acc1_out("fifo_acc1_out");
|
| 1903 |
+
tapa::streams<ap_uint<256>, NUM_SLR, 8> fifo_from_acc1_to_acc0("fifo_from_acc1_to_acc0");
|
| 1904 |
+
tapa::streams<ap_uint<128>, NUM_SLR, 2> fifo_from_sfu_to_acc1("fifo_from_sfu_to_acc1");
|
| 1905 |
+
tapa::stream<bool> fifo_fin("fifo_fin");
|
| 1906 |
+
|
| 1907 |
+
tapa::streams<ap_uint<1024>, NUM_SLR> fifo_context("fifo_context");
|
| 1908 |
+
tapa::streams<ap_uint<1024>, NUM_SLR> fifo_cont_to_acc0("fifo_cont_to_acc0");
|
| 1909 |
+
tapa::streams<ap_uint<1024>, NUM_SLR> fifo_cont_to_acc1("fifo_cont_to_acc1");
|
| 1910 |
+
tapa::streams<ap_uint<512>, NUM_SLR> fifo_reduce_acc0("fifo_reduce_acc0");
|
| 1911 |
+
tapa::streams<ap_uint<512>, NUM_SLR> fifo_reduce_acc1("fifo_reduce_acc1");
|
| 1912 |
+
|
| 1913 |
+
// tapa::stream<ap_uint<128>> fifo_acc0_out("fifo_acc0_out");
|
| 1914 |
+
tapa::stream<ap_uint<128>> fifo_acc1_out("fifo_acc1_out");
|
| 1915 |
+
|
| 1916 |
+
tapa::stream<ap_uint<512>, 16> fifo_res_acc0("fifo_res_acc0");
|
| 1917 |
+
tapa::stream<ap_uint<512>, 16> fifo_res_acc1("fifo_res_acc1");
|
| 1918 |
+
tapa::stream<ap_uint<512>, D> fifo_ln_acc0("fifo_ln_acc0");
|
| 1919 |
+
tapa::stream<ap_uint<512>, D> fifo_ln_acc1("fifo_ln_acc1");
|
| 1920 |
+
|
| 1921 |
+
tapa::stream<ap_uint<128>> fifo_ffn_buffer_in("fifo_ffn_buffer_in");
|
| 1922 |
+
tapa::stream<ap_uint<1024>> fifo_ffn_buffer_out("fifo_ffn_buffer_out");
|
| 1923 |
+
|
| 1924 |
+
tapa::streams<ap_uint<512>, NUM_SLR, 16> fifo_gelu_in("fifo_gelu_in");
|
| 1925 |
+
tapa::streams<ap_uint<128>, NUM_SLR, D> fifo_gelu_out("fifo_gelu_out");
|
| 1926 |
+
tapa::streams<ap_uint<1024>, NUM_SLR> fifo_gelu_full("fifo_gelu_full");
|
| 1927 |
+
|
| 1928 |
+
tapa::stream<ap_uint<512>, 8> fifo_ffn2("fifo_ffn2");
|
| 1929 |
+
tapa::stream<ap_uint<1024>, D_div_8+2> fifo_skip_x("fifo_skip_x");
|
| 1930 |
+
tapa::streams<ap_uint<512>, 2> fifo_res2("fifo_res2");
|
| 1931 |
+
|
| 1932 |
+
tapa::streams<int, NUM_SLR> fifo_inst_switch_acc0("fifo_inst_switch_acc0");
|
| 1933 |
+
tapa::streams<int, NUM_SLR> fifo_inst_switch_acc1("fifo_inst_switch_acc1");
|
| 1934 |
+
tapa::streams<int, NUM_SLR> fifo_inst_switch_sfu("fifo_inst_switch_sfu");
|
| 1935 |
+
tapa::streams<int, NUM_SLR> fifo_inst_switch_context("fifo_inst_switch_context");
|
| 1936 |
+
tapa::streams<int, NUM_SLR> fifo_inst_switch_gelu("fifo_inst_switch_gelu");
|
| 1937 |
+
tapa::streams<int, NUM_SLR*2> fifo_inst_sfu_buffer("fifo_inst_sfu_buffer");
|
| 1938 |
+
tapa::streams<int, NUM_SLR> fifo_inst_data_pack("fifo_inst_data_pack");
|
| 1939 |
+
tapa::streams<int, NUM_SLR> fifo_inst_norm("fifo_inst_norm");
|
| 1940 |
+
|
| 1941 |
+
tapa::task()
|
| 1942 |
+
.invoke<tapa::join>(read_inst, seq_len, fifo_inst_acc0, fifo_inst_acc1)
|
| 1943 |
+
.invoke<tapa::join>(read_W, W_acc0, fifo_W_acc0)
|
| 1944 |
+
.invoke<tapa::join>(read_W, W_acc1, fifo_W_acc1)
|
| 1945 |
+
.invoke<tapa::join>(read_X, L, X_acc0, fifo_X_acc0_slr0)
|
| 1946 |
+
.invoke<tapa::join>(read_X, L, X_acc1, fifo_X_acc1_slr0)
|
| 1947 |
+
.invoke<tapa::join>(
|
| 1948 |
+
temporal_acc0_slr0,
|
| 1949 |
+
fifo_inst_acc0, fifo_inst_acc0,
|
| 1950 |
+
fifo_inst_switch_acc0,
|
| 1951 |
+
fifo_X_acc0_slr0, fifo_X_acc0,
|
| 1952 |
+
fifo_W_acc0, fifo_W_acc0,
|
| 1953 |
+
fifo_from_acc1_to_acc0,
|
| 1954 |
+
fifo_acc0_to_sfu,
|
| 1955 |
+
fifo_gelu_in,
|
| 1956 |
+
fifo_cont_to_acc0,
|
| 1957 |
+
fifo_ffn_buffer_out,
|
| 1958 |
+
fifo_reduce_acc0,
|
| 1959 |
+
fifo_res_acc0
|
| 1960 |
+
// fifo_fin
|
| 1961 |
+
)
|
| 1962 |
+
.invoke<tapa::join>(
|
| 1963 |
+
temporal_acc1_slr0,
|
| 1964 |
+
fifo_inst_acc1, fifo_inst_acc1,
|
| 1965 |
+
fifo_inst_switch_acc1,
|
| 1966 |
+
fifo_X_acc1_slr0, fifo_X_acc1,
|
| 1967 |
+
fifo_W_acc1, fifo_W_acc1,
|
| 1968 |
+
fifo_from_acc1_to_acc0,
|
| 1969 |
+
fifo_from_sfu_to_acc1,
|
| 1970 |
+
fifo_context,
|
| 1971 |
+
fifo_cont_to_acc1,
|
| 1972 |
+
fifo_reduce_acc1,
|
| 1973 |
+
fifo_res_acc1,
|
| 1974 |
+
fifo_gelu_full,
|
| 1975 |
+
fifo_ffn2
|
| 1976 |
+
// fifo_fin
|
| 1977 |
+
)
|
| 1978 |
+
.invoke<tapa::join>(
|
| 1979 |
+
residual, seq_len,
|
| 1980 |
+
fifo_res_acc0,
|
| 1981 |
+
fifo_ln_acc0
|
| 1982 |
+
)
|
| 1983 |
+
.invoke<tapa::join>(
|
| 1984 |
+
residual, seq_len,
|
| 1985 |
+
fifo_res_acc1,
|
| 1986 |
+
fifo_ln_acc1
|
| 1987 |
+
)
|
| 1988 |
+
.invoke<tapa::join, NUM_SLR-1>(
|
| 1989 |
+
temporal_acc0,
|
| 1990 |
+
fifo_inst_acc0, fifo_inst_acc0,
|
| 1991 |
+
fifo_inst_switch_acc0,
|
| 1992 |
+
fifo_X_acc0, fifo_X_acc0,
|
| 1993 |
+
fifo_W_acc0, fifo_W_acc0,
|
| 1994 |
+
fifo_from_acc1_to_acc0,
|
| 1995 |
+
fifo_acc0_to_sfu,
|
| 1996 |
+
fifo_cont_to_acc0,
|
| 1997 |
+
fifo_gelu_in,
|
| 1998 |
+
fifo_reduce_acc0, fifo_reduce_acc0
|
| 1999 |
+
)
|
| 2000 |
+
.invoke<tapa::join, NUM_SLR-1>(
|
| 2001 |
+
temporal_acc1,
|
| 2002 |
+
fifo_inst_acc1, fifo_inst_acc1,
|
| 2003 |
+
fifo_inst_switch_acc1,
|
| 2004 |
+
fifo_X_acc1, fifo_X_acc1,
|
| 2005 |
+
fifo_W_acc1, fifo_W_acc1,
|
| 2006 |
+
fifo_from_acc1_to_acc0,
|
| 2007 |
+
fifo_from_sfu_to_acc1,
|
| 2008 |
+
fifo_context,
|
| 2009 |
+
fifo_cont_to_acc1,
|
| 2010 |
+
fifo_reduce_acc1, fifo_reduce_acc1,
|
| 2011 |
+
fifo_gelu_full
|
| 2012 |
+
)
|
| 2013 |
+
.invoke<tapa::join, NUM_SLR>(packet_switch_acc, fifo_inst_switch_acc0, fifo_inst_switch_sfu, fifo_inst_switch_gelu)
|
| 2014 |
+
.invoke<tapa::join, NUM_SLR>(packet_switch_acc, fifo_inst_switch_acc1, fifo_inst_switch_context, fifo_inst_norm)
|
| 2015 |
+
.invoke<tapa::join>(write_zero, seq_len, D_write_zero_acc0, fifo_reduce_acc0)
|
| 2016 |
+
.invoke<tapa::join>(write_zero, seq_len, D_write_zero_acc1, fifo_reduce_acc1)
|
| 2017 |
+
.invoke<tapa::join, NUM_SLR>(
|
| 2018 |
+
sfu_acc_exp, fifo_inst_switch_sfu,
|
| 2019 |
+
fifo_acc0_to_sfu,
|
| 2020 |
+
fifo_sfu_buf_in,
|
| 2021 |
+
fifo_inst_sfu_buffer
|
| 2022 |
+
)
|
| 2023 |
+
.invoke<tapa::join>(
|
| 2024 |
+
sfu_buffer_slr0, fifo_inst_sfu_buffer,
|
| 2025 |
+
fifo_sfu_buf_in,
|
| 2026 |
+
fifo_ln_acc0,
|
| 2027 |
+
fifo_res2,
|
| 2028 |
+
fifo_sfu_buf_out
|
| 2029 |
+
)
|
| 2030 |
+
.invoke<tapa::join>(
|
| 2031 |
+
sfu_buffer_slr0, fifo_inst_sfu_buffer,
|
| 2032 |
+
fifo_sfu_buf_in,
|
| 2033 |
+
fifo_ln_acc1,
|
| 2034 |
+
fifo_res2,
|
| 2035 |
+
fifo_sfu_buf_out
|
| 2036 |
+
)
|
| 2037 |
+
.invoke<tapa::join, (NUM_SLR-1)*2>(
|
| 2038 |
+
sfu_buffer, fifo_inst_sfu_buffer,
|
| 2039 |
+
fifo_sfu_buf_in,
|
| 2040 |
+
fifo_sfu_buf_out
|
| 2041 |
+
)
|
| 2042 |
+
.invoke<tapa::join>(
|
| 2043 |
+
sfu_norm_slr0, fifo_inst_norm,
|
| 2044 |
+
fifo_sfu_buf_out,
|
| 2045 |
+
fifo_from_sfu_to_acc1,
|
| 2046 |
+
fifo_ffn_buffer_in,
|
| 2047 |
+
fifo_acc1_out
|
| 2048 |
+
)
|
| 2049 |
+
.invoke<tapa::join, NUM_SLR-1>(
|
| 2050 |
+
sfu_norm, fifo_inst_norm,
|
| 2051 |
+
fifo_sfu_buf_out,
|
| 2052 |
+
fifo_from_sfu_to_acc1
|
| 2053 |
+
)
|
| 2054 |
+
.invoke<tapa::join>(
|
| 2055 |
+
ffn_buffer, seq_len,
|
| 2056 |
+
fifo_ffn_buffer_in,
|
| 2057 |
+
fifo_ffn_buffer_out,
|
| 2058 |
+
fifo_skip_x
|
| 2059 |
+
)
|
| 2060 |
+
.invoke<tapa::join>(
|
| 2061 |
+
ffn_residual, seq_len,
|
| 2062 |
+
fifo_skip_x,
|
| 2063 |
+
fifo_ffn2,
|
| 2064 |
+
fifo_res2
|
| 2065 |
+
)
|
| 2066 |
+
.invoke<tapa::join, NUM_SLR>(
|
| 2067 |
+
context_buffer, fifo_inst_switch_context,
|
| 2068 |
+
fifo_context,
|
| 2069 |
+
fifo_cont_to_acc0, fifo_cont_to_acc1
|
| 2070 |
+
)
|
| 2071 |
+
.invoke<tapa::join, NUM_SLR>(
|
| 2072 |
+
sfu_gelu, fifo_inst_switch_gelu, fifo_inst_data_pack,
|
| 2073 |
+
fifo_gelu_in,
|
| 2074 |
+
fifo_gelu_out
|
| 2075 |
+
)
|
| 2076 |
+
.invoke<tapa::join, NUM_SLR>(
|
| 2077 |
+
data_packing, fifo_inst_data_pack,
|
| 2078 |
+
fifo_gelu_out,
|
| 2079 |
+
fifo_gelu_full
|
| 2080 |
+
)
|
| 2081 |
+
// .invoke<tapa::join, NUM_SLR>(write_attention, seq_len, acc0_out, fifo_acc0_out)
|
| 2082 |
+
.invoke<tapa::join>(write_mtx, L_out, acc0_out, fifo_acc1_out, fifo_fin)
|
| 2083 |
+
// .invoke<tapa::join>(write_mtx, L_out, acc1_out, fifo_acc1_out)
|
| 2084 |
+
.invoke<tapa::join>(measure_cycle, fifo_fin, cycle_count)
|
| 2085 |
+
.invoke<tapa::detach>(black_hole_inst, fifo_inst_acc0)
|
| 2086 |
+
.invoke<tapa::detach>(black_hole_inst, fifo_inst_acc1)
|
| 2087 |
+
.invoke<tapa::detach>(black_hole_ap_uint_1024, fifo_X_acc0)
|
| 2088 |
+
.invoke<tapa::detach>(black_hole_ap_uint_1024, fifo_X_acc1)
|
| 2089 |
+
.invoke<tapa::detach>(black_hole_ap_uint_512, fifo_W_acc0)
|
| 2090 |
+
.invoke<tapa::detach>(black_hole_ap_uint_512, fifo_W_acc1);
|
| 2091 |
+
}
|
gpt-2-medium/kernel-versal.cpp
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt-2-medium/kernel.cpp
ADDED
|
@@ -0,0 +1,1528 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <cmath>
|
| 2 |
+
#include <string>
|
| 3 |
+
#include <tapa.h>
|
| 4 |
+
#include <ap_int.h>
|
| 5 |
+
#include <hls_math.h>
|
| 6 |
+
|
| 7 |
+
// ---------------------------------------------------------------------------
// Compile-time sizes shared by every task in this kernel.
// NOTE(review): the values match the gpt-2-medium directory this file lives
// in (1024-wide hidden state, 16 heads, 4096-wide FFN) -- confirm before reuse.
// ---------------------------------------------------------------------------

// Model dimensions.
constexpr int D = 1024;
constexpr int D_ffn = 4096;
constexpr int N_head = 16;
constexpr int D_head = D / N_head;

// Sequence-length limits used to size on-chip buffers.
constexpr int MAX_SEQ_LEN = 1024;
constexpr int MAX_SEQ_LEN_div_2 = MAX_SEQ_LEN / 2;
constexpr int MAX_SEQ_LEN_div_8 = MAX_SEQ_LEN / 8;

// Device partitioning.
constexpr int NUM_SLR = 3;
constexpr int NUM_DUM_SLR = 4;
constexpr int TOTAL_PORT = NUM_SLR * 2;

// Pre-divided extents; loop bounds and array shapes are expressed with these.
constexpr int D_head_div_16 = D_head / 16;
constexpr int D_head_div_8 = D_head / 8;
constexpr int D_head_div_4 = D_head / 4;
constexpr int D_head_div_2 = D_head / 2;
constexpr int D_div_8 = D / 8;
constexpr int D_div_16 = D / 16;

// Weight buffer sizes (element counts).
constexpr int FFN_WEIGHT_SIZE = D * D_ffn;
constexpr int OUT_WEIGHT_SIZE = D * D_head * NUM_DUM_SLR * 5;
constexpr int WEIGHT_D = D * 2;
constexpr int QKV_WEIGHT_SIZE = D * D_head * NUM_DUM_SLR * 10; // multi-head attention
constexpr int TOTAL_WEIGHT_SIZE = OUT_WEIGHT_SIZE + QKV_WEIGHT_SIZE;

// Factor-of-5 extents (the stage schedule below repeats each pass type 5 times).
constexpr int CONTEXT_D = D_head_div_8 * 5;
constexpr int D_head_mul_5 = D_head * 5;
constexpr int D_write_zero = D / 32 * 5; // 512-bit zero words per sequence row in write_zero()
|
| 31 |
+
|
| 32 |
+
// Vector of 16 `int` lanes; element type of the stream drained by black_hole_int_v16().
using int_v16 = tapa::vec_t<int, 16>;
|
| 33 |
+
// Vector of 128 signed 4-bit lanes; element type of the stream drained by black_hole_w().
using int4_v128 = tapa::vec_t<ap_int<4>, 128>;
|
| 34 |
+
// Vector of 64 signed 8-bit lanes; element type of the stream drained by black_hole_x().
using int8_v64 = tapa::vec_t<ap_int<8>, 64>;
|
| 35 |
+
|
| 36 |
+
template <typename data_t>
|
| 37 |
+
inline void bh(tapa::istream<data_t> & q) {
|
| 38 |
+
#pragma HLS inline
|
| 39 |
+
for (;;) {
|
| 40 |
+
#pragma HLS pipeline II=1
|
| 41 |
+
data_t tmp; q.try_read(tmp);
|
| 42 |
+
}
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
// Sink for an `int` stream: forwards fifo_in to bh(), which discards every
// token it receives and never returns.
void black_hole_int(tapa::istream<int> & fifo_in) {
    bh<int>(fifo_in);
}
|
| 48 |
+
|
| 49 |
+
// Sink for an `int_v16` stream: forwards fifo_in to bh(), which discards
// every token it receives and never returns.
void black_hole_int_v16(tapa::istream<int_v16> & fifo_in) {
    bh<int_v16>(fifo_in);
}
|
| 52 |
+
|
| 53 |
+
// Sink for an `int8_v64` stream: forwards fifo_in to bh(), which discards
// every token it receives and never returns.
void black_hole_x(tapa::istream<int8_v64> & fifo_in) {
    bh<int8_v64>(fifo_in);
}
|
| 56 |
+
|
| 57 |
+
// Sink for an `int4_v128` stream: forwards fifo_in to bh(), which discards
// every token it receives and never returns.
void black_hole_w(tapa::istream<int4_v128> & fifo_in) {
    bh<int4_v128>(fifo_in);
}
|
| 60 |
+
|
| 61 |
+
// Sink for a 512-bit word stream: forwards fifo_in to bh(), which discards
// every token it receives and never returns.
void black_hole_ap_uint_512(tapa::istream<ap_uint<512>> & fifo_in) {
    bh<ap_uint<512>>(fifo_in);
}
|
| 64 |
+
|
| 65 |
+
// Sink for a 1024-bit word stream: forwards fifo_in to bh(), which discards
// every token it receives and never returns.
void black_hole_ap_uint_1024(tapa::istream<ap_uint<1024>> & fifo_in) {
    bh<ap_uint<1024>>(fifo_in);
}
|
| 68 |
+
|
| 69 |
+
void read_W(
|
| 70 |
+
const int N,
|
| 71 |
+
tapa::async_mmap<ap_uint<512>>& vec,
|
| 72 |
+
tapa::ostream<ap_uint<512>>& fifo_out
|
| 73 |
+
){
|
| 74 |
+
|
| 75 |
+
for(int i_req = 0, i_resp = 0; i_resp < (N >> 7);){
|
| 76 |
+
#pragma HLS pipeline II=1
|
| 77 |
+
if((i_req < (N >> 7)) & !vec.read_addr.full()){
|
| 78 |
+
vec.read_addr.write(i_req);
|
| 79 |
+
i_req++;
|
| 80 |
+
}
|
| 81 |
+
if(!vec.read_data.empty()){
|
| 82 |
+
ap_uint<512> tmp_o; vec.read_data.try_read(tmp_o);
|
| 83 |
+
fifo_out.write(tmp_o);
|
| 84 |
+
i_resp++;
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
void read_X(
|
| 90 |
+
const int N,
|
| 91 |
+
tapa::async_mmap<ap_uint<512>>& vec,
|
| 92 |
+
tapa::ostream<ap_uint<512>>& fifo_out
|
| 93 |
+
){
|
| 94 |
+
for(int i_req = 0, i_resp = 0; i_resp < (N >> 6);){
|
| 95 |
+
#pragma HLS pipeline II=1
|
| 96 |
+
if((i_req < (N >> 6)) & !vec.read_addr.full()){
|
| 97 |
+
vec.read_addr.write(i_req);
|
| 98 |
+
i_req++;
|
| 99 |
+
}
|
| 100 |
+
if(!vec.read_data.empty()){
|
| 101 |
+
ap_uint<512> tmp_o; vec.read_data.try_read(tmp_o);
|
| 102 |
+
fifo_out.write(tmp_o);
|
| 103 |
+
i_resp++;
|
| 104 |
+
}
|
| 105 |
+
}
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
void read_inst(
|
| 109 |
+
const int L,
|
| 110 |
+
tapa::ostream<int>& fifo_out_acc0,
|
| 111 |
+
tapa::ostream<int>& fifo_out_acc1
|
| 112 |
+
){
|
| 113 |
+
for(int stage_i = 0; stage_i < 20; stage_i++){
|
| 114 |
+
#pragma HLS pipeline II=1
|
| 115 |
+
|
| 116 |
+
const int stage = (stage_i < 15) ? (stage_i % 3) : 3;
|
| 117 |
+
|
| 118 |
+
if(stage == 3){
|
| 119 |
+
fifo_out_acc0.write(0);
|
| 120 |
+
fifo_out_acc1.write(0);
|
| 121 |
+
|
| 122 |
+
fifo_out_acc0.write(L/2);
|
| 123 |
+
fifo_out_acc1.write(L/2);
|
| 124 |
+
}
|
| 125 |
+
else if(stage != 1){
|
| 126 |
+
fifo_out_acc0.write(0);
|
| 127 |
+
fifo_out_acc1.write(0);
|
| 128 |
+
|
| 129 |
+
fifo_out_acc0.write(L);
|
| 130 |
+
fifo_out_acc1.write(L);
|
| 131 |
+
} else {
|
| 132 |
+
fifo_out_acc0.write(0);
|
| 133 |
+
fifo_out_acc0.write(L/2);
|
| 134 |
+
|
| 135 |
+
fifo_out_acc1.write(L/2);
|
| 136 |
+
fifo_out_acc1.write(L);
|
| 137 |
+
}
|
| 138 |
+
}
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
void write_mtx(
|
| 142 |
+
const int N,
|
| 143 |
+
tapa::async_mmap<ap_uint<64>>& output_mtx,
|
| 144 |
+
tapa::istream<ap_uint<64>>& fifo_in
|
| 145 |
+
){
|
| 146 |
+
|
| 147 |
+
for(int i_req = 0, i_resp = 0; i_resp < N;){
|
| 148 |
+
#pragma HLS pipeline II=1
|
| 149 |
+
if((i_req < N) & !fifo_in.empty() & !output_mtx.write_addr.full() & !output_mtx.write_data.full()){
|
| 150 |
+
output_mtx.write_addr.try_write(i_req);
|
| 151 |
+
ap_uint<64> tmp; fifo_in.try_read(tmp);
|
| 152 |
+
output_mtx.write_data.try_write(tmp);
|
| 153 |
+
++i_req;
|
| 154 |
+
}
|
| 155 |
+
if(!output_mtx.write_resp.empty()){
|
| 156 |
+
i_resp += unsigned(output_mtx.write_resp.read(nullptr))+1;
|
| 157 |
+
}
|
| 158 |
+
}
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
void write_zero(
|
| 162 |
+
const int L,
|
| 163 |
+
tapa::ostream<ap_uint<512>>& fifo_zero
|
| 164 |
+
){
|
| 165 |
+
for(int i = 0; i < L * D_write_zero;){
|
| 166 |
+
if(!fifo_zero.full()){
|
| 167 |
+
ap_uint<512> tmp = 0;
|
| 168 |
+
fifo_zero.try_write(tmp);
|
| 169 |
+
i++;
|
| 170 |
+
}
|
| 171 |
+
}
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
// acc0 slr0 master node
|
| 175 |
+
void temporal_acc0_slr0(
|
| 176 |
+
const int L,
|
| 177 |
+
tapa::istream<int>& fifo_len_in,
|
| 178 |
+
tapa::ostream<int>& fifo_len_out,
|
| 179 |
+
tapa::istream<ap_uint<512>>& fifo_X_in,
|
| 180 |
+
tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
|
| 181 |
+
tapa::istream<ap_uint<512>>& fifo_W_in,
|
| 182 |
+
tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
|
| 183 |
+
tapa::istream<ap_uint<128>>& fifo_from_acc1,
|
| 184 |
+
tapa::ostream<ap_uint<512>>& fifo_O_out,
|
| 185 |
+
tapa::istream<ap_uint<1024>>& fifo_context,
|
| 186 |
+
tapa::istream<ap_uint<512>>& fifo_reduce_recv,
|
| 187 |
+
tapa::ostream<ap_uint<64>>& fifo_write,
|
| 188 |
+
tapa::ostream<bool>& fifo_fin
|
| 189 |
+
){
|
| 190 |
+
|
| 191 |
+
ap_uint<64> scratchpad_q[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
|
| 192 |
+
#pragma HLS array_partition variable=scratchpad_q cyclic dim=1 factor=16
|
| 193 |
+
#pragma HLS array_partition variable=scratchpad_q cyclic dim=2 factor=2
|
| 194 |
+
|
| 195 |
+
ap_uint<64> scratchpad_k[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
|
| 196 |
+
#pragma HLS array_partition variable=scratchpad_k cyclic dim=1 factor=16
|
| 197 |
+
#pragma HLS array_partition variable=scratchpad_k cyclic dim=2 factor=2
|
| 198 |
+
#pragma HLS bind_storage variable=scratchpad_k type=ram_2p impl=uram
|
| 199 |
+
|
| 200 |
+
ap_uint<64> X[MAX_SEQ_LEN][D_div_8]; // 8 bit
|
| 201 |
+
#pragma HLS array_partition variable=X cyclic dim=1 factor=16
|
| 202 |
+
#pragma HLS array_partition variable=X cyclic dim=2 factor=2
|
| 203 |
+
#pragma HLS bind_storage variable=X type=ram_2p impl=uram
|
| 204 |
+
|
| 205 |
+
for(int stage_i = 0; stage_i < 20; stage_i++){
|
| 206 |
+
|
| 207 |
+
//TODO: stage send from inst
|
| 208 |
+
|
| 209 |
+
// stage 0: WqX
|
| 210 |
+
// stage 1: WkX0 <- acc1
|
| 211 |
+
// stage 2: QK^T
|
| 212 |
+
|
| 213 |
+
ap_uint<32> W[D_head][D_div_8]; // TODO: reduce dimension
|
| 214 |
+
#pragma HLS array_partition variable=W cyclic dim=1 factor=16
|
| 215 |
+
|
| 216 |
+
const int start = fifo_len_in.read();
|
| 217 |
+
const int end = fifo_len_in.read();
|
| 218 |
+
fifo_len_out.write(start);
|
| 219 |
+
fifo_len_out.write(end);
|
| 220 |
+
|
| 221 |
+
const int stage = (stage_i < 15) ? (stage_i % 3) : 3;
|
| 222 |
+
|
| 223 |
+
// load weights and forward
|
| 224 |
+
if(stage != 2) { // TODO: 1d array & uniform access
|
| 225 |
+
for(int i = 0; i < D_head_div_4; i++){
|
| 226 |
+
load_weight:
|
| 227 |
+
for(int j = 0; j < D_div_8;){
|
| 228 |
+
if(!fifo_W_in.empty()){
|
| 229 |
+
ap_uint<512> val; fifo_W_in.try_read(val);
|
| 230 |
+
|
| 231 |
+
for(int k = 0; k < 4; k++){
|
| 232 |
+
#pragma HLS unroll
|
| 233 |
+
W[i*4+k][j] = ap_uint<32>(val(k*32+31, k*32));
|
| 234 |
+
}
|
| 235 |
+
val = ap_uint<512>(val >> 128);
|
| 236 |
+
fifo_W_out.write(val);
|
| 237 |
+
j++;
|
| 238 |
+
}
|
| 239 |
+
}
|
| 240 |
+
}
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
int j_bound = (stage == 2) ? (L >> 4) : D_head_div_16;
|
| 244 |
+
j_bound = (stage == 3) ? D_div_16 : j_bound;
|
| 245 |
+
int k_bound = (stage > 1) ? D_head_div_8 : D_div_8;
|
| 246 |
+
|
| 247 |
+
// stage 1: compute Q
|
| 248 |
+
for(int i = (start >> 4); i < (end >> 4); i++){ // make sure L is multiple of 16
|
| 249 |
+
|
| 250 |
+
if(stage_i == 0){
|
| 251 |
+
for(int ii = 0; ii < 2; ii++){ // load only 1 time
|
| 252 |
+
load_x:
|
| 253 |
+
for(int jj = 0; jj < D_div_8;){
|
| 254 |
+
if(!fifo_X_in.empty()){
|
| 255 |
+
ap_uint<512> val; fifo_X_in.try_read(val);
|
| 256 |
+
|
| 257 |
+
for(int k = 0; k < 8; k++){
|
| 258 |
+
#pragma HLS unroll
|
| 259 |
+
X[i*16+ii*8+k][jj] = ap_uint<64>(val(k*64+63, k*64));
|
| 260 |
+
}
|
| 261 |
+
jj++;
|
| 262 |
+
}
|
| 263 |
+
}
|
| 264 |
+
}
|
| 265 |
+
}
|
| 266 |
+
|
| 267 |
+
for(int j = 0; j < j_bound; j++){
|
| 268 |
+
|
| 269 |
+
ap_int<38> acc_vec[8][16][8];
|
| 270 |
+
#pragma HLS array_partition variable=acc_vec dim=1 complete
|
| 271 |
+
#pragma HLS array_partition variable=acc_vec dim=2 complete
|
| 272 |
+
#pragma HLS array_partition variable=acc_vec dim=3 complete
|
| 273 |
+
|
| 274 |
+
for(int ii = 0; ii < 8; ii++){
|
| 275 |
+
#pragma HLS unroll
|
| 276 |
+
for(int kk = 0; kk < 16; kk++){
|
| 277 |
+
#pragma HLS unroll
|
| 278 |
+
for(int k = 0; k < 8; k++){
|
| 279 |
+
#pragma HLS unroll
|
| 280 |
+
acc_vec[ii][kk][k] = 0;
|
| 281 |
+
}
|
| 282 |
+
}
|
| 283 |
+
}
|
| 284 |
+
|
| 285 |
+
compute:
|
| 286 |
+
for(int k = 0; k < k_bound; k++){ // reduction dim
|
| 287 |
+
#pragma HLS pipeline II=1
|
| 288 |
+
|
| 289 |
+
ap_uint<64> op1_mtx[16];
|
| 290 |
+
ap_uint<64> op2_mtx[16];
|
| 291 |
+
#pragma HLS array_partition variable=op1_mtx complete
|
| 292 |
+
#pragma HLS array_partition variable=op2_mtx complete
|
| 293 |
+
|
| 294 |
+
ap_uint<1024> recv_pkt;
|
| 295 |
+
|
| 296 |
+
if(stage == 3) {
|
| 297 |
+
recv_pkt = fifo_context.read();
|
| 298 |
+
}
|
| 299 |
+
|
| 300 |
+
for(int ii = 0; ii < 16; ii++){
|
| 301 |
+
#pragma HLS unroll
|
| 302 |
+
if(stage == 3){
|
| 303 |
+
op1_mtx[ii] = ap_uint<64>(W[k*8+ii%8][j*2+ii/8]); // change it
|
| 304 |
+
op2_mtx[ii] = ap_uint<64>(recv_pkt(ii*64+63, ii*64));
|
| 305 |
+
} else if(stage == 2) {
|
| 306 |
+
op1_mtx[ii] = scratchpad_q[i*16+ii][k];
|
| 307 |
+
op2_mtx[ii] = scratchpad_k[j*16+ii][k];
|
| 308 |
+
} else {
|
| 309 |
+
op1_mtx[ii] = ap_uint<64>(W[j*16+ii][k]);
|
| 310 |
+
op2_mtx[ii] = X[i*16+ii][k];
|
| 311 |
+
}
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
if(stage < 2){
|
| 315 |
+
ap_uint<1024> send_pkt = ap_uint<1024>((
|
| 316 |
+
op2_mtx[0], op2_mtx[1], op2_mtx[2], op2_mtx[3], op2_mtx[4], op2_mtx[5], op2_mtx[6], op2_mtx[7],
|
| 317 |
+
op2_mtx[8], op2_mtx[9], op2_mtx[10], op2_mtx[11], op2_mtx[12], op2_mtx[13], op2_mtx[14], op2_mtx[15]
|
| 318 |
+
));
|
| 319 |
+
fifo_X_out.write(send_pkt);
|
| 320 |
+
}
|
| 321 |
+
|
| 322 |
+
for(int ii = 0; ii < 8; ii++){
|
| 323 |
+
#pragma HLS unroll
|
| 324 |
+
for(int kk = 0; kk < 16; kk++){
|
| 325 |
+
#pragma HLS unroll
|
| 326 |
+
for(int l = 0; l < 8; l++){
|
| 327 |
+
#pragma HLS unroll
|
| 328 |
+
ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
|
| 329 |
+
op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
|
| 330 |
+
if(stage == 2){
|
| 331 |
+
op1 = ap_int<8>(op1_mtx[l*2](ii*8+7, ii*8));
|
| 332 |
+
op2 = ap_int<8>(op1_mtx[l*2+1](ii*8+7, ii*8));
|
| 333 |
+
} else {
|
| 334 |
+
op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
|
| 335 |
+
op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
|
| 336 |
+
}
|
| 337 |
+
ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
|
| 338 |
+
acc_vec[ii][kk][l] += w_pack * op3;
|
| 339 |
+
}
|
| 340 |
+
}
|
| 341 |
+
}
|
| 342 |
+
}
|
| 343 |
+
|
| 344 |
+
ap_int<22> acc_final[16][16];
|
| 345 |
+
#pragma HLS array_partition variable=acc_final dim=1 complete
|
| 346 |
+
#pragma HLS array_partition variable=acc_final dim=2 complete
|
| 347 |
+
|
| 348 |
+
for(int ii = 0; ii < 16; ii++){
|
| 349 |
+
#pragma HLS unroll
|
| 350 |
+
for(int k = 0; k < 16; k++){
|
| 351 |
+
#pragma HLS unroll
|
| 352 |
+
acc_final[ii][k] = 0;
|
| 353 |
+
}
|
| 354 |
+
}
|
| 355 |
+
|
| 356 |
+
reduction:
|
| 357 |
+
for(int kk = 0; kk < 8; kk++){
|
| 358 |
+
for(int ii = 0; ii < 16; ii++){
|
| 359 |
+
#pragma HLS unroll
|
| 360 |
+
for(int k = 0; k < 8; k++){
|
| 361 |
+
#pragma HLS unroll
|
| 362 |
+
ap_int<19> res0; ap_int<19> res1;
|
| 363 |
+
(res1, res0) = acc_vec[kk][ii][k];
|
| 364 |
+
res1 = res1 + res0[18];
|
| 365 |
+
acc_final[ii][k*2] += res0;
|
| 366 |
+
acc_final[ii][k*2+1] += res1;
|
| 367 |
+
if(kk == 7 && stage < 2) {
|
| 368 |
+
acc_final[ii][k*2] = acc_final[ii][k*2] >> 8;
|
| 369 |
+
acc_final[ii][k*2+1] = acc_final[ii][k*2] >> 8;
|
| 370 |
+
}
|
| 371 |
+
}
|
| 372 |
+
}
|
| 373 |
+
}
|
| 374 |
+
|
| 375 |
+
if(stage == 0){
|
| 376 |
+
for(int ii = 0; ii < 16; ii++){
|
| 377 |
+
#pragma HLS unroll
|
| 378 |
+
for(int k = 0; k < 16; k++){
|
| 379 |
+
#pragma HLS unroll
|
| 380 |
+
int offset = k%8;
|
| 381 |
+
scratchpad_q[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k]);
|
| 382 |
+
}
|
| 383 |
+
}
|
| 384 |
+
} else if (stage == 1){
|
| 385 |
+
for(int ii = 0; ii < 16; ii++){
|
| 386 |
+
ap_uint<128> tmp = fifo_from_acc1.read();
|
| 387 |
+
|
| 388 |
+
for(int k = 0; k < 16; k++){
|
| 389 |
+
#pragma HLS unroll
|
| 390 |
+
int offset = k%8;
|
| 391 |
+
scratchpad_k[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k]);
|
| 392 |
+
}
|
| 393 |
+
for(int k = 0; k < 2; k++){
|
| 394 |
+
#pragma HLS unroll
|
| 395 |
+
scratchpad_k[end + i*16 + ii][j*2+k] = ap_uint<64>(tmp(k*64+63, k*64));
|
| 396 |
+
}
|
| 397 |
+
}
|
| 398 |
+
} else if(stage == 2){
|
| 399 |
+
for(int ii = 0; ii < 16; ii++){
|
| 400 |
+
#pragma HLS pipeline II=1
|
| 401 |
+
ap_uint<512> tmp;
|
| 402 |
+
for(int kk = 0; kk < 16; kk++){
|
| 403 |
+
#pragma HLS unroll
|
| 404 |
+
tmp(kk*32+31, kk*32) = tapa::bit_cast<ap_uint<32>>(acc_final[ii][kk]);
|
| 405 |
+
}
|
| 406 |
+
fifo_O_out.write(tmp);
|
| 407 |
+
}
|
| 408 |
+
} else {
|
| 409 |
+
final_acc:
|
| 410 |
+
for(int ii = 0; ii < 16;){
|
| 411 |
+
#pragma HLS pipeline II=1
|
| 412 |
+
#pragma HLS dependence variable=X type=inter false
|
| 413 |
+
if(!fifo_reduce_recv.empty()){
|
| 414 |
+
ap_uint<512> tmp_recv; fifo_reduce_recv.try_read(tmp_recv);
|
| 415 |
+
for(int k = 0; k < 16; k++){
|
| 416 |
+
#pragma HLS unroll
|
| 417 |
+
acc_final[ii][k] += ap_int<22>(tmp_recv(k*32+21, k*32));
|
| 418 |
+
X[i*16+ii][j*2+k/8]((k%8)*8+7, (k%8)*8) = ap_int<8>(acc_final[ii][k] >> 8); //TODO: change
|
| 419 |
+
}
|
| 420 |
+
|
| 421 |
+
ii++;
|
| 422 |
+
}
|
| 423 |
+
}
|
| 424 |
+
}
|
| 425 |
+
}
|
| 426 |
+
}
|
| 427 |
+
}
|
| 428 |
+
fifo_fin.write(true);
|
| 429 |
+
|
| 430 |
+
write:
|
| 431 |
+
for(int i = 0; i < L; i++){
|
| 432 |
+
for(int j = 0; j < D_div_8; j++){
|
| 433 |
+
#pragma HLS pipeline II=1
|
| 434 |
+
fifo_write.write(X[i][j]);
|
| 435 |
+
}
|
| 436 |
+
}
|
| 437 |
+
}
|
| 438 |
+
|
| 439 |
+
void temporal_acc0(
|
| 440 |
+
const int L,
|
| 441 |
+
tapa::istream<int>& fifo_len_in,
|
| 442 |
+
tapa::ostream<int>& fifo_len_out,
|
| 443 |
+
tapa::istream<ap_uint<1024>>& fifo_X_in,
|
| 444 |
+
tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
|
| 445 |
+
tapa::istream<ap_uint<512>>& fifo_W_in,
|
| 446 |
+
tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
|
| 447 |
+
tapa::istream<ap_uint<128>>& fifo_from_acc1,
|
| 448 |
+
tapa::ostream<ap_uint<512>>& fifo_O_out,
|
| 449 |
+
tapa::istream<ap_uint<1024>>& fifo_context,
|
| 450 |
+
tapa::istream<ap_uint<512>>& fifo_reduce_recv,
|
| 451 |
+
tapa::ostream<ap_uint<512>>& fifo_reduce_send,
|
| 452 |
+
tapa::ostream<bool>& fifo_fin
|
| 453 |
+
){
|
| 454 |
+
|
| 455 |
+
ap_uint<64> scratchpad_q[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
|
| 456 |
+
#pragma HLS array_partition variable=scratchpad_q cyclic dim=1 factor=16
|
| 457 |
+
#pragma HLS array_partition variable=scratchpad_q cyclic dim=2 factor=2
|
| 458 |
+
|
| 459 |
+
ap_uint<64> scratchpad_k[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
|
| 460 |
+
#pragma HLS array_partition variable=scratchpad_k cyclic dim=1 factor=16
|
| 461 |
+
#pragma HLS array_partition variable=scratchpad_k cyclic dim=2 factor=2
|
| 462 |
+
#pragma HLS bind_storage variable=scratchpad_k type=ram_2p impl=uram
|
| 463 |
+
|
| 464 |
+
for(int stage_i = 0; stage_i < 20; stage_i++){
|
| 465 |
+
#pragma HLS loop_flatten off
|
| 466 |
+
|
| 467 |
+
// stage 0: WqX
|
| 468 |
+
// stage 1: WkX0 <- acc1
|
| 469 |
+
// stage 2: QK^T
|
| 470 |
+
// stage 3: WoO
|
| 471 |
+
|
| 472 |
+
ap_uint<32> W[D_head][D_div_8]; // 4 bit
|
| 473 |
+
#pragma HLS array_partition variable=W cyclic dim=1 factor=16
|
| 474 |
+
|
| 475 |
+
const int start = fifo_len_in.read();
|
| 476 |
+
const int end = fifo_len_in.read();
|
| 477 |
+
fifo_len_out.write(start);
|
| 478 |
+
fifo_len_out.write(end);
|
| 479 |
+
|
| 480 |
+
const int stage = (stage_i < 15) ? (stage_i % 3) : 3;
|
| 481 |
+
|
| 482 |
+
// load weights and forward
|
| 483 |
+
if(stage != 2) {
|
| 484 |
+
for(int i = 0; i < D_head_div_4; i++){
|
| 485 |
+
load_weight:
|
| 486 |
+
for(int j = 0; j < D_div_8;){
|
| 487 |
+
if(!fifo_W_in.empty()){
|
| 488 |
+
ap_uint<512> val; fifo_W_in.try_read(val);
|
| 489 |
+
|
| 490 |
+
for(int k = 0; k < 4; k++){
|
| 491 |
+
#pragma HLS unroll
|
| 492 |
+
W[i*4+k][j] = ap_uint<32>(val(k*32+31, k*32));
|
| 493 |
+
}
|
| 494 |
+
val = ap_uint<512>(val >> 128);
|
| 495 |
+
fifo_W_out.write(val);
|
| 496 |
+
j++;
|
| 497 |
+
}
|
| 498 |
+
}
|
| 499 |
+
}
|
| 500 |
+
}
|
| 501 |
+
|
| 502 |
+
int j_bound = (stage == 2) ? (L >> 4) : D_head_div_16;
|
| 503 |
+
j_bound = (stage == 3) ? D_div_16 : j_bound;
|
| 504 |
+
int k_bound = (stage > 1) ? D_head_div_8 : D_div_8;
|
| 505 |
+
|
| 506 |
+
// stage 1: compute Q
|
| 507 |
+
for(int i = (start >> 4); i < (end >> 4); i++){ // make sure L is multiple of 64
|
| 508 |
+
for(int j = 0; j < j_bound; j++){
|
| 509 |
+
|
| 510 |
+
ap_int<38> acc_vec[8][16][8];
|
| 511 |
+
#pragma HLS array_partition variable=acc_vec dim=1 complete
|
| 512 |
+
#pragma HLS array_partition variable=acc_vec dim=2 complete
|
| 513 |
+
#pragma HLS array_partition variable=acc_vec dim=3 complete
|
| 514 |
+
|
| 515 |
+
for(int ii = 0; ii < 8; ii++){
|
| 516 |
+
#pragma HLS unroll
|
| 517 |
+
for(int kk = 0; kk < 16; kk++){
|
| 518 |
+
#pragma HLS unroll
|
| 519 |
+
for(int k = 0; k < 8; k++){
|
| 520 |
+
#pragma HLS unroll
|
| 521 |
+
acc_vec[ii][kk][k] = 0;
|
| 522 |
+
}
|
| 523 |
+
}
|
| 524 |
+
}
|
| 525 |
+
|
| 526 |
+
compute:
|
| 527 |
+
for(int k = 0; k < k_bound; k++){ // reduction dim
|
| 528 |
+
#pragma HLS pipeline II=1
|
| 529 |
+
|
| 530 |
+
ap_uint<64> op1_mtx[16];
|
| 531 |
+
ap_uint<64> op2_mtx[16];
|
| 532 |
+
#pragma HLS array_partition variable=op1_mtx complete
|
| 533 |
+
#pragma HLS array_partition variable=op2_mtx complete
|
| 534 |
+
|
| 535 |
+
ap_uint<1024> recv_pkt;
|
| 536 |
+
if(stage == 3){
|
| 537 |
+
recv_pkt = fifo_context.read();
|
| 538 |
+
} else if(stage != 2) {
|
| 539 |
+
recv_pkt = fifo_X_in.read();
|
| 540 |
+
fifo_X_out.write(recv_pkt);
|
| 541 |
+
}
|
| 542 |
+
|
| 543 |
+
for(int ii = 0; ii < 16; ii++){
|
| 544 |
+
#pragma HLS unroll
|
| 545 |
+
if(stage == 3){
|
| 546 |
+
op1_mtx[ii] = ap_uint<64>(W[k*8+ii%8][j*2+ii/8]);
|
| 547 |
+
op2_mtx[ii] = ap_uint<64>(recv_pkt(ii*64+63, ii*64));
|
| 548 |
+
} else if(stage == 2) {
|
| 549 |
+
op1_mtx[ii] = scratchpad_q[i*16+ii][k];
|
| 550 |
+
op2_mtx[ii] = scratchpad_k[j*16+ii][k];
|
| 551 |
+
} else {
|
| 552 |
+
op1_mtx[ii] = ap_uint<64>(W[j*16+ii][k]);
|
| 553 |
+
op2_mtx[ii] = ap_uint<64>(recv_pkt(ii*64+63, ii*64));
|
| 554 |
+
}
|
| 555 |
+
}
|
| 556 |
+
|
| 557 |
+
for(int ii = 0; ii < 8; ii++){
|
| 558 |
+
#pragma HLS unroll
|
| 559 |
+
for(int kk = 0; kk < 16; kk++){
|
| 560 |
+
#pragma HLS unroll
|
| 561 |
+
for(int l = 0; l < 8; l++){
|
| 562 |
+
#pragma HLS unroll
|
| 563 |
+
ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
|
| 564 |
+
op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
|
| 565 |
+
if(stage == 2){
|
| 566 |
+
op1 = ap_int<8>(op1_mtx[l*2](ii*8+7, ii*8));
|
| 567 |
+
op2 = ap_int<8>(op1_mtx[l*2+1](ii*8+7, ii*8));
|
| 568 |
+
} else {
|
| 569 |
+
op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
|
| 570 |
+
op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
|
| 571 |
+
}
|
| 572 |
+
ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
|
| 573 |
+
acc_vec[ii][kk][l] += w_pack * op3;
|
| 574 |
+
}
|
| 575 |
+
}
|
| 576 |
+
}
|
| 577 |
+
}
|
| 578 |
+
|
| 579 |
+
ap_int<22> acc_final[16][16];
|
| 580 |
+
#pragma HLS array_partition variable=acc_final dim=1 complete
|
| 581 |
+
#pragma HLS array_partition variable=acc_final dim=2 complete
|
| 582 |
+
|
| 583 |
+
for(int ii = 0; ii < 16; ii++){
|
| 584 |
+
#pragma HLS unroll
|
| 585 |
+
for(int k = 0; k < 16; k++){
|
| 586 |
+
#pragma HLS unroll
|
| 587 |
+
acc_final[ii][k] = 0;
|
| 588 |
+
}
|
| 589 |
+
}
|
| 590 |
+
|
| 591 |
+
reduction:
|
| 592 |
+
for(int kk = 0; kk < 8; kk++){
|
| 593 |
+
for(int ii = 0; ii < 16; ii++){
|
| 594 |
+
#pragma HLS unroll
|
| 595 |
+
for(int k = 0; k < 8; k++){
|
| 596 |
+
#pragma HLS unroll
|
| 597 |
+
ap_int<19> res0; ap_int<19> res1;
|
| 598 |
+
(res1, res0) = acc_vec[kk][ii][k];
|
| 599 |
+
res1 = res1 + res0[18];
|
| 600 |
+
acc_final[ii][k*2] += res0;
|
| 601 |
+
acc_final[ii][k*2+1] += res1;
|
| 602 |
+
if(kk == 7 && stage < 2) {
|
| 603 |
+
acc_final[ii][k*2] = acc_final[ii][k*2] >> 8; // rescale & clamp
|
| 604 |
+
acc_final[ii][k*2+1] = acc_final[ii][k*2+1] >> 8; // rescale & clamp
|
| 605 |
+
}
|
| 606 |
+
}
|
| 607 |
+
}
|
| 608 |
+
}
|
| 609 |
+
|
| 610 |
+
if(stage == 0){
|
| 611 |
+
for(int ii = 0; ii < 16; ii++){
|
| 612 |
+
#pragma HLS unroll
|
| 613 |
+
for(int k = 0; k < 16; k++){
|
| 614 |
+
#pragma HLS unroll
|
| 615 |
+
int offset = k%8;
|
| 616 |
+
scratchpad_q[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k]);
|
| 617 |
+
}
|
| 618 |
+
}
|
| 619 |
+
} else if (stage == 1){
|
| 620 |
+
for(int ii = 0; ii < 16; ii++){
|
| 621 |
+
ap_uint<128> tmp = fifo_from_acc1.read();
|
| 622 |
+
|
| 623 |
+
for(int k = 0; k < 16; k++){
|
| 624 |
+
#pragma HLS unroll
|
| 625 |
+
int offset = k%8;
|
| 626 |
+
scratchpad_k[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k]);
|
| 627 |
+
}
|
| 628 |
+
for(int k = 0; k < 2; k++){
|
| 629 |
+
#pragma HLS unroll
|
| 630 |
+
scratchpad_k[end + i*16 + ii][j*2+k] = ap_uint<64>(tmp(k*64+63, k*64));
|
| 631 |
+
}
|
| 632 |
+
}
|
| 633 |
+
} else if(stage == 2){
|
| 634 |
+
for(int ii = 0; ii < 16; ii++){
|
| 635 |
+
#pragma HLS pipeline II=1
|
| 636 |
+
ap_uint<512> tmp;
|
| 637 |
+
for(int kk = 0; kk < 16; kk++){
|
| 638 |
+
#pragma HLS unroll
|
| 639 |
+
tmp(kk*32+31, kk*32) = tapa::bit_cast<ap_uint<32>>(acc_final[ii][kk]);
|
| 640 |
+
}
|
| 641 |
+
fifo_O_out.write(tmp);
|
| 642 |
+
}
|
| 643 |
+
} else {
|
| 644 |
+
final_acc:
|
| 645 |
+
for(int ii = 0; ii < 16;){
|
| 646 |
+
#pragma HLS pipeline II=1
|
| 647 |
+
if(!fifo_reduce_recv.empty()){
|
| 648 |
+
ap_uint<512> tmp_recv; fifo_reduce_recv.try_read(tmp_recv);
|
| 649 |
+
ap_uint<512> tmp;
|
| 650 |
+
for(int k = 0; k < 16; k++){
|
| 651 |
+
#pragma HLS unroll
|
| 652 |
+
acc_final[ii][k] += ap_int<22>(tmp_recv(k*32+21, k*32));
|
| 653 |
+
tmp(k*32+21, k*32) = acc_final[ii][k];
|
| 654 |
+
}
|
| 655 |
+
fifo_reduce_send.write(tmp);
|
| 656 |
+
ii++;
|
| 657 |
+
}
|
| 658 |
+
}
|
| 659 |
+
}
|
| 660 |
+
}
|
| 661 |
+
}
|
| 662 |
+
}
|
| 663 |
+
fifo_fin.write(true);
|
| 664 |
+
}
|
| 665 |
+
|
| 666 |
+
// acc1 slr0 master node
|
| 667 |
+
void temporal_acc1_slr0(
|
| 668 |
+
const int L,
|
| 669 |
+
tapa::istream<int>& fifo_len_in,
|
| 670 |
+
tapa::ostream<int>& fifo_len_out,
|
| 671 |
+
tapa::istream<ap_uint<512>>& fifo_X_in,
|
| 672 |
+
tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
|
| 673 |
+
tapa::istream<ap_uint<512>>& fifo_W_in,
|
| 674 |
+
tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
|
| 675 |
+
tapa::ostream<ap_uint<128>>& fifo_to_acc0,
|
| 676 |
+
tapa::istream<ap_uint<128>>& fifo_from_sfu,
|
| 677 |
+
tapa::ostream<ap_uint<1024>>& fifo_O_out,
|
| 678 |
+
tapa::istream<ap_uint<1024>>& fifo_context,
|
| 679 |
+
tapa::istream<ap_uint<512>>& fifo_reduce_recv,
|
| 680 |
+
tapa::ostream<ap_uint<64>>& fifo_write,
|
| 681 |
+
tapa::ostream<bool>& fifo_fin
|
| 682 |
+
){
|
| 683 |
+
ap_uint<64> X[MAX_SEQ_LEN][D_div_8]; // 8 bit
|
| 684 |
+
#pragma HLS array_partition variable=X cyclic dim=1 factor=16
|
| 685 |
+
#pragma HLS array_partition variable=X cyclic dim=2 factor=2
|
| 686 |
+
#pragma HLS bind_storage variable=X type=ram_2p impl=uram
|
| 687 |
+
|
| 688 |
+
ap_uint<64> scratchpad[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
|
| 689 |
+
#pragma HLS array_partition variable=scratchpad cyclic dim=1 factor=16
|
| 690 |
+
#pragma HLS array_partition variable=scratchpad cyclic dim=2 factor=2
|
| 691 |
+
#pragma HLS bind_storage variable=scratchpad type=ram_2p impl=uram
|
| 692 |
+
|
| 693 |
+
// ap_uint<64> scratchpad_out[MAX_SEQ_LEN][D_head_div_8];
|
| 694 |
+
// #pragma HLS array_partition variable=scratchpad_out cyclic dim=1 factor=16
|
| 695 |
+
// #pragma HLS array_partition variable=scratchpad_out cyclic dim=2 factor=2
|
| 696 |
+
|
| 697 |
+
for(int stage_i = 0; stage_i < 20; stage_i++){
|
| 698 |
+
|
| 699 |
+
// stage 0: WvX
|
| 700 |
+
// stage 1: WkX1 -> acc0
|
| 701 |
+
// stage 2: Softmax(QK)V <- acc0
|
| 702 |
+
// stage 3: WoO
|
| 703 |
+
|
| 704 |
+
ap_uint<32> W[D_head][D_div_8]; // 4 bit
|
| 705 |
+
#pragma HLS array_partition variable=W cyclic dim=1 factor=16
|
| 706 |
+
|
| 707 |
+
const int start = fifo_len_in.read();
|
| 708 |
+
const int end = fifo_len_in.read();
|
| 709 |
+
fifo_len_out.write(start);
|
| 710 |
+
fifo_len_out.write(end);
|
| 711 |
+
|
| 712 |
+
const int stage = (stage_i < 15) ? (stage_i % 3) : 3;
|
| 713 |
+
|
| 714 |
+
// load weights and forward
|
| 715 |
+
if(stage != 2) {
|
| 716 |
+
for(int i = 0; i < D_head_div_4; i++){
|
| 717 |
+
load_weight:
|
| 718 |
+
for(int j = 0; j < D_div_8;){
|
| 719 |
+
if(!fifo_W_in.empty()){
|
| 720 |
+
ap_uint<512> val; fifo_W_in.try_read(val);
|
| 721 |
+
|
| 722 |
+
for(int k = 0; k < 4; k++){
|
| 723 |
+
#pragma HLS unroll
|
| 724 |
+
W[i*4+k][j] = ap_uint<32>(val(k*32+31, k*32));
|
| 725 |
+
}
|
| 726 |
+
val = ap_uint<512>(val >> 128);
|
| 727 |
+
fifo_W_out.write(val);
|
| 728 |
+
j++;
|
| 729 |
+
}
|
| 730 |
+
}
|
| 731 |
+
}
|
| 732 |
+
}
|
| 733 |
+
|
| 734 |
+
int k_bound = (stage == 2) ? (L >> 3) : D_div_8;
|
| 735 |
+
k_bound = (stage == 3) ? D_head_div_8 : k_bound;
|
| 736 |
+
int j_bound = (stage == 3) ? D_div_16 : D_head_div_16;
|
| 737 |
+
|
| 738 |
+
for(int i = (start >> 4); i < (end >> 4); i++){ // make sure L is multiple of 4
|
| 739 |
+
|
| 740 |
+
ap_uint<32> cache_attn[MAX_SEQ_LEN_div_8][16];
|
| 741 |
+
#pragma HLS array_partition variable=cache_attn dim=2 complete
|
| 742 |
+
#pragma HLS array_partition variable=cache_attn dim=1 cyclic factor=2
|
| 743 |
+
|
| 744 |
+
if(stage_i == 0){
|
| 745 |
+
for(int ii = 0; ii < 2; ii++){ // load only 1 time
|
| 746 |
+
load_x:
|
| 747 |
+
for(int jj = 0; jj < D_div_8;){
|
| 748 |
+
if(!fifo_X_in.empty()){
|
| 749 |
+
ap_uint<512> val; fifo_X_in.try_read(val);
|
| 750 |
+
|
| 751 |
+
for(int k = 0; k < 8; k++){
|
| 752 |
+
#pragma HLS unroll
|
| 753 |
+
X[i*16+ii*8+k][jj] = ap_uint<64>(val(k*64+63, k*64));
|
| 754 |
+
}
|
| 755 |
+
jj++;
|
| 756 |
+
}
|
| 757 |
+
}
|
| 758 |
+
}
|
| 759 |
+
} else if (stage == 2) {
|
| 760 |
+
for(int ii = 0; ii < (L >> 3); ii++){
|
| 761 |
+
ap_uint<32> fuse_reg[16];
|
| 762 |
+
load_attn:
|
| 763 |
+
for(int offset = 0; offset < 8;){
|
| 764 |
+
#pragma HLS pipeline II=1
|
| 765 |
+
if(!fifo_from_sfu.empty()){
|
| 766 |
+
ap_uint<128> val; fifo_from_sfu.try_read(val);
|
| 767 |
+
for(int k = 0; k < 16; k++){
|
| 768 |
+
#pragma HLS unroll
|
| 769 |
+
fuse_reg[k](offset*4+3, offset*4) = ap_int<4>(val(k*8+3, k*8));
|
| 770 |
+
}
|
| 771 |
+
offset++;
|
| 772 |
+
}
|
| 773 |
+
}
|
| 774 |
+
for(int k = 0; k < 16; k++){
|
| 775 |
+
#pragma HLS unroll
|
| 776 |
+
cache_attn[ii][k] = fuse_reg[k];
|
| 777 |
+
}
|
| 778 |
+
}
|
| 779 |
+
}
|
| 780 |
+
|
| 781 |
+
for(int j = 0; j < j_bound; j++){
|
| 782 |
+
|
| 783 |
+
ap_int<38> acc_vec[8][16][8];
|
| 784 |
+
#pragma HLS array_partition variable=acc_vec dim=1 complete
|
| 785 |
+
#pragma HLS array_partition variable=acc_vec dim=2 complete
|
| 786 |
+
#pragma HLS array_partition variable=acc_vec dim=3 complete
|
| 787 |
+
|
| 788 |
+
for(int ii = 0; ii < 8; ii++){
|
| 789 |
+
#pragma HLS unroll
|
| 790 |
+
for(int kk = 0; kk < 16; kk++){
|
| 791 |
+
#pragma HLS unroll
|
| 792 |
+
for(int k = 0; k < 8; k++){
|
| 793 |
+
#pragma HLS unroll
|
| 794 |
+
acc_vec[ii][kk][k] = 0;
|
| 795 |
+
}
|
| 796 |
+
}
|
| 797 |
+
}
|
| 798 |
+
|
| 799 |
+
compute:
|
| 800 |
+
for(int k = 0; k < k_bound; k++){
|
| 801 |
+
#pragma HLS pipeline II=1
|
| 802 |
+
|
| 803 |
+
ap_uint<64> op1_mtx[16];
|
| 804 |
+
ap_uint<64> op2_mtx[16];
|
| 805 |
+
#pragma HLS array_partition variable=op1_mtx complete
|
| 806 |
+
#pragma HLS array_partition variable=op2_mtx complete
|
| 807 |
+
|
| 808 |
+
ap_uint<1024> recv_pkt;
|
| 809 |
+
|
| 810 |
+
if(stage == 3) {
|
| 811 |
+
recv_pkt = fifo_context.read();
|
| 812 |
+
}
|
| 813 |
+
|
| 814 |
+
for(int ii = 0; ii < 16; ii++){
|
| 815 |
+
#pragma HLS unroll
|
| 816 |
+
if(stage == 3){
|
| 817 |
+
op1_mtx[ii] = ap_uint<64>(W[k*8+ii%8][j*2+ii/8]);
|
| 818 |
+
op2_mtx[ii] = recv_pkt(ii*64+63, ii*64);
|
| 819 |
+
} else if(stage != 2) {
|
| 820 |
+
op1_mtx[ii] = ap_uint<64>(W[j*16+ii][k]);
|
| 821 |
+
op2_mtx[ii] = X[i*16+ii][k];
|
| 822 |
+
} else {
|
| 823 |
+
op1_mtx[ii] = ap_uint<64>(cache_attn[k][ii]);
|
| 824 |
+
op2_mtx[ii] = scratchpad[k*8+ii/2][j*2+(ii%2)];
|
| 825 |
+
}
|
| 826 |
+
}
|
| 827 |
+
|
| 828 |
+
if(stage < 2){
|
| 829 |
+
ap_uint<1024> send_pkt = ap_uint<1024>((
|
| 830 |
+
op2_mtx[0], op2_mtx[1], op2_mtx[2], op2_mtx[3], op2_mtx[4], op2_mtx[5], op2_mtx[6], op2_mtx[7],
|
| 831 |
+
op2_mtx[8], op2_mtx[9], op2_mtx[10], op2_mtx[11], op2_mtx[12], op2_mtx[13], op2_mtx[14], op2_mtx[15]
|
| 832 |
+
));
|
| 833 |
+
fifo_X_out.write(send_pkt);
|
| 834 |
+
}
|
| 835 |
+
|
| 836 |
+
for(int ii = 0; ii < 8; ii++){
|
| 837 |
+
#pragma HLS unroll
|
| 838 |
+
for(int kk = 0; kk < 16; kk++){
|
| 839 |
+
#pragma HLS unroll
|
| 840 |
+
for(int l = 0; l < 8; l++){
|
| 841 |
+
#pragma HLS unroll
|
| 842 |
+
ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
|
| 843 |
+
op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
|
| 844 |
+
op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
|
| 845 |
+
op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
|
| 846 |
+
ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
|
| 847 |
+
acc_vec[ii][kk][l] += w_pack * op3;
|
| 848 |
+
}
|
| 849 |
+
}
|
| 850 |
+
}
|
| 851 |
+
}
|
| 852 |
+
|
| 853 |
+
ap_int<22> acc_final[16][16];
|
| 854 |
+
#pragma HLS array_partition variable=acc_final dim=1 complete
|
| 855 |
+
#pragma HLS array_partition variable=acc_final dim=2 complete
|
| 856 |
+
|
| 857 |
+
for(int ii = 0; ii < 16; ii++){
|
| 858 |
+
#pragma HLS unroll
|
| 859 |
+
for(int k = 0; k < 16; k++){
|
| 860 |
+
#pragma HLS unroll
|
| 861 |
+
acc_final[ii][k] = 0;
|
| 862 |
+
}
|
| 863 |
+
}
|
| 864 |
+
|
| 865 |
+
reduction:
|
| 866 |
+
for(int kk = 0; kk < 8; kk++){
|
| 867 |
+
for(int ii = 0; ii < 16; ii++){
|
| 868 |
+
#pragma HLS unroll
|
| 869 |
+
for(int k = 0; k < 8; k++){
|
| 870 |
+
#pragma HLS unroll
|
| 871 |
+
ap_int<19> res0; ap_int<19> res1;
|
| 872 |
+
(res1, res0) = acc_vec[kk][ii][k];
|
| 873 |
+
res1 = res1 + res0[18];
|
| 874 |
+
acc_final[ii][k*2] += res0;
|
| 875 |
+
acc_final[ii][k*2+1] += res1;
|
| 876 |
+
if(kk == 7 && stage != 3) {
|
| 877 |
+
acc_final[ii][k*2] = acc_final[ii][k*2] >> 8; // rescale & clamp
|
| 878 |
+
acc_final[ii][k*2+1] = acc_final[ii][k*2+1] >> 8; // rescale & clamp
|
| 879 |
+
}
|
| 880 |
+
}
|
| 881 |
+
}
|
| 882 |
+
}
|
| 883 |
+
|
| 884 |
+
if(stage == 0){
|
| 885 |
+
for(int ii = 0; ii < 16; ii++){
|
| 886 |
+
#pragma HLS unroll
|
| 887 |
+
for(int k = 0; k < 16; k++){
|
| 888 |
+
#pragma HLS unroll
|
| 889 |
+
int offset = k%8;
|
| 890 |
+
scratchpad[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[k][ii]);
|
| 891 |
+
}
|
| 892 |
+
}
|
| 893 |
+
} else if (stage == 2){
|
| 894 |
+
for(int ii = 0; ii < 2; ii++){
|
| 895 |
+
#pragma HLS pipeline II=1
|
| 896 |
+
ap_uint<1024> tmp;
|
| 897 |
+
for(int jj = 0; jj < 8; jj++){
|
| 898 |
+
#pragma HLS unroll
|
| 899 |
+
for(int k = 0; k < 16; k++){
|
| 900 |
+
#pragma HLS unroll
|
| 901 |
+
tmp((jj*16+k)*8+7, (jj*16+k)*8) = ap_int<8>(acc_final[k][ii*8+jj]);
|
| 902 |
+
}
|
| 903 |
+
}
|
| 904 |
+
fifo_O_out.write(tmp);
|
| 905 |
+
}
|
| 906 |
+
} else if (stage == 1) {
|
| 907 |
+
for(int ii = 0; ii < 16; ii++){
|
| 908 |
+
ap_uint<128> tmp;
|
| 909 |
+
for(int k = 0; k < 16; k++){
|
| 910 |
+
#pragma HLS unroll
|
| 911 |
+
tmp(k*8+7, k*8) = ap_int<8>(acc_final[ii][k]);
|
| 912 |
+
}
|
| 913 |
+
fifo_to_acc0.write(tmp);
|
| 914 |
+
}
|
| 915 |
+
} else {
|
| 916 |
+
final_acc:
|
| 917 |
+
for(int ii = 0; ii < 16;){
|
| 918 |
+
#pragma HLS pipeline II=1
|
| 919 |
+
#pragma HLS dependence variable=X type=inter false
|
| 920 |
+
if(!fifo_reduce_recv.empty()){
|
| 921 |
+
ap_uint<512> tmp_recv; fifo_reduce_recv.try_read(tmp_recv);
|
| 922 |
+
for(int k = 0; k < 16; k++){
|
| 923 |
+
#pragma HLS unroll
|
| 924 |
+
acc_final[ii][k] += ap_int<22>(tmp_recv(k*32+21, k*32));
|
| 925 |
+
X[i*16+ii][j*2+k/8]((k%8)*8+7, (k%8)*8) = ap_int<8>(acc_final[ii][k] >> 8); //TODO: change
|
| 926 |
+
}
|
| 927 |
+
|
| 928 |
+
ii++;
|
| 929 |
+
}
|
| 930 |
+
}
|
| 931 |
+
}
|
| 932 |
+
}
|
| 933 |
+
}
|
| 934 |
+
}
|
| 935 |
+
fifo_fin.write(true);
|
| 936 |
+
|
| 937 |
+
// write out for debug
|
| 938 |
+
write:
|
| 939 |
+
for(int i = 0; i < L; i++){
|
| 940 |
+
for(int j = 0; j < D_div_8; j++){
|
| 941 |
+
#pragma HLS pipeline II=1
|
| 942 |
+
fifo_write.write(X[i][j]);
|
| 943 |
+
}
|
| 944 |
+
}
|
| 945 |
+
}
|
| 946 |
+
|
| 947 |
+
void temporal_acc1(
|
| 948 |
+
const int L,
|
| 949 |
+
tapa::istream<int>& fifo_len_in,
|
| 950 |
+
tapa::ostream<int>& fifo_len_out,
|
| 951 |
+
tapa::istream<ap_uint<1024>>& fifo_X_in,
|
| 952 |
+
tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
|
| 953 |
+
tapa::istream<ap_uint<512>>& fifo_W_in,
|
| 954 |
+
tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
|
| 955 |
+
tapa::ostream<ap_uint<128>>& fifo_to_acc0,
|
| 956 |
+
tapa::istream<ap_uint<128>>& fifo_from_sfu,
|
| 957 |
+
tapa::ostream<ap_uint<1024>>& fifo_O_out,
|
| 958 |
+
tapa::istream<ap_uint<1024>>& fifo_context,
|
| 959 |
+
tapa::istream<ap_uint<512>>& fifo_reduce_recv,
|
| 960 |
+
tapa::ostream<ap_uint<512>>& fifo_reduce_send,
|
| 961 |
+
tapa::ostream<bool>& fifo_fin
|
| 962 |
+
){
|
| 963 |
+
|
| 964 |
+
ap_uint<64> scratchpad[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
|
| 965 |
+
#pragma HLS array_partition variable=scratchpad cyclic dim=1 factor=16
|
| 966 |
+
#pragma HLS array_partition variable=scratchpad cyclic dim=2 factor=2
|
| 967 |
+
#pragma HLS bind_storage variable=scratchpad type=ram_2p impl=uram
|
| 968 |
+
|
| 969 |
+
// ap_uint<64> scratchpad_out[MAX_SEQ_LEN][D_head_div_8];
|
| 970 |
+
// #pragma HLS array_partition variable=scratchpad_out cyclic dim=1 factor=16
|
| 971 |
+
// #pragma HLS array_partition variable=scratchpad_out cyclic dim=2 factor=2
|
| 972 |
+
|
| 973 |
+
for(int stage_i = 0; stage_i < 20; stage_i++){
|
| 974 |
+
|
| 975 |
+
// stage 0: WvX
|
| 976 |
+
// stage 1: WkX1 -> acc0
|
| 977 |
+
// stage 2: Softmax(QK)V <- acc0
|
| 978 |
+
// stage 3: WoO
|
| 979 |
+
|
| 980 |
+
ap_uint<32> W[D_head][D_div_8]; // 4 bit
|
| 981 |
+
#pragma HLS array_partition variable=W cyclic dim=1 factor=16
|
| 982 |
+
|
| 983 |
+
const int start = fifo_len_in.read();
|
| 984 |
+
const int end = fifo_len_in.read();
|
| 985 |
+
fifo_len_out.write(start);
|
| 986 |
+
fifo_len_out.write(end);
|
| 987 |
+
|
| 988 |
+
const int stage = (stage_i < 15) ? (stage_i % 3) : 3;
|
| 989 |
+
|
| 990 |
+
// load weights and forward
|
| 991 |
+
if(stage != 2) {
|
| 992 |
+
for(int i = 0; i < D_head_div_4; i++){
|
| 993 |
+
load_weight:
|
| 994 |
+
for(int j = 0; j < D_div_8;){
|
| 995 |
+
if(!fifo_W_in.empty()){
|
| 996 |
+
ap_uint<512> val; fifo_W_in.try_read(val);
|
| 997 |
+
|
| 998 |
+
for(int k = 0; k < 4; k++){
|
| 999 |
+
#pragma HLS unroll
|
| 1000 |
+
W[i*4+k][j] = ap_uint<32>(val(k*32+31, k*32));
|
| 1001 |
+
}
|
| 1002 |
+
val = ap_uint<512>(val >> 128);
|
| 1003 |
+
fifo_W_out.write(val);
|
| 1004 |
+
j++;
|
| 1005 |
+
}
|
| 1006 |
+
}
|
| 1007 |
+
}
|
| 1008 |
+
}
|
| 1009 |
+
|
| 1010 |
+
int k_bound = (stage == 2) ? (L >> 3) : D_div_8;
|
| 1011 |
+
k_bound = (stage == 3) ? D_head_div_8 : k_bound;
|
| 1012 |
+
int j_bound = (stage == 3) ? D_div_16 : D_head_div_16;
|
| 1013 |
+
|
| 1014 |
+
for(int i = (start >> 4); i < (end >> 4); i++){ // make sure L is multiple of 4
|
| 1015 |
+
|
| 1016 |
+
ap_uint<32> cache_attn[MAX_SEQ_LEN_div_8][16];
|
| 1017 |
+
#pragma HLS array_partition variable=cache_attn dim=2 complete
|
| 1018 |
+
#pragma HLS array_partition variable=cache_attn dim=1 cyclic factor=2
|
| 1019 |
+
|
| 1020 |
+
if(stage == 2){
|
| 1021 |
+
for(int ii = 0; ii < (L >> 3); ii++){
|
| 1022 |
+
ap_uint<32> fuse_reg[16];
|
| 1023 |
+
load_attn:
|
| 1024 |
+
for(int offset = 0; offset < 8;){
|
| 1025 |
+
#pragma HLS pipeline II=1
|
| 1026 |
+
if(!fifo_from_sfu.empty()){
|
| 1027 |
+
ap_uint<128> val; fifo_from_sfu.try_read(val);
|
| 1028 |
+
for(int k = 0; k < 16; k++){
|
| 1029 |
+
#pragma HLS unroll
|
| 1030 |
+
fuse_reg[k](offset*4+3, offset*4) = ap_int<4>(val(k*8+3, k*8));
|
| 1031 |
+
}
|
| 1032 |
+
offset++;
|
| 1033 |
+
}
|
| 1034 |
+
}
|
| 1035 |
+
for(int k = 0; k < 16; k++){
|
| 1036 |
+
#pragma HLS unroll
|
| 1037 |
+
cache_attn[ii][k] = fuse_reg[k];
|
| 1038 |
+
}
|
| 1039 |
+
}
|
| 1040 |
+
}
|
| 1041 |
+
|
| 1042 |
+
for(int j = 0; j < j_bound; j++){
|
| 1043 |
+
|
| 1044 |
+
ap_int<38> acc_vec[8][16][8];
|
| 1045 |
+
#pragma HLS array_partition variable=acc_vec dim=1 complete
|
| 1046 |
+
#pragma HLS array_partition variable=acc_vec dim=2 complete
|
| 1047 |
+
#pragma HLS array_partition variable=acc_vec dim=3 complete
|
| 1048 |
+
|
| 1049 |
+
for(int ii = 0; ii < 8; ii++){
|
| 1050 |
+
#pragma HLS unroll
|
| 1051 |
+
for(int kk = 0; kk < 16; kk++){
|
| 1052 |
+
#pragma HLS unroll
|
| 1053 |
+
for(int k = 0; k < 8; k++){
|
| 1054 |
+
#pragma HLS unroll
|
| 1055 |
+
acc_vec[ii][kk][k] = 0;
|
| 1056 |
+
}
|
| 1057 |
+
}
|
| 1058 |
+
}
|
| 1059 |
+
|
| 1060 |
+
compute:
|
| 1061 |
+
for(int k = 0; k < k_bound; k++){
|
| 1062 |
+
#pragma HLS pipeline II=1
|
| 1063 |
+
|
| 1064 |
+
ap_uint<64> op1_mtx[16];
|
| 1065 |
+
ap_uint<64> op2_mtx[16];
|
| 1066 |
+
#pragma HLS array_partition variable=op1_mtx complete
|
| 1067 |
+
#pragma HLS array_partition variable=op2_mtx complete
|
| 1068 |
+
|
| 1069 |
+
ap_uint<1024> recv_pkt;
|
| 1070 |
+
|
| 1071 |
+
if(stage == 3) {
|
| 1072 |
+
recv_pkt = fifo_context.read();
|
| 1073 |
+
} else if(stage != 2) {
|
| 1074 |
+
recv_pkt = fifo_X_in.read();
|
| 1075 |
+
fifo_X_out.write(recv_pkt);
|
| 1076 |
+
}
|
| 1077 |
+
|
| 1078 |
+
for(int ii = 0; ii < 16; ii++){ //TODO: change logic
|
| 1079 |
+
#pragma HLS unroll
|
| 1080 |
+
if(stage == 3){
|
| 1081 |
+
op1_mtx[ii] = ap_uint<64>(W[k*8+ii%8][j*2+ii/8]);
|
| 1082 |
+
op2_mtx[ii] = recv_pkt(ii*64+63, ii*64);
|
| 1083 |
+
} else if(stage != 2) {
|
| 1084 |
+
op1_mtx[ii] = ap_uint<64>(W[j*16+ii][k]);
|
| 1085 |
+
op2_mtx[ii] = recv_pkt(ii*64+63, ii*64);
|
| 1086 |
+
} else {
|
| 1087 |
+
op1_mtx[ii] = ap_uint<64>(cache_attn[k][ii]);
|
| 1088 |
+
op2_mtx[ii] = scratchpad[k*8+ii/2][j*2+(ii%2)];
|
| 1089 |
+
}
|
| 1090 |
+
}
|
| 1091 |
+
|
| 1092 |
+
for(int ii = 0; ii < 8; ii++){
|
| 1093 |
+
#pragma HLS unroll
|
| 1094 |
+
for(int kk = 0; kk < 16; kk++){
|
| 1095 |
+
#pragma HLS unroll
|
| 1096 |
+
for(int l = 0; l < 8; l++){
|
| 1097 |
+
#pragma HLS unroll
|
| 1098 |
+
ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
|
| 1099 |
+
op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
|
| 1100 |
+
op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
|
| 1101 |
+
op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
|
| 1102 |
+
ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
|
| 1103 |
+
acc_vec[ii][kk][l] += w_pack * op3;
|
| 1104 |
+
}
|
| 1105 |
+
}
|
| 1106 |
+
}
|
| 1107 |
+
}
|
| 1108 |
+
|
| 1109 |
+
ap_int<22> acc_final[16][16];
|
| 1110 |
+
#pragma HLS array_partition variable=acc_final dim=1 complete
|
| 1111 |
+
#pragma HLS array_partition variable=acc_final dim=2 complete
|
| 1112 |
+
|
| 1113 |
+
for(int ii = 0; ii < 16; ii++){
|
| 1114 |
+
#pragma HLS unroll
|
| 1115 |
+
for(int k = 0; k < 16; k++){
|
| 1116 |
+
#pragma HLS unroll
|
| 1117 |
+
acc_final[ii][k] = 0;
|
| 1118 |
+
}
|
| 1119 |
+
}
|
| 1120 |
+
|
| 1121 |
+
reduction:
|
| 1122 |
+
for(int kk = 0; kk < 8; kk++){
|
| 1123 |
+
for(int ii = 0; ii < 16; ii++){
|
| 1124 |
+
#pragma HLS unroll
|
| 1125 |
+
for(int k = 0; k < 8; k++){
|
| 1126 |
+
#pragma HLS unroll
|
| 1127 |
+
ap_int<19> res0; ap_int<19> res1;
|
| 1128 |
+
(res1, res0) = acc_vec[kk][ii][k];
|
| 1129 |
+
res1 = res1 + res0[18];
|
| 1130 |
+
acc_final[ii][k*2] += res0;
|
| 1131 |
+
acc_final[ii][k*2+1] += res1;
|
| 1132 |
+
if(kk == 7 && stage != 3) {
|
| 1133 |
+
acc_final[ii][k*2] = acc_final[ii][k*2] >> 8; // rescale & clamp
|
| 1134 |
+
acc_final[ii][k*2+1] = acc_final[ii][k*2+1] >> 8; // rescale & clamp
|
| 1135 |
+
}
|
| 1136 |
+
}
|
| 1137 |
+
}
|
| 1138 |
+
}
|
| 1139 |
+
|
| 1140 |
+
if(stage == 0){
|
| 1141 |
+
for(int ii = 0; ii < 16; ii++){
|
| 1142 |
+
#pragma HLS unroll
|
| 1143 |
+
for(int k = 0; k < 16; k++){
|
| 1144 |
+
#pragma HLS unroll
|
| 1145 |
+
int offset = k%8;
|
| 1146 |
+
scratchpad[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[k][ii]);
|
| 1147 |
+
}
|
| 1148 |
+
}
|
| 1149 |
+
} else if (stage == 2){
|
| 1150 |
+
for(int ii = 0; ii < 2; ii++){
|
| 1151 |
+
#pragma HLS pipeline II=1
|
| 1152 |
+
ap_uint<1024> tmp;
|
| 1153 |
+
for(int jj = 0; jj < 8; jj++){
|
| 1154 |
+
#pragma HLS unroll
|
| 1155 |
+
for(int k = 0; k < 16; k++){
|
| 1156 |
+
#pragma HLS unroll
|
| 1157 |
+
tmp((jj*16+k)*8+7, (jj*16+k)*8) = ap_int<8>(acc_final[k][ii*8+jj]);
|
| 1158 |
+
}
|
| 1159 |
+
}
|
| 1160 |
+
fifo_O_out.write(tmp);
|
| 1161 |
+
}
|
| 1162 |
+
} else if (stage == 1){
|
| 1163 |
+
for(int ii = 0; ii < 16; ii++){
|
| 1164 |
+
ap_uint<128> tmp;
|
| 1165 |
+
for(int k = 0; k < 16; k++){
|
| 1166 |
+
#pragma HLS unroll
|
| 1167 |
+
tmp(k*8+7, k*8) = ap_int<8>(acc_final[ii][k]);
|
| 1168 |
+
}
|
| 1169 |
+
fifo_to_acc0.write(tmp);
|
| 1170 |
+
}
|
| 1171 |
+
} else {
|
| 1172 |
+
final_acc:
|
| 1173 |
+
for(int ii = 0; ii < 16;){
|
| 1174 |
+
#pragma HLS pipeline II=1
|
| 1175 |
+
if(!fifo_reduce_recv.empty()){
|
| 1176 |
+
ap_uint<512> tmp_recv; fifo_reduce_recv.try_read(tmp_recv);
|
| 1177 |
+
ap_uint<512> tmp;
|
| 1178 |
+
for(int k = 0; k < 16; k++){
|
| 1179 |
+
#pragma HLS unroll
|
| 1180 |
+
acc_final[ii][k] += ap_int<22>(tmp_recv(k*32+21, k*32));
|
| 1181 |
+
tmp(k*32+21, k*32) = acc_final[ii][k];
|
| 1182 |
+
}
|
| 1183 |
+
fifo_reduce_send.write(tmp);
|
| 1184 |
+
ii++;
|
| 1185 |
+
}
|
| 1186 |
+
}
|
| 1187 |
+
}
|
| 1188 |
+
}
|
| 1189 |
+
}
|
| 1190 |
+
}
|
| 1191 |
+
fifo_fin.write(true);
|
| 1192 |
+
|
| 1193 |
+
// write out for debug
|
| 1194 |
+
// write:
|
| 1195 |
+
// for(int i = 0; i < L; i++){
|
| 1196 |
+
// for(int j = 0; j < D_head_div_8; j++){
|
| 1197 |
+
// #pragma HLS pipeline II=1
|
| 1198 |
+
// fifo_O_out.write(scratchpad_out[i][j]);
|
| 1199 |
+
// }
|
| 1200 |
+
// }
|
| 1201 |
+
}
|
| 1202 |
+
|
| 1203 |
+
void sfu_buffer( // double buffering
|
| 1204 |
+
const int L,
|
| 1205 |
+
tapa::istream<ap_uint<512>>& fifo_data_in,
|
| 1206 |
+
tapa::ostream<ap_uint<512>>& fifo_data_out
|
| 1207 |
+
){
|
| 1208 |
+
for(int stage = 0; stage < 5; stage++){
|
| 1209 |
+
|
| 1210 |
+
for(int l = 0; l < (L >> 5); l++){
|
| 1211 |
+
float sum[8][16];
|
| 1212 |
+
float cache[MAX_SEQ_LEN][16];
|
| 1213 |
+
#pragma HLS array_partition variable=cache dim=2 complete
|
| 1214 |
+
#pragma HLS array_partition variable=sum dim=2 complete
|
| 1215 |
+
|
| 1216 |
+
for(int i = 0; i < 8; i++){
|
| 1217 |
+
for(int j = 0; j < 16; j++){
|
| 1218 |
+
#pragma HLS unroll
|
| 1219 |
+
sum[i][j] = 0.0;
|
| 1220 |
+
}
|
| 1221 |
+
}
|
| 1222 |
+
|
| 1223 |
+
acc:
|
| 1224 |
+
for(int i = 0; i < L; i++){
|
| 1225 |
+
#pragma HLS pipeline II=1
|
| 1226 |
+
#pragma HLS dependence false variable=sum
|
| 1227 |
+
#pragma HLS dependence true variable=sum distance=8
|
| 1228 |
+
ap_uint<512> tmp = fifo_data_in.read();
|
| 1229 |
+
for(int k = 0; k < 16; k++){
|
| 1230 |
+
#pragma HLS unroll
|
| 1231 |
+
float res = tapa::bit_cast<float>(ap_int<32>(tmp(k*32+31, k*32)));
|
| 1232 |
+
sum[i%8][k] += res;
|
| 1233 |
+
cache[i][k] = res;
|
| 1234 |
+
}
|
| 1235 |
+
}
|
| 1236 |
+
|
| 1237 |
+
reduce:
|
| 1238 |
+
for(int i = 1; i < 8; i++){
|
| 1239 |
+
for(int j = 0; j < 8; j++){
|
| 1240 |
+
#pragma HLS pipeline II=1
|
| 1241 |
+
#pragma HLS dependence true variable=sum distance=8
|
| 1242 |
+
for(int k = 0; k < 2; k++){
|
| 1243 |
+
sum[0][j*2+k] += sum[i][j*2+k];
|
| 1244 |
+
}
|
| 1245 |
+
}
|
| 1246 |
+
}
|
| 1247 |
+
|
| 1248 |
+
ap_uint<512> tmp;
|
| 1249 |
+
for(int i = 0; i < 16; i++){
|
| 1250 |
+
#pragma HLS unroll
|
| 1251 |
+
tmp(i*32+31, i*32) = tapa::bit_cast<ap_uint<32>>(sum[0][i]);
|
| 1252 |
+
}
|
| 1253 |
+
fifo_data_out.write(tmp);
|
| 1254 |
+
|
| 1255 |
+
write:
|
| 1256 |
+
for(int i = 0; i < L; i++){
|
| 1257 |
+
#pragma HLS pipeline II=1
|
| 1258 |
+
ap_uint<512> tmp;
|
| 1259 |
+
for(int j = 0; j < 16; j++){
|
| 1260 |
+
#pragma HLS unroll
|
| 1261 |
+
tmp(j*32+31, j*32) = tapa::bit_cast<ap_uint<32>>(cache[i][j]);
|
| 1262 |
+
}
|
| 1263 |
+
fifo_data_out.write(tmp);
|
| 1264 |
+
}
|
| 1265 |
+
|
| 1266 |
+
}
|
| 1267 |
+
}
|
| 1268 |
+
|
| 1269 |
+
}
|
| 1270 |
+
|
| 1271 |
+
void sfu_acc_exp(
|
| 1272 |
+
const int L,
|
| 1273 |
+
tapa::istream<ap_uint<512>>& fifo_data_in,
|
| 1274 |
+
tapa::ostreams<ap_uint<512>, 2>& fifo_buf
|
| 1275 |
+
) {
|
| 1276 |
+
for(int stage = 0; stage < 5; stage++){
|
| 1277 |
+
|
| 1278 |
+
for(int l = 0; l < (L >> 4); l++){
|
| 1279 |
+
exp_acc:
|
| 1280 |
+
for(int i = 0; i < L;){
|
| 1281 |
+
#pragma HLS pipeline II=1
|
| 1282 |
+
if(!fifo_data_in.empty()){
|
| 1283 |
+
ap_uint<512> tmp; fifo_data_in.try_read(tmp);
|
| 1284 |
+
ap_uint<512> tmp_o;
|
| 1285 |
+
for(int k = 0; k < 16; k++){
|
| 1286 |
+
#pragma HLS unroll
|
| 1287 |
+
int res = tapa::bit_cast<int>(ap_int<32>(tmp(k*32+31, k*32)));
|
| 1288 |
+
float res_exp = 0.0;
|
| 1289 |
+
res_exp = hls::exp(ap_int<32>(res >> 10));
|
| 1290 |
+
tmp_o(k*32+31, k*32) = tapa::bit_cast<ap_uint<32>>(res_exp);
|
| 1291 |
+
}
|
| 1292 |
+
fifo_buf[l%2].write(tmp_o);
|
| 1293 |
+
i++;
|
| 1294 |
+
}
|
| 1295 |
+
}
|
| 1296 |
+
}
|
| 1297 |
+
}
|
| 1298 |
+
}
|
| 1299 |
+
|
| 1300 |
+
void sfu_norm(
|
| 1301 |
+
const int L,
|
| 1302 |
+
tapa::istreams<ap_uint<512>, 2>& fifo_buf,
|
| 1303 |
+
tapa::ostream<ap_uint<128>>& fifo_data_out
|
| 1304 |
+
){
|
| 1305 |
+
for(int stage = 0; stage < 5; stage++){
|
| 1306 |
+
|
| 1307 |
+
for(int l = 0; l < (L >> 4); l++){
|
| 1308 |
+
float sum[16];
|
| 1309 |
+
#pragma HLS array_partition variable=sum complete
|
| 1310 |
+
|
| 1311 |
+
ap_uint<512> tmp_in = fifo_buf[l%2].read();
|
| 1312 |
+
|
| 1313 |
+
for(int i = 0; i < 16; i++){
|
| 1314 |
+
#pragma HLS unroll
|
| 1315 |
+
sum[i] = 32.0 / tapa::bit_cast<float>(ap_uint<32>(tmp_in(i*32+31, i*32)));
|
| 1316 |
+
}
|
| 1317 |
+
|
| 1318 |
+
for(int i = 0; i < L;){
|
| 1319 |
+
#pragma HLS pipeline II=1
|
| 1320 |
+
if(!fifo_buf[l%2].empty()){
|
| 1321 |
+
ap_uint<512> tmp_cache; fifo_buf[l%2].try_read(tmp_cache);
|
| 1322 |
+
ap_uint<128> tmp;
|
| 1323 |
+
for(int j = 0; j < 16; j++){
|
| 1324 |
+
#pragma HLS unroll
|
| 1325 |
+
ap_int<8> res = (int) (tapa::bit_cast<float>(ap_uint<32>(tmp_cache(j*32+31, j*32))) * sum[j]);
|
| 1326 |
+
tmp(j*8 + 7, j*8) = res;
|
| 1327 |
+
}
|
| 1328 |
+
fifo_data_out.write(tmp);
|
| 1329 |
+
i++;
|
| 1330 |
+
}
|
| 1331 |
+
}
|
| 1332 |
+
}
|
| 1333 |
+
}
|
| 1334 |
+
}
|
| 1335 |
+
|
| 1336 |
+
void context_buffer(
|
| 1337 |
+
const int L,
|
| 1338 |
+
tapa::istream<ap_uint<1024>>& fifo_context,
|
| 1339 |
+
tapa::ostream<ap_uint<1024>>& fifo_to_acc0,
|
| 1340 |
+
tapa::ostream<ap_uint<1024>>& fifo_to_acc1
|
| 1341 |
+
){
|
| 1342 |
+
ap_uint<64> context[MAX_SEQ_LEN][CONTEXT_D];
|
| 1343 |
+
#pragma HLS array_partition variable=context cyclic dim=1 factor=32
|
| 1344 |
+
#pragma HLS bind_storage variable=context type=ram_2p impl=uram
|
| 1345 |
+
|
| 1346 |
+
for(int stage = 0; stage < 5; stage++){
|
| 1347 |
+
for(int i = 0; i < (L >> 4); i++){
|
| 1348 |
+
for(int j = stage * D_head_div_8; j < (stage + 1) * D_head_div_8;){
|
| 1349 |
+
if(!fifo_context.empty()){
|
| 1350 |
+
ap_uint<1024> tmp; fifo_context.try_read(tmp);
|
| 1351 |
+
for(int ii = 0; ii < 16; ii++){
|
| 1352 |
+
#pragma HLS unroll
|
| 1353 |
+
context[i*16+ii][j] = tmp(ii*64+63, ii*64);
|
| 1354 |
+
}
|
| 1355 |
+
j++;
|
| 1356 |
+
}
|
| 1357 |
+
}
|
| 1358 |
+
}
|
| 1359 |
+
}
|
| 1360 |
+
|
| 1361 |
+
// NOTE: change it to write to HBM for debugging
|
| 1362 |
+
// write ops to acc0 and acc1 in parallel
|
| 1363 |
+
for(int stage = 0; stage < 5; stage++){
|
| 1364 |
+
for(int i = 0; i < (L >> 5); i++){
|
| 1365 |
+
for(int l = 0; l < D_div_16; l++){
|
| 1366 |
+
for(int j = 0; j < D_head_div_8; j++){
|
| 1367 |
+
ap_uint<1024> tmp_acc0;
|
| 1368 |
+
ap_uint<1024> tmp_acc1;
|
| 1369 |
+
for(int k = 0; k < 16; k++){
|
| 1370 |
+
#pragma HLS unroll
|
| 1371 |
+
tmp_acc0(k*64+63, k*64) = context[i*32+k][j];
|
| 1372 |
+
tmp_acc1(k*64+63, k*64) = context[i*32+16+k][j];
|
| 1373 |
+
}
|
| 1374 |
+
fifo_to_acc0.write(tmp_acc0);
|
| 1375 |
+
fifo_to_acc1.write(tmp_acc1);
|
| 1376 |
+
}
|
| 1377 |
+
}
|
| 1378 |
+
}
|
| 1379 |
+
}
|
| 1380 |
+
|
| 1381 |
+
}
|
| 1382 |
+
|
| 1383 |
+
// Counts loop iterations until every fifo_fin stream holds a token, then
// drains one token from each stream and stores the count in cycle_count[0].
//
// NOTE(review): the stored value is the number of polling-loop iterations;
// it equals clock cycles only if the loop is scheduled at one iteration per
// cycle -- confirm against the HLS report.
//
// Parameters:
//   fifo_fin     TOTAL_PORT completion streams.
//   cycle_count  memory-mapped output; only element 0 is written.
void measure_cycle(tapa::istreams<bool, TOTAL_PORT>& fifo_fin, tapa::mmap<int> cycle_count){
    int cycle = 0;
    while(true){
        // True only when no stream is empty.
        bool all_done = true;
        for(int port = 0; port < TOTAL_PORT; port++){
            all_done &= !fifo_fin[port].empty();
        }
        if(all_done){
            for(int port = 0; port < TOTAL_PORT; port++){
                fifo_fin[port].read(nullptr);
            }
            cycle_count[0] = cycle;
            break;
        }
        cycle++;
    }
}
|
| 1398 |
+
|
| 1399 |
+
void opt_kernel(
|
| 1400 |
+
const int L,
|
| 1401 |
+
const int L_out,
|
| 1402 |
+
const int seq_len,
|
| 1403 |
+
// tapa::mmap<int> inst, // inst[0] = L, inst[1] = reload_weight
|
| 1404 |
+
tapa::mmap<ap_uint<512>> X_acc0,
|
| 1405 |
+
tapa::mmap<ap_uint<512>> X_acc1,
|
| 1406 |
+
tapa::mmap<ap_uint<512>> W_acc0,
|
| 1407 |
+
tapa::mmap<ap_uint<512>> W_acc1,
|
| 1408 |
+
tapa::mmap<ap_uint<64>> acc0_out,
|
| 1409 |
+
tapa::mmap<ap_uint<64>> acc1_out,
|
| 1410 |
+
tapa::mmap<int> cycle_count
|
| 1411 |
+
){
|
| 1412 |
+
tapa::streams<int, NUM_SLR+1, 4> fifo_inst_acc0("fifo_inst_acc0");
|
| 1413 |
+
tapa::streams<int, NUM_SLR+1, 4> fifo_inst_acc1("fifo_inst_acc1");
|
| 1414 |
+
tapa::stream<ap_uint<512>, 16> fifo_X_acc0_slr0("fifo_X_acc0_slr0");
|
| 1415 |
+
tapa::stream<ap_uint<512>, 16> fifo_X_acc1_slr0("fifo_X_acc1_slr0");
|
| 1416 |
+
tapa::streams<ap_uint<1024>, NUM_SLR, 4> fifo_X_acc0("fifo_X_acc0");
|
| 1417 |
+
tapa::streams<ap_uint<1024>, NUM_SLR, 4> fifo_X_acc1("fifo_X_acc1");
|
| 1418 |
+
tapa::streams<ap_uint<512>, NUM_SLR+1, 8> fifo_W_acc0("fifo_W_acc0");
|
| 1419 |
+
tapa::streams<ap_uint<512>, NUM_SLR+1, 8> fifo_W_acc1("fifo_W_acc1");
|
| 1420 |
+
// tapa::streams<ap_uint<512>, NUM_SLR, 4> fifo_acc0_out("fifo_acc0_out");
|
| 1421 |
+
tapa::streams<ap_uint<512>, NUM_SLR> fifo_acc0_to_sfu("fifo_acc0_to_sfu");
|
| 1422 |
+
tapa::streams<ap_uint<512>, NUM_SLR*2> fifo_sfu_buf_in("fifo_sfu_buf_in");
|
| 1423 |
+
tapa::streams<ap_uint<512>, NUM_SLR*2> fifo_sfu_buf_out("fifo_sfu_buf_out");
|
| 1424 |
+
// tapa::streams<ap_uint<64>, NUM_SLR> fifo_acc1_out("fifo_acc1_out");
|
| 1425 |
+
tapa::streams<ap_uint<128>, NUM_SLR, 2> fifo_from_acc1_to_acc0("fifo_from_acc1_to_acc0");
|
| 1426 |
+
tapa::streams<ap_uint<128>, NUM_SLR, 2> fifo_from_sfu_to_acc1("fifo_from_sfu_to_acc1");
|
| 1427 |
+
tapa::streams<bool, NUM_SLR*2> fifo_fin("fifo_fin");
|
| 1428 |
+
|
| 1429 |
+
tapa::streams<ap_uint<1024>, NUM_SLR> fifo_context("fifo_context");
|
| 1430 |
+
tapa::streams<ap_uint<1024>, NUM_SLR> fifo_cont_to_acc0("fifo_cont_to_acc0");
|
| 1431 |
+
tapa::streams<ap_uint<1024>, NUM_SLR> fifo_cont_to_acc1("fifo_cont_to_acc1");
|
| 1432 |
+
tapa::streams<ap_uint<512>, NUM_SLR> fifo_reduce_acc0("fifo_reduce_acc0");
|
| 1433 |
+
tapa::streams<ap_uint<512>, NUM_SLR> fifo_reduce_acc1("fifo_reduce_acc1");
|
| 1434 |
+
|
| 1435 |
+
tapa::stream<ap_uint<64>> fifo_acc0_out("fifo_acc0_out");
|
| 1436 |
+
tapa::stream<ap_uint<64>> fifo_acc1_out("fifo_acc1_out");
|
| 1437 |
+
|
| 1438 |
+
tapa::task()
|
| 1439 |
+
.invoke<tapa::join>(read_inst, seq_len, fifo_inst_acc0, fifo_inst_acc1)
|
| 1440 |
+
.invoke<tapa::join>(read_W, TOTAL_WEIGHT_SIZE, W_acc0, fifo_W_acc0)
|
| 1441 |
+
.invoke<tapa::join>(read_W, TOTAL_WEIGHT_SIZE, W_acc1, fifo_W_acc1)
|
| 1442 |
+
.invoke<tapa::join>(read_X, L, X_acc0, fifo_X_acc0_slr0)
|
| 1443 |
+
.invoke<tapa::join>(read_X, L, X_acc1, fifo_X_acc1_slr0)
|
| 1444 |
+
.invoke<tapa::join>(
|
| 1445 |
+
temporal_acc0_slr0,
|
| 1446 |
+
seq_len,
|
| 1447 |
+
fifo_inst_acc0, fifo_inst_acc0,
|
| 1448 |
+
fifo_X_acc0_slr0, fifo_X_acc0,
|
| 1449 |
+
fifo_W_acc0, fifo_W_acc0,
|
| 1450 |
+
fifo_from_acc1_to_acc0,
|
| 1451 |
+
fifo_acc0_to_sfu,
|
| 1452 |
+
fifo_cont_to_acc0,
|
| 1453 |
+
fifo_reduce_acc0,
|
| 1454 |
+
fifo_acc0_out,
|
| 1455 |
+
fifo_fin
|
| 1456 |
+
)
|
| 1457 |
+
.invoke<tapa::join>(
|
| 1458 |
+
temporal_acc1_slr0,
|
| 1459 |
+
seq_len,
|
| 1460 |
+
fifo_inst_acc1, fifo_inst_acc1,
|
| 1461 |
+
fifo_X_acc1_slr0, fifo_X_acc1,
|
| 1462 |
+
fifo_W_acc1, fifo_W_acc1,
|
| 1463 |
+
fifo_from_acc1_to_acc0,
|
| 1464 |
+
fifo_from_sfu_to_acc1,
|
| 1465 |
+
fifo_context,
|
| 1466 |
+
fifo_cont_to_acc1,
|
| 1467 |
+
fifo_reduce_acc1,
|
| 1468 |
+
fifo_acc1_out,
|
| 1469 |
+
fifo_fin
|
| 1470 |
+
)
|
| 1471 |
+
.invoke<tapa::join, NUM_SLR-1>(
|
| 1472 |
+
temporal_acc0,
|
| 1473 |
+
seq_len,
|
| 1474 |
+
fifo_inst_acc0, fifo_inst_acc0,
|
| 1475 |
+
fifo_X_acc0, fifo_X_acc0,
|
| 1476 |
+
fifo_W_acc0, fifo_W_acc0,
|
| 1477 |
+
fifo_from_acc1_to_acc0,
|
| 1478 |
+
fifo_acc0_to_sfu,
|
| 1479 |
+
fifo_cont_to_acc0,
|
| 1480 |
+
fifo_reduce_acc0, fifo_reduce_acc0,
|
| 1481 |
+
fifo_fin
|
| 1482 |
+
)
|
| 1483 |
+
.invoke<tapa::join, NUM_SLR-1>(
|
| 1484 |
+
temporal_acc1,
|
| 1485 |
+
seq_len,
|
| 1486 |
+
fifo_inst_acc1, fifo_inst_acc1,
|
| 1487 |
+
fifo_X_acc1, fifo_X_acc1,
|
| 1488 |
+
fifo_W_acc1, fifo_W_acc1,
|
| 1489 |
+
fifo_from_acc1_to_acc0,
|
| 1490 |
+
fifo_from_sfu_to_acc1,
|
| 1491 |
+
fifo_context,
|
| 1492 |
+
fifo_cont_to_acc1,
|
| 1493 |
+
fifo_reduce_acc1, fifo_reduce_acc1,
|
| 1494 |
+
fifo_fin
|
| 1495 |
+
)
|
| 1496 |
+
.invoke<tapa::join>(write_zero, seq_len, fifo_reduce_acc0)
|
| 1497 |
+
.invoke<tapa::join>(write_zero, seq_len, fifo_reduce_acc1)
|
| 1498 |
+
.invoke<tapa::join, NUM_SLR>(
|
| 1499 |
+
sfu_acc_exp, seq_len,
|
| 1500 |
+
fifo_acc0_to_sfu,
|
| 1501 |
+
fifo_sfu_buf_in
|
| 1502 |
+
)
|
| 1503 |
+
.invoke<tapa::join, NUM_SLR*2>(
|
| 1504 |
+
sfu_buffer, seq_len,
|
| 1505 |
+
fifo_sfu_buf_in,
|
| 1506 |
+
fifo_sfu_buf_out
|
| 1507 |
+
)
|
| 1508 |
+
.invoke<tapa::join, NUM_SLR>(
|
| 1509 |
+
sfu_norm, seq_len,
|
| 1510 |
+
fifo_sfu_buf_out,
|
| 1511 |
+
fifo_from_sfu_to_acc1
|
| 1512 |
+
)
|
| 1513 |
+
.invoke<tapa::join, NUM_SLR>(
|
| 1514 |
+
context_buffer, seq_len,
|
| 1515 |
+
fifo_context,
|
| 1516 |
+
fifo_cont_to_acc0, fifo_cont_to_acc1
|
| 1517 |
+
)
|
| 1518 |
+
// .invoke<tapa::join, NUM_SLR>(write_attention, seq_len, acc0_out, fifo_acc0_out)
|
| 1519 |
+
.invoke<tapa::join>(write_mtx, L_out, acc0_out, fifo_acc0_out)
|
| 1520 |
+
.invoke<tapa::join>(write_mtx, L_out, acc1_out, fifo_acc1_out)
|
| 1521 |
+
.invoke<tapa::join>(measure_cycle, fifo_fin, cycle_count)
|
| 1522 |
+
.invoke<tapa::detach>(black_hole_int, fifo_inst_acc0)
|
| 1523 |
+
.invoke<tapa::detach>(black_hole_int, fifo_inst_acc1)
|
| 1524 |
+
.invoke<tapa::detach>(black_hole_ap_uint_1024, fifo_X_acc0)
|
| 1525 |
+
.invoke<tapa::detach>(black_hole_ap_uint_1024, fifo_X_acc1)
|
| 1526 |
+
.invoke<tapa::detach>(black_hole_ap_uint_512, fifo_W_acc0)
|
| 1527 |
+
.invoke<tapa::detach>(black_hole_ap_uint_512, fifo_W_acc1);
|
| 1528 |
+
}
|
gpt-2-medium/link_config_versal.ini
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[connectivity]
|
| 2 |
+
sp=opt_kernel.X_acc0:DDR
|
| 3 |
+
sp=opt_kernel.X_acc1:DDR
|
| 4 |
+
sp=opt_kernel.W_acc0:DDR
|
| 5 |
+
sp=opt_kernel.W_acc1:DDR
|
| 6 |
+
sp=opt_kernel.acc0_out:DDR
|
| 7 |
+
sp=opt_kernel.cycle_count:DDR
|
gpt-2-medium/opt-versal-rs.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from rapidstream import RapidStreamTAPA, DeviceFactory
|
| 2 |
+
|
| 3 |
+
# RapidStream project rooted at rs_build/; reset() is called before any
# configuration is applied.
rs = RapidStreamTAPA("rs_build/")
rs.reset()

# Virtual device described as a grid of slots (2 columns x 4 rows).
factory = DeviceFactory(row=4, col=2, part_num="xcvp1802-lsvc4072-2MP-e-S")

# Set the pblocks of the device so that each slot contains half of an SLR.
# Each entry is (slot x, slot y, clock-region rectangle); the order matches
# the order in which the slots are registered with the factory.
SLOT_PBLOCKS = [
    (0, 0, "CLOCKREGION_X0Y1:CLOCKREGION_X4Y4"),
    (1, 0, "CLOCKREGION_X5Y1:CLOCKREGION_X9Y4"),
    (0, 1, "CLOCKREGION_X0Y5:CLOCKREGION_X4Y7"),
    (1, 1, "CLOCKREGION_X5Y5:CLOCKREGION_X9Y7"),
    (0, 2, "CLOCKREGION_X0Y8:CLOCKREGION_X4Y10"),
    (1, 2, "CLOCKREGION_X5Y8:CLOCKREGION_X9Y10"),
    (0, 3, "CLOCKREGION_X0Y11:CLOCKREGION_X4Y13"),
    (1, 3, "CLOCKREGION_X5Y11:CLOCKREGION_X9Y13"),
]
for slot_x, slot_y, clock_regions in SLOT_PBLOCKS:
    factory.set_slot_pblock(slot_x, slot_y, [f"-add {clock_regions}"])

# Northward crossing capacity for every slot except the top row (y == 3).
# 9435 is half of the 18870 SLL nodes quoted by the original comment.
# NOTE(review): that comment cited VP1552 while part_num above is an
# xcvp1802 device -- confirm the figure applies to this part.
for slot_y in range(3):
    for slot_x in range(2):
        factory.set_slot_crossing_capacity(slot_x, slot_y, north=9435)

# Call factory to extract the slot resources automatically from Vivado.
factory.extract_slot_resources()

# The generated device is supplied as the virtual device for RapidStream.
rs.set_virtual_device(factory.generate_virtual_device())

# Design inputs: packed TAPA object, top module name and a 3.33 ns clock.
rs.add_xo_file("./opt-stage4-dot-prod.tapa/opt.hw.xo")
rs.set_top_module_name("opt_kernel")
rs.add_clock("ap_clk", period_ns=3.33)

# Connectivity file, port placement (all ports in the bottom slot row) and
# the design-space exploration run.
rs.set_vitis_connectivity_config("link_config_versal.ini")
rs.assign_port_to_region(".*", "SLOT_X0Y0:SLOT_X1Y0")
rs.run_dse(max_workers=1, max_dse_limit=0.9, min_dse_limit=0.6)
|
gpt-2-medium/package_sample.sh
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Packages an already-linked VPK180 design (.xsa) into an SD-card image plus
# .xclbin using `v++ --package`. Edit the /path/to/... placeholders first.

# TARGET=hw
TARGET=hw_emu
DEBUG=-g

TOP=opt_kernel
# NOTE: XO, CONSTRAINT, TARGET_FREQUENCY, MAX_SYNTH_JOBS, STRATEGY and
# PLACEMENT_STRATEGY are not referenced by the commands in this script; they
# are kept so the settings stay editable in one place.
XO='/path/to/opt_kernel.xo'
CONSTRAINT='/path/to/constraints.tcl'
>&2 echo "Using the default clock target of the platform."
PLATFORM="/path/to/vpk180_pfm_vitis.xpfm"
VERSAL="/path/to/xilinx-versal-common-v2023.2"
TARGET_FREQUENCY=300000000

# Quote the variable so a path with spaces does not break the test, and exit
# non-zero so callers can detect the misconfiguration.
if [ -z "${PLATFORM}" ]; then
  >&2 echo "Please edit this file and set a valid PLATFORM= on line ${LINENO}"
  exit 1
fi

OUTPUT_DIR="$(pwd)/vitis_run_${TARGET}_ln"

MAX_SYNTH_JOBS=16
STRATEGY="Default"
PLACEMENT_STRATEGY="Default"

# Emulation configuration for the platform; written into OUTPUT_DIR and copied
# onto the SD image below.
emconfigutil --platform "${PLATFORM}" --od "${OUTPUT_DIR}/"

# ${DEBUG} is intentionally unquoted: when it is empty it must expand to no
# argument at all rather than to an empty string.
v++ ${DEBUG} \
  --platform "${PLATFORM}" \
  --target "${TARGET}" \
  --package \
  "${OUTPUT_DIR}/${TOP}_vpk180.xsa" \
  --temp_dir "${OUTPUT_DIR}/${TOP}_vpk180.temp/package.build" \
  --save-temps \
  --package.out_dir "${OUTPUT_DIR}/package" \
  --package.boot_mode sd \
  --package.rootfs "${VERSAL}/rootfs.ext4" \
  --package.kernel_image "${VERSAL}/Image" \
  --package.sd_file "${OUTPUT_DIR}/emconfig.json" \
  --package.sd_file "./host-opencl" \
  --package.sd_file "./run_app.sh" \
  --package.sd_file "./xrt.ini" \
  -o "${OUTPUT_DIR}/${TOP}_vpk180.xclbin"
|
gpt-2-medium/parse_floorplan.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from enum import Enum, auto
|
| 3 |
+
from typing import Any
|
| 4 |
+
from argparse import ArgumentParser
|
| 5 |
+
|
| 6 |
+
# Command-line interface: the floorplan JSON path is mandatory. Marking it
# required lets argparse report a missing -f/--file with a usage message
# instead of the script failing later when it tries to open None.
parser = ArgumentParser()
parser.add_argument(
    "-f",
    "--file",
    dest="filename",
    type=str,
    required=True,
    help="input floorplan json file",
    metavar="FILE",
)
|
| 9 |
+
|
| 10 |
+
class IREnum(Enum):
    """String constants used when reading the Rapidstream NOC IR."""

    # Pipeline module name and region / parameter keys.
    PIPELINE = "__rs_hs_pipeline"
    REGION = "REGION"
    BODY = "BODY"
    HEAD_REGION = "__HEAD_REGION"
    TAIL_REGION = "__TAIL_REGION"
    DATA_WIDTH = "DATA_WIDTH"
    DEPTH = "DEPTH"
    BODY_LEVEL = "BODY_LEVEL"

    # FIFO interface port names.
    IF_DOUT = "if_dout"
    IF_EMPTY_N = "if_empty_n"
    IF_READ = "if_read"
    IF_DIN = "if_din"
    IF_FULL_N = "if_full_n"
    IF_WRITE = "if_write"

    # Name prefixes and suffixes.
    NMU = "nmu_"
    NSU = "nsu_"
    CC_MASTER = "_cc_master"
    CC_RET = "_cc_ret"

    # Remaining IR keys.
    RS_ROUTE = "RS_ROUTE"
    FLOORPLAN_REGION = "floorplan_region"
    PRAGMAS = "pragmas"
    LIT = "lit"
|
| 35 |
+
|
| 36 |
+
# Pipeline module name -> short tag. parse_floorplan() embeds the tag in the
# per-stage instance names it builds ("<inst>/RS_<tag>_PP_HEAD", ...).
PIPELINE_MAPPING = {
    "__rs_ap_ctrl_start_ready_pipeline": "AP",
    "__rs_ff_pipeline": "FF",
    "__rs_hs_pipeline": "HS",
}
|
| 41 |
+
|
| 42 |
+
def parse_top_mod(ir: dict[str, Any]) -> Any:
    """Parses the top_mod dict in the Rapidstream IR.

    Looks up ``ir["modules"]["top_name"]`` and returns the first entry of
    ``ir["modules"]["module_definitions"]`` whose ``"name"`` equals it.

    Return a dictionary.

    Raises:
        AssertionError: if no module definition carries the top name. The
            message names the missing module to make the failure actionable.

    Example:
    >>> design = {
    ...     "modules": {
    ...         "top_name": "FINDME",
    ...         "module_definitions": [{"name": "FINDME"}],
    ...     }
    ... }
    >>> parse_top_mod(design)
    {'name': 'FINDME'}
    """
    top_name = ir["modules"]["top_name"]
    for definition in ir["modules"]["module_definitions"]:
        if definition["name"] == top_name:
            return definition
    raise AssertionError(
        f"top module {top_name!r} not found in module_definitions"
    )
|
| 62 |
+
|
| 63 |
+
def parse_mod(ir: dict[str, Any], name: str) -> Any:
    """Looks up a module definition by name in the Rapidstream IR.

    Returns the first entry of ``ir["modules"]["module_definitions"]`` whose
    ``"name"`` equals ``name``, or an empty dictionary when none matches.
    """
    definitions = ir["modules"]["module_definitions"]
    return next((entry for entry in definitions if entry["name"] == name), {})
|
| 72 |
+
|
| 73 |
+
def find_repr(source: list[dict[str, Any]], key: str) -> str:
    """Returns the ``"repr"`` of the first expr item recorded for ``key``.

    The expr list is obtained from ``find_expr(source, key)``. When that list
    yields nothing, a warning is printed and an empty string is returned.
    """
    nothing = object()
    first = next(iter(find_expr(source, key)), nothing)
    if first is nothing:
        print(f"WARNING: repr for key {key} not found!")
        return ""
    return str(first["repr"])
|
| 82 |
+
|
| 83 |
+
def find_expr(
|
| 84 |
+
source: list[dict[str, Any | list[dict[str, str]]]], key: str
|
| 85 |
+
) -> list[dict[str, str]]:
|
| 86 |
+
"""Finds the expr value of a key in the Rapidstream list IR.
|
| 87 |
+
|
| 88 |
+
Returns a string.
|
| 89 |
+
"""
|
| 90 |
+
for c in source:
|
| 91 |
+
if c["name"] == key:
|
| 92 |
+
return c["expr"]
|
| 93 |
+
print(f"WARNING: expr for key {key} not found!")
|
| 94 |
+
return []
|
| 95 |
+
|
| 96 |
+
def parse_floorplan(ir: dict[str, Any], grouped_mod_name: str) -> dict[str, list[str]]:
    """Collects the floorplan slot of every instance in the top/grouped module.

    Submodules with a non-null ``"floorplan_region"`` are recorded directly.
    Submodules whose ``"module"`` is listed in ``PIPELINE_MAPPING`` are
    expanded into HEAD, TAIL and BODY_<i> stage instances, each with the region
    read from the submodule's parameters. Any other submodule is ignored.

    Return a dictionary where keys are slots and values are the instance
    names placed in that slot.
    """
    # Instance-name prefix -> list of submodules; "" is the top module.
    combined_mods = {"": parse_top_mod(ir)["submodules"]}
    grouped_mod_ir = parse_mod(ir, grouped_mod_name)
    if grouped_mod_ir:
        combined_mods[f"{grouped_mod_name}_0/"] = grouped_mod_ir["submodules"]

    # Pass 1: instance name -> slot.
    insts = {}
    for parent, mods in combined_mods.items():
        for sub_mod in mods:
            inst_name = parent + sub_mod["name"]
            region = sub_mod["floorplan_region"]
            if region is not None:
                # regular module
                insts[inst_name] = region
                continue
            if sub_mod["module"] not in PIPELINE_MAPPING:
                continue

            # pipeline module: every register stage carries its own slot
            tag = PIPELINE_MAPPING[sub_mod["module"]]
            params = sub_mod["parameters"]
            body_level = find_repr(params, IREnum.BODY_LEVEL.value)
            stage_prefix = f"{inst_name}/RS_{tag}_PP_"
            # The region reprs are stripped of surrounding double quotes.
            insts[stage_prefix + "HEAD"] = find_repr(
                params, IREnum.HEAD_REGION.value
            ).strip('"')
            insts[stage_prefix + "TAIL"] = find_repr(
                params, IREnum.TAIL_REGION.value
            ).strip('"')
            for i in range(int(body_level)):
                insts[f"{stage_prefix}BODY_{i}"] = find_repr(
                    params, f"__BODY_{i}_REGION"
                ).strip('"')

    # Pass 2: invert {instance: slot} into {slot: [instances]}.
    floorplan: dict[str, list[str]] = {}
    for inst_name, slot in insts.items():
        assert slot is not None, f"{inst_name} cannot have null slot!"
        floorplan.setdefault(slot, []).append(inst_name)
    return floorplan
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def extract_slot_coord(slot_name: str) -> tuple[int, int]:
    """Reads the slot grid position out of a slot name.

    The x value is the text between the first "X" and the following "Y"; the
    y value is the text after the first "Y" (up to the next "Y", if any).

    Returns a coordinate tuple as (x, y) in int.

    Example:
    >>> extract_slot_coord("SLOT_X0Y1")
    (0, 1)
    """
    after_first_x = slot_name.split("X")[1]
    x_text = after_first_x.split("Y")[0]
    y_text = slot_name.split("Y")[1]
    return int(x_text), int(y_text)
|
| 151 |
+
|
| 152 |
+
def export_constraint(floorplan: dict[str, list[str]], kernel_name: str) -> list[str]:
    """Builds Tcl floorplan constraints from the floorplan dictionary.

    Only the keys of ``floorplan`` (slot names of the form ``A_TO_B`` with
    ``A == B``) are used. For every slot the output first creates a pblock
    sized to the slot's clock-region rectangle, then assigns the slot's cells
    to it and records cell patterns that match nothing. A final chunk prints
    the unmatched patterns.

    Returns a list of tcl chunks; the caller joins them with newlines.
    """
    # Clock-region rectangle of each slot, indexed as cr_map[x][y].
    cr_map = [
        [
            "CLOCKREGION_X0Y1:CLOCKREGION_X4Y4",
            "CLOCKREGION_X0Y5:CLOCKREGION_X4Y7",
            "CLOCKREGION_X0Y8:CLOCKREGION_X4Y10",
            "CLOCKREGION_X0Y11:CLOCKREGION_X4Y13",
        ],
        [
            "CLOCKREGION_X5Y1:CLOCKREGION_X9Y4",
            "CLOCKREGION_X5Y5:CLOCKREGION_X9Y7",
            "CLOCKREGION_X5Y8:CLOCKREGION_X9Y10",
            "CLOCKREGION_X5Y11:CLOCKREGION_X9Y13",
        ],
    ]

    # Prologue: the accumulator for patterns that match no cell.
    tcl = [
        "\n"
        "\n"
        "# Initialize an empty list to store undefined cells\n"
        "set undefined_cells {}\n"
    ]

    # First pass: one pblock per slot.
    for slot in floorplan:
        slot1, slot2 = slot.split("_TO_")
        assert slot1 == slot2
        x, y = extract_slot_coord(slot1)
        cr = cr_map[x][y]
        tcl.append(
            "\n"
            "# begin defining a slot for logic resources\n"
            f"create_pblock {slot}\n"
            f"resize_pblock {slot} -add {cr}\n"
        )

    # Second pass: cell assignment plus the per-slot "matched nothing" check.
    for slot in floorplan:
        tcl.append(f"set {slot}_cells {{")
        tcl.append(f" ext_platform_i/VitisRegion/{kernel_name}/inst/{slot}_0/.*")
        tcl.append(
            "}\n"
            f"add_cells_to_pblock [get_pblocks {slot}] [get_cells -regex ${slot}_cells]\n"
            "\n"
            "# Iterate through each cell in the list\n"
            f"foreach cell ${slot}_cells {{\n"
            "set defined [llength [get_cells $cell]]\n"
            "if { $defined == 0 } {\n"
            "lappend undefined_cells $cell\n"
            "}\n"
            "}\n"
        )

    # Epilogue: report every pattern that matched nothing.
    tcl.append(
        "\n"
        "if {[llength $undefined_cells] > 0} {\n"
        'puts "Undefined cells:"\n'
        "foreach cell $undefined_cells {\n"
        "puts $cell\n"
        "}\n"
        "}\n"
    )

    return tcl
|
| 212 |
+
|
| 213 |
+
if __name__ == "__main__":
    # Read the floorplan IR named on the command line, convert it to Tcl
    # constraints for the "opt_kernel" kernel and write them to
    # constraints.tcl in the current directory.
    cli_args = parser.parse_args()

    with open(cli_args.filename, "r", encoding="utf-8") as ir_file:
        floorplan_ir = json.load(ir_file)

    # An empty grouped-module name is passed to parse_floorplan here.
    slot_to_instances = parse_floorplan(floorplan_ir, "")
    constraint_chunks = export_constraint(slot_to_instances, "opt_kernel")

    with open("constraints.tcl", "w", encoding="utf-8") as tcl_file:
        tcl_file.write("\n".join(constraint_chunks))
|
gpt-2-medium/run_app.sh
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Runs the host program against the packaged xclbin in hardware emulation.

# Prepend /mnt and /tmp to the library search path. The ${VAR:+...} form only
# appends the previous value when it is set and non-empty, so an unset
# LD_LIBRARY_PATH no longer leaves a trailing ':' (an empty entry, which the
# dynamic loader treats as the current directory).
export LD_LIBRARY_PATH="/mnt:/tmp${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export XCL_EMULATION_MODE=hw_emu
export XILINX_XRT=/usr
export XILINX_VITIS=/mnt

./host-opencl opt_kernel_vpk180.xclbin
|
gpt-2-medium/run_tapa.sh
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Compiles kernel.cpp with tapac for the xilinx_u280_xdma_201920_3 platform,
# with floorplan design-space exploration enabled. The shebang makes the
# script runnable directly, like the other run_*.sh scripts.
tapac \
  -o opt.hw.xo \
  --platform xilinx_u280_xdma_201920_3 \
  --top opt_kernel \
  --work-dir opt-stage3.tapa \
  --connectivity hbm_config.ini \
  --enable-hbm-binding-adjustment \
  --enable-synth-util \
  --run-floorplan-dse \
  --min-area-limit 0.55 \
  --min-slr-width-limit 5000 \
  --max-slr-width-limit 19000 \
  --max-parallel-synth-jobs 16 \
  --floorplan-output opt-floorplan.tcl \
  kernel.cpp
|
gpt-2-medium/run_tapa_rs.sh
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Two tapac invocations over kernel-versal.cpp that share one work directory:
# the first runs the compile steps through top-level RTL generation, the
# second packs the result into the .xo file.

# Loading the module once is enough; both invocations run in this shell.
ml load xilinx/vivado/2024.1

WORK_DIR="opt-stage4-dot-prod.tapa"

# Arguments common to both invocations, in their original order.
common_args=(
  --work-dir "${WORK_DIR}"
  --top opt_kernel
  --part-num xcvp1802-lsvc4072-2MP-e-S
  --clock-period 3.33
  -o "${WORK_DIR}/opt.hw.xo"
  --connectivity link_config_versal.ini
)

# Stop here on failure so the packing step never runs on a broken or stale
# work directory.
if ! tapac "${common_args[@]}" \
  --run-tapacc \
  --run-hls \
  --generate-task-rtl \
  --run-floorplanning \
  --generate-top-rtl \
  kernel-versal.cpp; then
  >&2 echo "tapac compile steps failed; skipping --pack-xo"
  exit 1
fi

tapac "${common_args[@]}" \
  --pack-xo \
  kernel-versal.cpp
|
| 27 |
+
|
| 28 |
+
|
gpt-2-medium/xo/constraints.tcl
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
# Initialize an empty list to store undefined cells
set undefined_cells {}

# Slot name / clock-region rectangle pairs, in the order the pblocks are
# created and populated.
set slot_regions {
    SLOT_X0Y0_TO_SLOT_X0Y0 CLOCKREGION_X0Y1:CLOCKREGION_X4Y4
    SLOT_X0Y2_TO_SLOT_X0Y2 CLOCKREGION_X0Y8:CLOCKREGION_X4Y10
    SLOT_X1Y2_TO_SLOT_X1Y2 CLOCKREGION_X5Y8:CLOCKREGION_X9Y10
    SLOT_X0Y3_TO_SLOT_X0Y3 CLOCKREGION_X0Y11:CLOCKREGION_X4Y13
    SLOT_X1Y3_TO_SLOT_X1Y3 CLOCKREGION_X5Y11:CLOCKREGION_X9Y13
    SLOT_X1Y0_TO_SLOT_X1Y0 CLOCKREGION_X5Y1:CLOCKREGION_X9Y4
    SLOT_X1Y1_TO_SLOT_X1Y1 CLOCKREGION_X5Y5:CLOCKREGION_X9Y7
    SLOT_X0Y1_TO_SLOT_X0Y1 CLOCKREGION_X0Y5:CLOCKREGION_X4Y7
}

# Define one pblock per slot for logic resources.
foreach {slot clock_regions} $slot_regions {
    create_pblock $slot
    resize_pblock $slot -add $clock_regions
}

# Assign each slot's cells to its pblock. The per-slot <slot>_cells variable
# is still set, holding a one-element list with the slot's regex pattern.
foreach {slot clock_regions} $slot_regions {
    set ${slot}_cells [list "ext_platform_i/VitisRegion/opt_kernel/inst/${slot}.*"]
    add_cells_to_pblock [get_pblocks $slot] [get_cells -regex [set ${slot}_cells]]

    # Record every pattern for which get_cells returns nothing.
    foreach cell [set ${slot}_cells] {
        set defined [llength [get_cells $cell]]
        if { $defined == 0 } {
            lappend undefined_cells $cell
        }
    }
}

# Report the patterns that matched no cell, if any.
if {[llength $undefined_cells] > 0} {
    puts "Undefined cells:"
    foreach cell $undefined_cells {
        puts $cell
    }
}
|
gpt-2-medium/xo/opt_kernel.xo
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:50ccf71a9ffd437e6e800624723a66b21391130e200641b2c8c7af0875ef73ce
|
| 3 |
+
size 2049244
|
gpt-2-medium/xrt.ini
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[Emulation]
|
| 2 |
+
debug_mode=batch
|