OswaldHe123 committed on
Commit
033e60e
·
verified ·
1 Parent(s): e05b93e

Upload Bitstreams

Browse files
Files changed (38) hide show
  1. .gitattributes +10 -0
  2. gpt-2-medium/Makefile +30 -0
  3. gpt-2-medium/README.md +26 -0
  4. gpt-2-medium/bitstreams/opt_kernel_latest.xclbin +3 -0
  5. gpt-2-medium/bitstreams/opt_kernel_latest.xclbin.info +497 -0
  6. gpt-2-medium/bitstreams/opt_kernel_stage_4.xclbin +3 -0
  7. gpt-2-medium/bitstreams/opt_kernel_stage_4.xclbin.info +490 -0
  8. gpt-2-medium/bitstreams/opt_kernel_stage_4_27b.xclbin +3 -0
  9. gpt-2-medium/bitstreams/opt_kernel_stage_4_27b.xclbin.info +502 -0
  10. gpt-2-medium/bitstreams/opt_kernel_vpk180.xsa +3 -0
  11. gpt-2-medium/bitstreams/opt_kernel_vpk180_fixed.xsa +3 -0
  12. gpt-2-medium/bitstreams/opt_kernel_vpk180_full.xsa +3 -0
  13. gpt-2-medium/bitstreams/opt_kernel_vpk180_mask.xsa +3 -0
  14. gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_full.info +485 -0
  15. gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_full.xclbin +3 -0
  16. gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_xdma_201920_3.xclbin +3 -0
  17. gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_xdma_201920_3.xclbin.info +502 -0
  18. gpt-2-medium/export_xo.py +52 -0
  19. gpt-2-medium/generate_bitstream_sample.sh +40 -0
  20. gpt-2-medium/hbm_config.ini +7 -0
  21. gpt-2-medium/host-u280.cpp +172 -0
  22. gpt-2-medium/host-versal.cpp +194 -0
  23. gpt-2-medium/host.cpp +194 -0
  24. gpt-2-medium/host_opencl.cpp +273 -0
  25. gpt-2-medium/host_opencl.h +71 -0
  26. gpt-2-medium/kernel-ultrascale.cpp +2091 -0
  27. gpt-2-medium/kernel-versal.cpp +0 -0
  28. gpt-2-medium/kernel.cpp +1528 -0
  29. gpt-2-medium/link_config_versal.ini +7 -0
  30. gpt-2-medium/opt-versal-rs.py +43 -0
  31. gpt-2-medium/package_sample.sh +38 -0
  32. gpt-2-medium/parse_floorplan.py +223 -0
  33. gpt-2-medium/run_app.sh +8 -0
  34. gpt-2-medium/run_tapa.sh +15 -0
  35. gpt-2-medium/run_tapa_rs.sh +28 -0
  36. gpt-2-medium/xo/constraints.tcl +157 -0
  37. gpt-2-medium/xo/opt_kernel.xo +3 -0
  38. gpt-2-medium/xrt.ini +2 -0
.gitattributes CHANGED
@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ gpt-2-medium/bitstreams/opt_kernel_latest.xclbin filter=lfs diff=lfs merge=lfs -text
37
+ gpt-2-medium/bitstreams/opt_kernel_stage_4_27b.xclbin filter=lfs diff=lfs merge=lfs -text
38
+ gpt-2-medium/bitstreams/opt_kernel_stage_4.xclbin filter=lfs diff=lfs merge=lfs -text
39
+ gpt-2-medium/bitstreams/opt_kernel_vpk180_fixed.xsa filter=lfs diff=lfs merge=lfs -text
40
+ gpt-2-medium/bitstreams/opt_kernel_vpk180_full.xsa filter=lfs diff=lfs merge=lfs -text
41
+ gpt-2-medium/bitstreams/opt_kernel_vpk180_mask.xsa filter=lfs diff=lfs merge=lfs -text
42
+ gpt-2-medium/bitstreams/opt_kernel_vpk180.xsa filter=lfs diff=lfs merge=lfs -text
43
+ gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_full.xclbin filter=lfs diff=lfs merge=lfs -text
44
+ gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_xdma_201920_3.xclbin filter=lfs diff=lfs merge=lfs -text
45
+ gpt-2-medium/xo/opt_kernel.xo filter=lfs diff=lfs merge=lfs -text
gpt-2-medium/Makefile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Build rules for the GPT-2-medium FPGA accelerator hosts.
#  - opt350 / opt350-ultrascale / opt350-versal: x86 TAPA host+kernel binaries
#    (linked against tapa/frt/glog/gflags/OpenCL from system lib dirs).
#  - host-opencl: aarch64 XRT/OpenCL host for the Versal PS, cross-compiled
#    against the custom VPK180 platform sysroot.
# Requires XILINX_VITIS / XILINX_VIVADO / XILINX_HLS to be set in the
# environment (sourced from the Vitis settings scripts).
GCC=g++
ARMGCC=$(XILINX_VITIS)/gnu/aarch64/lin/aarch64-linux/bin/aarch64-linux-gnu-g++
SYSROOT=/home/oswaldhe/vpk180_custom_platform/vpk180_custom_platform.vitis/xilinx-versal-common-v2023.2/sysroots/cortexa72-cortexa53-xilinx-linux

# Optional spack-resolved dependency roots; uncomment when building against
# spack-installed tapa/fpga-runtime/glog/gflags instead of system copies.
# TAPA_ROOT=$(shell spack location -i tapa@2024-05-18)
# FRT_ROOT=$(shell spack location -i fpga-runtime)
# GLOG_ROOT=$(shell spack location -i glog/pqucikz)
# GFLAGS_ROOT=$(shell spack location -i gflags/y2uaz43)
INCLUDE_FLAGS=-I$(TAPA_ROOT)/include -I$(FRT_ROOT)/include -I$(GLOG_ROOT)/include -I$(GFLAGS_ROOT)/include -I$(XILINX_HLS)/include
LDFLAGS=-L$(TAPA_ROOT)/lib -L$(FRT_ROOT)/lib -L$(GLOG_ROOT)/lib -L$(GFLAGS_ROOT)/lib -ltapa -lfrt -lglog -lgflags -lm
# RPATH_FLAGS=-Wl,-rpath,$(TAPA_ROOT)/lib -Wl,-rpath,$(FRT_ROOT)/lib -Wl,-rpath,$(GLOG_ROOT)/lib -Wl,-rpath,$(GFLAGS_ROOT)/lib

# clean is a command, not a file: declare it phony so a stray file named
# "clean" cannot mask it.
.PHONY: clean

opt350: kernel.cpp host.cpp
	$(GCC) -o $@ -O2 $^ -L/lib/x86_64-linux-gnu -L/usr/local/lib -ltapa -lfrt -lgflags -lglog -lm -lOpenCL -I$(XILINX_HLS)/include

opt350-ultrascale: kernel-ultrascale.cpp host-u280.cpp
	$(GCC) -o $@ -O2 $^ -L/lib/x86_64-linux-gnu -L/usr/local/lib -ltapa -lfrt -lgflags -lglog -lm -lOpenCL -I$(XILINX_HLS)/include

# Link step for the cross-compiled Versal PS host.
host-opencl: host_opencl.o
	$(ARMGCC) -o $@ $^ -L$(SYSROOT)/usr/lib/ -lxrt_coreutil -lpthread -lrt -lstdc++ -lgmp -lOpenCL --sysroot=$(SYSROOT)

host_opencl.o: host_opencl.cpp
	$(ARMGCC) -c -D__USE_XOPEN2K8 -I$(SYSROOT)/usr/include/xrt -I$(XILINX_VIVADO)/include -I$(SYSROOT)/usr/include -I$(XILINX_HLS)/include -fmessage-length=0 -std=c++17 --sysroot=$(SYSROOT) -o $@ $<

opt350-versal: kernel-versal.cpp host-versal.cpp
	$(GCC) -o $@ -O2 $^ -L/lib/x86_64-linux-gnu -L/usr/local/lib -ltapa -lfrt -lglog -lgflags -lm -lOpenCL -I$(XILINX_HLS)/include

# Remove every artifact this Makefile can produce. $(RM) = rm -f, so clean
# succeeds even when nothing has been built yet. (The old rule deleted
# "opt-versal", which no rule ever builds — the real target is opt350-versal —
# and omitted the host-opencl artifacts.)
clean:
	$(RM) opt350 opt350-versal opt350-ultrascale host-opencl host_opencl.o
gpt-2-medium/README.md ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Place & Route Instructions
2
+
3
+ ### Generate Vitis Platform
4
+
5
+ Follow this [tutorial](https://docs.amd.com/r/2023.2-English/Vitis-Tutorials-Vitis-Platform-Creation/Versal-Platform-Creation-Quick-Start) to generate the Vitis Platform for VPK180. There are a couple of changes:
6
+
7
+ 1. Step 1-3: Select VPK180 as the device. Generate 3 clocks: 100MHz, 200MHz, 300MHz.
8
+ 2. Step 2-2: git-branch should be `xlnx_rel_v2023.2`. `system-user.dtsi` is on [Vitis Tutorial Github Repo](https://github.com/Xilinx/Vitis-Tutorials/blob/2023.2/Vitis_Platform_Creation/Design_Tutorials/03_Edge_VCK190/ref_files/step2_pfm/system-user.dtsi). Change the name to Xilinx custom-vpk180. Board name is `versal-vpk180-reva`.
9
+
10
+ ### Launch V++ Script for P&R
11
+
12
+ After exporting the xo container, replace the platform path, xo path, and constraint path in `generate_bitstream_sample.sh` and launch the script to start P&R.
13
+
14
+ ### Hardware Emulation Using QEMU
15
+
16
+ After exporting the xo container, replace the platform path, xo path, and constraint path in `generate_bitstream_sample.sh`. Change target to `hw_emu` and turn on debug mode `-g`. After generating the xsa file for hardware emulation, run `package_sample.sh` with the same modifications as `generate_bitstream_sample.sh`, with the files you want to include in the SD card image (including the host binary, launch scripts, and configuration file `xrt.ini`). You will find a script `/package/launch_hw_emu.sh` to start QEMU directly.
17
+
18
+ ## Latency References vs. SoTA (ms)
19
+
20
+ |Seq Length | Allo | DFX | NVIDIA T4 | NVIDIA A100 | AMD MI210 |
21
+ | ---- | ---- | ---- | ---- | ---- | ---- |
22
+ | 64 | 205.46 | 349.1 | 47.26 | 39.8 | 7.776 |
23
+ | 128 | 370.56 | 692.8 | 56.4 | 39.51 | 8.541 |
24
+ | 256 | 740.76 | 1412.5 | 81.0 | 39.82 | 10.12 |
25
+ | 512 | 1333.79 | 2825.1 | 162.91 | 49.06 | 15.52 |
26
+ | 1024 | 3777.4 | 6079 | 360.9 | 49.17 | 33.08 |
gpt-2-medium/bitstreams/opt_kernel_latest.xclbin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:090f0f57d4d3450a0a44c8bc3c50c3271fe5af186e2c7d165d62ec70ac48dbe7
3
+ size 76134932
gpt-2-medium/bitstreams/opt_kernel_latest.xclbin.info ADDED
@@ -0,0 +1,497 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ==============================================================================
3
+ XRT Build Version: 2.14.384 (2022.2)
4
+ Build Date: 2022-12-09 00:55:08
5
+ Hash ID: 090bb050d570d2b668477c3bd0f979dc3a34b9db
6
+ ==============================================================================
7
+ xclbin Information
8
+ ------------------
9
+ Generated by: v++ (2021.2) on 2021-10-14-04:41:01
10
+ Version: 2.14.384
11
+ Kernels: opt_kernel
12
+ Signature:
13
+ Content: Bitstream
14
+ UUID (xclbin): 41b0c8a4-f618-a8f7-0b11-d3c822641412
15
+ Sections: DEBUG_IP_LAYOUT, BITSTREAM, MEM_TOPOLOGY, IP_LAYOUT,
16
+ CONNECTIVITY, CLOCK_FREQ_TOPOLOGY, BUILD_METADATA,
17
+ EMBEDDED_METADATA, SYSTEM_METADATA,
18
+ GROUP_CONNECTIVITY, GROUP_TOPOLOGY
19
+ ==============================================================================
20
+ Hardware Platform (Shell) Information
21
+ -------------------------------------
22
+ Vendor: xilinx
23
+ Board: u280
24
+ Name: xdma
25
+ Version: 201920.3
26
+ Generated Version: Vivado 2019.2 (SW Build: 2742762)
27
+ Created:
28
+ Tue Jan 21 23:21:22 2020 FPGA Device: xcu280
29
+ Board Vendor: xilinx.com
30
+ Board Name: xilinx.com:au280:1.0
31
+ Board Part: xilinx.com:au280:part0:1.0
32
+ Platform VBNV: xilinx_u280_xdma_201920_3
33
+ Static UUID: f2b82d53-372f-45a4-bbe9-3d1c980216da
34
+ Feature ROM TimeStamp: 1579649056
35
+
36
+ Scalable Clocks
37
+ ---------------
38
+ Name: clk_out1_pfm_top_clkwiz_hbm_aclk_0
39
+ Index: 0
40
+ Type: SYSTEM
41
+ Frequency: 450 MHz
42
+
43
+ Name: DATA_CLK
44
+ Index: 1
45
+ Type: DATA
46
+ Frequency: 224 MHz
47
+
48
+ Name: KERNEL_CLK
49
+ Index: 2
50
+ Type: KERNEL
51
+ Frequency: 500 MHz
52
+
53
+ System Clocks
54
+ ------
55
+ Name: _bd_top_clkwiz_kernel2_clk_out1
56
+ Type: SCALABLE
57
+ Default Freq: 500 MHz
58
+ Requested Freq: 500 MHz
59
+ Achieved Freq: 500 MHz
60
+
61
+ Name: _bd_top_clkwiz_kernel_clk_out1
62
+ Type: SCALABLE
63
+ Default Freq: 300 MHz
64
+ Requested Freq: 300 MHz
65
+ Achieved Freq: 224.4 MHz
66
+
67
+ Memory Configuration
68
+ --------------------
69
+ Name: HBM[0]
70
+ Index: 0
71
+ Type: MEM_DDR4
72
+ Base Address: 0x0
73
+ Address Size: 0x10000000
74
+ Bank Used: Yes
75
+
76
+ Name: HBM[1]
77
+ Index: 1
78
+ Type: MEM_DDR4
79
+ Base Address: 0x10000000
80
+ Address Size: 0x10000000
81
+ Bank Used: Yes
82
+
83
+ Name: HBM[2]
84
+ Index: 2
85
+ Type: MEM_DRAM
86
+ Base Address: 0x20000000
87
+ Address Size: 0x10000000
88
+ Bank Used: Yes
89
+
90
+ Name: HBM[3]
91
+ Index: 3
92
+ Type: MEM_DRAM
93
+ Base Address: 0x30000000
94
+ Address Size: 0x10000000
95
+ Bank Used: No
96
+
97
+ Name: HBM[4]
98
+ Index: 4
99
+ Type: MEM_DRAM
100
+ Base Address: 0x40000000
101
+ Address Size: 0x10000000
102
+ Bank Used: No
103
+
104
+ Name: HBM[5]
105
+ Index: 5
106
+ Type: MEM_DRAM
107
+ Base Address: 0x50000000
108
+ Address Size: 0x10000000
109
+ Bank Used: No
110
+
111
+ Name: HBM[6]
112
+ Index: 6
113
+ Type: MEM_DRAM
114
+ Base Address: 0x60000000
115
+ Address Size: 0x10000000
116
+ Bank Used: No
117
+
118
+ Name: HBM[7]
119
+ Index: 7
120
+ Type: MEM_DRAM
121
+ Base Address: 0x70000000
122
+ Address Size: 0x10000000
123
+ Bank Used: No
124
+
125
+ Name: HBM[8]
126
+ Index: 8
127
+ Type: MEM_DRAM
128
+ Base Address: 0x80000000
129
+ Address Size: 0x10000000
130
+ Bank Used: No
131
+
132
+ Name: HBM[9]
133
+ Index: 9
134
+ Type: MEM_DRAM
135
+ Base Address: 0x90000000
136
+ Address Size: 0x10000000
137
+ Bank Used: No
138
+
139
+ Name: HBM[10]
140
+ Index: 10
141
+ Type: MEM_DRAM
142
+ Base Address: 0xa0000000
143
+ Address Size: 0x10000000
144
+ Bank Used: No
145
+
146
+ Name: HBM[11]
147
+ Index: 11
148
+ Type: MEM_DRAM
149
+ Base Address: 0xb0000000
150
+ Address Size: 0x10000000
151
+ Bank Used: No
152
+
153
+ Name: HBM[12]
154
+ Index: 12
155
+ Type: MEM_DRAM
156
+ Base Address: 0xc0000000
157
+ Address Size: 0x10000000
158
+ Bank Used: No
159
+
160
+ Name: HBM[13]
161
+ Index: 13
162
+ Type: MEM_DRAM
163
+ Base Address: 0xd0000000
164
+ Address Size: 0x10000000
165
+ Bank Used: No
166
+
167
+ Name: HBM[14]
168
+ Index: 14
169
+ Type: MEM_DRAM
170
+ Base Address: 0xe0000000
171
+ Address Size: 0x10000000
172
+ Bank Used: No
173
+
174
+ Name: HBM[15]
175
+ Index: 15
176
+ Type: MEM_DRAM
177
+ Base Address: 0xf0000000
178
+ Address Size: 0x10000000
179
+ Bank Used: No
180
+
181
+ Name: HBM[16]
182
+ Index: 16
183
+ Type: MEM_DRAM
184
+ Base Address: 0x100000000
185
+ Address Size: 0x10000000
186
+ Bank Used: Yes
187
+
188
+ Name: HBM[17]
189
+ Index: 17
190
+ Type: MEM_DRAM
191
+ Base Address: 0x110000000
192
+ Address Size: 0x10000000
193
+ Bank Used: Yes
194
+
195
+ Name: HBM[18]
196
+ Index: 18
197
+ Type: MEM_DRAM
198
+ Base Address: 0x120000000
199
+ Address Size: 0x10000000
200
+ Bank Used: Yes
201
+
202
+ Name: HBM[19]
203
+ Index: 19
204
+ Type: MEM_DRAM
205
+ Base Address: 0x130000000
206
+ Address Size: 0x10000000
207
+ Bank Used: Yes
208
+
209
+ Name: HBM[20]
210
+ Index: 20
211
+ Type: MEM_DRAM
212
+ Base Address: 0x140000000
213
+ Address Size: 0x10000000
214
+ Bank Used: No
215
+
216
+ Name: HBM[21]
217
+ Index: 21
218
+ Type: MEM_DRAM
219
+ Base Address: 0x150000000
220
+ Address Size: 0x10000000
221
+ Bank Used: No
222
+
223
+ Name: HBM[22]
224
+ Index: 22
225
+ Type: MEM_DRAM
226
+ Base Address: 0x160000000
227
+ Address Size: 0x10000000
228
+ Bank Used: No
229
+
230
+ Name: HBM[23]
231
+ Index: 23
232
+ Type: MEM_DRAM
233
+ Base Address: 0x170000000
234
+ Address Size: 0x10000000
235
+ Bank Used: No
236
+
237
+ Name: HBM[24]
238
+ Index: 24
239
+ Type: MEM_DRAM
240
+ Base Address: 0x180000000
241
+ Address Size: 0x10000000
242
+ Bank Used: No
243
+
244
+ Name: HBM[25]
245
+ Index: 25
246
+ Type: MEM_DRAM
247
+ Base Address: 0x190000000
248
+ Address Size: 0x10000000
249
+ Bank Used: No
250
+
251
+ Name: HBM[26]
252
+ Index: 26
253
+ Type: MEM_DRAM
254
+ Base Address: 0x1a0000000
255
+ Address Size: 0x10000000
256
+ Bank Used: No
257
+
258
+ Name: HBM[27]
259
+ Index: 27
260
+ Type: MEM_DRAM
261
+ Base Address: 0x1b0000000
262
+ Address Size: 0x10000000
263
+ Bank Used: No
264
+
265
+ Name: HBM[28]
266
+ Index: 28
267
+ Type: MEM_DRAM
268
+ Base Address: 0x1c0000000
269
+ Address Size: 0x10000000
270
+ Bank Used: No
271
+
272
+ Name: HBM[29]
273
+ Index: 29
274
+ Type: MEM_DRAM
275
+ Base Address: 0x1d0000000
276
+ Address Size: 0x10000000
277
+ Bank Used: No
278
+
279
+ Name: HBM[30]
280
+ Index: 30
281
+ Type: MEM_DRAM
282
+ Base Address: 0x1e0000000
283
+ Address Size: 0x10000000
284
+ Bank Used: No
285
+
286
+ Name: HBM[31]
287
+ Index: 31
288
+ Type: MEM_DRAM
289
+ Base Address: 0x1f0000000
290
+ Address Size: 0x10000000
291
+ Bank Used: No
292
+
293
+ Name: DDR[0]
294
+ Index: 32
295
+ Type: MEM_DRAM
296
+ Base Address: 0x0
297
+ Address Size: 0x0
298
+ Bank Used: No
299
+
300
+ Name: DDR[1]
301
+ Index: 33
302
+ Type: MEM_DRAM
303
+ Base Address: 0x0
304
+ Address Size: 0x0
305
+ Bank Used: No
306
+
307
+ Name: PLRAM[0]
308
+ Index: 34
309
+ Type: MEM_DRAM
310
+ Base Address: 0x0
311
+ Address Size: 0x0
312
+ Bank Used: No
313
+
314
+ Name: PLRAM[1]
315
+ Index: 35
316
+ Type: MEM_DRAM
317
+ Base Address: 0x0
318
+ Address Size: 0x0
319
+ Bank Used: No
320
+
321
+ Name: PLRAM[2]
322
+ Index: 36
323
+ Type: MEM_DRAM
324
+ Base Address: 0x0
325
+ Address Size: 0x0
326
+ Bank Used: No
327
+
328
+ Name: PLRAM[3]
329
+ Index: 37
330
+ Type: MEM_DRAM
331
+ Base Address: 0x0
332
+ Address Size: 0x0
333
+ Bank Used: No
334
+
335
+ Name: PLRAM[4]
336
+ Index: 38
337
+ Type: MEM_DRAM
338
+ Base Address: 0x0
339
+ Address Size: 0x0
340
+ Bank Used: No
341
+
342
+ Name: PLRAM[5]
343
+ Index: 39
344
+ Type: MEM_DRAM
345
+ Base Address: 0x0
346
+ Address Size: 0x0
347
+ Bank Used: No
348
+ ==============================================================================
349
+ Kernel: opt_kernel
350
+
351
+ Definition
352
+ ----------
353
+ Signature: opt_kernel (const int L, const int L_out, const int seq_len, ap_uint<512>* X_acc0, ap_uint<512>* X_acc1, ap_uint<512>* W_acc0, ap_uint<512>* W_acc1, ap_uint<64>* acc0_out, ap_uint<64>* acc1_out, int* cycle_count)
354
+
355
+ Ports
356
+ -----
357
+ Port: m_axi_X_acc0
358
+ Mode: master
359
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
360
+ Data Width: 512 bits
361
+ Port Type: addressable
362
+
363
+ Port: m_axi_X_acc1
364
+ Mode: master
365
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
366
+ Data Width: 512 bits
367
+ Port Type: addressable
368
+
369
+ Port: m_axi_W_acc0
370
+ Mode: master
371
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
372
+ Data Width: 512 bits
373
+ Port Type: addressable
374
+
375
+ Port: m_axi_W_acc1
376
+ Mode: master
377
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
378
+ Data Width: 512 bits
379
+ Port Type: addressable
380
+
381
+ Port: m_axi_acc0_out
382
+ Mode: master
383
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
384
+ Data Width: 64 bits
385
+ Port Type: addressable
386
+
387
+ Port: m_axi_acc1_out
388
+ Mode: master
389
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
390
+ Data Width: 64 bits
391
+ Port Type: addressable
392
+
393
+ Port: m_axi_cycle_count
394
+ Mode: master
395
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
396
+ Data Width: 32 bits
397
+ Port Type: addressable
398
+
399
+ Port: s_axi_control
400
+ Mode: slave
401
+ Range (bytes): 0x1000
402
+ Data Width: 32 bits
403
+ Port Type: addressable
404
+
405
+ --------------------------
406
+ Instance: opt_kernel
407
+ Base Address: 0x1800000
408
+
409
+ Argument: L
410
+ Register Offset: 0x10
411
+ Port: s_axi_control
412
+ Memory: <not applicable>
413
+
414
+ Argument: L_out
415
+ Register Offset: 0x18
416
+ Port: s_axi_control
417
+ Memory: <not applicable>
418
+
419
+ Argument: seq_len
420
+ Register Offset: 0x20
421
+ Port: s_axi_control
422
+ Memory: <not applicable>
423
+
424
+ Argument: X_acc0
425
+ Register Offset: 0x28
426
+ Port: m_axi_X_acc0
427
+ Memory: HBM[0] (MEM_DDR4)
428
+
429
+ Argument: X_acc1
430
+ Register Offset: 0x34
431
+ Port: m_axi_X_acc1
432
+ Memory: HBM[16] (MEM_DRAM)
433
+
434
+ Argument: W_acc0
435
+ Register Offset: 0x40
436
+ Port: m_axi_W_acc0
437
+ Memory: HBM[1] (MEM_DDR4)
438
+
439
+ Argument: W_acc1
440
+ Register Offset: 0x4c
441
+ Port: m_axi_W_acc1
442
+ Memory: HBM[17] (MEM_DRAM)
443
+
444
+ Argument: acc0_out
445
+ Register Offset: 0x58
446
+ Port: m_axi_acc0_out
447
+ Memory: HBM[2] (MEM_DRAM)
448
+
449
+ Argument: acc1_out
450
+ Register Offset: 0x64
451
+ Port: m_axi_acc1_out
452
+ Memory: HBM[18] (MEM_DRAM)
453
+
454
+ Argument: cycle_count
455
+ Register Offset: 0x70
456
+ Port: m_axi_cycle_count
457
+ Memory: HBM[19] (MEM_DRAM)
458
+ ==============================================================================
459
+ Generated By
460
+ ------------
461
+ Command: v++
462
+ Version: 2021.2 - 2021-10-14-04:41:01 (SW BUILD: 3363252)
463
+ Command Line: v++ --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-1/run/link_config.ini --connectivity.nk opt_kernel:1:opt_kernel --connectivity.sp opt_kernel.X_acc0:HBM[0] --connectivity.sp opt_kernel.X_acc1:HBM[16] --connectivity.sp opt_kernel.W_acc0:HBM[1] --connectivity.sp opt_kernel.W_acc1:HBM[17] --connectivity.sp opt_kernel.acc0_out:HBM[2] --connectivity.sp opt_kernel.acc1_out:HBM[18] --connectivity.sp opt_kernel.cycle_count:HBM[19] --input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-1/opt.hw.xo --kernel opt_kernel --link --optimize 3 --output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin --platform xilinx_u280_xdma_201920_3 --report_level 2 --save-temps --target hw --temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget -propconst -sweep -shift_register_opt} --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=AltSpreadLogic_high --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-1/opt-floorplan.tcl --vivado.synth.jobs 8
464
+ Options: --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-1/run/link_config.ini
465
+ --connectivity.nk opt_kernel:1:opt_kernel
466
+ --connectivity.sp opt_kernel.X_acc0:HBM[0]
467
+ --connectivity.sp opt_kernel.X_acc1:HBM[16]
468
+ --connectivity.sp opt_kernel.W_acc0:HBM[1]
469
+ --connectivity.sp opt_kernel.W_acc1:HBM[17]
470
+ --connectivity.sp opt_kernel.acc0_out:HBM[2]
471
+ --connectivity.sp opt_kernel.acc1_out:HBM[18]
472
+ --connectivity.sp opt_kernel.cycle_count:HBM[19]
473
+ --input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-1/opt.hw.xo
474
+ --kernel opt_kernel
475
+ --link
476
+ --optimize 3
477
+ --output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin
478
+ --platform xilinx_u280_xdma_201920_3
479
+ --report_level 2
480
+ --save-temps
481
+ --target hw
482
+ --temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp
483
+ --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1
484
+ --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget
485
+ -propconst
486
+ -sweep
487
+ -shift_register_opt}
488
+ --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=AltSpreadLogic_high
489
+ --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore
490
+ --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore
491
+ --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-1/opt-floorplan.tcl
492
+ --vivado.synth.jobs 8
493
+ ==============================================================================
494
+ User Added Key Value Pairs
495
+ --------------------------
496
+ <empty>
497
+ ==============================================================================
gpt-2-medium/bitstreams/opt_kernel_stage_4.xclbin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0413a6d5d20f76bc7b5d5376f088edec7ee574db131b857215b5a2fbd99e6075
3
+ size 76961468
gpt-2-medium/bitstreams/opt_kernel_stage_4.xclbin.info ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ==============================================================================
3
+ XRT Build Version: 2.14.384 (2022.2)
4
+ Build Date: 2022-12-09 00:55:08
5
+ Hash ID: 090bb050d570d2b668477c3bd0f979dc3a34b9db
6
+ ==============================================================================
7
+ xclbin Information
8
+ ------------------
9
+ Generated by: v++ (2021.2) on 2021-10-14-04:41:01
10
+ Version: 2.14.384
11
+ Kernels: opt_kernel
12
+ Signature:
13
+ Content: Bitstream
14
+ UUID (xclbin): 4617f7da-9790-9c63-864e-303bcf47c723
15
+ Sections: DEBUG_IP_LAYOUT, BITSTREAM, MEM_TOPOLOGY, IP_LAYOUT,
16
+ CONNECTIVITY, CLOCK_FREQ_TOPOLOGY, BUILD_METADATA,
17
+ EMBEDDED_METADATA, SYSTEM_METADATA,
18
+ GROUP_CONNECTIVITY, GROUP_TOPOLOGY
19
+ ==============================================================================
20
+ Hardware Platform (Shell) Information
21
+ -------------------------------------
22
+ Vendor: xilinx
23
+ Board: u280
24
+ Name: xdma
25
+ Version: 201920.3
26
+ Generated Version: Vivado 2019.2 (SW Build: 2742762)
27
+ Created:
28
+ Tue Jan 21 23:21:22 2020 FPGA Device: xcu280
29
+ Board Vendor: xilinx.com
30
+ Board Name: xilinx.com:au280:1.0
31
+ Board Part: xilinx.com:au280:part0:1.0
32
+ Platform VBNV: xilinx_u280_xdma_201920_3
33
+ Static UUID: f2b82d53-372f-45a4-bbe9-3d1c980216da
34
+ Feature ROM TimeStamp: 1579649056
35
+
36
+ Scalable Clocks
37
+ ---------------
38
+ Name: clk_out1_pfm_top_clkwiz_hbm_aclk_0
39
+ Index: 0
40
+ Type: SYSTEM
41
+ Frequency: 450 MHz
42
+
43
+ Name: DATA_CLK
44
+ Index: 1
45
+ Type: DATA
46
+ Frequency: 241 MHz
47
+
48
+ Name: KERNEL_CLK
49
+ Index: 2
50
+ Type: KERNEL
51
+ Frequency: 500 MHz
52
+
53
+ System Clocks
54
+ ------
55
+ Name: _bd_top_clkwiz_kernel2_clk_out1
56
+ Type: SCALABLE
57
+ Default Freq: 500 MHz
58
+ Requested Freq: 500 MHz
59
+ Achieved Freq: 500 MHz
60
+
61
+ Name: _bd_top_clkwiz_kernel_clk_out1
62
+ Type: SCALABLE
63
+ Default Freq: 300 MHz
64
+ Requested Freq: 300 MHz
65
+ Achieved Freq: 241.4 MHz
66
+
67
+ Memory Configuration
68
+ --------------------
69
+ Name: HBM[0]
70
+ Index: 0
71
+ Type: MEM_DDR4
72
+ Base Address: 0x0
73
+ Address Size: 0x10000000
74
+ Bank Used: Yes
75
+
76
+ Name: HBM[1]
77
+ Index: 1
78
+ Type: MEM_DDR4
79
+ Base Address: 0x10000000
80
+ Address Size: 0x10000000
81
+ Bank Used: Yes
82
+
83
+ Name: HBM[2]
84
+ Index: 2
85
+ Type: MEM_DRAM
86
+ Base Address: 0x20000000
87
+ Address Size: 0x10000000
88
+ Bank Used: Yes
89
+
90
+ Name: HBM[3]
91
+ Index: 3
92
+ Type: MEM_DRAM
93
+ Base Address: 0x30000000
94
+ Address Size: 0x10000000
95
+ Bank Used: Yes
96
+
97
+ Name: HBM[4]
98
+ Index: 4
99
+ Type: MEM_DRAM
100
+ Base Address: 0x40000000
101
+ Address Size: 0x10000000
102
+ Bank Used: No
103
+
104
+ Name: HBM[5]
105
+ Index: 5
106
+ Type: MEM_DRAM
107
+ Base Address: 0x50000000
108
+ Address Size: 0x10000000
109
+ Bank Used: No
110
+
111
+ Name: HBM[6]
112
+ Index: 6
113
+ Type: MEM_DRAM
114
+ Base Address: 0x60000000
115
+ Address Size: 0x10000000
116
+ Bank Used: No
117
+
118
+ Name: HBM[7]
119
+ Index: 7
120
+ Type: MEM_DRAM
121
+ Base Address: 0x70000000
122
+ Address Size: 0x10000000
123
+ Bank Used: No
124
+
125
+ Name: HBM[8]
126
+ Index: 8
127
+ Type: MEM_DRAM
128
+ Base Address: 0x80000000
129
+ Address Size: 0x10000000
130
+ Bank Used: No
131
+
132
+ Name: HBM[9]
133
+ Index: 9
134
+ Type: MEM_DRAM
135
+ Base Address: 0x90000000
136
+ Address Size: 0x10000000
137
+ Bank Used: No
138
+
139
+ Name: HBM[10]
140
+ Index: 10
141
+ Type: MEM_DRAM
142
+ Base Address: 0xa0000000
143
+ Address Size: 0x10000000
144
+ Bank Used: No
145
+
146
+ Name: HBM[11]
147
+ Index: 11
148
+ Type: MEM_DRAM
149
+ Base Address: 0xb0000000
150
+ Address Size: 0x10000000
151
+ Bank Used: No
152
+
153
+ Name: HBM[12]
154
+ Index: 12
155
+ Type: MEM_DRAM
156
+ Base Address: 0xc0000000
157
+ Address Size: 0x10000000
158
+ Bank Used: No
159
+
160
+ Name: HBM[13]
161
+ Index: 13
162
+ Type: MEM_DRAM
163
+ Base Address: 0xd0000000
164
+ Address Size: 0x10000000
165
+ Bank Used: No
166
+
167
+ Name: HBM[14]
168
+ Index: 14
169
+ Type: MEM_DRAM
170
+ Base Address: 0xe0000000
171
+ Address Size: 0x10000000
172
+ Bank Used: No
173
+
174
+ Name: HBM[15]
175
+ Index: 15
176
+ Type: MEM_DRAM
177
+ Base Address: 0xf0000000
178
+ Address Size: 0x10000000
179
+ Bank Used: No
180
+
181
+ Name: HBM[16]
182
+ Index: 16
183
+ Type: MEM_DRAM
184
+ Base Address: 0x100000000
185
+ Address Size: 0x10000000
186
+ Bank Used: Yes
187
+
188
+ Name: HBM[17]
189
+ Index: 17
190
+ Type: MEM_DRAM
191
+ Base Address: 0x110000000
192
+ Address Size: 0x10000000
193
+ Bank Used: Yes
194
+
195
+ Name: HBM[18]
196
+ Index: 18
197
+ Type: MEM_DRAM
198
+ Base Address: 0x120000000
199
+ Address Size: 0x10000000
200
+ Bank Used: No
201
+
202
+ Name: HBM[19]
203
+ Index: 19
204
+ Type: MEM_DRAM
205
+ Base Address: 0x130000000
206
+ Address Size: 0x10000000
207
+ Bank Used: No
208
+
209
+ Name: HBM[20]
210
+ Index: 20
211
+ Type: MEM_DRAM
212
+ Base Address: 0x140000000
213
+ Address Size: 0x10000000
214
+ Bank Used: No
215
+
216
+ Name: HBM[21]
217
+ Index: 21
218
+ Type: MEM_DRAM
219
+ Base Address: 0x150000000
220
+ Address Size: 0x10000000
221
+ Bank Used: No
222
+
223
+ Name: HBM[22]
224
+ Index: 22
225
+ Type: MEM_DRAM
226
+ Base Address: 0x160000000
227
+ Address Size: 0x10000000
228
+ Bank Used: No
229
+
230
+ Name: HBM[23]
231
+ Index: 23
232
+ Type: MEM_DRAM
233
+ Base Address: 0x170000000
234
+ Address Size: 0x10000000
235
+ Bank Used: No
236
+
237
+ Name: HBM[24]
238
+ Index: 24
239
+ Type: MEM_DRAM
240
+ Base Address: 0x180000000
241
+ Address Size: 0x10000000
242
+ Bank Used: No
243
+
244
+ Name: HBM[25]
245
+ Index: 25
246
+ Type: MEM_DRAM
247
+ Base Address: 0x190000000
248
+ Address Size: 0x10000000
249
+ Bank Used: No
250
+
251
+ Name: HBM[26]
252
+ Index: 26
253
+ Type: MEM_DRAM
254
+ Base Address: 0x1a0000000
255
+ Address Size: 0x10000000
256
+ Bank Used: No
257
+
258
+ Name: HBM[27]
259
+ Index: 27
260
+ Type: MEM_DRAM
261
+ Base Address: 0x1b0000000
262
+ Address Size: 0x10000000
263
+ Bank Used: No
264
+
265
+ Name: HBM[28]
266
+ Index: 28
267
+ Type: MEM_DRAM
268
+ Base Address: 0x1c0000000
269
+ Address Size: 0x10000000
270
+ Bank Used: No
271
+
272
+ Name: HBM[29]
273
+ Index: 29
274
+ Type: MEM_DRAM
275
+ Base Address: 0x1d0000000
276
+ Address Size: 0x10000000
277
+ Bank Used: No
278
+
279
+ Name: HBM[30]
280
+ Index: 30
281
+ Type: MEM_DRAM
282
+ Base Address: 0x1e0000000
283
+ Address Size: 0x10000000
284
+ Bank Used: No
285
+
286
+ Name: HBM[31]
287
+ Index: 31
288
+ Type: MEM_DRAM
289
+ Base Address: 0x1f0000000
290
+ Address Size: 0x10000000
291
+ Bank Used: No
292
+
293
+ Name: DDR[0]
294
+ Index: 32
295
+ Type: MEM_DRAM
296
+ Base Address: 0x0
297
+ Address Size: 0x0
298
+ Bank Used: No
299
+
300
+ Name: DDR[1]
301
+ Index: 33
302
+ Type: MEM_DRAM
303
+ Base Address: 0x0
304
+ Address Size: 0x0
305
+ Bank Used: No
306
+
307
+ Name: PLRAM[0]
308
+ Index: 34
309
+ Type: MEM_DRAM
310
+ Base Address: 0x0
311
+ Address Size: 0x0
312
+ Bank Used: No
313
+
314
+ Name: PLRAM[1]
315
+ Index: 35
316
+ Type: MEM_DRAM
317
+ Base Address: 0x0
318
+ Address Size: 0x0
319
+ Bank Used: No
320
+
321
+ Name: PLRAM[2]
322
+ Index: 36
323
+ Type: MEM_DRAM
324
+ Base Address: 0x0
325
+ Address Size: 0x0
326
+ Bank Used: No
327
+
328
+ Name: PLRAM[3]
329
+ Index: 37
330
+ Type: MEM_DRAM
331
+ Base Address: 0x0
332
+ Address Size: 0x0
333
+ Bank Used: No
334
+
335
+ Name: PLRAM[4]
336
+ Index: 38
337
+ Type: MEM_DRAM
338
+ Base Address: 0x0
339
+ Address Size: 0x0
340
+ Bank Used: No
341
+
342
+ Name: PLRAM[5]
343
+ Index: 39
344
+ Type: MEM_DRAM
345
+ Base Address: 0x0
346
+ Address Size: 0x0
347
+ Bank Used: No
348
+ ==============================================================================
349
+ Kernel: opt_kernel
350
+
351
+ Definition
352
+ ----------
353
+ Signature: opt_kernel (const int L, const int L_out, const int seq_len, const int reload, ap_uint<512>* X_acc0, ap_uint<512>* X_acc1, ap_uint<512>* W_acc0, ap_uint<512>* W_acc1, ap_uint<64>* acc1_out, int* cycle_count)
354
+
355
+ Ports
356
+ -----
357
+ Port: m_axi_X_acc0
358
+ Mode: master
359
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
360
+ Data Width: 512 bits
361
+ Port Type: addressable
362
+
363
+ Port: m_axi_X_acc1
364
+ Mode: master
365
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
366
+ Data Width: 512 bits
367
+ Port Type: addressable
368
+
369
+ Port: m_axi_W_acc0
370
+ Mode: master
371
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
372
+ Data Width: 512 bits
373
+ Port Type: addressable
374
+
375
+ Port: m_axi_W_acc1
376
+ Mode: master
377
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
378
+ Data Width: 512 bits
379
+ Port Type: addressable
380
+
381
+ Port: m_axi_acc1_out
382
+ Mode: master
383
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
384
+ Data Width: 64 bits
385
+ Port Type: addressable
386
+
387
+ Port: m_axi_cycle_count
388
+ Mode: master
389
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
390
+ Data Width: 32 bits
391
+ Port Type: addressable
392
+
393
+ Port: s_axi_control
394
+ Mode: slave
395
+ Range (bytes): 0x1000
396
+ Data Width: 32 bits
397
+ Port Type: addressable
398
+
399
+ --------------------------
400
+ Instance: opt_kernel
401
+ Base Address: 0x1800000
402
+
403
+ Argument: L
404
+ Register Offset: 0x10
405
+ Port: s_axi_control
406
+ Memory: <not applicable>
407
+
408
+ Argument: L_out
409
+ Register Offset: 0x18
410
+ Port: s_axi_control
411
+ Memory: <not applicable>
412
+
413
+ Argument: seq_len
414
+ Register Offset: 0x20
415
+ Port: s_axi_control
416
+ Memory: <not applicable>
417
+
418
+ Argument: reload
419
+ Register Offset: 0x28
420
+ Port: s_axi_control
421
+ Memory: <not applicable>
422
+
423
+ Argument: X_acc0
424
+ Register Offset: 0x30
425
+ Port: m_axi_X_acc0
426
+ Memory: HBM[16] (MEM_DRAM)
427
+
428
+ Argument: X_acc1
429
+ Register Offset: 0x3c
430
+ Port: m_axi_X_acc1
431
+ Memory: HBM[0] (MEM_DDR4)
432
+
433
+ Argument: W_acc0
434
+ Register Offset: 0x48
435
+ Port: m_axi_W_acc0
436
+ Memory: HBM[17] (MEM_DRAM)
437
+
438
+ Argument: W_acc1
439
+ Register Offset: 0x54
440
+ Port: m_axi_W_acc1
441
+ Memory: HBM[1] (MEM_DDR4)
442
+
443
+ Argument: acc1_out
444
+ Register Offset: 0x60
445
+ Port: m_axi_acc1_out
446
+ Memory: HBM[2] (MEM_DRAM)
447
+
448
+ Argument: cycle_count
449
+ Register Offset: 0x6c
450
+ Port: m_axi_cycle_count
451
+ Memory: HBM[3] (MEM_DRAM)
452
+ ==============================================================================
453
+ Generated By
454
+ ------------
455
+ Command: v++
456
+ Version: 2021.2 - 2021-10-14-04:41:01 (SW BUILD: 3363252)
457
+ Command Line: v++ --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4.tapa/run-1/run/link_config.ini --connectivity.nk opt_kernel:1:opt_kernel --connectivity.sp opt_kernel.X_acc0:HBM[16] --connectivity.sp opt_kernel.X_acc1:HBM[0] --connectivity.sp opt_kernel.W_acc0:HBM[17] --connectivity.sp opt_kernel.W_acc1:HBM[1] --connectivity.sp opt_kernel.acc1_out:HBM[2] --connectivity.sp opt_kernel.cycle_count:HBM[3] --input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4.tapa/run-1/opt.hw.xo --kernel opt_kernel --link --optimize 3 --output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin --platform xilinx_u280_xdma_201920_3 --report_level 2 --save-temps --target hw --temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget -propconst -sweep -shift_register_opt} --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=EarlyBlockPlacement --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4.tapa/run-1/opt-floorplan.tcl --vivado.synth.jobs 8
458
+ Options: --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4.tapa/run-1/run/link_config.ini
459
+ --connectivity.nk opt_kernel:1:opt_kernel
460
+ --connectivity.sp opt_kernel.X_acc0:HBM[16]
461
+ --connectivity.sp opt_kernel.X_acc1:HBM[0]
462
+ --connectivity.sp opt_kernel.W_acc0:HBM[17]
463
+ --connectivity.sp opt_kernel.W_acc1:HBM[1]
464
+ --connectivity.sp opt_kernel.acc1_out:HBM[2]
465
+ --connectivity.sp opt_kernel.cycle_count:HBM[3]
466
+ --input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4.tapa/run-1/opt.hw.xo
467
+ --kernel opt_kernel
468
+ --link
469
+ --optimize 3
470
+ --output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin
471
+ --platform xilinx_u280_xdma_201920_3
472
+ --report_level 2
473
+ --save-temps
474
+ --target hw
475
+ --temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp
476
+ --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1
477
+ --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget
478
+ -propconst
479
+ -sweep
480
+ -shift_register_opt}
481
+ --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=EarlyBlockPlacement
482
+ --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore
483
+ --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore
484
+ --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4.tapa/run-1/opt-floorplan.tcl
485
+ --vivado.synth.jobs 8
486
+ ==============================================================================
487
+ User Added Key Value Pairs
488
+ --------------------------
489
+ <empty>
490
+ ==============================================================================
gpt-2-medium/bitstreams/opt_kernel_stage_4_27b.xclbin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30d40b37a38089e3181996d9df02dd4371c8423c93304316bf22637e655992c3
3
+ size 76724924
gpt-2-medium/bitstreams/opt_kernel_stage_4_27b.xclbin.info ADDED
@@ -0,0 +1,502 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ==============================================================================
3
+ XRT Build Version: 2.14.384 (2022.2)
4
+ Build Date: 2022-12-09 00:55:08
5
+ Hash ID: 090bb050d570d2b668477c3bd0f979dc3a34b9db
6
+ ==============================================================================
7
+ xclbin Information
8
+ ------------------
9
+ Generated by: v++ (2021.2) on 2021-10-14-04:41:01
10
+ Version: 2.14.384
11
+ Kernels: opt_kernel
12
+ Signature:
13
+ Content: Bitstream
14
+ UUID (xclbin): cbb0489a-3f5c-066e-845c-af93ba50ad0a
15
+ Sections: DEBUG_IP_LAYOUT, BITSTREAM, MEM_TOPOLOGY, IP_LAYOUT,
16
+ CONNECTIVITY, CLOCK_FREQ_TOPOLOGY, BUILD_METADATA,
17
+ EMBEDDED_METADATA, SYSTEM_METADATA,
18
+ GROUP_CONNECTIVITY, GROUP_TOPOLOGY
19
+ ==============================================================================
20
+ Hardware Platform (Shell) Information
21
+ -------------------------------------
22
+ Vendor: xilinx
23
+ Board: u280
24
+ Name: xdma
25
+ Version: 201920.3
26
+ Generated Version: Vivado 2019.2 (SW Build: 2742762)
27
+ Created:
28
+ Tue Jan 21 23:21:22 2020 FPGA Device: xcu280
29
+ Board Vendor: xilinx.com
30
+ Board Name: xilinx.com:au280:1.0
31
+ Board Part: xilinx.com:au280:part0:1.0
32
+ Platform VBNV: xilinx_u280_xdma_201920_3
33
+ Static UUID: f2b82d53-372f-45a4-bbe9-3d1c980216da
34
+ Feature ROM TimeStamp: 1579649056
35
+
36
+ Scalable Clocks
37
+ ---------------
38
+ Name: clk_out1_pfm_top_clkwiz_hbm_aclk_0
39
+ Index: 0
40
+ Type: SYSTEM
41
+ Frequency: 450 MHz
42
+
43
+ Name: DATA_CLK
44
+ Index: 1
45
+ Type: DATA
46
+ Frequency: 202 MHz
47
+
48
+ Name: KERNEL_CLK
49
+ Index: 2
50
+ Type: KERNEL
51
+ Frequency: 500 MHz
52
+
53
+ System Clocks
54
+ ------
55
+ Name: _bd_top_clkwiz_kernel2_clk_out1
56
+ Type: SCALABLE
57
+ Default Freq: 500 MHz
58
+ Requested Freq: 500 MHz
59
+ Achieved Freq: 500 MHz
60
+
61
+ Name: _bd_top_clkwiz_kernel_clk_out1
62
+ Type: SCALABLE
63
+ Default Freq: 300 MHz
64
+ Requested Freq: 300 MHz
65
+ Achieved Freq: 202.5 MHz
66
+
67
+ Memory Configuration
68
+ --------------------
69
+ Name: HBM[0]
70
+ Index: 0
71
+ Type: MEM_DDR4
72
+ Base Address: 0x0
73
+ Address Size: 0x10000000
74
+ Bank Used: Yes
75
+
76
+ Name: HBM[1]
77
+ Index: 1
78
+ Type: MEM_DDR4
79
+ Base Address: 0x10000000
80
+ Address Size: 0x10000000
81
+ Bank Used: Yes
82
+
83
+ Name: HBM[2]
84
+ Index: 2
85
+ Type: MEM_DRAM
86
+ Base Address: 0x20000000
87
+ Address Size: 0x10000000
88
+ Bank Used: Yes
89
+
90
+ Name: HBM[3]
91
+ Index: 3
92
+ Type: MEM_DRAM
93
+ Base Address: 0x30000000
94
+ Address Size: 0x10000000
95
+ Bank Used: Yes
96
+
97
+ Name: HBM[4]
98
+ Index: 4
99
+ Type: MEM_DRAM
100
+ Base Address: 0x40000000
101
+ Address Size: 0x10000000
102
+ Bank Used: No
103
+
104
+ Name: HBM[5]
105
+ Index: 5
106
+ Type: MEM_DRAM
107
+ Base Address: 0x50000000
108
+ Address Size: 0x10000000
109
+ Bank Used: No
110
+
111
+ Name: HBM[6]
112
+ Index: 6
113
+ Type: MEM_DRAM
114
+ Base Address: 0x60000000
115
+ Address Size: 0x10000000
116
+ Bank Used: No
117
+
118
+ Name: HBM[7]
119
+ Index: 7
120
+ Type: MEM_DRAM
121
+ Base Address: 0x70000000
122
+ Address Size: 0x10000000
123
+ Bank Used: No
124
+
125
+ Name: HBM[8]
126
+ Index: 8
127
+ Type: MEM_DRAM
128
+ Base Address: 0x80000000
129
+ Address Size: 0x10000000
130
+ Bank Used: No
131
+
132
+ Name: HBM[9]
133
+ Index: 9
134
+ Type: MEM_DRAM
135
+ Base Address: 0x90000000
136
+ Address Size: 0x10000000
137
+ Bank Used: No
138
+
139
+ Name: HBM[10]
140
+ Index: 10
141
+ Type: MEM_DRAM
142
+ Base Address: 0xa0000000
143
+ Address Size: 0x10000000
144
+ Bank Used: No
145
+
146
+ Name: HBM[11]
147
+ Index: 11
148
+ Type: MEM_DRAM
149
+ Base Address: 0xb0000000
150
+ Address Size: 0x10000000
151
+ Bank Used: No
152
+
153
+ Name: HBM[12]
154
+ Index: 12
155
+ Type: MEM_DRAM
156
+ Base Address: 0xc0000000
157
+ Address Size: 0x10000000
158
+ Bank Used: No
159
+
160
+ Name: HBM[13]
161
+ Index: 13
162
+ Type: MEM_DRAM
163
+ Base Address: 0xd0000000
164
+ Address Size: 0x10000000
165
+ Bank Used: No
166
+
167
+ Name: HBM[14]
168
+ Index: 14
169
+ Type: MEM_DRAM
170
+ Base Address: 0xe0000000
171
+ Address Size: 0x10000000
172
+ Bank Used: No
173
+
174
+ Name: HBM[15]
175
+ Index: 15
176
+ Type: MEM_DRAM
177
+ Base Address: 0xf0000000
178
+ Address Size: 0x10000000
179
+ Bank Used: No
180
+
181
+ Name: HBM[16]
182
+ Index: 16
183
+ Type: MEM_DRAM
184
+ Base Address: 0x100000000
185
+ Address Size: 0x10000000
186
+ Bank Used: Yes
187
+
188
+ Name: HBM[17]
189
+ Index: 17
190
+ Type: MEM_DRAM
191
+ Base Address: 0x110000000
192
+ Address Size: 0x10000000
193
+ Bank Used: Yes
194
+
195
+ Name: HBM[18]
196
+ Index: 18
197
+ Type: MEM_DRAM
198
+ Base Address: 0x120000000
199
+ Address Size: 0x10000000
200
+ Bank Used: Yes
201
+
202
+ Name: HBM[19]
203
+ Index: 19
204
+ Type: MEM_DRAM
205
+ Base Address: 0x130000000
206
+ Address Size: 0x10000000
207
+ Bank Used: No
208
+
209
+ Name: HBM[20]
210
+ Index: 20
211
+ Type: MEM_DRAM
212
+ Base Address: 0x140000000
213
+ Address Size: 0x10000000
214
+ Bank Used: No
215
+
216
+ Name: HBM[21]
217
+ Index: 21
218
+ Type: MEM_DRAM
219
+ Base Address: 0x150000000
220
+ Address Size: 0x10000000
221
+ Bank Used: No
222
+
223
+ Name: HBM[22]
224
+ Index: 22
225
+ Type: MEM_DRAM
226
+ Base Address: 0x160000000
227
+ Address Size: 0x10000000
228
+ Bank Used: No
229
+
230
+ Name: HBM[23]
231
+ Index: 23
232
+ Type: MEM_DRAM
233
+ Base Address: 0x170000000
234
+ Address Size: 0x10000000
235
+ Bank Used: No
236
+
237
+ Name: HBM[24]
238
+ Index: 24
239
+ Type: MEM_DRAM
240
+ Base Address: 0x180000000
241
+ Address Size: 0x10000000
242
+ Bank Used: No
243
+
244
+ Name: HBM[25]
245
+ Index: 25
246
+ Type: MEM_DRAM
247
+ Base Address: 0x190000000
248
+ Address Size: 0x10000000
249
+ Bank Used: No
250
+
251
+ Name: HBM[26]
252
+ Index: 26
253
+ Type: MEM_DRAM
254
+ Base Address: 0x1a0000000
255
+ Address Size: 0x10000000
256
+ Bank Used: No
257
+
258
+ Name: HBM[27]
259
+ Index: 27
260
+ Type: MEM_DRAM
261
+ Base Address: 0x1b0000000
262
+ Address Size: 0x10000000
263
+ Bank Used: No
264
+
265
+ Name: HBM[28]
266
+ Index: 28
267
+ Type: MEM_DRAM
268
+ Base Address: 0x1c0000000
269
+ Address Size: 0x10000000
270
+ Bank Used: No
271
+
272
+ Name: HBM[29]
273
+ Index: 29
274
+ Type: MEM_DRAM
275
+ Base Address: 0x1d0000000
276
+ Address Size: 0x10000000
277
+ Bank Used: No
278
+
279
+ Name: HBM[30]
280
+ Index: 30
281
+ Type: MEM_DRAM
282
+ Base Address: 0x1e0000000
283
+ Address Size: 0x10000000
284
+ Bank Used: No
285
+
286
+ Name: HBM[31]
287
+ Index: 31
288
+ Type: MEM_DRAM
289
+ Base Address: 0x1f0000000
290
+ Address Size: 0x10000000
291
+ Bank Used: No
292
+
293
+ Name: DDR[0]
294
+ Index: 32
295
+ Type: MEM_DRAM
296
+ Base Address: 0x0
297
+ Address Size: 0x0
298
+ Bank Used: No
299
+
300
+ Name: DDR[1]
301
+ Index: 33
302
+ Type: MEM_DRAM
303
+ Base Address: 0x0
304
+ Address Size: 0x0
305
+ Bank Used: No
306
+
307
+ Name: PLRAM[0]
308
+ Index: 34
309
+ Type: MEM_DRAM
310
+ Base Address: 0x0
311
+ Address Size: 0x0
312
+ Bank Used: No
313
+
314
+ Name: PLRAM[1]
315
+ Index: 35
316
+ Type: MEM_DRAM
317
+ Base Address: 0x0
318
+ Address Size: 0x0
319
+ Bank Used: No
320
+
321
+ Name: PLRAM[2]
322
+ Index: 36
323
+ Type: MEM_DRAM
324
+ Base Address: 0x0
325
+ Address Size: 0x0
326
+ Bank Used: No
327
+
328
+ Name: PLRAM[3]
329
+ Index: 37
330
+ Type: MEM_DRAM
331
+ Base Address: 0x0
332
+ Address Size: 0x0
333
+ Bank Used: No
334
+
335
+ Name: PLRAM[4]
336
+ Index: 38
337
+ Type: MEM_DRAM
338
+ Base Address: 0x0
339
+ Address Size: 0x0
340
+ Bank Used: No
341
+
342
+ Name: PLRAM[5]
343
+ Index: 39
344
+ Type: MEM_DRAM
345
+ Base Address: 0x0
346
+ Address Size: 0x0
347
+ Bank Used: No
348
+ ==============================================================================
349
+ Kernel: opt_kernel
350
+
351
+ Definition
352
+ ----------
353
+ Signature: opt_kernel (const int L, const int L_out, const int seq_len, const int reload, ap_uint<512>* X_acc0, ap_uint<512>* X_acc1, ap_uint<512>* W_acc0, ap_uint<512>* W_acc1, ap_uint<64>* acc0_out, ap_uint<64>* acc1_out, int* cycle_count)
354
+
355
+ Ports
356
+ -----
357
+ Port: m_axi_X_acc0
358
+ Mode: master
359
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
360
+ Data Width: 512 bits
361
+ Port Type: addressable
362
+
363
+ Port: m_axi_X_acc1
364
+ Mode: master
365
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
366
+ Data Width: 512 bits
367
+ Port Type: addressable
368
+
369
+ Port: m_axi_W_acc0
370
+ Mode: master
371
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
372
+ Data Width: 512 bits
373
+ Port Type: addressable
374
+
375
+ Port: m_axi_W_acc1
376
+ Mode: master
377
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
378
+ Data Width: 512 bits
379
+ Port Type: addressable
380
+
381
+ Port: m_axi_acc0_out
382
+ Mode: master
383
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
384
+ Data Width: 64 bits
385
+ Port Type: addressable
386
+
387
+ Port: m_axi_acc1_out
388
+ Mode: master
389
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
390
+ Data Width: 64 bits
391
+ Port Type: addressable
392
+
393
+ Port: m_axi_cycle_count
394
+ Mode: master
395
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
396
+ Data Width: 32 bits
397
+ Port Type: addressable
398
+
399
+ Port: s_axi_control
400
+ Mode: slave
401
+ Range (bytes): 0x1000
402
+ Data Width: 32 bits
403
+ Port Type: addressable
404
+
405
+ --------------------------
406
+ Instance: opt_kernel
407
+ Base Address: 0x1800000
408
+
409
+ Argument: L
410
+ Register Offset: 0x10
411
+ Port: s_axi_control
412
+ Memory: <not applicable>
413
+
414
+ Argument: L_out
415
+ Register Offset: 0x18
416
+ Port: s_axi_control
417
+ Memory: <not applicable>
418
+
419
+ Argument: seq_len
420
+ Register Offset: 0x20
421
+ Port: s_axi_control
422
+ Memory: <not applicable>
423
+
424
+ Argument: reload
425
+ Register Offset: 0x28
426
+ Port: s_axi_control
427
+ Memory: <not applicable>
428
+
429
+ Argument: X_acc0
430
+ Register Offset: 0x30
431
+ Port: m_axi_X_acc0
432
+ Memory: HBM[0] (MEM_DDR4)
433
+
434
+ Argument: X_acc1
435
+ Register Offset: 0x3c
436
+ Port: m_axi_X_acc1
437
+ Memory: HBM[16] (MEM_DRAM)
438
+
439
+ Argument: W_acc0
440
+ Register Offset: 0x48
441
+ Port: m_axi_W_acc0
442
+ Memory: HBM[1] (MEM_DDR4)
443
+
444
+ Argument: W_acc1
445
+ Register Offset: 0x54
446
+ Port: m_axi_W_acc1
447
+ Memory: HBM[17] (MEM_DRAM)
448
+
449
+ Argument: acc0_out
450
+ Register Offset: 0x60
451
+ Port: m_axi_acc0_out
452
+ Memory: HBM[2] (MEM_DRAM)
453
+
454
+ Argument: acc1_out
455
+ Register Offset: 0x6c
456
+ Port: m_axi_acc1_out
457
+ Memory: HBM[18] (MEM_DRAM)
458
+
459
+ Argument: cycle_count
460
+ Register Offset: 0x78
461
+ Port: m_axi_cycle_count
462
+ Memory: HBM[3] (MEM_DRAM)
463
+ ==============================================================================
464
+ Generated By
465
+ ------------
466
+ Command: v++
467
+ Version: 2021.2 - 2021-10-14-04:41:01 (SW BUILD: 3363252)
468
+ Command Line: v++ --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/run/link_config.ini --connectivity.nk opt_kernel:1:opt_kernel --connectivity.sp opt_kernel.X_acc0:HBM[0] --connectivity.sp opt_kernel.X_acc1:HBM[16] --connectivity.sp opt_kernel.W_acc0:HBM[1] --connectivity.sp opt_kernel.W_acc1:HBM[17] --connectivity.sp opt_kernel.acc0_out:HBM[2] --connectivity.sp opt_kernel.acc1_out:HBM[18] --connectivity.sp opt_kernel.cycle_count:HBM[3] --input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt.hw.xo --kernel opt_kernel --link --optimize 3 --output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin --platform xilinx_u280_xdma_201920_3 --report_level 2 --save-temps --target hw --temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget -propconst -sweep -shift_register_opt} --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=EarlyBlockPlacement --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt-floorplan.tcl --vivado.synth.jobs 8
469
+ Options: --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/run/link_config.ini
470
+ --connectivity.nk opt_kernel:1:opt_kernel
471
+ --connectivity.sp opt_kernel.X_acc0:HBM[0]
472
+ --connectivity.sp opt_kernel.X_acc1:HBM[16]
473
+ --connectivity.sp opt_kernel.W_acc0:HBM[1]
474
+ --connectivity.sp opt_kernel.W_acc1:HBM[17]
475
+ --connectivity.sp opt_kernel.acc0_out:HBM[2]
476
+ --connectivity.sp opt_kernel.acc1_out:HBM[18]
477
+ --connectivity.sp opt_kernel.cycle_count:HBM[3]
478
+ --input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt.hw.xo
479
+ --kernel opt_kernel
480
+ --link
481
+ --optimize 3
482
+ --output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin
483
+ --platform xilinx_u280_xdma_201920_3
484
+ --report_level 2
485
+ --save-temps
486
+ --target hw
487
+ --temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp
488
+ --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1
489
+ --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget
490
+ -propconst
491
+ -sweep
492
+ -shift_register_opt}
493
+ --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=EarlyBlockPlacement
494
+ --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore
495
+ --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore
496
+ --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt-floorplan.tcl
497
+ --vivado.synth.jobs 8
498
+ ==============================================================================
499
+ User Added Key Value Pairs
500
+ --------------------------
501
+ <empty>
502
+ ==============================================================================
gpt-2-medium/bitstreams/opt_kernel_vpk180.xsa ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:722a71423e17da2f05587a9fd3c1e9d695f5cee02962744fcbde569aca21242f
3
+ size 70565471
gpt-2-medium/bitstreams/opt_kernel_vpk180_fixed.xsa ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d986bb71631b79c5c2b5c6576e1f78051a671d6d4536e2095c8a39127c456461
3
+ size 86497092
gpt-2-medium/bitstreams/opt_kernel_vpk180_full.xsa ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7c1339d6b78b36c4a35cb09709dfebb321bdf0decf037802e5d617356ad42b6
3
+ size 84081530
gpt-2-medium/bitstreams/opt_kernel_vpk180_mask.xsa ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d258e7884b1e3c2f42bc8fd7a3878ab976b8cd2cc5042bdaaea949b27f506688
3
+ size 82554104
gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_full.info ADDED
@@ -0,0 +1,485 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ==============================================================================
3
+ XRT Build Version: 2.14.384 (2022.2)
4
+ Build Date: 2022-12-09 00:55:08
5
+ Hash ID: 090bb050d570d2b668477c3bd0f979dc3a34b9db
6
+ ==============================================================================
7
+ xclbin Information
8
+ ------------------
9
+ Generated by: v++ (2021.2) on 2021-10-14-04:41:01
10
+ Version: 2.14.384
11
+ Kernels: opt_kernel
12
+ Signature:
13
+ Content: Bitstream
14
+ UUID (xclbin): 06dfa191-ba53-780e-16db-fd0655f01fc3
15
+ Sections: DEBUG_IP_LAYOUT, BITSTREAM, MEM_TOPOLOGY, IP_LAYOUT,
16
+ CONNECTIVITY, CLOCK_FREQ_TOPOLOGY, BUILD_METADATA,
17
+ EMBEDDED_METADATA, SYSTEM_METADATA,
18
+ GROUP_CONNECTIVITY, GROUP_TOPOLOGY
19
+ ==============================================================================
20
+ Hardware Platform (Shell) Information
21
+ -------------------------------------
22
+ Vendor: xilinx
23
+ Board: u280
24
+ Name: xdma
25
+ Version: 201920.3
26
+ Generated Version: Vivado 2019.2 (SW Build: 2742762)
27
+ Created:
28
+ Tue Jan 21 23:21:22 2020 FPGA Device: xcu280
29
+ Board Vendor: xilinx.com
30
+ Board Name: xilinx.com:au280:1.0
31
+ Board Part: xilinx.com:au280:part0:1.0
32
+ Platform VBNV: xilinx_u280_xdma_201920_3
33
+ Static UUID: f2b82d53-372f-45a4-bbe9-3d1c980216da
34
+ Feature ROM TimeStamp: 1579649056
35
+
36
+ Scalable Clocks
37
+ ---------------
38
+ Name: clk_out1_pfm_top_clkwiz_hbm_aclk_0
39
+ Index: 0
40
+ Type: SYSTEM
41
+ Frequency: 450 MHz
42
+
43
+ Name: DATA_CLK
44
+ Index: 1
45
+ Type: DATA
46
+ Frequency: 257 MHz
47
+
48
+ Name: KERNEL_CLK
49
+ Index: 2
50
+ Type: KERNEL
51
+ Frequency: 500 MHz
52
+
53
+ System Clocks
54
+ ------
55
+ Name: _bd_top_clkwiz_kernel2_clk_out1
56
+ Type: SCALABLE
57
+ Default Freq: 500 MHz
58
+ Requested Freq: 500 MHz
59
+ Achieved Freq: 500 MHz
60
+
61
+ Name: _bd_top_clkwiz_kernel_clk_out1
62
+ Type: SCALABLE
63
+ Default Freq: 300 MHz
64
+ Requested Freq: 300 MHz
65
+ Achieved Freq: 257.2 MHz
66
+
67
+ Memory Configuration
68
+ --------------------
69
+ Name: HBM[0]
70
+ Index: 0
71
+ Type: MEM_DDR4
72
+ Base Address: 0x0
73
+ Address Size: 0x10000000
74
+ Bank Used: No
75
+
76
+ Name: HBM[1]
77
+ Index: 1
78
+ Type: MEM_DDR4
79
+ Base Address: 0x10000000
80
+ Address Size: 0x10000000
81
+ Bank Used: Yes
82
+
83
+ Name: HBM[2]
84
+ Index: 2
85
+ Type: MEM_DRAM
86
+ Base Address: 0x20000000
87
+ Address Size: 0x10000000
88
+ Bank Used: Yes
89
+
90
+ Name: HBM[3]
91
+ Index: 3
92
+ Type: MEM_DRAM
93
+ Base Address: 0x30000000
94
+ Address Size: 0x10000000
95
+ Bank Used: Yes
96
+
97
+ Name: HBM[4]
98
+ Index: 4
99
+ Type: MEM_DRAM
100
+ Base Address: 0x40000000
101
+ Address Size: 0x10000000
102
+ Bank Used: Yes
103
+
104
+ Name: HBM[5]
105
+ Index: 5
106
+ Type: MEM_DRAM
107
+ Base Address: 0x50000000
108
+ Address Size: 0x10000000
109
+ Bank Used: No
110
+
111
+ Name: HBM[6]
112
+ Index: 6
113
+ Type: MEM_DRAM
114
+ Base Address: 0x60000000
115
+ Address Size: 0x10000000
116
+ Bank Used: No
117
+
118
+ Name: HBM[7]
119
+ Index: 7
120
+ Type: MEM_DRAM
121
+ Base Address: 0x70000000
122
+ Address Size: 0x10000000
123
+ Bank Used: Yes
124
+
125
+ Name: HBM[8]
126
+ Index: 8
127
+ Type: MEM_DRAM
128
+ Base Address: 0x80000000
129
+ Address Size: 0x10000000
130
+ Bank Used: No
131
+
132
+ Name: HBM[9]
133
+ Index: 9
134
+ Type: MEM_DRAM
135
+ Base Address: 0x90000000
136
+ Address Size: 0x10000000
137
+ Bank Used: Yes
138
+
139
+ Name: HBM[10]
140
+ Index: 10
141
+ Type: MEM_DRAM
142
+ Base Address: 0xa0000000
143
+ Address Size: 0x10000000
144
+ Bank Used: No
145
+
146
+ Name: HBM[11]
147
+ Index: 11
148
+ Type: MEM_DRAM
149
+ Base Address: 0xb0000000
150
+ Address Size: 0x10000000
151
+ Bank Used: No
152
+
153
+ Name: HBM[12]
154
+ Index: 12
155
+ Type: MEM_DRAM
156
+ Base Address: 0xc0000000
157
+ Address Size: 0x10000000
158
+ Bank Used: No
159
+
160
+ Name: HBM[13]
161
+ Index: 13
162
+ Type: MEM_DRAM
163
+ Base Address: 0xd0000000
164
+ Address Size: 0x10000000
165
+ Bank Used: No
166
+
167
+ Name: HBM[14]
168
+ Index: 14
169
+ Type: MEM_DRAM
170
+ Base Address: 0xe0000000
171
+ Address Size: 0x10000000
172
+ Bank Used: No
173
+
174
+ Name: HBM[15]
175
+ Index: 15
176
+ Type: MEM_DRAM
177
+ Base Address: 0xf0000000
178
+ Address Size: 0x10000000
179
+ Bank Used: No
180
+
181
+ Name: HBM[16]
182
+ Index: 16
183
+ Type: MEM_DRAM
184
+ Base Address: 0x100000000
185
+ Address Size: 0x10000000
186
+ Bank Used: No
187
+
188
+ Name: HBM[17]
189
+ Index: 17
190
+ Type: MEM_DRAM
191
+ Base Address: 0x110000000
192
+ Address Size: 0x10000000
193
+ Bank Used: No
194
+
195
+ Name: HBM[18]
196
+ Index: 18
197
+ Type: MEM_DRAM
198
+ Base Address: 0x120000000
199
+ Address Size: 0x10000000
200
+ Bank Used: No
201
+
202
+ Name: HBM[19]
203
+ Index: 19
204
+ Type: MEM_DRAM
205
+ Base Address: 0x130000000
206
+ Address Size: 0x10000000
207
+ Bank Used: No
208
+
209
+ Name: HBM[20]
210
+ Index: 20
211
+ Type: MEM_DRAM
212
+ Base Address: 0x140000000
213
+ Address Size: 0x10000000
214
+ Bank Used: No
215
+
216
+ Name: HBM[21]
217
+ Index: 21
218
+ Type: MEM_DRAM
219
+ Base Address: 0x150000000
220
+ Address Size: 0x10000000
221
+ Bank Used: No
222
+
223
+ Name: HBM[22]
224
+ Index: 22
225
+ Type: MEM_DRAM
226
+ Base Address: 0x160000000
227
+ Address Size: 0x10000000
228
+ Bank Used: No
229
+
230
+ Name: HBM[23]
231
+ Index: 23
232
+ Type: MEM_DRAM
233
+ Base Address: 0x170000000
234
+ Address Size: 0x10000000
235
+ Bank Used: No
236
+
237
+ Name: HBM[24]
238
+ Index: 24
239
+ Type: MEM_DRAM
240
+ Base Address: 0x180000000
241
+ Address Size: 0x10000000
242
+ Bank Used: No
243
+
244
+ Name: HBM[25]
245
+ Index: 25
246
+ Type: MEM_DRAM
247
+ Base Address: 0x190000000
248
+ Address Size: 0x10000000
249
+ Bank Used: No
250
+
251
+ Name: HBM[26]
252
+ Index: 26
253
+ Type: MEM_DRAM
254
+ Base Address: 0x1a0000000
255
+ Address Size: 0x10000000
256
+ Bank Used: No
257
+
258
+ Name: HBM[27]
259
+ Index: 27
260
+ Type: MEM_DRAM
261
+ Base Address: 0x1b0000000
262
+ Address Size: 0x10000000
263
+ Bank Used: No
264
+
265
+ Name: HBM[28]
266
+ Index: 28
267
+ Type: MEM_DRAM
268
+ Base Address: 0x1c0000000
269
+ Address Size: 0x10000000
270
+ Bank Used: No
271
+
272
+ Name: HBM[29]
273
+ Index: 29
274
+ Type: MEM_DRAM
275
+ Base Address: 0x1d0000000
276
+ Address Size: 0x10000000
277
+ Bank Used: No
278
+
279
+ Name: HBM[30]
280
+ Index: 30
281
+ Type: MEM_DRAM
282
+ Base Address: 0x1e0000000
283
+ Address Size: 0x10000000
284
+ Bank Used: No
285
+
286
+ Name: HBM[31]
287
+ Index: 31
288
+ Type: MEM_DRAM
289
+ Base Address: 0x1f0000000
290
+ Address Size: 0x10000000
291
+ Bank Used: No
292
+
293
+ Name: DDR[0]
294
+ Index: 32
295
+ Type: MEM_DRAM
296
+ Base Address: 0x0
297
+ Address Size: 0x0
298
+ Bank Used: No
299
+
300
+ Name: DDR[1]
301
+ Index: 33
302
+ Type: MEM_DRAM
303
+ Base Address: 0x0
304
+ Address Size: 0x0
305
+ Bank Used: No
306
+
307
+ Name: PLRAM[0]
308
+ Index: 34
309
+ Type: MEM_DRAM
310
+ Base Address: 0x0
311
+ Address Size: 0x0
312
+ Bank Used: No
313
+
314
+ Name: PLRAM[1]
315
+ Index: 35
316
+ Type: MEM_DRAM
317
+ Base Address: 0x0
318
+ Address Size: 0x0
319
+ Bank Used: No
320
+
321
+ Name: PLRAM[2]
322
+ Index: 36
323
+ Type: MEM_DRAM
324
+ Base Address: 0x0
325
+ Address Size: 0x0
326
+ Bank Used: No
327
+
328
+ Name: PLRAM[3]
329
+ Index: 37
330
+ Type: MEM_DRAM
331
+ Base Address: 0x0
332
+ Address Size: 0x0
333
+ Bank Used: No
334
+
335
+ Name: PLRAM[4]
336
+ Index: 38
337
+ Type: MEM_DRAM
338
+ Base Address: 0x0
339
+ Address Size: 0x0
340
+ Bank Used: No
341
+
342
+ Name: PLRAM[5]
343
+ Index: 39
344
+ Type: MEM_DRAM
345
+ Base Address: 0x0
346
+ Address Size: 0x0
347
+ Bank Used: No
348
+ ==============================================================================
349
+ Kernel: opt_kernel
350
+
351
+ Definition
352
+ ----------
353
+ Signature: opt_kernel (const int L, const int L_out, const int seq_len, ap_uint<512>* X_acc0, ap_uint<512>* X_acc1, ap_uint<512>* W_acc0, ap_uint<512>* W_acc1, ap_uint<128>* acc0_out, int* cycle_count)
354
+
355
+ Ports
356
+ -----
357
+ Port: m_axi_X_acc0
358
+ Mode: master
359
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
360
+ Data Width: 512 bits
361
+ Port Type: addressable
362
+
363
+ Port: m_axi_X_acc1
364
+ Mode: master
365
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
366
+ Data Width: 512 bits
367
+ Port Type: addressable
368
+
369
+ Port: m_axi_W_acc0
370
+ Mode: master
371
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
372
+ Data Width: 512 bits
373
+ Port Type: addressable
374
+
375
+ Port: m_axi_W_acc1
376
+ Mode: master
377
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
378
+ Data Width: 512 bits
379
+ Port Type: addressable
380
+
381
+ Port: m_axi_acc0_out
382
+ Mode: master
383
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
384
+ Data Width: 128 bits
385
+ Port Type: addressable
386
+
387
+ Port: m_axi_cycle_count
388
+ Mode: master
389
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
390
+ Data Width: 32 bits
391
+ Port Type: addressable
392
+
393
+ Port: s_axi_control
394
+ Mode: slave
395
+ Range (bytes): 0x1000
396
+ Data Width: 32 bits
397
+ Port Type: addressable
398
+
399
+ --------------------------
400
+ Instance: opt_kernel
401
+ Base Address: 0x1800000
402
+
403
+ Argument: L
404
+ Register Offset: 0x10
405
+ Port: s_axi_control
406
+ Memory: <not applicable>
407
+
408
+ Argument: L_out
409
+ Register Offset: 0x18
410
+ Port: s_axi_control
411
+ Memory: <not applicable>
412
+
413
+ Argument: seq_len
414
+ Register Offset: 0x20
415
+ Port: s_axi_control
416
+ Memory: <not applicable>
417
+
418
+ Argument: X_acc0
419
+ Register Offset: 0x28
420
+ Port: m_axi_X_acc0
421
+ Memory: HBM[1] (MEM_DDR4)
422
+
423
+ Argument: X_acc1
424
+ Register Offset: 0x34
425
+ Port: m_axi_X_acc1
426
+ Memory: HBM[2] (MEM_DRAM)
427
+
428
+ Argument: W_acc0
429
+ Register Offset: 0x40
430
+ Port: m_axi_W_acc0
431
+ Memory: HBM[3] (MEM_DRAM)
432
+
433
+ Argument: W_acc1
434
+ Register Offset: 0x4c
435
+ Port: m_axi_W_acc1
436
+ Memory: HBM[4] (MEM_DRAM)
437
+
438
+ Argument: acc0_out
439
+ Register Offset: 0x58
440
+ Port: m_axi_acc0_out
441
+ Memory: HBM[7] (MEM_DRAM)
442
+
443
+ Argument: cycle_count
444
+ Register Offset: 0x64
445
+ Port: m_axi_cycle_count
446
+ Memory: HBM[9] (MEM_DRAM)
447
+ ==============================================================================
448
+ Generated By
449
+ ------------
450
+ Command: v++
451
+ Version: 2021.2 - 2021-10-14-04:41:01 (SW BUILD: 3363252)
452
+ Command Line: v++ --config /scratch/oswaldhe/hbm_config.ini --connectivity.nk opt_kernel:1:opt_kernel --connectivity.sp opt_kernel.X_acc0:HBM[1] --connectivity.sp opt_kernel.X_acc1:HBM[2] --connectivity.sp opt_kernel.W_acc0:HBM[3] --connectivity.sp opt_kernel.W_acc1:HBM[4] --connectivity.sp opt_kernel.acc0_out:HBM[7] --connectivity.sp opt_kernel.cycle_count:HBM[9] --input_files /scratch/oswaldhe/work.out/run-1/design-point.xo --kernel opt_kernel --link --optimize 3 --output /scratch/oswaldhe/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin --platform xilinx_u280_xdma_201920_3 --report_level 2 --save-temps --target hw --temp_dir /scratch/oswaldhe/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget -propconst -sweep -shift_register_opt} --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=SSI_SpreadSLLs --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Default --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Default --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/scratch/oswaldhe/work.out/run-1/constraints.tcl --vivado.synth.jobs 8
453
+ Options: --config /scratch/oswaldhe/hbm_config.ini
454
+ --connectivity.nk opt_kernel:1:opt_kernel
455
+ --connectivity.sp opt_kernel.X_acc0:HBM[1]
456
+ --connectivity.sp opt_kernel.X_acc1:HBM[2]
457
+ --connectivity.sp opt_kernel.W_acc0:HBM[3]
458
+ --connectivity.sp opt_kernel.W_acc1:HBM[4]
459
+ --connectivity.sp opt_kernel.acc0_out:HBM[7]
460
+ --connectivity.sp opt_kernel.cycle_count:HBM[9]
461
+ --input_files /scratch/oswaldhe/work.out/run-1/design-point.xo
462
+ --kernel opt_kernel
463
+ --link
464
+ --optimize 3
465
+ --output /scratch/oswaldhe/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin
466
+ --platform xilinx_u280_xdma_201920_3
467
+ --report_level 2
468
+ --save-temps
469
+ --target hw
470
+ --temp_dir /scratch/oswaldhe/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp
471
+ --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1
472
+ --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget
473
+ -propconst
474
+ -sweep
475
+ -shift_register_opt}
476
+ --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=SSI_SpreadSLLs
477
+ --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Default
478
+ --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Default
479
+ --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/scratch/oswaldhe/work.out/run-1/constraints.tcl
480
+ --vivado.synth.jobs 8
481
+ ==============================================================================
482
+ User Added Key Value Pairs
483
+ --------------------------
484
+ <empty>
485
+ ==============================================================================
gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_full.xclbin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc34da0da50c9058d7705e2529b37d1b88d2da38c315fa4d8ca878255a43b282
3
+ size 68746361
gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_xdma_201920_3.xclbin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c64f06b252dc6400e5a6a4910f803b6c120b828876009ed128b25db1719c05d
3
+ size 76311460
gpt-2-medium/bitstreams/opt_kernel_xilinx_u280_xdma_201920_3.xclbin.info ADDED
@@ -0,0 +1,502 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ==============================================================================
3
+ XRT Build Version: 2.14.384 (2022.2)
4
+ Build Date: 2022-12-09 00:55:08
5
+ Hash ID: 090bb050d570d2b668477c3bd0f979dc3a34b9db
6
+ ==============================================================================
7
+ xclbin Information
8
+ ------------------
9
+ Generated by: v++ (2021.2) on 2021-10-14-04:41:01
10
+ Version: 2.14.384
11
+ Kernels: opt_kernel
12
+ Signature:
13
+ Content: Bitstream
14
+ UUID (xclbin): ce5651b8-ff94-7baf-4833-5b6446d1a345
15
+ Sections: DEBUG_IP_LAYOUT, BITSTREAM, MEM_TOPOLOGY, IP_LAYOUT,
16
+ CONNECTIVITY, CLOCK_FREQ_TOPOLOGY, BUILD_METADATA,
17
+ EMBEDDED_METADATA, SYSTEM_METADATA,
18
+ GROUP_CONNECTIVITY, GROUP_TOPOLOGY
19
+ ==============================================================================
20
+ Hardware Platform (Shell) Information
21
+ -------------------------------------
22
+ Vendor: xilinx
23
+ Board: u280
24
+ Name: xdma
25
+ Version: 201920.3
26
+ Generated Version: Vivado 2019.2 (SW Build: 2742762)
27
+ Created:
28
+ Tue Jan 21 23:21:22 2020 FPGA Device: xcu280
29
+ Board Vendor: xilinx.com
30
+ Board Name: xilinx.com:au280:1.0
31
+ Board Part: xilinx.com:au280:part0:1.0
32
+ Platform VBNV: xilinx_u280_xdma_201920_3
33
+ Static UUID: f2b82d53-372f-45a4-bbe9-3d1c980216da
34
+ Feature ROM TimeStamp: 1579649056
35
+
36
+ Scalable Clocks
37
+ ---------------
38
+ Name: clk_out1_pfm_top_clkwiz_hbm_aclk_0
39
+ Index: 0
40
+ Type: SYSTEM
41
+ Frequency: 450 MHz
42
+
43
+ Name: DATA_CLK
44
+ Index: 1
45
+ Type: DATA
46
+ Frequency: 220 MHz
47
+
48
+ Name: KERNEL_CLK
49
+ Index: 2
50
+ Type: KERNEL
51
+ Frequency: 500 MHz
52
+
53
+ System Clocks
54
+ ------
55
+ Name: _bd_top_clkwiz_kernel2_clk_out1
56
+ Type: SCALABLE
57
+ Default Freq: 500 MHz
58
+ Requested Freq: 500 MHz
59
+ Achieved Freq: 500 MHz
60
+
61
+ Name: _bd_top_clkwiz_kernel_clk_out1
62
+ Type: SCALABLE
63
+ Default Freq: 300 MHz
64
+ Requested Freq: 300 MHz
65
+ Achieved Freq: 220 MHz
66
+
67
+ Memory Configuration
68
+ --------------------
69
+ Name: HBM[0]
70
+ Index: 0
71
+ Type: MEM_DDR4
72
+ Base Address: 0x0
73
+ Address Size: 0x10000000
74
+ Bank Used: Yes
75
+
76
+ Name: HBM[1]
77
+ Index: 1
78
+ Type: MEM_DDR4
79
+ Base Address: 0x10000000
80
+ Address Size: 0x10000000
81
+ Bank Used: Yes
82
+
83
+ Name: HBM[2]
84
+ Index: 2
85
+ Type: MEM_DRAM
86
+ Base Address: 0x20000000
87
+ Address Size: 0x10000000
88
+ Bank Used: Yes
89
+
90
+ Name: HBM[3]
91
+ Index: 3
92
+ Type: MEM_DRAM
93
+ Base Address: 0x30000000
94
+ Address Size: 0x10000000
95
+ Bank Used: Yes
96
+
97
+ Name: HBM[4]
98
+ Index: 4
99
+ Type: MEM_DRAM
100
+ Base Address: 0x40000000
101
+ Address Size: 0x10000000
102
+ Bank Used: No
103
+
104
+ Name: HBM[5]
105
+ Index: 5
106
+ Type: MEM_DRAM
107
+ Base Address: 0x50000000
108
+ Address Size: 0x10000000
109
+ Bank Used: No
110
+
111
+ Name: HBM[6]
112
+ Index: 6
113
+ Type: MEM_DRAM
114
+ Base Address: 0x60000000
115
+ Address Size: 0x10000000
116
+ Bank Used: No
117
+
118
+ Name: HBM[7]
119
+ Index: 7
120
+ Type: MEM_DRAM
121
+ Base Address: 0x70000000
122
+ Address Size: 0x10000000
123
+ Bank Used: No
124
+
125
+ Name: HBM[8]
126
+ Index: 8
127
+ Type: MEM_DRAM
128
+ Base Address: 0x80000000
129
+ Address Size: 0x10000000
130
+ Bank Used: No
131
+
132
+ Name: HBM[9]
133
+ Index: 9
134
+ Type: MEM_DRAM
135
+ Base Address: 0x90000000
136
+ Address Size: 0x10000000
137
+ Bank Used: No
138
+
139
+ Name: HBM[10]
140
+ Index: 10
141
+ Type: MEM_DRAM
142
+ Base Address: 0xa0000000
143
+ Address Size: 0x10000000
144
+ Bank Used: No
145
+
146
+ Name: HBM[11]
147
+ Index: 11
148
+ Type: MEM_DRAM
149
+ Base Address: 0xb0000000
150
+ Address Size: 0x10000000
151
+ Bank Used: No
152
+
153
+ Name: HBM[12]
154
+ Index: 12
155
+ Type: MEM_DRAM
156
+ Base Address: 0xc0000000
157
+ Address Size: 0x10000000
158
+ Bank Used: No
159
+
160
+ Name: HBM[13]
161
+ Index: 13
162
+ Type: MEM_DRAM
163
+ Base Address: 0xd0000000
164
+ Address Size: 0x10000000
165
+ Bank Used: No
166
+
167
+ Name: HBM[14]
168
+ Index: 14
169
+ Type: MEM_DRAM
170
+ Base Address: 0xe0000000
171
+ Address Size: 0x10000000
172
+ Bank Used: No
173
+
174
+ Name: HBM[15]
175
+ Index: 15
176
+ Type: MEM_DRAM
177
+ Base Address: 0xf0000000
178
+ Address Size: 0x10000000
179
+ Bank Used: No
180
+
181
+ Name: HBM[16]
182
+ Index: 16
183
+ Type: MEM_DRAM
184
+ Base Address: 0x100000000
185
+ Address Size: 0x10000000
186
+ Bank Used: Yes
187
+
188
+ Name: HBM[17]
189
+ Index: 17
190
+ Type: MEM_DRAM
191
+ Base Address: 0x110000000
192
+ Address Size: 0x10000000
193
+ Bank Used: Yes
194
+
195
+ Name: HBM[18]
196
+ Index: 18
197
+ Type: MEM_DRAM
198
+ Base Address: 0x120000000
199
+ Address Size: 0x10000000
200
+ Bank Used: Yes
201
+
202
+ Name: HBM[19]
203
+ Index: 19
204
+ Type: MEM_DRAM
205
+ Base Address: 0x130000000
206
+ Address Size: 0x10000000
207
+ Bank Used: No
208
+
209
+ Name: HBM[20]
210
+ Index: 20
211
+ Type: MEM_DRAM
212
+ Base Address: 0x140000000
213
+ Address Size: 0x10000000
214
+ Bank Used: No
215
+
216
+ Name: HBM[21]
217
+ Index: 21
218
+ Type: MEM_DRAM
219
+ Base Address: 0x150000000
220
+ Address Size: 0x10000000
221
+ Bank Used: No
222
+
223
+ Name: HBM[22]
224
+ Index: 22
225
+ Type: MEM_DRAM
226
+ Base Address: 0x160000000
227
+ Address Size: 0x10000000
228
+ Bank Used: No
229
+
230
+ Name: HBM[23]
231
+ Index: 23
232
+ Type: MEM_DRAM
233
+ Base Address: 0x170000000
234
+ Address Size: 0x10000000
235
+ Bank Used: No
236
+
237
+ Name: HBM[24]
238
+ Index: 24
239
+ Type: MEM_DRAM
240
+ Base Address: 0x180000000
241
+ Address Size: 0x10000000
242
+ Bank Used: No
243
+
244
+ Name: HBM[25]
245
+ Index: 25
246
+ Type: MEM_DRAM
247
+ Base Address: 0x190000000
248
+ Address Size: 0x10000000
249
+ Bank Used: No
250
+
251
+ Name: HBM[26]
252
+ Index: 26
253
+ Type: MEM_DRAM
254
+ Base Address: 0x1a0000000
255
+ Address Size: 0x10000000
256
+ Bank Used: No
257
+
258
+ Name: HBM[27]
259
+ Index: 27
260
+ Type: MEM_DRAM
261
+ Base Address: 0x1b0000000
262
+ Address Size: 0x10000000
263
+ Bank Used: No
264
+
265
+ Name: HBM[28]
266
+ Index: 28
267
+ Type: MEM_DRAM
268
+ Base Address: 0x1c0000000
269
+ Address Size: 0x10000000
270
+ Bank Used: No
271
+
272
+ Name: HBM[29]
273
+ Index: 29
274
+ Type: MEM_DRAM
275
+ Base Address: 0x1d0000000
276
+ Address Size: 0x10000000
277
+ Bank Used: No
278
+
279
+ Name: HBM[30]
280
+ Index: 30
281
+ Type: MEM_DRAM
282
+ Base Address: 0x1e0000000
283
+ Address Size: 0x10000000
284
+ Bank Used: No
285
+
286
+ Name: HBM[31]
287
+ Index: 31
288
+ Type: MEM_DRAM
289
+ Base Address: 0x1f0000000
290
+ Address Size: 0x10000000
291
+ Bank Used: No
292
+
293
+ Name: DDR[0]
294
+ Index: 32
295
+ Type: MEM_DRAM
296
+ Base Address: 0x0
297
+ Address Size: 0x0
298
+ Bank Used: No
299
+
300
+ Name: DDR[1]
301
+ Index: 33
302
+ Type: MEM_DRAM
303
+ Base Address: 0x0
304
+ Address Size: 0x0
305
+ Bank Used: No
306
+
307
+ Name: PLRAM[0]
308
+ Index: 34
309
+ Type: MEM_DRAM
310
+ Base Address: 0x0
311
+ Address Size: 0x0
312
+ Bank Used: No
313
+
314
+ Name: PLRAM[1]
315
+ Index: 35
316
+ Type: MEM_DRAM
317
+ Base Address: 0x0
318
+ Address Size: 0x0
319
+ Bank Used: No
320
+
321
+ Name: PLRAM[2]
322
+ Index: 36
323
+ Type: MEM_DRAM
324
+ Base Address: 0x0
325
+ Address Size: 0x0
326
+ Bank Used: No
327
+
328
+ Name: PLRAM[3]
329
+ Index: 37
330
+ Type: MEM_DRAM
331
+ Base Address: 0x0
332
+ Address Size: 0x0
333
+ Bank Used: No
334
+
335
+ Name: PLRAM[4]
336
+ Index: 38
337
+ Type: MEM_DRAM
338
+ Base Address: 0x0
339
+ Address Size: 0x0
340
+ Bank Used: No
341
+
342
+ Name: PLRAM[5]
343
+ Index: 39
344
+ Type: MEM_DRAM
345
+ Base Address: 0x0
346
+ Address Size: 0x0
347
+ Bank Used: No
348
+ ==============================================================================
349
+ Kernel: opt_kernel
350
+
351
+ Definition
352
+ ----------
353
+ Signature: opt_kernel (const int L, const int L_out, const int seq_len, const int reload, ap_uint<512>* X_acc0, ap_uint<512>* X_acc1, ap_uint<512>* W_acc0, ap_uint<512>* W_acc1, ap_uint<64>* acc0_out, ap_uint<64>* acc1_out, int* cycle_count)
354
+
355
+ Ports
356
+ -----
357
+ Port: m_axi_X_acc0
358
+ Mode: master
359
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
360
+ Data Width: 512 bits
361
+ Port Type: addressable
362
+
363
+ Port: m_axi_X_acc1
364
+ Mode: master
365
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
366
+ Data Width: 512 bits
367
+ Port Type: addressable
368
+
369
+ Port: m_axi_W_acc0
370
+ Mode: master
371
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
372
+ Data Width: 512 bits
373
+ Port Type: addressable
374
+
375
+ Port: m_axi_W_acc1
376
+ Mode: master
377
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
378
+ Data Width: 512 bits
379
+ Port Type: addressable
380
+
381
+ Port: m_axi_acc0_out
382
+ Mode: master
383
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
384
+ Data Width: 64 bits
385
+ Port Type: addressable
386
+
387
+ Port: m_axi_acc1_out
388
+ Mode: master
389
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
390
+ Data Width: 64 bits
391
+ Port Type: addressable
392
+
393
+ Port: m_axi_cycle_count
394
+ Mode: master
395
+ Range (bytes): 0xFFFFFFFFFFFFFFFF
396
+ Data Width: 32 bits
397
+ Port Type: addressable
398
+
399
+ Port: s_axi_control
400
+ Mode: slave
401
+ Range (bytes): 0x1000
402
+ Data Width: 32 bits
403
+ Port Type: addressable
404
+
405
+ --------------------------
406
+ Instance: opt_kernel
407
+ Base Address: 0x1800000
408
+
409
+ Argument: L
410
+ Register Offset: 0x10
411
+ Port: s_axi_control
412
+ Memory: <not applicable>
413
+
414
+ Argument: L_out
415
+ Register Offset: 0x18
416
+ Port: s_axi_control
417
+ Memory: <not applicable>
418
+
419
+ Argument: seq_len
420
+ Register Offset: 0x20
421
+ Port: s_axi_control
422
+ Memory: <not applicable>
423
+
424
+ Argument: reload
425
+ Register Offset: 0x28
426
+ Port: s_axi_control
427
+ Memory: <not applicable>
428
+
429
+ Argument: X_acc0
430
+ Register Offset: 0x30
431
+ Port: m_axi_X_acc0
432
+ Memory: HBM[0] (MEM_DDR4)
433
+
434
+ Argument: X_acc1
435
+ Register Offset: 0x3c
436
+ Port: m_axi_X_acc1
437
+ Memory: HBM[16] (MEM_DRAM)
438
+
439
+ Argument: W_acc0
440
+ Register Offset: 0x48
441
+ Port: m_axi_W_acc0
442
+ Memory: HBM[1] (MEM_DDR4)
443
+
444
+ Argument: W_acc1
445
+ Register Offset: 0x54
446
+ Port: m_axi_W_acc1
447
+ Memory: HBM[17] (MEM_DRAM)
448
+
449
+ Argument: acc0_out
450
+ Register Offset: 0x60
451
+ Port: m_axi_acc0_out
452
+ Memory: HBM[2] (MEM_DRAM)
453
+
454
+ Argument: acc1_out
455
+ Register Offset: 0x6c
456
+ Port: m_axi_acc1_out
457
+ Memory: HBM[18] (MEM_DRAM)
458
+
459
+ Argument: cycle_count
460
+ Register Offset: 0x78
461
+ Port: m_axi_cycle_count
462
+ Memory: HBM[3] (MEM_DRAM)
463
+ ==============================================================================
464
+ Generated By
465
+ ------------
466
+ Command: v++
467
+ Version: 2021.2 - 2021-10-14-04:41:01 (SW BUILD: 3363252)
468
+ Command Line: v++ --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/run/link_config.ini --connectivity.nk opt_kernel:1:opt_kernel --connectivity.sp opt_kernel.X_acc0:HBM[0] --connectivity.sp opt_kernel.X_acc1:HBM[16] --connectivity.sp opt_kernel.W_acc0:HBM[1] --connectivity.sp opt_kernel.W_acc1:HBM[17] --connectivity.sp opt_kernel.acc0_out:HBM[2] --connectivity.sp opt_kernel.acc1_out:HBM[18] --connectivity.sp opt_kernel.cycle_count:HBM[3] --input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt.hw.xo --kernel opt_kernel --link --optimize 3 --output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin --platform xilinx_u280_xdma_201920_3 --report_level 2 --save-temps --target hw --temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget -propconst -sweep -shift_register_opt} --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=EarlyBlockPlacement --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt-floorplan.tcl --vivado.synth.jobs 8
469
+ Options: --config /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/run/link_config.ini
470
+ --connectivity.nk opt_kernel:1:opt_kernel
471
+ --connectivity.sp opt_kernel.X_acc0:HBM[0]
472
+ --connectivity.sp opt_kernel.X_acc1:HBM[16]
473
+ --connectivity.sp opt_kernel.W_acc0:HBM[1]
474
+ --connectivity.sp opt_kernel.W_acc1:HBM[17]
475
+ --connectivity.sp opt_kernel.acc0_out:HBM[2]
476
+ --connectivity.sp opt_kernel.acc1_out:HBM[18]
477
+ --connectivity.sp opt_kernel.cycle_count:HBM[3]
478
+ --input_files /home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt.hw.xo
479
+ --kernel opt_kernel
480
+ --link
481
+ --optimize 3
482
+ --output /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.xclbin
483
+ --platform xilinx_u280_xdma_201920_3
484
+ --report_level 2
485
+ --save-temps
486
+ --target hw
487
+ --temp_dir /home/oswaldhe/fpga_transformer/opt-fluid-model/vitis_run_hw/opt_kernel_xilinx_u280_xdma_201920_3.temp
488
+ --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1
489
+ --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-retarget
490
+ -propconst
491
+ -sweep
492
+ -shift_register_opt}
493
+ --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=EarlyBlockPlacement
494
+ --vivado.prop run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore
495
+ --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore
496
+ --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/home/oswaldhe/fpga_transformer/opt-fluid-model/opt-stage4-context.tapa/run-2/opt-floorplan.tcl
497
+ --vivado.synth.jobs 8
498
+ ==============================================================================
499
+ User Added Key Value Pairs
500
+ --------------------------
501
+ <empty>
502
+ ==============================================================================
gpt-2-medium/export_xo.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from rapidstream import RapidStreamTAPA, DeviceFactory, get_u250_vitis_device_factory
2
+ from pathlib import Path
3
+ import os
4
+
5
+ CURR_DIR = os.path.dirname(os.path.abspath(__file__))
6
+ BUILD_DIR = "rs_build"
7
+ VITIS_PLATFORM = "~/vpk180_linux_platform/vpk180_pfm_vitis/export/vpk180_pfm_vitis/vpk180_pfm_vitis.xpfm"
8
+
9
+
10
+ rs = RapidStreamTAPA(BUILD_DIR)
11
+
12
+ # factory = get_u250_vitis_device_factory(VITIS_PLATFORM)
13
+ factory = DeviceFactory(
14
+ row=4,
15
+ col=2,
16
+ part_num="xcvp1802-lsvc4072-2MP-e-S",
17
+ board_name="xilinx.com:vpk180:part0:1.1",
18
+ )
19
+
20
+ # Set the pblocks of the device so that each slot contains half of an SLR:
21
+ factory.set_slot_pblock(0, 0, ["-add CLOCKREGION_X0Y1:CLOCKREGION_X4Y4"])
22
+ factory.set_slot_pblock(1, 0, ["-add CLOCKREGION_X5Y1:CLOCKREGION_X9Y4"])
23
+ factory.set_slot_pblock(0, 1, ["-add CLOCKREGION_X0Y5:CLOCKREGION_X4Y7"])
24
+ factory.set_slot_pblock(1, 1, ["-add CLOCKREGION_X5Y5:CLOCKREGION_X9Y7"])
25
+
26
+ factory.set_slot_pblock(0, 2, ["-add CLOCKREGION_X0Y8:CLOCKREGION_X4Y10"])
27
+ factory.set_slot_pblock(1, 2, ["-add CLOCKREGION_X5Y8:CLOCKREGION_X9Y10"])
28
+ factory.set_slot_pblock(0, 3, ["-add CLOCKREGION_X0Y11:CLOCKREGION_X4Y13"])
29
+ factory.set_slot_pblock(1, 3, ["-add CLOCKREGION_X5Y11:CLOCKREGION_X9Y13"])
30
+
31
+ # There are 18870 total SLL nodes for VP1552:
32
+ factory.set_slot_capacity(0, 0, north=9435)
33
+ factory.set_slot_capacity(1, 0, north=9435)
34
+ factory.set_slot_capacity(0, 1, north=9435)
35
+ factory.set_slot_capacity(1, 1, north=9435)
36
+ factory.set_slot_capacity(0, 2, north=9435)
37
+ factory.set_slot_capacity(1, 2, north=9435)
38
+
39
+ # Call factory to extract the slot resources automatically from Vivado:
40
+ factory.extract_slot_resources()
41
+
42
+ rs.set_virtual_device(factory.generate_virtual_device())
43
+
44
+ rs.add_xo_file("./gpt2-sa.tapa/gpt2.xo")
45
+ rs.set_top_module_name("opt_kernel")
46
+ rs.add_clock("ap_clk", period_ns=3.33)
47
+ rs.set_vitis_connectivity_config("link_config_versal.ini")
48
+
49
+ work_dir_to_ir = {Path(f'{CURR_DIR}/{BUILD_DIR}/dse/candidate_5'): Path(f'{CURR_DIR}/{BUILD_DIR}/dse/candidate_5/add_pipeline.json')}
50
+ rs.remote_ip_cache = Path(f"{CURR_DIR}/{BUILD_DIR}")
51
+ rs.set_vitis_platform(VITIS_PLATFORM)
52
+ rs.parallel_export_candidates(work_dir_to_ir)
gpt-2-medium/generate_bitstream_sample.sh ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ TARGET=hw
3
+ # TARGET=hw_emu
4
+ # DEBUG=-g
5
+
6
+ TOP=opt_kernel
7
+ XO='/path/to/opt_kernel.xo'
8
+ CONSTRAINT='/path/to/floorplanning/constraint.tcl'
9
+ >&2 echo "Using the default clock target of the platform."
10
+ PLATFORM="/path/to/vitis/vpk180.xpfm"
11
+ TARGET_FREQUENCY=240000000
12
+ if [ -z $PLATFORM ]; then echo Please edit this file and set a valid PLATFORM= on line "${LINENO}"; exit; fi
13
+
14
+ OUTPUT_DIR="$(pwd)/vitis_run_${TARGET}_ln"
15
+
16
+ MAX_SYNTH_JOBS=16
17
+ STRATEGY="Explore"
18
+ PLACEMENT_STRATEGY="Explore"
19
+
20
+ v++ ${DEBUG} \
21
+ --link \
22
+ --output "${OUTPUT_DIR}/${TOP}_vpk180.xsa" \
23
+ --kernel ${TOP} \
24
+ --platform ${PLATFORM} \
25
+ --target ${TARGET} \
26
+ --report_level 2 \
27
+ --temp_dir "${OUTPUT_DIR}/${TOP}_vpk180.temp" \
28
+ --optimize 3 \
29
+ --connectivity.nk ${TOP}:1:${TOP} \
30
+ --save-temps \
31
+ "${XO}" \
32
+ --vivado.synth.jobs ${MAX_SYNTH_JOBS} \
33
+ --vivado.prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 \
34
+ --vivado.prop=run.impl_1.STEPS.OPT_DESIGN.ARGS.DIRECTIVE=$STRATEGY \
35
+ --vivado.prop=run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE\ OPTIONS}={-debug_log} \
36
+ --vivado.prop=run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=$PLACEMENT_STRATEGY \
37
+ --vivado.prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=$STRATEGY \
38
+ --vivado.prop=run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=$STRATEGY \
39
+ --clock.default_freqhz ${TARGET_FREQUENCY} \
40
+ --vivado.prop=run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=$CONSTRAINT \
gpt-2-medium/hbm_config.ini ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ [connectivity]
2
+ sp=opt_kernel.X_acc0:HBM[0]
3
+ sp=opt_kernel.X_acc1:HBM[16]
4
+ sp=opt_kernel.W_acc0:HBM[1]
5
+ sp=opt_kernel.W_acc1:HBM[17]
6
+ sp=opt_kernel.acc0_out:HBM[2]
7
+ sp=opt_kernel.cycle_count:HBM[19]
gpt-2-medium/host-u280.cpp ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <vector>
2
+ #include <cmath>
3
+ #include <iostream>
4
+ #include <string>
5
+ #include <ctime>
6
+ #include <cmath>
7
+ #include <tapa.h>
8
+ #include <gflags/gflags.h>
9
+ #include <ap_int.h>
10
+
11
+ constexpr int D = 1024;
12
+ constexpr int D_ffn = 5504;
13
+ constexpr int N_head = 16;
14
+ constexpr int MAX_SEQ_LEN = 1024;
15
+ constexpr int NUM_SLR = 3;
16
+ constexpr int NUM_DUM_SLR = 4;
17
+ constexpr int D_head = D / N_head;
18
+ constexpr int FFN_WEIGHT_SIZE = D * D_ffn;
19
+ constexpr int OUT_WEIGHT_SIZE = D * D;
20
+ constexpr int QKV_WEIGHT_SIZE = D * D / N_head * NUM_DUM_SLR * 2; // multi-head attention
21
+
22
+ using std::vector;
23
+ using int_v16 = tapa::vec_t<int, 16>;
24
+ using int4_v128 = tapa::vec_t<ap_int<4>, 128>;
25
+ using int8_v64 = tapa::vec_t<ap_int<8>, 64>;
26
+
27
+ void opt_kernel(
28
+ const int L,
29
+ const int L_out,
30
+ const int seq_len,
31
+ // tapa::mmap<int> inst, // inst[0] = L, inst[1] = reload_weight
32
+ tapa::mmap<ap_uint<512>> X_acc0,
33
+ tapa::mmap<ap_uint<512>> X_acc1,
34
+ tapa::mmap<ap_uint<512>> W_acc0,
35
+ tapa::mmap<ap_uint<512>> W_acc1,
36
+ tapa::mmap<ap_uint<128>> acc0_out,
37
+ // tapa::mmap<ap_uint<64>> acc1_out,
38
+ tapa::mmap<int> cycle_count
39
+ );
40
+
41
+ template <typename T>
42
+ using aligned_vector = std::vector<T, tapa::aligned_allocator<T>>;
43
+
44
+ DEFINE_string(bitstream, "", "path to bitstream file");
45
+
46
+ int main(int argc, char *argv[]){
47
+ gflags::ParseCommandLineFlags(&argc, &argv, true);
48
+
49
+ const int L = argc > 1 ? atoll(argv[1]) : MAX_SEQ_LEN;
50
+
51
+ srand((unsigned)time(nullptr));
52
+
53
+ // data preparation
54
+ aligned_vector<int> inst = {L, 1};
55
+ aligned_vector<ap_int<8>> X_acc0(L * D, 0);
56
+ aligned_vector<ap_int<8>> X_acc1(L * D, 0);
57
+ aligned_vector<ap_int<8>> W_acc0(D * D_head * NUM_DUM_SLR * 10 + D * D_ffn, 0);
58
+ aligned_vector<ap_int<8>> W_acc1(D * D_head * NUM_DUM_SLR * 10 + D * D_ffn, 0);
59
+ aligned_vector<ap_uint<128>> acc0_out(NUM_SLR * L * D / 8);
60
+ // aligned_vector<ap_uint<512>> acc0_out(NUM_SLR, aligned_vector<ap_uint<512>>(L * L / 16));
61
+ aligned_vector<ap_uint<64>> acc1_out(NUM_SLR * L * D / 8);
62
+ aligned_vector<int> cycle_count(1);
63
+
64
+
65
+ vector<int> X_copy(L * D);
66
+ vector<vector<int>> W_acc0_split(NUM_DUM_SLR, vector<int>(D * D_head * 8));
67
+ vector<vector<int>> W_acc1_split(NUM_DUM_SLR, vector<int>(D * D_head * 8));
68
+ vector<vector<int>> W_k_split(NUM_DUM_SLR, vector<int>(D * D_head * 8));
69
+ vector<aligned_vector<int>> q_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
70
+ vector<aligned_vector<int>> k_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
71
+ vector<aligned_vector<int>> attn_golden(NUM_DUM_SLR, aligned_vector<int>(L * L));
72
+ vector<aligned_vector<int>> acc1_out_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
73
+
74
+ // for(int i = 0; i < L * D; i++){
75
+ // int val = (rand() % 8) + 1;
76
+ // ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
77
+ // X_copy[i] = val;
78
+ // X_acc0[i] = ap_int<8>(full(7, 0));
79
+ // X_acc1[i] = ap_int<8>(full(7, 0));
80
+ // }
81
+
82
+ // for(int i = 0; i < D * D_head * NUM_DUM_SLR * 4; i++){
83
+ // int val = (rand() % 6) - 1;
84
+ // ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
85
+ // W_acc0[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
86
+ // W_acc0_split[(i / 32) % 4][(i / 128) * 32 + (i % 32)] = val;
87
+ // }
88
+
89
+ // for(int i = 0; i < D * D_head * NUM_DUM_SLR * 4; i++){
90
+ // int val = (rand() % 6) - 1;
91
+ // ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
92
+ // W_acc1[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
93
+ // W_acc1_split[(i / 32) % 4][(i / 128) * 32 + (i % 32)] = val;
94
+ // }
95
+
96
+ // for(int i = D * D_head * NUM_DUM_SLR * 4; i < D * D_head * NUM_DUM_SLR * 12; i++){
97
+ // int val = (rand() % 6) - 1;
98
+ // int ind = i - D * D_head * NUM_DUM_SLR * 4;
99
+ // ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
100
+ // W_acc0[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
101
+ // W_acc1[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
102
+ // W_k_split[(ind / 32) % 4][(ind / 128) * 32 + (ind % 32)] = val;
103
+ // }
104
+
105
+ // // cpu
106
+ // for(int i = 0; i < NUM_SLR; i++){
107
+ // // WqX
108
+ // for(int j = 0; j < L; j++){
109
+ // for(int k = 0; k < D_head; k++){
110
+ // int acc = 0;
111
+ // for(int l = 0; l < D; l++){
112
+ // acc += X_copy[j*D+l] * W_acc0_split[i][l*D_head + k];
113
+ // }
114
+ // q_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
115
+ // }
116
+ // }
117
+
118
+ // //WvX
119
+ // for(int j = 0; j < L; j++){
120
+ // for(int k = 0; k < D_head; k++){
121
+ // int acc = 0;
122
+ // for(int l = 0; l < D; l++){
123
+ // acc += X_copy[j*D+l] * W_acc1_split[i][l*D_head + k];
124
+ // }
125
+ // acc1_out_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
126
+ // }
127
+ // }
128
+
129
+ // //WkX
130
+ // for(int j = 0; j < L; j++){
131
+ // for(int k = 0; k < D_head; k++){
132
+ // int acc = 0;
133
+ // for(int l = 0; l < D; l++){
134
+ // acc += X_copy[j*D+l] * W_k_split[i][l*D_head + k];
135
+ // }
136
+ // k_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
137
+ // }
138
+ // }
139
+
140
+ // // QK^T
141
+ // for(int j = 0; j < L; j++){
142
+ // for(int k = 0; k < L; k++){
143
+ // int acc = 0;
144
+ // for(int l = 0; l < D_head; l++){
145
+ // acc += q_golden[i][k*D_head+l] * k_golden[i][j*D_head+l];
146
+ // }
147
+ // attn_golden[i][j*D_head+k] = acc;
148
+ // }
149
+ // }
150
+ // }
151
+
152
+
153
+ // invoke the kernel
154
+ int64_t kernel_time_ns = 0;
155
+ for(int i = 0; i < 24; i++){
156
+ kernel_time_ns += tapa::invoke(opt_kernel, FLAGS_bitstream,
157
+ L * D, L * D / 16, L,
158
+ // tapa::read_only_mmap<int>(inst),
159
+ tapa::read_only_mmap<ap_int<8>>(X_acc0).reinterpret<ap_uint<512>>(),
160
+ tapa::read_only_mmap<ap_int<8>>(X_acc1).reinterpret<ap_uint<512>>(),
161
+ tapa::read_only_mmap<ap_int<8>>(W_acc0).reinterpret<ap_uint<512>>(),
162
+ tapa::read_only_mmap<ap_int<8>>(W_acc1).reinterpret<ap_uint<512>>(),
163
+ tapa::write_only_mmap<ap_uint<128>>(acc0_out),
164
+ // tapa::write_only_mmap<ap_uint<64>>(acc1_out),
165
+ tapa::write_only_mmap<int>(cycle_count));
166
+ }
167
+
168
+ std::clog << "cycle time: " << cycle_count[0] << std::endl;
169
+ std::clog << "kernel time: " << kernel_time_ns * 1e-9 << " s" << std::endl;
170
+
171
+ }
172
+
gpt-2-medium/host-versal.cpp ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <vector>
2
+ #include <cmath>
3
+ #include <iostream>
4
+ #include <string>
5
+ #include <ctime>
6
+ #include <cmath>
7
+ #include <tapa.h>
8
+ #include <gflags/gflags.h>
9
+ #include <ap_int.h>
10
+
11
+ constexpr int D = 1024;
12
+ constexpr int D_ffn = 4096;
13
+ constexpr int N_head = 16;
14
+ constexpr int MAX_SEQ_LEN = 1024;
15
+ constexpr int NUM_SLR = 4;
16
+ constexpr int NUM_DUM_SLR = 4;
17
+ constexpr int D_head = D / N_head;
18
+ constexpr int FFN_WEIGHT_SIZE = D * D_ffn;
19
+ constexpr int OUT_WEIGHT_SIZE = D * D;
20
+ constexpr int QKV_WEIGHT_SIZE = D * D / N_head * NUM_DUM_SLR * 2; // multi-head attention
21
+
22
+ using std::vector;
23
+ using int_v16 = tapa::vec_t<int, 16>;
24
+ using int4_v128 = tapa::vec_t<ap_int<4>, 128>;
25
+ using int8_v64 = tapa::vec_t<ap_int<8>, 64>;
26
+
27
+ void opt_kernel(
28
+ const int L,
29
+ const int L_out,
30
+ const int seq_len,
31
+ // tapa::mmap<int> inst, // inst[0] = L, inst[1] = reload_weight
32
+ tapa::mmap<ap_uint<512>> X_acc0,
33
+ tapa::mmap<ap_uint<512>> X_acc1,
34
+ tapa::mmap<ap_uint<512>> W_acc0,
35
+ tapa::mmap<ap_uint<512>> W_acc1,
36
+ tapa::mmap<ap_uint<128>> acc0_out,
37
+ // tapa::mmap<ap_uint<64>> acc1_out,
38
+ tapa::mmap<int> cycle_count
39
+ );
40
+
41
+ template <typename T>
42
+ using aligned_vector = std::vector<T, tapa::aligned_allocator<T>>;
43
+
44
+ DEFINE_string(bitstream, "", "path to bitstream file");
45
+
46
+ int main(int argc, char *argv[]){
47
+ gflags::ParseCommandLineFlags(&argc, &argv, true);
48
+
49
+ const int L = argc > 1 ? atoll(argv[1]) : MAX_SEQ_LEN;
50
+
51
+ srand((unsigned)time(nullptr));
52
+
53
+ // data preparation
54
+ aligned_vector<int> inst = {L, 1};
55
+ aligned_vector<ap_int<8>> X_acc0(L * D);
56
+ aligned_vector<ap_int<8>> X_acc1(L * D);
57
+ aligned_vector<ap_int<8>> W_acc0(D * D_head * NUM_DUM_SLR * 8 + D * D_ffn, 1);
58
+ aligned_vector<ap_int<8>> W_acc1(D * D_head * NUM_DUM_SLR * 8 + D * D_ffn, 1);
59
+ aligned_vector<ap_uint<128>> acc0_out(NUM_SLR * L * D / 8);
60
+ // aligned_vector<ap_uint<512>> acc0_out(NUM_SLR, aligned_vector<ap_uint<512>>(L * L / 16));
61
+ aligned_vector<ap_uint<64>> acc1_out(NUM_SLR * L * D / 8);
62
+ aligned_vector<int> cycle_count(1);
63
+
64
+
65
+ vector<int> X_copy(L * D);
66
+ vector<vector<int>> W_acc0_split(NUM_DUM_SLR, vector<int>(D * D_head * 8));
67
+ vector<vector<int>> W_acc1_split(NUM_DUM_SLR, vector<int>(D * D_head * 8));
68
+ vector<vector<int>> W_k_split(NUM_DUM_SLR, vector<int>(D * D_head * 8));
69
+ vector<aligned_vector<int>> q_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
70
+ vector<aligned_vector<int>> k_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
71
+ vector<aligned_vector<int>> attn_golden(NUM_DUM_SLR, aligned_vector<int>(L * L));
72
+ vector<aligned_vector<int>> acc1_out_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
73
+
74
+ for(int i = 0; i < L * D; i++){
75
+ int val = (rand() % 8) + 1;
76
+ ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
77
+ X_copy[i] = val;
78
+ X_acc0[i] = ap_int<8>(full(7, 0));
79
+ X_acc1[i] = ap_int<8>(full(7, 0));
80
+ }
81
+
82
+ for(int i = 0; i < D * D_head * NUM_DUM_SLR * 4; i++){
83
+ int val = (rand() % 6) - 1;
84
+ ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
85
+ W_acc0[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
86
+ W_acc0_split[(i / 32) % 4][(i / 128) * 32 + (i % 32)] = val;
87
+ }
88
+
89
+ for(int i = 0; i < D * D_head * NUM_DUM_SLR * 4; i++){
90
+ int val = (rand() % 6) - 1;
91
+ ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
92
+ W_acc1[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
93
+ W_acc1_split[(i / 32) % 4][(i / 128) * 32 + (i % 32)] = val;
94
+ }
95
+
96
+ for(int i = D * D_head * NUM_DUM_SLR * 4; i < D * D_head * NUM_DUM_SLR * 12; i++){
97
+ int val = (rand() % 6) - 1;
98
+ int ind = i - D * D_head * NUM_DUM_SLR * 4;
99
+ ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
100
+ W_acc0[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
101
+ W_acc1[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
102
+ W_k_split[(ind / 32) % 4][(ind / 128) * 32 + (ind % 32)] = val;
103
+ }
104
+
105
+ // cpu
106
+ for(int i = 0; i < NUM_SLR; i++){
107
+ // WqX
108
+ for(int j = 0; j < L; j++){
109
+ for(int k = 0; k < D_head; k++){
110
+ int acc = 0;
111
+ for(int l = 0; l < D; l++){
112
+ acc += X_copy[j*D+l] * W_acc0_split[i][l*D_head + k];
113
+ }
114
+ q_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
115
+ }
116
+ }
117
+
118
+ //WvX
119
+ for(int j = 0; j < L; j++){
120
+ for(int k = 0; k < D_head; k++){
121
+ int acc = 0;
122
+ for(int l = 0; l < D; l++){
123
+ acc += X_copy[j*D+l] * W_acc1_split[i][l*D_head + k];
124
+ }
125
+ acc1_out_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
126
+ }
127
+ }
128
+
129
+ //WkX
130
+ for(int j = 0; j < L; j++){
131
+ for(int k = 0; k < D_head; k++){
132
+ int acc = 0;
133
+ for(int l = 0; l < D; l++){
134
+ acc += X_copy[j*D+l] * W_k_split[i][l*D_head + k];
135
+ }
136
+ k_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
137
+ }
138
+ }
139
+
140
+ // QK^T
141
+ for(int j = 0; j < L; j++){
142
+ for(int k = 0; k < L; k++){
143
+ int acc = 0;
144
+ for(int l = 0; l < D_head; l++){
145
+ acc += q_golden[i][k*D_head+l] * k_golden[i][j*D_head+l];
146
+ }
147
+ attn_golden[i][j*D_head+k] = acc;
148
+ }
149
+ }
150
+ }
151
+
152
+
153
+ // invoke the kernel
154
+ int64_t kernel_time_ns = 0;
155
+ for(int i = 0; i < 1; i++){
156
+ kernel_time_ns = tapa::invoke(opt_kernel, FLAGS_bitstream,
157
+ L * D, L * D / 16, L,
158
+ // tapa::read_only_mmap<int>(inst),
159
+ tapa::read_only_mmap<ap_int<8>>(X_acc0).reinterpret<ap_uint<512>>(),
160
+ tapa::read_only_mmap<ap_int<8>>(X_acc1).reinterpret<ap_uint<512>>(),
161
+ tapa::read_only_mmap<ap_int<8>>(W_acc0).reinterpret<ap_uint<512>>(),
162
+ tapa::read_only_mmap<ap_int<8>>(W_acc1).reinterpret<ap_uint<512>>(),
163
+ tapa::write_only_mmap<ap_uint<128>>(acc0_out),
164
+ // tapa::write_only_mmap<ap_uint<64>>(acc1_out),
165
+ tapa::write_only_mmap<int>(cycle_count));
166
+ }
167
+
168
+ std::clog << "cycle time: " << cycle_count[0] << std::endl;
169
+ std::clog << "kernel time: " << kernel_time_ns * 1e-9 << " s" << std::endl;
170
+
171
+ int error = 0;
172
+
173
+ // compare
174
+ // for(int i = 0; i < NUM_SLR; i++){
175
+ // for(int j = 0; j < 4; j++){
176
+ // for(int k = 0; k < 16; k++){
177
+ // if(tapa::bit_cast<int>(ap_int<32>(acc0_out[i][j](k*32+31,k*32)))-attn_golden[i][j*16+k] != 0){
178
+ // std::clog << "slr: " << i << ", index: " << j << ", actual: " << tapa::bit_cast<int>(ap_int<32>(acc0_out[i][j](k*32+31,k*32))) << ", expect: " << attn_golden[i][j*16+k] << std::endl;
179
+ // error++;
180
+ // }
181
+ // }
182
+ // }
183
+ // }
184
+
185
+ if (error == 0) {
186
+ std::clog << "PASSED" << std::endl;
187
+ } else {
188
+ std::clog << "FAILED" << std::endl;
189
+ return 1;
190
+ }
191
+ return 0;
192
+
193
+ }
194
+
gpt-2-medium/host.cpp ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <vector>
2
+ #include <cmath>
3
+ #include <iostream>
4
+ #include <string>
5
+ #include <ctime>
6
+ #include <cmath>
7
+ #include <tapa.h>
8
+ #include <gflags/gflags.h>
9
+ #include <ap_int.h>
10
+
11
+ constexpr int D = 1024;
12
+ constexpr int D_ffn = 4096;
13
+ constexpr int N_head = 16;
14
+ constexpr int MAX_SEQ_LEN = 1024;
15
+ constexpr int NUM_SLR = 3;
16
+ constexpr int NUM_DUM_SLR = 4;
17
+ constexpr int D_head = D / N_head;
18
+ constexpr int FFN_WEIGHT_SIZE = D * D_ffn;
19
+ constexpr int OUT_WEIGHT_SIZE = D * D;
20
+ constexpr int QKV_WEIGHT_SIZE = D * D / N_head * NUM_DUM_SLR * 2; // multi-head attention
21
+
22
+ using std::vector;
23
+ using int_v16 = tapa::vec_t<int, 16>;
24
+ using int4_v128 = tapa::vec_t<ap_int<4>, 128>;
25
+ using int8_v64 = tapa::vec_t<ap_int<8>, 64>;
26
+
27
+ void opt_kernel(
28
+ const int L,
29
+ const int L_out,
30
+ const int seq_len,
31
+ // tapa::mmap<int> inst, // inst[0] = L, inst[1] = reload_weight
32
+ tapa::mmap<ap_uint<512>> X_acc0,
33
+ tapa::mmap<ap_uint<512>> X_acc1,
34
+ tapa::mmap<ap_uint<512>> W_acc0,
35
+ tapa::mmap<ap_uint<512>> W_acc1,
36
+ tapa::mmap<ap_uint<64>> acc0_out,
37
+ tapa::mmap<ap_uint<64>> acc1_out,
38
+ tapa::mmap<int> cycle_count
39
+ );
40
+
41
+ template <typename T>
42
+ using aligned_vector = std::vector<T, tapa::aligned_allocator<T>>;
43
+
44
+ DEFINE_string(bitstream, "", "path to bitstream file");
45
+
46
+ int main(int argc, char *argv[]){
47
+ gflags::ParseCommandLineFlags(&argc, &argv, true);
48
+
49
+ const int L = argc > 1 ? atoll(argv[1]) : MAX_SEQ_LEN;
50
+
51
+ srand((unsigned)time(nullptr));
52
+
53
+ // data preparation
54
+ aligned_vector<int> inst = {L, 1};
55
+ aligned_vector<ap_int<8>> X_acc0(L * D);
56
+ aligned_vector<ap_int<8>> X_acc1(L * D);
57
+ aligned_vector<ap_int<8>> W_acc0(D * D_head * NUM_DUM_SLR * 10);
58
+ aligned_vector<ap_int<8>> W_acc1(D * D_head * NUM_DUM_SLR * 10);
59
+ aligned_vector<ap_uint<64>> acc0_out(NUM_SLR * L * D / 8);
60
+ // aligned_vector<ap_uint<512>> acc0_out(NUM_SLR, aligned_vector<ap_uint<512>>(L * L / 16));
61
+ aligned_vector<ap_uint<64>> acc1_out(NUM_SLR * L * D / 8);
62
+ aligned_vector<int> cycle_count(1);
63
+
64
+
65
+ vector<int> X_copy(L * D);
66
+ vector<vector<int>> W_acc0_split(NUM_DUM_SLR, vector<int>(D * D_head * 10));
67
+ vector<vector<int>> W_acc1_split(NUM_DUM_SLR, vector<int>(D * D_head * 10));
68
+ vector<vector<int>> W_k_split(NUM_DUM_SLR, vector<int>(D * D_head * 10));
69
+ vector<aligned_vector<int>> q_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
70
+ vector<aligned_vector<int>> k_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
71
+ vector<aligned_vector<int>> attn_golden(NUM_DUM_SLR, aligned_vector<int>(L * L));
72
+ vector<aligned_vector<int>> acc1_out_golden(NUM_DUM_SLR, aligned_vector<int>(L * D_head));
73
+
74
+ for(int i = 0; i < L * D; i++){
75
+ int val = (rand() % 8) + 1;
76
+ ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
77
+ X_copy[i] = val;
78
+ X_acc0[i] = ap_int<8>(full(7, 0));
79
+ X_acc1[i] = ap_int<8>(full(7, 0));
80
+ }
81
+
82
+ for(int i = 0; i < D * D_head * NUM_DUM_SLR * 5; i++){
83
+ int val = (rand() % 6) - 1;
84
+ ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
85
+ W_acc0[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
86
+ W_acc0_split[(i / 32) % 4][(i / 128) * 32 + (i % 32)] = val;
87
+ }
88
+
89
+ for(int i = 0; i < D * D_head * NUM_DUM_SLR * 5; i++){
90
+ int val = (rand() % 6) - 1;
91
+ ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
92
+ W_acc1[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
93
+ W_acc1_split[(i / 32) % 4][(i / 128) * 32 + (i % 32)] = val;
94
+ }
95
+
96
+ for(int i = D * D_head * NUM_DUM_SLR * 5; i < D * D_head * NUM_DUM_SLR * 15; i++){
97
+ int val = (rand() % 6) - 1;
98
+ int ind = i - D * D_head * NUM_DUM_SLR * 5;
99
+ ap_int<32> full = tapa::bit_cast<ap_int<32>>(val);
100
+ W_acc0[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
101
+ W_acc1[i/2]((i%2+1)*4-1, (i%2)*4) = ap_int<4>(full(3, 0));
102
+ W_k_split[(ind / 32) % 4][(ind / 128) * 32 + (ind % 32)] = val;
103
+ }
104
+
105
+ // cpu
106
+ for(int i = 0; i < NUM_SLR; i++){
107
+ // WqX
108
+ for(int j = 0; j < L; j++){
109
+ for(int k = 0; k < D_head; k++){
110
+ int acc = 0;
111
+ for(int l = 0; l < D; l++){
112
+ acc += X_copy[j*D+l] * W_acc0_split[i][l*D_head + k];
113
+ }
114
+ q_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
115
+ }
116
+ }
117
+
118
+ //WvX
119
+ for(int j = 0; j < L; j++){
120
+ for(int k = 0; k < D_head; k++){
121
+ int acc = 0;
122
+ for(int l = 0; l < D; l++){
123
+ acc += X_copy[j*D+l] * W_acc1_split[i][l*D_head + k];
124
+ }
125
+ acc1_out_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
126
+ }
127
+ }
128
+
129
+ //WkX
130
+ for(int j = 0; j < L; j++){
131
+ for(int k = 0; k < D_head; k++){
132
+ int acc = 0;
133
+ for(int l = 0; l < D; l++){
134
+ acc += X_copy[j*D+l] * W_k_split[i][l*D_head + k];
135
+ }
136
+ k_golden[i][j * D_head + k] = std::min(std::max((acc >> 8), -128), 127);
137
+ }
138
+ }
139
+
140
+ // QK^T
141
+ for(int j = 0; j < L; j++){
142
+ for(int k = 0; k < L; k++){
143
+ int acc = 0;
144
+ for(int l = 0; l < D_head; l++){
145
+ acc += q_golden[i][k*D_head+l] * k_golden[i][j*D_head+l];
146
+ }
147
+ attn_golden[i][j*D_head+k] = acc;
148
+ }
149
+ }
150
+ }
151
+
152
+
153
+ // invoke the kernel
154
+ int64_t kernel_time_ns = 0;
155
+ for(int i = 0; i < 24; i++){
156
+ kernel_time_ns += tapa::invoke(opt_kernel, FLAGS_bitstream,
157
+ L * D, L * D / 8, L,
158
+ // tapa::read_only_mmap<int>(inst),
159
+ tapa::read_only_mmap<ap_int<8>>(X_acc0).reinterpret<ap_uint<512>>(),
160
+ tapa::read_only_mmap<ap_int<8>>(X_acc1).reinterpret<ap_uint<512>>(),
161
+ tapa::read_only_mmap<ap_int<8>>(W_acc0).reinterpret<ap_uint<512>>(),
162
+ tapa::read_only_mmap<ap_int<8>>(W_acc1).reinterpret<ap_uint<512>>(),
163
+ tapa::write_only_mmap<ap_uint<64>>(acc0_out),
164
+ tapa::write_only_mmap<ap_uint<64>>(acc1_out),
165
+ tapa::write_only_mmap<int>(cycle_count));
166
+ }
167
+
168
+ // std::clog << "cycle time: " << cycle_count[0] << std::endl;
169
+ std::clog << "kernel time: " << kernel_time_ns * 2e-9 << " s" << std::endl;
170
+
171
+ int error = 0;
172
+
173
+ // compare
174
+ // for(int i = 0; i < NUM_SLR; i++){
175
+ // for(int j = 0; j < 4; j++){
176
+ // for(int k = 0; k < 16; k++){
177
+ // if(tapa::bit_cast<int>(ap_int<32>(acc0_out[i][j](k*32+31,k*32)))-attn_golden[i][j*16+k] != 0){
178
+ // std::clog << "slr: " << i << ", index: " << j << ", actual: " << tapa::bit_cast<int>(ap_int<32>(acc0_out[i][j](k*32+31,k*32))) << ", expect: " << attn_golden[i][j*16+k] << std::endl;
179
+ // error++;
180
+ // }
181
+ // }
182
+ // }
183
+ // }
184
+
185
+ if (error == 0) {
186
+ std::clog << "PASSED" << std::endl;
187
+ } else {
188
+ std::clog << "FAILED" << std::endl;
189
+ return 1;
190
+ }
191
+ return 0;
192
+
193
+ }
194
+
gpt-2-medium/host_opencl.cpp ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*******************************************************************************
2
+ Vendor: Xilinx
3
+ Associated Filename: vadd.cpp
4
+ Purpose: VITIS vector addition
5
+
6
+ *******************************************************************************
7
+ Copyright (C) 2019 XILINX, Inc.
8
+
9
+ This file contains confidential and proprietary information of Xilinx, Inc. and
10
+ is protected under U.S. and international copyright and other intellectual
11
+ property laws.
12
+
13
+ DISCLAIMER
14
+ This disclaimer is not a license and does not grant any rights to the materials
15
+ distributed herewith. Except as otherwise provided in a valid license issued to
16
+ you by Xilinx, and to the maximum extent permitted by applicable law:
17
+ (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND WITH ALL FAULTS, AND XILINX
18
+ HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY,
19
+ INCLUDING BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT, OR
20
+ FITNESS FOR ANY PARTICULAR PURPOSE; and (2) Xilinx shall not be liable (whether
21
+ in contract or tort, including negligence, or under any other theory of
22
+ liability) for any loss or damage of any kind or nature related to, arising under
23
+ or in connection with these materials, including for any direct, or any indirect,
24
+ special, incidental, or consequential loss or damage (including loss of data,
25
+ profits, goodwill, or any type of loss or damage suffered as a result of any
26
+ action brought by a third party) even if such damage or loss was reasonably
27
+ foreseeable or Xilinx had been advised of the possibility of the same.
28
+
29
+ CRITICAL APPLICATIONS
30
+ Xilinx products are not designed or intended to be fail-safe, or for use in any
31
+ application requiring fail-safe performance, such as life-support or safety
32
+ devices or systems, Class III medical devices, nuclear facilities, applications
33
+ related to the deployment of airbags, or any other applications that could lead
34
+ to death, personal injury, or severe property or environmental damage
35
+ (individually and collectively, "Critical Applications"). Customer assumes the
36
+ sole risk and liability of any use of Xilinx products in Critical Applications,
37
+ subject only to applicable laws and regulations governing limitations on product
38
+ liability.
39
+
40
+ THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS PART OF THIS FILE AT
41
+ ALL TIMES.
42
+
43
+ *******************************************************************************/
44
+
45
+ #define OCL_CHECK(error, call) \
46
+ call; \
47
+ if (error != CL_SUCCESS) { \
48
+ printf("%s:%d Error calling " #call ", error code is: %d\n", __FILE__, __LINE__, error); \
49
+ exit(EXIT_FAILURE); \
50
+ }
51
+
52
+ #include "host_opencl.h"
53
+ #include <fstream>
54
+ #include <iostream>
55
+ #include <stdlib.h>
56
+ #include <ap_int.h>
57
+
58
+ static const int DATA_SIZE = 4096;
59
+
60
+ static const std::string error_message =
61
+ "Error: Result mismatch:\n"
62
+ "i = %d CPU result = %d Device result = %d\n";
63
+
64
+ int main(int argc, char* argv[]) {
65
+ // TARGET_DEVICE macro needs to be passed from gcc command line
66
+ if (argc < 2) {
67
+ std::cout << "Usage: " << argv[0] << " <xclbin>" << std::endl;
68
+ return EXIT_FAILURE;
69
+ }
70
+
71
+ std::string xclbinFilename = argv[1];
72
+
73
+ // Compute the size of array in bytes
74
+ size_t size_in_bytes = DATA_SIZE * sizeof(int);
75
+ int L = 64;
76
+ if (argc == 3) {
77
+ L = atoi(argv[2]);
78
+ }
79
+ const int D = 1024;
80
+ const int NUM_DUM_SLR = 4;
81
+ const int NUM_SLR = 4;
82
+ const int D_head = 64;
83
+ const int D_ffn = 4096;
84
+
85
+ // Creates a vector of DATA_SIZE elements with an initial value of 10 and 32
86
+ // using customized allocator for getting buffer alignment to 4k boundary
87
+
88
+ std::vector<cl::Device> devices;
89
+ cl_int err;
90
+ cl::Context context;
91
+ cl::CommandQueue q;
92
+ cl::Kernel krnl_vector_add;
93
+ cl::Program program;
94
+ std::vector<cl::Platform> platforms;
95
+ bool found_device = false;
96
+
97
+ // traversing all Platforms To find Xilinx Platform and targeted
98
+ // Device in Xilinx Platform
99
+ cl::Platform::get(&platforms);
100
+ for (size_t i = 0; (i < platforms.size()) & (found_device == false); i++) {
101
+ cl::Platform platform = platforms[i];
102
+ std::string platformName = platform.getInfo<CL_PLATFORM_NAME>();
103
+ if (platformName == "Xilinx") {
104
+ devices.clear();
105
+ platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices);
106
+ if (devices.size()) {
107
+ found_device = true;
108
+ break;
109
+ }
110
+ }
111
+ }
112
+ if (found_device == false) {
113
+ std::cout << "Error: Unable to find Target Device " << std::endl;
114
+ return EXIT_FAILURE;
115
+ }
116
+
117
+ std::cout << "INFO: Reading " << xclbinFilename << std::endl;
118
+ FILE* fp;
119
+ if ((fp = fopen(xclbinFilename.c_str(), "r")) == nullptr) {
120
+ printf("ERROR: %s xclbin not available please build\n", xclbinFilename.c_str());
121
+ exit(EXIT_FAILURE);
122
+ }
123
+ // Load xclbin
124
+ std::cout << "Loading: '" << xclbinFilename << "'\n";
125
+ std::ifstream bin_file(xclbinFilename, std::ifstream::binary);
126
+ bin_file.seekg(0, bin_file.end);
127
+ unsigned nb = bin_file.tellg();
128
+ bin_file.seekg(0, bin_file.beg);
129
+ char* buf = new char[nb];
130
+ bin_file.read(buf, nb);
131
+
132
+ // Creating Program from Binary File
133
+ cl::Program::Binaries bins;
134
+ bins.push_back({buf, nb});
135
+ bool valid_device = false;
136
+ for (unsigned int i = 0; i < devices.size(); i++) {
137
+ auto device = devices[i];
138
+ // Creating Context and Command Queue for selected Device
139
+ OCL_CHECK(err, context = cl::Context(device, nullptr, nullptr, nullptr, &err));
140
+ OCL_CHECK(err, q = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err));
141
+ std::cout << "Trying to program device[" << i << "]: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
142
+ cl::Program program(context, {device}, bins, nullptr, &err);
143
+ if (err != CL_SUCCESS) {
144
+ std::cout << "Failed to program device[" << i << "] with xclbin file!\n";
145
+ } else {
146
+ std::cout << "Device[" << i << "]: program successful!\n";
147
+ OCL_CHECK(err, krnl_vector_add = cl::Kernel(program, "opt_kernel", &err));
148
+ valid_device = true;
149
+ break; // we break because we found a valid device
150
+ }
151
+ }
152
+ if (!valid_device) {
153
+ std::cout << "Failed to program any device found, exit!\n";
154
+ exit(EXIT_FAILURE);
155
+ }
156
+
157
+ // These commands will allocate memory on the Device. The cl::Buffer objects can
158
+ // be used to reference the memory locations on the device.
159
+ OCL_CHECK(err, cl::Buffer buffer_X_acc0(context, CL_MEM_READ_ONLY, (size_t)(L*D), NULL, &err));
160
+ OCL_CHECK(err, cl::Buffer buffer_X_acc1(context, CL_MEM_READ_ONLY, (size_t)(L*D), NULL, &err));
161
+ OCL_CHECK(err, cl::Buffer buffer_W_acc0(context, CL_MEM_READ_ONLY, (size_t)(D * D_head * NUM_DUM_SLR * 8 + D * D_ffn), NULL, &err));
162
+ OCL_CHECK(err, cl::Buffer buffer_W_acc1(context, CL_MEM_READ_ONLY, (size_t)(D * D_head * NUM_DUM_SLR * 8 + D * D_ffn), NULL, &err));
163
+ OCL_CHECK(err, cl::Buffer buffer_acc0_out(context, CL_MEM_WRITE_ONLY, (size_t)(NUM_SLR * L * D * 8), NULL, &err));
164
+ // OCL_CHECK(err, cl::Buffer buffer_acc1_out(context, CL_MEM_WRITE_ONLY, (size_t)(NUM_SLR * L * D), NULL, &err));
165
+ OCL_CHECK(err, cl::Buffer buffer_cycle(context, CL_MEM_WRITE_ONLY, sizeof(int), NULL, &err));
166
+
167
+ std::cout << "Finish creating buffer\n";
168
+
169
+ // set the kernel Arguments
170
+ int narg = 0;
171
+ OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, L*D));
172
+ OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, L*D/16));
173
+ OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, L));
174
+ OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_X_acc0));
175
+ OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_X_acc1));
176
+ OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_W_acc0));
177
+ OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_W_acc1));
178
+ OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_acc0_out));
179
+ // OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_acc1_out));
180
+ OCL_CHECK(err, err = krnl_vector_add.setArg(narg++, buffer_cycle));
181
+
182
+ std::cout << "Finish setArgs\n";
183
+
184
+ // We then need to map our OpenCL buffers to get the pointers
185
+ ap_int<8>* X_acc0;
186
+ ap_int<8>* X_acc1;
187
+ ap_int<8>* W_acc0;
188
+ ap_int<8>* W_acc1;
189
+ ap_uint<128>* acc0_out;
190
+ // ap_uint<64>* acc1_out;
191
+ int* cycle;
192
+ OCL_CHECK(err,
193
+ X_acc0 = (ap_int<8>*)q.enqueueMapBuffer(buffer_X_acc0, CL_TRUE, CL_MAP_WRITE, 0, L*D, NULL, NULL, &err));
194
+ OCL_CHECK(err,
195
+ X_acc1 = (ap_int<8>*)q.enqueueMapBuffer(buffer_X_acc1, CL_TRUE, CL_MAP_WRITE, 0, L*D, NULL, NULL, &err));
196
+ OCL_CHECK(err,
197
+ W_acc0 = (ap_int<8>*)q.enqueueMapBuffer(buffer_W_acc0, CL_TRUE, CL_MAP_WRITE, 0, D * D_head * NUM_DUM_SLR * 8 + D * D_ffn, NULL, NULL, &err));
198
+ OCL_CHECK(err,
199
+ W_acc1 = (ap_int<8>*)q.enqueueMapBuffer(buffer_W_acc1, CL_TRUE, CL_MAP_WRITE, 0, D * D_head * NUM_DUM_SLR * 8 + D * D_ffn, NULL, NULL, &err));
200
+ OCL_CHECK(err, acc0_out = (ap_uint<128>*)q.enqueueMapBuffer(buffer_acc0_out, CL_TRUE, CL_MAP_READ, 0, NUM_SLR * L * D * 2, NULL,
201
+ NULL, &err));
202
+ // OCL_CHECK(err, acc1_out = (ap_uint<64>*)q.enqueueMapBuffer(buffer_acc1_out, CL_TRUE, CL_MAP_READ, 0, NUM_SLR * L * D, NULL,
203
+ // NULL, &err));
204
+ OCL_CHECK(err, cycle = (int*)q.enqueueMapBuffer(buffer_cycle, CL_TRUE, CL_MAP_READ, 0, sizeof(int), NULL,
205
+ NULL, &err));
206
+
207
+ // Initialize the vectors used in the test
208
+ for(int i = 0; i < L * D; i++){
209
+ X_acc0[i] = 1;
210
+ X_acc1[i] = 1;
211
+ }
212
+
213
+ for(int i = 0; i < D * D_head * NUM_DUM_SLR * 8 + D * D_ffn; i++){
214
+ W_acc1[i] = 1;
215
+ }
216
+
217
+ for(int i = 0; i < D * D_head * NUM_DUM_SLR * 8 + D * D_ffn; i++){
218
+ W_acc0[i] = 1;
219
+ }
220
+
221
+ std::cout << "Finish assigning values\n";
222
+
223
+ cl::Event event;
224
+ uint64_t nstimestart, nstimeend;
225
+ uint64_t exe_time = 0;
226
+
227
+ // Data will be migrated to kernel space
228
+ OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_X_acc0, buffer_X_acc1, buffer_W_acc0, buffer_W_acc1}, 0 /* 0 means from host*/));
229
+
230
+ std::cout << "Start kernel\n";
231
+
232
+ // Launch the Kernel
233
+ OCL_CHECK(err, err = q.enqueueTask(krnl_vector_add, nullptr, &event));
234
+
235
+ std::cout << "Finish kernel\n";
236
+
237
+ // The result of the previous kernel execution will need to be retrieved in
238
+ // order to view the results. This call will transfer the data from FPGA to
239
+ // source_results vector
240
+ OCL_CHECK(err, q.enqueueMigrateMemObjects({buffer_acc0_out, buffer_cycle}, CL_MIGRATE_MEM_OBJECT_HOST));
241
+
242
+ std::cout << "Receive data\n";
243
+
244
+ OCL_CHECK(err, q.finish());
245
+ OCL_CHECK(err, err = event.getProfilingInfo<uint64_t>(CL_PROFILING_COMMAND_START, &nstimestart));
246
+ OCL_CHECK(err, err = event.getProfilingInfo<uint64_t>(CL_PROFILING_COMMAND_END, &nstimeend));
247
+ exe_time += nstimeend - nstimestart;
248
+
249
+ // Verify the result
250
+ int match = 0;
251
+ // for (int i = 0; i < DATA_SIZE; i++) {
252
+ // int host_result = ptr_a[i] + ptr_b[i];
253
+ // if (ptr_result[i] != host_result) {
254
+ // printf(error_message.c_str(), i, host_result, ptr_result[i]);
255
+ // match = 1;
256
+ // break;
257
+ // }
258
+ // }
259
+ std::cout << "Cycle count: " << cycle[0] << std::endl;
260
+ std::cout << "Latency: " << exe_time << " ns" << std::endl;
261
+
262
+ OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_X_acc0, X_acc0));
263
+ OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_X_acc1, X_acc1));
264
+ OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_W_acc0, W_acc0));
265
+ OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_W_acc1, W_acc1));
266
+ OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_acc0_out, acc0_out));
267
+ // OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_acc1_out, acc1_out));
268
+ OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_cycle, cycle));
269
+ OCL_CHECK(err, err = q.finish());
270
+
271
+ std::cout << "TEST " << (match ? "FAILED" : "PASSED") << std::endl;
272
+ return (match ? EXIT_FAILURE : EXIT_SUCCESS);
273
+ }
gpt-2-medium/host_opencl.h ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*******************************************************************************
2
+ Vendor: Xilinx
3
+ Associated Filename: vadd.h
4
+ Purpose: VITIS vector addition
5
+ Revision History: January 28, 2016
6
+
7
+ *******************************************************************************
8
+ Copyright (C) 2019 XILINX, Inc.
9
+
10
+ This file contains confidential and proprietary information of Xilinx, Inc. and
11
+ is protected under U.S. and international copyright and other intellectual
12
+ property laws.
13
+
14
+ DISCLAIMER
15
+ This disclaimer is not a license and does not grant any rights to the materials
16
+ distributed herewith. Except as otherwise provided in a valid license issued to
17
+ you by Xilinx, and to the maximum extent permitted by applicable law:
18
+ (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND WITH ALL FAULTS, AND XILINX
19
+ HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY,
20
+ INCLUDING BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT, OR
21
+ FITNESS FOR ANY PARTICULAR PURPOSE; and (2) Xilinx shall not be liable (whether
22
+ in contract or tort, including negligence, or under any other theory of
23
+ liability) for any loss or damage of any kind or nature related to, arising under
24
+ or in connection with these materials, including for any direct, or any indirect,
25
+ special, incidental, or consequential loss or damage (including loss of data,
26
+ profits, goodwill, or any type of loss or damage suffered as a result of any
27
+ action brought by a third party) even if such damage or loss was reasonably
28
+ foreseeable or Xilinx had been advised of the possibility of the same.
29
+
30
+ CRITICAL APPLICATIONS
31
+ Xilinx products are not designed or intended to be fail-safe, or for use in any
32
+ application requiring fail-safe performance, such as life-support or safety
33
+ devices or systems, Class III medical devices, nuclear facilities, applications
34
+ related to the deployment of airbags, or any other applications that could lead
35
+ to death, personal injury, or severe property or environmental damage
36
+ (individually and collectively, "Critical Applications"). Customer assumes the
37
+ sole risk and liability of any use of Xilinx products in Critical Applications,
38
+ subject only to applicable laws and regulations governing limitations on product
39
+ liability.
40
+
41
+ THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS PART OF THIS FILE AT
42
+ ALL TIMES.
43
+
44
+ *******************************************************************************/
45
+
46
+ #pragma once
47
+
48
+ #define CL_HPP_CL_1_2_DEFAULT_BUILD
49
+ #define CL_HPP_TARGET_OPENCL_VERSION 120
50
+ #define CL_HPP_MINIMUM_OPENCL_VERSION 120
51
+ #define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1
52
+
53
+ #include <CL/cl2.hpp>
54
+
55
// Minimal allocator returning 4 KiB-aligned storage, as required for
// zero-copy DMA buffers on Xilinx platforms.
template <typename T>
struct aligned_allocator
{
    using value_type = T;

    // Obtain raw storage for `num` objects on a 4096-byte boundary;
    // throws std::bad_alloc when posix_memalign reports failure.
    T* allocate(std::size_t num)
    {
        void* mem = nullptr;
        if (posix_memalign(&mem, 4096, num * sizeof(T)) != 0)
            throw std::bad_alloc();
        return reinterpret_cast<T*>(mem);
    }

    // Return storage previously handed out by allocate().
    void deallocate(T* p, std::size_t num)
    {
        free(p);
    }
};
gpt-2-medium/kernel-ultrascale.cpp ADDED
@@ -0,0 +1,2091 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <cmath>
2
+ #include <string>
3
+ #include <tapa.h>
4
+ #include <ap_int.h>
5
+ #include <hls_math.h>
6
+
7
// ===== Compile-time geometry of the accelerator ============================
// Base model dimensions (GPT-2 medium: hidden size 1024, 16 heads).
constexpr int D = 1024;                    // hidden / embedding dimension
constexpr int D_div_2 = D / 2;
constexpr int D_div_4 = D / 4;
constexpr int D_ffn = 3072;                // FFN inner dimension constant — NOTE(review): unused in derived sizes below; confirm vs D_ffn_SLR * NUM_DUM_SLR
constexpr int N_head = 16;                 // number of attention heads
constexpr int MAX_SEQ_LEN = 1024;          // maximum sequence length supported on-chip
constexpr int MAX_SEQ_LEN_div_2 = MAX_SEQ_LEN / 2;
constexpr int MAX_SEQ_LEN_div_8 = MAX_SEQ_LEN / 8;
// Physical layout: presumably 3 usable SLRs plus one "dummy" slot — TODO confirm.
constexpr int NUM_SLR = 3;
constexpr int NUM_DUM_SLR = 4;
constexpr int TOTAL_PORT = NUM_SLR * 2;    // two memory ports per SLR
// Per-head dimension (1024 / 16 = 64) and its pre-divided variants, used as
// loop bounds in the compute engines.
constexpr int D_head = D / N_head;
constexpr int D_head_div_32 = D_head / 32;
constexpr int D_head_div_16 = D_head / 16;
constexpr int D_head_div_8 = D_head / 8;
constexpr int D_head_div_4 = D_head / 4;
constexpr int D_head_div_2 = D_head / 2;
constexpr int D_div_8 = D / 8;
constexpr int D_div_16 = D / 16;
// FFN slice handled per SLR and derived weight-blob sizes (element counts;
// the streams below pack these as 4-bit weights, 128 per 512-bit word).
constexpr int D_ffn_SLR = 1376;
constexpr int D_ffn_SLR_div_8 = D_ffn_SLR / 8;
constexpr int D_ffn_SLR_div_2 = D_ffn_SLR / 2;
constexpr int FFN_WEIGHT_SIZE = D * D_ffn_SLR * NUM_DUM_SLR;
constexpr int OUT_WEIGHT_SIZE = D * D_head * NUM_DUM_SLR * 5;
constexpr int WEIGHT_D = D * 2;
constexpr int QKV_WEIGHT_SIZE = D * D_head * NUM_DUM_SLR * 15 / 2; // multi-head attention
constexpr int TOTAL_WEIGHT_SIZE = OUT_WEIGHT_SIZE + QKV_WEIGHT_SIZE + FFN_WEIGHT_SIZE;
constexpr int CONTEXT_D = D_head_div_8 * 5;   // context width streamed in stage 3
constexpr int D_head_mul_4 = D_head * 4;
constexpr int D_write_zero_acc0 = D / 32;     // zero-padding beat counts (see write_zero)
constexpr int D_write_zero_acc1 = D / 32 + D / 16;

// SIMD vector aliases used on kernel stream interfaces.
using int_v16 = tapa::vec_t<int, 16>;
using int4_v128 = tapa::vec_t<ap_int<4>, 128>;   // 128 x 4-bit weights = 512 bits
using int8_v64 = tapa::vec_t<ap_int<8>, 64>;     // 64 x 8-bit activations = 512 bits
42
+
43
// Endless stream drain ("black hole"): non-blockingly pops and discards one
// element per cycle forever so an unused producer FIFO can never back up and
// stall the dataflow region.  Inlined into the typed black_hole_* tasks below.
template <typename data_t>
inline void bh(tapa::istream<data_t> & q) {
#pragma HLS inline
    for (;;) {
#pragma HLS pipeline II=1 style=stp
        // try_read is non-blocking; the value is intentionally dropped.
        data_t tmp; q.try_read(tmp);
    }
}
51
+
52
// One instruction word of the 17-step schedule produced by read_inst.
// A header instruction with stage == 7 carries the sequence length L in
// weight_bound; regular instructions carry loop bounds for one stage.
struct ConfigInst {
    ap_uint<3> stage; // stage 7 -> read L; otherwise 0..4 (see read_inst)
    ap_uint<11> weight_bound; // rows of weight to load (or L for the header)
    ap_uint<7> i_bound;  // outer (row-tile) loop bound
    ap_uint<8> j_bound;  // middle (column-tile) loop bound
    ap_uint<8> k_bound;  // reduction-dimension loop bound
};
59
+
60
// TAPA task: permanently drain an unused int stream (see bh above).
void black_hole_int(tapa::istream<int> & fifo_in) {
    bh(fifo_in);
}
63
+
64
// TAPA task: permanently drain an unused ConfigInst stream.
void black_hole_inst(tapa::istream<ConfigInst> & fifo_in) {
    bh(fifo_in);
}
67
+
68
// TAPA task: permanently drain an unused int_v16 stream.
void black_hole_int_v16(tapa::istream<int_v16> & fifo_in) {
    bh(fifo_in);
}
71
+
72
// TAPA task: permanently drain an unused activation (int8_v64) stream.
void black_hole_x(tapa::istream<int8_v64> & fifo_in) {
    bh(fifo_in);
}
75
+
76
// TAPA task: permanently drain an unused weight (int4_v128) stream.
void black_hole_w(tapa::istream<int4_v128> & fifo_in) {
    bh(fifo_in);
}
79
+
80
// TAPA task: permanently drain an unused 512-bit stream.
void black_hole_ap_uint_512(tapa::istream<ap_uint<512>> & fifo_in) {
    bh(fifo_in);
}
83
+
84
// TAPA task: permanently drain an unused 1024-bit stream.
void black_hole_ap_uint_1024(tapa::istream<ap_uint<1024>> & fifo_in) {
    bh(fifo_in);
}
87
+
88
// Stream the whole packed weight blob from device memory into an on-chip
// FIFO, one 512-bit word per beat.  TOTAL_WEIGHT_SIZE counts weight elements;
// >> 7 converts to 512-bit words (128 packed 4-bit weights per word).
// Address issue and data collection share one II=1 loop so requests overlap
// responses; bitwise & (not &&) keeps the condition a single-cycle gate.
void read_W(
    tapa::async_mmap<ap_uint<512>>& vec,
    tapa::ostream<ap_uint<512>>& fifo_out
){
    for(int i_req = 0, i_resp = 0; i_resp < (TOTAL_WEIGHT_SIZE >> 7);){
#pragma HLS pipeline II=1 style=stp
        // Issue the next read address whenever the request queue has room.
        if((i_req < (TOTAL_WEIGHT_SIZE >> 7)) & !vec.read_addr.full()){
            vec.read_addr.write(i_req);
            i_req++;
        }
        // Drain any data beat that has arrived (non-blocking, so address
        // issue never stalls behind the consumer).
        ap_uint<512> tmp_o;
        bool success = vec.read_data.try_read(tmp_o);
        if(success){
            fifo_out.write(tmp_o);
            i_resp++;
        }
    }
}
106
+
107
// Stream N input-activation elements from device memory into an on-chip
// FIFO, one 512-bit word (N >> 6 words total) per beat.  Same overlapped
// request/response pattern as read_W.
// NOTE(review): N >> 6 implies 64 elements per 512-bit word, i.e. 8-bit
// activations — confirm the caller passes N in elements, not bytes.
void read_X(
    const int N,
    tapa::async_mmap<ap_uint<512>>& vec,
    tapa::ostream<ap_uint<512>>& fifo_out
){
    for(int i_req = 0, i_resp = 0; i_resp < (N >> 6);){
#pragma HLS pipeline II=1 style=stp
        // Issue read addresses as long as the request queue accepts them.
        if((i_req < (N >> 6)) & !vec.read_addr.full()){
            vec.read_addr.write(i_req);
            i_req++;
        }
        // Forward each arriving data beat downstream.
        ap_uint<512> tmp_o;
        bool success = vec.read_data.try_read(tmp_o);
        if(success){
            fifo_out.write(tmp_o);
            i_resp++;
        }
    }
}
126
+
127
// Instruction generator for the two accelerator chains (acc0 / acc1).
//
// Emits one header instruction (stage == 7, weight_bound == L) followed by a
// fixed 17-step schedule:
//   stage_i 0..14 : stages 0,1,2 repeated five times
//                   (0 = Q projection, 1 = K projection fed from the peer
//                    chain, 2 = QK^T score computation — presumably one
//                    repetition per head group; TODO confirm mapping)
//   stage_i 15    : stage 3  (attention-output projection + residual path)
//   stage_i 16    : stage 4  (FFN layers)
// Each instruction carries the weight-load row count and the i/j/k loop
// bounds consumed by temporal_acc0* / temporal_acc1* for that stage.
void read_inst(
    const int L,
    tapa::ostream<ConfigInst>& fifo_out_acc0,
    tapa::ostream<ConfigInst>& fifo_out_acc1
){
    // Header: broadcast the sequence length L (packed into weight_bound).
    ConfigInst len;
    len.stage = 7;
    len.weight_bound = L;

    fifo_out_acc0.write(len);
    fifo_out_acc1.write(len);

    for(int stage_i = 0; stage_i < 17; stage_i++){
#pragma HLS pipeline II=1 style=stp

        ConfigInst inst_acc0;
        ConfigInst inst_acc1;
        // 0,1,2 cycling for the first 15 steps, then 3 and 4.
        const int stage = (stage_i < 15) ? (stage_i % 3) : (stage_i - 12);

        inst_acc0.stage = ap_uint<3>(stage);
        inst_acc1.stage = ap_uint<3>(stage);
        if(stage == 0){
            // Q projection: both chains run identical bounds.
            inst_acc0.weight_bound = D_head_div_4;
            inst_acc0.i_bound = (L >> 4);        // L/16 row tiles
            inst_acc0.j_bound = D_head_div_16;
            inst_acc0.k_bound = D_div_8;         // reduce over D in chunks of 8

            inst_acc1 = inst_acc0;
        } else if (stage == 1){
            // K projection (halved j range; the peer chain supplies the rest).
            inst_acc0.weight_bound = D_head_div_8;
            inst_acc0.i_bound = (L >> 4);
            inst_acc0.j_bound = D_head_div_32;
            inst_acc0.k_bound = D_div_8;

            inst_acc1 = inst_acc0;
        } else if (stage == 2){
            // Attention scores: no weight load (weight_bound = 0).
            // acc0 tiles L x L; acc1 reduces over L instead.
            inst_acc0.weight_bound = 0;
            inst_acc0.i_bound = (L >> 4);
            inst_acc0.j_bound = (L >> 4);
            inst_acc0.k_bound = D_head_div_8;

            inst_acc1.weight_bound = 0;
            inst_acc1.i_bound = (L >> 4);
            inst_acc1.j_bound = D_head_div_16;
            inst_acc1.k_bound = (L >> 3);
        } else if (stage == 3){
            // Output projection over the streamed context (width CONTEXT_D).
            inst_acc0.weight_bound = (CONTEXT_D << 1);
            inst_acc0.i_bound = (L >> 5);
            inst_acc0.j_bound = D_div_16;
            inst_acc0.k_bound = CONTEXT_D;

            inst_acc1 = inst_acc0;
        } else {
            // Stage 4 — FFN: acc0 computes the D -> D_ffn_SLR layer,
            // acc1 the D_ffn_SLR -> D layer.
            inst_acc0.weight_bound = (D_ffn_SLR >> 2);
            inst_acc0.i_bound = (L >> 4);
            inst_acc0.j_bound = (D_ffn_SLR >> 4);
            inst_acc0.k_bound = D_div_8;

            inst_acc1.weight_bound = D_div_4;
            inst_acc1.i_bound = (L >> 4);
            inst_acc1.j_bound = D_div_16;
            inst_acc1.k_bound = D_ffn_SLR_div_8;
        }
        fifo_out_acc0.write(inst_acc0);
        fifo_out_acc1.write(inst_acc1);
    }
}
194
+
195
// One-shot broadcast: forwards the sequence length L from the accelerator
// chain to the two SFU consumers (softmax path and GELU path), then exits.
// NOTE(review): despite the "switch" name, no per-packet routing happens here.
void packet_switch_acc(
    tapa::istream<int>& fifo_inst_in,
    tapa::ostream<int>& fifo_sfu_out,
    tapa::ostream<int>& fifo_sfu_gelu
) {
    const int L = fifo_inst_in.read();
    fifo_sfu_out.write(L);
    fifo_sfu_gelu.write(L);
}
204
+
205
// Write N 128-bit words from the result FIFO to device memory through an
// async_mmap, then signal completion on fifo_fin.
// Address, data, and response channels are serviced in one II=1 loop.
void write_mtx(
    const int N,
    tapa::async_mmap<ap_uint<128>>& output_mtx,
    tapa::istream<ap_uint<128>>& fifo_in,
    tapa::ostream<bool>& fifo_fin
){

    for(int i_req = 0, i_resp = 0; i_resp < N;){
#pragma HLS pipeline II=1 style=stp
        // Issue address + data together only when input is available and
        // both output queues have room (bitwise & keeps it one gate).
        if((i_req < N) & !fifo_in.empty() & !output_mtx.write_addr.full() & !output_mtx.write_data.full()){
            output_mtx.write_addr.try_write(i_req);
            ap_uint<128> tmp; fifo_in.try_read(tmp);
            output_mtx.write_data.try_write(tmp);
            ++i_req;
        }
        // Each write response acknowledges a burst of (resp + 1) writes.
        bool success = false;
        auto resp = output_mtx.write_resp.read(success);
        if(success){
            i_resp += unsigned(resp)+1;
        }
    }
    // All N words acknowledged — tell the host-side watcher we are done.
    fifo_fin.write(true);
}
228
+
229
// Feed L * D zero-valued 512-bit words into fifo_zero (used to seed the
// reduction chain with an all-zero partial-sum stream).
// NOTE(review): the parameter named D shadows the file-scope constexpr D;
// callers pass a beat count here (see D_write_zero_acc0/_acc1), not the
// hidden dimension — consider renaming to avoid confusion.
void write_zero(
    const int L,
    const int D,
    tapa::ostream<ap_uint<512>>& fifo_zero
){
    for(int i = 0; i < L * D;){
        // Non-blocking write: only count a beat once it is accepted.
        if(!fifo_zero.full()){
            ap_uint<512> tmp = 0;
            fifo_zero.try_write(tmp);
            i++;
        }
    }
}
242
+
243
// acc slr0 master node
//
// Head-of-chain compute engine for accumulator group 0 on SLR0.  Executes
// the 17-step schedule from read_inst (header carries L):
//   stage 0 : Q = Wq * X            -> stored to scratchpad_q (8-bit)
//   stage 1 : K = Wk * X (half)     -> merged with acc1's half into scratchpad_k
//   stage 2 : scores = K^T * Q      -> fifo_O_out (causal lower triangle only)
//   stage 3 : Wo * context + reduce-chain partial + residual X -> fifo_res_send
//   stage 4 : FFN layer 1 (X * W1)  -> fifo_ffn_out
// As master it also loads X from memory once (stage_i == 0), forwards
// instructions, weights, and activations to the next engine in the chain.
// Arithmetic packs two 4-bit (or 8-bit) weights into one multiplier via
// w_pack = (op2 << 19) + op1 — presumably the DSP int8/int4 packing trick;
// the two products are split back apart in the `reduction` loop.
void temporal_acc0_slr0(
    tapa::istream<ConfigInst>& fifo_inst_in,   // schedule from read_inst
    tapa::ostream<ConfigInst>& fifo_inst_out,  // forwarded to next acc0 engine
    tapa::ostream<int>& fifo_len_sfu,          // L for the SFU
    tapa::istream<ap_uint<512>>& fifo_X_in,    // activations from read_X
    tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
    tapa::istream<ap_uint<512>>& fifo_W_in,    // weights from read_W
    tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
    tapa::istream<ap_uint<256>>& fifo_from_acc1, // K halves from acc1 (stage 1)
    tapa::ostream<ap_uint<512>>& fifo_O_out,   // raw attention scores -> SFU
    tapa::ostream<ap_uint<512>>& fifo_ffn_out, // FFN1 partial sums -> GELU
    tapa::istream<ap_uint<1024>>& fifo_context,// attention context (stage 3)
    tapa::istream<ap_uint<1024>>& fifo_ffn_in, // GELU output (stage 4)
    tapa::istream<ap_uint<512>>& fifo_reduce_recv, // reduce-chain partials
    tapa::ostream<ap_uint<512>>& fifo_res_send // final sums incl. residual
    // tapa::ostream<ap_uint<64>>& fifo_write,
    // tapa::ostream<bool>& fifo_fin
){

    // Per-head Q and K scratchpads, 8-bit packed, partitioned to feed the
    // 16x16 tile engine at II=1.
    ap_uint<64> scratchpad_q[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
#pragma HLS array_partition variable=scratchpad_q cyclic dim=1 factor=16
#pragma HLS array_partition variable=scratchpad_q cyclic dim=2 factor=2
#pragma HLS bind_storage variable=scratchpad_q type=ram_2p impl=bram

    ap_uint<64> scratchpad_k[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
#pragma HLS array_partition variable=scratchpad_k cyclic dim=1 factor=16
#pragma HLS array_partition variable=scratchpad_k cyclic dim=2 factor=2
#pragma HLS bind_storage variable=scratchpad_k type=ram_2p impl=bram

    // Full input activation matrix, kept on-chip for all 17 stages
    // (also supplies the residual in stage 3).
    ap_uint<64> X[MAX_SEQ_LEN][D_div_8]; // 8 bit
#pragma HLS array_partition variable=X cyclic dim=1 factor=16
#pragma HLS array_partition variable=X cyclic dim=2 factor=2
#pragma HLS bind_storage variable=X type=ram_2p impl=uram

    // Header instruction: stage 7 carries L in weight_bound.
    ConfigInst len = fifo_inst_in.read();
    const int L = len.weight_bound;
    fifo_inst_out.write(len);
    fifo_len_sfu.write(L);

    for(int stage_i = 0; stage_i < 17; stage_i++){

        //TODO: stage send from inst

        // stage 0: WqX
        // stage 1: WkX0 <- acc1
        // stage 2: QK^T

        // Per-stage weight tile, double-pumped (two 64-bit halves per row).
        ap_uint<64> W[D_ffn_SLR_div_2][D_div_8]; // TODO: reduce dimension
#pragma HLS array_partition variable=W cyclic dim=1 factor=8
#pragma HLS bind_storage variable=W type=ram_2p impl=uram

        ConfigInst inst = fifo_inst_in.read();
        fifo_inst_out.write(inst);

        const ap_uint<3> stage = inst.stage;

        // load weights and forward: keep the low 128 bits for this engine,
        // shift and pass the rest down the chain.
        if(stage != 2) { // TODO: 1d array & uniform access
            const int weight_bound = inst.weight_bound;
            for(int i = 0; i < weight_bound; i++){
                load_weight:
                for(int j = 0; j < D_div_8;){
                    if(!fifo_W_in.empty()){
                        ap_uint<512> val; fifo_W_in.try_read(val);

                        for(int k = 0; k < 2; k++){
#pragma HLS unroll
                            W[i*2+k][j] = ap_uint<64>(val(k*64+63, k*64));
                        }
                        val = ap_uint<512>(val >> 128);
                        fifo_W_out.write(val);
                        j++;
                    }
                }
            }
        }

        // stage 1: compute Q
        const int i_bound = inst.i_bound;
        const int j_bound = inst.j_bound;
        const int k_bound = inst.k_bound;

        for(int i = 0; i < i_bound; i++){ // make sure L is multiple of 16
            // One-time load of X, overlapped with the very first stage.
            if(stage_i == 0){
                for(int ii = 0; ii < 2; ii++){ // load only 1 time
                    load_x:
                    for(int jj = 0; jj < D_div_8;){
                        if(!fifo_X_in.empty()){
                            ap_uint<512> val; fifo_X_in.try_read(val);

                            for(int k = 0; k < 8; k++){
#pragma HLS unroll
                                X[i*16+ii*8+k][jj] = ap_uint<64>(val(k*64+63, k*64));
                            }
                            jj++;
                        }
                    }
                }
            }

            // Causal attention: in stage 2 only the lower triangle (j <= i)
            // of score tiles is computed; bitwise &/| keep the guard cheap.
            for(int j = 0; (j < j_bound) & ((stage != 2) | (j <= i)); j++){
#pragma HLS loop_flatten off

                // 38-bit accumulators hold a pair of packed partial products.
                ap_int<38> acc_vec[8][16][8];
#pragma HLS array_partition variable=acc_vec dim=1 complete
#pragma HLS array_partition variable=acc_vec dim=2 complete
#pragma HLS array_partition variable=acc_vec dim=3 complete

                for(int ii = 0; ii < 8; ii++){
#pragma HLS unroll
                    for(int kk = 0; kk < 16; kk++){
#pragma HLS unroll
                        for(int k = 0; k < 8; k++){
#pragma HLS unroll
                            acc_vec[ii][kk][k] = 0;
                        }
                    }
                }

                compute:
                for(int k = 0; k < k_bound; k++){ // reduction dim
#pragma HLS pipeline II=1 style=stp

                    ap_uint<64> op1_mtx[16];
                    ap_uint<64> op2_mtx[16];
#pragma HLS array_partition variable=op1_mtx complete
#pragma HLS array_partition variable=op2_mtx complete

                    ap_uint<1024> recv_pkt;

                    // Stage 3/4 activations arrive over dedicated streams.
                    if(stage == 3) {
                        recv_pkt = fifo_context.read();
                    }else if(stage == 4) {
                        recv_pkt = fifo_ffn_in.read();
                    }

                    // Operand select per stage: weights vs. scratchpads.
                    for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                        if(stage > 2){
                            op1_mtx[ii] = ap_uint<64>(W[j*8+ii/2][k]((ii%2)*32+31, (ii%2)*32));
                            op2_mtx[ii] = recv_pkt(ii*64+63, ii*64);
                        } else if(stage == 2) {
                            op1_mtx[ii] = scratchpad_k[j*16+ii][k];
                            op2_mtx[ii] = scratchpad_q[i*16+ii][k];
                        } else {
                            op1_mtx[ii] = ap_uint<64>(W[j*8+ii/2][k]((ii%2)*32+31, (ii%2)*32));
                            op2_mtx[ii] = X[i*16+ii][k];
                        }
                    }

                    // Forward activations down the chain so the other SLR
                    // engines consume the same operands.
                    if(stage < 2){
                        ap_uint<1024> send_pkt = ap_uint<1024>((
                            op2_mtx[0], op2_mtx[1], op2_mtx[2], op2_mtx[3], op2_mtx[4], op2_mtx[5], op2_mtx[6], op2_mtx[7],
                            op2_mtx[8], op2_mtx[9], op2_mtx[10], op2_mtx[11], op2_mtx[12], op2_mtx[13], op2_mtx[14], op2_mtx[15]
                        ));
                        fifo_X_out.write(send_pkt);
                    } else if (stage == 4) {
                        fifo_X_out.write(recv_pkt);
                    }

                    // Packed MAC: two weights share one multiply; products
                    // are separated later in `reduction`.
                    for(int ii = 0; ii < 8; ii++){
#pragma HLS unroll
                        for(int kk = 0; kk < 16; kk++){
#pragma HLS unroll
                            for(int l = 0; l < 8; l++){
#pragma HLS unroll
                                ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
                                op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
                                if(stage == 2){
                                    op1 = ap_int<8>(op1_mtx[l*2](ii*8+7, ii*8));
                                    op2 = ap_int<8>(op1_mtx[l*2+1](ii*8+7, ii*8));
                                } else {
                                    op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
                                    op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
                                }
                                ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
                                acc_vec[ii][kk][l] += w_pack * op3;
                            }
                        }
                    }
                }

                ap_int<22> acc_final[16][16];
#pragma HLS array_partition variable=acc_final dim=1 complete
#pragma HLS array_partition variable=acc_final dim=2 complete

                for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                    for(int k = 0; k < 16; k++){
#pragma HLS unroll
                        acc_final[ii][k] = 0;
                    }
                }

                // Unpack the paired products: low 19 bits -> even column,
                // high 19 bits (carry-corrected by the low sign bit) -> odd.
                reduction:
                for(int kk = 0; kk < 8; kk++){
                    for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                        for(int k = 0; k < 8; k++){
#pragma HLS unroll
                            ap_int<19> res0; ap_int<19> res1;
                            (res1, res0) = acc_vec[kk][ii][k];
                            res1 = res1 + res0[18];
                            acc_final[ii][k*2] += res0;
                            acc_final[ii][k*2+1] += res1;
                        }
                    }
                }

                if(stage == 0){
                    // Requantize Q to 8 bits (>> 8) into the Q scratchpad.
                    for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                        for(int k = 0; k < 16; k++){
#pragma HLS unroll
                            int offset = k%8;
                            scratchpad_q[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k] >> 8);
                        }
                    }
                } else if (stage == 1){
                    // Merge our K half with acc1's half (interleaved columns).
                    for(int ii = 0; ii < 4; ii++){
                        for(int jj = 0; jj < 2; jj++){
#pragma HLS pipeline II=1 style=stp
                            ap_uint<256> tmp = fifo_from_acc1.read();

                            for(int l = 0; l < 4; l++){
#pragma HLS unroll
                                ap_uint<64> tmp_pack;
                                for(int k = 0; k < 8; k++){
#pragma HLS unroll
                                    tmp_pack(k*8+7, k*8) = ap_int<8>(acc_final[ii*4+l][jj*8+k] >> 8);
                                }
                                scratchpad_k[i*16+ii*4+l][j*4+jj*2] = tmp_pack;
                            }
                            for(int l = 0; l < 4; l++){
#pragma HLS unroll
                                scratchpad_k[i*16+ii*4+l][j*4+jj*2+1] = tmp(l*64+63, l*64);
                            }
                        }
                    }
                } else if(stage == 2 || stage == 4){
                    // Emit score / FFN1 tiles; stage 2 applies the causal
                    // mask by injecting a large negative constant.
                    for(int kk = 0; kk < 16; kk++){
#pragma HLS pipeline II=1 style=stp
                        ap_uint<512> tmp;
                        for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                            if(stage == 2 && (i*16+ii < j*16+kk)){
                                tmp(ii*32+31, ii*32) = ap_int<32>(-1e8); // masking (inefficient)
                            } else {
                                tmp(ii*32+31, ii*32) = tapa::bit_cast<ap_uint<32>>(acc_final[ii][kk]);
                            }
                        }
                        if(stage == 2) fifo_O_out.write(tmp);
                        else fifo_ffn_out.write(tmp);
                    }
                } else {
                    // Stage 3: add the reduce-chain partial and the residual
                    // (8-bit X) before sending the final sums onward.
                    final_acc:
                    for(int ii = 0; ii < 16; ii++){
#pragma HLS pipeline II=1 style=stp
#pragma HLS dependence variable=X type=inter false
                        ap_uint<512> tmp_recv = fifo_reduce_recv.read();
                        ap_uint<512> tmp_send;
                        for(int k = 0; k < 16; k++){
#pragma HLS unroll
                            ap_int<32> tmp = acc_final[ii][k] + ap_int<32>(tmp_recv(k*32+31, k*32));
                            tmp += ap_int<8>(X[i*16+ii][j*2+k/8]((k%8)*8+7, (k%8)*8));
                            tmp_send(k*32+31, k*32) = tmp;
                        }
                        fifo_res_send.write(tmp_send);
                    }
                }
            }
        }
    }
    // fifo_fin.write(true);

    // write:
    // for(int i = 0; i < L; i++){
    //     for(int j = 0; j < D_div_8; j++){
    //         #pragma HLS pipeline II=1 style=stp
    //         fifo_write.write(X[i][j]);
    //     }
    // }
}
527
+
528
// Mid-chain worker for accumulator group 0 (non-master copy of
// temporal_acc0_slr0).  Differences from the master:
//   * activations arrive pre-packed (1024-bit) over fifo_X_in from the
//     upstream engine and are forwarded unchanged to fifo_X_out — there is
//     no on-chip X matrix and no residual add;
//   * stage 2 operand roles are mirrored (op1 = Q, op2 = K);
//   * stage 3 participates in the adder-tree reduction: it accumulates the
//     incoming 24-bit partials into acc_final and forwards the running sums
//     via fifo_reduce_send.
//     NOTE(review): acc_final is ap_int<22> while 24-bit slices are added
//     and stored back — confirm the extra bits can never be set here.
void temporal_acc0(
    tapa::istream<ConfigInst>& fifo_inst_in,   // schedule from upstream engine
    tapa::ostream<ConfigInst>& fifo_inst_out,  // forwarded down the chain
    tapa::ostream<int>& fifo_len_sfu,          // L for the SFU
    tapa::istream<ap_uint<1024>>& fifo_X_in,   // activations from upstream
    tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
    tapa::istream<ap_uint<512>>& fifo_W_in,    // weights from upstream
    tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
    tapa::istream<ap_uint<256>>& fifo_from_acc1, // K halves from paired acc1
    tapa::ostream<ap_uint<512>>& fifo_O_out,   // attention scores -> SFU
    tapa::istream<ap_uint<1024>>& fifo_context,// context (stage 3 input)
    tapa::ostream<ap_uint<512>>& fifo_ffn_out, // FFN1 partial sums
    tapa::istream<ap_uint<512>>& fifo_reduce_recv, // reduce-chain input
    tapa::ostream<ap_uint<512>>& fifo_reduce_send  // reduce-chain output
){

    ap_uint<64> scratchpad_q[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
#pragma HLS array_partition variable=scratchpad_q cyclic dim=1 factor=16
#pragma HLS array_partition variable=scratchpad_q cyclic dim=2 factor=2
#pragma HLS bind_storage variable=scratchpad_q type=ram_2p impl=bram

    ap_uint<64> scratchpad_k[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
#pragma HLS array_partition variable=scratchpad_k cyclic dim=1 factor=16
#pragma HLS array_partition variable=scratchpad_k cyclic dim=2 factor=2
#pragma HLS bind_storage variable=scratchpad_k type=ram_2p impl=bram

    // Header instruction: stage 7 carries L in weight_bound.
    ConfigInst len = fifo_inst_in.read();
    const int L = len.weight_bound;
    fifo_inst_out.write(len);
    fifo_len_sfu.write(L);

    for(int stage_i = 0; stage_i < 17; stage_i++){
#pragma HLS loop_flatten off

        // stage 0: WqX
        // stage 1: WkX0 <- acc1
        // stage 2: QK^T
        // stage 3: WoO

        ap_uint<64> W[D_ffn_SLR_div_2][D_div_8]; // 4 bit
#pragma HLS array_partition variable=W cyclic dim=1 factor=8
#pragma HLS bind_storage variable=W type=ram_2p impl=uram

        ConfigInst inst = fifo_inst_in.read();
        fifo_inst_out.write(inst);

        const ap_uint<3> stage = inst.stage;

        // load weights and forward (keep low 128 bits, pass the rest on).
        if(stage != 2) {
            const int weight_bound = inst.weight_bound;
            for(int i = 0; i < weight_bound; i++){
                load_weight:
                for(int j = 0; j < D_div_8;){
                    if(!fifo_W_in.empty()){
                        ap_uint<512> val; fifo_W_in.try_read(val);

                        for(int k = 0; k < 2; k++){
#pragma HLS unroll
                            W[i*2+k][j] = ap_uint<64>(val(k*64+63, k*64));
                        }
                        val = ap_uint<512>(val >> 128);
                        fifo_W_out.write(val);
                        j++;
                    }
                }
            }
        }

        const int i_bound = inst.i_bound;
        const int j_bound = inst.j_bound;
        const int k_bound = inst.k_bound;

        // stage 1: compute Q
        for(int i = 0; i < i_bound; i++){ // make sure L is multiple of 64

            // Causal guard identical to the master engine.
            for(int j = 0; (j < j_bound) & ((stage != 2) | (j <= i)); j++){
#pragma HLS loop_flatten off

                ap_int<38> acc_vec[8][16][8];
#pragma HLS array_partition variable=acc_vec dim=1 complete
#pragma HLS array_partition variable=acc_vec dim=2 complete
#pragma HLS array_partition variable=acc_vec dim=3 complete

                for(int ii = 0; ii < 8; ii++){
#pragma HLS unroll
                    for(int kk = 0; kk < 16; kk++){
#pragma HLS unroll
                        for(int k = 0; k < 8; k++){
#pragma HLS unroll
                            acc_vec[ii][kk][k] = 0;
                        }
                    }
                }

                compute:
                for(int k = 0; k < k_bound; k++){ // reduction dim
#pragma HLS pipeline II=1 style=stp

                    ap_uint<64> op1_mtx[16];
                    ap_uint<64> op2_mtx[16];
#pragma HLS array_partition variable=op1_mtx complete
#pragma HLS array_partition variable=op2_mtx complete

                    // Non-stage-2 activations come from the chain and are
                    // relayed downstream immediately.
                    ap_uint<1024> recv_pkt;
                    if(stage == 3){
                        recv_pkt = fifo_context.read();
                    } else if(stage != 2) {
                        recv_pkt = fifo_X_in.read();
                        fifo_X_out.write(recv_pkt);
                    }

                    for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                        if(stage == 2) {
                            op1_mtx[ii] = scratchpad_q[i*16+ii][k];
                            op2_mtx[ii] = scratchpad_k[j*16+ii][k];
                        } else {
                            op1_mtx[ii] = ap_uint<64>(W[j*8+ii/2][k]((ii%2)*32+31, (ii%2)*32));
                            op2_mtx[ii] = ap_uint<64>(recv_pkt(ii*64+63, ii*64));
                        }
                    }

                    // Packed dual-weight MAC, same trick as the master engine.
                    for(int ii = 0; ii < 8; ii++){
#pragma HLS unroll
                        for(int kk = 0; kk < 16; kk++){
#pragma HLS unroll
                            for(int l = 0; l < 8; l++){
#pragma HLS unroll
                                ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
                                op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
                                if(stage == 2){
                                    op1 = ap_int<8>(op1_mtx[l*2](ii*8+7, ii*8));
                                    op2 = ap_int<8>(op1_mtx[l*2+1](ii*8+7, ii*8));
                                } else {
                                    op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
                                    op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
                                }
                                ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
                                acc_vec[ii][kk][l] += w_pack * op3;
                            }
                        }
                    }
                }

                ap_int<22> acc_final[16][16];
#pragma HLS array_partition variable=acc_final dim=1 complete
#pragma HLS array_partition variable=acc_final dim=2 complete

                for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                    for(int k = 0; k < 16; k++){
#pragma HLS unroll
                        acc_final[ii][k] = 0;
                    }
                }

                // Separate the packed products (low/high 19-bit halves).
                reduction:
                for(int kk = 0; kk < 8; kk++){
                    for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                        for(int k = 0; k < 8; k++){
#pragma HLS unroll
                            ap_int<19> res0; ap_int<19> res1;
                            (res1, res0) = acc_vec[kk][ii][k];
                            res1 = res1 + res0[18];
                            acc_final[ii][k*2] += res0;
                            acc_final[ii][k*2+1] += res1;
                        }
                    }
                }

                if(stage == 0){
                    // Requantize Q to 8 bits into the Q scratchpad.
                    for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                        for(int k = 0; k < 16; k++){
#pragma HLS unroll
                            int offset = k%8;
                            scratchpad_q[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k] >> 8);
                        }
                    }
                } else if (stage == 1){
                    // Interleave our K half with the paired acc1's half.
                    for(int ii = 0; ii < 4; ii++){
                        for(int jj = 0; jj < 2; jj++){
#pragma HLS pipeline II=1 style=stp
                            ap_uint<256> tmp = fifo_from_acc1.read();

                            for(int l = 0; l < 4; l++){
#pragma HLS unroll
                                ap_uint<64> tmp_pack;
                                for(int k = 0; k < 8; k++){
#pragma HLS unroll
                                    tmp_pack(k*8+7, k*8) = ap_int<8>(acc_final[ii*4+l][jj*8+k] >> 8);
                                }
                                scratchpad_k[i*16+ii*4+l][j*4+jj*2] = tmp_pack;
                            }
                            for(int l = 0; l < 4; l++){
#pragma HLS unroll
                                scratchpad_k[i*16+ii*4+l][j*4+jj*2+1] = tmp(l*64+63, l*64);
                            }
                        }
                    }
                } else if(stage == 2 || stage == 4){
                    // Emit score / FFN tiles; stage 2 applies the causal mask.
                    for(int kk = 0; kk < 16; kk++){
#pragma HLS pipeline II=1 style=stp
                        ap_uint<512> tmp;
                        for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                            if(stage == 2 && (i*16+ii < j*16+kk)){
                                tmp(ii*32+31, ii*32) = ap_int<32>(-1e8); // masking (inefficient)
                            } else {
                                tmp(ii*32+31, ii*32) = tapa::bit_cast<ap_uint<32>>(acc_final[ii][kk]);
                            }
                        }
                        if(stage == 2) fifo_O_out.write(tmp);
                        else fifo_ffn_out.write(tmp);
                    }
                } else {
                    // Stage 3: fold the incoming 24-bit partials into our
                    // sums and pass the running totals down the reduce chain.
                    final_acc:
                    for(int ii = 0; ii < 16; ii++){
#pragma HLS pipeline II=1 style=stp
                        ap_uint<512> tmp_recv = fifo_reduce_recv.read();
                        ap_uint<512> tmp;
                        for(int k = 0; k < 16; k++){
#pragma HLS unroll
                            acc_final[ii][k] += ap_int<24>(tmp_recv(k*32+23, k*32));
                            tmp(k*32+23, k*32) = acc_final[ii][k];
                        }
                        fifo_reduce_send.write(tmp);
                    }
                }
            }
        }
    }
}
763
+
764
+ // acc slr0 master node
765
+ void temporal_acc1_slr0(
766
+ tapa::istream<ConfigInst>& fifo_inst_in,
767
+ tapa::ostream<ConfigInst>& fifo_inst_out,
768
+ tapa::ostream<int>& fifo_len_context,
769
+ tapa::istream<ap_uint<512>>& fifo_X_in,
770
+ tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
771
+ tapa::istream<ap_uint<512>>& fifo_W_in,
772
+ tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
773
+ tapa::ostream<ap_uint<256>>& fifo_to_acc0,
774
+ tapa::istream<ap_uint<128>>& fifo_from_sfu,
775
+ tapa::ostream<ap_uint<1024>>& fifo_O_out,
776
+ tapa::istream<ap_uint<1024>>& fifo_context,
777
+ tapa::istream<ap_uint<512>>& fifo_reduce_recv,
778
+ tapa::ostream<ap_uint<512>>& fifo_res_send,
779
+ tapa::istream<ap_uint<1024>>& fifo_gelu_in,
780
+ tapa::ostream<ap_uint<512>>& fifo_ffn_out
781
+ // tapa::ostream<ap_uint<64>>& fifo_write,
782
+ // tapa::ostream<bool>& fifo_fin
783
+ ){
784
+ ap_uint<64> X[MAX_SEQ_LEN][D_div_8]; // 8 bit
785
+ #pragma HLS array_partition variable=X cyclic dim=1 factor=16
786
+ #pragma HLS array_partition variable=X cyclic dim=2 factor=2
787
+ #pragma HLS bind_storage variable=X type=ram_2p impl=uram
788
+
789
+ ap_uint<64> scratchpad[MAX_SEQ_LEN_div_8][D_head]; // 8 bit
790
+ #pragma HLS array_partition variable=scratchpad cyclic dim=1 factor=2
791
+ #pragma HLS array_partition variable=scratchpad cyclic dim=2 factor=16
792
+ #pragma HLS bind_storage variable=scratchpad type=ram_2p impl=bram
793
+
794
+ // ap_uint<64> scratchpad_out[MAX_SEQ_LEN][D_head_div_8];
795
+ // #pragma HLS array_partition variable=scratchpad_out cyclic dim=1 factor=16
796
+ // #pragma HLS array_partition variable=scratchpad_out cyclic dim=2 factor=2
797
+
798
+ ConfigInst len = fifo_inst_in.read();
799
+ const int L = len.weight_bound;
800
+ fifo_inst_out.write(len);
801
+ fifo_len_context.write(L);
802
+
803
+ for(int stage_i = 0; stage_i < 17; stage_i++){
804
+
805
+ // stage 0: WvX
806
+ // stage 1: WkX1 -> acc0
807
+ // stage 2: Softmax(QK)V <- acc0
808
+ // stage 3: WoO
809
+
810
+ ap_uint<64> W[D_div_2][D_ffn_SLR_div_8]; // 4 bit
811
+ #pragma HLS array_partition variable=W cyclic dim=1 factor=8
812
+ #pragma HLS bind_storage variable=W type=ram_2p impl=uram
813
+
814
+
815
+ ConfigInst inst = fifo_inst_in.read();
816
+ fifo_inst_out.write(inst);
817
+
818
+ const ap_uint<3> stage = inst.stage;
819
+
820
+ // load weights and forward
821
+ if(stage != 2) {
822
+ const int weight_bound = inst.weight_bound;
823
+ int sub_bound = D_div_8;
824
+ if (stage == 4) sub_bound = D_ffn_SLR_div_8;
825
+ for(int i = 0; i < weight_bound; i++){
826
+ load_weight:
827
+ for(int j = 0; j < sub_bound;){
828
+ if(!fifo_W_in.empty()){
829
+ ap_uint<512> val; fifo_W_in.try_read(val);
830
+
831
+ for(int k = 0; k < 2; k++){
832
+ #pragma HLS unroll
833
+ W[i*2+k][j] = ap_uint<64>(val(k*64+63, k*64));
834
+ }
835
+ val = ap_uint<512>(val >> 128);
836
+ fifo_W_out.write(val);
837
+ j++;
838
+ }
839
+ }
840
+ }
841
+ }
842
+
843
+ const int i_bound = inst.i_bound;
844
+ const int j_bound = inst.j_bound;
845
+
846
+ for(int i = 0; i < i_bound; i++){ // make sure L is multiple of 4
847
+
848
+ const int k_bound = (stage == 2) ? ap_uint<8>((i+1)*2) : inst.k_bound;
849
+
850
+ ap_uint<32> cache_attn[MAX_SEQ_LEN_div_8][16];
851
+ #pragma HLS array_partition variable=cache_attn dim=2 complete
852
+ #pragma HLS array_partition variable=cache_attn dim=1 cyclic factor=2
853
+
854
+ if(stage_i == 0){
855
+ for(int ii = 0; ii < 2; ii++){ // load only 1 time
856
+ load_x:
857
+ for(int jj = 0; jj < D_div_8;){
858
+ if(!fifo_X_in.empty()){
859
+ ap_uint<512> val; fifo_X_in.try_read(val);
860
+
861
+ for(int k = 0; k < 8; k++){
862
+ #pragma HLS unroll
863
+ X[i*16+ii*8+k][jj] = ap_uint<64>(val(k*64+63, k*64));
864
+ }
865
+ jj++;
866
+ }
867
+ }
868
+ }
869
+ } else if (stage == 2) {
870
+ for(int ii = 0; ii < ((i+1)*2); ii++){
871
+ ap_uint<32> fuse_reg[16];
872
+ load_attn:
873
+ for(int offset = 0; offset < 8;){
874
+ #pragma HLS pipeline II=1 style=stp
875
+ if(!fifo_from_sfu.empty()){
876
+ ap_uint<128> val; fifo_from_sfu.try_read(val);
877
+ for(int k = 0; k < 16; k++){
878
+ #pragma HLS unroll
879
+ fuse_reg[k](offset*4+3, offset*4) = ap_int<8>(val(k*8+3, k*8));
880
+ }
881
+ offset++;
882
+ }
883
+ }
884
+ for(int k = 0; k < 16; k++){
885
+ #pragma HLS unroll
886
+ cache_attn[ii][k] = fuse_reg[k];
887
+ }
888
+ }
889
+ }
890
+
891
+ for(int j = 0; j < j_bound; j++){
892
+ #pragma HLS loop_flatten off
893
+
894
+ ap_int<38> acc_vec[8][16][8];
895
+ #pragma HLS array_partition variable=acc_vec dim=1 complete
896
+ #pragma HLS array_partition variable=acc_vec dim=2 complete
897
+ #pragma HLS array_partition variable=acc_vec dim=3 complete
898
+
899
+ for(int ii = 0; ii < 8; ii++){
900
+ #pragma HLS unroll
901
+ for(int kk = 0; kk < 16; kk++){
902
+ #pragma HLS unroll
903
+ for(int k = 0; k < 8; k++){
904
+ #pragma HLS unroll
905
+ acc_vec[ii][kk][k] = 0;
906
+ }
907
+ }
908
+ }
909
+
910
+ compute:
911
+ for(int k = 0; k < k_bound; k++){
912
+ #pragma HLS pipeline II=1 style=stp
913
+
914
+ ap_uint<64> op1_mtx[16];
915
+ ap_uint<64> op2_mtx[16];
916
+ #pragma HLS array_partition variable=op1_mtx complete
917
+ #pragma HLS array_partition variable=op2_mtx complete
918
+
919
+ ap_uint<1024> recv_pkt;
920
+
921
+ if(stage == 3) {
922
+ recv_pkt = fifo_context.read();
923
+ } else if(stage == 4) {
924
+ recv_pkt = fifo_gelu_in.read();
925
+ }
926
+
927
+ for(int ii = 0; ii < 16; ii++){
928
+ #pragma HLS unroll
929
+ if(stage == 3){
930
+ op1_mtx[ii] = ap_uint<64>(W[j*8+ii/2][k]((ii%2)*32+31, (ii%2)*32));
931
+ op2_mtx[ii] = recv_pkt(ii*64+63, ii*64);
932
+ } else if(stage != 2) {
933
+ op1_mtx[ii] = ap_uint<64>(W[j*8+ii/2][k]((ii%2)*32+31, (ii%2)*32));
934
+ op2_mtx[ii] = X[i*16+ii][k];
935
+ } else {
936
+ op1_mtx[ii] = ap_uint<64>(cache_attn[k][ii]);
937
+ op2_mtx[ii] = scratchpad[k][j*16+ii];
938
+ }
939
+ }
940
+
941
+ if(stage < 2){
942
+ ap_uint<1024> send_pkt = ap_uint<1024>((
943
+ op2_mtx[0], op2_mtx[1], op2_mtx[2], op2_mtx[3], op2_mtx[4], op2_mtx[5], op2_mtx[6], op2_mtx[7],
944
+ op2_mtx[8], op2_mtx[9], op2_mtx[10], op2_mtx[11], op2_mtx[12], op2_mtx[13], op2_mtx[14], op2_mtx[15]
945
+ ));
946
+ fifo_X_out.write(send_pkt);
947
+ }
948
+
949
+ for(int ii = 0; ii < 8; ii++){
950
+ #pragma HLS unroll
951
+ for(int kk = 0; kk < 16; kk++){
952
+ #pragma HLS unroll
953
+ for(int l = 0; l < 8; l++){
954
+ #pragma HLS unroll
955
+ ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
956
+ op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
957
+ op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
958
+ op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
959
+ ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
960
+ acc_vec[ii][kk][l] += w_pack * op3;
961
+ }
962
+ }
963
+ }
964
+ }
965
+
966
+ ap_int<22> acc_final[16][16];
967
+ #pragma HLS array_partition variable=acc_final dim=1 complete
968
+ #pragma HLS array_partition variable=acc_final dim=2 complete
969
+
970
+ for(int ii = 0; ii < 16; ii++){
971
+ #pragma HLS unroll
972
+ for(int k = 0; k < 16; k++){
973
+ #pragma HLS unroll
974
+ acc_final[ii][k] = 0;
975
+ }
976
+ }
977
+
978
+ reduction:
979
+ for(int kk = 0; kk < 8; kk++){
980
+ for(int ii = 0; ii < 16; ii++){
981
+ #pragma HLS unroll
982
+ for(int k = 0; k < 8; k++){
983
+ #pragma HLS unroll
984
+ ap_int<19> res0; ap_int<19> res1;
985
+ (res1, res0) = acc_vec[kk][ii][k];
986
+ res1 = res1 + res0[18];
987
+ acc_final[ii][k*2] += res0;
988
+ acc_final[ii][k*2+1] += res1;
989
+ }
990
+ }
991
+ }
992
+
993
+ if(stage == 0){
994
+ for(int ii = 0; ii < 16; ii++){
995
+ #pragma HLS unroll
996
+ for(int k = 0; k < 16; k++){
997
+ #pragma HLS unroll
998
+ int offset = ii%8;
999
+ scratchpad[i*2+ii/8][j*16+k](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k] >> 8);
1000
+ }
1001
+ }
1002
+ } else if (stage == 2){
1003
+ for(int ii = 0; ii < 2; ii++){
1004
+ #pragma HLS pipeline II=1 style=stp
1005
+ ap_uint<1024> tmp;
1006
+ for(int jj = 0; jj < 8; jj++){
1007
+ #pragma HLS unroll
1008
+ for(int k = 0; k < 16; k++){
1009
+ #pragma HLS unroll
1010
+ tmp((jj*16+k)*8+7, (jj*16+k)*8) = ap_int<8>(acc_final[ii*8+jj][k] >> 13);
1011
+ }
1012
+ }
1013
+ fifo_O_out.write(tmp);
1014
+ }
1015
+ } else if (stage == 1) {
1016
+ for(int ii = 0; ii < 4; ii++){
1017
+ for(int jj = 0; jj < 2; jj++){
1018
+ ap_uint<256> tmp;
1019
+ for(int k = 0; k < 32; k++){
1020
+ #pragma HLS unroll
1021
+ tmp(k*8+7, k*8) = ap_int<8>(acc_final[ii*4+k/8][jj*8+k%8] >> 8);
1022
+ }
1023
+ fifo_to_acc0.write(tmp);
1024
+ }
1025
+ }
1026
+ } else {
1027
+ final_acc:
1028
+ for(int ii = 0; ii < 16; ii++){
1029
+ #pragma HLS pipeline II=1 style=stp
1030
+ #pragma HLS dependence variable=X type=inter false
1031
+ ap_uint<512> tmp_recv = fifo_reduce_recv.read();
1032
+ ap_uint<512> tmp_send;
1033
+ for(int k = 0; k < 16; k++){
1034
+ #pragma HLS unroll
1035
+ ap_int<32> tmp = acc_final[ii][k] + ap_int<24>(tmp_recv(k*32+23, k*32));
1036
+ if(stage == 3) tmp += ap_int<8>(X[i*16+ii][j*2+k/8]((k%8)*8+7, (k%8)*8));
1037
+ tmp_send(k*32+31, k*32) = tmp;
1038
+ }
1039
+ if(stage == 3) fifo_res_send.write(tmp_send);
1040
+ else fifo_ffn_out.write(tmp_send);
1041
+ }
1042
+ }
1043
+ }
1044
+ }
1045
+ }
1046
+ }
1047
+
1048
+ void residual(
1049
+ const int L,
1050
+ tapa::istream<ap_uint<512>>& fifo_res_in,
1051
+ tapa::ostream<ap_uint<512>>& fifo_res_out
1052
+ ){
1053
+ for(int i = 0; i < (L >> 5); i++){
1054
+ for(int j = 0; j < D_div_16; j++){
1055
+ ap_uint<32> res_buffer[16][16];
1056
+ #pragma HLS array_partition variable=res_buffer complete dim=1
1057
+ #pragma HLS array_partition variable=res_buffer complete dim=2
1058
+
1059
+ read:
1060
+ for(int k = 0; k < 16;){
1061
+ #pragma HLS pipeline II=1 style=stp
1062
+ ap_uint<512> tmp;
1063
+ bool success = fifo_res_in.try_read(tmp);
1064
+ if(success){
1065
+ for(int l = 0; l < 16; l++){
1066
+ #pragma HLS unroll
1067
+ res_buffer[k][l] = ap_uint<32>(tmp(l*32+31, l*32));
1068
+ }
1069
+ k++;
1070
+ }
1071
+ }
1072
+ transpose:
1073
+ for(int k = 0; k < 16; k++){
1074
+ #pragma HLS pipeline II=1 style=stp
1075
+ ap_uint<512> tmp;
1076
+ for(int l = 0; l < 16; l++){
1077
+ #pragma HLS unroll
1078
+ tmp(l*32+31, l*32) = ap_uint<32>(res_buffer[l][k]);
1079
+ }
1080
+ fifo_res_out.write(tmp);
1081
+ }
1082
+ }
1083
+ }
1084
+ }
1085
+
1086
+
1087
// One temporal accelerator lane (acc1 side) of a decoder layer.
// Executes 17 staged passes, each configured by a ConfigInst read from
// fifo_inst_in and forwarded downstream on fifo_inst_out. Stage meaning
// (0-3 per the original comment; 4 inferred from the fifo it reads —
// confirm against the instruction generator):
//   stage 0: Wv*X      -> value rows cached in `scratchpad`
//   stage 1: Wk*X      -> int8 partials forwarded to acc0 (fifo_to_acc0)
//   stage 2: Softmax(QK)*V, 4-bit attention weights from the SFU
//   stage 3: Wo*O, context rows arriving on fifo_context
//   stage 4: FFN matmul consuming GELU activations (fifo_gelu_in)
// Weights are 4-bit packed in URAM; activations are 8-bit packed.
void temporal_acc1(
    tapa::istream<ConfigInst>& fifo_inst_in,
    tapa::ostream<ConfigInst>& fifo_inst_out,
    tapa::ostream<int>& fifo_len_context,   // broadcasts sequence length L
    tapa::istream<ap_uint<1024>>& fifo_X_in,
    tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
    tapa::istream<ap_uint<512>>& fifo_W_in,
    tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
    tapa::ostream<ap_uint<256>>& fifo_to_acc0,
    tapa::istream<ap_uint<128>>& fifo_from_sfu,
    tapa::ostream<ap_uint<1024>>& fifo_O_out,
    tapa::istream<ap_uint<1024>>& fifo_context,
    tapa::istream<ap_uint<512>>& fifo_reduce_recv,
    tapa::ostream<ap_uint<512>>& fifo_reduce_send,
    tapa::istream<ap_uint<1024>>& fifo_gelu_in
){

    // V rows produced in stage 0 and re-read in stage 2 (8-bit elements
    // packed 8-per-word).
    ap_uint<64> scratchpad[MAX_SEQ_LEN_div_8][D_head]; // 8 bit
    #pragma HLS array_partition variable=scratchpad cyclic dim=1 factor=2
    #pragma HLS array_partition variable=scratchpad cyclic dim=2 factor=16
    #pragma HLS bind_storage variable=scratchpad type=ram_2p impl=bram

    // ap_uint<64> scratchpad_out[MAX_SEQ_LEN][D_head_div_8];
    // #pragma HLS array_partition variable=scratchpad_out cyclic dim=1 factor=16
    // #pragma HLS array_partition variable=scratchpad_out cyclic dim=2 factor=2

    // First ConfigInst carries the sequence length in weight_bound.
    ConfigInst len = fifo_inst_in.read();
    const int L = len.weight_bound;
    fifo_inst_out.write(len);
    fifo_len_context.write(L);

    for(int stage_i = 0; stage_i < 17; stage_i++){

        // stage 0: WvX
        // stage 1: WkX1 -> acc0
        // stage 2: Softmax(QK)V <- acc0
        // stage 3: WoO

        // Per-pass weight tile; two 4-bit weights per byte, stored in URAM.
        ap_uint<64> W[D_div_2][D_ffn_SLR_div_8]; // 4 bit
        #pragma HLS array_partition variable=W cyclic dim=1 factor=8
        #pragma HLS bind_storage variable=W type=ram_2p impl=uram

        ConfigInst inst = fifo_inst_in.read();
        fifo_inst_out.write(inst);

        const ap_uint<3> stage = inst.stage;

        // load weights and forward
        if(stage != 2) {
            const int weight_bound = inst.weight_bound;
            int sub_bound = D_div_8;
            // FFN stage uses the wider per-SLR hidden dimension.
            if (stage == 4) sub_bound = D_ffn_SLR_div_8;
            for(int i = 0; i < weight_bound; i++){
                load_weight:
                for(int j = 0; j < sub_bound;){
                    if(!fifo_W_in.empty()){
                        ap_uint<512> val; fifo_W_in.try_read(val);

                        // Keep the low 128 bits locally; shift and pass the
                        // rest to the next accelerator in the chain.
                        for(int k = 0; k < 2; k++){
                            #pragma HLS unroll
                            W[i*2+k][j] = ap_uint<64>(val(k*64+63, k*64));
                        }
                        val = ap_uint<512>(val >> 128);
                        fifo_W_out.write(val);
                        j++;
                    }
                }
            }
        }

        const int i_bound = inst.i_bound;
        const int j_bound = inst.j_bound;

        for(int i = 0; i < i_bound; i++){ // make sure L is multiple of 4

            // Stage 2 is causal attention: only (i+1)*2 key blocks are valid.
            const int k_bound = (stage == 2) ? ap_uint<8>((i+1)*2) : inst.k_bound;

            // Softmax weights for the current query rows, 4-bit packed.
            ap_uint<32> cache_attn[MAX_SEQ_LEN_div_8][16];
            #pragma HLS array_partition variable=cache_attn dim=2 complete
            #pragma HLS array_partition variable=cache_attn dim=1 cyclic factor=2

            if(stage == 2){
                for(int ii = 0; ii < ((i+1)*2); ii++){
                    ap_uint<32> fuse_reg[16];
                    load_attn:
                    for(int offset = 0; offset < 8;){
                        #pragma HLS pipeline II=1 style=stp
                        if(!fifo_from_sfu.empty()){
                            ap_uint<128> val; fifo_from_sfu.try_read(val);
                            // Keep only the low 4 bits of each 8-bit lane.
                            for(int k = 0; k < 16; k++){
                                #pragma HLS unroll
                                fuse_reg[k](offset*4+3, offset*4) = ap_int<8>(val(k*8+3, k*8));
                            }
                            offset++;
                        }
                    }
                    for(int k = 0; k < 16; k++){
                        #pragma HLS unroll
                        cache_attn[ii][k] = fuse_reg[k];
                    }
                }
            }

            for(int j = 0; j < j_bound; j++){
                #pragma HLS loop_flatten off

                // Each 38-bit accumulator packs TWO 19-bit partial products
                // (low: op1*op3, high: op2*op3) — see `w_pack` below.
                ap_int<38> acc_vec[8][16][8];
                #pragma HLS array_partition variable=acc_vec dim=1 complete
                #pragma HLS array_partition variable=acc_vec dim=2 complete
                #pragma HLS array_partition variable=acc_vec dim=3 complete

                for(int ii = 0; ii < 8; ii++){
                    #pragma HLS unroll
                    for(int kk = 0; kk < 16; kk++){
                        #pragma HLS unroll
                        for(int k = 0; k < 8; k++){
                            #pragma HLS unroll
                            acc_vec[ii][kk][k] = 0;
                        }
                    }
                }
                compute:
                for(int k = 0; k < k_bound; k++){
                    #pragma HLS pipeline II=1 style=stp

                    ap_uint<64> op1_mtx[16];
                    ap_uint<64> op2_mtx[16];
                    #pragma HLS array_partition variable=op1_mtx complete
                    #pragma HLS array_partition variable=op2_mtx complete

                    ap_uint<1024> recv_pkt;

                    // Select the activation source by stage; for plain
                    // matmul stages also forward X to the next lane.
                    if(stage == 3) {
                        recv_pkt = fifo_context.read();
                    }else if(stage == 4) {
                        recv_pkt = fifo_gelu_in.read();
                    }else if(stage != 2) {
                        recv_pkt = fifo_X_in.read();
                        fifo_X_out.write(recv_pkt);
                    }

                    for(int ii = 0; ii < 16; ii++){ //TODO: change logic
                        #pragma HLS unroll
                        if (stage != 2) {
                            op1_mtx[ii] = ap_uint<64>(W[j*8+ii/2][k]((ii%2)*32+31, (ii%2)*32));
                            op2_mtx[ii] = recv_pkt(ii*64+63, ii*64);
                        } else {
                            // Attention: weights come from the SFU cache and
                            // operands from the locally stored V rows.
                            op1_mtx[ii] = ap_uint<64>(cache_attn[k][ii]);
                            op2_mtx[ii] = scratchpad[k][j*16+ii];
                        }
                    }

                    for(int ii = 0; ii < 8; ii++){
                        #pragma HLS unroll
                        for(int kk = 0; kk < 16; kk++){
                            #pragma HLS unroll
                            for(int l = 0; l < 8; l++){
                                #pragma HLS unroll
                                ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
                                op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
                                op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
                                op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
                                // Pack two 4-bit weights into one 27-bit
                                // operand (op2 shifted up 19 bits) so one
                                // multiply yields both partial products.
                                ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
                                acc_vec[ii][kk][l] += w_pack * op3;
                            }
                        }
                    }
                }

                ap_int<22> acc_final[16][16];
                #pragma HLS array_partition variable=acc_final dim=1 complete
                #pragma HLS array_partition variable=acc_final dim=2 complete

                for(int ii = 0; ii < 16; ii++){
                    #pragma HLS unroll
                    for(int k = 0; k < 16; k++){
                        #pragma HLS unroll
                        acc_final[ii][k] = 0;
                    }
                }

                reduction:
                for(int kk = 0; kk < 8; kk++){
                    for(int ii = 0; ii < 16; ii++){
                        #pragma HLS unroll
                        for(int k = 0; k < 8; k++){
                            #pragma HLS unroll
                            // Unpack the two 19-bit halves of the packed
                            // accumulator; the high half absorbs the borrow
                            // from the low half's sign bit.
                            ap_int<19> res0; ap_int<19> res1;
                            (res1, res0) = acc_vec[kk][ii][k];
                            res1 = res1 + res0[18];
                            acc_final[ii][k*2] += res0;
                            acc_final[ii][k*2+1] += res1;
                        }
                    }
                }

                if(stage == 0){
                    // Requantize to int8 (>> 8) and store V rows locally.
                    for(int ii = 0; ii < 16; ii++){
                        #pragma HLS unroll
                        for(int k = 0; k < 16; k++){
                            #pragma HLS unroll
                            int offset = ii%8;
                            scratchpad[i*2+ii/8][j*16+k](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k] >> 8);
                        }
                    }
                } else if (stage == 2){
                    // Attention output: requantize (>> 13) and stream out.
                    for(int ii = 0; ii < 2; ii++){
                        #pragma HLS pipeline II=1 style=stp
                        ap_uint<1024> tmp;
                        for(int jj = 0; jj < 8; jj++){
                            #pragma HLS unroll
                            for(int k = 0; k < 16; k++){
                                #pragma HLS unroll
                                tmp((jj*16+k)*8+7, (jj*16+k)*8) = ap_int<8>(acc_final[ii*8+jj][k] >> 13);
                            }
                        }
                        fifo_O_out.write(tmp);
                    }
                } else if (stage == 1){
                    // K projection: requantize (>> 8) and ship to acc0.
                    for(int ii = 0; ii < 4; ii++){
                        for(int jj = 0; jj < 2; jj++){
                            ap_uint<256> tmp;
                            for(int k = 0; k < 32; k++){
                                #pragma HLS unroll
                                tmp(k*8+7, k*8) = ap_int<8>(acc_final[ii*4+k/8][jj*8+k%8] >> 8);
                            }
                            fifo_to_acc0.write(tmp);
                        }
                    }
                } else {
                    // Stages 3/4: cross-lane reduction. Add the partial sums
                    // from the previous lane (22-bit field per 32-bit slot)
                    // and forward the updated partials.
                    final_acc:
                    for(int ii = 0; ii < 16; ii++){
                        #pragma HLS pipeline II=1 style=stp
                        ap_uint<512> tmp_recv = fifo_reduce_recv.read();
                        ap_uint<512> tmp;
                        for(int k = 0; k < 16; k++){
                            #pragma HLS unroll
                            acc_final[ii][k] += ap_int<22>(tmp_recv(k*32+21, k*32));
                            // Only bits [21:0] of each slot are written; the
                            // receiver also reads just 22 bits.
                            tmp(k*32+21, k*32) = acc_final[ii][k];
                        }
                        fifo_reduce_send.write(tmp);
                    }
                }
            }
        }
    }

    // write out for debug
    // write:
    // for(int i = 0; i < L; i++){
    //     for(int j = 0; j < D_head_div_8; j++){
    //         #pragma HLS pipeline II=1 style=stp
    //         fifo_O_out.write(scratchpad_out[i][j]);
    //     }
    // }
}
1343
+
1344
// SFU side buffer: for each block of rows it accumulates a column-wise
// sum of the incoming float words, emits the 16-lane sum first, then
// replays the cached raw data. The "double buffering" in the original
// comment refers to the two sfu_buffer instances selected by l%2 at the
// producer (see sfu_acc_exp) — this function itself holds one buffer.
void sfu_buffer( // double buffering
    tapa::istream<int>& fifo_inst,    // first word: L; then one bound per block
    tapa::istream<ap_uint<512>>& fifo_data_in,
    tapa::ostream<ap_uint<512>>& fifo_data_out
){
    const int L = fifo_inst.read();
    for(int stage = 0; stage < 5; stage++){

        for(int l = 0; l < (L >> 5); l++){
            // 8 partial-sum banks x 16 lanes; banks break the float-add
            // dependence so the acc loop can pipeline at II=1.
            float sum[8][16];
            float cache[MAX_SEQ_LEN][16];
            #pragma HLS array_partition variable=cache dim=2 complete
            #pragma HLS array_partition variable=sum dim=2 complete

            // Number of 512-bit words in this block (sent by sfu_acc_exp).
            const int hidden_bound = fifo_inst.read();

            for(int i = 0; i < 8; i++){
                for(int j = 0; j < 16; j++){
                    #pragma HLS unroll
                    sum[i][j] = 0.0;
                }
            }

            acc:
            for(int i = 0; i < hidden_bound; i++){
                #pragma HLS pipeline II=1 style=stp
                // Real dependence distance is 8 (bank i%8 reused every 8
                // iterations); tell HLS so it ignores the false 1-cycle one.
                #pragma HLS dependence false variable=sum
                #pragma HLS dependence true variable=sum distance=8
                ap_uint<512> tmp = fifo_data_in.read();
                for(int k = 0; k < 16; k++){
                    #pragma HLS unroll
                    float res = tapa::bit_cast<float>(ap_int<32>(tmp(k*32+31, k*32)));
                    sum[i%8][k] += res;
                    cache[i][k] = res;
                }
            }

            // Fold the 8 banks into bank 0 (2 lanes per cycle).
            reduce:
            for(int i = 1; i < 8; i++){
                for(int j = 0; j < 8; j++){
                    #pragma HLS pipeline II=1 style=stp
                    #pragma HLS dependence true variable=sum distance=8
                    for(int k = 0; k < 2; k++){
                        sum[0][j*2+k] += sum[i][j*2+k];
                    }
                }
            }

            // Emit the per-lane totals first...
            ap_uint<512> tmp;
            for(int i = 0; i < 16; i++){
                #pragma HLS unroll
                tmp(i*32+31, i*32) = tapa::bit_cast<ap_uint<32>>(sum[0][i]);
            }
            fifo_data_out.write(tmp);

            // ...then replay the buffered raw values for normalization.
            write:
            for(int i = 0; i < hidden_bound; i++){
                #pragma HLS pipeline II=1 style=stp
                ap_uint<512> tmp;
                for(int j = 0; j < 16; j++){
                    #pragma HLS unroll
                    tmp(j*32+31, j*32) = tapa::bit_cast<ap_uint<32>>(cache[i][j]);
                }
                fifo_data_out.write(tmp);
            }

        }
    }

}
1414
+
1415
// SLR0 variant of sfu_buffer: handles 7 stages. Stages 0-4 take softmax
// data (variable bound from fifo_inst), stage 5 takes layernorm input,
// stage 6 takes FFN output; stages 5-6 use the full hidden size D and
// additionally maintain a second accumulator `var`, emitting both the
// sum word and the var word before the cached replay.
void sfu_buffer_slr0( // double buffering
    tapa::istream<int>& fifo_inst,
    tapa::istream<ap_uint<512>>& fifo_data_in_exp,
    tapa::istream<ap_uint<512>>& fifo_data_in_ln,
    tapa::istream<ap_uint<512>>& fifo_data_in_ffn,
    tapa::ostream<ap_uint<512>>& fifo_data_out
){
    const int L = fifo_inst.read();
    for(int stage = 0; stage < 7; stage++){

        int hidden_bound = D;

        for(int l = 0; l < (L >> 5); l++){
            float sum[8][16];
            float var[8][16];
            float cache[MAX_SEQ_LEN][16];
            #pragma HLS array_partition variable=cache dim=2 complete
            #pragma HLS array_partition variable=sum dim=2 complete
            #pragma HLS array_partition variable=var dim=2 complete

            if(stage < 5) hidden_bound = fifo_inst.read();

            for(int i = 0; i < 8; i++){
                for(int j = 0; j < 16; j++){
                    #pragma HLS unroll
                    sum[i][j] = 0.0;
                    var[i][j] = 0.0;
                }
            }

            acc:
            for(int i = 0; i < hidden_bound; i++){
                #pragma HLS pipeline II=1 style=stp
                #pragma HLS dependence false variable=sum
                #pragma HLS dependence true variable=sum distance=8

                // Input source depends on the stage.
                ap_uint<512> tmp;
                if(stage < 5) {
                    tmp = fifo_data_in_exp.read();
                } else if(stage == 5){
                    tmp = fifo_data_in_ln.read();
                } else {
                    tmp = fifo_data_in_ffn.read();
                }

                for(int k = 0; k < 16; k++){
                    #pragma HLS unroll
                    float res = tapa::bit_cast<float>(ap_int<32>(tmp(k*32+31, k*32)));
                    // NOTE(review): plain assignment here, unlike the `+=`
                    // in sfu_buffer — only the last 8 rows per bank survive
                    // into `reduce`. Confirm this is intentional.
                    sum[i%8][k] = res;
                    if(stage >= 4) var[i%8][k] = res;
                    cache[i][k] = res;
                }
            }

            reduce:
            for(int i = 1; i < 8; i++){
                for(int j = 0; j < 8; j++){
                    #pragma HLS pipeline II=1 style=stp
                    #pragma HLS dependence true variable=sum distance=8
                    #pragma HLS dependence true variable=var distance=8
                    for(int k = 0; k < 2; k++){
                        sum[0][j*2+k] += sum[i][j*2+k];
                        if(stage >= 5) var[0][j*2+k] += var[i][j*2+k];
                    }
                }
            }

            // Emit the reduced sum (and, for stages 5-6, the var word),
            // then replay the cached raw data.
            ap_uint<512> tmp;
            ap_uint<512> tmp_var;
            for(int i = 0; i < 16; i++){
                #pragma HLS unroll
                tmp(i*32+31, i*32) = tapa::bit_cast<ap_uint<32>>(sum[0][i]);
                if(stage >= 5) tmp_var(i*32+31, i*32) = tapa::bit_cast<ap_uint<32>>(var[0][i]);
            }
            fifo_data_out.write(tmp);
            if(stage >= 5) fifo_data_out.write(tmp_var);

            write:
            for(int i = 0; i < hidden_bound; i++){
                #pragma HLS pipeline II=1 style=stp
                ap_uint<512> tmp;
                for(int j = 0; j < 16; j++){
                    #pragma HLS unroll
                    tmp(j*32+31, j*32) = tapa::bit_cast<ap_uint<32>>(cache[i][j]);
                }
                fifo_data_out.write(tmp);
            }
        }
    }
}
1505
+
1506
+
1507
// Softmax front end: exponentiates incoming fixed-point scores and
// round-robins the results between the two sfu_buffer instances (l%2),
// also telling each buffer how many words its block contains.
void sfu_acc_exp(
    tapa::istream<int>& fifo_inst,          // first word: sequence length L
    tapa::istream<ap_uint<512>>& fifo_data_in,
    tapa::ostreams<ap_uint<512>, 2>& fifo_buf,
    tapa::ostreams<int, 2>& fifo_inst_out
) {
    const int L = fifo_inst.read();
    fifo_inst_out[0].write(L);
    fifo_inst_out[1].write(L);

    for(int stage = 0; stage < 5; stage++){

        for(int l = 0; l < (L >> 4); l++){
            // Causal: block l covers (l+1)*16 valid score words.
            fifo_inst_out[l%2].write(((l+1) << 4));
            exp_acc:
            for(int i = 0; i < ((l+1) << 4);){
                #pragma HLS pipeline II=1 style=stp
                if(!fifo_data_in.empty()){
                    ap_uint<512> tmp; fifo_data_in.try_read(tmp);
                    ap_uint<512> tmp_o;
                    for(int k = 0; k < 16; k++){
                        #pragma HLS unroll
                        int res = tapa::bit_cast<int>(ap_int<32>(tmp(k*32+31, k*32)));
                        float res_exp = 0.0;
                        // res >> 10: rescale the integer accumulator before
                        // exp (presumably a 2^-10 fixed-point scale — confirm
                        // against the score quantization).
                        res_exp = hls::exp(ap_int<32>(res >> 10));
                        tmp_o(k*32+31, k*32) = tapa::bit_cast<ap_uint<32>>(res_exp);
                    }
                    fifo_buf[l%2].write(tmp_o);
                    i++;
                }
            }
        }
    }
}
1541
+
1542
// Applies a coarse table-based GELU approximation to FFN activations and
// requantizes each lane to int8 (>> 8).
void sfu_gelu(
    tapa::istream<int>& fifo_inst,      // first word: sequence length L
    tapa::ostream<int>& fifo_inst_out,  // forwards L downstream
    tapa::istream<ap_uint<512>>& fifo_ffn,
    tapa::ostream<ap_uint<128>>& fifo_out
){
    const int L = fifo_inst.read();
    fifo_inst_out.write(L);

    for(int i = 0; i < (L >> 4); i++){
        for(int j = 0; j < D_ffn_SLR;){
            if(!fifo_ffn.empty()){
                ap_uint<512> tmp; fifo_ffn.try_read(tmp);
                ap_uint<128> tmp_out;
                for(int k = 0; k < 16; k++){
                    // table based approximation
                    // NOTE(review): `val` is the raw integer accumulator
                    // value cast to float, yet the table thresholds are
                    // sub-unit constants (-2..0) — verify the intended
                    // fixed-point scaling here.
                    float val = (float) tapa::bit_cast<int>(ap_int<32>(tmp(k*32+31, k*32)));
                    float outp_data = 0.0;
                    if (val < -2 || val == 0)
                        outp_data = 0;
                    else if(val < -1.5)
                        outp_data = -0.09754;
                    else if(val < -1)
                        outp_data = -0.15743;
                    else if(val < -0.5)
                        outp_data = -0.15383;
                    else if(val < 0)
                        outp_data = -0.10153;
                    else
                        outp_data = val;
                    // NOTE(review): (int)(outp_data) truncates the
                    // fractional table values to 0 before the >> 8 —
                    // the negative GELU lobe always quantizes to 0.
                    // Confirm whether this is the intended behavior.
                    tmp_out(k*8+7, k*8) = ap_int<8>((int) (outp_data) >> 8);
                }
                fifo_out.write(tmp_out);
                j++;
            }
        }
    }
}
1580
+
1581
+ void data_packing(
1582
+ tapa::istream<int>& fifo_inst,
1583
+ tapa::istream<ap_uint<128>>& fifo_in,
1584
+ tapa::ostream<ap_uint<1024>>& fifo_out
1585
+ ){
1586
+ const int L = fifo_inst.read();
1587
+
1588
+ for(int i = 0; i < (L >> 4); i++){
1589
+ ap_uint<1024> cache[D_ffn_SLR_div_8];
1590
+
1591
+ for(int j = 0; j < D_ffn_SLR_div_8; j++){
1592
+ ap_uint<64> fuse_reg[16];
1593
+ ap_uint<1024> send_pkt;
1594
+ #pragma HLS array_partition variable=fuse_reg complete
1595
+ for(int k = 0; k < 8;){
1596
+ #pragma HLS pipeline II=1
1597
+ if(!fifo_in.empty()){
1598
+ ap_uint<128> tmp; fifo_in.try_read(tmp);
1599
+ for(int l = 0; l < 16; l++){
1600
+ #pragma HLS unroll
1601
+ fuse_reg[l](k*8+7, k*8) = tmp(l*8+7, l*8);
1602
+ }
1603
+ k++;
1604
+ }
1605
+ }
1606
+ send_pkt = ap_uint<1024>((
1607
+ fuse_reg[0], fuse_reg[1], fuse_reg[2], fuse_reg[3], fuse_reg[4], fuse_reg[5], fuse_reg[6], fuse_reg[7],
1608
+ fuse_reg[8], fuse_reg[9], fuse_reg[10], fuse_reg[11], fuse_reg[12], fuse_reg[13], fuse_reg[14], fuse_reg[15]
1609
+ ));
1610
+ cache[j] = send_pkt;
1611
+ fifo_out.write(send_pkt);
1612
+ }
1613
+
1614
+ for(int iter = 0; iter < D_div_16*2 - 1; iter++){
1615
+ for(int j = 0; j < D_ffn_SLR_div_8; j++){
1616
+ #pragma HLS pipeline II=1
1617
+ fifo_out.write(cache[j]);
1618
+ }
1619
+ }
1620
+ }
1621
+ }
1622
+
1623
// Softmax back end: reads the per-lane exp-sum word from a buffer, forms
// the reciprocal scale 32/sum, then normalizes and quantizes the replayed
// exp values to int8. Alternates between the two buffers via l%2, mirroring
// sfu_acc_exp's distribution.
void sfu_norm(
    tapa::istream<int>& fifo_inst,            // first word: sequence length L
    tapa::istreams<ap_uint<512>, 2>& fifo_buf,
    tapa::ostream<ap_uint<128>>& fifo_data_out
){
    const int L = fifo_inst.read();
    for(int stage = 0; stage < 5; stage++){

        for(int l = 0; l < (L >> 4); l++){
            float sum[16];
            #pragma HLS array_partition variable=sum complete

            // First word of each block is the lane-wise exp-sum.
            ap_uint<512> tmp_in = fifo_buf[l%2].read();

            for(int i = 0; i < 16; i++){
                #pragma HLS unroll factor=8
                // 32/sum: normalization combined with the int8 output scale
                // (presumably 2^5 — confirm against the consumer's shift).
                sum[i] = 32.0 / tapa::bit_cast<float>(ap_uint<32>(tmp_in(i*32+31, i*32)));
            }

            // Normalize the (l+1)*16 replayed exp words of this block.
            for(int i = 0; i < ((l+1) << 4);){
                #pragma HLS pipeline II=1 style=stp
                if(!fifo_buf[l%2].empty()){
                    ap_uint<512> tmp_cache; fifo_buf[l%2].try_read(tmp_cache);
                    ap_uint<128> tmp;
                    for(int j = 0; j < 16; j++){
                        #pragma HLS unroll
                        ap_int<8> res = (int) (tapa::bit_cast<float>(ap_uint<32>(tmp_cache(j*32+31, j*32))) * sum[j]);
                        tmp(j*8 + 7, j*8) = res;
                    }
                    fifo_data_out.write(tmp);
                    i++;
                }
            }
        }
    }
}
1659
+
1660
// SLR0 variant of sfu_norm covering 7 stages. Stages 0-4 behave like the
// softmax path (per-block bound (l+1)*16); stages 5-6 cover the full
// hidden size D and route results to dedicated outputs (stage 5 ->
// fifo_data_off, stage 6 -> fifo_out). Arithmetic here is integer
// add-based rather than the float multiply in sfu_norm.
void sfu_norm_slr0(
    tapa::istream<int>& fifo_inst,            // first word: sequence length L
    tapa::istreams<ap_uint<512>, 2>& fifo_buf,
    tapa::ostream<ap_uint<128>>& fifo_data_out,
    tapa::ostream<ap_uint<128>>& fifo_data_off,
    tapa::ostream<ap_uint<128>>& fifo_out
){
    const int L = fifo_inst.read();

    for(int stage = 0; stage < 7; stage++){

        for(int l = 0; l < (L >> 4); l++){
            int sum[16];
            int var[16];
            #pragma HLS array_partition variable=sum complete
            #pragma HLS array_partition variable=var complete

            const int fifo_idx = l%2;
            const int hidden_bound = (stage < 5) ? ((l+1) << 4) : D;

            ap_uint<512> tmp_in = fifo_buf[fifo_idx].read();
            ap_uint<512> tmp_var;
            // NOTE(review): tmp_var is read below but never used — var[] is
            // loaded from tmp_in instead. Verify which word was intended.
            if(stage >= 5) tmp_var = fifo_buf[fifo_idx].read();

            if(stage >= 5){
                for(int i = 0; i < 16; i++){
                    #pragma HLS unroll
                    var[i] = ap_uint<32>(tmp_in(i*32+31, i*32));
                }
            } else {
                for(int i = 0; i < 16; i++){
                    #pragma HLS unroll
                    sum[i] = ap_uint<32>(tmp_in(i*32+31, i*32)) * 2;
                }
            }

            for(int i = 0; i < hidden_bound;){
                #pragma HLS pipeline II=1 style=stp
                if(!fifo_buf[fifo_idx].empty()){
                    ap_uint<512> tmp_cache; fifo_buf[fifo_idx].try_read(tmp_cache);
                    ap_uint<128> tmp;
                    for(int j = 0; j < 16; j++){
                        #pragma HLS unroll
                        ap_int<8> res;
                        int op1; int op2;
                        // Offset each replayed value by var (stages 5-6) or
                        // 2*sum (stages 0-4), truncating to int8.
                        if(stage >= 5){
                            op1 = ap_uint<32>(tmp_cache(j*32+31, j*32));
                            op2 = var[j];
                        } else {
                            op1 = ap_uint<32>(tmp_cache(j*32+31, j*32));
                            op2 = sum[j];
                        }
                        res = op1 + op2;
                        tmp(j*8 + 7, j*8) = res;
                    }
                    // Route by stage: 5 -> off-path, 6 -> final out,
                    // otherwise the regular softmax output.
                    if(stage == 5) {
                        fifo_data_off.write(tmp);
                    } else if(stage == 6){
                        fifo_out.write(tmp);
                    } else {
                        fifo_data_out.write(tmp);
                    }
                    i++;
                }
            }
        }
    }
}
1728
+
1729
// Collects the attention context produced across 5 stages (one D_head
// slice per stage) into a URAM buffer, then streams it back out to both
// temporal accelerators: rows i*32..i*32+15 to acc0 and rows
// i*32+16..i*32+31 to acc1, each column repeated D_div_16*2 times.
void context_buffer(
    tapa::istream<int>& fifo_inst,            // first word: sequence length L
    tapa::istream<ap_uint<1024>>& fifo_context,
    tapa::ostream<ap_uint<1024>>& fifo_to_acc0,
    tapa::ostream<ap_uint<1024>>& fifo_to_acc1
){
    ap_uint<64> context[MAX_SEQ_LEN][CONTEXT_D];
    #pragma HLS array_partition variable=context cyclic dim=1 factor=32
    #pragma HLS bind_storage variable=context type=ram_2p impl=uram

    const int L = fifo_inst.read();

    // Fill phase: each stage writes its own D_head_div_8-wide column band.
    for(int stage = 0; stage < 5; stage++){
        for(int i = 0; i < (L >> 4); i++){
            for(int j = stage * D_head_div_8; j < (stage + 1) * D_head_div_8;){
                if(!fifo_context.empty()){
                    ap_uint<1024> tmp; fifo_context.try_read(tmp);
                    for(int ii = 0; ii < 16; ii++){
                        #pragma HLS unroll
                        context[i*16+ii][j] = tmp(ii*64+63, ii*64);
                    }
                    j++;
                }
            }
        }
    }

    // NOTE: change it to write to HBM for debugging
    // write ops to acc0 and acc1 in parallel
    for(int i = 0; i < (L >> 5); i++){
        for(int l = 0; l < D_div_16; l++){
            for(int iter = 0; iter < 2; iter++){
                for(int j = 0; j < CONTEXT_D; j++){
                    ap_uint<1024> tmp_acc0;
                    ap_uint<1024> tmp_acc1;
                    for(int k = 0; k < 16; k++){
                        #pragma HLS unroll
                        tmp_acc0(k*64+63, k*64) = context[i*32+k][j];
                        tmp_acc1(k*64+63, k*64) = context[i*32+16+k][j];
                    }
                    fifo_to_acc0.write(tmp_acc0);
                    fifo_to_acc1.write(tmp_acc1);
                }
            }
        }
    }
}
1776
+
1777
// Buffers the int8 layernorm output X in URAM, then replays it:
// each row group is re-sent (D_ffn_SLR >> 4) * 2 times on fifo_ffn_out
// for the FFN matmul, and for the first D_div_16 iterations a 2-column
// slice is also sent on fifo_ffn_res for the residual path.
void ffn_buffer(
    const int L,
    tapa::istream<ap_uint<128>>& fifo_ffn_in,
    tapa::ostream<ap_uint<1024>>& fifo_ffn_out,
    tapa::ostream<ap_uint<1024>>& fifo_ffn_res
){
    ap_uint<64> X[MAX_SEQ_LEN][D_div_8]; // 8 bit
    #pragma HLS array_partition variable=X cyclic dim=1 factor=16
    #pragma HLS bind_storage variable=X type=ram_2p impl=uram

    // Fill phase: fuse 8 x 128-bit words into one 64-bit entry per row.
    for(int i = 0; i < (L >> 4); i++){
        for(int j = 0; j < D_div_8; j++){
            ap_uint<64> fuse_reg[16];
            #pragma HLS array_partition variable=fuse_reg complete

            for(int l = 0; l < 8;){
                #pragma HLS pipeline II=1

                if(!fifo_ffn_in.empty()){
                    ap_uint<128> tmp; fifo_ffn_in.try_read(tmp);
                    for(int k = 0; k < 16; k++){
                        #pragma HLS unroll
                        fuse_reg[k](l*8+7, l*8) = tmp(k*8+7, k*8);
                    }
                    l++;
                }
            }
            for(int k = 0; k < 16; k++){
                #pragma HLS unroll
                X[i*16+k][j] = fuse_reg[k];
            }
        }
    }

    // Replay phase.
    for(int i = 0; i < (L >> 4); i++){
        for(int iter = 0; iter < (D_ffn_SLR >> 4); iter++){
            for(int it = 0; it < 2; it++){
                for(int j = 0; j < D_div_8; j++){
                    ap_uint<1024> tmp;
                    for(int k = 0; k < 16; k++){
                        #pragma HLS unroll
                        tmp(k*64+63, k*64) = X[i*16+k][j];
                    }
                    fifo_ffn_out.write(tmp);
                }
            }

            // Residual slice: columns iter*2 and iter*2+1, first
            // D_div_16 iterations only.
            if(iter < D_div_16){
                for(int j = 0; j < 2; j++){
                    ap_uint<1024> send;
                    for(int k = 0; k < 16; k++){
                        #pragma HLS unroll
                        send(k*64+63, k*64) = X[i*16+k][iter*2+j];
                    }
                    fifo_ffn_res.write(send);
                }
            }
        }
    }
}
1837
+
1838
// Adds the int8 skip connection x (from fifo_x, 1024-bit packed) onto the
// 22-bit FFN partial sums arriving on fifo_in, and distributes the result
// to one of two output streams by row-group parity (i%2).
void ffn_residual(
    const int L,
    tapa::istream<ap_uint<1024>>& fifo_x,
    tapa::istream<ap_uint<512>>& fifo_in,
    tapa::ostreams<ap_uint<512>, 2>& fifo_out
){
    for(int i = 0; i < (L >> 4); i++){
        for(int j = 0; j < D_div_8; j++){
            ap_uint<1024> tmp_x = fifo_x.read();
            // One x word covers 8 incoming 512-bit partial-sum words.
            for(int k = 0; k < 8;){
                if(!fifo_in.empty()){
                    ap_uint<512> tmp; fifo_in.try_read(tmp);
                    ap_uint<512> tmp_o;
                    ap_uint<128> x = tmp_x(k*128+127, k*128);
                    for(int l = 0; l < 16; l++){
                        #pragma HLS unroll
                        // 22-bit accumulator + 8-bit residual, widened into
                        // the low bits of each 32-bit output slot.
                        ap_int<22> a = tmp(l*32+31, l*32);
                        ap_int<8> b = x(l*8+7, l*8);
                        ap_int<22> res = a + b;
                        tmp_o(l*32+31, l*32) = res;
                    }
                    fifo_out[i%2].write(tmp_o);
                    k++;
                }
            }
        }
    }
}
1866
+
1867
+ void measure_cycle(tapa::istream<bool>& fifo_fin, tapa::mmap<int> cycle_count){
1868
+ for(int cycle = 0;;cycle++){
1869
+ if(!fifo_fin.empty()){
1870
+ fifo_fin.read(nullptr);
1871
+ cycle_count[0] = cycle;
1872
+ break;
1873
+ }
1874
+ }
1875
+ }
1876
+
1877
+ void opt_kernel(
1878
+ const int L,
1879
+ const int L_out,
1880
+ const int seq_len,
1881
+ // tapa::mmap<int> inst, // inst[0] = L, inst[1] = reload_weight
1882
+ tapa::mmap<ap_uint<512>> X_acc0,
1883
+ tapa::mmap<ap_uint<512>> X_acc1,
1884
+ tapa::mmap<ap_uint<512>> W_acc0,
1885
+ tapa::mmap<ap_uint<512>> W_acc1,
1886
+ tapa::mmap<ap_uint<128>> acc0_out,
1887
+ // tapa::mmap<ap_uint<64>> acc1_out,
1888
+ tapa::mmap<int> cycle_count
1889
+ ){
1890
+ tapa::streams<ConfigInst, NUM_SLR+1, 4> fifo_inst_acc0("fifo_inst_acc0");
1891
+ tapa::streams<ConfigInst, NUM_SLR+1, 4> fifo_inst_acc1("fifo_inst_acc1");
1892
+ tapa::stream<ap_uint<512>, 16> fifo_X_acc0_slr0("fifo_X_acc0_slr0");
1893
+ tapa::stream<ap_uint<512>, 16> fifo_X_acc1_slr0("fifo_X_acc1_slr0");
1894
+ tapa::streams<ap_uint<1024>, NUM_SLR, 4> fifo_X_acc0("fifo_X_acc0");
1895
+ tapa::streams<ap_uint<1024>, NUM_SLR, 4> fifo_X_acc1("fifo_X_acc1");
1896
+ tapa::streams<ap_uint<512>, NUM_SLR+1, 8> fifo_W_acc0("fifo_W_acc0");
1897
+ tapa::streams<ap_uint<512>, NUM_SLR+1, 8> fifo_W_acc1("fifo_W_acc1");
1898
+ // tapa::streams<ap_uint<512>, NUM_SLR, 4> fifo_acc0_out("fifo_acc0_out");
1899
+ tapa::streams<ap_uint<512>, NUM_SLR, 16> fifo_acc0_to_sfu("fifo_acc0_to_sfu");
1900
+ tapa::streams<ap_uint<512>, NUM_SLR*2> fifo_sfu_buf_in("fifo_sfu_buf_in");
1901
+ tapa::streams<ap_uint<512>, NUM_SLR*2> fifo_sfu_buf_out("fifo_sfu_buf_out");
1902
+ // tapa::streams<ap_uint<64>, NUM_SLR> fifo_acc1_out("fifo_acc1_out");
1903
+ tapa::streams<ap_uint<256>, NUM_SLR, 8> fifo_from_acc1_to_acc0("fifo_from_acc1_to_acc0");
1904
+ tapa::streams<ap_uint<128>, NUM_SLR, 2> fifo_from_sfu_to_acc1("fifo_from_sfu_to_acc1");
1905
+ tapa::stream<bool> fifo_fin("fifo_fin");
1906
+
1907
+ tapa::streams<ap_uint<1024>, NUM_SLR> fifo_context("fifo_context");
1908
+ tapa::streams<ap_uint<1024>, NUM_SLR> fifo_cont_to_acc0("fifo_cont_to_acc0");
1909
+ tapa::streams<ap_uint<1024>, NUM_SLR> fifo_cont_to_acc1("fifo_cont_to_acc1");
1910
+ tapa::streams<ap_uint<512>, NUM_SLR> fifo_reduce_acc0("fifo_reduce_acc0");
1911
+ tapa::streams<ap_uint<512>, NUM_SLR> fifo_reduce_acc1("fifo_reduce_acc1");
1912
+
1913
+ // tapa::stream<ap_uint<128>> fifo_acc0_out("fifo_acc0_out");
1914
+ tapa::stream<ap_uint<128>> fifo_acc1_out("fifo_acc1_out");
1915
+
1916
+ tapa::stream<ap_uint<512>, 16> fifo_res_acc0("fifo_res_acc0");
1917
+ tapa::stream<ap_uint<512>, 16> fifo_res_acc1("fifo_res_acc1");
1918
+ tapa::stream<ap_uint<512>, D> fifo_ln_acc0("fifo_ln_acc0");
1919
+ tapa::stream<ap_uint<512>, D> fifo_ln_acc1("fifo_ln_acc1");
1920
+
1921
+ tapa::stream<ap_uint<128>> fifo_ffn_buffer_in("fifo_ffn_buffer_in");
1922
+ tapa::stream<ap_uint<1024>> fifo_ffn_buffer_out("fifo_ffn_buffer_out");
1923
+
1924
+ tapa::streams<ap_uint<512>, NUM_SLR, 16> fifo_gelu_in("fifo_gelu_in");
1925
+ tapa::streams<ap_uint<128>, NUM_SLR, D> fifo_gelu_out("fifo_gelu_out");
1926
+ tapa::streams<ap_uint<1024>, NUM_SLR> fifo_gelu_full("fifo_gelu_full");
1927
+
1928
+ tapa::stream<ap_uint<512>, 8> fifo_ffn2("fifo_ffn2");
1929
+ tapa::stream<ap_uint<1024>, D_div_8+2> fifo_skip_x("fifo_skip_x");
1930
+ tapa::streams<ap_uint<512>, 2> fifo_res2("fifo_res2");
1931
+
1932
+ tapa::streams<int, NUM_SLR> fifo_inst_switch_acc0("fifo_inst_switch_acc0");
1933
+ tapa::streams<int, NUM_SLR> fifo_inst_switch_acc1("fifo_inst_switch_acc1");
1934
+ tapa::streams<int, NUM_SLR> fifo_inst_switch_sfu("fifo_inst_switch_sfu");
1935
+ tapa::streams<int, NUM_SLR> fifo_inst_switch_context("fifo_inst_switch_context");
1936
+ tapa::streams<int, NUM_SLR> fifo_inst_switch_gelu("fifo_inst_switch_gelu");
1937
+ tapa::streams<int, NUM_SLR*2> fifo_inst_sfu_buffer("fifo_inst_sfu_buffer");
1938
+ tapa::streams<int, NUM_SLR> fifo_inst_data_pack("fifo_inst_data_pack");
1939
+ tapa::streams<int, NUM_SLR> fifo_inst_norm("fifo_inst_norm");
1940
+
1941
+ tapa::task()
1942
+ .invoke<tapa::join>(read_inst, seq_len, fifo_inst_acc0, fifo_inst_acc1)
1943
+ .invoke<tapa::join>(read_W, W_acc0, fifo_W_acc0)
1944
+ .invoke<tapa::join>(read_W, W_acc1, fifo_W_acc1)
1945
+ .invoke<tapa::join>(read_X, L, X_acc0, fifo_X_acc0_slr0)
1946
+ .invoke<tapa::join>(read_X, L, X_acc1, fifo_X_acc1_slr0)
1947
+ .invoke<tapa::join>(
1948
+ temporal_acc0_slr0,
1949
+ fifo_inst_acc0, fifo_inst_acc0,
1950
+ fifo_inst_switch_acc0,
1951
+ fifo_X_acc0_slr0, fifo_X_acc0,
1952
+ fifo_W_acc0, fifo_W_acc0,
1953
+ fifo_from_acc1_to_acc0,
1954
+ fifo_acc0_to_sfu,
1955
+ fifo_gelu_in,
1956
+ fifo_cont_to_acc0,
1957
+ fifo_ffn_buffer_out,
1958
+ fifo_reduce_acc0,
1959
+ fifo_res_acc0
1960
+ // fifo_fin
1961
+ )
1962
+ .invoke<tapa::join>(
1963
+ temporal_acc1_slr0,
1964
+ fifo_inst_acc1, fifo_inst_acc1,
1965
+ fifo_inst_switch_acc1,
1966
+ fifo_X_acc1_slr0, fifo_X_acc1,
1967
+ fifo_W_acc1, fifo_W_acc1,
1968
+ fifo_from_acc1_to_acc0,
1969
+ fifo_from_sfu_to_acc1,
1970
+ fifo_context,
1971
+ fifo_cont_to_acc1,
1972
+ fifo_reduce_acc1,
1973
+ fifo_res_acc1,
1974
+ fifo_gelu_full,
1975
+ fifo_ffn2
1976
+ // fifo_fin
1977
+ )
1978
+ .invoke<tapa::join>(
1979
+ residual, seq_len,
1980
+ fifo_res_acc0,
1981
+ fifo_ln_acc0
1982
+ )
1983
+ .invoke<tapa::join>(
1984
+ residual, seq_len,
1985
+ fifo_res_acc1,
1986
+ fifo_ln_acc1
1987
+ )
1988
+ .invoke<tapa::join, NUM_SLR-1>(
1989
+ temporal_acc0,
1990
+ fifo_inst_acc0, fifo_inst_acc0,
1991
+ fifo_inst_switch_acc0,
1992
+ fifo_X_acc0, fifo_X_acc0,
1993
+ fifo_W_acc0, fifo_W_acc0,
1994
+ fifo_from_acc1_to_acc0,
1995
+ fifo_acc0_to_sfu,
1996
+ fifo_cont_to_acc0,
1997
+ fifo_gelu_in,
1998
+ fifo_reduce_acc0, fifo_reduce_acc0
1999
+ )
2000
+ .invoke<tapa::join, NUM_SLR-1>(
2001
+ temporal_acc1,
2002
+ fifo_inst_acc1, fifo_inst_acc1,
2003
+ fifo_inst_switch_acc1,
2004
+ fifo_X_acc1, fifo_X_acc1,
2005
+ fifo_W_acc1, fifo_W_acc1,
2006
+ fifo_from_acc1_to_acc0,
2007
+ fifo_from_sfu_to_acc1,
2008
+ fifo_context,
2009
+ fifo_cont_to_acc1,
2010
+ fifo_reduce_acc1, fifo_reduce_acc1,
2011
+ fifo_gelu_full
2012
+ )
2013
+ .invoke<tapa::join, NUM_SLR>(packet_switch_acc, fifo_inst_switch_acc0, fifo_inst_switch_sfu, fifo_inst_switch_gelu)
2014
+ .invoke<tapa::join, NUM_SLR>(packet_switch_acc, fifo_inst_switch_acc1, fifo_inst_switch_context, fifo_inst_norm)
2015
+ .invoke<tapa::join>(write_zero, seq_len, D_write_zero_acc0, fifo_reduce_acc0)
2016
+ .invoke<tapa::join>(write_zero, seq_len, D_write_zero_acc1, fifo_reduce_acc1)
2017
+ .invoke<tapa::join, NUM_SLR>(
2018
+ sfu_acc_exp, fifo_inst_switch_sfu,
2019
+ fifo_acc0_to_sfu,
2020
+ fifo_sfu_buf_in,
2021
+ fifo_inst_sfu_buffer
2022
+ )
2023
+ .invoke<tapa::join>(
2024
+ sfu_buffer_slr0, fifo_inst_sfu_buffer,
2025
+ fifo_sfu_buf_in,
2026
+ fifo_ln_acc0,
2027
+ fifo_res2,
2028
+ fifo_sfu_buf_out
2029
+ )
2030
+ .invoke<tapa::join>(
2031
+ sfu_buffer_slr0, fifo_inst_sfu_buffer,
2032
+ fifo_sfu_buf_in,
2033
+ fifo_ln_acc1,
2034
+ fifo_res2,
2035
+ fifo_sfu_buf_out
2036
+ )
2037
+ .invoke<tapa::join, (NUM_SLR-1)*2>(
2038
+ sfu_buffer, fifo_inst_sfu_buffer,
2039
+ fifo_sfu_buf_in,
2040
+ fifo_sfu_buf_out
2041
+ )
2042
+ .invoke<tapa::join>(
2043
+ sfu_norm_slr0, fifo_inst_norm,
2044
+ fifo_sfu_buf_out,
2045
+ fifo_from_sfu_to_acc1,
2046
+ fifo_ffn_buffer_in,
2047
+ fifo_acc1_out
2048
+ )
2049
+ .invoke<tapa::join, NUM_SLR-1>(
2050
+ sfu_norm, fifo_inst_norm,
2051
+ fifo_sfu_buf_out,
2052
+ fifo_from_sfu_to_acc1
2053
+ )
2054
+ .invoke<tapa::join>(
2055
+ ffn_buffer, seq_len,
2056
+ fifo_ffn_buffer_in,
2057
+ fifo_ffn_buffer_out,
2058
+ fifo_skip_x
2059
+ )
2060
+ .invoke<tapa::join>(
2061
+ ffn_residual, seq_len,
2062
+ fifo_skip_x,
2063
+ fifo_ffn2,
2064
+ fifo_res2
2065
+ )
2066
+ .invoke<tapa::join, NUM_SLR>(
2067
+ context_buffer, fifo_inst_switch_context,
2068
+ fifo_context,
2069
+ fifo_cont_to_acc0, fifo_cont_to_acc1
2070
+ )
2071
+ .invoke<tapa::join, NUM_SLR>(
2072
+ sfu_gelu, fifo_inst_switch_gelu, fifo_inst_data_pack,
2073
+ fifo_gelu_in,
2074
+ fifo_gelu_out
2075
+ )
2076
+ .invoke<tapa::join, NUM_SLR>(
2077
+ data_packing, fifo_inst_data_pack,
2078
+ fifo_gelu_out,
2079
+ fifo_gelu_full
2080
+ )
2081
+ // .invoke<tapa::join, NUM_SLR>(write_attention, seq_len, acc0_out, fifo_acc0_out)
2082
+ .invoke<tapa::join>(write_mtx, L_out, acc0_out, fifo_acc1_out, fifo_fin)
2083
+ // .invoke<tapa::join>(write_mtx, L_out, acc1_out, fifo_acc1_out)
2084
+ .invoke<tapa::join>(measure_cycle, fifo_fin, cycle_count)
2085
+ .invoke<tapa::detach>(black_hole_inst, fifo_inst_acc0)
2086
+ .invoke<tapa::detach>(black_hole_inst, fifo_inst_acc1)
2087
+ .invoke<tapa::detach>(black_hole_ap_uint_1024, fifo_X_acc0)
2088
+ .invoke<tapa::detach>(black_hole_ap_uint_1024, fifo_X_acc1)
2089
+ .invoke<tapa::detach>(black_hole_ap_uint_512, fifo_W_acc0)
2090
+ .invoke<tapa::detach>(black_hole_ap_uint_512, fifo_W_acc1);
2091
+ }
gpt-2-medium/kernel-versal.cpp ADDED
The diff for this file is too large to render. See raw diff
 
gpt-2-medium/kernel.cpp ADDED
@@ -0,0 +1,1528 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#include <cmath>
#include <string>
#include <tapa.h>
#include <ap_int.h>
#include <hls_math.h>

// ---- model / tiling geometry ----
constexpr int D = 1024;                                   // hidden (embedding) dimension
constexpr int D_ffn = 4096;                               // FFN inner dimension (4 * D)
constexpr int N_head = 16;                                // attention heads
constexpr int MAX_SEQ_LEN = 1024;                         // scratchpads are sized for this
constexpr int MAX_SEQ_LEN_div_2 = MAX_SEQ_LEN / 2;
constexpr int MAX_SEQ_LEN_div_8 = MAX_SEQ_LEN / 8;
constexpr int NUM_SLR = 3;                                // SLRs carrying compute tasks
constexpr int NUM_DUM_SLR = 4;                            // padded SLR count used in weight sizing — TODO confirm intent
constexpr int TOTAL_PORT = NUM_SLR * 2;
constexpr int D_head = D / N_head;                        // per-head dimension (64)
constexpr int D_head_div_16 = D_head / 16;
constexpr int D_head_div_8 = D_head / 8;
constexpr int D_head_div_4 = D_head / 4;
constexpr int D_head_div_2 = D_head / 2;
constexpr int D_div_8 = D / 8;
constexpr int D_div_16 = D / 16;
constexpr int FFN_WEIGHT_SIZE = D * D_ffn;
constexpr int OUT_WEIGHT_SIZE = D * D_head * NUM_DUM_SLR * 5;
constexpr int WEIGHT_D = D * 2;
constexpr int QKV_WEIGHT_SIZE = D * D_head * NUM_DUM_SLR * 10; // multi-head attention
constexpr int TOTAL_WEIGHT_SIZE = OUT_WEIGHT_SIZE + QKV_WEIGHT_SIZE;
constexpr int CONTEXT_D = D_head_div_8 * 5;
constexpr int D_head_mul_5 = D_head * 5;
constexpr int D_write_zero = D / 32 * 5;                  // zero-words per row seeded into the reduce chain

// SIMD vector aliases used on the mmap/FIFO interfaces
using int_v16 = tapa::vec_t<int, 16>;
using int4_v128 = tapa::vec_t<ap_int<4>, 128>;            // 512b of 4-bit weights
using int8_v64 = tapa::vec_t<ap_int<8>, 64>;              // 512b of 8-bit activations
35
+
36
+ template <typename data_t>
37
+ inline void bh(tapa::istream<data_t> & q) {
38
+ #pragma HLS inline
39
+ for (;;) {
40
+ #pragma HLS pipeline II=1
41
+ data_t tmp; q.try_read(tmp);
42
+ }
43
+ }
44
+
45
// Concrete sink tasks for each stream payload type; each simply detaches
// into the generic drain `bh`. TAPA tasks cannot be templates, hence one
// wrapper per element type.
void black_hole_int(tapa::istream<int> & fifo_in) {
    bh(fifo_in);
}

void black_hole_int_v16(tapa::istream<int_v16> & fifo_in) {
    bh(fifo_in);
}

void black_hole_x(tapa::istream<int8_v64> & fifo_in) {
    bh(fifo_in);
}

void black_hole_w(tapa::istream<int4_v128> & fifo_in) {
    bh(fifo_in);
}

void black_hole_ap_uint_512(tapa::istream<ap_uint<512>> & fifo_in) {
    bh(fifo_in);
}

void black_hole_ap_uint_1024(tapa::istream<ap_uint<1024>> & fifo_in) {
    bh(fifo_in);
}
68
+
69
+ void read_W(
70
+ const int N,
71
+ tapa::async_mmap<ap_uint<512>>& vec,
72
+ tapa::ostream<ap_uint<512>>& fifo_out
73
+ ){
74
+
75
+ for(int i_req = 0, i_resp = 0; i_resp < (N >> 7);){
76
+ #pragma HLS pipeline II=1
77
+ if((i_req < (N >> 7)) & !vec.read_addr.full()){
78
+ vec.read_addr.write(i_req);
79
+ i_req++;
80
+ }
81
+ if(!vec.read_data.empty()){
82
+ ap_uint<512> tmp_o; vec.read_data.try_read(tmp_o);
83
+ fifo_out.write(tmp_o);
84
+ i_resp++;
85
+ }
86
+ }
87
+ }
88
+
89
+ void read_X(
90
+ const int N,
91
+ tapa::async_mmap<ap_uint<512>>& vec,
92
+ tapa::ostream<ap_uint<512>>& fifo_out
93
+ ){
94
+ for(int i_req = 0, i_resp = 0; i_resp < (N >> 6);){
95
+ #pragma HLS pipeline II=1
96
+ if((i_req < (N >> 6)) & !vec.read_addr.full()){
97
+ vec.read_addr.write(i_req);
98
+ i_req++;
99
+ }
100
+ if(!vec.read_data.empty()){
101
+ ap_uint<512> tmp_o; vec.read_data.try_read(tmp_o);
102
+ fifo_out.write(tmp_o);
103
+ i_resp++;
104
+ }
105
+ }
106
+ }
107
+
108
+ void read_inst(
109
+ const int L,
110
+ tapa::ostream<int>& fifo_out_acc0,
111
+ tapa::ostream<int>& fifo_out_acc1
112
+ ){
113
+ for(int stage_i = 0; stage_i < 20; stage_i++){
114
+ #pragma HLS pipeline II=1
115
+
116
+ const int stage = (stage_i < 15) ? (stage_i % 3) : 3;
117
+
118
+ if(stage == 3){
119
+ fifo_out_acc0.write(0);
120
+ fifo_out_acc1.write(0);
121
+
122
+ fifo_out_acc0.write(L/2);
123
+ fifo_out_acc1.write(L/2);
124
+ }
125
+ else if(stage != 1){
126
+ fifo_out_acc0.write(0);
127
+ fifo_out_acc1.write(0);
128
+
129
+ fifo_out_acc0.write(L);
130
+ fifo_out_acc1.write(L);
131
+ } else {
132
+ fifo_out_acc0.write(0);
133
+ fifo_out_acc0.write(L/2);
134
+
135
+ fifo_out_acc1.write(L/2);
136
+ fifo_out_acc1.write(L);
137
+ }
138
+ }
139
+ }
140
+
141
// Drain N 64-bit words from fifo_in and store them at output_mtx[0..N-1].
// Terminates only after all N writes are acknowledged on write_resp.
void write_mtx(
    const int N,
    tapa::async_mmap<ap_uint<64>>& output_mtx,
    tapa::istream<ap_uint<64>>& fifo_in
){

    for(int i_req = 0, i_resp = 0; i_resp < N;){
#pragma HLS pipeline II=1
        // Issue address + data together; bitwise & keeps all four conditions
        // evaluated in parallel hardware (no short-circuit).
        if((i_req < N) & !fifo_in.empty() & !output_mtx.write_addr.full() & !output_mtx.write_data.full()){
            output_mtx.write_addr.try_write(i_req);
            ap_uint<64> tmp; fifo_in.try_read(tmp);
            output_mtx.write_data.try_write(tmp);
            ++i_req;
        }
        if(!output_mtx.write_resp.empty()){
            // each response word acknowledges (value + 1) completed writes
            i_resp += unsigned(output_mtx.write_resp.read(nullptr))+1;
        }
    }
}
160
+
161
+ void write_zero(
162
+ const int L,
163
+ tapa::ostream<ap_uint<512>>& fifo_zero
164
+ ){
165
+ for(int i = 0; i < L * D_write_zero;){
166
+ if(!fifo_zero.full()){
167
+ ap_uint<512> tmp = 0;
168
+ fifo_zero.try_write(tmp);
169
+ i++;
170
+ }
171
+ }
172
+ }
173
+
174
+ // acc slr0 master node
175
+ void temporal_acc0_slr0(
176
+ const int L,
177
+ tapa::istream<int>& fifo_len_in,
178
+ tapa::ostream<int>& fifo_len_out,
179
+ tapa::istream<ap_uint<512>>& fifo_X_in,
180
+ tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
181
+ tapa::istream<ap_uint<512>>& fifo_W_in,
182
+ tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
183
+ tapa::istream<ap_uint<128>>& fifo_from_acc1,
184
+ tapa::ostream<ap_uint<512>>& fifo_O_out,
185
+ tapa::istream<ap_uint<1024>>& fifo_context,
186
+ tapa::istream<ap_uint<512>>& fifo_reduce_recv,
187
+ tapa::ostream<ap_uint<64>>& fifo_write,
188
+ tapa::ostream<bool>& fifo_fin
189
+ ){
190
+
191
+ ap_uint<64> scratchpad_q[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
192
+ #pragma HLS array_partition variable=scratchpad_q cyclic dim=1 factor=16
193
+ #pragma HLS array_partition variable=scratchpad_q cyclic dim=2 factor=2
194
+
195
+ ap_uint<64> scratchpad_k[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
196
+ #pragma HLS array_partition variable=scratchpad_k cyclic dim=1 factor=16
197
+ #pragma HLS array_partition variable=scratchpad_k cyclic dim=2 factor=2
198
+ #pragma HLS bind_storage variable=scratchpad_k type=ram_2p impl=uram
199
+
200
+ ap_uint<64> X[MAX_SEQ_LEN][D_div_8]; // 8 bit
201
+ #pragma HLS array_partition variable=X cyclic dim=1 factor=16
202
+ #pragma HLS array_partition variable=X cyclic dim=2 factor=2
203
+ #pragma HLS bind_storage variable=X type=ram_2p impl=uram
204
+
205
+ for(int stage_i = 0; stage_i < 20; stage_i++){
206
+
207
+ //TODO: stage send from inst
208
+
209
+ // stage 0: WqX
210
+ // stage 1: WkX0 <- acc1
211
+ // stage 2: QK^T
212
+
213
+ ap_uint<32> W[D_head][D_div_8]; // TODO: reduce dimension
214
+ #pragma HLS array_partition variable=W cyclic dim=1 factor=16
215
+
216
+ const int start = fifo_len_in.read();
217
+ const int end = fifo_len_in.read();
218
+ fifo_len_out.write(start);
219
+ fifo_len_out.write(end);
220
+
221
+ const int stage = (stage_i < 15) ? (stage_i % 3) : 3;
222
+
223
+ // load weights and forward
224
+ if(stage != 2) { // TODO: 1d array & uniform access
225
+ for(int i = 0; i < D_head_div_4; i++){
226
+ load_weight:
227
+ for(int j = 0; j < D_div_8;){
228
+ if(!fifo_W_in.empty()){
229
+ ap_uint<512> val; fifo_W_in.try_read(val);
230
+
231
+ for(int k = 0; k < 4; k++){
232
+ #pragma HLS unroll
233
+ W[i*4+k][j] = ap_uint<32>(val(k*32+31, k*32));
234
+ }
235
+ val = ap_uint<512>(val >> 128);
236
+ fifo_W_out.write(val);
237
+ j++;
238
+ }
239
+ }
240
+ }
241
+ }
242
+
243
+ int j_bound = (stage == 2) ? (L >> 4) : D_head_div_16;
244
+ j_bound = (stage == 3) ? D_div_16 : j_bound;
245
+ int k_bound = (stage > 1) ? D_head_div_8 : D_div_8;
246
+
247
+ // stage 1: compute Q
248
+ for(int i = (start >> 4); i < (end >> 4); i++){ // make sure L is multiple of 16
249
+
250
+ if(stage_i == 0){
251
+ for(int ii = 0; ii < 2; ii++){ // load only 1 time
252
+ load_x:
253
+ for(int jj = 0; jj < D_div_8;){
254
+ if(!fifo_X_in.empty()){
255
+ ap_uint<512> val; fifo_X_in.try_read(val);
256
+
257
+ for(int k = 0; k < 8; k++){
258
+ #pragma HLS unroll
259
+ X[i*16+ii*8+k][jj] = ap_uint<64>(val(k*64+63, k*64));
260
+ }
261
+ jj++;
262
+ }
263
+ }
264
+ }
265
+ }
266
+
267
+ for(int j = 0; j < j_bound; j++){
268
+
269
+ ap_int<38> acc_vec[8][16][8];
270
+ #pragma HLS array_partition variable=acc_vec dim=1 complete
271
+ #pragma HLS array_partition variable=acc_vec dim=2 complete
272
+ #pragma HLS array_partition variable=acc_vec dim=3 complete
273
+
274
+ for(int ii = 0; ii < 8; ii++){
275
+ #pragma HLS unroll
276
+ for(int kk = 0; kk < 16; kk++){
277
+ #pragma HLS unroll
278
+ for(int k = 0; k < 8; k++){
279
+ #pragma HLS unroll
280
+ acc_vec[ii][kk][k] = 0;
281
+ }
282
+ }
283
+ }
284
+
285
+ compute:
286
+ for(int k = 0; k < k_bound; k++){ // reduction dim
287
+ #pragma HLS pipeline II=1
288
+
289
+ ap_uint<64> op1_mtx[16];
290
+ ap_uint<64> op2_mtx[16];
291
+ #pragma HLS array_partition variable=op1_mtx complete
292
+ #pragma HLS array_partition variable=op2_mtx complete
293
+
294
+ ap_uint<1024> recv_pkt;
295
+
296
+ if(stage == 3) {
297
+ recv_pkt = fifo_context.read();
298
+ }
299
+
300
+ for(int ii = 0; ii < 16; ii++){
301
+ #pragma HLS unroll
302
+ if(stage == 3){
303
+ op1_mtx[ii] = ap_uint<64>(W[k*8+ii%8][j*2+ii/8]); // change it
304
+ op2_mtx[ii] = ap_uint<64>(recv_pkt(ii*64+63, ii*64));
305
+ } else if(stage == 2) {
306
+ op1_mtx[ii] = scratchpad_q[i*16+ii][k];
307
+ op2_mtx[ii] = scratchpad_k[j*16+ii][k];
308
+ } else {
309
+ op1_mtx[ii] = ap_uint<64>(W[j*16+ii][k]);
310
+ op2_mtx[ii] = X[i*16+ii][k];
311
+ }
312
+ }
313
+
314
+ if(stage < 2){
315
+ ap_uint<1024> send_pkt = ap_uint<1024>((
316
+ op2_mtx[0], op2_mtx[1], op2_mtx[2], op2_mtx[3], op2_mtx[4], op2_mtx[5], op2_mtx[6], op2_mtx[7],
317
+ op2_mtx[8], op2_mtx[9], op2_mtx[10], op2_mtx[11], op2_mtx[12], op2_mtx[13], op2_mtx[14], op2_mtx[15]
318
+ ));
319
+ fifo_X_out.write(send_pkt);
320
+ }
321
+
322
+ for(int ii = 0; ii < 8; ii++){
323
+ #pragma HLS unroll
324
+ for(int kk = 0; kk < 16; kk++){
325
+ #pragma HLS unroll
326
+ for(int l = 0; l < 8; l++){
327
+ #pragma HLS unroll
328
+ ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
329
+ op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
330
+ if(stage == 2){
331
+ op1 = ap_int<8>(op1_mtx[l*2](ii*8+7, ii*8));
332
+ op2 = ap_int<8>(op1_mtx[l*2+1](ii*8+7, ii*8));
333
+ } else {
334
+ op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
335
+ op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
336
+ }
337
+ ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
338
+ acc_vec[ii][kk][l] += w_pack * op3;
339
+ }
340
+ }
341
+ }
342
+ }
343
+
344
+ ap_int<22> acc_final[16][16];
345
+ #pragma HLS array_partition variable=acc_final dim=1 complete
346
+ #pragma HLS array_partition variable=acc_final dim=2 complete
347
+
348
+ for(int ii = 0; ii < 16; ii++){
349
+ #pragma HLS unroll
350
+ for(int k = 0; k < 16; k++){
351
+ #pragma HLS unroll
352
+ acc_final[ii][k] = 0;
353
+ }
354
+ }
355
+
356
+ reduction:
357
+ for(int kk = 0; kk < 8; kk++){
358
+ for(int ii = 0; ii < 16; ii++){
359
+ #pragma HLS unroll
360
+ for(int k = 0; k < 8; k++){
361
+ #pragma HLS unroll
362
+ ap_int<19> res0; ap_int<19> res1;
363
+ (res1, res0) = acc_vec[kk][ii][k];
364
+ res1 = res1 + res0[18];
365
+ acc_final[ii][k*2] += res0;
366
+ acc_final[ii][k*2+1] += res1;
367
+ if(kk == 7 && stage < 2) {
368
+ acc_final[ii][k*2] = acc_final[ii][k*2] >> 8;
369
+ acc_final[ii][k*2+1] = acc_final[ii][k*2] >> 8;
370
+ }
371
+ }
372
+ }
373
+ }
374
+
375
+ if(stage == 0){
376
+ for(int ii = 0; ii < 16; ii++){
377
+ #pragma HLS unroll
378
+ for(int k = 0; k < 16; k++){
379
+ #pragma HLS unroll
380
+ int offset = k%8;
381
+ scratchpad_q[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k]);
382
+ }
383
+ }
384
+ } else if (stage == 1){
385
+ for(int ii = 0; ii < 16; ii++){
386
+ ap_uint<128> tmp = fifo_from_acc1.read();
387
+
388
+ for(int k = 0; k < 16; k++){
389
+ #pragma HLS unroll
390
+ int offset = k%8;
391
+ scratchpad_k[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k]);
392
+ }
393
+ for(int k = 0; k < 2; k++){
394
+ #pragma HLS unroll
395
+ scratchpad_k[end + i*16 + ii][j*2+k] = ap_uint<64>(tmp(k*64+63, k*64));
396
+ }
397
+ }
398
+ } else if(stage == 2){
399
+ for(int ii = 0; ii < 16; ii++){
400
+ #pragma HLS pipeline II=1
401
+ ap_uint<512> tmp;
402
+ for(int kk = 0; kk < 16; kk++){
403
+ #pragma HLS unroll
404
+ tmp(kk*32+31, kk*32) = tapa::bit_cast<ap_uint<32>>(acc_final[ii][kk]);
405
+ }
406
+ fifo_O_out.write(tmp);
407
+ }
408
+ } else {
409
+ final_acc:
410
+ for(int ii = 0; ii < 16;){
411
+ #pragma HLS pipeline II=1
412
+ #pragma HLS dependence variable=X type=inter false
413
+ if(!fifo_reduce_recv.empty()){
414
+ ap_uint<512> tmp_recv; fifo_reduce_recv.try_read(tmp_recv);
415
+ for(int k = 0; k < 16; k++){
416
+ #pragma HLS unroll
417
+ acc_final[ii][k] += ap_int<22>(tmp_recv(k*32+21, k*32));
418
+ X[i*16+ii][j*2+k/8]((k%8)*8+7, (k%8)*8) = ap_int<8>(acc_final[ii][k] >> 8); //TODO: change
419
+ }
420
+
421
+ ii++;
422
+ }
423
+ }
424
+ }
425
+ }
426
+ }
427
+ }
428
+ fifo_fin.write(true);
429
+
430
+ write:
431
+ for(int i = 0; i < L; i++){
432
+ for(int j = 0; j < D_div_8; j++){
433
+ #pragma HLS pipeline II=1
434
+ fifo_write.write(X[i][j]);
435
+ }
436
+ }
437
+ }
438
+
439
// acc0 worker node (SLRs 1..NUM_SLR-1): same 4-stage schedule as
// temporal_acc0_slr0, but activations arrive as 1024-bit packets from the
// previous SLR (and are forwarded onward), and stage-3 partial sums are
// accumulated and passed along via fifo_reduce_send instead of being
// written back to a local X array.
void temporal_acc0(
    const int L,
    tapa::istream<int>& fifo_len_in,
    tapa::ostream<int>& fifo_len_out,
    tapa::istream<ap_uint<1024>>& fifo_X_in,
    tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
    tapa::istream<ap_uint<512>>& fifo_W_in,
    tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
    tapa::istream<ap_uint<128>>& fifo_from_acc1,
    tapa::ostream<ap_uint<512>>& fifo_O_out,
    tapa::istream<ap_uint<1024>>& fifo_context,
    tapa::istream<ap_uint<512>>& fifo_reduce_recv,
    tapa::ostream<ap_uint<512>>& fifo_reduce_send,
    tapa::ostream<bool>& fifo_fin
){

    // Q rows, 8-bit packed.
    ap_uint<64> scratchpad_q[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
#pragma HLS array_partition variable=scratchpad_q cyclic dim=1 factor=16
#pragma HLS array_partition variable=scratchpad_q cyclic dim=2 factor=2

    // K rows (computed + received from acc1), kept in URAM.
    ap_uint<64> scratchpad_k[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
#pragma HLS array_partition variable=scratchpad_k cyclic dim=1 factor=16
#pragma HLS array_partition variable=scratchpad_k cyclic dim=2 factor=2
#pragma HLS bind_storage variable=scratchpad_k type=ram_2p impl=uram

    for(int stage_i = 0; stage_i < 20; stage_i++){
#pragma HLS loop_flatten off

        // stage 0: WqX
        // stage 1: WkX0 <- acc1
        // stage 2: QK^T
        // stage 3: WoO

        // Per-stage weight tile: 4-bit weights, 8 packed per 32-bit word.
        ap_uint<32> W[D_head][D_div_8]; // 4 bit
#pragma HLS array_partition variable=W cyclic dim=1 factor=16

        // Receive this stage's row range and forward it down the chain.
        const int start = fifo_len_in.read();
        const int end = fifo_len_in.read();
        fifo_len_out.write(start);
        fifo_len_out.write(end);

        const int stage = (stage_i < 15) ? (stage_i % 3) : 3;

        // Load this SLR's weight slice (low 128 bits of each beat) and
        // forward the rest to the next SLR. Stage 2 uses no weights.
        if(stage != 2) {
            for(int i = 0; i < D_head_div_4; i++){
                load_weight:
                for(int j = 0; j < D_div_8;){
                    if(!fifo_W_in.empty()){
                        ap_uint<512> val; fifo_W_in.try_read(val);

                        for(int k = 0; k < 4; k++){
#pragma HLS unroll
                            W[i*4+k][j] = ap_uint<32>(val(k*32+31, k*32));
                        }
                        val = ap_uint<512>(val >> 128);
                        fifo_W_out.write(val);
                        j++;
                    }
                }
            }
        }

        // Stage-dependent tile bounds (see temporal_acc0_slr0).
        int j_bound = (stage == 2) ? (L >> 4) : D_head_div_16;
        j_bound = (stage == 3) ? D_div_16 : j_bound;
        int k_bound = (stage > 1) ? D_head_div_8 : D_div_8;

        for(int i = (start >> 4); i < (end >> 4); i++){ // make sure L is multiple of 64
            for(int j = 0; j < j_bound; j++){

                // Partial accumulators: [byte lane][row][packed pair].
                ap_int<38> acc_vec[8][16][8];
#pragma HLS array_partition variable=acc_vec dim=1 complete
#pragma HLS array_partition variable=acc_vec dim=2 complete
#pragma HLS array_partition variable=acc_vec dim=3 complete

                for(int ii = 0; ii < 8; ii++){
#pragma HLS unroll
                    for(int kk = 0; kk < 16; kk++){
#pragma HLS unroll
                        for(int k = 0; k < 8; k++){
#pragma HLS unroll
                            acc_vec[ii][kk][k] = 0;
                        }
                    }
                }

                compute:
                for(int k = 0; k < k_bound; k++){ // reduction dim
#pragma HLS pipeline II=1

                    ap_uint<64> op1_mtx[16];
                    ap_uint<64> op2_mtx[16];
#pragma HLS array_partition variable=op1_mtx complete
#pragma HLS array_partition variable=op2_mtx complete

                    // Activations arrive as packets (stages 0/1) or context (stage 3);
                    // stage 2 reads only the local scratchpads.
                    ap_uint<1024> recv_pkt;
                    if(stage == 3){
                        recv_pkt = fifo_context.read();
                    } else if(stage != 2) {
                        recv_pkt = fifo_X_in.read();
                        fifo_X_out.write(recv_pkt);
                    }

                    for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                        if(stage == 3){
                            op1_mtx[ii] = ap_uint<64>(W[k*8+ii%8][j*2+ii/8]);
                            op2_mtx[ii] = ap_uint<64>(recv_pkt(ii*64+63, ii*64));
                        } else if(stage == 2) {
                            op1_mtx[ii] = scratchpad_q[i*16+ii][k];
                            op2_mtx[ii] = scratchpad_k[j*16+ii][k];
                        } else {
                            op1_mtx[ii] = ap_uint<64>(W[j*16+ii][k]);
                            op2_mtx[ii] = ap_uint<64>(recv_pkt(ii*64+63, ii*64));
                        }
                    }

                    // Packed dual-4-bit MAC (two weights share one multiplier).
                    for(int ii = 0; ii < 8; ii++){
#pragma HLS unroll
                        for(int kk = 0; kk < 16; kk++){
#pragma HLS unroll
                            for(int l = 0; l < 8; l++){
#pragma HLS unroll
                                ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
                                op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
                                if(stage == 2){
                                    op1 = ap_int<8>(op1_mtx[l*2](ii*8+7, ii*8));
                                    op2 = ap_int<8>(op1_mtx[l*2+1](ii*8+7, ii*8));
                                } else {
                                    op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
                                    op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
                                }
                                ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
                                acc_vec[ii][kk][l] += w_pack * op3;
                            }
                        }
                    }
                }

                ap_int<22> acc_final[16][16];
#pragma HLS array_partition variable=acc_final dim=1 complete
#pragma HLS array_partition variable=acc_final dim=2 complete

                for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                    for(int k = 0; k < 16; k++){
#pragma HLS unroll
                        acc_final[ii][k] = 0;
                    }
                }

                // Unpack the two 19-bit lanes of each packed accumulator
                // (carry-correct via res0's sign bit) and reduce over lanes.
                reduction:
                for(int kk = 0; kk < 8; kk++){
                    for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                        for(int k = 0; k < 8; k++){
#pragma HLS unroll
                            ap_int<19> res0; ap_int<19> res1;
                            (res1, res0) = acc_vec[kk][ii][k];
                            res1 = res1 + res0[18];
                            acc_final[ii][k*2] += res0;
                            acc_final[ii][k*2+1] += res1;
                            if(kk == 7 && stage < 2) {
                                acc_final[ii][k*2] = acc_final[ii][k*2] >> 8; // rescale & clamp
                                acc_final[ii][k*2+1] = acc_final[ii][k*2+1] >> 8; // rescale & clamp
                            }
                        }
                    }
                }

                if(stage == 0){
                    // Store quantized Q rows.
                    for(int ii = 0; ii < 16; ii++){
#pragma HLS unroll
                        for(int k = 0; k < 16; k++){
#pragma HLS unroll
                            int offset = k%8;
                            scratchpad_q[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k]);
                        }
                    }
                } else if (stage == 1){
                    // Store locally-computed K rows plus rows received from acc1.
                    for(int ii = 0; ii < 16; ii++){
                        ap_uint<128> tmp = fifo_from_acc1.read();

                        for(int k = 0; k < 16; k++){
#pragma HLS unroll
                            int offset = k%8;
                            scratchpad_k[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[ii][k]);
                        }
                        for(int k = 0; k < 2; k++){
#pragma HLS unroll
                            scratchpad_k[end + i*16 + ii][j*2+k] = ap_uint<64>(tmp(k*64+63, k*64));
                        }
                    }
                } else if(stage == 2){
                    // Stream raw attention scores (22-bit in 32-bit slots).
                    for(int ii = 0; ii < 16; ii++){
#pragma HLS pipeline II=1
                        ap_uint<512> tmp;
                        for(int kk = 0; kk < 16; kk++){
#pragma HLS unroll
                            tmp(kk*32+31, kk*32) = tapa::bit_cast<ap_uint<32>>(acc_final[ii][kk]);
                        }
                        fifo_O_out.write(tmp);
                    }
                } else {
                    // Stage 3: fold in upstream partial sums and forward the
                    // accumulated result to the next node in the reduce chain.
                    final_acc:
                    for(int ii = 0; ii < 16;){
#pragma HLS pipeline II=1
                        if(!fifo_reduce_recv.empty()){
                            ap_uint<512> tmp_recv; fifo_reduce_recv.try_read(tmp_recv);
                            ap_uint<512> tmp;
                            for(int k = 0; k < 16; k++){
#pragma HLS unroll
                                acc_final[ii][k] += ap_int<22>(tmp_recv(k*32+21, k*32));
                                tmp(k*32+21, k*32) = acc_final[ii][k];
                            }
                            fifo_reduce_send.write(tmp);
                            ii++;
                        }
                    }
                }
            }
        }
    }
    fifo_fin.write(true);
}
665
+
666
// acc1 SLR0 master node.
// Runs 20 rounds: rounds 0..14 cycle through stages 0..2, rounds 15..19 are
// stage 3 (see the `stage` computation below):
//   stage 0: Wv*X          (result kept in the local scratchpad)
//   stage 1: Wk*X -> acc0  (quantized tiles sent to the acc0 chain)
//   stage 2: Softmax(QK)*V (4-bit attention weights arrive from the SFU)
//   stage 3: Wo*O          (context from fifo_context; partial sums reduced
//                           across SLRs arrive on fifo_reduce_recv)
// Unlike the generic temporal_acc1 workers, this master node owns the X
// buffer: it loads X from fifo_X_in once (stage_i == 0), broadcasts operand
// packets to the other SLRs via fifo_X_out, and finally streams X out on
// fifo_write for debugging.
void temporal_acc1_slr0(
  const int L,
  tapa::istream<int>& fifo_len_in,
  tapa::ostream<int>& fifo_len_out,
  tapa::istream<ap_uint<512>>& fifo_X_in,
  tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
  tapa::istream<ap_uint<512>>& fifo_W_in,
  tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
  tapa::ostream<ap_uint<128>>& fifo_to_acc0,
  tapa::istream<ap_uint<128>>& fifo_from_sfu,
  tapa::ostream<ap_uint<1024>>& fifo_O_out,
  tapa::istream<ap_uint<1024>>& fifo_context,
  tapa::istream<ap_uint<512>>& fifo_reduce_recv,
  tapa::ostream<ap_uint<64>>& fifo_write,
  tapa::ostream<bool>& fifo_fin
){
  // Activation buffer: eight 8-bit values packed per 64-bit word, in URAM.
  ap_uint<64> X[MAX_SEQ_LEN][D_div_8]; // 8 bit
  #pragma HLS array_partition variable=X cyclic dim=1 factor=16
  #pragma HLS array_partition variable=X cyclic dim=2 factor=2
  #pragma HLS bind_storage variable=X type=ram_2p impl=uram

  // Stage-0 output (V tile), consumed again in stage 2.
  ap_uint<64> scratchpad[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
  #pragma HLS array_partition variable=scratchpad cyclic dim=1 factor=16
  #pragma HLS array_partition variable=scratchpad cyclic dim=2 factor=2
  #pragma HLS bind_storage variable=scratchpad type=ram_2p impl=uram

  // ap_uint<64> scratchpad_out[MAX_SEQ_LEN][D_head_div_8];
  // #pragma HLS array_partition variable=scratchpad_out cyclic dim=1 factor=16
  // #pragma HLS array_partition variable=scratchpad_out cyclic dim=2 factor=2

  for(int stage_i = 0; stage_i < 20; stage_i++){

    // stage 0: WvX
    // stage 1: WkX1 -> acc0
    // stage 2: Softmax(QK)V <- acc0
    // stage 3: WoO

    // Weight tile for this round: eight 4-bit weights per 32-bit word.
    ap_uint<32> W[D_head][D_div_8]; // 4 bit
    #pragma HLS array_partition variable=W cyclic dim=1 factor=16

    // Row range [start, end) assigned to this round; forwarded downstream.
    const int start = fifo_len_in.read();
    const int end = fifo_len_in.read();
    fifo_len_out.write(start);
    fifo_len_out.write(end);

    const int stage = (stage_i < 15) ? (stage_i % 3) : 3;

    // load weights and forward (stage 2 reuses stage-0/1 data, no weights)
    if(stage != 2) {
      for(int i = 0; i < D_head_div_4; i++){
        load_weight:
        for(int j = 0; j < D_div_8;){
          if(!fifo_W_in.empty()){
            ap_uint<512> val; fifo_W_in.try_read(val);

            for(int k = 0; k < 4; k++){
              #pragma HLS unroll
              W[i*4+k][j] = ap_uint<32>(val(k*32+31, k*32));
            }
            // Peel off the low 128 bits for this SLR, shift, and forward the
            // remainder so each downstream SLR consumes its own slice.
            val = ap_uint<512>(val >> 128);
            fifo_W_out.write(val);
            j++;
          }
        }
      }
    }

    // Inner-product trip count and output-tile count depend on the stage.
    int k_bound = (stage == 2) ? (L >> 3) : D_div_8;
    k_bound = (stage == 3) ? D_head_div_8 : k_bound;
    int j_bound = (stage == 3) ? D_div_16 : D_head_div_16;

    for(int i = (start >> 4); i < (end >> 4); i++){ // make sure L is multiple of 4

      // Per-tile 4-bit attention weights gathered from the SFU (stage 2).
      ap_uint<32> cache_attn[MAX_SEQ_LEN_div_8][16];
      #pragma HLS array_partition variable=cache_attn dim=2 complete
      #pragma HLS array_partition variable=cache_attn dim=1 cyclic factor=2

      if(stage_i == 0){
        for(int ii = 0; ii < 2; ii++){ // load only 1 time
          load_x:
          for(int jj = 0; jj < D_div_8;){
            if(!fifo_X_in.empty()){
              ap_uint<512> val; fifo_X_in.try_read(val);

              for(int k = 0; k < 8; k++){
                #pragma HLS unroll
                X[i*16+ii*8+k][jj] = ap_uint<64>(val(k*64+63, k*64));
              }
              jj++;
            }
          }
        }
      } else if (stage == 2) {
        // Fuse eight 4-bit nibbles per lane into one 32-bit register before
        // committing to cache_attn.
        for(int ii = 0; ii < (L >> 3); ii++){
          ap_uint<32> fuse_reg[16];
          load_attn:
          for(int offset = 0; offset < 8;){
            #pragma HLS pipeline II=1
            if(!fifo_from_sfu.empty()){
              ap_uint<128> val; fifo_from_sfu.try_read(val);
              for(int k = 0; k < 16; k++){
                #pragma HLS unroll
                fuse_reg[k](offset*4+3, offset*4) = ap_int<4>(val(k*8+3, k*8));
              }
              offset++;
            }
          }
          for(int k = 0; k < 16; k++){
            #pragma HLS unroll
            cache_attn[ii][k] = fuse_reg[k];
          }
        }
      }

      for(int j = 0; j < j_bound; j++){

        // 38-bit accumulators; each carries TWO packed 19-bit partial sums
        // produced by the packed multiply in the compute loop below.
        ap_int<38> acc_vec[8][16][8];
        #pragma HLS array_partition variable=acc_vec dim=1 complete
        #pragma HLS array_partition variable=acc_vec dim=2 complete
        #pragma HLS array_partition variable=acc_vec dim=3 complete

        for(int ii = 0; ii < 8; ii++){
          #pragma HLS unroll
          for(int kk = 0; kk < 16; kk++){
            #pragma HLS unroll
            for(int k = 0; k < 8; k++){
              #pragma HLS unroll
              acc_vec[ii][kk][k] = 0;
            }
          }
        }

        compute:
        for(int k = 0; k < k_bound; k++){
          #pragma HLS pipeline II=1

          ap_uint<64> op1_mtx[16];
          ap_uint<64> op2_mtx[16];
          #pragma HLS array_partition variable=op1_mtx complete
          #pragma HLS array_partition variable=op2_mtx complete

          ap_uint<1024> recv_pkt;

          if(stage == 3) {
            recv_pkt = fifo_context.read();
          }

          // Operand selection per stage: weights or cached attention on
          // port 1; X, context packet, or scratchpad (V) on port 2.
          for(int ii = 0; ii < 16; ii++){
            #pragma HLS unroll
            if(stage == 3){
              op1_mtx[ii] = ap_uint<64>(W[k*8+ii%8][j*2+ii/8]);
              op2_mtx[ii] = recv_pkt(ii*64+63, ii*64);
            } else if(stage != 2) {
              op1_mtx[ii] = ap_uint<64>(W[j*16+ii][k]);
              op2_mtx[ii] = X[i*16+ii][k];
            } else {
              op1_mtx[ii] = ap_uint<64>(cache_attn[k][ii]);
              op2_mtx[ii] = scratchpad[k*8+ii/2][j*2+(ii%2)];
            }
          }

          // Broadcast this node's activation slice to the worker SLRs.
          if(stage < 2){
            ap_uint<1024> send_pkt = ap_uint<1024>((
              op2_mtx[0], op2_mtx[1], op2_mtx[2], op2_mtx[3], op2_mtx[4], op2_mtx[5], op2_mtx[6], op2_mtx[7],
              op2_mtx[8], op2_mtx[9], op2_mtx[10], op2_mtx[11], op2_mtx[12], op2_mtx[13], op2_mtx[14], op2_mtx[15]
            ));
            fifo_X_out.write(send_pkt);
          }

          // Packed MAC: two signed 4-bit weights share one multiplier by
          // placing op2 19 bits above op1 (w_pack = (op2 << 19) + op1); the
          // 38-bit accumulator then holds both partial products.
          for(int ii = 0; ii < 8; ii++){
            #pragma HLS unroll
            for(int kk = 0; kk < 16; kk++){
              #pragma HLS unroll
              for(int l = 0; l < 8; l++){
                #pragma HLS unroll
                ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
                op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
                op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
                op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
                ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
                acc_vec[ii][kk][l] += w_pack * op3;
              }
            }
          }
        }

        ap_int<22> acc_final[16][16];
        #pragma HLS array_partition variable=acc_final dim=1 complete
        #pragma HLS array_partition variable=acc_final dim=2 complete

        for(int ii = 0; ii < 16; ii++){
          #pragma HLS unroll
          for(int k = 0; k < 16; k++){
            #pragma HLS unroll
            acc_final[ii][k] = 0;
          }
        }

        // Unpack the two 19-bit halves of each accumulator; res1 absorbs a
        // carry correction from res0's sign bit. On the last step, rescale
        // by >> 8 except in stage 3 (stage 3 is rescaled after the
        // cross-SLR reduction in final_acc).
        reduction:
        for(int kk = 0; kk < 8; kk++){
          for(int ii = 0; ii < 16; ii++){
            #pragma HLS unroll
            for(int k = 0; k < 8; k++){
              #pragma HLS unroll
              ap_int<19> res0; ap_int<19> res1;
              (res1, res0) = acc_vec[kk][ii][k];
              res1 = res1 + res0[18];
              acc_final[ii][k*2] += res0;
              acc_final[ii][k*2+1] += res1;
              if(kk == 7 && stage != 3) {
                acc_final[ii][k*2] = acc_final[ii][k*2] >> 8; // rescale & clamp
                acc_final[ii][k*2+1] = acc_final[ii][k*2+1] >> 8; // rescale & clamp
              }
            }
          }
        }

        if(stage == 0){
          // Keep the V tile local for stage 2 (note the transposed write).
          for(int ii = 0; ii < 16; ii++){
            #pragma HLS unroll
            for(int k = 0; k < 16; k++){
              #pragma HLS unroll
              int offset = k%8;
              scratchpad[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[k][ii]);
            }
          }
        } else if (stage == 2){
          // Emit the context tile (128 bytes per 1024-bit packet).
          for(int ii = 0; ii < 2; ii++){
            #pragma HLS pipeline II=1
            ap_uint<1024> tmp;
            for(int jj = 0; jj < 8; jj++){
              #pragma HLS unroll
              for(int k = 0; k < 16; k++){
                #pragma HLS unroll
                tmp((jj*16+k)*8+7, (jj*16+k)*8) = ap_int<8>(acc_final[k][ii*8+jj]);
              }
            }
            fifo_O_out.write(tmp);
          }
        } else if (stage == 1) {
          // Send the K tile to the acc0 chain for the QK product.
          for(int ii = 0; ii < 16; ii++){
            ap_uint<128> tmp;
            for(int k = 0; k < 16; k++){
              #pragma HLS unroll
              tmp(k*8+7, k*8) = ap_int<8>(acc_final[ii][k]);
            }
            fifo_to_acc0.write(tmp);
          }
        } else {
          // stage 3: fold in partial sums reduced over the other SLRs,
          // rescale by >> 8, and write the result back into X in place.
          final_acc:
          for(int ii = 0; ii < 16;){
            #pragma HLS pipeline II=1
            #pragma HLS dependence variable=X type=inter false
            if(!fifo_reduce_recv.empty()){
              ap_uint<512> tmp_recv; fifo_reduce_recv.try_read(tmp_recv);
              for(int k = 0; k < 16; k++){
                #pragma HLS unroll
                acc_final[ii][k] += ap_int<22>(tmp_recv(k*32+21, k*32));
                X[i*16+ii][j*2+k/8]((k%8)*8+7, (k%8)*8) = ap_int<8>(acc_final[ii][k] >> 8); //TODO: change
              }

              ii++;
            }
          }
        }
      }
    }
  }
  fifo_fin.write(true);

  // write out for debug
  write:
  for(int i = 0; i < L; i++){
    for(int j = 0; j < D_div_8; j++){
      #pragma HLS pipeline II=1
      fifo_write.write(X[i][j]);
    }
  }
}
946
+
947
// acc1 worker node (instantiated once per non-master SLR).
// Same 4-stage schedule as temporal_acc1_slr0, but this worker has no local
// X buffer: activation packets arrive pre-broadcast on fifo_X_in and are
// forwarded on fifo_X_out. Stage-3 partial sums are accumulated with the
// value received on fifo_reduce_recv and passed along on fifo_reduce_send
// (the master node terminates the reduction chain).
void temporal_acc1(
  const int L,
  tapa::istream<int>& fifo_len_in,
  tapa::ostream<int>& fifo_len_out,
  tapa::istream<ap_uint<1024>>& fifo_X_in,
  tapa::ostream<ap_uint<1024>>& fifo_X_out, // 8-bit activation
  tapa::istream<ap_uint<512>>& fifo_W_in,
  tapa::ostream<ap_uint<512>>& fifo_W_out, // 4-bit weight
  tapa::ostream<ap_uint<128>>& fifo_to_acc0,
  tapa::istream<ap_uint<128>>& fifo_from_sfu,
  tapa::ostream<ap_uint<1024>>& fifo_O_out,
  tapa::istream<ap_uint<1024>>& fifo_context,
  tapa::istream<ap_uint<512>>& fifo_reduce_recv,
  tapa::ostream<ap_uint<512>>& fifo_reduce_send,
  tapa::ostream<bool>& fifo_fin
){

  // Stage-0 output (V tile), consumed again in stage 2.
  ap_uint<64> scratchpad[MAX_SEQ_LEN][D_head_div_8]; // 8 bit
  #pragma HLS array_partition variable=scratchpad cyclic dim=1 factor=16
  #pragma HLS array_partition variable=scratchpad cyclic dim=2 factor=2
  #pragma HLS bind_storage variable=scratchpad type=ram_2p impl=uram

  // ap_uint<64> scratchpad_out[MAX_SEQ_LEN][D_head_div_8];
  // #pragma HLS array_partition variable=scratchpad_out cyclic dim=1 factor=16
  // #pragma HLS array_partition variable=scratchpad_out cyclic dim=2 factor=2

  for(int stage_i = 0; stage_i < 20; stage_i++){

    // stage 0: WvX
    // stage 1: WkX1 -> acc0
    // stage 2: Softmax(QK)V <- acc0
    // stage 3: WoO

    // Weight tile for this round: eight 4-bit weights per 32-bit word.
    ap_uint<32> W[D_head][D_div_8]; // 4 bit
    #pragma HLS array_partition variable=W cyclic dim=1 factor=16

    // Row range [start, end) assigned to this round; forwarded downstream.
    const int start = fifo_len_in.read();
    const int end = fifo_len_in.read();
    fifo_len_out.write(start);
    fifo_len_out.write(end);

    const int stage = (stage_i < 15) ? (stage_i % 3) : 3;

    // load weights and forward
    if(stage != 2) {
      for(int i = 0; i < D_head_div_4; i++){
        load_weight:
        for(int j = 0; j < D_div_8;){
          if(!fifo_W_in.empty()){
            ap_uint<512> val; fifo_W_in.try_read(val);

            for(int k = 0; k < 4; k++){
              #pragma HLS unroll
              W[i*4+k][j] = ap_uint<32>(val(k*32+31, k*32));
            }
            // Peel off the low 128 bits, shift, forward the rest downstream.
            val = ap_uint<512>(val >> 128);
            fifo_W_out.write(val);
            j++;
          }
        }
      }
    }

    // Inner-product trip count and output-tile count depend on the stage.
    int k_bound = (stage == 2) ? (L >> 3) : D_div_8;
    k_bound = (stage == 3) ? D_head_div_8 : k_bound;
    int j_bound = (stage == 3) ? D_div_16 : D_head_div_16;

    for(int i = (start >> 4); i < (end >> 4); i++){ // make sure L is multiple of 4

      // Per-tile 4-bit attention weights gathered from the SFU (stage 2).
      ap_uint<32> cache_attn[MAX_SEQ_LEN_div_8][16];
      #pragma HLS array_partition variable=cache_attn dim=2 complete
      #pragma HLS array_partition variable=cache_attn dim=1 cyclic factor=2

      if(stage == 2){
        for(int ii = 0; ii < (L >> 3); ii++){
          ap_uint<32> fuse_reg[16];
          load_attn:
          for(int offset = 0; offset < 8;){
            #pragma HLS pipeline II=1
            if(!fifo_from_sfu.empty()){
              ap_uint<128> val; fifo_from_sfu.try_read(val);
              for(int k = 0; k < 16; k++){
                #pragma HLS unroll
                fuse_reg[k](offset*4+3, offset*4) = ap_int<4>(val(k*8+3, k*8));
              }
              offset++;
            }
          }
          for(int k = 0; k < 16; k++){
            #pragma HLS unroll
            cache_attn[ii][k] = fuse_reg[k];
          }
        }
      }

      for(int j = 0; j < j_bound; j++){

        // 38-bit accumulators; each carries TWO packed 19-bit partial sums.
        ap_int<38> acc_vec[8][16][8];
        #pragma HLS array_partition variable=acc_vec dim=1 complete
        #pragma HLS array_partition variable=acc_vec dim=2 complete
        #pragma HLS array_partition variable=acc_vec dim=3 complete

        for(int ii = 0; ii < 8; ii++){
          #pragma HLS unroll
          for(int kk = 0; kk < 16; kk++){
            #pragma HLS unroll
            for(int k = 0; k < 8; k++){
              #pragma HLS unroll
              acc_vec[ii][kk][k] = 0;
            }
          }
        }

        compute:
        for(int k = 0; k < k_bound; k++){
          #pragma HLS pipeline II=1

          ap_uint<64> op1_mtx[16];
          ap_uint<64> op2_mtx[16];
          #pragma HLS array_partition variable=op1_mtx complete
          #pragma HLS array_partition variable=op2_mtx complete

          ap_uint<1024> recv_pkt;

          if(stage == 3) {
            recv_pkt = fifo_context.read();
          } else if(stage != 2) {
            // Activation packet broadcast by the master; consume and relay.
            recv_pkt = fifo_X_in.read();
            fifo_X_out.write(recv_pkt);
          }

          for(int ii = 0; ii < 16; ii++){ //TODO: change logic
            #pragma HLS unroll
            if(stage == 3){
              op1_mtx[ii] = ap_uint<64>(W[k*8+ii%8][j*2+ii/8]);
              op2_mtx[ii] = recv_pkt(ii*64+63, ii*64);
            } else if(stage != 2) {
              op1_mtx[ii] = ap_uint<64>(W[j*16+ii][k]);
              op2_mtx[ii] = recv_pkt(ii*64+63, ii*64);
            } else {
              op1_mtx[ii] = ap_uint<64>(cache_attn[k][ii]);
              op2_mtx[ii] = scratchpad[k*8+ii/2][j*2+(ii%2)];
            }
          }

          // Packed MAC: two signed 4-bit weights share one multiplier via
          // w_pack = (op2 << 19) + op1; the 38-bit accumulator holds both
          // partial products.
          for(int ii = 0; ii < 8; ii++){
            #pragma HLS unroll
            for(int kk = 0; kk < 16; kk++){
              #pragma HLS unroll
              for(int l = 0; l < 8; l++){
                #pragma HLS unroll
                ap_int<8> op1; ap_int<8> op2; ap_int<8> op3;
                op3 = ap_int<8>(op2_mtx[kk](ii*8+7, ii*8));
                op1 = ap_int<4>(op1_mtx[l*2](ii*4+3, ii*4));
                op2 = ap_int<4>(op1_mtx[l*2+1](ii*4+3, ii*4));
                ap_int<27> w_pack = ap_int<27>((op2, ap_uint<19>(0))) + op1;
                acc_vec[ii][kk][l] += w_pack * op3;
              }
            }
          }
        }

        ap_int<22> acc_final[16][16];
        #pragma HLS array_partition variable=acc_final dim=1 complete
        #pragma HLS array_partition variable=acc_final dim=2 complete

        for(int ii = 0; ii < 16; ii++){
          #pragma HLS unroll
          for(int k = 0; k < 16; k++){
            #pragma HLS unroll
            acc_final[ii][k] = 0;
          }
        }

        // Unpack the two 19-bit halves (res1 gets a carry correction from
        // res0's sign bit); rescale by >> 8 on the last step except in
        // stage 3, which is rescaled after the cross-SLR reduction.
        reduction:
        for(int kk = 0; kk < 8; kk++){
          for(int ii = 0; ii < 16; ii++){
            #pragma HLS unroll
            for(int k = 0; k < 8; k++){
              #pragma HLS unroll
              ap_int<19> res0; ap_int<19> res1;
              (res1, res0) = acc_vec[kk][ii][k];
              res1 = res1 + res0[18];
              acc_final[ii][k*2] += res0;
              acc_final[ii][k*2+1] += res1;
              if(kk == 7 && stage != 3) {
                acc_final[ii][k*2] = acc_final[ii][k*2] >> 8; // rescale & clamp
                acc_final[ii][k*2+1] = acc_final[ii][k*2+1] >> 8; // rescale & clamp
              }
            }
          }
        }

        if(stage == 0){
          // Keep the V tile local for stage 2 (transposed write).
          for(int ii = 0; ii < 16; ii++){
            #pragma HLS unroll
            for(int k = 0; k < 16; k++){
              #pragma HLS unroll
              int offset = k%8;
              scratchpad[i*16+ii][j*2+k/8](offset*8+7, offset*8) = ap_int<8>(acc_final[k][ii]);
            }
          }
        } else if (stage == 2){
          // Emit the context tile (128 bytes per 1024-bit packet).
          for(int ii = 0; ii < 2; ii++){
            #pragma HLS pipeline II=1
            ap_uint<1024> tmp;
            for(int jj = 0; jj < 8; jj++){
              #pragma HLS unroll
              for(int k = 0; k < 16; k++){
                #pragma HLS unroll
                tmp((jj*16+k)*8+7, (jj*16+k)*8) = ap_int<8>(acc_final[k][ii*8+jj]);
              }
            }
            fifo_O_out.write(tmp);
          }
        } else if (stage == 1){
          // Send the K tile to the acc0 chain for the QK product.
          for(int ii = 0; ii < 16; ii++){
            ap_uint<128> tmp;
            for(int k = 0; k < 16; k++){
              #pragma HLS unroll
              tmp(k*8+7, k*8) = ap_int<8>(acc_final[ii][k]);
            }
            fifo_to_acc0.write(tmp);
          }
        } else {
          // stage 3: fold the received partial sums into this node's
          // accumulators and forward the running sums along the chain.
          final_acc:
          for(int ii = 0; ii < 16;){
            #pragma HLS pipeline II=1
            if(!fifo_reduce_recv.empty()){
              ap_uint<512> tmp_recv; fifo_reduce_recv.try_read(tmp_recv);
              ap_uint<512> tmp;
              for(int k = 0; k < 16; k++){
                #pragma HLS unroll
                acc_final[ii][k] += ap_int<22>(tmp_recv(k*32+21, k*32));
                tmp(k*32+21, k*32) = acc_final[ii][k];
              }
              fifo_reduce_send.write(tmp);
              ii++;
            }
          }
        }
      }
    }
  }
  fifo_fin.write(true);

  // write out for debug
  // write:
  // for(int i = 0; i < L; i++){
  //   for(int j = 0; j < D_head_div_8; j++){
  //     #pragma HLS pipeline II=1
  //     fifo_O_out.write(scratchpad_out[i][j]);
  //   }
  // }
}
1202
+
1203
// SFU accumulation buffer (double buffering: two instances per SLR, selected
// upstream/downstream via fifo index l%2).
// Per 32-row tile: caches L incoming float vectors and accumulates their
// per-lane sum (partitioned 8 ways to break the FP-add dependence; see the
// dependence pragmas with distance=8), then emits one packet of lane sums
// followed by the L cached vectors.
void sfu_buffer( // double buffering
  const int L,
  tapa::istream<ap_uint<512>>& fifo_data_in,
  tapa::ostream<ap_uint<512>>& fifo_data_out
){
  for(int stage = 0; stage < 5; stage++){

    for(int l = 0; l < (L >> 5); l++){
      // sum[i%8][k]: 8-way partial sums per lane; cache: raw float replay.
      float sum[8][16];
      float cache[MAX_SEQ_LEN][16];
      #pragma HLS array_partition variable=cache dim=2 complete
      #pragma HLS array_partition variable=sum dim=2 complete

      for(int i = 0; i < 8; i++){
        for(int j = 0; j < 16; j++){
          #pragma HLS unroll
          sum[i][j] = 0.0;
        }
      }

      // Accumulate with a carried dependence of distance 8 (one partial sum
      // per i%8 bank) so the FP adder pipelines at II=1.
      acc:
      for(int i = 0; i < L; i++){
        #pragma HLS pipeline II=1
        #pragma HLS dependence false variable=sum
        #pragma HLS dependence true variable=sum distance=8
        ap_uint<512> tmp = fifo_data_in.read();
        for(int k = 0; k < 16; k++){
          #pragma HLS unroll
          float res = tapa::bit_cast<float>(ap_int<32>(tmp(k*32+31, k*32)));
          sum[i%8][k] += res;
          cache[i][k] = res;
        }
      }

      // Fold the 8 partial-sum banks into sum[0].
      reduce:
      for(int i = 1; i < 8; i++){
        for(int j = 0; j < 8; j++){
          #pragma HLS pipeline II=1
          #pragma HLS dependence true variable=sum distance=8
          for(int k = 0; k < 2; k++){
            sum[0][j*2+k] += sum[i][j*2+k];
          }
        }
      }

      // Emit the 16 lane sums first, then replay the cached vectors.
      ap_uint<512> tmp;
      for(int i = 0; i < 16; i++){
        #pragma HLS unroll
        tmp(i*32+31, i*32) = tapa::bit_cast<ap_uint<32>>(sum[0][i]);
      }
      fifo_data_out.write(tmp);

      write:
      for(int i = 0; i < L; i++){
        #pragma HLS pipeline II=1
        ap_uint<512> tmp;
        for(int j = 0; j < 16; j++){
          #pragma HLS unroll
          tmp(j*32+31, j*32) = tapa::bit_cast<ap_uint<32>>(cache[i][j]);
        }
        fifo_data_out.write(tmp);
      }

    }
  }

}
1270
+
1271
// SFU front end: exponentiates incoming fixed-point scores and alternates
// output tiles between the two sfu_buffer instances (fifo_buf[l%2]).
// Each 512-bit word carries 16 lanes; every lane is right-shifted by 10
// (descale of the fixed-point score) before hls::exp.
void sfu_acc_exp(
  const int L,
  tapa::istream<ap_uint<512>>& fifo_data_in,
  tapa::ostreams<ap_uint<512>, 2>& fifo_buf
) {
  for(int stage = 0; stage < 5; stage++){

    for(int l = 0; l < (L >> 4); l++){
      exp_acc:
      for(int i = 0; i < L;){
        #pragma HLS pipeline II=1
        // Non-blocking read keeps the loop flushable at II=1.
        if(!fifo_data_in.empty()){
          ap_uint<512> tmp; fifo_data_in.try_read(tmp);
          ap_uint<512> tmp_o;
          for(int k = 0; k < 16; k++){
            #pragma HLS unroll
            int res = tapa::bit_cast<int>(ap_int<32>(tmp(k*32+31, k*32)));
            float res_exp = 0.0;
            res_exp = hls::exp(ap_int<32>(res >> 10));
            tmp_o(k*32+31, k*32) = tapa::bit_cast<ap_uint<32>>(res_exp);
          }
          fifo_buf[l%2].write(tmp_o);
          i++;
        }
      }
    }
  }
}
1299
+
1300
// SFU back end: normalizes exponentiated scores into 4-bit-scaled 8-bit
// attention weights. The first packet from sfu_buffer holds the 16 lane
// sums; each weight is computed as exp(x) * (32 / lane_sum) and truncated
// to ap_int<8> (the low nibble is what the accumulators consume).
void sfu_norm(
  const int L,
  tapa::istreams<ap_uint<512>, 2>& fifo_buf,
  tapa::ostream<ap_uint<128>>& fifo_data_out
){
  for(int stage = 0; stage < 5; stage++){

    for(int l = 0; l < (L >> 4); l++){
      float sum[16];
      #pragma HLS array_partition variable=sum complete

      // Header packet: per-lane totals produced by sfu_buffer.
      ap_uint<512> tmp_in = fifo_buf[l%2].read();

      for(int i = 0; i < 16; i++){
        #pragma HLS unroll
        // 32.0 scales the normalized weight into the quantized range.
        sum[i] = 32.0 / tapa::bit_cast<float>(ap_uint<32>(tmp_in(i*32+31, i*32)));
      }

      for(int i = 0; i < L;){
        #pragma HLS pipeline II=1
        if(!fifo_buf[l%2].empty()){
          ap_uint<512> tmp_cache; fifo_buf[l%2].try_read(tmp_cache);
          ap_uint<128> tmp;
          for(int j = 0; j < 16; j++){
            #pragma HLS unroll
            ap_int<8> res = (int) (tapa::bit_cast<float>(ap_uint<32>(tmp_cache(j*32+31, j*32))) * sum[j]);
            tmp(j*8 + 7, j*8) = res;
          }
          fifo_data_out.write(tmp);
          i++;
        }
      }
    }
  }
}
1335
+
1336
// Buffers the per-head context tiles produced by acc1 (stage 2) across all
// 5 rounds, then replays them as 16-row operand packets to both accelerator
// chains: even 16-row halves of each 32-row tile go to acc0, odd halves to
// acc1.
void context_buffer(
  const int L,
  tapa::istream<ap_uint<1024>>& fifo_context,
  tapa::ostream<ap_uint<1024>>& fifo_to_acc0,
  tapa::ostream<ap_uint<1024>>& fifo_to_acc1
){
  ap_uint<64> context[MAX_SEQ_LEN][CONTEXT_D];
  #pragma HLS array_partition variable=context cyclic dim=1 factor=32
  #pragma HLS bind_storage variable=context type=ram_2p impl=uram

  // Fill: round `stage` lands in columns [stage*D_head_div_8, (stage+1)*...).
  for(int stage = 0; stage < 5; stage++){
    for(int i = 0; i < (L >> 4); i++){
      for(int j = stage * D_head_div_8; j < (stage + 1) * D_head_div_8;){
        if(!fifo_context.empty()){
          ap_uint<1024> tmp; fifo_context.try_read(tmp);
          for(int ii = 0; ii < 16; ii++){
            #pragma HLS unroll
            context[i*16+ii][j] = tmp(ii*64+63, ii*64);
          }
          j++;
        }
      }
    }
  }

  // NOTE: change it to write to HBM for debugging
  // write ops to acc0 and acc1 in parallel
  // NOTE(review): the replay below indexes column `j` in [0, D_head_div_8)
  // regardless of `stage`, i.e. only the first round's columns are re-read
  // on every stage iteration — confirm this is the intended replay pattern.
  for(int stage = 0; stage < 5; stage++){
    for(int i = 0; i < (L >> 5); i++){
      for(int l = 0; l < D_div_16; l++){
        for(int j = 0; j < D_head_div_8; j++){
          ap_uint<1024> tmp_acc0;
          ap_uint<1024> tmp_acc1;
          for(int k = 0; k < 16; k++){
            #pragma HLS unroll
            tmp_acc0(k*64+63, k*64) = context[i*32+k][j];
            tmp_acc1(k*64+63, k*64) = context[i*32+16+k][j];
          }
          fifo_to_acc0.write(tmp_acc0);
          fifo_to_acc1.write(tmp_acc1);
        }
      }
    }
  }

}
1382
+
1383
+ void measure_cycle(tapa::istreams<bool, TOTAL_PORT>& fifo_fin, tapa::mmap<int> cycle_count){
1384
+ for(int cycle = 0;;cycle++){
1385
+ bool flag_cont = false;
1386
+ for(int i = 0; i < TOTAL_PORT; i++){
1387
+ flag_cont |= fifo_fin[i].empty();
1388
+ }
1389
+ if(!flag_cont){
1390
+ for(int i = 0; i < TOTAL_PORT; i++){
1391
+ fifo_fin[i].read(nullptr);
1392
+ }
1393
+ cycle_count[0] = cycle;
1394
+ break;
1395
+ }
1396
+ }
1397
+ }
1398
+
1399
// Top-level TAPA task graph for the accelerator.
// Two accelerator chains (acc0/acc1) span NUM_SLR SLRs; the *_slr0 variants
// are the master nodes that own DRAM-facing X buffers and terminate the
// stage-3 reduction chains. The SFU path (sfu_acc_exp -> sfu_buffer x2 ->
// sfu_norm) implements the softmax between the chains, and context_buffer
// stages the attention output for the Wo projection.
void opt_kernel(
  const int L,
  const int L_out,
  const int seq_len,
  // tapa::mmap<int> inst, // inst[0] = L, inst[1] = reload_weight
  tapa::mmap<ap_uint<512>> X_acc0,
  tapa::mmap<ap_uint<512>> X_acc1,
  tapa::mmap<ap_uint<512>> W_acc0,
  tapa::mmap<ap_uint<512>> W_acc1,
  tapa::mmap<ap_uint<64>> acc0_out,
  tapa::mmap<ap_uint<64>> acc1_out,
  tapa::mmap<int> cycle_count
){
  // Daisy-chained instruction/length streams (one extra endpoint for the
  // terminating black_hole detach tasks).
  tapa::streams<int, NUM_SLR+1, 4> fifo_inst_acc0("fifo_inst_acc0");
  tapa::streams<int, NUM_SLR+1, 4> fifo_inst_acc1("fifo_inst_acc1");
  // DRAM -> master-node activation streams.
  tapa::stream<ap_uint<512>, 16> fifo_X_acc0_slr0("fifo_X_acc0_slr0");
  tapa::stream<ap_uint<512>, 16> fifo_X_acc1_slr0("fifo_X_acc1_slr0");
  // Master -> worker activation broadcast chains.
  tapa::streams<ap_uint<1024>, NUM_SLR, 4> fifo_X_acc0("fifo_X_acc0");
  tapa::streams<ap_uint<1024>, NUM_SLR, 4> fifo_X_acc1("fifo_X_acc1");
  // Weight distribution chains (each node peels off its 128-bit slice).
  tapa::streams<ap_uint<512>, NUM_SLR+1, 8> fifo_W_acc0("fifo_W_acc0");
  tapa::streams<ap_uint<512>, NUM_SLR+1, 8> fifo_W_acc1("fifo_W_acc1");
  // tapa::streams<ap_uint<512>, NUM_SLR, 4> fifo_acc0_out("fifo_acc0_out");
  tapa::streams<ap_uint<512>, NUM_SLR> fifo_acc0_to_sfu("fifo_acc0_to_sfu");
  tapa::streams<ap_uint<512>, NUM_SLR*2> fifo_sfu_buf_in("fifo_sfu_buf_in");
  tapa::streams<ap_uint<512>, NUM_SLR*2> fifo_sfu_buf_out("fifo_sfu_buf_out");
  // tapa::streams<ap_uint<64>, NUM_SLR> fifo_acc1_out("fifo_acc1_out");
  tapa::streams<ap_uint<128>, NUM_SLR, 2> fifo_from_acc1_to_acc0("fifo_from_acc1_to_acc0");
  tapa::streams<ap_uint<128>, NUM_SLR, 2> fifo_from_sfu_to_acc1("fifo_from_sfu_to_acc1");
  tapa::streams<bool, NUM_SLR*2> fifo_fin("fifo_fin");

  // Attention-output (context) staging and stage-3 reduction chains.
  tapa::streams<ap_uint<1024>, NUM_SLR> fifo_context("fifo_context");
  tapa::streams<ap_uint<1024>, NUM_SLR> fifo_cont_to_acc0("fifo_cont_to_acc0");
  tapa::streams<ap_uint<1024>, NUM_SLR> fifo_cont_to_acc1("fifo_cont_to_acc1");
  tapa::streams<ap_uint<512>, NUM_SLR> fifo_reduce_acc0("fifo_reduce_acc0");
  tapa::streams<ap_uint<512>, NUM_SLR> fifo_reduce_acc1("fifo_reduce_acc1");

  // Debug write-back streams from the master nodes.
  tapa::stream<ap_uint<64>> fifo_acc0_out("fifo_acc0_out");
  tapa::stream<ap_uint<64>> fifo_acc1_out("fifo_acc1_out");

  tapa::task()
    .invoke<tapa::join>(read_inst, seq_len, fifo_inst_acc0, fifo_inst_acc1)
    .invoke<tapa::join>(read_W, TOTAL_WEIGHT_SIZE, W_acc0, fifo_W_acc0)
    .invoke<tapa::join>(read_W, TOTAL_WEIGHT_SIZE, W_acc1, fifo_W_acc1)
    .invoke<tapa::join>(read_X, L, X_acc0, fifo_X_acc0_slr0)
    .invoke<tapa::join>(read_X, L, X_acc1, fifo_X_acc1_slr0)
    .invoke<tapa::join>(
      temporal_acc0_slr0,
      seq_len,
      fifo_inst_acc0, fifo_inst_acc0,
      fifo_X_acc0_slr0, fifo_X_acc0,
      fifo_W_acc0, fifo_W_acc0,
      fifo_from_acc1_to_acc0,
      fifo_acc0_to_sfu,
      fifo_cont_to_acc0,
      fifo_reduce_acc0,
      fifo_acc0_out,
      fifo_fin
    )
    .invoke<tapa::join>(
      temporal_acc1_slr0,
      seq_len,
      fifo_inst_acc1, fifo_inst_acc1,
      fifo_X_acc1_slr0, fifo_X_acc1,
      fifo_W_acc1, fifo_W_acc1,
      fifo_from_acc1_to_acc0,
      fifo_from_sfu_to_acc1,
      fifo_context,
      fifo_cont_to_acc1,
      fifo_reduce_acc1,
      fifo_acc1_out,
      fifo_fin
    )
    .invoke<tapa::join, NUM_SLR-1>(
      temporal_acc0,
      seq_len,
      fifo_inst_acc0, fifo_inst_acc0,
      fifo_X_acc0, fifo_X_acc0,
      fifo_W_acc0, fifo_W_acc0,
      fifo_from_acc1_to_acc0,
      fifo_acc0_to_sfu,
      fifo_cont_to_acc0,
      fifo_reduce_acc0, fifo_reduce_acc0,
      fifo_fin
    )
    .invoke<tapa::join, NUM_SLR-1>(
      temporal_acc1,
      seq_len,
      fifo_inst_acc1, fifo_inst_acc1,
      fifo_X_acc1, fifo_X_acc1,
      fifo_W_acc1, fifo_W_acc1,
      fifo_from_acc1_to_acc0,
      fifo_from_sfu_to_acc1,
      fifo_context,
      fifo_cont_to_acc1,
      fifo_reduce_acc1, fifo_reduce_acc1,
      fifo_fin
    )
    // Seed the stage-3 reduction chains with zeros.
    .invoke<tapa::join>(write_zero, seq_len, fifo_reduce_acc0)
    .invoke<tapa::join>(write_zero, seq_len, fifo_reduce_acc1)
    .invoke<tapa::join, NUM_SLR>(
      sfu_acc_exp, seq_len,
      fifo_acc0_to_sfu,
      fifo_sfu_buf_in
    )
    .invoke<tapa::join, NUM_SLR*2>(
      sfu_buffer, seq_len,
      fifo_sfu_buf_in,
      fifo_sfu_buf_out
    )
    .invoke<tapa::join, NUM_SLR>(
      sfu_norm, seq_len,
      fifo_sfu_buf_out,
      fifo_from_sfu_to_acc1
    )
    .invoke<tapa::join, NUM_SLR>(
      context_buffer, seq_len,
      fifo_context,
      fifo_cont_to_acc0, fifo_cont_to_acc1
    )
    // .invoke<tapa::join, NUM_SLR>(write_attention, seq_len, acc0_out, fifo_acc0_out)
    .invoke<tapa::join>(write_mtx, L_out, acc0_out, fifo_acc0_out)
    .invoke<tapa::join>(write_mtx, L_out, acc1_out, fifo_acc1_out)
    .invoke<tapa::join>(measure_cycle, fifo_fin, cycle_count)
    // Terminate the open chain tails so nothing backpressures.
    .invoke<tapa::detach>(black_hole_int, fifo_inst_acc0)
    .invoke<tapa::detach>(black_hole_int, fifo_inst_acc1)
    .invoke<tapa::detach>(black_hole_ap_uint_1024, fifo_X_acc0)
    .invoke<tapa::detach>(black_hole_ap_uint_1024, fifo_X_acc1)
    .invoke<tapa::detach>(black_hole_ap_uint_512, fifo_W_acc0)
    .invoke<tapa::detach>(black_hole_ap_uint_512, fifo_W_acc1);
}
gpt-2-medium/link_config_versal.ini ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ [connectivity]
2
+ sp=opt_kernel.X_acc0:DDR
3
+ sp=opt_kernel.X_acc1:DDR
4
+ sp=opt_kernel.W_acc0:DDR
5
+ sp=opt_kernel.W_acc1:DDR
6
+ sp=opt_kernel.acc0_out:DDR
7
+ sp=opt_kernel.cycle_count:DDR
gpt-2-medium/opt-versal-rs.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from rapidstream import RapidStreamTAPA, DeviceFactory

# RapidStream DSE driver for the opt_kernel TAPA design on a Versal part,
# partitioned into a 2-column x 4-row grid of virtual slots (half an SLR
# worth of clock regions per slot).
rs = RapidStreamTAPA("rs_build/")
rs.reset()
factory = DeviceFactory(
    row=4,
    col=2,
    part_num="xcvp1802-lsvc4072-2MP-e-S"
)

# Set the pblocks of the device so that each slot contains half of an SLR:
factory.set_slot_pblock(0, 0, ["-add CLOCKREGION_X0Y1:CLOCKREGION_X4Y4"])
factory.set_slot_pblock(1, 0, ["-add CLOCKREGION_X5Y1:CLOCKREGION_X9Y4"])
factory.set_slot_pblock(0, 1, ["-add CLOCKREGION_X0Y5:CLOCKREGION_X4Y7"])
factory.set_slot_pblock(1, 1, ["-add CLOCKREGION_X5Y5:CLOCKREGION_X9Y7"])

factory.set_slot_pblock(0, 2, ["-add CLOCKREGION_X0Y8:CLOCKREGION_X4Y10"])
factory.set_slot_pblock(1, 2, ["-add CLOCKREGION_X5Y8:CLOCKREGION_X9Y10"])
factory.set_slot_pblock(0, 3, ["-add CLOCKREGION_X0Y11:CLOCKREGION_X4Y13"])
factory.set_slot_pblock(1, 3, ["-add CLOCKREGION_X5Y11:CLOCKREGION_X9Y13"])

# There are 18870 total SLL nodes for VP1552; each slot boundary is given
# half of that budget.
# NOTE(review): the SLL figure is quoted for VP1552, but part_num above is
# xcvp1802 — confirm the per-boundary capacity for this part.
factory.set_slot_crossing_capacity(0, 0, north=9435)
factory.set_slot_crossing_capacity(1, 0, north=9435)
factory.set_slot_crossing_capacity(0, 1, north=9435)
factory.set_slot_crossing_capacity(1, 1, north=9435)
factory.set_slot_crossing_capacity(0, 2, north=9435)
factory.set_slot_crossing_capacity(1, 2, north=9435)

# Call factory to extract the slot resources automatically from Vivado:
factory.extract_slot_resources()

# The device can be supplied as the virtual device for the RapidStream APIs:
device = factory.generate_virtual_device()
rs.set_virtual_device(device)

rs.add_xo_file("./opt-stage4-dot-prod.tapa/opt.hw.xo")
rs.set_top_module_name("opt_kernel")
rs.add_clock("ap_clk", period_ns=3.33)

rs.set_vitis_connectivity_config("link_config_versal.ini")
# Pin all external ports to the bottom row of slots.
rs.assign_port_to_region(".*", "SLOT_X0Y0:SLOT_X1Y0")
rs.run_dse(max_workers=1, max_dse_limit=0.9, min_dse_limit=0.6)
gpt-2-medium/package_sample.sh ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Vitis package step for the opt_kernel vpk180 (Versal) build: wraps the
# linked .xsa together with the emulation config, host binary, and run
# scripts into an SD-card image plus the final .xclbin.
# TARGET=hw
TARGET=hw_emu
DEBUG=-g

TOP=opt_kernel
XO='/path/to/opt_kernel.xo'
CONSTRAINT='/path/to/constraints.tcl'
>&2 echo "Using the default clock target of the platform."
PLATFORM="/path/to/vpk180_pfm_vitis.xpfm"
VERSAL="/path/to/xilinx-versal-common-v2023.2"
TARGET_FREQUENCY=300000000
# Quote the expansion so an empty/unset PLATFORM cannot misparse the test,
# and exit non-zero so callers can detect the configuration failure.
if [ -z "$PLATFORM" ]; then echo Please edit this file and set a valid PLATFORM= on line "${LINENO}"; exit 1; fi

OUTPUT_DIR="$(pwd)/vitis_run_${TARGET}_ln"

# Build knobs shared with the other flow scripts (not all used here).
MAX_SYNTH_JOBS=16
STRATEGY="Default"
PLACEMENT_STRATEGY="Default"

# Emit emconfig.json next to the outputs (required for hw_emu runs).
emconfigutil --platform ${PLATFORM} --od "${OUTPUT_DIR}/"

v++ ${DEBUG}\
  --platform ${PLATFORM} \
  --target ${TARGET} \
  --package \
  "${OUTPUT_DIR}/${TOP}_vpk180.xsa" \
  --temp_dir "${OUTPUT_DIR}/${TOP}_vpk180.temp/package.build" \
  --save-temps \
  --package.out_dir "${OUTPUT_DIR}/package" \
  --package.boot_mode sd \
  --package.rootfs "${VERSAL}/rootfs.ext4" \
  --package.kernel_image "${VERSAL}/Image" \
  --package.sd_file "${OUTPUT_DIR}/emconfig.json" \
  --package.sd_file "./host-opencl" \
  --package.sd_file "./run_app.sh" \
  --package.sd_file "./xrt.ini" \
  -o "${OUTPUT_DIR}/${TOP}_vpk180.xclbin"
gpt-2-medium/parse_floorplan.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
from enum import Enum, auto
from typing import Any
from argparse import ArgumentParser

# CLI: the only input is the RapidStream floorplan/IR JSON file to parse.
parser = ArgumentParser()
parser.add_argument("-f", "--file", dest="filename", type=str,
                    help="input floorplan json file", metavar="FILE")
9
+
10
class IREnum(Enum):
    """Enums to parse Rapidstream NOC IR."""

    # Pipeline module name and its region/geometry parameter names.
    PIPELINE = "__rs_hs_pipeline"
    REGION = "REGION"
    BODY = "BODY"
    HEAD_REGION = "__HEAD_REGION"
    TAIL_REGION = "__TAIL_REGION"
    DATA_WIDTH = "DATA_WIDTH"
    DEPTH = "DEPTH"
    BODY_LEVEL = "BODY_LEVEL"
    # FIFO-style handshake interface port-name fragments (read side / write side).
    IF_DOUT = "if_dout"
    IF_EMPTY_N = "if_empty_n"
    IF_READ = "if_read"
    IF_DIN = "if_din"
    IF_FULL_N = "if_full_n"
    IF_WRITE = "if_write"
    # NoC master/slave unit instance-name prefixes.
    NMU = "nmu_"
    NSU = "nsu_"
    # Clock-converter instance-name suffixes.
    CC_MASTER = "_cc_master"
    CC_RET = "_cc_ret"
    # Routing / floorplan attribute keys in the IR.
    RS_ROUTE = "RS_ROUTE"
    FLOORPLAN_REGION = "floorplan_region"
    PRAGMAS = "pragmas"
    LIT = "lit"
35
+
36
# Maps RapidStream pipeline module names to the short tag used when naming
# the per-register pipeline instances (RS_<tag>_PP_HEAD/TAIL/BODY_i) below.
PIPELINE_MAPPING = {
    "__rs_ap_ctrl_start_ready_pipeline": "AP",
    "__rs_ff_pipeline": "FF",
    "__rs_hs_pipeline": "HS",
}
41
+
42
def parse_top_mod(ir: dict[str, Any]) -> Any:
    """Locate the top module's definition in the Rapidstream IR.

    Returns the matching module-definition dictionary; raises
    AssertionError when no definition carries the recorded top name.

    Example:
    >>> design = {
    ...     "modules": {
    ...         "top_name": "FINDME",
    ...         "module_definitions": [{"name": "FINDME"}],
    ...     }
    ... }
    >>> parse_top_mod(design)
    {'name': 'FINDME'}
    """
    modules = ir["modules"]
    wanted = modules["top_name"]
    hit = next(
        (mod for mod in modules["module_definitions"] if mod["name"] == wanted),
        None,
    )
    if hit is None:
        raise AssertionError()
    return hit
62
+
63
def parse_mod(ir: dict[str, Any], name: str) -> Any:
    """Fetch a module definition by name from the Rapidstream IR.

    Returns the matching definition dictionary, or an empty dict when
    the name is unknown.
    """
    definitions = ir["modules"]["module_definitions"]
    return next((mod for mod in definitions if mod["name"] == name), {})
72
+
73
def find_repr(source: list[dict[str, Any]], key: str) -> str:
    """Return the `repr` of the first expr entry attached to `key`.

    Falls back to "" (after printing a warning) when `key` yields no
    expr entries.
    """
    entries = find_expr(source, key)
    if entries:
        return str(entries[0]["repr"])
    print(f"WARNING: repr for key {key} not found!")
    return ""
82
+
83
+ def find_expr(
84
+ source: list[dict[str, Any | list[dict[str, str]]]], key: str
85
+ ) -> list[dict[str, str]]:
86
+ """Finds the expr value of a key in the Rapidstream list IR.
87
+
88
+ Returns a string.
89
+ """
90
+ for c in source:
91
+ if c["name"] == key:
92
+ return c["expr"]
93
+ print(f"WARNING: expr for key {key} not found!")
94
+ return []
95
+
96
def parse_floorplan(ir: dict[str, Any], grouped_mod_name: str) -> dict[str, list[str]]:
    """Parses the top module and grouped module's floorplan regions.

    Walks the submodules of the top module (and, when present, of the named
    grouped module) and records each instance's slot. Pipeline modules have
    no single region: their HEAD/TAIL/BODY registers are resolved
    individually from the module parameters.

    Return a dictionary where keys are slots and values are submodules.
    """
    combined_mods = {
        # top: instances at the top level keep their bare name (empty prefix)
        "": parse_top_mod(ir)["submodules"],
    }
    # parse_mod returns {} (falsy) when grouped_mod_name is absent, so the
    # grouped module is only added when it actually exists in the IR.
    if grouped_mod_ir := parse_mod(ir, grouped_mod_name):
        # grouped module: its children are prefixed with "<name>_0/"
        combined_mods[f"{grouped_mod_name}_0/"] = grouped_mod_ir["submodules"]

    # instance full name -> slot (floorplan region) string
    insts = {}
    for parent, mods in combined_mods.items():
        for sub_mod in mods:
            sub_mod_name = parent + sub_mod["name"]
            if sub_mod["floorplan_region"] is not None:
                # regular module: region is given directly
                insts[sub_mod_name] = sub_mod["floorplan_region"]
            elif sub_mod["module"] in PIPELINE_MAPPING:
                # pipeline module, needs to extract slot of each reg
                mapped_name = PIPELINE_MAPPING[sub_mod["module"]]
                body_level = find_repr(sub_mod["parameters"], IREnum.BODY_LEVEL.value)
                # region parameter reprs are quoted strings, hence strip('"')
                insts[f"{sub_mod_name}/RS_{mapped_name}_PP_HEAD"] = find_repr(
                    sub_mod["parameters"], IREnum.HEAD_REGION.value
                ).strip('"')
                insts[f"{sub_mod_name}/RS_{mapped_name}_PP_TAIL"] = find_repr(
                    sub_mod["parameters"], IREnum.TAIL_REGION.value
                ).strip('"')
                # one BODY register per pipeline level
                for i in range(int(body_level)):
                    insts[f"{sub_mod_name}/RS_{mapped_name}_PP_BODY_{i}"] = find_repr(
                        sub_mod["parameters"], f"__BODY_{i}_REGION"
                    ).strip('"')

    # convert {instance: slot} to {slot: [instances]}
    floorplan: dict[str, list[str]] = {}
    for sub_mod_name, slot in insts.items():
        assert slot is not None, f"{sub_mod_name} cannot have null slot!"
        if slot not in floorplan:
            floorplan[slot] = []
        floorplan[slot].append(sub_mod_name)
    return floorplan
139
+
140
+
141
def extract_slot_coord(slot_name: str) -> tuple[int, int]:
    """Extracts the x and y coordinates from the slot name.

    Returns a coordinate tuple as (x, y) in int.

    Example:
    >>> extract_slot_coord("SLOT_X0Y1")
    (0, 1)
    """
    x_text = slot_name.split("X")[1].split("Y")[0]
    y_text = slot_name.split("Y")[1]
    return (int(x_text), int(y_text))
151
+
152
def export_constraint(floorplan: dict[str, list[str]], kernel_name: str) -> list[str]:
    """Generates tcl constraints given the floorplan dictionary.

    First emits one create_pblock/resize_pblock pair per slot, then one
    add_cells_to_pblock section per slot that also records regex patterns
    matching no cells, reported at the end of the script.

    Returns a list of tcl commands.
    """
    tcl = [
        """

# Initialize an empty list to store undefined cells
set undefined_cells {}
"""
    ]

    # cr_map[x][y]: clock-region range covered by slot (x, y) on the device.
    cr_map = [
        ["CLOCKREGION_X0Y1:CLOCKREGION_X4Y4", "CLOCKREGION_X0Y5:CLOCKREGION_X4Y7", "CLOCKREGION_X0Y8:CLOCKREGION_X4Y10", "CLOCKREGION_X0Y11:CLOCKREGION_X4Y13"],
        ["CLOCKREGION_X5Y1:CLOCKREGION_X9Y4", "CLOCKREGION_X5Y5:CLOCKREGION_X9Y7", "CLOCKREGION_X5Y8:CLOCKREGION_X9Y10", "CLOCKREGION_X5Y11:CLOCKREGION_X9Y13"]
    ]

    for slot in floorplan.keys():
        # Slot keys look like "SLOT_XaYb_TO_SLOT_XaYb"; only same-slot
        # ranges are supported here.
        slot1, slot2 = slot.split("_TO_")
        assert slot1 == slot2
        x, y = extract_slot_coord(slot1)
        cr = cr_map[x][y]
        tcl += [
            f"""
# begin defining a slot for logic resources
create_pblock {slot}
resize_pblock {slot} -add {cr}
"""
        ]

    for slot, _ in floorplan.items():
        # NOTE(review): this emits ".../inst/<slot>_0/.*" while the
        # checked-in xo/constraints.tcl uses ".../inst/<slot>.*" — confirm
        # which hierarchy the packaged kernel actually produces.
        tcl += [f"set {slot}_cells {{"]
        tcl += [f"    ext_platform_i/VitisRegion/{kernel_name}/inst/{slot}_0/.*"]
        tcl += [
            f"""}}
add_cells_to_pblock [get_pblocks {slot}] [get_cells -regex ${slot}_cells]

# Iterate through each cell in the list
foreach cell ${slot}_cells {{
    set defined [llength [get_cells $cell]]
    if {{ $defined == 0 }} {{
        lappend undefined_cells $cell
    }}
}}
"""
        ]

    tcl += [
        """
if {[llength $undefined_cells] > 0} {
    puts "Undefined cells:"
    foreach cell $undefined_cells {
        puts $cell
    }
}
"""
    ]

    return tcl
212
+
213
+ if __name__ == "__main__":
214
+ args = parser.parse_args()
215
+
216
+ with open(args.filename, "r", encoding="utf-8") as file:
217
+ ir = json.load(file)
218
+
219
+ pipeline_dict = parse_floorplan(ir, "")
220
+ tcl = export_constraint(pipeline_dict, "opt_kernel")
221
+
222
+ with open("constraints.tcl", "w", encoding="utf-8") as file:
223
+ file.write("\n".join(tcl))
gpt-2-medium/run_app.sh ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Launch the host application on the (emulated) Versal target.
# Packaged onto the SD image by package_sample.sh.

# Host binaries/libraries are staged under /mnt and /tmp on the target.
export LD_LIBRARY_PATH=/mnt:/tmp:$LD_LIBRARY_PATH
# Run against the hardware-emulation model, not real silicon.
export XCL_EMULATION_MODE=hw_emu
export XILINX_XRT=/usr
export XILINX_VITIS=/mnt

./host-opencl opt_kernel_vpk180.xclbin
gpt-2-medium/run_tapa.sh ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Compile kernel.cpp with TAPA for the Alveo U280 HBM platform, producing a
# packed XO plus a floorplan tcl. Flags of note:
#   --enable-hbm-binding-adjustment : let tapac re-bind HBM channel mapping
#   --enable-synth-util             : use post-synthesis utilization numbers
#   --run-floorplan-dse             : sweep the area/SLR-width limits below
# (comments cannot be interleaved inside the backslash-continued command)
tapac \
  -o opt.hw.xo \
  --platform xilinx_u280_xdma_201920_3 \
  --top opt_kernel \
  --work-dir opt-stage3.tapa \
  --connectivity hbm_config.ini \
  --enable-hbm-binding-adjustment \
  --enable-synth-util \
  --run-floorplan-dse \
  --min-area-limit 0.55 \
  --min-slr-width-limit 5000 \
  --max-slr-width-limit 19000 \
  --max-parallel-synth-jobs 16 \
  --floorplan-output opt-floorplan.tcl \
  kernel.cpp
gpt-2-medium/run_tapa_rs.sh ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Two-phase TAPA flow for the Versal part (xcvp1802), feeding RapidStream:
# phase 1 runs tapacc/HLS/floorplanning and generates the task + top RTL;
# phase 2 re-invokes tapac on the same work dir to pack the final XO.
ml load xilinx/vivado/2024.1
tapac \
  --work-dir opt-stage4-dot-prod.tapa \
  --top opt_kernel \
  --part-num xcvp1802-lsvc4072-2MP-e-S \
  --clock-period 3.33 \
  -o "opt-stage4-dot-prod.tapa/opt.hw.xo" \
  --connectivity link_config_versal.ini \
  --run-tapacc \
  --run-hls \
  --generate-task-rtl \
  --run-floorplanning \
  --generate-top-rtl \
  kernel-versal.cpp

# NOTE(review): the module is already loaded above; this second load is
# redundant but harmless.
ml load xilinx/vivado/2024.1
tapac \
  --work-dir opt-stage4-dot-prod.tapa \
  --top opt_kernel \
  --part-num xcvp1802-lsvc4072-2MP-e-S \
  --clock-period 3.33 \
  -o "opt-stage4-dot-prod.tapa/opt.hw.xo" \
  --connectivity link_config_versal.ini \
  --pack-xo \
  kernel-versal.cpp
gpt-2-medium/xo/constraints.tcl ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1

# Initialize an empty list to store undefined cells
set undefined_cells {}

# Floorplan data: slot pblock name followed by its clock-region range, kept
# in the original emission order.
set slot_to_region {
    SLOT_X0Y0_TO_SLOT_X0Y0 CLOCKREGION_X0Y1:CLOCKREGION_X4Y4
    SLOT_X0Y2_TO_SLOT_X0Y2 CLOCKREGION_X0Y8:CLOCKREGION_X4Y10
    SLOT_X1Y2_TO_SLOT_X1Y2 CLOCKREGION_X5Y8:CLOCKREGION_X9Y10
    SLOT_X0Y3_TO_SLOT_X0Y3 CLOCKREGION_X0Y11:CLOCKREGION_X4Y13
    SLOT_X1Y3_TO_SLOT_X1Y3 CLOCKREGION_X5Y11:CLOCKREGION_X9Y13
    SLOT_X1Y0_TO_SLOT_X1Y0 CLOCKREGION_X5Y1:CLOCKREGION_X9Y4
    SLOT_X1Y1_TO_SLOT_X1Y1 CLOCKREGION_X5Y5:CLOCKREGION_X9Y7
    SLOT_X0Y1_TO_SLOT_X0Y1 CLOCKREGION_X0Y5:CLOCKREGION_X4Y7
}

# begin defining a slot (pblock) for logic resources, one per slot
foreach {slot region} $slot_to_region {
    create_pblock $slot
    resize_pblock $slot -add $region
}

# Assign each slot's kernel cells to its pblock. Per-slot pattern lists are
# kept in ${slot}_cells variables (as in the expanded form of this script);
# patterns matching no cells are collected for the report below.
foreach {slot region} $slot_to_region {
    set ${slot}_cells [list "ext_platform_i/VitisRegion/opt_kernel/inst/${slot}.*"]
    add_cells_to_pblock [get_pblocks $slot] [get_cells -regex [set ${slot}_cells]]

    # Iterate through each cell pattern in the list
    foreach cell [set ${slot}_cells] {
        set defined [llength [get_cells $cell]]
        if { $defined == 0 } {
            lappend undefined_cells $cell
        }
    }
}

if {[llength $undefined_cells] > 0} {
    puts "Undefined cells:"
    foreach cell $undefined_cells {
        puts $cell
    }
}
gpt-2-medium/xo/opt_kernel.xo ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50ccf71a9ffd437e6e800624723a66b21391130e200641b2c8c7af0875ef73ce
3
+ size 2049244
gpt-2-medium/xrt.ini ADDED
@@ -0,0 +1,2 @@
 
 
 
1
# XRT runtime configuration shipped to the target by package_sample.sh.
[Emulation]
# batch: run hardware emulation non-interactively (no waveform GUI).
debug_mode=batch