PandA-2024.02
11_conv2d_b.parallel.c
Go to the documentation of this file.
1 #include "c_backend_api.h"
2 
3 #ifdef BAMBU_PROFILING
4 extern void __builtin_bambu_time_start();
5 extern void __builtin_bambu_time_stop();
6 #endif
7 
18 
19 
20 __attribute__((noinline))
21 void kernel1(int32_t ax0_ax1_fused_ax2_fused, float* placeholder, float* T_layout_trans)
22 {
23  int32_t ax3;
24  for (ax3 = 0; ax3 < 64; ++ax3)
25  {
26  T_layout_trans[((ax0_ax1_fused_ax2_fused * 64) + ax3)] = placeholder[((ax0_ax1_fused_ax2_fused * 64) + ax3)];
27  }
28 }
29 
30 __attribute__((noinline))
31 void kernel2(int32_t ax0_ax1_fused, float* placeholder, float* T_layout_trans)
32 {
33  int32_t ax2;
34  int32_t ax3;
35 
36  for (ax2 = 0; ax2 < 64; ++ax2) {
37  for (ax3 = 0; ax3 < 64; ++ax3) {
38  T_layout_trans[(((ax0_ax1_fused * 4096) + (ax2 * 64)) + ax3)] = placeholder[(((ax2 * 128) + (ax3 * 2)) + ax0_ax1_fused)];
39  }
40  }
41 }
42 
43 __attribute__((noinline))
44 void kernel3(int32_t i1_i2_fused, float *data_pad, float* placeholder)
45 {
46  int32_t i3;
47  for (i3 = 0; i3 < 66; ++i3) {
48  (( float*)data_pad)[((i1_i2_fused * 66) + i3)] = (((((1 <= i1_i2_fused) && (i1_i2_fused < 65)) && (1 <= i3)) && (i3 < 65)) ? placeholder[(((i1_i2_fused * 64) + i3) - 65)] : 0.000000e+00f);
49  }
50 }
51 
52 __attribute__((noinline))
53 void kernel4(int32_t n_oc_chunk_fused_oh_fused, float *data_pad, float* placeholder1, float* conv2d_NCHWc)
54 {
55  float2 conv2d_NCHWc_global[16];
56  for (ow_outer = 0; ow_outer < 4; ++ow_outer) {
57  conv2d_NCHWc_global[0] = ((float2)(0.000000e+00f, 0.000000e+00f));
58  conv2d_NCHWc_global[1] = ((float2)(0.000000e+00f, 0.000000e+00f));
59  conv2d_NCHWc_global[2] = ((float2)(0.000000e+00f, 0.000000e+00f));
60  conv2d_NCHWc_global[3] = ((float2)(0.000000e+00f, 0.000000e+00f));
61  conv2d_NCHWc_global[4] = ((float2)(0.000000e+00f, 0.000000e+00f));
62  conv2d_NCHWc_global[5] = ((float2)(0.000000e+00f, 0.000000e+00f));
63  conv2d_NCHWc_global[6] = ((float2)(0.000000e+00f, 0.000000e+00f));
64  conv2d_NCHWc_global[7] = ((float2)(0.000000e+00f, 0.000000e+00f));
65  conv2d_NCHWc_global[8] = ((float2)(0.000000e+00f, 0.000000e+00f));
66  conv2d_NCHWc_global[9] = ((float2)(0.000000e+00f, 0.000000e+00f));
67  conv2d_NCHWc_global[10] = ((float2)(0.000000e+00f, 0.000000e+00f));
68  conv2d_NCHWc_global[11] = ((float2)(0.000000e+00f, 0.000000e+00f));
69  conv2d_NCHWc_global[12] = ((float2)(0.000000e+00f, 0.000000e+00f));
70  conv2d_NCHWc_global[13] = ((float2)(0.000000e+00f, 0.000000e+00f));
71  conv2d_NCHWc_global[14] = ((float2)(0.000000e+00f, 0.000000e+00f));
72  conv2d_NCHWc_global[15] = ((float2)(0.000000e+00f, 0.000000e+00f));
73  for (kh = 0; kh < 3; ++kh) {
74  for (kw = 0; kw < 3; ++kw) {
75  conv2d_NCHWc_global[0] = (conv2d_NCHWc_global[0] + (((float2)((( float*)data_pad)[((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw)], (( float*)data_pad)[((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
76  conv2d_NCHWc_global[1] = (conv2d_NCHWc_global[1] + (((float2)((( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 1)], (( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 1)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
77  conv2d_NCHWc_global[2] = (conv2d_NCHWc_global[2] + (((float2)((( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 2)], (( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 2)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
78  conv2d_NCHWc_global[3] = (conv2d_NCHWc_global[3] + (((float2)((( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 3)], (( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 3)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
79  conv2d_NCHWc_global[4] = (conv2d_NCHWc_global[4] + (((float2)((( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 4)], (( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 4)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
80  conv2d_NCHWc_global[5] = (conv2d_NCHWc_global[5] + (((float2)((( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 5)], (( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 5)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
81  conv2d_NCHWc_global[6] = (conv2d_NCHWc_global[6] + (((float2)((( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 6)], (( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 6)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
82  conv2d_NCHWc_global[7] = (conv2d_NCHWc_global[7] + (((float2)((( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 7)], (( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 7)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
83  conv2d_NCHWc_global[8] = (conv2d_NCHWc_global[8] + (((float2)((( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 8)], (( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 8)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
84  conv2d_NCHWc_global[9] = (conv2d_NCHWc_global[9] + (((float2)((( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 9)], (( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 9)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
85  conv2d_NCHWc_global[10] = (conv2d_NCHWc_global[10] + (((float2)((( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 10)], (( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 10)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
86  conv2d_NCHWc_global[11] = (conv2d_NCHWc_global[11] + (((float2)((( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 11)], (( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 11)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
87  conv2d_NCHWc_global[12] = (conv2d_NCHWc_global[12] + (((float2)((( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 12)], (( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 12)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
88  conv2d_NCHWc_global[13] = (conv2d_NCHWc_global[13] + (((float2)((( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 13)], (( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 13)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
89  conv2d_NCHWc_global[14] = (conv2d_NCHWc_global[14] + (((float2)((( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 14)], (( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 14)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
90  conv2d_NCHWc_global[15] = (conv2d_NCHWc_global[15] + (((float2)((( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 15)], (( float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 15)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
91  }
92  }
93  }
94  int32_t ow_inner;
95  for (ow_inner = 0; ow_inner < 16; ++ow_inner) {
96  (( float2*)(conv2d_NCHWc + (((n_oc_chunk_fused_oh_fused * 128) + (ow_outer * 32)) + (ow_inner * 2))))[0] = (( float2*)(( float*)conv2d_NCHWc_global + (ow_inner * 2)))[0];
97  }
98 
99 }
100 
101 
102 __attribute__((noinline))
103 void parallel1(float* placeholder, float* T_layout_trans)
104 {
105  int32_t ax0_ax1_fused_ax2_fused;
106 
107  #pragma omp parallel for
108  for (ax0_ax1_fused_ax2_fused = 0; ax0_ax1_fused_ax2_fused < 64; ++ax0_ax1_fused_ax2_fused)
109  {
110  kernel1(ax0_ax1_fused_ax2_fused, placeholder, T_layout_trans);
111  }
112 }
113 
114 __attribute__((noinline))
115 void parallel2(float* placeholder, float* T_layout_trans)
116 {
117  int32_t ax0_ax1_fused;
118  #pragma omp parallel for
119  for (ax0_ax1_fused = 0; ax0_ax1_fused < 2; ++ax0_ax1_fused)
120  {
121  kernel2(ax0_ax1_fused, placeholder, T_layout_trans);
122  }
123 }
124 
125 __attribute__((noinline))
126 void parallel3(float *data_pad, float* placeholder)
127 {
128  int32_t i1_i2_fused;
129  #pragma omp parallel for
130  for (i1_i2_fused = 0; i1_i2_fused < 66; ++i1_i2_fused)
131  {
132  kernel3(i1_i2_fused, data_pad, placeholder);
133  }
134 }
135 
136 __attribute__((noinline))
137 void parallel4(float *data_pad, float* placeholder1, float* conv2d_NCHWc)
138 {
139  int32_t n_oc_chunk_fused_oh_fused;
140  #pragma omp parallel for
141  for (n_oc_chunk_fused_oh_fused = 0; n_oc_chunk_fused_oh_fused < 64; ++n_oc_chunk_fused_oh_fused)
142  {
143  kernel4(n_oc_chunk_fused_oh_fused, data_pad, placeholder1, conv2d_NCHWc);
144  }
145 }
146 
147 int32_t fused_layout_transform_2( void* args, void* arg_type_ids, int32_t num_args) {
148  void* arg0 = (((TVMValue*)args)[0].v_handle);
149  void* arg1 = (((TVMValue*)args)[1].v_handle);
150  float* placeholder = (float*)(((TVMArray*)arg0)[0].data);
151  float* T_layout_trans = (float*)(((TVMArray*)arg1)[0].data);
152 
153  parallel1(placeholder, T_layout_trans);
154 
155  return 0;
156 }
157 
158 int32_t fused_layout_transform_1( void* args, void* arg_type_ids, int32_t num_args) {
159  void* arg0 = (((TVMValue*)args)[0].v_handle);
160  void* arg1 = (((TVMValue*)args)[1].v_handle);
161  float* placeholder = (float*)(((TVMArray*)arg0)[0].data);
162  float* T_layout_trans = (float*)(((TVMArray*)arg1)[0].data);
163 
164  parallel2(placeholder, T_layout_trans);
165 
166  return 0;
167 }
168 
169 int32_t fused_nn_contrib_conv2d_NCHWc( void* args, void* arg_type_ids, int32_t num_args) {
170  void* arg0 = (((TVMValue*)args)[0].v_handle);
171  void* arg1 = (((TVMValue*)args)[1].v_handle);
172  void* arg2 = (((TVMValue*)args)[2].v_handle);
173  float* placeholder = (float*)(((TVMArray*)arg0)[0].data);
174  float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data);
175  float* conv2d_NCHWc = (float*)(((TVMArray*)arg2)[0].data);
176 
177  void* data_pad = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)17424, 2, 32);
178  if (data_pad == NULL) {
179  return -1;
180  }
181 
182  parallel3(data_pad, placeholder);
183 
184  parallel4(data_pad, placeholder1, conv2d_NCHWc);
185 
186  if (TVMBackendFreeWorkspace(1, dev_id, data_pad) != 0) {
187  return -1;
188  }
189  return 0;
190 }
191 
192 int32_t conv(TVMValue* param0, TVMValue param1, TVMValue param2){
193  int res1, res2, res3;
194  res1 = fused_layout_transform_2(param0, 0, 0);
195  res2 = fused_nn_contrib_conv2d_NCHWc(param1, 0, 0);
196  res3 = fused_layout_transform_1(param2, 0, 0);
197  return res3;
198 }
199 
200 int32_t fused_conv2d_wrapper(float* X, float* p0, float* T_fused_layout_transform_1) {
201 
202  float out_fused_layout_transform_2[4096];
203  float out_conv[8192];
204  int32_t res;
205  a0[0].data = X;
206  a1[0].data = out_fused_layout_transform_2;
207  param0[0].v_handle = a0;
208  param0[1].v_handle = a1;
209 
210  b0[0].data = out_fused_layout_transform_2;
211  b1[0].data = p0;
212  b2[0].data = out_conv;
213  param1[0].v_handle = b0;
214  param1[1].v_handle = b1;
215  param1[2].v_handle = b2;
216 
217  c0[0].data = out_conv;
218  c1[0].data = T_fused_layout_transform_1;
219  param2[0].v_handle = c0;
220  param2[1].v_handle = c1;
221 
222 
223 #ifdef BAMBU_PROFILING
225 #endif
226 
227  res = conv(param0, param1, param2);
228 
229 #ifdef BAMBU_PROFILING
231 #endif
232 
233  return res;
234 }
#define NULL
TVMArray b0[1]
TVMArray c2[1]
void __builtin_bambu_time_start()
int32_t conv(TVMValue *param0, TVMValue param1, TVMValue param2)
void * v_handle
float float2
Union type of values being passed through API and function calls.
TVMValue param0[2]
int32_t fused_nn_contrib_conv2d_NCHWc(void *args, void *arg_type_ids, int32_t num_args)
TVMArray a0[1]
TVM_DLL int TVMBackendFreeWorkspace(int device_type, int device_id, void *ptr)
Backend function to free temporal workspace.
TVMValue param1[3]
TVMArray a1[1]
void * data
The opaque data pointer points to the allocated data. This will be CUDA device pointer or cl_mem hand...
Definition: dlpack.h:131
int32_t fused_layout_transform_1(void *args, void *arg_type_ids, int32_t num_args)
int32_t fused_conv2d_wrapper(float *X, float *p0, float *T_fused_layout_transform_1)
TVMValue param2[2]
TVMArray b2[1]
TVM_DLL void * TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t nbytes, int dtype_code_hint, int dtype_bits_hint)
Backend function to allocate temporal workspace.
Plain C Tensor object, does not manage memory.
Definition: dlpack.h:111
int32_t fused_layout_transform_2(void *args, void *arg_type_ids, int32_t num_args)
TVMArray c1[1]
TVMArray c0[1]
TVMArray b1[1]
void __builtin_bambu_time_stop()
__attribute__((noinline))
Convert the given fixedpt number to a decimal string.

Generated on Mon Feb 12 2024 13:02:50 for PandA-2024.02 by doxygen 1.8.13