PandA-2024.02
09_conv2d_a.parallel.c
Go to the documentation of this file.
1 #include "c_backend_api.h"
2 
3 #ifdef BAMBU_PROFILING
4 extern void __builtin_bambu_time_start();
5 extern void __builtin_bambu_time_stop();
6 #endif
7 
18 
/* NOTE(review): on the original (vectorized) target, float2 was presumably a
   2-lane vector type; here it is scalarized to a single float for HLS. This
   makes the ((float2)(a, b)) constructors in kernel4 comma expressions that
   keep only the second operand, and (float2*) casts read/write a single
   float — confirm this matches the intended scalarization. */
typedef float float2;
20 
/* Copy one contiguous row of 8 floats from `placeholder` to `T_layout_trans`.
 * The row is selected by the fused loop index `ax0_ax1_fused_ax2_fused`
 * (row i covers flat elements [i*8, i*8+8)). Identity layout transform. */
__attribute__((noinline))
void kernel1(int32_t ax0_ax1_fused_ax2_fused, float* placeholder, float* T_layout_trans)
{
 const int32_t base = ax0_ax1_fused_ax2_fused * 8;
 int32_t j;
 for (j = 0; j < 8; ++j) {
  T_layout_trans[base + j] = placeholder[base + j];
 }
}
30 
/* Layout transform for one output channel slice: writes an 8x8 plane of
 * `T_layout_trans` (starting at ax0_ax1_fused*64) by gathering strided
 * elements of `placeholder` (inner stride 2, row stride 16, channel offset
 * ax0_ax1_fused), i.e. de-interleaving NCHWc-packed data. */
__attribute__((noinline))
void kernel2(int32_t ax0_ax1_fused, float* placeholder, float* T_layout_trans)
{
 int32_t row;
 int32_t col;

 for (row = 0; row < 8; ++row) {
  float* dst = T_layout_trans + (ax0_ax1_fused * 64) + (row * 8);
  for (col = 0; col < 8; ++col) {
   dst[col] = placeholder[(row * 16) + (col * 2) + ax0_ax1_fused];
  }
 }
}
43 
/* Fill one 10-wide row of the zero-padded buffer `data_pad` (10x10 overall)
 * from the 8x8 `placeholder`. Row/column 0 and 9 form the 1-pixel zero
 * border; interior cells map to placeholder[(i1_i2_fused*8)+i3-9]. */
__attribute__((noinline))
void kernel3(int32_t i1_i2_fused, float *data_pad, float* placeholder)
{
 int32_t i3;
 const int interior_row = (1 <= i1_i2_fused) && (i1_i2_fused < 9);
 for (i3 = 0; i3 < 10; ++i3) {
  float v = 0.000000e+00f;
  if (interior_row && (1 <= i3) && (i3 < 9)) {
   v = placeholder[((i1_i2_fused * 8) + i3) - 9];
  }
  data_pad[(i1_i2_fused * 10) + i3] = v;
 }
}
52 
/* Compute one output row of a 3x3 convolution over the 10x10 zero-padded
 * input `data_pad`, accumulating into `conv2d_NCHWc_global` and then copying
 * the row into `conv2d_NCHWc` with stride-2 packing.
 *
 * NOTE(review): float2 is typedef'd to plain float in this file, so
 * ((float2)(a, b)) is a comma expression yielding only its second operand,
 * and every (float2*) cast reads/writes a single float, not a 2-lane vector.
 * Under that scalar reading, the final copy loop reads
 * conv2d_NCHWc_global[ow_inner * 2] with ow_inner up to 7 — index 14 from an
 * 8-element array, i.e. a suspected out-of-bounds read introduced by the
 * vector->scalar conversion. Confirm the intended width of float2 against
 * the original vectorized source before modifying this kernel. */
__attribute__((noinline))
void kernel4(int32_t n_oc_chunk_fused_oh_fused, float *data_pad, float* placeholder1, float* conv2d_NCHWc)
{
 /* Per-row accumulator, cleared to zero before the kh/kw reduction. */
 float2 conv2d_NCHWc_global[8];
 conv2d_NCHWc_global[0] = ((float2)(0.000000e+00f, 0.000000e+00f));
 conv2d_NCHWc_global[1] = ((float2)(0.000000e+00f, 0.000000e+00f));
 conv2d_NCHWc_global[2] = ((float2)(0.000000e+00f, 0.000000e+00f));
 conv2d_NCHWc_global[3] = ((float2)(0.000000e+00f, 0.000000e+00f));
 conv2d_NCHWc_global[4] = ((float2)(0.000000e+00f, 0.000000e+00f));
 conv2d_NCHWc_global[5] = ((float2)(0.000000e+00f, 0.000000e+00f));
 conv2d_NCHWc_global[6] = ((float2)(0.000000e+00f, 0.000000e+00f));
 conv2d_NCHWc_global[7] = ((float2)(0.000000e+00f, 0.000000e+00f));
 int32_t kh;
 int32_t kw;
 /* 3x3 kernel reduction, fully unrolled over the 8 output columns.
  * placeholder1 is indexed as kh*6 + kw*2 (weights packed with stride 2). */
 for (kh = 0; kh < 3; ++kh) {
 for (kw = 0; kw < 3; ++kw) {
 conv2d_NCHWc_global[0] = (conv2d_NCHWc_global[0] + (((float2)(data_pad[(((kh * 10) + (n_oc_chunk_fused_oh_fused * 10)) + kw)], data_pad[(((kh * 10) + (n_oc_chunk_fused_oh_fused * 10)) + kw)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
 conv2d_NCHWc_global[1] = (conv2d_NCHWc_global[1] + (((float2)(data_pad[((((kh * 10) + (n_oc_chunk_fused_oh_fused * 10)) + kw) + 1)], data_pad[((((kh * 10) + (n_oc_chunk_fused_oh_fused * 10)) + kw) + 1)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
 conv2d_NCHWc_global[2] = (conv2d_NCHWc_global[2] + (((float2)(data_pad[((((kh * 10) + (n_oc_chunk_fused_oh_fused * 10)) + kw) + 2)], data_pad[((((kh * 10) + (n_oc_chunk_fused_oh_fused * 10)) + kw) + 2)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
 conv2d_NCHWc_global[3] = (conv2d_NCHWc_global[3] + (((float2)(data_pad[((((kh * 10) + (n_oc_chunk_fused_oh_fused * 10)) + kw) + 3)], data_pad[((((kh * 10) + (n_oc_chunk_fused_oh_fused * 10)) + kw) + 3)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
 conv2d_NCHWc_global[4] = (conv2d_NCHWc_global[4] + (((float2)(data_pad[((((kh * 10) + (n_oc_chunk_fused_oh_fused * 10)) + kw) + 4)], data_pad[((((kh * 10) + (n_oc_chunk_fused_oh_fused * 10)) + kw) + 4)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
 conv2d_NCHWc_global[5] = (conv2d_NCHWc_global[5] + (((float2)(data_pad[((((kh * 10) + (n_oc_chunk_fused_oh_fused * 10)) + kw) + 5)], data_pad[((((kh * 10) + (n_oc_chunk_fused_oh_fused * 10)) + kw) + 5)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
 conv2d_NCHWc_global[6] = (conv2d_NCHWc_global[6] + (((float2)(data_pad[((((kh * 10) + (n_oc_chunk_fused_oh_fused * 10)) + kw) + 6)], data_pad[((((kh * 10) + (n_oc_chunk_fused_oh_fused * 10)) + kw) + 6)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
 conv2d_NCHWc_global[7] = (conv2d_NCHWc_global[7] + (((float2)(data_pad[((((kh * 10) + (n_oc_chunk_fused_oh_fused * 10)) + kw) + 7)], data_pad[((((kh * 10) + (n_oc_chunk_fused_oh_fused * 10)) + kw) + 7)])) * (( float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
 }
 }
 int32_t ow_inner;
 /* Write the accumulated row to the output at stride 2.
  * NOTE(review): with scalar float2 this reads conv2d_NCHWc_global at
  * indices 0,2,...,14 — past the 8-element array for ow_inner >= 4. */
 for (ow_inner = 0; ow_inner < 8; ++ow_inner) {
 (( float2*)(conv2d_NCHWc + ((n_oc_chunk_fused_oh_fused * 16) + (ow_inner * 2))))[0] = (( float2*)(( float*)conv2d_NCHWc_global + (ow_inner * 2)))[0];
 }
 }
84 
/* Dispatch the 8 independent row copies of kernel1 across OpenMP threads.
 * Each iteration touches a disjoint 8-element slice, so no synchronization
 * beyond the implicit barrier is needed. */
__attribute__((noinline))
void parallel1(float* placeholder, float* T_layout_trans)
{
 int32_t row;

 #pragma omp parallel for
 for (row = 0; row < 8; ++row) {
  kernel1(row, placeholder, T_layout_trans);
 }
}
96 
/* Run the two channel-slice transposes of kernel2 in parallel.
 * Each slice writes a disjoint 64-float region of T_layout_trans. */
__attribute__((noinline))
void parallel2(float* placeholder, float* T_layout_trans)
{
 int32_t slice;

 #pragma omp parallel for
 for (slice = 0; slice < 2; ++slice) {
  kernel2(slice, placeholder, T_layout_trans);
 }
}
107 
/* Fill all 10 rows of the zero-padded buffer in parallel via kernel3.
 * Rows are disjoint 10-float spans of data_pad. */
__attribute__((noinline))
void parallel3(float *data_pad, float* placeholder)
{
 int32_t pad_row;

 #pragma omp parallel for
 for (pad_row = 0; pad_row < 10; ++pad_row) {
  kernel3(pad_row, data_pad, placeholder);
 }
}
118 
/* Compute the 8 convolution output rows in parallel via kernel4.
 * Each iteration writes a disjoint 16-float span of conv2d_NCHWc. */
__attribute__((noinline))
void parallel4(float *data_pad, float* placeholder1, float* conv2d_NCHWc)
{
 int32_t out_row;

 #pragma omp parallel for
 for (out_row = 0; out_row < 8; ++out_row) {
  kernel4(out_row, data_pad, placeholder1, conv2d_NCHWc);
 }
}
129 
130 
131 __attribute__((noinline))
132 int32_t fused_layout_transform_1( void* args, void* arg_type_ids, int32_t num_args) {
133  void* arg0 = (((TVMValue*)args)[0].v_handle);
134  void* arg1 = (((TVMValue*)args)[1].v_handle);
135  float* placeholder = (float*)(((TVMArray*)arg0)[0].data);
136  float* T_layout_trans = (float*)(((TVMArray*)arg1)[0].data);
137  parallel2(placeholder, T_layout_trans);
138 
139  return 0;
140 }
141 
142 __attribute__((noinline))
143 int32_t fused_layout_transform_2( void* args, void* arg_type_ids, int32_t num_args) {
144  void* arg0 = (((TVMValue*)args)[0].v_handle);
145  void* arg1 = (((TVMValue*)args)[1].v_handle);
146  float* placeholder = (float*)(((TVMArray*)arg0)[0].data);
147  float* T_layout_trans = (float*)(((TVMArray*)arg1)[0].data);
148 
149  parallel1(placeholder, T_layout_trans);
150 
151  return 0;
152 }
153 
154 __attribute__((noinline))
155 int32_t fused_nn_contrib_conv2d_NCHWc( void* args, void* arg_type_ids, int32_t num_args) {
156  void* arg0 = (((TVMValue*)args)[0].v_handle);
157  void* arg1 = (((TVMValue*)args)[1].v_handle);
158  void* arg2 = (((TVMValue*)args)[2].v_handle);
159  float* placeholder = (float*)(((TVMArray*)arg0)[0].data);
160  float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data);
161  float* conv2d_NCHWc = (float*)(((TVMArray*)arg2)[0].data);
162  float data_pad[100];
163 
164  parallel3(data_pad, placeholder);
165 
166  parallel4(data_pad, placeholder1, conv2d_NCHWc);
167 
168  return 0;
169 }
170 
171 int32_t conv(TVMValue* param0, TVMValue* param1, TVMValue* param2){
172 
173  int32_t res1, res2, res3;
174  res1 = fused_layout_transform_2(param0, 0, 0);
175  res2 = fused_nn_contrib_conv2d_NCHWc(param1, 0, 0);
176  res3 = fused_layout_transform_1(param2, 0, 0);
177  return res3;
178 }
179 
/* Top-level wrapper: bind the raw float buffers X (input), p0 (weights), and
 * T_fused_layout_transform_1 (final output) into the TVM packed-call
 * descriptors, then run the 3-stage pipeline via conv().
 *
 * NOTE(review): a0/a1, b0..b2, c0/c1 (TVMArray) and param0..param2 (TVMValue)
 * are file-scope descriptors declared in a part of the file not visible in
 * this chunk — confirm their declarations before refactoring. This wrapper
 * is not reentrant: it mutates that shared global state. */
int32_t fused_conv2d_wrapper(float* X, float* p0, float* T_fused_layout_transform_1) {

 /* Intermediate buffers: stage-1 output (64 floats) and conv output
  * (128 floats), both stack-allocated. */
 float out_fused_layout_transform_2[64];
 float out_conv[128];
 int32_t res;
 /* Stage 1 (fused_layout_transform_2): X -> out_fused_layout_transform_2 */
 a0[0].data = X;
 a1[0].data = out_fused_layout_transform_2;
 param0[0].v_handle = a0;
 param0[1].v_handle = a1;

 /* Stage 2 (conv2d): transformed input + weights p0 -> out_conv */
 b0[0].data = out_fused_layout_transform_2;
 b1[0].data = p0;
 b2[0].data = out_conv;
 param1[0].v_handle = b0;
 param1[1].v_handle = b1;
 param1[2].v_handle = b2;

 /* Stage 3 (fused_layout_transform_1): out_conv -> caller's output buffer */
 c0[0].data = out_conv;
 c1[0].data = T_fused_layout_transform_1;
 param2[0].v_handle = c0;
 param2[1].v_handle = c1;


#ifdef BAMBU_PROFILING
 /* NOTE(review): the __builtin_bambu_time_start() call appears elided in
  * this rendering of the file — verify against the original source. */
#endif

 res = conv(param0, param1, param2);

#ifdef BAMBU_PROFILING
 /* NOTE(review): the __builtin_bambu_time_stop() call appears elided in
  * this rendering of the file — verify against the original source. */
#endif

 return res;
}
215 
216 
TVM_DLL int32_t fused_layout_transform_1(void *args, void *arg_type_ids, int32_t num_args)
Definition: 09_conv2d_a.cc:7
TVMArray a0[1]
TVMArray c1[1]
void __builtin_bambu_time_start()
TVMArray a1[1]
void * v_handle
float float2
TVMArray b0[1]
Union type of values being passed through API and function calls.
__attribute__((noinline))
Convert the given fixedpt number to a decimal string.
TVM_DLL int32_t fused_layout_transform_2(void *args, void *arg_type_ids, int32_t num_args)
Definition: 09_conv2d_a.cc:37
void * data
The opaque data pointer points to the allocated data. This will be CUDA device pointer or cl_mem hand...
Definition: dlpack.h:131
int32_t fused_conv2d_wrapper(float *X, float *p0, float *T_fused_layout_transform_1)
int32_t conv(TVMValue *param0, TVMValue *param1, TVMValue *param2)
TVM_DLL int32_t fused_nn_contrib_conv2d_NCHWc(void *args, void *arg_type_ids, int32_t num_args)
Definition: 09_conv2d_a.cc:65
TVMArray b2[1]
TVMValue param1[3]
TVMArray b1[1]
Plain C Tensor object, does not manage memory.
Definition: dlpack.h:111
TVMValue param0[2]
TVMArray c0[1]
void __builtin_bambu_time_stop()
TVMValue param2[2]

Generated on Mon Feb 12 2024 13:02:50 for PandA-2024.02 by doxygen 1.8.13