/*
 * kernel1: copy one 64-element row from `placeholder` into `T_layout_trans`.
 *
 * ax0_ax1_fused_ax2_fused selects the row (rows are 64 floats apart in both
 * buffers), so this is an identity layout transform done row-by-row.
 * NOTE(review): source text was extraction-mangled (missing braces and the
 * ax3 declaration); reconstructed here without changing the index math.
 */
void kernel1(int32_t ax0_ax1_fused_ax2_fused,
             float* placeholder,
             float* T_layout_trans)
{
  for (int32_t ax3 = 0; ax3 < 64; ++ax3) {
    T_layout_trans[((ax0_ax1_fused_ax2_fused * 64) + ax3)] =
        placeholder[((ax0_ax1_fused_ax2_fused * 64) + ax3)];
  }
}
/*
 * kernel2: layout transform for one output slab.
 *
 * Writes the 64x64 slab `ax0_ax1_fused` of T_layout_trans, gathering from
 * `placeholder` with stride pattern in[(ax2*128) + (ax3*2) + ax0_ax1_fused]
 * (i.e. it de-interleaves a 2-wide inner dimension).
 * NOTE(review): reconstructed from extraction-mangled text; index expressions
 * are byte-for-byte those of the original.
 */
void kernel2(int32_t ax0_ax1_fused,
             float* placeholder,
             float* T_layout_trans)
{
  for (int32_t ax2 = 0; ax2 < 64; ++ax2) {
    for (int32_t ax3 = 0; ax3 < 64; ++ax3) {
      T_layout_trans[(((ax0_ax1_fused * 4096) + (ax2 * 64)) + ax3)] =
          placeholder[(((ax2 * 128) + (ax3 * 2)) + ax0_ax1_fused)];
    }
  }
}
/*
 * kernel3: zero-pad one row of a 64x64 image into a 66x66 buffer.
 *
 * i1_i2_fused is the padded row index (0..65). Interior cells
 * (1 <= row < 65 and 1 <= col < 65) copy from placeholder[(row*64)+col-65]
 * (= placeholder[(row-1)*64 + (col-1)]); the 1-wide border is set to 0.
 * NOTE(review): reconstructed from mangled text; the redundant
 * `(float*)data_pad` cast of an already-float* parameter was dropped.
 */
void kernel3(int32_t i1_i2_fused,
             float *data_pad,
             float* placeholder)
{
  for (int32_t i3 = 0; i3 < 66; ++i3) {
    data_pad[((i1_i2_fused * 66) + i3)] =
        (((((1 <= i1_i2_fused) && (i1_i2_fused < 65)) && (1 <= i3)) && (i3 < 65))
             ? placeholder[(((i1_i2_fused * 64) + i3) - 65)]
             : 0.000000e+00f);
  }
}
53 void kernel4(int32_t n_oc_chunk_fused_oh_fused,
float *data_pad,
float* placeholder1,
float* conv2d_NCHWc)
55 float2 conv2d_NCHWc_global[16];
56 for (ow_outer = 0; ow_outer < 4; ++ow_outer) {
57 conv2d_NCHWc_global[0] = ((
float2)(0.000000e+00f, 0.000000e+00f));
58 conv2d_NCHWc_global[1] = ((
float2)(0.000000e+00f, 0.000000e+00f));
59 conv2d_NCHWc_global[2] = ((
float2)(0.000000e+00f, 0.000000e+00f));
60 conv2d_NCHWc_global[3] = ((
float2)(0.000000e+00f, 0.000000e+00f));
61 conv2d_NCHWc_global[4] = ((
float2)(0.000000e+00f, 0.000000e+00f));
62 conv2d_NCHWc_global[5] = ((
float2)(0.000000e+00f, 0.000000e+00f));
63 conv2d_NCHWc_global[6] = ((
float2)(0.000000e+00f, 0.000000e+00f));
64 conv2d_NCHWc_global[7] = ((
float2)(0.000000e+00f, 0.000000e+00f));
65 conv2d_NCHWc_global[8] = ((
float2)(0.000000e+00f, 0.000000e+00f));
66 conv2d_NCHWc_global[9] = ((
float2)(0.000000e+00f, 0.000000e+00f));
67 conv2d_NCHWc_global[10] = ((
float2)(0.000000e+00f, 0.000000e+00f));
68 conv2d_NCHWc_global[11] = ((
float2)(0.000000e+00f, 0.000000e+00f));
69 conv2d_NCHWc_global[12] = ((
float2)(0.000000e+00f, 0.000000e+00f));
70 conv2d_NCHWc_global[13] = ((
float2)(0.000000e+00f, 0.000000e+00f));
71 conv2d_NCHWc_global[14] = ((
float2)(0.000000e+00f, 0.000000e+00f));
72 conv2d_NCHWc_global[15] = ((
float2)(0.000000e+00f, 0.000000e+00f));
73 for (kh = 0; kh < 3; ++kh) {
74 for (kw = 0; kw < 3; ++kw) {
75 conv2d_NCHWc_global[0] = (conv2d_NCHWc_global[0] + (((
float2)(((
float*)data_pad)[((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw)], ((
float*)data_pad)[((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw)])) * ((
float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
76 conv2d_NCHWc_global[1] = (conv2d_NCHWc_global[1] + (((
float2)(((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 1)], ((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 1)])) * ((
float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
77 conv2d_NCHWc_global[2] = (conv2d_NCHWc_global[2] + (((
float2)(((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 2)], ((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 2)])) * ((
float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
78 conv2d_NCHWc_global[3] = (conv2d_NCHWc_global[3] + (((
float2)(((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 3)], ((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 3)])) * ((
float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
79 conv2d_NCHWc_global[4] = (conv2d_NCHWc_global[4] + (((
float2)(((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 4)], ((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 4)])) * ((
float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
80 conv2d_NCHWc_global[5] = (conv2d_NCHWc_global[5] + (((
float2)(((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 5)], ((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 5)])) * ((
float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
81 conv2d_NCHWc_global[6] = (conv2d_NCHWc_global[6] + (((
float2)(((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 6)], ((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 6)])) * ((
float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
82 conv2d_NCHWc_global[7] = (conv2d_NCHWc_global[7] + (((
float2)(((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 7)], ((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 7)])) * ((
float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
83 conv2d_NCHWc_global[8] = (conv2d_NCHWc_global[8] + (((
float2)(((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 8)], ((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 8)])) * ((
float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
84 conv2d_NCHWc_global[9] = (conv2d_NCHWc_global[9] + (((
float2)(((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 9)], ((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 9)])) * ((
float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
85 conv2d_NCHWc_global[10] = (conv2d_NCHWc_global[10] + (((
float2)(((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 10)], ((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 10)])) * ((
float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
86 conv2d_NCHWc_global[11] = (conv2d_NCHWc_global[11] + (((
float2)(((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 11)], ((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 11)])) * ((
float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
87 conv2d_NCHWc_global[12] = (conv2d_NCHWc_global[12] + (((
float2)(((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 12)], ((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 12)])) * ((
float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
88 conv2d_NCHWc_global[13] = (conv2d_NCHWc_global[13] + (((
float2)(((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 13)], ((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 13)])) * ((
float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
89 conv2d_NCHWc_global[14] = (conv2d_NCHWc_global[14] + (((
float2)(((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 14)], ((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 14)])) * ((
float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
90 conv2d_NCHWc_global[15] = (conv2d_NCHWc_global[15] + (((
float2)(((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 15)], ((
float*)data_pad)[(((((kh * 66) + (n_oc_chunk_fused_oh_fused * 66)) + (ow_outer * 16)) + kw) + 15)])) * ((
float2*)(placeholder1 + ((kh * 6) + (kw * 2))))[0]));
95 for (ow_inner = 0; ow_inner < 16; ++ow_inner) {
96 ((
float2*)(conv2d_NCHWc + (((n_oc_chunk_fused_oh_fused * 128) + (ow_outer * 32)) + (ow_inner * 2))))[0] = ((
float2*)((
float*)conv2d_NCHWc_global + (ow_inner * 2)))[0];
/*
 * parallel1: run kernel1 over all 64 rows, one row per OpenMP iteration.
 * NOTE(review): reconstructed — the `#pragma omp parallel for` had been fused
 * onto the for-line by extraction; loop body unchanged.
 */
void parallel1(float* placeholder,
               float* T_layout_trans)
{
  int32_t ax0_ax1_fused_ax2_fused;
#pragma omp parallel for
  for (ax0_ax1_fused_ax2_fused = 0; ax0_ax1_fused_ax2_fused < 64; ++ax0_ax1_fused_ax2_fused) {
    kernel1(ax0_ax1_fused_ax2_fused, placeholder, T_layout_trans);
  }
}
/*
 * parallel2: run kernel2 over both 64x64 output slabs in parallel.
 * NOTE(review): reconstructed — pragma had been fused onto the for-line.
 */
void parallel2(float* placeholder,
               float* T_layout_trans)
{
  int32_t ax0_ax1_fused;
#pragma omp parallel for
  for (ax0_ax1_fused = 0; ax0_ax1_fused < 2; ++ax0_ax1_fused) {
    kernel2(ax0_ax1_fused, placeholder, T_layout_trans);
  }
}
/*
 * parallel3: pad all 66 rows of the input plane in parallel (kernel3).
 * NOTE(review): reconstructed — the i1_i2_fused declaration was missing from
 * the mangled text (siblings parallel1/2/4 all declare theirs); added to
 * match their style.
 */
void parallel3(float *data_pad,
               float* placeholder)
{
  int32_t i1_i2_fused;
#pragma omp parallel for
  for (i1_i2_fused = 0; i1_i2_fused < 66; ++i1_i2_fused) {
    kernel3(i1_i2_fused, data_pad, placeholder);
  }
}
/*
 * parallel4: run kernel4 over all 64 fused (n, oc_chunk, oh) rows in parallel.
 * NOTE(review): reconstructed — pragma had been fused onto the for-line.
 */
void parallel4(float *data_pad,
               float* placeholder1,
               float* conv2d_NCHWc)
{
  int32_t n_oc_chunk_fused_oh_fused;
#pragma omp parallel for
  for (n_oc_chunk_fused_oh_fused = 0; n_oc_chunk_fused_oh_fused < 64; ++n_oc_chunk_fused_oh_fused) {
    kernel4(n_oc_chunk_fused_oh_fused, data_pad, placeholder1, conv2d_NCHWc);
  }
}
/* NOTE(review): fragment of a TVM PackedFunc wrapper whose header was lost in
 * extraction. It unpacks two TVMValue handles from an `args` array, pulls the
 * float data pointers out of the TVMArray descriptors, and invokes parallel1
 * (the 64x64 identity layout transform). Presumably this is one of the
 * fused_layout_transform_* entry points listed at the bottom of the file —
 * TODO confirm which one before editing. Bytes kept as-is. */
148 void* arg0 = (((
TVMValue*)args)[0].v_handle);
149 void* arg1 = (((
TVMValue*)args)[1].v_handle);
/* Input tensor data pointer from the first DLTensor/TVMArray. */
150 float* placeholder = (
float*)(((
TVMArray*)arg0)[0].data);
/* Output tensor data pointer from the second DLTensor/TVMArray. */
151 float* T_layout_trans = (
float*)(((
TVMArray*)arg1)[0].data);
153 parallel1(placeholder, T_layout_trans);
/* NOTE(review): second PackedFunc wrapper fragment (header lost in
 * extraction). Same unpacking shape as the wrapper above but dispatches to
 * parallel2 (the de-interleaving layout transform). Presumably the other
 * fused_layout_transform_* entry point — TODO confirm. Bytes kept as-is. */
159 void* arg0 = (((
TVMValue*)args)[0].v_handle);
160 void* arg1 = (((
TVMValue*)args)[1].v_handle);
/* Input tensor data pointer. */
161 float* placeholder = (
float*)(((
TVMArray*)arg0)[0].data);
/* Output tensor data pointer. */
162 float* T_layout_trans = (
float*)(((
TVMArray*)arg1)[0].data);
164 parallel2(placeholder, T_layout_trans);
/* NOTE(review): fragment of the conv2d PackedFunc wrapper (header and several
 * statements lost in extraction — presumably fused_nn_contrib_conv2d_NCHWc,
 * per the signature list at the bottom of the file). It unpacks three tensor
 * handles (input, weights, output), then pads via parallel3 and convolves via
 * parallel4. The `data_pad == NULL` check suggests a TVMBackendAllocWorkspace
 * call was dropped between lines 175 and 178 — TODO recover from the original
 * source. Bytes kept as-is. */
170 void* arg0 = (((
TVMValue*)args)[0].v_handle);
171 void* arg1 = (((
TVMValue*)args)[1].v_handle);
172 void* arg2 = (((
TVMValue*)args)[2].v_handle);
/* Input feature map. */
173 float* placeholder = (
float*)(((
TVMArray*)arg0)[0].data);
/* Convolution weights. */
174 float* placeholder1 = (
float*)(((
TVMArray*)arg1)[0].data);
/* Output tensor. */
175 float* conv2d_NCHWc = (
float*)(((
TVMArray*)arg2)[0].data);
/* Workspace allocation failure check (allocation itself missing here). */
178 if (data_pad ==
NULL) {
/* Stage 1: zero-pad input into the 66x66 workspace. */
182 parallel3(data_pad, placeholder);
/* Stage 2: 3x3 NCHWc convolution from workspace into the output. */
184 parallel4(data_pad, placeholder1, conv2d_NCHWc);
/* NOTE(review): fragment of a test/driver harness (most of it lost in
 * extraction): it declares result codes and static output buffers, wires
 * buffer pointers into TVMArray descriptors (a1/b0/b2/c1), and times the
 * top-level conv(...) call under BAMBU_PROFILING. Presumably a1/b0/b2/c1 and
 * param0..2 are declared in the missing text — TODO recover before editing.
 * Bytes kept as-is. */
193 int res1, res2, res3;
/* 4096 floats = 64x64 layout-transform output; 8192 floats = conv output. */
202 float out_fused_layout_transform_2[4096];
203 float out_conv[8192];
206 a1[0].
data = out_fused_layout_transform_2;
210 b0[0].
data = out_fused_layout_transform_2;
212 b2[0].
data = out_conv;
218 c1[0].
data = T_fused_layout_transform_1;
/* Profiled invocation of the full conv pipeline. */
223 #ifdef BAMBU_PROFILING 227 res =
conv(param0, param1, param2);
229 #ifdef BAMBU_PROFILING
void __builtin_bambu_time_start()
int32_t conv(TVMValue *param0, TVMValue param1, TVMValue param2)
Union type of values being passed through API and function calls.
int32_t fused_nn_contrib_conv2d_NCHWc(void *args, void *arg_type_ids, int32_t num_args)
TVM_DLL int TVMBackendFreeWorkspace(int device_type, int device_id, void *ptr)
Backend function to free temporal workspace.
void * data
The opaque data pointer points to the allocated data. This will be CUDA device pointer or cl_mem handle in OpenCL.
int32_t fused_layout_transform_1(void *args, void *arg_type_ids, int32_t num_args)
int32_t fused_conv2d_wrapper(float *X, float *p0, float *T_fused_layout_transform_1)
TVM_DLL void * TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t nbytes, int dtype_code_hint, int dtype_bits_hint)
Backend function to allocate temporal workspace.
Plain C Tensor object, does not manage memory.
int32_t fused_layout_transform_2(void *args, void *arg_type_ids, int32_t num_args)
void __builtin_bambu_time_stop()
__attribute__((noinline))
Convert the given fixedpt number to a decimal string.