void kernel(int32_t y_outer_x_outer_fused,
            float* compute,
            float* placeholder,
            float* placeholder1) {
  float compute1[16] = {0.000000e+00f, 0.000000e+00f, 0.000000e+00f, 0.000000e+00f, 0.000000e+00f, 0.000000e+00f, 0.000000e+00f, 0.000000e+00f, 0.000000e+00f, 0.000000e+00f, 0.000000e+00f, 0.000000e+00f, 0.000000e+00f, 0.000000e+00f, 0.000000e+00f, 0.000000e+00f};
  int32_t k, i;
  /* 49 x 16 = 784 multiply-accumulates, kept in 16 accumulator lanes */
  for (k = 0; k < 49; ++k) {
    for (i = 0; i < 16; ++i) {
      compute1[i] = compute1[i] + (placeholder + (k * 16))[i] * (placeholder1 + ((y_outer_x_outer_fused * 784) + (k * 16)))[i];
    }
  }
  /* horizontal reduction of the 16 lanes into the y-th output */
  compute[y_outer_x_outer_fused] = 0.000000e+00f;
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[0]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[1]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[2]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[3]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[4]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[5]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[6]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[7]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[8]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[9]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[10]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[11]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[12]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[13]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[14]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[15]);
}
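/* For reference, the kernel above is one dot product of the 784-element
   input with row y of the 10x784 weight matrix, split into 16 accumulator
   lanes. A scalar equivalent (kernel_ref is a hypothetical helper, not part
   of the TVM-generated code) would be: */
void kernel_ref(int32_t y, float* compute, float* placeholder, float* placeholder1) {
  float acc = 0.000000e+00f;
  int32_t j;
  for (j = 0; j < 784; ++j) {
    acc += placeholder[j] * placeholder1[(y * 784) + j];
  }
  compute[y] = acc;
}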
void parallel(float* compute,
              float* placeholder,
              float* placeholder1) {
  int32_t y_outer_x_outer_fused;
#pragma omp parallel for
  for (y_outer_x_outer_fused = 0; y_outer_x_outer_fused < 10; ++y_outer_x_outer_fused) {
    kernel(y_outer_x_outer_fused, compute, placeholder, placeholder1);
  }
}
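/* Calling convention for the fused functions below: TVM passes arguments as
   an array of TVMValue unions; each v_handle points at a TVMArray (a plain C
   tensor object that does not manage memory), whose data member is the raw
   buffer. */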
int32_t fused_nn_dense_add(void* args, void* arg_type_ids, int32_t num_args) {
  void* arg0 = (((TVMValue*)args)[0].v_handle);
  void* arg1 = (((TVMValue*)args)[1].v_handle);
  void* arg2 = (((TVMValue*)args)[2].v_handle);
  void* arg3 = (((TVMValue*)args)[3].v_handle);
  float* placeholder = (float*)(((TVMArray*)arg0)[0].data);
  float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data);
  float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data);
  float* T_add = (float*)(((TVMArray*)arg3)[0].data);
  float compute[10];
  int32_t ax1;
  parallel(compute, placeholder, placeholder1);
  /* add the bias to each of the 10 dense outputs */
  for (ax1 = 0; ax1 < 10; ++ax1) {
    T_add[ax1] = (compute[ax1] + placeholder2[ax1]);
  }
  return 0;
}
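/* A minimal caller sketch, assuming only the TVMValue/TVMArray definitions
   from the TVM runtime headers; the buffer names (input, weights, bias, out)
   and the helper itself are illustrative, not part of the generated code: */
static int32_t call_dense_add(float* input, float* weights, float* bias, float* out) {
  TVMArray t0, t1, t2, t3; /* only the data field is read by the callee */
  TVMValue vals[4];
  t0.data = input;   /* 784 floats         */
  t1.data = weights; /* 10 x 784 floats    */
  t2.data = bias;    /* 10 floats          */
  t3.data = out;     /* 10 floats (output) */
  vals[0].v_handle = &t0;
  vals[1].v_handle = &t1;
  vals[2].v_handle = &t2;
  vals[3].v_handle = &t3;
  return fused_nn_dense_add(vals, 0, 4);
}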
int32_t fused_nn_softmax(void* args, void* arg_type_ids, int32_t num_args) {
  void* arg0 = (((TVMValue*)args)[0].v_handle);
  void* arg1 = (((TVMValue*)args)[1].v_handle);
  float* placeholder = (float*)(((TVMArray*)arg0)[0].data);
  float* tensor = (float*)(((TVMArray*)arg1)[0].data);
  float tensor1[1];
  float tensor2[10];
  float tensor3[1];
  int32_t k1, ax1, k2, ax11;
  /* running maximum over the 10 logits */
  tensor1[0] = -3.402823e+38f;
  for (k1 = 0; k1 < 10; ++k1) {
    float _1 = tensor1[0];
    float _2 = placeholder[k1];
    tensor1[0] = ((_1) > (_2) ? (_1) : (_2));
  }
  /* exponentiate the shifted logits */
  for (ax1 = 0; ax1 < 10; ++ax1) {
    tensor2[ax1] = expf((placeholder[ax1] - tensor1[0]));
  }
  /* sum of exponentials */
  tensor3[0] = 0.000000e+00f;
  for (k2 = 0; k2 < 10; ++k2) {
    tensor3[0] = (tensor3[0] + tensor2[k2]);
  }
  /* normalize */
  for (ax11 = 0; ax11 < 10; ++ax11) {
    tensor[ax11] = (tensor2[ax11] / tensor3[0]);
  }
  return 0;
}
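/* The four passes above are the numerically stable softmax over 10 logits:
   tensor[i] = expf(x[i] - max(x)) / sum_j expf(x[j] - max(x)).
   Seeding the running maximum with -3.402823e+38f (-FLT_MAX) and shifting
   every logit by it keeps expf from overflowing for large inputs without
   changing the result. */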
int32_t mlp_wrapper(float* placeholder,
                    float* placeholder1,
                    float* placeholder2,
                    float* tensor) {
  int32_t res, res1, res2;
  a0[0].data = placeholder;
  a1[0].data = placeholder1;
  a2[0].data = placeholder2;
#ifdef BAMBU_PROFILING
  __builtin_bambu_time_start();
#endif

  res = mlp(param1, param2);

#ifdef BAMBU_PROFILING
  __builtin_bambu_time_stop();
#endif

  return res;
}
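/* A hypothetical test driver, assuming the file-scope a0/a1/a2 and
   param1/param2 used by mlp_wrapper are initialized as in the full source,
   and that the argument order is (input, weights, bias, output). With zero
   weights and bias the dense output is all zeros, so the softmax should be
   uniform: every class prints 0.1. */
#include <stdio.h>

int main(void) {
  static float input[784];
  static float weights[10 * 784]; /* zero-initialized */
  static float bias[10];
  static float probs[10];
  int i;
  for (i = 0; i < 784; ++i) {
    input[i] = 1.0f; /* dummy input */
  }
  if (mlp_wrapper(input, weights, bias, probs) != 0) {
    return 1;
  }
  for (i = 0; i < 10; ++i) {
    printf("class %d: %f\n", i, probs[i]);
  }
  return 0;
}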