PandA-2024.02
e1_mlp.parallel.c
#include "c_backend_api.h"
#include <math.h>

#ifdef BAMBU_PROFILING
extern void __builtin_bambu_time_start();
extern void __builtin_bambu_time_stop();
#endif
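
/*
 * Single-layer MLP inference as emitted by the TVM C backend:
 *   T_add[j] = dot(input[0..783], weights[j][0..783]) + bias[j]   for j = 0..9
 *   tensor   = softmax(T_add)
 * The dense layer is split into ten independent dot products that can run
 * in parallel.
 */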

/* Statically allocated TVM packed-call argument buffers shared by the
 * wrappers below. */
TVMValue param1[4];
TVMValue param2[2];
TVMArray a0[1];
TVMArray a1[1];
TVMArray a2[1];
TVMArray a3[1];
TVMArray b0[1];
TVMArray b1[1];

/* One output neuron: accumulate the 784-element dot product of the input
 * (placeholder) with row y_outer_x_outer_fused of the weight matrix
 * (placeholder1), using 16 independent accumulator lanes over 49 chunks. */
__attribute__((noinline))
void kernel(int32_t y_outer_x_outer_fused, float* compute, float* placeholder, float* placeholder1)
{
  float compute1[16] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
                        0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
  int32_t k, i;
  for (k = 0; k < 49; ++k) {
    for (i = 0; i < 16; ++i)
      compute1[i] = compute1[i] + (placeholder + (k * 16))[i] * (placeholder1 + ((y_outer_x_outer_fused * 784) + (k * 16)))[i];
  }
  /* Fully unrolled reduction of the 16 partial sums. */
  compute[y_outer_x_outer_fused] = 0.000000e+00f;
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[0]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[1]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[2]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[3]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[4]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[5]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[6]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[7]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[8]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[9]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[10]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[11]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[12]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[13]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[14]);
  compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[15]);
}

/* Evaluate all ten output neurons; the iterations are independent, so the
 * loop is marked as an OpenMP parallel-for. */
__attribute__((noinline))
void parallel(float* compute, float* placeholder, float* placeholder1)
{
  int32_t y_outer_x_outer_fused;
#pragma omp parallel for
  for (y_outer_x_outer_fused = 0; y_outer_x_outer_fused < 10; ++y_outer_x_outer_fused) {
    kernel(y_outer_x_outer_fused, compute, placeholder, placeholder1);
  }
}

__attribute__((noinline))
int32_t fused_nn_dense_add(void* args, void* arg_type_ids, int32_t num_args)
{
  /* Unpack the TVM packed-call arguments: input, weights, bias, output. */
  void* arg0 = (((TVMValue*)args)[0].v_handle);
  void* arg1 = (((TVMValue*)args)[1].v_handle);
  void* arg2 = (((TVMValue*)args)[2].v_handle);
  void* arg3 = (((TVMValue*)args)[3].v_handle);
  float* placeholder = (float*)(((TVMArray*)arg0)[0].data);
  float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data);
  float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data);
  float* T_add = (float*)(((TVMArray*)arg3)[0].data);
  float compute[10];

  /* Dense layer: ten dot products, computed in parallel. */
  parallel(compute, placeholder, placeholder1);

  /* Bias add. */
  int32_t ax1;
  for (ax1 = 0; ax1 < 10; ++ax1) {
    T_add[ax1] = (compute[ax1] + placeholder2[ax1]);
  }
  return 0;
}

__attribute__((noinline))
int32_t fused_nn_softmax(void* args, void* arg_type_ids, int32_t num_args)
{
  void* arg0 = (((TVMValue*)args)[0].v_handle);
  void* arg1 = (((TVMValue*)args)[1].v_handle);
  float* placeholder = (float*)(((TVMArray*)arg0)[0].data);
  float* tensor = (float*)(((TVMArray*)arg1)[0].data);
  float tensor1[1];
  float tensor2[10];
  float tensor3[1];
  /* Find the maximum logit; subtracting it before expf keeps the
   * exponentials in range (numerically stable softmax). */
  tensor1[0] = -3.402823e+38f; /* -FLT_MAX */
  int32_t k1;
  for (k1 = 0; k1 < 10; ++k1) {
    float _1 = tensor1[0];
    float _2 = placeholder[k1];
    tensor1[0] = ((_1) > (_2) ? (_1) : (_2));
  }
  int32_t ax1;
  for (ax1 = 0; ax1 < 10; ++ax1) {
    tensor2[ax1] = expf(placeholder[ax1] - tensor1[0]);
  }
  /* Normalize by the sum of the exponentials. */
  tensor3[0] = 0.000000e+00f;
  int32_t k2;
  for (k2 = 0; k2 < 10; ++k2) {
    tensor3[0] = tensor3[0] + tensor2[k2];
  }
  int32_t ax11;
  for (ax11 = 0; ax11 < 10; ++ax11) {
    tensor[ax11] = tensor2[ax11] / tensor3[0];
  }
  return 0;
}

/* Run the two fused TVM operators in sequence: dense+bias, then softmax. */
int32_t mlp(TVMValue* param1, TVMValue* param2)
{
  int32_t res1, res2;
  res1 = fused_nn_dense_add(param1, 0, 0);
  res2 = fused_nn_softmax(param2, 0, 0);
  return res2;
}

int32_t mlp_wrapper(float* placeholder, float* placeholder1, float* placeholder2, float* tensor)
{
  float T_add[10];
  int32_t res;
  /* Wire the raw pointers into the packed-call structures: param1 feeds
   * fused_nn_dense_add, param2 feeds fused_nn_softmax, and T_add carries
   * the dense-layer result between the two. */
  a0[0].data = placeholder;
  a1[0].data = placeholder1;
  a2[0].data = placeholder2;
  a3[0].data = T_add;
  param1[0].v_handle = a0;
  param1[1].v_handle = a1;
  param1[2].v_handle = a2;
  param1[3].v_handle = a3;

  b0[0].data = T_add;
  b1[0].data = tensor;
  param2[0].v_handle = b0;
  param2[1].v_handle = b1;

#ifdef BAMBU_PROFILING
  __builtin_bambu_time_start();
#endif

  res = mlp(param1, param2);

#ifdef BAMBU_PROFILING
  __builtin_bambu_time_stop();
#endif

  return res;
}
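
/* A minimal sketch of a host-side driver for mlp_wrapper, assuming a plain
 * software build; it is not part of the original file. MLP_EXAMPLE_MAIN is
 * a hypothetical guard, and the uniform input/weight values below are
 * placeholder data, so every class probability should come out near 0.1. */
#ifdef MLP_EXAMPLE_MAIN
#include <stdio.h>

int main(void)
{
  static float input[784];        /* flattened 28x28 image */
  static float weights[10 * 784]; /* one 784-element row per class */
  static float bias[10];
  float probs[10];
  int i;

  for (i = 0; i < 784; ++i) input[i] = 1.0f / 784.0f;
  for (i = 0; i < 10 * 784; ++i) weights[i] = 0.01f;
  for (i = 0; i < 10; ++i) bias[i] = 0.0f;

  mlp_wrapper(input, weights, bias, probs);
  for (i = 0; i < 10; ++i)
    printf("class %d: %f\n", i, probs[i]);
  return 0;
}
#endif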