99#include  <math.h> 
1010#include  <complex.h> 
1111#include  <string.h> 
12+ #if  _OPENMP 
13+ #include  <omp.h> 
14+ #endif 
1215#define  TACO_MIN (_a ,_b ) ((_a) < (_b) ? (_a) : (_b))
1316#define  TACO_MAX (_a ,_b ) ((_a) > (_b) ? (_a) : (_b))
1417#define  TACO_DEREF (_a ) (((___context___*)(*__ctx__))->_a)
@@ -26,6 +29,10 @@ typedef struct {
2629  int32_t       vals_size ;     // values array size 
2730} taco_tensor_t ;
2831#endif 
32+ #if  !_OPENMP 
33+ int  omp_get_thread_num () { return  0 ; }
34+ int  omp_get_max_threads () { return  1 ; }
35+ #endif 
2936int  cmp (const  void  * a , const  void  * b ) {
3037  return  * ((const  int * )a ) -  * ((const  int * )b );
3138}
@@ -122,14 +129,18 @@ int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
122129  int *  restrict C2_crd  =  (int * )(C -> indices [1 ][1 ]);
123130  double *  restrict C_vals  =  (double * )(C -> vals );
124131
132+   double *  restrict workspace_all  =  0 ;
133+   int32_t *  restrict workspace_index_list_all  =  0 ;
134+   workspace_index_list_all  =  (int32_t * )malloc (sizeof (int32_t ) *  (C2_dimension  *  omp_get_max_threads ()));
135+   bool *  restrict workspace_already_set_all  =  calloc ((C2_dimension  *  omp_get_max_threads ()), sizeof (bool ));
136+   workspace_all  =  (double * )malloc (sizeof (double ) *  (C2_dimension  *  omp_get_max_threads ()));
137+ 
125138  #pragma  omp parallel for schedule(runtime)
126139  for  (int32_t  i  =  0 ; i  <  B1_dimension ; i ++ ) {
127140    int32_t  workspace_index_list_size  =  0 ;
128-     double *  restrict workspace  =  0 ;
129-     int32_t *  restrict workspace_index_list  =  0 ;
130-     workspace_index_list  =  (int32_t * )malloc (sizeof (int32_t ) *  C2_dimension );
131-     bool *  restrict workspace_already_set  =  calloc (C2_dimension , sizeof (bool ));
132-     workspace  =  (double * )malloc (sizeof (double ) *  C2_dimension );
141+     double *  restrict workspace  =  workspace_all  +  C2_dimension  *  omp_get_thread_num ();
142+     int32_t *  restrict workspace_index_list  =  workspace_index_list_all  +  C2_dimension  *  omp_get_thread_num ();
143+     bool *  restrict workspace_already_set  =  workspace_already_set_all  +  C2_dimension  *  omp_get_thread_num ();
133144    for  (int32_t  kB  =  B2_pos [i ]; kB  <  B2_pos [(i  +  1 )]; kB ++ ) {
134145      int32_t  k  =  B2_crd [kB ];
135146      for  (int32_t  jC  =  C2_pos [k ]; jC  <  C2_pos [(k  +  1 )]; jC ++ ) {
@@ -153,11 +164,12 @@ int compute(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
153164      A_vals [pA2 ] =  workspace [j ];
154165      workspace_already_set [j ] =  0 ;
155166    }
156-     free (workspace_index_list );
157-     free (workspace_already_set );
158-     free (workspace );
159167  }
160168
169+   free (workspace_index_list_all );
170+   free (workspace_already_set_all );
171+   free (workspace_all );
172+ 
161173  for  (int32_t  p  =  0 ; p  <  A1_dimension ; p ++ ) {
162174    A2_pos [A1_dimension  -  p ] =  A2_pos [((A1_dimension  -  p ) -  1 )];
163175  }
@@ -184,12 +196,15 @@ int assemble(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
184196  int32_t *  restrict A2_nnz  =  0 ;
185197  A2_nnz  =  (int32_t * )malloc (sizeof (int32_t ) *  B1_dimension );
186198
199+   int32_t *  restrict qworkspace_index_list_all  =  0 ;
200+   qworkspace_index_list_all  =  (int32_t * )malloc (sizeof (int32_t ) *  (C2_dimension  *  omp_get_max_threads ()));
201+   bool *  restrict qworkspace_already_set_all  =  calloc ((C2_dimension  *  omp_get_max_threads ()), sizeof (bool ));
202+ 
187203  #pragma  omp parallel for schedule(runtime)
188204  for  (int32_t  i  =  0 ; i  <  B1_dimension ; i ++ ) {
189205    int32_t  qworkspace_index_list_size  =  0 ;
190-     int32_t *  restrict qworkspace_index_list  =  0 ;
191-     qworkspace_index_list  =  (int32_t * )malloc (sizeof (int32_t ) *  C2_dimension );
192-     bool *  restrict qworkspace_already_set  =  calloc (C2_dimension , sizeof (bool ));
206+     int32_t *  restrict qworkspace_index_list  =  qworkspace_index_list_all  +  C2_dimension  *  omp_get_thread_num ();
207+     bool *  restrict qworkspace_already_set  =  qworkspace_already_set_all  +  C2_dimension  *  omp_get_thread_num ();
193208    for  (int32_t  kB  =  B2_pos [i ]; kB  <  B2_pos [(i  +  1 )]; kB ++ ) {
194209      int32_t  k  =  B2_crd [kB ];
195210      for  (int32_t  jC  =  C2_pos [k ]; jC  <  C2_pos [(k  +  1 )]; jC ++ ) {
@@ -208,10 +223,11 @@ int assemble(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
208223      qworkspace_already_set [j ] =  0 ;
209224    }
210225    A2_nnz [i ] =  tjA2_nnz_val ;
211-     free (qworkspace_index_list );
212-     free (qworkspace_already_set );
213226  }
214227
228+   free (qworkspace_index_list_all );
229+   free (qworkspace_already_set_all );
230+ 
215231  A2_pos  =  (int32_t * )malloc (sizeof (int32_t ) *  (A1_dimension  +  1 ));
216232  A2_pos [0 ] =  0 ;
217233  for  (int32_t  i  =  0 ; i  <  A1_dimension ; i ++ ) {
@@ -220,12 +236,15 @@ int assemble(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
220236  A2_crd  =  (int32_t * )malloc (sizeof (int32_t ) *  A2_pos [A1_dimension ]);
221237  A_vals  =  (double * )malloc (sizeof (double ) *  A2_pos [A1_dimension ]);
222238
239+   int32_t *  restrict workspace_index_list_all  =  0 ;
240+   workspace_index_list_all  =  (int32_t * )malloc (sizeof (int32_t ) *  (C2_dimension  *  omp_get_max_threads ()));
241+   bool *  restrict workspace_already_set_all  =  calloc ((C2_dimension  *  omp_get_max_threads ()), sizeof (bool ));
242+ 
223243  #pragma  omp parallel for schedule(runtime)
224244  for  (int32_t  i  =  0 ; i  <  B1_dimension ; i ++ ) {
225245    int32_t  workspace_index_list_size  =  0 ;
226-     int32_t *  restrict workspace_index_list  =  0 ;
227-     workspace_index_list  =  (int32_t * )malloc (sizeof (int32_t ) *  C2_dimension );
228-     bool *  restrict workspace_already_set  =  calloc (C2_dimension , sizeof (bool ));
246+     int32_t *  restrict workspace_index_list  =  workspace_index_list_all  +  C2_dimension  *  omp_get_thread_num ();
247+     bool *  restrict workspace_already_set  =  workspace_already_set_all  +  C2_dimension  *  omp_get_thread_num ();
229248    for  (int32_t  kB0  =  B2_pos [i ]; kB0  <  B2_pos [(i  +  1 )]; kB0 ++ ) {
230249      int32_t  k  =  B2_crd [kB0 ];
231250      for  (int32_t  jC0  =  C2_pos [k ]; jC0  <  C2_pos [(k  +  1 )]; jC0 ++ ) {
@@ -246,10 +265,11 @@ int assemble(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
246265      A2_crd [pA2 ] =  j ;
247266      workspace_already_set [j ] =  0 ;
248267    }
249-     free (workspace_index_list );
250-     free (workspace_already_set );
251268  }
252269
270+   free (workspace_index_list_all );
271+   free (workspace_already_set_all );
272+ 
253273  for  (int32_t  p  =  0 ; p  <  A1_dimension ; p ++ ) {
254274    A2_pos [A1_dimension  -  p ] =  A2_pos [((A1_dimension  -  p ) -  1 )];
255275  }
@@ -281,12 +301,15 @@ int evaluate(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
281301  int32_t *  restrict A2_nnz  =  0 ;
282302  A2_nnz  =  (int32_t * )malloc (sizeof (int32_t ) *  B1_dimension );
283303
304+   int32_t *  restrict qworkspace_index_list_all  =  0 ;
305+   qworkspace_index_list_all  =  (int32_t * )malloc (sizeof (int32_t ) *  (C2_dimension  *  omp_get_max_threads ()));
306+   bool *  restrict qworkspace_already_set_all  =  calloc ((C2_dimension  *  omp_get_max_threads ()), sizeof (bool ));
307+ 
284308  #pragma  omp parallel for schedule(runtime)
285309  for  (int32_t  i  =  0 ; i  <  B1_dimension ; i ++ ) {
286310    int32_t  qworkspace_index_list_size  =  0 ;
287-     int32_t *  restrict qworkspace_index_list  =  0 ;
288-     qworkspace_index_list  =  (int32_t * )malloc (sizeof (int32_t ) *  C2_dimension );
289-     bool *  restrict qworkspace_already_set  =  calloc (C2_dimension , sizeof (bool ));
311+     int32_t *  restrict qworkspace_index_list  =  qworkspace_index_list_all  +  C2_dimension  *  omp_get_thread_num ();
312+     bool *  restrict qworkspace_already_set  =  qworkspace_already_set_all  +  C2_dimension  *  omp_get_thread_num ();
290313    for  (int32_t  kB  =  B2_pos [i ]; kB  <  B2_pos [(i  +  1 )]; kB ++ ) {
291314      int32_t  k  =  B2_crd [kB ];
292315      for  (int32_t  jC  =  C2_pos [k ]; jC  <  C2_pos [(k  +  1 )]; jC ++ ) {
@@ -305,10 +328,11 @@ int evaluate(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
305328      qworkspace_already_set [j ] =  0 ;
306329    }
307330    A2_nnz [i ] =  tjA2_nnz_val ;
308-     free (qworkspace_index_list );
309-     free (qworkspace_already_set );
310331  }
311332
333+   free (qworkspace_index_list_all );
334+   free (qworkspace_already_set_all );
335+ 
312336  A2_pos  =  (int32_t * )malloc (sizeof (int32_t ) *  (A1_dimension  +  1 ));
313337  A2_pos [0 ] =  0 ;
314338  for  (int32_t  i  =  0 ; i  <  A1_dimension ; i ++ ) {
@@ -317,14 +341,18 @@ int evaluate(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
317341  A2_crd  =  (int32_t * )malloc (sizeof (int32_t ) *  A2_pos [A1_dimension ]);
318342  A_vals  =  (double * )malloc (sizeof (double ) *  A2_pos [A1_dimension ]);
319343
344+   double *  restrict workspace_all  =  0 ;
345+   int32_t *  restrict workspace_index_list_all  =  0 ;
346+   workspace_index_list_all  =  (int32_t * )malloc (sizeof (int32_t ) *  (C2_dimension  *  omp_get_max_threads ()));
347+   bool *  restrict workspace_already_set_all  =  calloc ((C2_dimension  *  omp_get_max_threads ()), sizeof (bool ));
348+   workspace_all  =  (double * )malloc (sizeof (double ) *  (C2_dimension  *  omp_get_max_threads ()));
349+ 
320350  #pragma  omp parallel for schedule(runtime)
321351  for  (int32_t  i  =  0 ; i  <  B1_dimension ; i ++ ) {
322352    int32_t  workspace_index_list_size  =  0 ;
323-     double *  restrict workspace  =  0 ;
324-     int32_t *  restrict workspace_index_list  =  0 ;
325-     workspace_index_list  =  (int32_t * )malloc (sizeof (int32_t ) *  C2_dimension );
326-     bool *  restrict workspace_already_set  =  calloc (C2_dimension , sizeof (bool ));
327-     workspace  =  (double * )malloc (sizeof (double ) *  C2_dimension );
353+     double *  restrict workspace  =  workspace_all  +  C2_dimension  *  omp_get_thread_num ();
354+     int32_t *  restrict workspace_index_list  =  workspace_index_list_all  +  C2_dimension  *  omp_get_thread_num ();
355+     bool *  restrict workspace_already_set  =  workspace_already_set_all  +  C2_dimension  *  omp_get_thread_num ();
328356    for  (int32_t  kB0  =  B2_pos [i ]; kB0  <  B2_pos [(i  +  1 )]; kB0 ++ ) {
329357      int32_t  k  =  B2_crd [kB0 ];
330358      for  (int32_t  jC0  =  C2_pos [k ]; jC0  <  C2_pos [(k  +  1 )]; jC0 ++ ) {
@@ -350,11 +378,12 @@ int evaluate(taco_tensor_t *A, taco_tensor_t *B, taco_tensor_t *C) {
350378      A_vals [pA2 ] =  workspace [j ];
351379      workspace_already_set [j ] =  0 ;
352380    }
353-     free (workspace_index_list );
354-     free (workspace_already_set );
355-     free (workspace );
356381  }
357382
383+   free (workspace_index_list_all );
384+   free (workspace_already_set_all );
385+   free (workspace_all );
386+ 
358387  for  (int32_t  p  =  0 ; p  <  A1_dimension ; p ++ ) {
359388    A2_pos [A1_dimension  -  p ] =  A2_pos [((A1_dimension  -  p ) -  1 )];
360389  }
0 commit comments