00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040 #define __STDC_LIMIT_MACROS
00041 #include <stdint.h>
00042 #ifdef USE_PCH
00043 #include "lno_pch.h"
00044 #endif // USE_PCH
00045 #pragma hdrstop
00046
00047 #include <sys/types.h>
00048 #include <limits.h>
00049 #include "pu_info.h"
00050 #include "lnoutils.h"
00051 #include "lnopt_main.h"
00052 #include "stab.h"
00053 #include "targ_const.h"
00054 #include "wn_simp.h"
00055 #include "stdlib.h"
00056 #include "lwn_util.h"
00057 #include "strtab.h"
00058 #include "config.h"
00059 #include "optimizer.h"
00060 #include "opt_du.h"
00061 #include "name.h"
00062 #include "wintrinsic.h"
00063 #include "lno_bv.h"
00064 #include "dep_graph.h"
00065 #include "debug.h"
00066 #include "scalar_expand.h"
00067 #include "cxx_memory.h"
00068 #include "reduc.h"
00069 #include "snl_utils.h"
00070 #include "sxlist.h"
00071 #include "snl_dist.h"
00072 #include "permute.h"
00073 #include "sxlimit.h"
00074 #include "parallel.h"
00075 #include "fiz_fuse.h"
00076 #include "ara.h"
00077 #include "snl_deps.h"
00078 #include "lego_util.h"
00079 #include "tile.h"
00080 #include "model.h"
00081 #include "cache_model.h"
00082 #include "config_cache.h"
00083 #include "config_lno.h"
00084 #include "parmodel.h"
00085 #include "wn_mp.h"
00086 #include "call_info.h"
00087 #include "ipa_lno_cost.h"
00088
00089 INT PAR_STAT::id_count = 0;
00090 INT Parallel_Debug_Level = 3;
00091
00092
00093
00094
00095
00096
00097
00098 PAR_STAT::PAR_STAT()
00099 {
00100 _next = NULL;
00101 _prev = NULL;
00102 _parent = NULL;
00103 _first = NULL;
00104 _last = NULL;
00105 _depth = 0;
00106 _is_parallel = FALSE;
00107 _count = 0;
00108 _id = 0;
00109 _wn = NULL;
00110 _is_cloned = FALSE;
00111 _num_estimated_iters = -1;
00112 }
00113
00114
00115
00116
00117
00118
00119 PAR_STAT::PAR_STAT(PAR_STAT* ps)
00120 {
00121 FmtAssert(ps != NULL, ("Tried to copy from NULL source"));
00122 _next = NULL;
00123 _prev = NULL;
00124 _parent = NULL;
00125 _first = NULL;
00126 _last = NULL;
00127 _is_parallel = ps->_is_parallel;
00128 _depth = ps->_depth;
00129 _count = ps->_count;
00130 _id = ps->_id;
00131 _wn = ps->_wn;
00132 _is_cloned = TRUE;
00133 _num_estimated_iters = -1;
00134 }
00135
00136
00137
00138
00139
00140
00141
00142 BOOL PAR_STAT::Has_Loop()
00143 {
00144 if (WN_opcode(_wn) == OPC_DO_LOOP)
00145 return TRUE;
00146 for (PAR_STAT* ps = _first; ps != NULL; ps = ps->_next)
00147 if (ps->Has_Loop())
00148 return TRUE;
00149 return FALSE;
00150 }
00151
00152
00153
00154
00155
00156
00157
00158 BOOL PAR_STAT::Is_Outer_Loop()
00159 {
00160 if (WN_opcode(_wn) != OPC_DO_LOOP)
00161 return FALSE;
00162 for (PAR_STAT* ps = _parent; ps != NULL; ps = ps->_parent)
00163 if (WN_opcode(_wn) == OPC_DO_LOOP)
00164 return FALSE;
00165 return TRUE;
00166 }
00167
00168
00169
00170
00171
00172
00173
00174 BOOL PAR_STAT::Is_Inner_Loop()
00175 {
00176 if (WN_opcode(_wn) != OPC_DO_LOOP)
00177 return FALSE;
00178 for (PAR_STAT* ps = _first; ps != NULL; ps = ps->_next)
00179 if (ps->Has_Loop())
00180 return FALSE;
00181 return TRUE;
00182 }
00183
00184
00185
00186
00187
00188
00189
00190 BOOL PAR_STAT::Is_Parallel_Enclosed_Loop()
00191 {
00192 for (PAR_STAT* ps = this; ps != NULL; ps = ps->_parent)
00193 if (WN_opcode(_wn) == OPC_DO_LOOP && ps->_is_parallel)
00194 return TRUE;
00195 return FALSE;
00196 }
00197
00198
00199
00200
00201
00202
00203 void PAR_STAT::Remove()
00204 {
00205 if (_parent != NULL) {
00206 if (_parent->_first == this)
00207 _parent->_first = this->_next;
00208 if (_parent->_last == this)
00209 _parent->_last = this->_prev;
00210 }
00211 if (_prev != NULL)
00212 _prev->_next = this->_next;
00213 if (_next != NULL)
00214 _next->_prev = this->_prev;
00215 _next = NULL;
00216 _prev = NULL;
00217 _parent = NULL;
00218 }
00219
00220
00221
00222
00223
00224
00225
00226 void PAR_STAT::Make_Parent(PAR_STAT* ps_parent,
00227 BOOL first)
00228 {
00229 _parent = ps_parent;
00230 _depth = 0;
00231 if (ps_parent != NULL) {
00232 _depth = ps_parent->_depth + 1;
00233 if (first) {
00234 _next = ps_parent->_first;
00235 _prev = NULL;
00236 if (_next != NULL)
00237 _next->_prev = this;
00238 if (ps_parent->_last == NULL)
00239 ps_parent->_last = this;
00240 ps_parent->_first = this;
00241 } else {
00242 _prev = ps_parent->_last;
00243 _next = NULL;
00244 if (_prev != NULL)
00245 _prev->_next = this;
00246 if (ps_parent->_first == NULL)
00247 ps_parent->_first = this;
00248 ps_parent->_last = this;
00249 }
00250 }
00251 }
00252
00253
00254
00255
00256
00257
00258
00259
00260 void PAR_STAT::Make_Sibling(PAR_STAT* ps_sibling,
00261 BOOL above)
00262 {
00263 ps_sibling->_parent = _parent;
00264 if (above) {
00265 if (_parent != NULL && _parent->_first == this)
00266 _parent->_first = ps_sibling;
00267 if (_prev != NULL)
00268 _prev->_next = ps_sibling;
00269 ps_sibling->_prev = _prev;
00270 ps_sibling->_next = this;
00271 _prev = ps_sibling;
00272 } else {
00273 if (_parent != NULL && _parent->_last == this)
00274 _parent->_last = ps_sibling;
00275 if (_next != NULL)
00276 _next->_prev = ps_sibling;
00277 ps_sibling->_prev = this;
00278 ps_sibling->_next = _next;
00279 _next = ps_sibling;
00280 }
00281 }
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291 PAR_STAT* PAR_STAT::Find(WN* wn,
00292 BOOL uncloned)
00293 {
00294 if (wn == _wn && (!uncloned || !_is_cloned))
00295 return this;
00296 if (_first != NULL) {
00297 PAR_STAT* ps_first = _first->Find(wn, uncloned);
00298 if (ps_first != NULL)
00299 return ps_first;
00300 }
00301 if (_next != NULL) {
00302 PAR_STAT* ps_next = _next->Find(wn, uncloned);
00303 if (ps_next != NULL)
00304 return ps_next;
00305 }
00306 return NULL;
00307 }
00308
00309
00310
00311
00312
00313
00314
00315
00316
00317
00318 PAR_STAT* PAR_STAT::Innermost_Sandwiched_Code(PAR_STAT* ps_inner,
00319 BOOL above)
00320 {
00321 DOLOOP_STACK loop_stack(&LNO_local_pool);
00322 Build_Doloop_Stack(ps_inner->_wn, &loop_stack);
00323
00324 PAR_STAT* ps = this;
00325 PAR_STAT* ps_stat = NULL;
00326 INT start_depth = this->_depth;
00327 INT end_depth = ps_inner->_depth;
00328 for (INT i = start_depth + 1; i <= end_depth; i++) {
00329 WN* wn_loop = loop_stack.Bottom_nth(i);
00330 PAR_STAT* pss = ps->_first;
00331 if (above) {
00332 for (ps = pss; ps->_is_cloned || ps->_wn != wn_loop; ps = ps->_next)
00333 ps_stat = ps;
00334 } else {
00335 for (ps = pss; ps->_is_cloned || ps->_wn != wn_loop; ps = ps->_next);
00336 PAR_STAT* ps_save = ps;
00337 for (ps = ps->_next; ps != NULL; ps = ps->_next)
00338 ps_stat = ps;
00339 ps = ps_save;
00340 }
00341 }
00342 return ps_stat;
00343 }
00344
00345
00346
00347
00348
00349
00350
00351
00352
00353 PAR_STAT* PAR_STAT::Distribute(PAR_STAT* ps_inner,
00354 BOOL above)
00355 {
00356 DOLOOP_STACK loop_stack(&LNO_local_pool);
00357 Build_Doloop_Stack(ps_inner->_wn, &loop_stack);
00358
00359 PAR_STAT* ps_stat = Innermost_Sandwiched_Code(ps_inner, above);
00360 if (ps_stat == NULL)
00361 return NULL;
00362
00363 PAR_STAT* ps_dist = CXX_NEW(PAR_STAT(this), &LNO_local_pool);
00364 PAR_STAT* ps_old_loop = ps_dist;
00365 PAR_STAT* ps = this;
00366 INT start_depth = this->_depth;
00367 INT end_depth = ps_inner->_depth;
00368 for (INT i = start_depth + 1; i <= end_depth; i++) {
00369 WN* wn_loop = loop_stack.Bottom_nth(i);
00370 if (above) {
00371 PAR_STAT* pss = NULL;
00372 for (ps = ps->_first; ps->_is_cloned || ps->_wn != wn_loop; ps = pss) {
00373 pss = ps->_next;
00374 ps->Remove();
00375 ps->Make_Parent(ps_old_loop, FALSE);
00376 if (ps == ps_stat)
00377 break;
00378 }
00379 if (ps == ps_stat)
00380 break;
00381 PAR_STAT* ps_new_loop = CXX_NEW(PAR_STAT(ps), &LNO_local_pool);
00382 ps_new_loop->Make_Parent(ps_old_loop, FALSE);
00383 ps_old_loop = ps_new_loop;
00384 } else {
00385 PAR_STAT* pss = NULL;
00386 for (ps = ps->_first; ps->_is_cloned || ps->_wn != wn_loop; ps = pss)
00387 pss = ps->_next;
00388 PAR_STAT* ps_loop = ps;
00389 pss = NULL;
00390 for (ps = ps ->_next; ps != NULL; ps = pss) {
00391 pss = ps->_next;
00392 ps->Remove();
00393 ps->Make_Parent(ps_old_loop, FALSE);
00394 if (ps == ps_stat)
00395 break;
00396 }
00397 if (ps == ps_stat)
00398 break;
00399 ps = ps_loop;
00400 PAR_STAT* ps_new_loop = CXX_NEW(PAR_STAT(ps), &LNO_local_pool);
00401 ps_new_loop->Make_Parent(ps_old_loop, TRUE);
00402 ps_old_loop = ps_new_loop;
00403 }
00404 }
00405 Make_Sibling(ps_dist, above);
00406 return ps_dist;
00407 }
00408
00409
00410
00411
00412
00413
00414
00415
00416 PAR_STAT::PAR_STAT(WN* wn_tree,
00417 INT nloops,
00418 MEM_POOL* pool,
00419 PAR_STAT* ps_parent)
00420 {
00421 _next = NULL;
00422 _prev = NULL;
00423 _first = NULL;
00424 _last = NULL;
00425 _parent = NULL;
00426 _num_estimated_iters = -1;
00427 _depth = WN_opcode(wn_tree) == OPC_DO_LOOP
00428 ? Do_Loop_Depth(wn_tree) : Loop_Depth(wn_tree) + 1;
00429 _is_parallel = FALSE;
00430 if (WN_opcode(wn_tree) == OPC_DO_LOOP) {
00431 DO_LOOP_INFO* dli = Get_Do_Loop_Info(wn_tree);
00432 _count = MAX(dli->Est_Num_Iterations, 1);
00433 } else {
00434 _count = 0;
00435 }
00436 _wn = wn_tree;
00437 _id = id_count++;
00438 _is_cloned = FALSE;
00439 if (ps_parent != NULL)
00440 Make_Parent(ps_parent, FALSE);
00441 if (WN_opcode(wn_tree) == OPC_DO_LOOP && nloops >= 1) {
00442 WN* wn_first = WN_first(WN_do_body(wn_tree));
00443 for (WN* wn = wn_first; wn != NULL; wn = WN_next(wn))
00444 CXX_NEW(PAR_STAT(wn, nloops - 1, pool, this), pool);
00445 }
00446 }
00447
00448
00449
00450
00451
00452
00453
00454 PAR_STAT* PAR_STAT::Distribute_For_Permutation(WN* wn_outer,
00455 WN* wn_inner,
00456 INT permutation[],
00457 INT nloops)
00458 {
00459 DOLOOP_STACK stack(&LNO_local_pool);
00460 Build_Doloop_Stack(wn_inner, &stack);
00461 INT last = -1;
00462 INT outer_index = Do_Loop_Depth(wn_outer);
00463 PAR_STAT* ps_return = this;
00464 for (INT first = 0; first < nloops; first = last + 1) {
00465 last = Permutation_Last(first, permutation, nloops);
00466 INT first_depth = outer_index + first;
00467 INT last_depth = outer_index + last;
00468 PAR_STAT* ps_outer = Find(stack.Bottom_nth(first_depth), TRUE);
00469 PAR_STAT* ps_inner = Find(stack.Bottom_nth(last_depth), TRUE);
00470 PAR_STAT* ps_above = ps_outer->Distribute(ps_inner, TRUE);
00471 if (ps_outer == this && ps_above != NULL)
00472 ps_return = ps_above;
00473 PAR_STAT* ps_below = ps_outer->Distribute(ps_inner, FALSE);
00474 }
00475 return ps_return;
00476 }
00477
00478
00479
00480
00481
00482
00483
00484 PAR_STAT* PAR_STAT::Distribute_By_Splitting(WN* wn_outer,
00485 WN* wn_inner,
00486 INT nloops,
00487 INT split_depth)
00488 {
00489 if (wn_outer == NULL || nloops == 0)
00490 return this;
00491 INT outer_depth = Do_Loop_Depth(wn_outer);
00492 if (split_depth == -1 || split_depth == outer_depth)
00493 return this;
00494 DOLOOP_STACK local_stack(&LNO_local_pool);
00495 Build_Doloop_Stack(wn_inner, &local_stack);
00496 PAR_STAT* ps_inner = Find(local_stack.Bottom_nth(split_depth), TRUE);
00497 PAR_STAT* ps_above = Distribute(ps_inner, TRUE);
00498 Distribute(ps_inner, FALSE);
00499 return ps_above != NULL ? ps_above : this;
00500 }
00501
00502
00503
00504
00505
00506
00507
00508 void PAR_STAT::Permute_Loops(WN* wn_outer,
00509 WN* wn_inner,
00510 INT permutation[],
00511 INT nloops)
00512 {
00513 DOLOOP_STACK stack(&LNO_local_pool);
00514 Build_Doloop_Stack(wn_inner, &stack);
00515 PAR_STAT* ps_array = CXX_NEW_ARRAY(PAR_STAT, nloops, &LNO_local_pool);
00516 INT outer_depth = Do_Loop_Depth(wn_outer);
00517 PAR_STAT* ps = this;
00518 INT i;
00519 for (i = 0; i < nloops; i++, ps = ps->_first) {
00520 for (; ps->_is_cloned || ps->_wn != stack.Bottom_nth(outer_depth + i);
00521 ps = ps->_next);
00522 ps_array[i] = *ps;
00523 }
00524 ps = this;
00525 for (i = 0; i < nloops; i++, ps = ps->_first) {
00526 for (; ps->_is_cloned || ps->_wn != stack.Bottom_nth(outer_depth + i);
00527 ps = ps->_next);
00528 ps->_is_parallel = ps_array[permutation[i]]._is_parallel;
00529 ps->_count = ps_array[permutation[i]]._count;
00530 ps->_id = ps_array[permutation[i]]._id;
00531 ps->_wn = ps_array[permutation[i]]._wn;
00532 }
00533 CXX_DELETE_ARRAY(ps_array, &LNO_local_pool);
00534 }
00535
00536
00537
00538
00539
00540
00541
00542 PAR_STAT* PAR_STAT::Parallel_Interchange(WN* wn_outer,
00543 INT permutation[],
00544 INT nloops,
00545 INT parallel_depth,
00546 INT sd_split_depth,
00547 INT split_depth)
00548 {
00549 WN* wn_inner = SNL_Get_Inner_Snl_Loop(wn_outer, nloops);
00550 DOLOOP_STACK stack(&LNO_local_pool);
00551 Build_Doloop_Stack(wn_inner, &stack);
00552 PAR_STAT* ps_return = NULL;
00553 PAR_STAT* ps_root = Distribute_By_Splitting(wn_outer, wn_inner, nloops,
00554 sd_split_depth);
00555 if (ps_return == NULL && ps_root != this)
00556 ps_return = ps_root;
00557 ps_root = Distribute_For_Permutation(wn_outer, wn_inner, permutation,
00558 nloops);
00559 if (ps_return == NULL && ps_root != this)
00560 ps_return = ps_root;
00561 ps_root = Distribute_By_Splitting(wn_outer, wn_inner, nloops, split_depth);
00562 if (ps_return == NULL && ps_root != this)
00563 ps_return = ps_root;
00564 Permute_Loops(wn_outer, wn_inner, permutation, nloops);
00565 PAR_STAT* ps_parallel_loop = NULL;
00566 for (INT i = 0; i < stack.Elements(); i++) {
00567 WN* wn_loop = stack.Bottom_nth(i);
00568 PAR_STAT* ps_loop = Find(wn_loop, TRUE);
00569 if (ps_loop != NULL && ps_loop->_depth == parallel_depth) {
00570 ps_parallel_loop = ps_loop;
00571 break;
00572 }
00573 }
00574 if (ps_parallel_loop != NULL)
00575 ps_parallel_loop->_is_parallel = TRUE;
00576 return ps_return == NULL ? this : ps_return;
00577 }
00578
00579
00580
00581
00582
00583
00584
00585
00586
00587 static double Estimate_Cycles(WN* wn_loop)
00588 {
00589 double cost = 0.0;
00590 LWN_ITER* itr = LWN_WALK_TreeIter(WN_do_body(wn_loop));
00591 for (; itr != NULL; itr = LWN_WALK_TreeNext(itr)) {
00592 WN* wn = itr->wn;
00593 cost++;
00594 }
00595 return cost;
00596 }
00597
00598
00599
00600
00601
00602
00603
00604 static double Machine_Cost_Calls(WN* wn_loop)
00605 {
00606 DO_LOOP_INFO* dli = Get_Do_Loop_Info(wn_loop);
00607 if (!dli->Has_Calls)
00608 return 0;
00609 INT64 call_cost = 0;
00610 LWN_ITER* itr = LWN_WALK_TreeIter(wn_loop);
00611 for (; itr != NULL; itr = LWN_WALK_TreeNext(itr)) {
00612 WN* wn_call = itr->wn;
00613 if (WN_operator(wn_call) == OPR_CALL && Has_Execution_Cost(wn_call))
00614 call_cost += Simple_Execution_Cost(wn_call);
00615 }
00616 return (double) call_cost;
00617 }
00618
00619
00620
00621
00622
00623
00624
00625
00626
00627
00628
00629
00630
00631
00632 static double
00633 Estimate_IF_Cost(WN *wn_if, WN *inner_do_loop, WN2INT *se_needed,
00634 HASH_TABLE<WN *, BIT_VECTOR *> *invar_table)
00635 {
00636 Is_True(WN_operator(wn_if) == OPR_IF, ("not an IF"));
00637 Is_True(WN_operator(inner_do_loop) == OPR_DO_LOOP, ("not a DO_LOOP"));
00638
00639 REGISTER_MODEL test_model(&LNO_local_pool);
00640
00641 INT fp_regs_used_out = 0;
00642 INT int_regs_used_out = 0;
00643 INT tlb_used_out = 0;
00644
00645 double return_cost = 0.0;
00646 test_model.Add_Statement(WN_if_test(wn_if));
00647 test_model.Evaluate(inner_do_loop, se_needed, invar_table,
00648 &return_cost, &fp_regs_used_out, &int_regs_used_out,
00649 &tlb_used_out);
00650
00651 double then_cost = 0.0, else_cost = 0.0;
00652
00653 if (WN_first(WN_then(wn_if))) {
00654 REGISTER_MODEL then_model(&LNO_local_pool);
00655 double then_if_cost = 0.0;
00656
00657 for (WN *wn = WN_first(WN_then(wn_if)); wn; wn = WN_next(wn)) {
00658 if (WN_operator(wn) != OPR_IF)
00659 then_model.Add_Statement(wn);
00660 else
00661 then_if_cost += Estimate_IF_Cost(wn, inner_do_loop, se_needed,
00662 invar_table);
00663 }
00664
00665 then_model.Evaluate(inner_do_loop, se_needed, invar_table,
00666 &then_cost, &fp_regs_used_out, &int_regs_used_out,
00667 &tlb_used_out);
00668
00669 then_cost += then_if_cost;
00670 }
00671
00672 if (WN_first(WN_else(wn_if))) {
00673 REGISTER_MODEL else_model(&LNO_local_pool);
00674 double else_if_cost = 0.0;
00675
00676 for (WN *wn = WN_first(WN_else(wn_if)); wn; wn = WN_next(wn)) {
00677 if (WN_operator(wn) != OPR_IF)
00678 else_model.Add_Statement(wn);
00679 else
00680 else_if_cost += Estimate_IF_Cost(wn, inner_do_loop, se_needed,
00681 invar_table);
00682 }
00683
00684 else_model.Evaluate(inner_do_loop, se_needed, invar_table,
00685 &else_cost, &fp_regs_used_out, &int_regs_used_out,
00686 &tlb_used_out);
00687 else_cost += else_if_cost;
00688 }
00689
00690 return_cost += (then_cost > else_cost) ? then_cost : else_cost;
00691
00692 return return_cost;
00693 }
00694
00695
00696
00697
00698
00699
00700
00701
00702
00703
00704
00705
00706 static double SNL_Inner_Machine_Cost(WN* wn_total_outer,
00707 INT total_nloops,
00708 WN* wn_outer,
00709 INT nloops,
00710 INT parallel_depth,
00711 SX_PLIST* plist,
00712 double* work_estimate,
00713 BOOL include_calls)
00714 {
00715 extern void Mark_Invar(WN *region, INT num_loops, DOLOOP_STACK *do_stack,
00716 HASH_TABLE<WN *,BIT_VECTOR *> *htable,
00717 MEM_POOL *pool, BOOL outer_only);
00718
00719 REGISTER_MODEL machine_model(&LNO_local_pool);
00720 double loop_cycles = 0.0;
00721 double if_cycles = 0.0;
00722 DYN_ARRAY<WN *> if_list(&LNO_local_pool);
00723 INT fp_regs_used_out = 0;
00724 INT int_regs_used_out = 0;
00725 INT tlb_used_out = 0;
00726 INT outer_depth = Do_Loop_Depth(wn_outer);
00727 WN* wn_inner = SNL_Get_Inner_Snl_Loop(wn_outer, nloops);
00728 WN* wn_first = WN_first(WN_do_body(wn_inner));
00729 WN* wn = 0;
00730 for (wn = wn_first; wn != NULL; wn = WN_next(wn)) {
00731 if (WN_operator(wn) != OPR_IF)
00732 machine_model.Add_Statement(wn);
00733 else
00734 if_list.AddElement(wn);
00735 }
00736 WN2INT* se_needed = CXX_NEW(WN2INT(HASH_SIZE, &LNO_local_pool),
00737 &LNO_local_pool);
00738 LWN_ITER* itr = LWN_WALK_TreeIter(WN_do_body(wn_inner));
00739 for (; itr != NULL; itr = LWN_WALK_TreeNext(itr)) {
00740 WN* wn = itr->wn;
00741 OPERATOR opr = WN_operator(wn);
00742 if (opr != OPR_LDID && opr != OPR_STID)
00743 continue;
00744 TYPE_ID type = opr == OPR_STID
00745 ? WN_desc(wn) : WN_rtype(wn);
00746 SYMBOL symb(wn);
00747 if (plist != NULL) {
00748 SX_PITER ii(plist);
00749 BOOL is_found = FALSE;
00750 SX_PNODE *n, *found_n = NULL;
00751 for (n = ii.First(); n != NULL && !is_found; n = ii.Next()) {
00752 if (n->Symbol() == symb) {
00753 is_found = TRUE;
00754 SX_PNODE::STATUS status = n->Transformable(outer_depth);
00755 if (status != SX_PNODE::SE_NOT_REQD)
00756 found_n = n;
00757 }
00758 }
00759 if (found_n)
00760 se_needed->Enter(wn, 1);
00761 }
00762 }
00763
00764 DOLOOP_STACK *do_stack =
00765 CXX_NEW(DOLOOP_STACK(&LNO_local_pool), &LNO_local_pool);
00766 Build_Doloop_Stack(wn_inner, do_stack);
00767 HASH_TABLE<WN *,BIT_VECTOR *> invar_table(500, &LNO_local_pool);
00768 Mark_Invar(WN_do_body(wn_inner), nloops, do_stack,
00769 &invar_table, &LNO_local_pool, FALSE);
00770
00771 machine_model.Evaluate(wn_inner, se_needed, &invar_table, &loop_cycles,
00772 &fp_regs_used_out, &int_regs_used_out, &tlb_used_out);
00773
00774 for (INT i = 0; i < if_list.Elements(); i++)
00775 if_cycles += Estimate_IF_Cost(if_list[i], wn_inner, se_needed,
00776 &invar_table);
00777 loop_cycles += if_cycles;
00778
00779 if (include_calls) {
00780 double call_cycles = Machine_Cost_Calls(wn_inner);
00781 loop_cycles += call_cycles;
00782 }
00783 *work_estimate = loop_cycles;
00784 if (loop_cycles < 0.0)
00785 *work_estimate = Estimate_Cycles(wn_inner);
00786 if (wn_total_outer != wn_outer) {
00787 for (WN* wn = wn_inner; wn != NULL; wn = LWN_Get_Parent(wn)) {
00788 if (WN_opcode(wn) == OPC_DO_LOOP) {
00789 DO_LOOP_INFO* dli = Get_Do_Loop_Info(wn);
00790 *work_estimate *= (double) dli->Est_Num_Iterations;
00791 if (Do_Loop_Depth(wn) == Do_Loop_Depth(wn_total_outer) + total_nloops)
00792 break;
00793 }
00794 }
00795 }
00796
00797 WN* wn_last = LWN_Get_Parent(wn_total_outer);
00798 for (wn = wn_inner; wn != wn_last; wn = LWN_Get_Parent(wn)) {
00799 if (WN_opcode(wn) == OPC_DO_LOOP) {
00800 DO_LOOP_INFO* dli = Get_Do_Loop_Info(wn);
00801 double multiplier = (double) (Do_Loop_Depth(wn) == parallel_depth
00802 ? (dli->Est_Num_Iterations + NOMINAL_PROCS - 1) / NOMINAL_PROCS
00803 : dli->Est_Num_Iterations);
00804 loop_cycles *= multiplier;
00805 }
00806 }
00807 return loop_cycles;
00808 }
00809
00810
00811
00812
00813
00814
00815
00816
00817
00818
00819
00820 extern double SNL_Machine_Cost(WN* wn_outer,
00821 INT nloops,
00822 INT parallel_depth,
00823 SX_PLIST* plist,
00824 double* work_estimate,
00825 BOOL include_calls)
00826 {
00827 double local_work_estimate = 0.0;
00828 double total_work_estimate = 0.0;
00829 WN* wn_inner = SNL_Get_Inner_Snl_Loop(wn_outer, nloops);
00830 DO_LOOP_INFO* dli_inner = Get_Do_Loop_Info(wn_inner);
00831 if (dli_inner->Is_Inner)
00832 return SNL_Inner_Machine_Cost(wn_outer, nloops, wn_outer, nloops,
00833 parallel_depth, plist, work_estimate, include_calls);
00834
00835 FIZ_FUSE_INFO* ffi=
00836 CXX_NEW(FIZ_FUSE_INFO(&LNO_local_pool), &LNO_local_pool);
00837 ffi->Build(wn_outer, TRUE);
00838
00839 double total_cycles = 0;
00840 *work_estimate = 0.0;
00841 for (INT i = 0; i < ffi->Num_Snl(); i++) {
00842 if (ffi->Get_Type(i) != Inner)
00843 continue;
00844 WN* wn_loop = ffi->Get_Wn(i);
00845 INT nnloops = ffi->Get_Depth(i);
00846 double local_cycles = SNL_Inner_Machine_Cost(wn_outer, nloops, wn_loop,
00847 nnloops, parallel_depth, plist, &local_work_estimate, include_calls);
00848 total_work_estimate += local_work_estimate;
00849 total_cycles += local_cycles;
00850 }
00851 *work_estimate = total_work_estimate;
00852 return total_cycles;
00853 }
00854
00855
00856
00857
00858
00859
00860
00861
00862 extern double SNL_Min_Parallel_Overhead_Cost(WN* wn_outer,
00863 INT nloops,
00864 INT parallel_depth)
00865 {
00866 INT outer_depth = Do_Loop_Depth(wn_outer);
00867 WN* wn_inner = SNL_Get_Inner_Snl_Loop(wn_outer, nloops);
00868 DOLOOP_STACK stack(&LNO_local_pool);
00869 Build_Doloop_Stack(wn_inner, &stack);
00870 INT* small_trips = CXX_NEW_ARRAY(INT, nloops, &LNO_local_pool);
00871 INT i;
00872 for (i = outer_depth; i < outer_depth + nloops - 1; i++) {
00873 WN* wn_loop = stack.Bottom_nth(i);
00874 DO_LOOP_INFO* dli_loop = Get_Do_Loop_Info(wn_loop);
00875 small_trips[i - outer_depth] = dli_loop->Est_Num_Iterations;
00876 }
00877 INT snloops = parallel_depth - outer_depth;
00878 for (i = 0; i < snloops; i++) {
00879 for (INT j = i; j < nloops - 1; j++) {
00880 if (small_trips[j] < small_trips[j+1]) {
00881 INT temp = small_trips[j];
00882 small_trips[j] = small_trips[j+1];
00883 small_trips[j+1] = temp;
00884 }
00885 }
00886 }
00887 double total_cost = (double) LNO_Parallel_Overhead;
00888 for (i = 0; i < snloops; i++)
00889 total_cost *= (double) small_trips[i];
00890 return total_cost;
00891 }
00892
00893
00894
00895
00896
00897
00898
00899
00900 BOOL PAR_STAT::Invariant_Reduction(WN* wn_istore)
00901 {
00902 WN* wn_array = WN_kid1(wn_istore);
00903 ACCESS_ARRAY *aa = (ACCESS_ARRAY *) WN_MAP_Get(LNO_Info_Map, wn_array);
00904 for (INT i = 0; i < aa->Num_Vec(); i++) {
00905 ACCESS_VECTOR* av = aa->Dim(i);
00906 if (av->Too_Messy)
00907 return FALSE;
00908 #ifdef KEY //bug 8434
00909 if (av->Non_Const_Loops() >= _depth)
00910 return FALSE;
00911 #endif
00912 PAR_STAT* ps = NULL;
00913 for (ps = this; ps != NULL; ps = ps->_parent) {
00914 if (WN_opcode(ps->_wn) == OPC_DO_LOOP
00915 && av->Loop_Coeff(Do_Loop_Depth(ps->_wn)) != 0)
00916 return FALSE;
00917 if (ps->_is_parallel)
00918 break;
00919 }
00920 }
00921 return TRUE;
00922 }
00923
00924
00925
00926
00927
00928
00929
00930
00931
00932 void PAR_STAT::Reduction_List(REDUCTION_LIST *rlist)
00933 {
00934 REDUCTION_MANAGER* rm = red_manager;
00935 if (rm == NULL)
00936 return;
00937
00938 if (WN_opcode(_wn) == OPC_DO_LOOP) {
00939 for (PAR_STAT* ps = _first; ps != NULL; ps = ps->_next)
00940 ps->Reduction_List(rlist);
00941 } else {
00942 LWN_ITER* itr = LWN_WALK_TreeIter(_wn);
00943 for (; itr != NULL; itr = LWN_WALK_TreeNext(itr)) {
00944 WN* wn = itr->wn;
00945 OPERATOR opr = WN_operator(wn);
00946 if (opr == OPR_STID && rm->Which_Reduction(wn) != RED_NONE)
00947 rlist->AddElement(wn);
00948 else if (opr == OPR_ISTORE && rm->Which_Reduction(wn) != RED_NONE
00949 && Invariant_Reduction(wn))
00950 rlist->AddElement(wn);
00951 }
00952 }
00953 return;
00954 }
00955
00956
00957
00958
00959
00960
00961
00962
00963
00964 INT PAR_STAT::Num_Reductions()
00965 {
00966 REDUCTION_LIST rlist(&LNO_local_pool);
00967
00968 Reduction_List(&rlist);
00969 return rlist.Elements();
00970 }
00971
00972
00973
00974
00975
00976
00977
00978
00979 double PAR_STAT::Reduction_Cost()
00980 {
00981 REDUCTION_MANAGER* rm = red_manager;
00982 if (rm == NULL)
00983 return 0.0;
00984
00985 double red_cost = 0.0;
00986
00987
00988 if (Is_Inner_Loop() && Is_Parallel_Enclosed_Loop()) {
00989 INT red_count = Num_Reductions();
00990 MHD_LEVEL* Cur_Mhd = NULL;
00991 for (INT i = MHD_MAX_LEVELS - 1; i >= 0; i--) {
00992 Cur_Mhd = &Mhd.L[i];
00993 if (Cur_Mhd->Valid())
00994 break;
00995 }
00996 if (red_count > 0) {
00997 double dirty_penalty = (double) Cur_Mhd->Dirty_Miss_Penalty;
00998 double dirty_bytes = (double) red_count;
00999 dirty_bytes *= (double) NOMINAL_PROCS;
01000 PAR_STAT* ps = 0;
01001 for (ps = this; ps != NULL; ps = ps->_next)
01002 if (WN_opcode(ps->_wn) == OPC_DO_LOOP && ps->_is_parallel)
01003 break;
01004 for (ps = _parent; ps != NULL; ps = ps->_next)
01005 dirty_bytes *= ps->_count;
01006 red_cost += dirty_bytes * dirty_penalty;
01007 }
01008 }
01009
01010 if (_first != NULL)
01011 red_cost += _first->Reduction_Cost();
01012 if (_next != NULL)
01013 red_cost += _next->Reduction_Cost();
01014 return red_cost;
01015 }
01016
01017
01018
01019
01020
01021
01022
01023
01024
01025
01026
01027 static INT Sx_Depth(WN* wn_outer,
01028 INT permutation[],
01029 INT nloops,
01030 INT split_depth,
01031 SX_PLIST* plist)
01032 {
01033 INT outer_depth = Do_Loop_Depth(wn_outer);
01034 INT i;
01035 for (i = 0; i < nloops; i++)
01036 if (permutation[i] != i)
01037 break;
01038 INT sx_perm_depth = outer_depth + i;
01039 INT sx_par_depth = Split_Sx_Depth(wn_outer, nloops, plist, split_depth);
01040 INT sx_depth = sx_perm_depth;
01041 if (sx_par_depth != -1 && sx_par_depth < sx_depth)
01042 sx_depth = sx_par_depth;
01043 return sx_depth;
01044 }
01045
01046
01047
01048
01049
01050
01051
01052
01053
01054
01055
01056 static void Enter_Scalar_Expandable_Refs(WN* wn_outer,
01057 WN* wn_inner,
01058 INT permutation[],
01059 INT nloops,
01060 INT split_depth,
01061 ARRAY_REF* arl,
01062 SX_PLIST* plist)
01063 {
01064 INT outer_depth = Do_Loop_Depth(wn_outer);
01065 INT sx_depth = Sx_Depth(wn_outer, permutation, nloops, split_depth, plist);
01066 if (sx_depth >= outer_depth + nloops)
01067 return;
01068 BOOL* can_be_inner = CXX_NEW_ARRAY(BOOL, nloops, &LNO_local_pool);
01069 for (INT i = 0; i < nloops; i++)
01070 can_be_inner[i] = TRUE;
01071 LWN_ITER* itr = LWN_WALK_TreeIter(WN_do_body(wn_inner));
01072 for (; itr != NULL; itr = LWN_WALK_TreeNext(itr)) {
01073 WN* wn = itr->wn;
01074 OPERATOR opr = WN_operator(wn);
01075 if (opr != OPR_LDID && opr != OPR_STID)
01076 continue;
01077 TYPE_ID type = opr == OPR_STID
01078 ? WN_desc(wn) : WN_rtype(wn);
01079 SYMBOL symb(wn);
01080 SX_PITER ii(plist);
01081 BOOL is_found = FALSE;
01082 SX_PNODE *n, *found_n = NULL;
01083 for (n = ii.First(); n != NULL && !is_found; n = ii.Next()) {
01084 if (n->Symbol() == symb) {
01085 is_found = TRUE;
01086 SX_PNODE::STATUS status = n->Transformable(sx_depth, permutation,
01087 nloops);
01088 if (status == SX_PNODE::SE_REQD)
01089 found_n = n;
01090 }
01091 }
01092 if (found_n)
01093 arl->Enter_Scalar_Expand(wn, found_n, can_be_inner, nloops);
01094 }
01095 }
01096
01097
01098
01099
01100
01101
01102
01103
01104
01105 static INT Index_Variable_Ldid(WN* wn_node,
01106 INT nloops)
01107 {
01108 if (WN_operator(wn_node) != OPR_LDID)
01109 return FALSE;
01110
01111 INT loop_count = 0;
01112 for (WN* wn = wn_node; wn != NULL; wn = LWN_Get_Parent(wn)) {
01113 if (WN_operator(wn) == OPR_DO_LOOP) {
01114 loop_count++;
01115 if (SYMBOL(WN_index(wn)) == SYMBOL(wn_node))
01116 return loop_count;
01117 }
01118 if (loop_count == nloops)
01119 break;
01120 }
01121 return 0;
01122 }
01123
01124
01125
01126
01127
01128
01129
01130
01131
01132
01133 static void Loop_Index_Count_Traverse(WN* wn_tree,
01134 INT nloops,
01135 BIT_VECTOR* bv)
01136 {
01137 INT loop_position = Index_Variable_Ldid(wn_tree, nloops);
01138 if (loop_position > 0)
01139 bv->Set(loop_position - 1);
01140
01141 if (WN_operator(wn_tree) == OPR_BLOCK) {
01142 for (WN* wn = WN_first(wn_tree); wn != NULL; wn = WN_next(wn))
01143 Loop_Index_Count_Traverse(wn, nloops, bv);
01144 } else {
01145 for (INT i = 0; i < WN_kid_count(wn_tree); i++)
01146 Loop_Index_Count_Traverse(WN_kid(wn_tree, i), nloops, bv);
01147 }
01148 }
01149
01150
01151
01152
01153
01154
01155
01156 static INT Loop_Index_Count(WN* wn_tree,
01157 INT nloops)
01158 {
01159 BIT_VECTOR bv(nloops, &LNO_local_pool);
01160 Loop_Index_Count_Traverse(wn_tree, nloops, &bv);
01161 return bv.Pop_Count();
01162 }
01163
01164
01165
01166
01167
01168
01169
01170
01171
01172 static void SNL_Bad_Array_Footprints(WN* wn_loop,
01173 INT nloops,
01174 INT64 trip_product,
01175 double* bad_read_bytes,
01176 double* bad_write_bytes)
01177 {
01178 LWN_ITER* itr = LWN_WALK_TreeIter(wn_loop);
01179 for (; itr != NULL; itr = LWN_WALK_TreeNext(itr)) {
01180 WN* wn = itr->wn;
01181 if (Is_Bad_Array(wn, nloops)) {
01182 BOOL is_load = OPCODE_is_load(WN_opcode(wn));
01183 WN* wn_array = is_load ? WN_kid0(wn) : WN_kid1(wn);
01184 double bad_bytes = (double) WN_element_size(wn_array);
01185 if (bad_bytes < 0)
01186 bad_bytes = -bad_bytes;
01187 bad_bytes *= trip_product;
01188 if (is_load)
01189 *bad_read_bytes += bad_bytes;
01190 else
01191 *bad_write_bytes += bad_bytes;
01192 }
01193 }
01194 }
01195
01196
01197
01198
01199
01200
01201
01202
01203
01204
01205
01206
01207
01208
01209
01210 static double SNL_Inner_Cache_Cost(WN* wn_outer,
01211 WN* wn_inner,
01212 INT permutation[],
01213 INT nloops,
01214 INT parallel_depth,
01215 INT split_depth,
01216 SX_PLIST* plist,
01217 double *est_num_iters)
01218 {
01219 INT parallel_debug_level = Get_Trace(TP_LNOPT2, TT_LNO_PARALLEL_DEBUG)
01220 ? Parallel_Debug_Level : 0;
01221 INT outer_depth = Do_Loop_Depth(wn_outer);
01222 DOLOOP_STACK loop_stack(&LNO_local_pool);
01223 Build_Doloop_Stack(wn_inner, &loop_stack);
01224 ARRAY_REF* arl =
01225 CXX_NEW(ARRAY_REF(wn_inner, nloops, &LNO_local_pool,NULL), &LNO_local_pool);
01226 Enter_Scalar_Expandable_Refs(wn_outer, wn_inner, permutation, nloops,
01227 split_depth, arl, plist);
01228 INT depth = outer_depth + nloops - 1;
01229 INT* loops = CXX_NEW_ARRAY(INT, depth + 1, &LNO_local_pool);
01230 INT* iters = CXX_NEW_ARRAY(INT, depth + 1, &LNO_local_pool);
01231 INT* arl_stripsz = CXX_NEW_ARRAY(INT, depth + 1, &LNO_local_pool);
01232 mINT64* est_iters = CXX_NEW_ARRAY(mINT64, depth + 1, &LNO_local_pool);
01233 mINT64* max_iters = CXX_NEW_ARRAY(mINT64, depth + 1, &LNO_local_pool);
01234 INT* unrolls = CXX_NEW_ARRAY(INT, depth + 1, &LNO_local_pool);
01235 INT* permute_order = CXX_NEW_ARRAY(INT, depth + 1, &LNO_local_pool);
01236 INT snloops = nloops - (parallel_depth - outer_depth);
01237 INT i;
01238 for (i = 0; i <= depth; i++) {
01239 WN* wn_loop = loop_stack.Bottom_nth(i);
01240 DO_LOOP_INFO* dli_loop = Get_Do_Loop_Info(wn_loop);
01241 arl_stripsz[i] = 1;
01242 extern INT64 Get_Good_Num_Iters(DO_LOOP_INFO *dli);
01243 est_iters[i] = Get_Good_Num_Iters(dli_loop);
01244 max_iters[i] = UNBOUNDED_ITERS;
01245 unrolls[i] = 1;
01246 permute_order[i] = i < outer_depth
01247 ? i : outer_depth + permutation[i - outer_depth];
01248 }
01249 est_iters[outer_depth + permutation[parallel_depth - outer_depth]] =
01250 (est_iters[outer_depth + permutation[parallel_depth - outer_depth]]
01251 + NOMINAL_PROCS) / NOMINAL_PROCS;
01252 for (i = 0; i <= depth; i++)
01253 if (est_iters[i] <= 0)
01254 est_iters[i] = 1;
01255 for (i = 0; i < snloops; i++)
01256 loops[i] = outer_depth + permutation[i + parallel_depth - outer_depth];
01257 for (i = 0; i < nloops; i++)
01258 iters[i] = est_iters[outer_depth + permutation[i]];
01259 INT stripdepth = depth + 1;
01260 INT v_first = -1;
01261 DO_LOOP_INFO* dli_outer = Get_Do_Loop_Info(wn_outer);
01262 INT64 outersz = iters[parallel_depth - outer_depth];
01263 BOOL using_tlb = FALSE;
01264 INT middle_loop_no = loops[0];
01265 double clean_cost = 0.0;
01266 double dirty_cost = 0.0;
01267 double bad_read_bytes = 0;
01268 double bad_write_bytes = 0;
01269 INT64 trip_product = 1;
01270 for (i = parallel_depth - outer_depth; i < nloops; i++)
01271 trip_product *= iters[i];
01272 if (arl->Num_Bad() > 0)
01273 SNL_Bad_Array_Footprints(wn_inner, nloops, trip_product,
01274 &bad_read_bytes, &bad_write_bytes);
01275 for (i = 0; i < MHD_MAX_LEVELS; i++) {
01276 MHD_LEVEL* Cur_Mhd = &Mhd.L[i];
01277 if (!Cur_Mhd->Valid())
01278 continue;
01279 Set_Cache_Model_Statics(i);
01280 INT line_size = Cur_Mhd->Line_Size;
01281 double clean_penalty = double(Cur_Mhd->Clean_Miss_Penalty)/line_size;
01282 double dirty_penalty = double(Cur_Mhd->Dirty_Miss_Penalty)/line_size;
01283 COMPUTE_FOOTPRINT_RVAL rval = Compute_Footprint(arl, snloops, loops,
01284 arl_stripsz, est_iters, max_iters, unrolls, depth, stripdepth,
01285 permute_order, v_first, outersz, using_tlb, middle_loop_no);
01286 if (parallel_debug_level >= 3) {
01287 rval.Print(stdout);
01288 fprintf(stdout, "Evaluating: ");
01289 for (INT j = 0; j < snloops - 1; j++) {
01290 fprintf(stdout, "v%d=%d", j,
01291 iters[parallel_depth - outer_depth + 1 + j]);
01292 if ( j < snloops - 2)
01293 fprintf(stdout, " ");
01294 }
01295 fprintf(stdout, "\n");
01296 }
01297 INT offset = parallel_depth - outer_depth + 1;
01298 double read_bytes = rval.RFormula == NULL
01299 ? 0.0 : rval.RFormula->Eval(snloops - 1, &iters[offset]);
01300 read_bytes += bad_read_bytes;
01301 double write_bytes = rval.WFormula == NULL
01302 ? 0.0 : rval.WFormula->Eval(snloops - 1, &iters[offset]);
01303 write_bytes += bad_write_bytes;
01304 clean_cost += read_bytes * clean_penalty;
01305 dirty_cost += write_bytes * dirty_penalty;
01306 }
01307 double total_cost = clean_cost + dirty_cost;
01308 for (i = 0; i < parallel_depth - outer_depth; i++)
01309 total_cost *= iters[i];
01310 *est_num_iters = 1.0;
01311 for (i = 0; i < nloops; i++)
01312 *est_num_iters *= iters[i];
01313 *est_num_iters *= NOMINAL_PROCS;
01314 return total_cost;
01315 }
01316
01317
01318
01319
01320
01321
01322
01323
01324
01325
01326
01327
01328
01329
01330
01331
01332
01333 extern double SNL_Cache_Cost(WN* wn_outer,
01334 INT permutation[],
01335 INT nloops,
01336 INT parallel_depth,
01337 INT split_depth,
01338 SX_PLIST* plist,
01339 double *est_num_iters)
01340 {
01341 double _est_num_iters;
01342 *est_num_iters = 0.0;
01343
01344 WN* wn_inner = SNL_Get_Inner_Snl_Loop(wn_outer, nloops);
01345 DO_LOOP_INFO* dli_inner = Get_Do_Loop_Info(wn_inner);
01346 if (dli_inner->Is_Inner)
01347 return SNL_Inner_Cache_Cost(wn_outer, wn_inner, permutation, nloops,
01348 parallel_depth, split_depth, plist, est_num_iters);
01349
01350 FIZ_FUSE_INFO* ffi=
01351 CXX_NEW(FIZ_FUSE_INFO(&LNO_local_pool), &LNO_local_pool);
01352 ffi->Build(wn_outer, TRUE);
01353
01354 double total_cycles = 0;
01355 for (INT i = 0; i < ffi->Num_Snl(); i++) {
01356 if (ffi->Get_Type(i) != Inner)
01357 continue;
01358 WN* wn_loop = ffi->Get_Wn(i);
01359 INT nnloops = ffi->Get_Depth(i);
01360 INT depth_diff = Do_Loop_Depth(wn_loop) - Do_Loop_Depth(wn_outer);
01361 INT xnloops = nnloops + depth_diff;
01362 INT* xpermutation = CXX_NEW_ARRAY(INT, xnloops, &LNO_local_pool);
01363 INT j;
01364 for (j = 0; j < nloops; j++)
01365 xpermutation[j] = permutation[j];
01366 for (j = nloops; j < xnloops; j++)
01367 xpermutation[j] = j;
01368 WN* wn_inner = SNL_Get_Inner_Snl_Loop(wn_loop, nnloops);
01369 double local_cycles = SNL_Inner_Cache_Cost(wn_outer, wn_inner,
01370 xpermutation, xnloops, parallel_depth, split_depth, plist,
01371 &_est_num_iters);
01372 *est_num_iters += _est_num_iters;
01373 total_cycles += local_cycles;
01374 CXX_DELETE_ARRAY(xpermutation, &LNO_local_pool);
01375 }
01376 return total_cycles;
01377 }
01378
01379
01380
01381
01382
01383
01384
01385 INT PAR_STAT::Num_Refs()
01386 {
01387 INT num_refs = 0;
01388 if (WN_opcode(_wn) == OPC_DO_LOOP) {
01389 for (PAR_STAT* ps = _first; ps != NULL; ps = ps->_next)
01390 num_refs += ps->Num_Refs();
01391 } else {
01392 LWN_ITER* itr = LWN_WALK_TreeIter(_wn);
01393 for (; itr != NULL; itr = LWN_WALK_TreeNext(itr)) {
01394 OPERATOR opr = WN_operator(itr->wn);
01395 if (opr == OPR_ILOAD || opr == OPR_ISTORE)
01396 num_refs++;
01397 }
01398 }
01399 return num_refs;
01400 }
01401
01402
01403
01404
01405
01406
01407
01408
01409
01410
01411
01412
01413
01414
01415
01416
01417
01418
01419
01420
01421
01422
01423
01424
01425
01426
01427
01428
01429
01430
01431
01432
01433
01434
01435
01436
01437
01438
01439
01440
01441
01442
01443
01444
01445
01446
01447
01448
01449
01450
01451
01452
01453
01454
01455
01456
01457
01458
01459
01460
01461
01462
01463
01464
01465
01466 double PAR_STAT::Loop_Overhead_Cost()
01467 {
01468 double loop_cycles = 0.0;
01469 if (WN_operator(_wn) == OPR_DO_LOOP) {
01470 double multiplier = LOOP_CYCLES_PER_ITER;
01471 for (PAR_STAT* ps = this; ps != NULL; ps = ps->_parent) {
01472 if (WN_operator(ps->_wn) == OPR_DO_LOOP) {
01473 INT count = ps->_is_parallel
01474 ? (ps->_count + NOMINAL_PROCS - 1)/NOMINAL_PROCS : ps->_count;
01475 multiplier *= (double) count;
01476 }
01477 }
01478 loop_cycles += multiplier;
01479 }
01480 if (_first != NULL)
01481 loop_cycles += _first->Loop_Overhead_Cost();
01482 if (_next != NULL)
01483 loop_cycles += _next->Loop_Overhead_Cost();
01484 return loop_cycles;
01485 }
01486
01487
01488
01489
01490
01491
01492
01493 double PAR_STAT::Parallel_Overhead_Cost()
01494 {
01495 double parallel_cycles = 0;
01496 if (WN_opcode(_wn) == OPC_DO_LOOP && _is_parallel) {
01497
01498 #ifdef KEY
01499 double multiplier = (double) (LNO_Parallel_Overhead +
01500 NOMINAL_PROCS * LNO_Parallel_per_proc_overhead);
01501 #else
01502 double multiplier = (double) (LNO_Parallel_Overhead +
01503 NOMINAL_PROCS * 123);
01504 #endif
01505
01506 if (Is_Inner_Loop() && Is_Parallel_Enclosed_Loop()) {
01507
01508 REDUCTION_LIST rlist(&LNO_local_pool);
01509 BOOL dummy;
01510
01511 Reduction_List(&rlist);
01512 if (rlist.Elements() > 0)
01513 multiplier += MP_Reduction_Combine_Cycles(&rlist, &dummy);
01514 }
01515 BOOL first_loop=FALSE;
01516 for (PAR_STAT* ps = _parent; ps != NULL; ps = ps->_parent)
01517 if (WN_opcode(_wn) == OPC_DO_LOOP)
01518 if (first_loop)
01519 first_loop=FALSE;
01520 else
01521 multiplier *= (double) ps->_count;
01522 parallel_cycles += multiplier;
01523 }
01524 if (_first != NULL)
01525 parallel_cycles += _first->Parallel_Overhead_Cost();
01526 if (_next != NULL)
01527 parallel_cycles += _next->Parallel_Overhead_Cost();
01528 return parallel_cycles;
01529 }
01530
01531
01532
01533
01534
01535
01536
01537
01538
01539
01540
01541
01542
01543
01544
01545
01546
01547
01548
01549
01550 double PAR_STAT::Cycle_Count(WN* wn_outer,
01551 INT permutation[],
01552 INT nloops,
01553 INT parallel_depth,
01554 SX_PLIST* plist,
01555 INT split_depth,
01556 double machine_cycles,
01557 double *cache_cycles_per_iter,
01558 BOOL is_doacross)
01559 {
01560 INT parallel_debug_level = Get_Trace(TP_LNOPT2, TT_LNO_PARALLEL_DEBUG)
01561 ? Parallel_Debug_Level : 0;
01562 double reduction_cycles = Reduction_Cost();
01563 double loop_cycles = Loop_Overhead_Cost();
01564 double parallel_cycles = Parallel_Overhead_Cost();
01565 double est_num_iters;
01566 double cache_cycles = SNL_Cache_Cost(wn_outer, permutation, nloops,
01567 parallel_depth, split_depth, plist, &est_num_iters);
01568 Is_True(cache_cycles >= 0.0,
01569 ("PAR_STAT::Cycle_Count: Unexpected negative cache cycle count"));
01570 double total_cycles = machine_cycles + reduction_cycles + parallel_cycles
01571 + loop_cycles + cache_cycles;
01572 if (parallel_debug_level >= 2) {
01573 INT outer_depth = Do_Loop_Depth(wn_outer);
01574 fprintf(stdout, "Permutation = (");
01575 if (!is_doacross)
01576 for (INT i = 0; i < nloops; i++) {
01577 fprintf(stdout, "%d%s", permutation[i],
01578 i == parallel_depth - outer_depth ? "-P" : "");
01579 if (i < nloops - 1)
01580 fprintf(stdout, ",");
01581 }
01582 else
01583 for (INT i = 0; i < nloops; i++) {
01584 fprintf(stdout, "%d%s", permutation[i],
01585 parallel_depth == i ? "-X" : "");
01586 if (i < nloops - 1)
01587 fprintf(stdout, ",");
01588 }
01589 fprintf(stdout, ")\n");
01590 fprintf(stdout, " Machine cycles = %13.2f\n", machine_cycles);
01591 fprintf(stdout, " Reduction cycles = %13.2f\n", reduction_cycles);
01592 fprintf(stdout, " Loop Overhead cycles = %13.2f\n", loop_cycles);
01593 fprintf(stdout, " Parallel Overhead cycles = %13.2f\n", parallel_cycles);
01594 fprintf(stdout, " Cache cycles = %13.2f\n", cache_cycles);
01595 fprintf(stdout, " *Total cycles = %13.2f\n", total_cycles);
01596 }
01597
01598
01599 _num_estimated_iters = est_num_iters;
01600 *cache_cycles_per_iter = (cache_cycles * NOMINAL_PROCS) / est_num_iters;
01601 return total_cycles;
01602 }
01603
01604
01605
01606
01607
01608
01609
01610 #define PS_ASSERT(a, b) \
01611 ((a) ? 0 : (fprintf b, fprintf(fp, "\n"), error_count++))
01612
01613 INT PAR_STAT::Sanity_Check_Node(FILE* fp)
01614 {
01615 INT error_count = 0;
01616 PAR_STAT* ps_root = 0;
01617 for (ps_root = this; ps_root->_parent != NULL;
01618 ps_root = ps_root->_parent);
01619 for (; ps_root->_prev != NULL; ps_root = ps_root->_prev);
01620 PS_ASSERT(_next == NULL || _next->_prev == this,
01621 (fp, "PAR_STAT: Bad _next pointer 0x%p", this));
01622 PS_ASSERT(_prev == NULL || _prev->_next == this,
01623 (fp, "PAR_STAT: Bad _prev pointer 0x%p", this));
01624 if (_parent != NULL) {
01625 PAR_STAT* ps = 0;
01626 for (ps = _parent->_first; ps != NULL; ps = ps->_next)
01627 if (ps == this)
01628 break;
01629 PS_ASSERT(ps != NULL, (fp, "PAR_STAT: Bad _parent pointer 0x%p", this));
01630 }
01631 if (_first != NULL) {
01632 PS_ASSERT(_last != NULL,
01633 (fp, "PAR_STAT: _last not consistent with _first 0x%p", this));
01634 PAR_STAT* pss = NULL;
01635 for (PAR_STAT* ps = _first; ps != NULL; pss = ps, ps = ps->_next);
01636 PS_ASSERT(pss == _last,
01637 (fp, "PAR_STAT: _last not consistent with actual _last 0x%p", this));
01638 }
01639 if (_last != NULL)
01640 PS_ASSERT(_first != NULL,
01641 (fp, "PAR_STAT: _last not consistent with _first 0x%p", this));
01642 PS_ASSERT(WN_opcode(_wn) == OPC_DO_LOOP || _first == NULL && _last == NULL,
01643 (fp, "PAR_STAT: STATEMENT is not leaf node 0x%p", this));
01644 INT loop_count = 0;
01645 for (PAR_STAT* ps = this; ps != NULL; ps = ps->_parent)
01646 if (WN_opcode(ps->_wn) == OPC_DO_LOOP)
01647 loop_count++;
01648 loop_count += ps_root->_depth;
01649 INT depth = (WN_opcode(_wn) == OPC_DO_LOOP) ? loop_count - 1: loop_count;
01650 PS_ASSERT(_depth == depth, (fp, "PAR_STAT: Improper depth 0x%p", this));
01651 PS_ASSERT(WN_opcode(_wn) == OPC_DO_LOOP || !_is_parallel,
01652 (fp, "PAR_STAT: _is_parallel set on STATEMENT 0x%p", this));
01653 PS_ASSERT(WN_opcode(_wn) != OPC_DO_LOOP || _count > 0,
01654 (fp, "PAR_STAT: _count <= 0 on DO LOOP 0x%p", this));
01655 PS_ASSERT(_id <= id_count, (fp, "PAR_STAT: Improper _id value 0x%p", this));
01656 PS_ASSERT(!_is_cloned || ps_root->Find(_wn, TRUE) != NULL,
01657 (fp, "PAR_STAT: _is_cloned is set, but could not find original 0x%p",
01658 this));
01659 PS_ASSERT(WN_opcode(_wn) == OPC_DO_LOOP || !_is_cloned,
01660 (fp, "PAR_STAT: _is_cloned set on STATEMENT 0x%p", this));
01661 return error_count;
01662 }
01663
01664
01665
01666
01667
01668
01669
01670 INT PAR_STAT::Sanity_Check(FILE* fp)
01671 {
01672 INT error_count = Sanity_Check_Node(fp);
01673 if (_first != NULL)
01674 error_count += _first->Sanity_Check(fp);
01675 if (_next != NULL)
01676 error_count += _next->Sanity_Check(fp);
01677 return error_count;
01678 }
01679
01680
01681
01682
01683
01684
01685
01686 void PAR_STAT::Print(FILE* fp,
01687 INT indent_count)
01688 {
01689 if (this == NULL)
01690 fprintf(fp, "<NULL>\n");
01691 for (INT i = 0; i < indent_count; i++)
01692 fprintf(fp, " ");
01693 if (WN_opcode(_wn) == OPC_DO_LOOP)
01694 fprintf(fp, "%s %d [%d]%s (0x%p) [%d]\n", _is_parallel ? "PR" : "DO",
01695 _depth, _id, _is_cloned ? "+" : "", _wn, _count);
01696 else
01697 fprintf(fp, "ST %d (%d) (0x%p) [%d]\n", _depth, _id, _wn, _count);
01698 if (_first != NULL)
01699 _first->Print(fp, indent_count + 2);
01700 if (_next != NULL)
01701 _next->Print(fp, indent_count);
01702 }