00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029 #include <stdlib.h>
00030 #include <algorithm>
00031 #include <vector>
00032 #include <stack>
00033 #include <list>
00034 #include <set>
00035
00036 #include "bb.h"
00037 #include "defs.h"
00038 #include "cg_region.h"
00039 #include "fb_tnv.h"
00040 #include "op_map.h"
00041 #include "data_layout.h"
00042 #include "symtab_access.h"
00043 #include "cg_flags.h"
00044 #include "vt_region.h"
00045 #include "tracing.h"
00046 #include "instr_reader.h"
00047 #include "dump_feedback.h"
00048 #include "freq.h"
00049 #include "ipfec_defs.h"
00050 #include "gra_live.h"
00051 #include "region.h"
00052 #include "region_bb_util.h"
00053 #include "region_update.h"
00054 #include "region_verify.h"
00055 #include "ipfec_options.h"
00056 #include "cg.h"
00057
00058 #include "stride_prefetch.h"
00059 #include "val_prof.h"
00060
00061 #include "cg_dep_graph.h"
00062 #include "targ_cache_info.h"
00063 #include "cache_analysis.h"
00064
00065
00066 #define EMPT (-1)
00067 #define STRONG_SINGL_STRIDE 0x0001
00068 #define PHASED_MULTI_STRIDE 0x0002
00069
00070 #define MIN_STRIDE 9
00071 #define MAX_DISTANCE 0x1fff
00072
00073
00074 extern OP_MAP OP_to_WN_map;
00075
00076 #ifndef REGION_VECTOR_ITER
00077 typedef REGION_VECTOR::iterator REGION_VECTOR_ITER;
00078 #endif
00079
00080 class REGION_STRIDE_PREFETCH {
00081 private:
00082 MEM_POOL * _mempool;
00083 REGION *region;
00084 UINT64 pu_freq;
00085 UINT64 THRESH_FREQ;
00086 UINT32 PRE_DISTANCE;
00087 UINT32 LOAD_LATENCY;
00088 UINT64 CACHE_SIZE;
00089 float STRONG_SINGLE_STRIDE_THREHOLD;
00090 UINT32 PHASED_MULTI_STRIDE_THRESHOLD;
00091 UINT64 PMST_SECOD_FREQ;
00092 float PMST_ZERO_FREQ;
00093 UINT64 SINGLE_FIRST_FREQ;
00094 float NTA_THRESHOLD;
00095 STRIDE_LOOP_HEADER *loop_header;
00096 INT32 flags;
00097
00098
00099 public:
00100 REGION_STRIDE_PREFETCH(MEM_POOL *m ,INT32 stride_flags):_mempool(m),region(NULL),pu_freq(0),THRESH_FREQ(10),
00101 PRE_DISTANCE(16),LOAD_LATENCY(90),CACHE_SIZE(1000000),STRONG_SINGLE_STRIDE_THREHOLD(0.70),PHASED_MULTI_STRIDE_THRESHOLD(10000),
00102 PMST_SECOD_FREQ(100),PMST_ZERO_FREQ(0.80),SINGLE_FIRST_FREQ(1000),
00103 NTA_THRESHOLD(0.90),loop_header(NULL),flags(stride_flags)
00104 {
00105 }
00106
00107 ~REGION_STRIDE_PREFETCH(){}
00108
00109
00110 void Stride_A_Region(REGION * regn);
00111
00112 protected:
00113 void Stride_Prefetch_Initial(void);
00114 UINT64 Find_Loop_Count(REGION *rgn );
00115 double Compute_Iteration_Cycles(UINT64 loop_count);
00116 double Compute_Region_Cycles(REGION * regn);
00117 float Compute_Iteration_Data_Size(UINT64 loop_count);
00118 float Compute_Region_Data_Size(REGION * regn);
00119 UINT32 BB_Ld_Count(BB * bb);
00120 void Compute_Prefetch_Distance(INT32* distance);
00121 INT32 Min( double a, double b, double c);
00122 void Insert_Prefetch_List(INT32* distance);
00123 void Stride_Ins(BB *bb, OP *op, INT32 distance);
00124 OP * Mk_Add_OP(mINT64 int_arg, TN *tn, TN * total_tn);
00125 OP * Mk_Prefetch_OP(float zero_prob, TN * address);
00126 void Strong_Single_Stride_Ins(BB *bb, OP *op, INT32 distance );
00127 void Phased_Multi_Stride_Ins(BB *bb, OP *op, INT32 distance);
00128 };
00129
00130 struct pref_dis_map{
00131 OP *op;
00132 mINT64 dis;
00133 };
00134 static std::vector<struct pref_dis_map> prefetch_list;
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147 void Stride_Region(REGION_TREE *region_tree, INT32 stride_flags)
00148 {
00149 MEM_POOL local_mempool;
00150 MEM_POOL_Initialize( &local_mempool, "stride prefetch", FALSE );
00151 MEM_POOL_Push( &local_mempool );
00152 REGION_VECTOR region_set;
00153 REGION_VECTOR_ITER iter;
00154 REGION *region;
00155 region_set = region_tree->Region_Set();
00156 REGION_STRIDE_PREFETCH region_stride( &local_mempool, stride_flags );
00157 for (iter = region_set.begin(); iter != region_set.end(); iter++) {
00158 region =*iter;
00159 region_stride.Stride_A_Region(region);
00160 }
00161 MEM_POOL_Pop( &local_mempool );
00162 MEM_POOL_Delete( &local_mempool );
00163 }
00164
00165
00166 void REGION_STRIDE_PREFETCH::Stride_A_Region(REGION * regn)
00167 {
00168 region = regn ;
00169 BOOL worth_prefetch;
00170 INT32 distance;
00171
00172 loop_header = CXX_NEW(STRIDE_LOOP_HEADER, _mempool);
00173 Stride_Prefetch_Initial();
00174 if ( loop_header != NULL ){
00175
00176 Compute_Prefetch_Distance(&distance);
00177
00178 Insert_Prefetch_List( &distance);
00179 }
00180
00181 }
00182
00183
00184
00185
00186
00187 void REGION_STRIDE_PREFETCH::Stride_Prefetch_Initial(void)
00188 {
00189
00190 BOOL found_loop_region = FALSE;
00191 UINT64 loop_count ;
00192 while ( !found_loop_region){
00193 if (region->Region_Type() == LOOP)
00194 found_loop_region = TRUE;
00195 else if (region->Next_Sibling() == NULL)
00196 break;
00197 else region = region->Next_Sibling();
00198 }
00199
00200
00201
00202 BB *bb = REGION_First_BB;
00203 pu_freq= (UINT64)bb->freq;
00204 if (!found_loop_region){
00205 if ( edge_done ){
00206 loop_header = NULL;
00207 }else{
00208 loop_header->trip_count = (UINT64)EMPT;
00209 }
00210
00211 return ;
00212 }
00213
00214
00215
00216 if ( edge_done )
00217 loop_count = Find_Loop_Count( region );
00218 else loop_count = (UINT64)EMPT;
00219
00220 if ((loop_count != EMPT) && ( loop_count < THRESH_FREQ)){
00221 loop_header = NULL;
00222 return ;
00223 }
00224
00225 loop_header->trip_count = loop_count;
00226
00227
00228
00229
00230
00231 double iteration_average_cycle;
00232 double iteration_average_data_size;
00233 if (loop_count !=EMPT){
00234 iteration_average_cycle = Compute_Iteration_Cycles( loop_count );
00235 loop_header->average_cycle_iter = (UINT64)iteration_average_cycle;
00236 iteration_average_data_size = Compute_Iteration_Data_Size( loop_count );
00237 loop_header->average_data_size_iter = (UINT64)iteration_average_data_size;
00238 }
00239
00240 return ;
00241 }
00242
00243
00244 UINT64 REGION_STRIDE_PREFETCH::Find_Loop_Count(REGION *rgn ){
00245 UINT64 freq=0;
00246 NODE_VECTOR nodes;
00247 nodes = rgn->Entries();
00248 for(NODE_VECTOR_ITER iter = nodes.begin(); iter!=nodes.end(); iter++){
00249 if((*iter)->Is_Region())
00250 freq +=Find_Loop_Count((*iter)->Region_Node());
00251 else
00252 freq +=( UINT64)BB_freq((*iter)->BB_Node());
00253 }
00254 return freq;
00255 }
00256
00257
00258
00259 double REGION_STRIDE_PREFETCH::Compute_Iteration_Cycles(UINT64 loop_count)
00260 {
00261 double total_cycles;
00262 total_cycles = Compute_Region_Cycles( region );
00263 return total_cycles / loop_count;
00264
00265 }
00266
00267
00268 double REGION_STRIDE_PREFETCH::Compute_Region_Cycles(REGION * regn)
00269 {
00270 double cycles_of_cur_region = 0;
00271
00272
00273
00274 REGIONAL_CFG_NODE * node;
00275 REGIONAL_CFG *rgn_cfg;
00276 BB *bb;
00277 NODE_VECTOR _node_set;
00278 NODE_VECTOR_ITER iter;
00279 rgn_cfg = regn->Regional_Cfg();
00280 _node_set = rgn_cfg->Node_Set();
00281 for (iter = _node_set.begin(); iter != _node_set.end(); iter++) {
00282 node = *iter;
00283 if (node->Is_Region()){
00284 REGION * nested_region = node->Region_Node();
00285 cycles_of_cur_region += Compute_Region_Cycles(nested_region);
00286 }
00287 else{
00288 bb = node->BB_Node();
00289 Calculate_BB_Cycle( bb, FALSE);
00290 if (BB_call(bb))
00291
00292
00293
00294
00295 cycles_of_cur_region += BB_cycle(bb) * BB_freq(bb);
00296 else
00297
00298
00299
00300
00301 cycles_of_cur_region += BB_cycle(bb) * BB_freq(bb);
00302
00303 }
00304 }
00305
00306 return cycles_of_cur_region;
00307 }
00308
00309
00310
00311
00312
00313
00314
00315 float REGION_STRIDE_PREFETCH::Compute_Iteration_Data_Size( UINT64 loop_count)
00316 {
00317 double total_data_size;
00318 total_data_size = Compute_Region_Cycles(region);
00319 return total_data_size / loop_count;
00320 }
00321
00322 float REGION_STRIDE_PREFETCH::Compute_Region_Data_Size(REGION * regn)
00323 {
00324 double data_size_of_cur_region = 0;
00325
00326
00327 REGIONAL_CFG_NODE * node;
00328 REGIONAL_CFG *rgn_cfg;
00329 BB *bb;
00330 NODE_VECTOR _node_set;
00331 NODE_VECTOR_ITER iter;
00332 rgn_cfg = regn->Regional_Cfg();
00333 _node_set = rgn_cfg->Node_Set();
00334 node = region->Regional_Cfg_Node();
00335 for (iter = _node_set.begin(); iter != _node_set.end(); iter++) {
00336 node = *iter;
00337 if (node->Is_Region()) {
00338 REGION * nested_region = node->Region_Node();
00339 data_size_of_cur_region += Compute_Region_Data_Size(nested_region);
00340 }else{
00341 bb = node->BB_Node();
00342 if( BB_call(bb))
00343
00344 data_size_of_cur_region += BB_Ld_Count(bb) * BB_freq(bb);
00345 else
00346 data_size_of_cur_region += BB_Ld_Count(bb) * BB_freq(bb);
00347 }
00348 }
00349
00350 return data_size_of_cur_region;
00351
00352 }
00353
00354 UINT32 REGION_STRIDE_PREFETCH::BB_Ld_Count(BB * bb)
00355 {
00356 OP *op, *tmp_op;
00357 UINT32 i = 0;
00358 for (op = BB_first_op(bb); op != NULL; op = tmp_op)
00359 {
00360 tmp_op = OP_next(op);
00361 switch (OP_code(op)) {
00362 case TOP_ld1:
00363 case TOP_ld4:
00364 case TOP_ld2:
00365 case TOP_ld8:
00366 i++;
00367 break;
00368 default:
00369
00370 break;
00371 }
00372
00373 }
00374 return i;
00375 }
00376
00377 void REGION_STRIDE_PREFETCH::Compute_Prefetch_Distance( INT32 *distance)
00378 {
00379 double distance_of_cycle;
00380 double distance_of_data_size;
00381 if (loop_header->trip_count == EMPT)
00382 *distance = PRE_DISTANCE;
00383 else{
00384 if( loop_header->average_cycle_iter !=0)
00385 distance_of_cycle = LOAD_LATENCY / loop_header->average_cycle_iter;
00386 else
00387 distance_of_cycle = 100;
00388
00389
00390
00391
00392 if( loop_header->average_data_size_iter!=0)
00393 distance_of_data_size = 100;
00394 else
00395 distance_of_data_size =100;
00396 *distance = Min(distance_of_cycle, distance_of_data_size, PRE_DISTANCE);
00397 }
00398 }
00399
00400 INT32 REGION_STRIDE_PREFETCH::Min( double a, double b, double c)
00401 {
00402 double i;
00403 if ((a < b)&& (a< c))
00404 i = a;
00405 else if ( b < c)
00406 i = b;
00407 else
00408 i =c ;
00409 return (INT32) i;
00410 }
00411
00412 void REGION_STRIDE_PREFETCH::Insert_Prefetch_List(INT32* distance)
00413 {
00414 UINT64 trip_count = loop_header->trip_count;
00415
00416 for (SEQ_REGIONAL_CFG_ITER node_iter(region->Regional_Cfg()); node_iter!=0; ++node_iter){
00417 REGIONAL_CFG_NODE *node = *node_iter;
00418 if ( !node->Is_Region() ){
00419 BB *bb = node->BB_Node();
00420 OP *op, *tmp_op;
00421 for (op = BB_first_op(bb); op != NULL; op = tmp_op)
00422 {
00423 tmp_op = OP_next(op);
00424 switch (OP_code(op)) {
00425 case TOP_ld1:
00426 case TOP_ld4:
00427 case TOP_ld2:
00428 case TOP_ld8:
00429 Stride_Ins(bb, op, *distance);
00430 break;
00431 default:
00432
00433 break;
00434 }
00435 }
00436 }
00437
00438 }
00439
00440 }
00441
00442 void REGION_STRIDE_PREFETCH::Stride_Ins(BB *bb, OP *op, INT32 distance)
00443 {
00444
00445 if (op_stride_tnv_map == NULL){
00446 return;
00447 }else if(OP_MAP_Is_Delete(op_stride_tnv_map))
00448 return;
00449 if (OP_Prefetched(op))
00450 return;
00451 FB_TNV *tnv = (FB_TNV *)OP_MAP_Get(op_stride_tnv_map, op);
00452 UINT64 first_val_freq;
00453 UINT64 secod_val_freq;
00454 UINT64 total_access;
00455 UINT64 total_freq = 0;
00456 UINT64 zeros_freq =0;
00457
00458 if ( tnv == NULL){
00459 DevWarn("can not find tnv-info in op_stride_tnv_map! val_prof_id = %d",OP_val_prof_id(op));
00460 return;
00461 }
00462 first_val_freq = tnv->_counters[0];
00463 secod_val_freq = tnv->_counters[1];
00464 total_access = tnv->_exec_counter;
00465
00466 int i;
00467 for( i = 0; i < 10; i++){
00468 total_freq += tnv->_counters[i];
00469 }
00470
00471 zeros_freq = tnv->_zero_std_counter;
00472
00473 if ( total_access == 0){
00474 DevWarn("total_access can not equ 0");
00475 return;
00476 }
00477
00478 float zeros_prob =((float)zeros_freq) / total_access;
00479 float stride_prob;
00480
00481
00482
00483 stride_prob = ((float)first_val_freq) / total_access;
00484 if ( stride_prob > STRONG_SINGLE_STRIDE_THREHOLD && (first_val_freq > SINGLE_FIRST_FREQ))
00485 Strong_Single_Stride_Ins(bb, op, distance);
00486 else if (total_access > PHASED_MULTI_STRIDE_THRESHOLD
00487 && zeros_prob > PMST_ZERO_FREQ)
00488 Phased_Multi_Stride_Ins(bb, op, distance);
00489 }
00490
00491
00492 void REGION_STRIDE_PREFETCH::Strong_Single_Stride_Ins(BB *bb, OP *op, INT32 distance )
00493 {
00494 if (!( flags & STRONG_SINGL_STRIDE))
00495 return;
00496 FB_TNV *tnv =(FB_TNV *)OP_MAP_Get(op_stride_tnv_map,op);
00497 mINT64 first_val;
00498 mINT64 prefetch_distance;
00499 float zero_prob;
00500
00501 first_val = tnv->_values[0];
00502 if( abs(first_val) < MIN_STRIDE) return;
00503 prefetch_distance = distance * first_val;
00504 if ( (prefetch_distance == 0) || ( prefetch_distance > MAX_DISTANCE ))
00505 return;
00506
00507
00508 OP *cur_op;
00509 INT max_dis=0, min_dis=0;
00510 if (prefetch_list.size()>0) {
00511 max_dis = min_dis = prefetch_list[prefetch_list.size()-1].dis;
00512 }
00513 for(INT i= prefetch_list.size()-1; i>=0; i--){
00514 cur_op = prefetch_list[i].op;
00515 if (cur_op == op) break;
00516 INT diff;
00517 if (OP_Prefetched(cur_op)) {
00518 INT cache_line_size = Cache_Line_Size(CACHE_L2);
00519 if (Cache_Access_Same_Line(cur_op, op, &diff) && diff <= cache_line_size/2) {
00520 max_dis = prefetch_list[i].dis > max_dis ? prefetch_list[i].dis : max_dis;
00521 min_dis = prefetch_list[i].dis < min_dis ? prefetch_list[i].dis : min_dis;
00522 if (abs(prefetch_distance - prefetch_list[i].dis)<cache_line_size) {
00523 INT value = abs(cache_line_size/first_val)+1;
00524 if (prefetch_distance < prefetch_list[i].dis || distance>8) {
00525 prefetch_distance -= value*first_val;
00526 } else {
00527 prefetch_distance += value*first_val;
00528 }
00529 INT boundary = first_val>0?max_dis:min_dis;
00530 while(abs(prefetch_distance - boundary) < cache_line_size) {
00531 prefetch_distance += value*first_val;
00532 }
00533 break;
00534 }
00535 }
00536 }
00537 }
00538
00539 struct pref_dis_map pdm;
00540 pdm.op = op;
00541 pdm.dis = prefetch_distance;
00542 prefetch_list.push_back(pdm);
00543 if (prefetch_list.size()>5) { prefetch_list.erase(prefetch_list.begin());}
00544
00545 zero_prob = ((float)tnv->_zero_std_counter) / tnv->_exec_counter;
00546 TN *base = Gen_Register_TN( ISA_REGISTER_CLASS_integer, 8 );
00547 TN *distance_tn = Gen_Register_TN( ISA_REGISTER_CLASS_integer, 8 );
00548 OP *op_add = Mk_OP( TOP_adds, base, True_TN, Gen_Literal_TN(prefetch_distance, 8), OP_opnd(op, 3));
00549 OP *op_prefetch =Mk_Prefetch_OP( zero_prob, base);
00550 WN * wn = WN_Create(OPC_PREFETCH, 1);
00551 WN_offset(wn) = prefetch_distance;
00552 WN_pf_set_manual(wn) ;
00553 OP_MAP_Set(OP_to_WN_map, op_prefetch, wn);
00554 OP_srcpos(op_prefetch) = OP_srcpos(op);
00555 OP_srcpos(op_add) = OP_srcpos(op);
00556 BB_Insert_Op_Before(bb, op, op_prefetch);
00557 BB_Insert_Op_Before(bb, op_prefetch, op_add);
00558 Set_OP_Prefetched(op);
00559 }
00560
00561
00562
00563 OP * REGION_STRIDE_PREFETCH::Mk_Add_OP(mINT64 int_arg, TN *tn, TN * total_tn)
00564 {
00565 TN *arg_tn = Gen_Literal_TN(int_arg, 8);
00566 OP *op = Mk_OP( TOP_add, total_tn, True_TN, arg_tn, tn );
00567 return op;
00568 }
00569
00570 OP * REGION_STRIDE_PREFETCH::Mk_Prefetch_OP(float zero_prob, TN * address)
00571 {
00572 TN *lfhint;
00573
00574 lfhint = Gen_Enum_TN(ECV_lfhint);
00575
00576 OP *op =Mk_OP(TOP_lfetch, True_TN, lfhint, address);
00577 return op;
00578 }
00579
00580
00581 void REGION_STRIDE_PREFETCH::Phased_Multi_Stride_Ins(BB *bb, OP *op, INT32 distance)
00582 {
00583 if (!( flags & PHASED_MULTI_STRIDE))
00584 return;
00585 FB_TNV *tnv =(FB_TNV *)OP_MAP_Get(op_stride_tnv_map,op);
00586 mINT64 first_val;
00587 mINT64 prefetch_distance;
00588 float zero_prob;
00589 TN *address_tn = Gen_Register_TN( ISA_REGISTER_CLASS_integer, 8 );
00590 TN *base = Gen_Register_TN( ISA_REGISTER_CLASS_integer, 8 );
00591 TN *stride = Gen_Register_TN( ISA_REGISTER_CLASS_integer, 8 );
00592 prefetch_distance = 1;
00593 zero_prob = ((float) tnv->_zero_std_counter) / tnv->_exec_counter;
00594 OP *op_sub = Mk_OP( TOP_sub, stride, True_TN, OP_opnd(op,3), address_tn );
00595 OP *op_add =Mk_OP( TOP_shladd, base, True_TN, stride, Gen_Literal_TN(prefetch_distance, 8),OP_opnd(op,3));
00596 OP *op_mov = Mk_OP(TOP_mov, address_tn, True_TN, OP_opnd(op,3));
00597 OP *op_prefetch =Mk_Prefetch_OP( zero_prob, base);
00598 WN * wn = WN_Create(OPC_PREFETCH,1);
00599 WN_offset(wn) = prefetch_distance;
00600 WN_pf_set_manual(wn) ;
00601 OP_MAP_Set(OP_to_WN_map, op_prefetch, wn);
00602 OP_srcpos(op_prefetch) = OP_srcpos(op);
00603 OP_srcpos(op_add) = OP_srcpos(op);
00604 OP_srcpos(op_mov) = OP_srcpos(op);
00605 OP_srcpos(op_sub) = OP_srcpos(op);
00606 BB_Insert_Op_Before(bb, op, op_prefetch);
00607 BB_Insert_Op_Before(bb, op_prefetch, op_mov);
00608 BB_Insert_Op_Before(bb, op_mov, op_add);
00609 BB_Insert_Op_Before(bb, op_add, op_sub);
00610 Set_OP_Prefetched(op);
00611 }
00612
00613