00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
01151
01152
01153
01154
01155
01156
01157
01158
01159
01160
01161
01162
01163
01164
01165
01166
01167
01168
01169
01170
01171
01172
01173
01174
01175
01176
01177
01178
01179
01180
01181
01182
01183
01184
01185
01186
01187
01188
01189
01190
01191
01192
01193
01194
01195
01196
01197
01198
01199
01200
01201
01202
01203
01204
01205
01206
01207
01208
01209
01210
01211
01212
01213
01214
01215
01216
01217
01218
01219
01220
01221
01222
01223
01224
01225
01226
01227
01228
01229
01230
01231
01232
01233
01234
01235
01236
01237
01238
01239
01240
01241
01242
01243
01244
01245
01246
01247
01248
01249
01250
01251
01252
01253
01254
01255
01256
01257
01258
01259
01260
01261
01262
01263
01264
01265
01266
01267
01268
01269
01270
01271
01272
01273
01274
01275
01276
01277
01278
01279
01280
01281
01282
01283
01284
01285
01286
01287
01288
01289
01290
01291
01292
01293
01294
01295
01296
01297
01298
01299
01300
01301
01302
01303
01304
01305
01306
01307
01308
01309
01310
01311
01312
01313
01314
01315
01316
01317
01318
01319
01320
01321
01322
01323
01324
01325
01326
01327
01328
01329
01330
01331
01332
01333
01334
01335
01336
01337
01338
01339
01340
01341
01342
01343
01344
01345
01346
01347
01348
01349
01350
01351
01352
01353
01354
01355
01356
01357
01358
01359
01360
01361
01362
01363
01387 #define __STDC_LIMIT_MACROS
01388 #include <stdint.h>
01389 #ifdef USE_PCH
01390 #include "lno_pch.h"
01391 #endif // USE_PCH
01392 #pragma hdrstop
01393
01394 #define cache_model_CXX "cache_model.cxx"
01395
01396 #ifdef _KEEP_RCS_ID
01397
01398 static char *rcs_id = cache_model_CXX "$Revision: 1.6 $";
01399 #endif
01400
01401 #include <sys/types.h>
01402 #include <math.h>
01403
01404 #include "access_vector.h"
01405 #include "model.h"
01406 #include "cache_model.h"
01407 #include "lnopt_main.h"
01408 #include "alloca.h"
01409 #include "lu_mat.h"
01410 #include "vs.h"
01411 #include "config_targ.h"
01412 #include "snl.h"
01413
01414
01415
01416
01417 #define CM_DEBUGGING_LEVEL 3
01418
01419
01420
01421
01422
01423
01424
01425
01426
01427
01428
01429
01430 #define MAX_DIFFERENT_BLOCKSIZES 2
01431
01432
01433
01434
01435 #define MAX_DEPTH 62
01436
01437
01438
01439
01440
01441
01442
01443
01444
01445
01446
01447
01448
01449
01450
01451
01452
01453
01454
01455
01456
01457
01458
01459
01460
01461
01462
01463
01464
01465
01466
01467 #define BASIC_MIDDLE_PENALTY 0.20
01468 #define ADDITIONAL_MIDDLE_PENALTY 1.00
01469
01470
01471
01472
01473
01474 #define BASIC_MIDDLE_PENALTY_MEM 0.00
01475 #define ADDITIONAL_MIDDLE_PENALTY_MEM 2.00
01476
01477
01478
01479
01480
01481
01482
01483
01484 #define BASIC_MIDDLE_PENALTY_TLB 0.00
01485 #define ADDITIONAL_MIDDLE_PENALTY_TLB 0.20
01486
01487
01488
01489
01490
01491
01492
01493
01494
01495 #define RECT_BLOCKSIZE_UNCERTAINTY 32
01496
01497
01498
01499 #define UNBOUNDED_ITERS 12345678
01500
01501
01502
01503
01504
01505
01506 #define MAX_BLOCKSIZE 100000
01507 #define MAX_LCM (64*3*3*5*5*5*7*11*13*17)
01508
01509
01510
01511
01512
01513
01514
01515
01516 #define BAD_REF_CYCLE_FIRST_BIAS 1.5
01517 #define BAD_REF_CYCLE_INCREMENTAL_BIAS 0.5
01518
01519
01520
01521 #define MAX_COEFF 20
01522
01523
01524
01525
01526
01527
01528
01529
01530
01531
01532
01533
01534 enum LONG_LINE_POLICY {LONG_LINE_OFF, LONG_LINE_ON,
01535 LONG_LINE_TLB_ONLY, LONG_LINE_CACHE_ONLY};
01536 static LONG_LINE_POLICY Long_Line_Policy = LONG_LINE_ON;
01537 static INT32 Loop_Overhead = -1;
01538
01539
01540
01541
01542
01543 INT Debug_Cache_Model = 0;
01544 static INT64 Nominal_Blocksize[SNL_MAX_LOOPS+1];
01545 static MHD_LEVEL* Cur_Mhd = NULL;
01546 static INT Rtry_Count;
01547 static INT Happy_Coefficient;
01548 static INT Max_Different_Blocksizes = MAX_DIFFERENT_BLOCKSIZES;
01549
01550
01551
01552
01553
01554 #ifndef ABS
01555 #define ABS(a) ((a<0)?-(a):(a))
01556 #endif
01557
01558 static INT Divceil(INT a, INT b)
01559 {
01560 return (a + b - 1)/b;
01561 }
01562
01563
01564 static BOOL Is_In_Array(INT val, const INT* array, INT array_sz)
01565 {
01566 INT i;
01567 for (i = 0; i < array_sz; i++)
01568 if (array[i] == val)
01569 return TRUE;
01570 return FALSE;
01571 }
01572
01573
01574
01575
01576
01577
01578
01579
01580 extern void Set_Cache_Model_Statics(INT mhd_level)
01581 {
01582 Cur_Mhd = &Mhd.L[mhd_level];
01583 FmtAssert(Cur_Mhd->Valid(), ("Not a valid MHD level"));
01584 Rtry_Count = 0;
01585 Happy_Coefficient = MAX(10, 3*LNO_Outer_Unroll);
01586 Happy_Coefficient = MAX(Happy_Coefficient, 3*LNO_Outer_Unroll_Max);
01587 Happy_Coefficient = MAX(Happy_Coefficient, 3*LNO_Outer_Unroll_Prod_Max);
01588 Happy_Coefficient = MIN(Happy_Coefficient, 30);
01589 MAT<FRAC>::Set_Default_Pool(&LNO_local_pool);
01590 FORMULA::Fpool = &LNO_local_pool;
01591 }
01592
01593 static void Print(FILE* f,
01594 char* cs,
01595 INT depth,
01596 INT nstrips,
01597 INT stripdepth,
01598 const INT* new_order,
01599 const INT* available_order,
01600 INT available_depth,
01601 const INT* iloop,
01602 const INT* stripsz,
01603 const INT* striplevel)
01604 {
01605 INT i;
01606
01607 fprintf(f, "*** cache model transformation information <%s, depth=%d>\n",
01608 cs ? cs : "", depth);
01609 fprintf(f, "*** new_order:");
01610 for (i = 0; i <= depth; i++)
01611 fprintf(f, " %d", new_order[i]);
01612 fprintf(f, " *** available_order:");
01613 for (i = 0; i <= available_depth; i++)
01614 fprintf(f, " %d", available_order[i]);
01615 if (nstrips > 0) {
01616 fprintf(f, "\n*** strips <outerdepth=%d>:", stripdepth);
01617 if (striplevel) {
01618 INT s;
01619 for (s = 0; s < nstrips; s++)
01620 fprintf(f, " %d[sz=%d,lv=%d]", iloop[s], stripsz[s], striplevel[s]);
01621 }
01622 else {
01623 for (INT s = 0; s < nstrips; s++)
01624 fprintf(f, " %d[sz=%d]", iloop[s], stripsz[s]);
01625 }
01626 }
01627 fprintf(f, "\n");
01628 }
01629
01630
01631
01632
01633
01634 MEM_POOL* FORMULA::Fpool = NULL;
01635 double FORMULA::_scratch[FORMULA::SCRATCH_REGISTERS];
01636
01637 static INT rprint_cnt;
01638
01639 double FORMULA::Eval(INT cnt, const mINT64* vars) const
01640 {
01641 double* dvars = cnt ? (double*) alloca(cnt * sizeof(double)) : NULL;
01642 INT i;
01643 for (i = 0; i < cnt; i++)
01644 dvars[i] = (double) vars[i];
01645
01646 if (Debug_Cache_Model >= 3)
01647 rprint_cnt = 0;
01648
01649 double e = Eval(dvars);
01650
01651 if (Debug_Cache_Model >= 3 && rprint_cnt)
01652 fprintf(TFile, "\n");
01653
01654 return e;
01655 }
01656
01657 double FORMULA::Eval(INT cnt, const mINT32* vars) const
01658 {
01659 double* dvars = cnt ? (double*) alloca(cnt * sizeof(double)) : NULL;
01660 INT i;
01661 for (i = 0; i < cnt; i++)
01662 dvars[i] = (double) vars[i];
01663
01664 if (Debug_Cache_Model >= 3)
01665 rprint_cnt = 0;
01666
01667 double e = Eval(dvars);
01668
01669 if (Debug_Cache_Model >= 3 && rprint_cnt)
01670 fprintf(TFile, "\n");
01671
01672 return e;
01673 }
01674
01675 double FORMULA::Eval(INT cnt, const double* vars) const
01676 {
01677 if (Debug_Cache_Model >= 3)
01678 rprint_cnt = 0;
01679
01680 double e = Eval(cnt ? vars : NULL);
01681
01682 if (Debug_Cache_Model >= 3 && rprint_cnt)
01683 fprintf(TFile, "\n");
01684
01685 return e;
01686 }
01687
01688 double FORMULA::Eval(const double* vars) const
01689 {
01690 Is_True(this, ("FORMULA::Eval() called with this == NULL"));
01691
01692 double ans;
01693
01694 switch (_fop) {
01695 case FORMULA_ADD:
01696 case FORMULA_SUB:
01697 case FORMULA_MUL:
01698 case FORMULA_DIV:
01699 case FORMULA_MAX:
01700 case FORMULA_MIN:
01701 case FORMULA_GE:
01702 case FORMULA_GT:
01703 case FORMULA_LE:
01704 case FORMULA_LT:
01705 case FORMULA_COMMA:
01706 {
01707 double l = _kids.Left->Eval_Inlined(vars);
01708 double r = _kids.Right->Eval_Inlined(vars);
01709 switch (_fop) {
01710 case FORMULA_ADD: ans = l + r; break;
01711 case FORMULA_SUB: ans = l - r; break;
01712 case FORMULA_MUL: ans = l * r; break;
01713 case FORMULA_DIV: if (r == 0.0 && Debug_Cache_Model) {
01714 fprintf(TFile, "zero divide in formula: ");
01715 Print(TFile);
01716 fprintf(TFile, "\n");
01717 }
01718 FmtAssert(r, ("zero divide"));
01719 ans = l / r; break;
01720 case FORMULA_MAX: ans = MAX(l,r); break;
01721 case FORMULA_MIN: ans = MIN(l,r); break;
01722 case FORMULA_GE: ans = l >= r; break;
01723 case FORMULA_GT: ans = l > r; break;
01724 case FORMULA_LE: ans = l <= r; break;
01725 case FORMULA_LT: ans = l < r; break;
01726 case FORMULA_COMMA: ans = r; break;
01727 }
01728 }
01729 break;
01730 case FORMULA_AND:
01731 if (_kids.Left->Eval_Inlined(vars) == 0.0)
01732 ans = 0.0;
01733 else
01734 ans = _kids.Right->Eval_Inlined(vars) != 0.0;
01735 break;
01736 case FORMULA_OR:
01737 if (_kids.Left->Eval_Inlined(vars) == 1.0)
01738 ans = 1.0;
01739 else
01740 ans = _kids.Right->Eval_Inlined(vars) != 0.0;
01741 break;
01742 case FORMULA_COND:
01743 if (_kids.Cond->Eval(vars))
01744 ans = _kids.Left->Eval(vars);
01745 else
01746 ans = _kids.Right->Eval(vars);
01747 break;
01748 case FORMULA_FCONST:
01749 ans = _fconst;
01750 break;
01751 case FORMULA_VAR:
01752 Is_True(vars, ("vars is NULL"));
01753 ans = vars[_var];
01754 break;
01755 case FORMULA_USE:
01756 Is_True(_var >= 0 && _var < SCRATCH_REGISTERS,
01757 ("Bad scratch register use %d", _var));
01758 ans = _scratch[_var];
01759 break;
01760 case FORMULA_SET:
01761 Is_True(_var >= 0 && _var < SCRATCH_REGISTERS,
01762 ("Bad scratch register set %d", _var));
01763 ans = _scratch[_var] = _kids.Left->Eval(vars);
01764 if (Debug_Cache_Model >= 4) {
01765 rprint_cnt++;
01766 fprintf(TFile, "[r%d=%.4g]", _var, ans);
01767 }
01768 break;
01769 default:
01770 FmtAssert(0, ("bad formula"));
01771 ans = 0.0;
01772 break;
01773 }
01774
01775 return ans;
01776 }
01777
01778 FORMULA* FORMULA::Add_To_Variable(INT var, INT val)
01779 {
01780 Is_True(this, ("FORMULA::Duplicate() called with this == NULL"));
01781
01782 switch (_fop) {
01783 case FORMULA_ADD:
01784 case FORMULA_SUB:
01785 case FORMULA_MUL:
01786 case FORMULA_DIV:
01787 case FORMULA_MAX:
01788 case FORMULA_MIN:
01789 case FORMULA_GE:
01790 case FORMULA_GT:
01791 case FORMULA_LE:
01792 case FORMULA_LT:
01793 case FORMULA_AND:
01794 case FORMULA_OR:
01795 case FORMULA_COMMA:
01796 _kids.Left = _kids.Left->Add_To_Variable(var,val);
01797 _kids.Right = _kids.Right->Add_To_Variable(var,val);
01798 return this;
01799 case FORMULA_COND:
01800 _kids.Cond = _kids.Cond->Add_To_Variable(var,val);
01801 _kids.Left = _kids.Left->Add_To_Variable(var,val);
01802 _kids.Right = _kids.Right->Add_To_Variable(var,val);
01803 return this;
01804 case FORMULA_FCONST:
01805 return this;
01806 case FORMULA_VAR:
01807 return FORMULA::Add(this, FORMULA::Const(val));
01808 case FORMULA_USE:
01809 return this;
01810 case FORMULA_SET:
01811 _kids.Left = _kids.Left->Add_To_Variable(var,val);
01812 return this;
01813 default:
01814 FmtAssert(0, ("Bad formula for Duplicate"));
01815 return NULL;
01816 }
01817 }
01818
01819 FORMULA* FORMULA::Duplicate() const
01820 {
01821 Is_True(this, ("FORMULA::Duplicate() called with this == NULL"));
01822
01823 switch (_fop) {
01824 case FORMULA_ADD:
01825 return FORMULA::Add(_kids.Left->Duplicate(), _kids.Right->Duplicate());
01826 case FORMULA_SUB:
01827 return FORMULA::Sub(_kids.Left->Duplicate(), _kids.Right->Duplicate());
01828 case FORMULA_MUL:
01829 return FORMULA::Mul(_kids.Left->Duplicate(), _kids.Right->Duplicate());
01830 case FORMULA_DIV:
01831 return FORMULA::Div(_kids.Left->Duplicate(), _kids.Right->Duplicate());
01832 case FORMULA_MAX:
01833 return FORMULA::Max(_kids.Left->Duplicate(), _kids.Right->Duplicate());
01834 case FORMULA_MIN:
01835 return FORMULA::Min(_kids.Left->Duplicate(), _kids.Right->Duplicate());
01836 case FORMULA_GE:
01837 return FORMULA::Ge(_kids.Left->Duplicate(), _kids.Right->Duplicate());
01838 case FORMULA_GT:
01839 return FORMULA::Gt(_kids.Left->Duplicate(), _kids.Right->Duplicate());
01840 case FORMULA_LE:
01841 return FORMULA::Le(_kids.Left->Duplicate(), _kids.Right->Duplicate());
01842 case FORMULA_LT:
01843 return FORMULA::Lt(_kids.Left->Duplicate(), _kids.Right->Duplicate());
01844 case FORMULA_AND:
01845 return FORMULA::And(_kids.Left->Duplicate(), _kids.Right->Duplicate());
01846 case FORMULA_OR:
01847 return FORMULA::Or(_kids.Left->Duplicate(), _kids.Right->Duplicate());
01848 case FORMULA_COMMA:
01849 return FORMULA::Comma(_kids.Left->Duplicate(), _kids.Right->Duplicate());
01850 case FORMULA_COND:
01851 return FORMULA::Cond(_kids.Cond->Duplicate(),
01852 _kids.Left->Duplicate(),
01853 _kids.Right->Duplicate());
01854 case FORMULA_FCONST:
01855 return FORMULA::Const(_fconst);
01856 case FORMULA_VAR:
01857 return FORMULA::Var(_var);
01858 case FORMULA_USE:
01859 return FORMULA::Use(_var);
01860 case FORMULA_SET:
01861 return FORMULA::Set(_var, _kids.Left->Duplicate());
01862 default:
01863 FmtAssert(0, ("Bad formula for Duplicate"));
01864 return NULL;
01865 }
01866 }
01867
01868 static INT precidence(FORMULA_OP fop)
01869 {
01870 switch (fop) {
01871 case FORMULA_BAD:
01872 return -1;
01873 case FORMULA_COMMA:
01874 return 0;
01875 case FORMULA_SET:
01876 return 1;
01877 case FORMULA_COND:
01878 return 2;
01879 case FORMULA_OR:
01880 return 3;
01881 case FORMULA_AND:
01882 return 4;
01883 case FORMULA_GE:
01884 case FORMULA_GT:
01885 case FORMULA_LE:
01886 case FORMULA_LT:
01887 return 5;
01888 case FORMULA_ADD:
01889 case FORMULA_SUB:
01890 return 6;
01891 case FORMULA_MUL:
01892 case FORMULA_DIV:
01893 return 7;
01894 case FORMULA_MAX:
01895 case FORMULA_MIN:
01896 case FORMULA_FCONST:
01897 case FORMULA_VAR:
01898 case FORMULA_USE:
01899 return 8;
01900 }
01901 FmtAssert(0, ("Bad fop = %d", fop));
01902 return 0;
01903 }
01904
01905 inline BOOL assoc(FORMULA_OP fop)
01906 {
01907 return fop == FORMULA_ADD || fop == FORMULA_MUL || fop == FORMULA_COMMA;
01908 }
01909
01910 void FORMULA::Print(FILE* f, FORMULA_OP parent) const
01911 {
01912 FmtAssert(this, ("FORMULA::Print() called with this == NULL"));
01913
01914 BOOL dont_parenthesize = ((_fop == parent && assoc(_fop)) ||
01915 precidence(_fop) > precidence(parent));
01916
01917 if (!dont_parenthesize)
01918 fprintf(f, "(");
01919
01920 switch (_fop) {
01921
01922 case FORMULA_FCONST:
01923 fprintf(f, "%.4g", _fconst);
01924 break;
01925
01926 case FORMULA_VAR:
01927 fprintf(f, "v%d", _var);
01928 break;
01929
01930 case FORMULA_USE:
01931 fprintf(f, "r%d", _var);
01932 break;
01933
01934 case FORMULA_SET:
01935 fprintf(f, "r%d=", _var);
01936 _kids.Left->Print(f, _fop);
01937 fprintf(f, "\n");
01938 break;
01939
01940 case FORMULA_ADD:
01941 case FORMULA_SUB:
01942 case FORMULA_MUL:
01943 case FORMULA_DIV:
01944 case FORMULA_GE:
01945 case FORMULA_GT:
01946 case FORMULA_LE:
01947 case FORMULA_LT:
01948 case FORMULA_AND:
01949 case FORMULA_OR:
01950 case FORMULA_COMMA:
01951 _kids.Left->Print(f, _fop);
01952 fprintf(f, "%s",
01953 _fop == FORMULA_ADD ? "+" :
01954 _fop == FORMULA_SUB ? "-" :
01955 _fop == FORMULA_MUL ? "*" :
01956 _fop == FORMULA_DIV ? "/" :
01957 _fop == FORMULA_GE ? ">=" :
01958 _fop == FORMULA_GT ? ">" :
01959 _fop == FORMULA_LE ? "<=" :
01960 _fop == FORMULA_LT ? "<" :
01961 _fop == FORMULA_AND ? "&&" :
01962 _fop == FORMULA_OR ? "||" :
01963 _fop == FORMULA_COMMA ? "," :
01964 (FmtAssert(0, ("Bad fop = %d to FORMULA;:Print()", _fop)), ""));
01965 _kids.Right->Print(f, _fop);
01966 break;
01967
01968 case FORMULA_MAX:
01969 case FORMULA_MIN:
01970
01971 fprintf(f, "%s", _fop == FORMULA_MAX ? "Max(" : "Min(");
01972 _kids.Left->Print(f);
01973 fprintf(f, ",");
01974 _kids.Right->Print(f);
01975 fprintf(f, ")");
01976 break;
01977
01978 case FORMULA_COND:
01979 _kids.Cond->Print(f, _fop);
01980 fprintf(f, "?");
01981 _kids.Left->Print(f, _fop);
01982 fprintf(f, ":");
01983 _kids.Right->Print(f, _fop);
01984 break;
01985
01986 default:
01987 FmtAssert(0, ("Bad FORMULA::_fop %d", _fop));
01988 }
01989
01990 if (!dont_parenthesize)
01991 fprintf(f, ")");
01992 }
01993
01994
01995
01996
01997
01998
01999
02000 static BOOL Same_Ug(const ACCESS_VECTOR* av1, const ACCESS_VECTOR* av2)
02001 {
02002 FmtAssert(!av1->Too_Messy && !av2->Too_Messy,
02003 ("Same_Ug(): a too messy access vector"));
02004 FmtAssert(av1->Nest_Depth() == av2->Nest_Depth(),
02005 ("Same_Ug(): bad depths %d and %d",
02006 av1->Nest_Depth(), av2->Nest_Depth()));
02007 INT i;
02008 for (i = 0; i < av1->Nest_Depth(); i++)
02009 if (av1->Loop_Coeff(i) != av2->Loop_Coeff(i))
02010 return FALSE;
02011
02012 if (av1->Contains_Lin_Symb()) {
02013 if (av2->Contains_Lin_Symb() == FALSE ||
02014 !(*av1->Lin_Symb == *av2->Lin_Symb))
02015 return FALSE;
02016 }
02017 else if (av2->Contains_Lin_Symb()) {
02018 return FALSE;
02019 }
02020
02021 if (av1->Contains_Non_Lin_Symb()) {
02022 if (av2->Contains_Non_Lin_Symb() == FALSE ||
02023 !(*av1->Non_Lin_Symb == *av2->Non_Lin_Symb))
02024 return FALSE;
02025 }
02026 else if (av2->Contains_Non_Lin_Symb()) {
02027 return FALSE;
02028 }
02029
02030 return TRUE;
02031 }
02032
02033 static BOOL Same_Ug(const ACCESS_ARRAY* aa1, const ACCESS_ARRAY* aa2)
02034 {
02035 if (aa1->Num_Vec() != aa2->Num_Vec())
02036 return FALSE;
02037 INT i;
02038 for (i = aa1->Num_Vec() - 1; i >= 0; i--)
02039 if (!Same_Ug(aa1->Dim(i), aa2->Dim(i)))
02040 return FALSE;
02041
02042 return TRUE;
02043 }
02044
02045
02046
02047
02048
02049
02050
02051
02052
02053
02054
02055
02056
02057
02058
02059
02060
02061
02062
02063
02064
02065
02066
02067
02068
02069
02070
02071
02072
02073
02074
02075
02076
02077
02078
02079
02080
02081
02082
02083
02084
02085
02086
02087
02088
02089
02090
02091
02092
02093
02094
02095
02096
02097
02098 class RG_NODE : public CHAIN_NODE {
02099 DECLARE_CHAIN_NODE_CLASS(RG_NODE);
02100 public:
02101 INT Mn[SNL_MAX_LOOPS];
02102 INT Mx[SNL_MAX_LOOPS];
02103 INT Mn_Stride;
02104 INT Mx_Stride;
02105 INT Entries;
02106
02107 RG_NODE(INT elements, const INT* mn, const INT* mx,
02108 INT mn_stride, INT mx_stride);
02109 void Print(FILE* f) const;
02110 ~RG_NODE() {}
02111 };
02112
02113 class RG_LIST : public CHAIN {
02114 DECLARE_CHAIN_CLASS(RG_LIST, RG_NODE);
02115 friend class RG;
02116 public:
02117 RG_LIST(MEM_POOL* p, INT nloops, INT element_size, const INT*, const INT*,
02118 BOOL);
02119 ~RG_LIST();
02120
02121 void Insert(const INT* c, INT stride, BOOL has_store);
02122 void Print(FILE*) const;
02123
02124 MEM_POOL* Pool;
02125 INT Elements;
02126 INT Element_Size;
02127 INT Count;
02128 INT Store_Count;
02129 INT Stride_Loop;
02130
02131
02132
02133 INT Actual_Stride;
02134
02135
02136
02137
02138
02139
02140
02141
02142
02143
02144
02145 INT Effective_Element_Size;
02146
02147
02148 INT Next_Outer_Stride_Loop;
02149
02150
02151
02152 mBOOL Using_TLB;
02153 INT Stride_Loops[SNL_MAX_LOOPS];
02154
02155
02156
02157 private:
02158
02159 mINT32 Max_Diff[SNL_MAX_LOOPS];
02160 void Simplify(BOOL first_only);
02161 };
02162
02163 RG_LIST::RG_LIST(MEM_POOL* p, INT nloops, INT element_size,
02164 const INT* loops, const INT* approx_iters, BOOL using_tlb)
02165 : Pool(p), Elements(nloops), Element_Size(element_size),
02166 Count(0), Store_Count(0),
02167 Using_TLB(using_tlb),
02168 Effective_Element_Size(element_size),
02169 Next_Outer_Stride_Loop(-1),
02170 Actual_Stride(-1), Stride_Loop(-1)
02171 {
02172 INT i;
02173 for (i = 0; i < nloops; i++)
02174 Max_Diff[i] = approx_iters[loops[i]];
02175 }
02176
02177 class RG {
02178 public:
02179 RG_LIST Rglist;
02180 RG(MEM_POOL* p, ACCESS_ARRAY* aa, INT elements,
02181 const INT* loop, INT element_size, SYMBOL sym, const INT*, BOOL,
02182 WN*, INT, INT*);
02183 ~RG();
02184
02185 MEM_POOL* Pool;
02186 INT Elements;
02187
02188
02189 BOOL Insert(const ACCESS_ARRAY*, BOOL has_store, INT, const INT*);
02190 void Print(FILE*) const;
02191
02192 ACCESS_ARRAY* Aa;
02193 SYMBOL Sym;
02194 BOOL Is_Local_Array;
02195
02196
02197
02198 INT* C;
02199
02200
02201
02202
02203
02204
02205
02206
02207
02208 INT Big_Indxs;
02209 IMAT* H;
02210 LU_FMAT* H_lu;
02211 LU_FMAT* H_lu_s;
02212 VECTOR_SPACE<FRAC>* KerH;
02213 VECTOR_SPACE<FRAC>* OKerH;
02214 };
02215
02216 class RG_ITER : public CHAIN_ITER {
02217 DECLARE_CHAIN_ITER_CLASS(RG_ITER, RG_NODE, RG_LIST);
02218 };
02219
02220 class RG_CONST_ITER : public CHAIN_ITER {
02221 DECLARE_CHAIN_CONST_ITER_CLASS(RG_CONST_ITER, RG_NODE, RG_LIST);
02222 };
02223
02224 static void Compute_Stride_One_Loop(const IMAT* h,
02225 INT elements,
02226 INT s1row,
02227 INT* s1loop,
02228 INT* stride)
02229 {
02230 *s1loop = -1;
02231 INT j;
02232 for (j = 0; j < elements; j++) {
02233 INT n = (*h)(s1row,j);
02234 if (n == 0)
02235 continue;
02236 INT nn = n < 0 ? -n : n;
02237
02238
02239
02240
02241 INT i;
02242 for (i = 0; i < s1row; i++) {
02243 if ((*h)(i,j) != 0) {
02244 INT jj;
02245 for (jj = 0; jj < elements; jj++) {
02246 if (jj != j && (*h)(i,jj) != 0)
02247 break;
02248 }
02249 if (jj == elements)
02250 break;
02251 }
02252 }
02253
02254 if (i == s1row) {
02255 *s1loop = j;
02256 *stride = nn;
02257 }
02258 }
02259 }
02260
02261
02262 void Fill_Constant_Array(const ACCESS_ARRAY* aa, INT* c,
02263 const INT* loop, INT nloops,
02264 INT big_indxs, INT indxs)
02265 {
02266 Is_True(indxs <= aa->Num_Vec() + big_indxs, ("Broken input"));
02267
02268 INT b = 0;
02269 INT i;
02270 for (i = big_indxs; i < indxs; i++) {
02271 INT constant = aa->Dim(i-big_indxs)->Const_Offset;
02272 for (INT j = 0; j < nloops; j++) {
02273 INT coeff = aa->Dim(i-big_indxs)->Loop_Coeff(loop[j]);
02274 if (ABS(coeff) > MAX_COEFF) {
02275 if (ABS(constant) >= ABS(coeff)) {
02276 c[b++] = constant/coeff;
02277 constant %= coeff;
02278 }
02279 else
02280 c[b++] = 0;
02281 }
02282 }
02283 c[i] = constant;
02284 }
02285
02286 Is_True(big_indxs == b, ("internal check failed"));
02287 }
02288
02289 RG::RG(MEM_POOL* p, ACCESS_ARRAY* aa, INT nloops,
02290 const INT* loop, INT element_size, SYMBOL sym, const INT* approx_iters,
02291 BOOL using_tlb, WN* wn, INT middle_loop_no, INT* approx_inner_iters)
02292 : Pool(p), Aa(aa), Elements(nloops),
02293 Rglist(p, nloops, element_size, loop, approx_iters, using_tlb),
02294 Sym(sym), Big_Indxs(0),
02295 Is_Local_Array(wn?Is_Local_Array_Reference(wn):TRUE)
02296 {
02297 Is_True(nloops <= SNL_MAX_LOOPS, ("Too many loops in nest"));
02298
02299 INT indxs = aa->Num_Vec();
02300
02301
02302
02303
02304
02305
02306 Rglist.Effective_Element_Size = Rglist.Element_Size;
02307 if (indxs > 1) {
02308 if (wn == NULL ||
02309 Long_Line_Policy == LONG_LINE_OFF ||
02310 (Long_Line_Policy == LONG_LINE_CACHE_ONLY && using_tlb) ||
02311 (Long_Line_Policy == LONG_LINE_TLB_ONLY && !using_tlb)) {
02312
02313 }
02314 else if (indxs != WN_num_dim(wn)) {
02315
02316
02317
02318 }
02319 else {
02320 INT lsz = (Rglist.Using_TLB ? Cur_Mhd->Page_Size : Cur_Mhd->Line_Size);
02321 while (indxs > 1) {
02322 WN* dim_wn = WN_array_dim(wn,indxs-1);
02323 if (WN_operator(dim_wn) != OPR_INTCONST)
02324 break;
02325 if (Rglist.Effective_Element_Size * WN_const_val(dim_wn) >= lsz)
02326 break;
02327 Rglist.Effective_Element_Size *= INT(WN_const_val(dim_wn));
02328 indxs--;
02329 }
02330 }
02331 }
02332
02333
02334
02335
02336
02337
02338
02339 INT i;
02340 for (i = 0; i < indxs; i++) {
02341 INT j;
02342 for (j = 0; j < nloops; j++) {
02343 INT coeff = aa->Dim(i)->Loop_Coeff(loop[j]);
02344 if (ABS(coeff) > MAX_COEFF)
02345 Big_Indxs++;
02346 }
02347 }
02348
02349 indxs += Big_Indxs;
02350
02351 H = NULL;
02352 H_lu = NULL;
02353 H_lu_s = NULL;
02354 KerH = NULL;
02355 OKerH = NULL;
02356 C = CXX_NEW_ARRAY(INT, indxs, Pool);
02357 Fill_Constant_Array(aa, C, loop, nloops, Big_Indxs, indxs);
02358 Rglist.Stride_Loop = -1;
02359 Rglist.Next_Outer_Stride_Loop = -1;
02360 Rglist.Actual_Stride = 0;
02361 for (i = 0; i < SNL_MAX_LOOPS; i++)
02362 Rglist.Stride_Loops[i] = -1;
02363
02364 if (nloops) {
02365 H = CXX_NEW(IMAT(indxs, nloops, Pool), Pool);
02366 H->D_Zero();
02367
02368 FMAT Hf(indxs, nloops, Pool);
02369 FMAT Hf_s(indxs, nloops, Pool);
02370
02371 INT b = 0;
02372
02373 for (i = Big_Indxs; i < indxs; i++) {
02374 INT j;
02375 for (j = 0; j < nloops; j++) {
02376 INT coeff = aa->Dim(i-Big_Indxs)->Loop_Coeff(loop[j]);
02377 if (ABS(coeff) > MAX_COEFF) {
02378
02379
02380
02381
02382
02383
02384
02385
02386
02387
02388
02389 (*H)(b,j) = 1;
02390 Hf(b,j) = FRAC(1);
02391 Hf_s(b++,j) = FRAC(1);
02392 }
02393 else {
02394 (*H)(i,j) = coeff;
02395 Hf(i,j) = FRAC(coeff);
02396 if (i < indxs - 1)
02397 Hf_s(i,j) = FRAC(coeff);
02398 }
02399 }
02400 }
02401 Is_True(Big_Indxs == b, ("internal check failed"));
02402 Is_True(H->Rows() == indxs, ("internal check2 failed"));
02403
02404 H_lu = CXX_NEW(LU_FMAT(Hf, Pool), Pool);
02405 H_lu_s = CXX_NEW(LU_FMAT(Hf_s, Pool), Pool);
02406
02407 KerH = CXX_NEW(VECTOR_SPACE<FRAC>(*H_lu, Pool), Pool);
02408 KerH->Beautify();
02409 VECTOR_SPACE<FRAC> KerH_s(*H_lu_s, Pool);
02410 KerH_s.Beautify();
02411 OKerH = CXX_NEW(VECTOR_SPACE<FRAC>(KerH->N(), Pool, TRUE), Pool);
02412 *OKerH -= *KerH;
02413 OKerH->Beautify();
02414 FmtAssert(OKerH->Basis().Cols() == nloops, ("OKerH bug"));
02415
02416
02417
02418 if (KerH->D() != KerH_s.D()) {
02419 FmtAssert(KerH->D() == KerH_s.D() - 1,
02420 ("%d != %d -1", KerH->D(), KerH_s.D()));
02421 Compute_Stride_One_Loop(H, Elements, indxs-1,
02422 &Rglist.Stride_Loop, &Rglist.Actual_Stride);
02423
02424 if (wn && Rglist.Stride_Loop != -1 && indxs >= 2) {
02425 INT actual_stride = 0;
02426 Compute_Stride_One_Loop(H, Elements, indxs-2,
02427 &Rglist.Next_Outer_Stride_Loop,
02428 &actual_stride);
02429 if (actual_stride != 1 ||
02430 Rglist.Next_Outer_Stride_Loop == Rglist.Stride_Loop)
02431 Rglist.Next_Outer_Stride_Loop = -1;
02432 }
02433
02434
02435 Rglist.Stride_Loops[0] = Rglist.Stride_Loop;
02436 Rglist.Stride_Loops[1] = Rglist.Next_Outer_Stride_Loop;
02437 if (Rglist.Next_Outer_Stride_Loop != -1) {
02438 for (i = 2; i <= indxs; i++) {
02439 if (wn != NULL) {
02440 INT actual_stride = 0;
02441 Compute_Stride_One_Loop(H, Elements, indxs - (i + 1),
02442 &(Rglist.Stride_Loops[i]), &actual_stride);
02443 if (actual_stride != 1) {
02444 Rglist.Stride_Loops[i] = -1;
02445 break;
02446 }
02447 INT j;
02448 for (j = 0; j < i; j++)
02449 if (Rglist.Stride_Loops[j] == Rglist.Stride_Loops[i])
02450 break;
02451 if (j < i) {
02452 Rglist.Stride_Loops[i] = -1;
02453 break;
02454 }
02455 }
02456 }
02457 }
02458 if (Debug_Cache_Model >= 2) {
02459 fprintf(TFile, "Stride Loops = {");
02460 INT i;
02461 for (i = 0; i < SNL_MAX_LOOPS; i++) {
02462 if (Rglist.Stride_Loops[i] == -1)
02463 break;
02464 fprintf(TFile, "%d", Rglist.Stride_Loops[i]);
02465 if (i+1 < SNL_MAX_LOOPS && Rglist.Stride_Loops[i+1] != -1)
02466 fprintf(TFile, ",");
02467 }
02468 fprintf(TFile, "}\n");
02469 }
02470
02471
02472
02473
02474
02475
02476
02477
02478
02479
02480
02481
02482
02483
02484 if (Rglist.Stride_Loop != -1 && wn && !using_tlb) {
02485 INT s1_depth = loop[Rglist.Stride_Loop];
02486 if (middle_loop_no == s1_depth && approx_inner_iters[s1_depth] <= 1 &&
02487 Rglist.Effective_Element_Size < Cur_Mhd->Line_Size &&
02488 LNO_Power_Of_Two_Hack && Cur_Mhd->Type == MHD_TYPE_CACHE) {
02489 WN* wn_leading_dim = WN_array_dim(wn, WN_num_dim(wn)-1);
02490 if (WN_operator(wn_leading_dim) == OPR_INTCONST) {
02491 INT64 leading = WN_const_val(wn_leading_dim);
02492 const INT pulled_out_of_my_butt = 16;
02493 INT64 f = (INT64(Cur_Mhd->Associativity) * INT64(Cur_Mhd->Size))
02494 / (Cur_Mhd->Line_Size * pulled_out_of_my_butt);
02495
02496
02497 if (leading >= f/2) {
02498 INT diff = leading % f;
02499 if (diff > f/2)
02500 diff = f - diff;
02501
02502 if ((diff-2)*128 < f) {
02503 Rglist.Actual_Stride = -1;
02504 Rglist.Stride_Loop = -1;
02505 if (Debug_Cache_Model >= 3) {
02506 fprintf(TFile, "pwr2 hack (spatial locality cs=%lld): ",
02507 Cur_Mhd->Size);
02508 Print(TFile);
02509 fprintf(TFile, "\n");
02510 }
02511 }
02512 }
02513 }
02514 }
02515 }
02516 }
02517 }
02518 }
02519
02520 RG::~RG()
02521 {
02522 if (H)
02523 CXX_DELETE(H, Pool);
02524 if (H_lu)
02525 CXX_DELETE(H_lu, Pool);
02526 if (H_lu_s)
02527 CXX_DELETE(H_lu_s, Pool);
02528 if (KerH)
02529 CXX_DELETE(KerH, Pool);
02530 if (OKerH)
02531 CXX_DELETE(OKerH, Pool);
02532 if (C)
02533 CXX_DELETE_ARRAY(C, Pool);
02534 }
02535
02536 RG_NODE::RG_NODE(INT entries, const INT* mn, const INT* mx,
02537 INT mn_stride, INT mx_stride)
02538 : Entries(entries), Mn_Stride(mn_stride), Mx_Stride(mx_stride)
02539 {
02540 INT i;
02541 for (i = 0; i < entries; i++) {
02542 Mn[i] = mn[i];
02543 Mx[i] = mx[i];
02544 }
02545 }
02546
02547 void RG_NODE::Print(FILE* f) const
02548 {
02549 INT i;
02550 for (i = 0; i < Entries; i++) {
02551 fprintf(f, "%s%d/%d", (i == 0 ? "[" : " "), Mn[i], Mx[i]);
02552 }
02553 fprintf(f, " stride=%d/%d]", Mn_Stride, Mx_Stride);
02554 }
02555
02556
02557 RG_LIST::~RG_LIST()
02558 {
02559 RG_ITER iter(this);
02560 RG_NODE* next = NULL;
02561 for (RG_NODE* rg = iter.First(); !iter.Is_Empty(); rg = next) {
02562 next = iter.Next();
02563 CXX_DELETE(rg, Pool);
02564 }
02565 }
02566
02567
02568
02569
02570
02571 void RG_LIST::Simplify(BOOL first_only)
02572 {
02573 BOOL changed = FALSE;
02574
02575 RG_ITER iter(this);
02576 RG_NODE* rgnext = NULL;
02577 for (RG_NODE* rg = iter.First(); rg && !iter.Is_Empty(); rg = rgnext) {
02578 rgnext = first_only ? NULL : iter.Next();
02579
02580
02581
02582 for (RG_NODE* rg2 = rg->Next(); rg2; rg2 = rg2->Next()) {
02583 BOOL ok_to_insert;
02584 INT lsz = Using_TLB ? Cur_Mhd->Page_Size : Cur_Mhd->Line_Size;
02585 INT esz = Effective_Element_Size;
02586
02587 ok_to_insert = (esz*(rg->Mn_Stride - rg2->Mx_Stride) < lsz &&
02588 esz*(rg2->Mn_Stride - rg->Mx_Stride) < lsz);
02589 if (ok_to_insert) {
02590 INT i;
02591 for (i = 0; i < Elements; i++) {
02592 if (rg2->Mn[i] - rg->Mx[i] > Max_Diff[i] ||
02593 rg->Mn[i] - rg2->Mx[i] > Max_Diff[i]) {
02594 ok_to_insert = FALSE;
02595 break;
02596 }
02597 }
02598 }
02599 if (ok_to_insert) {
02600 INT i;
02601 for (i = 0; i < Elements; i++) {
02602 rg2->Mn[i] = MIN(rg->Mn[i], rg2->Mn[i]);
02603 rg2->Mx[i] = MAX(rg->Mx[i], rg2->Mx[i]);
02604 }
02605 rg2->Mx_Stride = MAX(rg2->Mx_Stride, rg->Mx_Stride);
02606 rg2->Mn_Stride = MIN(rg2->Mn_Stride, rg->Mn_Stride);
02607 Remove(rg);
02608 CXX_DELETE(rg, Pool);
02609 changed = TRUE;
02610 break;
02611 }
02612 }
02613 }
02614
02615 if (changed)
02616 Simplify(FALSE);
02617 }
02618
02619
02620
02621
02622 void RG_LIST::Insert(const INT* c, INT stride, BOOL has_store)
02623 {
02624 Count++;
02625 if (has_store)
02626 Store_Count++;
02627
02628 switch (Len()) {
02629 case 0:
02630 if (Debug_Cache_Model >= 4)
02631 fprintf(TFile, "INSERT<2>: first in this rglist\n");
02632 Prepend(CXX_NEW(RG_NODE(Elements, c, c, stride, stride), Pool));
02633 break;
02634
02635 case 1:
02636 {
02637 RG_NODE* rg = Head();
02638 BOOL ok_to_insert;
02639
02640 INT lsz = Using_TLB ? Cur_Mhd->Page_Size : Cur_Mhd->Line_Size;
02641 INT esz = Effective_Element_Size;
02642 double elements_in_line = double(lsz)/esz;
02643 if (Stride_Loop != -1 && Actual_Stride < lsz)
02644 elements_in_line += Max_Diff[Stride_Loop] * Actual_Stride;
02645 elements_in_line = MAX(elements_in_line, 1);
02646
02647 ok_to_insert = (stride - rg->Mx_Stride < elements_in_line &&
02648 rg->Mn_Stride - stride < elements_in_line);
02649
02650 if (ok_to_insert) {
02651 INT i;
02652 for (i = 0; i < Elements; i++) {
02653 if (c[i]-rg->Mx[i] > Max_Diff[i] || rg->Mn[i]-c[i] > Max_Diff[i]) {
02654 if (Debug_Cache_Model >= 4) {
02655 fprintf(TFile, "INSERT<2>: index clash: can't go in group: ");
02656 rg->Print(TFile);
02657 }
02658 ok_to_insert = FALSE;
02659 break;
02660 }
02661 }
02662 }
02663 else {
02664 if (Debug_Cache_Model >= 4) {
02665 fprintf(TFile, "INSERT<2>: cache line clash: can't go in group: ");
02666 rg->Print(TFile);
02667 }
02668 }
02669
02670 if (ok_to_insert) {
02671 INT i;
02672 for (i = 0; i < Elements; i++) {
02673 rg->Mx[i] = MAX(rg->Mx[i], c[i]);
02674 rg->Mn[i] = MIN(rg->Mn[i], c[i]);
02675 }
02676 rg->Mx_Stride = MAX(rg->Mx_Stride, stride);
02677 rg->Mn_Stride = MIN(rg->Mn_Stride, stride);
02678 if (Debug_Cache_Model >= 4) {
02679 fprintf(TFile, "INSERT<2>: inserted, producing ");
02680 Print(TFile);
02681 fprintf(TFile, "\n");
02682 }
02683 }
02684 else
02685 Prepend(CXX_NEW(RG_NODE(Elements, c, c, stride, stride), Pool));
02686 break;
02687
02688 }
02689 default:
02690 if (Debug_Cache_Model >= 4)
02691 fprintf(TFile, "INSERT<2>: rglist already has %d -- complex\n", Len());
02692
02693
02694 Prepend(CXX_NEW(RG_NODE(Elements, c, c, stride, stride), Pool));
02695 Simplify(TRUE);
02696
02697 break;
02698 }
02699 }
02700
02701
02702
02703
02704 BOOL RG::Insert(const ACCESS_ARRAY* aa, BOOL has_store, INT nloops,
02705 const INT* loop)
02706 {
02707 INT cc[SNL_MAX_LOOPS];
02708 FRAC x[SNL_MAX_LOOPS];
02709
02710 INT indxs = nloops ? H->Rows() : aa->Num_Vec() - 1;
02711
02712
02713
02714 INT* newC = (INT*) alloca(sizeof(INT)*indxs);
02715 Fill_Constant_Array(aa, newC, loop, nloops, Big_Indxs, indxs);
02716
02717
02718 BOOL zero_const_diff = TRUE;
02719 INT i;
02720 for (i = 0; i < indxs-1; i++) {
02721 if (newC[i] != C[i]) {
02722 zero_const_diff = FALSE;
02723 break;
02724 }
02725 }
02726
02727
02728 if (zero_const_diff) {
02729 INT stride1d = newC[indxs-1] - C[indxs-1];
02730 if (Debug_Cache_Model >= 4)
02731 fprintf(TFile, "INSERT: Inserting! Stride one diff: %d\n", stride1d);
02732 INT i;
02733 for (i = 0; i < Elements; i++)
02734 cc[i] = 0;
02735 Rglist.Insert(cc, stride1d, has_store);
02736 return TRUE;
02737 }
02738
02739 BOOL ok = FALSE;
02740 if (nloops) {
02741
02742 FRAC* c = CXX_NEW_ARRAY(FRAC, indxs, &LNO_local_pool);
02743 for (i = 0; i < indxs-1; i++)
02744 c[i] = FRAC(newC[i] - C[i]);
02745 for ( ; i < indxs; i++)
02746 c[i] = FRAC(0);
02747
02748 ok = H_lu_s->Particular_Solution(c, x);
02749 CXX_DELETE_ARRAY(c, &LNO_local_pool);
02750 }
02751
02752
02753
02754 if (ok) {
02755 for (i = 0; i < Elements; i++) {
02756 if (x[i].D() != 1) {
02757 ok = FALSE;
02758 break;
02759 }
02760 }
02761 }
02762
02763
02764 if (ok) {
02765 INT stride1d = newC[indxs-1] - C[indxs-1];
02766
02767 for (i = 0; i < Elements; i++)
02768 cc[i] = x[i].N();
02769
02770
02771
02772 INT lsz = (Rglist.Using_TLB ? Cur_Mhd->Page_Size : Cur_Mhd->Line_Size);
02773 if (Rglist.Stride_Loop != -1 && lsz > Rglist.Effective_Element_Size)
02774 cc[Rglist.Stride_Loop] = 0;
02775
02776 if (Debug_Cache_Model >= 4) {
02777 fprintf(TFile, "INSERT: Particular solution:");
02778 INT i;
02779 for (i = 0; i < Elements; i++)
02780 fprintf(TFile, " %d", cc[i]);
02781 fprintf(TFile, " with const diff %d\n", stride1d);
02782 }
02783
02784 Rglist.Insert(cc, stride1d, has_store);
02785 return TRUE;
02786 }
02787
02788
02789
02790 if (Debug_Cache_Model >= 4)
02791 fprintf(TFile, "INSERT: No particular solution! No insertion!\n");
02792
02793 return FALSE;
02794 }
02795
02796 void RG_LIST::Print(FILE* f) const
02797 {
02798 fprintf(f, "<es=%d, s1l=%d, s1r=%d, cnt=%d(w=%d) tlb=%d>",
02799 Effective_Element_Size, Stride_Loop, Actual_Stride,
02800 Count, Store_Count, Using_TLB);
02801 RG_CONST_ITER iter(this);
02802 for (const RG_NODE* rg = iter.First(); !iter.Is_Empty(); rg = iter.Next()) {
02803 fprintf(f, " ");
02804 rg->Print(f);
02805 }
02806 }
02807
02808 void RG::Print(FILE* f) const
02809 {
02810 fprintf(f, "RG_LIST: %s", Sym.Name());
02811 if (Is_Local_Array)
02812 fprintf(f, "<local>");
02813 Aa->Print(f);
02814 fprintf(f, "{");
02815 Rglist.Print(f);
02816 fprintf(f, "}\n");
02817
02818 #if 0
02819 if (C) {
02820 fprintf(f, "C:");
02821 INT i;
02822 for (i = 0; i < Elements; i++)
02823 fprintf(f, " %d", C[i]);
02824 fprintf(f, "\n");
02825 }
02826 if (H) {
02827 fprintf(f, "H:\n");
02828 H->Print(f);
02829 }
02830 if (H_lu) {
02831 fprintf(f, "H_lu:\n");
02832 H_lu->Print(f);
02833 }
02834 if (H_lu_s) {
02835 fprintf(f, "H_lu_s:\n");
02836 H_lu_s->Print(f);
02837 }
02838 if (KerH) {
02839 fprintf(f, "KerH:");
02840 KerH->Print(f);
02841 }
02842 if (OKerH) {
02843 fprintf(f, "OKerH:");
02844 OKerH->Print(f);
02845 }
02846 #endif
02847 }
02848
02849
02850
02851
02852
02853
02854
02855
02856
02857
02858
02859
02860
02861
02862
02863
02864
02865
02866
02867
02868
02869
02870
02871
02872
02873
02874
02875
02876
02877
02878
02879
02880
02881
02882
02883
02884 FORMULA* Formula_For_Nk(INT k,
02885 INT v_first,
02886 INT64 outersz,
02887 INT* approx_inner_iters,
02888 INT nloops,
02889 const INT* all_loops,
02890 INT* const_answer)
02891 {
02892 FmtAssert(outersz == -1 || outersz > 0, ("Bad outersz %lld", outersz));
02893
02894 INT innersz = MAX(approx_inner_iters[all_loops[k]],1);
02895 FORMULA* f = NULL;
02896
02897 if (k == 0 && v_first == -1) {
02898 if (const_answer)
02899 *const_answer = outersz * innersz;
02900 else
02901 f = FORMULA::Const(outersz * innersz);
02902 }
02903 else if (k >= nloops) {
02904 if (const_answer)
02905 *const_answer = innersz;
02906 else
02907 f = FORMULA::Const(innersz);
02908 }
02909 else {
02910 f = FORMULA::Var(v_first + k);
02911 if (innersz > 1)
02912 f = FORMULA::Mul(f, FORMULA::Const(innersz));
02913 }
02914
02915 return f;
02916 }
02917
02918 static INT Cvt_To_All_K(INT loop, INT all_nloops, const INT* all_loops)
02919 {
02920 INT kk;
02921 for (kk = 0; kk < all_nloops; kk++) {
02922 if (loop == all_loops[kk])
02923 return kk;
02924 }
02925 FmtAssert(0, ("can't find loop=%d in all_loops", loop));
02926 return -1;
02927 }
02928
02998 static FORMULA* Formula_For_Ak(INT k,
02999 INT v_first,
03000 INT64 outersz,
03001 INT* approx_inner_iters,
03002 INT nloops,
03003 INT all_nloops,
03004 const INT* all_loops,
03005 const INT* nc_loops,
03006 RG_NODE* n,
03007 RG* rg,
03008 INT coeff,
03009 BOOL using_tlb,
03010 INT inner_iters_to_stop_edge_effects,
03011 const INT* unrolls)
03012 {
03013 Is_True(k >= 0 && k < n->Entries, ("Indexed RG_NODE out of bounds"));
03014 INT lsz = using_tlb ? Cur_Mhd->Page_Size : Cur_Mhd->Line_Size;
03015 INT esz = rg->Rglist.Effective_Element_Size;
03016 INT stride = rg->Rglist.Actual_Stride;
03017
03018
03019
03020
03021
03022
03023 INT stride_loop = rg->Rglist.Stride_Loop;
03024 if (stride_loop != -1)
03025 stride_loop = Cvt_To_All_K(nc_loops[stride_loop], all_nloops, all_loops);
03026 if (stride_loop != k ||
03027 lsz <= (stride-(n->Mx_Stride-n->Mn_Stride))*esz) {
03028
03029
03030 FORMULA* f = Formula_For_Nk(k, v_first, outersz, approx_inner_iters,
03031 nloops, all_loops, NULL);
03032 INT diff = n->Mx[k] - n->Mn[k];
03033 if (diff < 0) {
03034 Is_True(0, ("Impossible diff"));
03035 diff = 0;
03036 }
03037 if (diff)
03038 f = FORMULA::Add(f, FORMULA::Const(diff));
03039
03040
03041
03042 INT i;
03043 for (i = 0; i < SNL_MAX_LOOPS && rg->Rglist.Stride_Loops[i] != -1; i++)
03044 if (rg->Rglist.Stride_Loops[i] == k)
03045 break;
03046 FORMULA* fx = NULL;
03047 if (i < SNL_MAX_LOOPS && rg->Rglist.Stride_Loops[i] != -1) {
03048 INT cls = (esz < lsz) ? lsz/esz : 1;
03049 double d = double(cls - 1);
03050 fx = FORMULA::Const(d);
03051 INT j;
03052 for (j = 0; j < i; j++) {
03053 FORMULA* fxp = Formula_For_Nk(rg->Rglist.Stride_Loops[j], v_first,
03054 outersz, approx_inner_iters, nloops, all_loops, NULL);
03055 if (unrolls[j] != 1)
03056 fx = FORMULA::Div(fx, FORMULA::Mul(FORMULA::Const(unrolls[j]), fxp));
03057 else
03058 fx = FORMULA::Div(fx, fxp);
03059 }
03060 }
03061 if (fx != NULL)
03062 f = FORMULA::Add(f, fx);
03063
03064 if (Debug_Cache_Model >= 3) {
03065 fprintf(TFile, "Ak <1,k=%d> returning ", k);
03066 f->Print(TFile);
03067 fprintf(TFile, "\n");
03068 }
03069 return f;
03070 }
03071
03072
03073
03074
03075
03076
03077 INT next_inner_k = rg->Rglist.Next_Outer_Stride_Loop;
03078 if (next_inner_k != -1)
03079 next_inner_k = Cvt_To_All_K(nc_loops[next_inner_k], all_nloops, all_loops);
03080 INT cls = (esz < lsz) ? lsz/esz : 1;
03081 if (coeff < 0)
03082 coeff = -coeff;
03083
03084
03085 INT x = MIN(coeff, cls + n->Mx_Stride - n->Mn_Stride);
03086 x = MAX(x, 1);
03087
03088 BOOL reduce_edge_effects;
03089
03090 if (LNO_Cache_Model_Edge_Effects == FALSE || cls == 1)
03091 reduce_edge_effects = FALSE;
03092 else if (inner_iters_to_stop_edge_effects == 0)
03093 reduce_edge_effects = TRUE;
03094 else if (next_inner_k == -1 || inner_iters_to_stop_edge_effects < 0)
03095 reduce_edge_effects = FALSE;
03096 else
03097 reduce_edge_effects = TRUE;
03098
03099
03100
03101
03102
03103
03104
03105
03106
03107
03108
03109
03110
03111
03112 FORMULA* f = Formula_For_Nk(k, v_first, outersz, approx_inner_iters,
03113 nloops, all_loops, NULL);
03114 INT uu = next_inner_k == -1 ? 1 : unrolls[next_inner_k];
03115 FORMULA* v = NULL;
03116 if (reduce_edge_effects && inner_iters_to_stop_edge_effects && cls > 1) {
03117 FORMULA* fdup = f->Duplicate();
03118 v = Formula_For_Nk(next_inner_k, v_first, outersz,
03119 approx_inner_iters, nloops, all_loops, NULL);
03120 if (uu > 1)
03121 v = FORMULA::Mul(FORMULA::Const(uu), v);
03122 if (reduce_edge_effects > 0)
03123 v = FORMULA::Cond(
03124 FORMULA::Lt(fdup, FORMULA::Const(inner_iters_to_stop_edge_effects)),
03125 FORMULA::Const(uu),
03126 v);
03127 }
03128
03129 FORMULA* fx = x == 1 ? f : FORMULA::Mul(f, FORMULA::Const(x));
03130 FORMULA* ans;
03131
03132 if (v == NULL) {
03133 double d = 1 - x + n->Mx_Stride - n->Mn_Stride + double(cls - 1)/uu;
03134 ans = fabs(d) < 0.000001 ? fx : FORMULA::Add(fx, FORMULA::Const(d));
03135 }
03136 else {
03137 INT d = 1 - x + n->Mx_Stride - n->Mn_Stride;
03138 ans = FORMULA::Add(d == 0 ? fx : FORMULA::Add(fx, FORMULA::Const(d)),
03139 FORMULA::Div(FORMULA::Const(cls-1), v));
03140 }
03141 if (Debug_Cache_Model >= 3) {
03142 fprintf(TFile, "Ak <2,k=%d> returning ", k);
03143 ans->Print(TFile);
03144 fprintf(TFile, "\n");
03145 }
03146 return ans;
03147 }
03148
03149 FORMULA* COMPUTE_FOOTPRINT_RVAL::AllFormula()
03150 {
03151 if (_formula == NULL) {
03152 if (RFormula == NULL)
03153 _formula = WFormula;
03154 else if (WFormula == NULL)
03155 _formula = RFormula;
03156 else
03157 _formula = FORMULA::Add(RFormula, WFormula);
03158 }
03159 return _formula;
03160 }
03161
03162 void COMPUTE_FOOTPRINT_RVAL::Print(FILE* f) const
03163 {
03164 fprintf(f, "Footprint D=%d,", D);
03165 fprintf(f, " rformula=");
03166 if (RFormula)
03167 RFormula->Print(f);
03168 else
03169 fprintf(f, "<none>");
03170 fprintf(f, " wformula=");
03171 if (WFormula)
03172 WFormula->Print(f);
03173 else
03174 fprintf(f, "<none>");
03175 fprintf(f, "\n");
03176 }
03177
03178
03179
03180
03181
03182
03183
03184
03185
03186
03187
03188
03189
03190
03191
03192
03193
03194
03195 static BOOL Middle_Loop_Pwr2_Group_Hack(INT* approx_inner_iters,
03196 RG* rg,
03197 const ARRAY_REF_NODE* n,
03198 BOOL tlb,
03199 const INT* nc_loops,
03200 INT nc_nloops)
03201 {
03202 if (LNO_Power_Of_Two_Hack == FALSE ||
03203 tlb ||
03204 Cur_Mhd->Type != MHD_TYPE_CACHE)
03205
03206
03207
03208 return TRUE;
03209
03210 WN* wn = n->Wn;
03211
03212 if (wn == NULL)
03213 return TRUE;
03214
03215 const INT pulled_out_of_my_butt = 16;
03216 INT64 f = (INT64(Cur_Mhd->Associativity) * INT64(Cur_Mhd->Size))
03217 / (Cur_Mhd->Line_Size * pulled_out_of_my_butt);
03218
03219 INT64 leading = 1;
03220 INT leading_dim = WN_num_dim(wn) - 1;
03221 INT dim;
03222 for (dim = leading_dim; dim >= 0; dim--) {
03223 WN* wn_dim = WN_array_dim(wn, dim);
03224 if (WN_operator(wn_dim) != OPR_INTCONST)
03225 return TRUE;
03226 leading *= (INT64) WN_const_val(wn_dim);
03227
03228
03229 if (leading > f/2) {
03230 INT diff = leading % f;
03231 if (diff > f/2)
03232 diff = f - diff;
03233
03234
03235
03236 if ((diff-2)*128 < f)
03237 break;
03238 }
03239 }
03240
03241
03242
03243
03244
03245
03246
03247 ACCESS_ARRAY* aa = n->Array;
03248 ACCESS_ARRAY* rgaa = rg->Aa;
03249
03250 if (aa->Num_Vec() != rgaa->Num_Vec())
03251 return TRUE;
03252
03253 if (aa->Num_Vec() <= dim)
03254 return TRUE;
03255 INT i;
03256 for (i = 0; i < dim; i++) {
03257 ACCESS_VECTOR* av = aa->Dim(i);
03258 ACCESS_VECTOR* rgav = rgaa->Dim(i);
03259 if (av->Nest_Depth() != rgav->Nest_Depth())
03260 continue;
03261 if (av->Const_Offset == rgav->Const_Offset)
03262 continue;
03263
03264
03265
03266
03267 INT status = -1;
03268 INT j;
03269 for (j = 0; j < av->Nest_Depth(); j++) {
03270 if (av->Loop_Coeff(j) != 0) {
03271 if (status == -1)
03272 status = j;
03273 else
03274 status = -2;
03275 }
03276 }
03277 if (status < 0)
03278 continue;
03279
03280
03281
03282 BOOL ncn_valid = FALSE;
03283 INT ncn;
03284 for (ncn = 0; ncn < nc_nloops; ncn++) {
03285 if (nc_loops[ncn] == status) {
03286 ncn_valid = TRUE;
03287 break;
03288 }
03289 }
03290 if (ncn_valid == FALSE)
03291 continue;
03292
03293
03294 INT ncnn;
03295 for (ncnn = 0; ncnn < nc_nloops; ncnn++) {
03296 if (ncnn == ncn)
03297 continue;
03298
03299 INT jn = nc_loops[ncnn];
03300
03301
03302 INT d;
03303 for (d = 0; d <= dim; d++) {
03304 if (d != i && aa->Dim(d)->Loop_Coeff(jn))
03305 break;
03306 }
03307
03308 if (d <= dim) {
03309
03310 if (ncn < ncnn || approx_inner_iters[jn] > 1) {
03311 if (Debug_Cache_Model >= 3) {
03312 fprintf(TFile, "pwr2 hack group locality, cs=%lld: won't insert ",
03313 Cur_Mhd->Size);
03314 n->Print(TFile);
03315 fprintf(TFile, "into ");
03316 rg->Print(TFile);
03317 }
03318 return FALSE;
03319 }
03320 }
03321 }
03322 }
03323
03324 return TRUE;
03325 }
03326
03327
03328
03329
03330
03331
03332
03333
03334
03335
03336
03337
03338
03339
03340 static BOOL Cache_Line_Edge_Reuse(INT loop, INT u, ACCESS_ARRAY* aa)
03341 {
03342 FmtAssert(aa, ("Bad access array passed to Cache_Line_Edge_Reuse"));
03343 INT indxs = aa->Num_Vec();
03344 INT j;
03345 for (j = indxs - 1; j >= 0; j--) {
03346 ACCESS_VECTOR* av = aa->Dim(j);
03347 INT coeff = av->Loop_Coeff(loop);
03348 if (coeff > u || (j < indxs-2 && coeff))
03349 return FALSE;
03350 }
03351 return TRUE;
03352 }
03353
03354
03355
03356
03357
03358
03359
03360
03361
03362
03363
03364
03365
03366
03367
03368
03369
03370
03371
03372
03373
03374
03375
03376
03377
03378
03379
03380
03381
03382
03383
03384
03385
03386
03387
03388
03389
03390
03391
03392
03393
03394
03395
03396
03397
03398
03399
03400
03401 extern COMPUTE_FOOTPRINT_RVAL Compute_Footprint(
03402 const ARRAY_REF* arl,
03403 INT nloops,
03404 const INT* loops,
03405 const INT* arl_stripsz,
03406 const INT64* est_iters,
03407 const INT64* max_iters,
03408 const INT* unrolls,
03409 INT depth,
03410 INT stripdepth,
03411 const INT* permute_order,
03412 INT v_first,
03413 INT64 outersz,
03414 BOOL using_tlb,
03415 INT middle_loop_no)
03416 {
03417 COMPUTE_FOOTPRINT_RVAL rval;
03418
03419 if (Debug_Cache_Model >= 3) {
03420 fprintf(TFile,
03421 "Compute_Footprint <tlb=%d>: v_first=%d outersz=%lld loops=(",
03422 using_tlb, v_first, outersz);
03423 INT i;
03424 for (i = 0; i < nloops; i++)
03425 fprintf(TFile, "%s%d", (i == 0 ? "" : ","), loops[i]);
03426 fprintf(TFile, ")\n");
03427 }
03428
03429
03430
03431
03432 INT all_nloops = nloops;
03433 INT* all_loops = (INT*) alloca(sizeof(INT)*(depth+1));
03434 INT i;
03435 for (i = 0; i < nloops; i++)
03436 all_loops[i] = loops[i];
03437 for (i = stripdepth; i <= depth; i++) {
03438 INT ii;
03439 for (ii = 0; ii < all_nloops; ii++) {
03440 if (all_loops[ii] == permute_order[i])
03441 break;
03442 }
03443 if (ii == all_nloops)
03444 all_loops[all_nloops++] = permute_order[i];
03445 }
03446
03447 INT* max_diff = (INT*)alloca(sizeof(INT) * (depth+1));
03448 for (i = 0; i <= depth; i++)
03449 max_diff[i] = MAX(5, est_iters[i]-5) * unrolls[i];
03450
03451
03452
03453
03454 INT* approx_inner_iters = (INT*)alloca(sizeof(INT) * (depth+1));
03455 for (i = 0; i <= depth; i++) {
03456
03457
03458 approx_inner_iters[i] = (arl_stripsz[i] > 1) ? arl_stripsz[i] : 1;
03459 }
03460 for (i = 1; i < all_nloops; i++) {
03461
03462 INT lp = all_loops[i];
03463 if (arl_stripsz[lp] < 1)
03464 approx_inner_iters[lp] = est_iters[lp];
03465 }
03466
03467
03468 INT ix;
03469 for (ix = 0; ix < arl->Elements(); ix++) {
03470
03471 const ARRAY_REF_LIST* arlist = arl->Array_Ref_List(ix);
03472
03473
03474 INT ncdepth = 0;
03475 ARRAY_REF_CONST_ITER iternn(arlist);
03476 for (const ARRAY_REF_NODE* nnn = iternn.First();
03477 !iternn.Is_Empty(); nnn = iternn.Next()) {
03478 if (nnn->Array) {
03479 INT this_ref_ncdepth = nnn->Array->Non_Const_Loops();
03480 ncdepth = MAX(ncdepth, this_ref_ncdepth);
03481 }
03482 }
03483
03484 INT nc_nloops = 0;
03485 INT* nc_loops = (INT*) alloca(sizeof(INT)*(depth+1));
03486 INT i;
03487 for (i = 0; i < all_nloops; i++) {
03488 if (all_loops[i] >= ncdepth)
03489 nc_loops[nc_nloops++] = all_loops[i];
03490 }
03491
03492 STACK<RG*> rg_stack(&LNO_local_pool);
03493
03494 ARRAY_REF_CONST_ITER iter(arlist);
03495
03496 for (const ARRAY_REF_NODE* n = iter.First();
03497 !iter.Is_Empty(); n = iter.Next()) {
03498
03499
03500
03501 INT esz = n->Element_Size();
03502 #ifdef KEY
03503
03504
03505 if (esz == 0)
03506 esz = WN_element_size(n->Wn);
03507
03508 if (esz < 0) esz = -esz;
03509 #endif
03510 if (Debug_Cache_Model >= 4)
03511 fprintf(TFile, "-> arl entry from base %d, insertion info\n", ix);
03512
03513 BOOL inserted = FALSE;
03514 INT i;
03515 for (i = 0; i < rg_stack.Elements(); i++) {
03516 RG* rg = rg_stack.Bottom_nth(i);
03517 if (esz == rg->Rglist.Element_Size &&
03518 Same_Ug(n->Array, rg->Aa) &&
03519 Middle_Loop_Pwr2_Group_Hack(approx_inner_iters, rg, n,
03520 using_tlb, nc_loops, nc_nloops) &&
03521 rg->Insert(n->Array, n->Has_Store(), nc_nloops, nc_loops)) {
03522 inserted = TRUE;
03523 break;
03524 }
03525 }
03526 if (!inserted) {
03527 RG *rg = CXX_NEW(RG(&LNO_local_pool, n->Array,
03528 nc_nloops, nc_loops, esz,
03529 *arl->Array_Ref_List(ix)->Base_Array,
03530 max_diff, using_tlb, n->Wn, middle_loop_no,
03531 approx_inner_iters),&LNO_local_pool);
03532 rg_stack.Push(rg);
03533 rg_stack.Top()->Insert(n->Array, n->Has_Store(), nc_nloops, nc_loops);
03534 }
03535 }
03536
03537
03538
03539 for (i = 0; i < rg_stack.Elements(); i++) {
03540
03541
03542
03543 RG* rg = rg_stack.Bottom_nth(i);
03544 RG_LIST* rglist = &rg->Rglist;
03545 RG_ITER iter(rglist);
03546 INT esz = rglist->Effective_Element_Size;
03547 INT llsz = using_tlb ? Cur_Mhd->Page_Size : Cur_Mhd->Line_Size;
03548 INT lsz = MAX(llsz, esz);
03549
03550 INT inner_iters_to_stop_edge_effects = -1;
03551 if (rg->Rglist.Stride_Loop != -1 &&
03552 rg->Rglist.Next_Outer_Stride_Loop != -1) {
03553 INT lp = nc_loops[rg->Rglist.Stride_Loop];
03554 INT mest = max_iters[lp];
03555 if (mest != UNBOUNDED_ITERS)
03556 inner_iters_to_stop_edge_effects = mest;
03557 }
03558 if (inner_iters_to_stop_edge_effects == -1 && nloops == 1) {
03559
03560
03561
03562
03563 INT p0;
03564 for (p0 = depth; p0 >= 0; p0--)
03565 if (permute_order[p0] == loops[0])
03566 break;
03567 if (p0 >= 1) {
03568 INT loop2 = permute_order[p0-1];
03569 if (Cache_Line_Edge_Reuse(loop2, unrolls[loop2], rg->Aa)) {
03570 inner_iters_to_stop_edge_effects = 0;
03571 }
03572 }
03573 }
03574
03575 const FMAT* basis = rg->OKerH ? &rg->OKerH->Basis() : NULL;
03576
03577 for (RG_NODE* n = iter.First(); !iter.Is_Empty(); n = iter.Next()) {
03578
03579
03580 FORMULA* f = NULL;
03581
03582 if (basis) {
03583
03584 INT dmultiple = approx_inner_iters[all_loops[0]];
03585 INT newd;
03586
03587 if (rglist->Stride_Loop == 0 && llsz > rglist->Actual_Stride * esz) {
03588
03589 FmtAssert(rglist->Actual_Stride >= 1,
03590 ("Bad stride %d for s1loop %d",
03591 rglist->Actual_Stride, rglist->Stride_Loop));
03592 newd = Divceil(n->Mx_Stride - n->Mn_Stride+lsz/esz,
03593 rglist->Actual_Stride);
03594 }
03595 else
03596 newd = n->Mx[0] - n->Mn[0];
03597 if (dmultiple > 1)
03598 newd = Divceil(newd, dmultiple);
03599 rval.D = MAX(rval.D, newd);
03600
03623
03624
03625
03626
03627 UINT64 gtmax = 0;
03628 INT ii;
03629 for (ii = 0; ii < basis->Rows(); ii++)
03630 for (INT jj = 0; jj < basis->Cols(); jj++)
03631 if ((*basis)(ii,jj).Abs() > Happy_Coefficient)
03632 gtmax |= (1<<jj);
03633
03634 INT s1row = rg->H->Rows() - 1;
03635
03636 if (gtmax) {
03637
03638 INT k;
03639 for (k = 0; k < basis->Cols(); k++) {
03640 if (gtmax & (1<<k)) {
03641 INT kk = Cvt_To_All_K(nc_loops[k], all_nloops, all_loops);
03642 FORMULA* fk = Formula_For_Ak(kk, v_first,
03643 outersz, approx_inner_iters,
03644 nloops, all_nloops, all_loops,
03645 nc_loops,
03646 n, rg,
03647 (*rg->H)(s1row,k), using_tlb,
03648 inner_iters_to_stop_edge_effects,
03649 unrolls);
03650 f = f ? FORMULA::Mul(f, fk) : fk;
03651 }
03652 }
03653 }
03654
03655 for (ii = 0; ii < basis->Rows(); ii++) {
03656 INT p = 1;
03657 INT k;
03658 for (k = 0; k < basis->Cols(); k++)
03659 if (!(gtmax & (1<<k)) && (*basis)(ii,k) != 0)
03660 p = p * (*basis)(ii,k).N();
03661 p = (p < 0) ? -p : p;
03662
03663 FORMULA* f2 = NULL;
03664
03665 for (k = 0; k < basis->Cols(); k++) {
03666 if (gtmax & (1<<k))
03667 continue;
03668
03669 INT c = (*basis)(ii,k).N();
03670
03671 if (c == 0)
03672 continue;
03673 INT kk = Cvt_To_All_K(nc_loops[k], all_nloops, all_loops);
03674 FORMULA* fk = Formula_For_Ak(kk, v_first,
03675 outersz, approx_inner_iters,
03676 nloops, all_nloops, all_loops,
03677 nc_loops,
03678 n, rg,
03679 (*rg->H)(s1row,k), using_tlb,
03680 inner_iters_to_stop_edge_effects,
03681 unrolls);
03682 if (p != c)
03683 fk = FORMULA::Mul(fk, FORMULA::Const(p/ABS(c)));
03684 f2 = f2 ? FORMULA::Add(f2, fk) : fk;
03685 }
03686 if (f2) {
03687 #if 0 // TODO: If I were more careful, I'd know when exactly to do
03688
03689
03690 if (p > 1)
03691 f2 = FORMULA::Sub(f2, FORMULA::Const(p-1));
03692 #endif
03693 f = f ? FORMULA::Mul(f, f2) : f2;
03694 }
03695 }
03696 }
03697
03698 if (f == NULL)
03699 f = FORMULA::Const(1);
03700 INT ii;
03701 for (ii = 0; ii < nloops; ii++) {
03702 BOOL found = FALSE;
03703 for (INT jj = 0; jj < nc_nloops; jj++) {
03704 if (all_loops[ii] == nc_loops[jj])
03705 found = TRUE;
03706 }
03707 if (found == FALSE) {
03708 FORMULA* nk = Formula_For_Nk(ii, v_first,
03709 outersz, approx_inner_iters,
03710 nloops, all_loops, NULL);
03711 f = FORMULA::Mul(nk, f);
03712 }
03713 }
03714
03715 double es_const = esz;
03716 if ((rg->Rglist.Stride_Loop == -1 || lsz <= esz) &&
03717 !arlist->Is_Scalar_Expanded()) {
03718 es_const *= double(lsz)/esz + n->Mx_Stride - n->Mn_Stride;
03719 }
03720 FORMULA* es = FORMULA::Const(INT(es_const));
03721 f = f ? FORMULA::Mul(es,f) : es;
03722 if (Debug_Cache_Model >= 3) {
03723 fprintf(TFile, "-> arl base %d, subgroup %d <-\n", ix, i);
03724 rg->Print(TFile);
03725 fprintf(TFile, "bytes = ");
03726 f->Print(TFile);
03727 fprintf(TFile, "\n");
03728 }
03729 if (rg->Rglist.Store_Count)
03730 rval.WFormula = rval.WFormula ? FORMULA::Add(rval.WFormula, f) : f;
03731 else
03732 rval.RFormula = rval.RFormula ? FORMULA::Add(rval.RFormula, f) : f;
03733 }
03734 }
03735 }
03736
03737 return rval;
03738 }
03739
03740
03741
03742
03743 static double RSolve(FORMULA* f,
03744 INT nloops,
03745 const INT* loops,
03746 const INT64* adj_max_iters,
03747 const INT64* fixed_iters,
03748 mINT64* t,
03749 INT loopno);
03750
03751 static double Rtry(INT64 b,
03752 FORMULA* f,
03753 INT nloops,
03754 const INT* loops,
03755 const mINT64* adj_max_iters,
03756 const mINT64* fixed_iters,
03757 mINT64* t,
03758 INT loopno,
03759 INT loopno_last)
03760 {
03761 double answer;
03762 INT i;
03763 for (i = loopno; i <= loopno_last; i++) {
03764
03765 t[i] = MIN(b, adj_max_iters[loops[i]]);
03766 }
03767
03768 if (loopno_last < nloops-1)
03769 answer = RSolve(f, nloops, loops, adj_max_iters, fixed_iters,
03770 t, loopno_last+1);
03771 else {
03772 answer = f->Eval(nloops, t);
03773 Rtry_Count++;
03774 if (Debug_Cache_Model >= 3) {
03775 fprintf(TFile, "%d: Formula(", Rtry_Count);
03776 INT i;
03777 for (i = 0; i < nloops; i++) {
03778 fprintf(TFile, "%lld", t[i]);
03779 if (i < nloops - 1)
03780 fprintf(TFile, ",");
03781 }
03782 fprintf(TFile, ") = %g\n", answer);
03783 }
03784 }
03785
03786 return answer;
03787 }
03788
03789 static double RSolve3(FORMULA* f,
03790 INT64 b1,
03791 INT64 b2,
03792 INT64 b3,
03793 double v1,
03794 double v2,
03795 double v3,
03796 mINT64* t1,
03797 mINT64* t2,
03798 mINT64* t3,
03799
03800 mINT64* t,
03801
03802 INT nloops,
03803 const INT* loops,
03804 const mINT64* adj_max_iters,
03805 const mINT64* fixed_iters,
03806 INT loopno,
03807 INT loopno_last)
03808 {
03809 if (v1 < v2 || v3 < v2)
03810 DevWarn("RSolve3 running sub-optimally -- okay");
03811
03812 INT blk_step = 1;
03813
03814
03815
03816
03817
03818 if ((b1 + 1 + b1/RECT_BLOCKSIZE_UNCERTAINTY >= b2 || b1 + blk_step >= b2) &&
03819 (b2 + 1 + b2/RECT_BLOCKSIZE_UNCERTAINTY >= b3 || b2 + blk_step >= b3)) {
03820 INT i;
03821 for (i = loopno; i < nloops; i++)
03822 t[i] = t2[i];
03823 return v2;
03824 }
03825
03826 INT64 bnew;
03827 double vnew;
03828 INT64 tnew[SNL_MAX_LOOPS];
03829 INT i;
03830 for (i = 0; i < loopno; i++) {
03831 Is_True(t1[i] == t[i] && t2[i] == t[i] && t3[i] == t[i], ("Bug"));
03832 tnew[i] = t[i];
03833 }
03834
03835 if (b2 - b1 > b3 - b2) {
03836 bnew = (b1 + b2 + blk_step - 1)/2;
03837 if (blk_step > 1) {
03838 bnew -= bnew%blk_step;
03839 bnew = MAX(bnew, b1+1);
03840 }
03841 vnew = Rtry(bnew, f, nloops, loops, adj_max_iters, fixed_iters,
03842 tnew, loopno, loopno_last);
03843 if (vnew < v2)
03844 return RSolve3(f, b1, bnew, b2, v1, vnew, v2, t1, tnew, t2, t,
03845 nloops, loops, adj_max_iters, fixed_iters,
03846 loopno, loopno_last);
03847 else
03848 return RSolve3(f, bnew, b2, b3, vnew, v2, v3, tnew, t2, t3, t,
03849 nloops, loops, adj_max_iters, fixed_iters,
03850 loopno, loopno_last);
03851 }
03852 else {
03853 bnew = (b2 + b3 + blk_step - 1)/2;
03854 if (blk_step > 1) {
03855 bnew -= bnew%blk_step;
03856 bnew = MAX(bnew, b2+1);
03857 }
03858 vnew = Rtry(bnew, f, nloops, loops, adj_max_iters, fixed_iters,
03859 tnew, loopno, loopno_last);
03860 if (vnew < v2)
03861 return RSolve3(f, b2, bnew, b3, v2, vnew, v3, t2, tnew, t3, t,
03862 nloops, loops, adj_max_iters, fixed_iters,
03863 loopno, loopno_last);
03864 else
03865 return RSolve3(f, b1, b2, bnew, v1, v2, vnew, t1, t2, tnew, t,
03866 nloops, loops, adj_max_iters, fixed_iters,
03867 loopno, loopno_last);
03868 }
03869 }
03870
03871
03872
03873
03874
03875
03876
03877
03878
03879
03880
03881
03882
03883
03884
03885
03886
03887
03888
03889
03890
03891
03892
03893
03894
03895
03896
03897 static double RSolve(FORMULA* f,
03898 INT nloops,
03899 const INT* loops,
03900 const mINT64* adj_max_iters,
03901 const mINT64* fixed_iters,
03902 mINT64* t,
03903 INT loopno)
03904 {
03905 if (fixed_iters[loopno] >= 1)
03906 return Rtry(fixed_iters[loopno], f, nloops, loops, adj_max_iters,
03907 fixed_iters, t, loopno, loopno);
03908
03909 INT loopno_last = MAX(nloops - Max_Different_Blocksizes, loopno);
03910
03911 INT64 mxiters = 1;
03912 INT i;
03913 for (i = loopno; i <= loopno_last; i++)
03914 mxiters = MAX(mxiters, adj_max_iters[loops[i]]);
03915
03916 if (mxiters == 1)
03917 return Rtry(1, f, nloops, loops, adj_max_iters, fixed_iters,
03918 t, loopno, loopno_last);
03919
03920 INT64 t1[SNL_MAX_LOOPS];
03921 INT64 t2[SNL_MAX_LOOPS];
03922 INT64 t3[SNL_MAX_LOOPS];
03923
03924 for (i = 0; i < nloops; i++) {
03925 t3[i] = t[i];
03926 t2[i] = t3[i];
03927 t1[i] = t2[i];
03928 }
03929
03930
03931 INT64 nbksz = Nominal_Blocksize[nloops+1];
03932 INT64 start_iters = (loopno == nloops-1 && nloops > 1) ? 2*nbksz : nbksz;
03933
03934 while (start_iters >= mxiters)
03935 start_iters /= 2;
03936 const INT lowest = 2;
03937 if (start_iters < lowest)
03938 start_iters = lowest;
03939 else
03940 start_iters -= start_iters%lowest;
03941
03942 INT64 b1 = 0;
03943 INT64 b2 = start_iters;
03944 INT64 b3 = MIN(start_iters*2, mxiters);
03945
03946 double v1 = 0.0;
03947 double v2 = Rtry(b2, f, nloops, loops, adj_max_iters, fixed_iters,
03948 t2, loopno, loopno_last);
03949 double v3 = Rtry(b3, f, nloops, loops, adj_max_iters, fixed_iters,
03950 t3, loopno, loopno_last);
03951
03952 if (v2 <= v3) {
03953
03954
03955
03956
03957
03958
03959
03960 while (b2 > lowest) {
03961 b1 = b2/2;
03962 v1 = Rtry(b1, f, nloops, loops, adj_max_iters, fixed_iters,
03963 t1, loopno, loopno_last);
03964 if (v1 > v2)
03965 break;
03966
03967
03968 b3 = b2; b2 = b1;
03969 v3 = v2; v2 = v1;
03970 for (i = loopno; i < nloops; i++) {
03971 t3[i] = t2[i];
03972 t2[i] = t1[i];
03973 }
03974 }
03975
03976 if (b2 <= lowest) {
03977 for (i = loopno; i < nloops; i++)
03978 t[i] = t2[i];
03979 return v2;
03980 }
03981 }
03982 else {
03983
03984 while (v2 > v3) {
03985
03986
03987
03988
03989
03990 if (b3 >= mxiters) {
03991 for (i = loopno; i < nloops; i++)
03992 t[i] = t3[i];
03993 return v3;
03994 }
03995
03996
03997 b1 = b2; b2 = b3;
03998 v1 = v2; v2 = v3;
03999 for (i = loopno; i < nloops; i++) {
04000 t1[i] = t2[i];
04001 t2[i] = t3[i];
04002 }
04003
04004 b3 = MIN(2*b2, mxiters);
04005 v3 = Rtry(b3, f, nloops, loops, adj_max_iters, fixed_iters,
04006 t3, loopno, loopno_last);
04007 }
04008 }
04009
04010 return RSolve3(f, b1, b2, b3, v1, v2, v3, t1, t2, t3, t, nloops, loops,
04011 adj_max_iters, fixed_iters, loopno, loopno_last);
04012 }
04013
04014 static double RSolve_Go(FORMULA* f,
04015 INT nloops,
04016 const INT* loops,
04017 const mINT64* adj_max_iters,
04018 const INT* unrolls,
04019 const INT* required_blocksize,
04020 INT* t)
04021 {
04022 mINT64 required[SNL_MAX_LOOPS];
04023 mINT64 tt[SNL_MAX_LOOPS];
04024
04025
04026 INT i;
04027 for (i = 0; i < nloops; i++) {
04028 required[i] = required_blocksize[loops[i]];
04029 if (required[i] < 0 && LNO_Blocking_Size) {
04030 required[i] = MIN(LNO_Blocking_Size/unrolls[loops[i]],
04031 adj_max_iters[loops[i]]);
04032 }
04033 else if (required[i] == 0)
04034 required[i] = adj_max_iters[loops[i]];
04035 if (required[i] >= 0)
04036 required[i] = MAX(required[i],2);
04037 }
04038
04039 double answer = RSolve(f, nloops, loops, adj_max_iters, required, tt, 0);
04040
04041 for (i = 0; i < nloops; i++) {
04042 t[i] = MIN(tt[i],INT32_MAX);
04043 }
04044
04045 return answer;
04046 }
04047
04048
04049
04050
04051
04052
04053
04054
04055
04056
04057
04058
04059
04060
04061
04062
04063
04064
04065
04066
04067
04068
04069
04070
04071
04072
04073
04074
04075
04076
04077
04078
04079
04080
04081
04082
04083
04084
04085
04086
04087
04088
04089
04090
04091
04092
04093
04094
04095
04096
04097
04098
04099
04100
04101
04102
04103
04104
04105
04106
04107
04108
04109 static double Compute_Do_Overhead(INT depth,
04110 INT stripdepth,
04111 INT nstrips,
04112 const INT* iloop,
04113 const INT* stripsz,
04114 const INT* unrolls,
04115 const INT64* est_iters,
04116 const INT* permute_order)
04117 {
04118 double ovhd = 0.0;
04119
04120 if (nstrips == 0)
04121 stripdepth = depth+1;
04122
04123
04124
04125
04126
04127 INT unrolls_so_far = 1;
04128 INT i;
04129 for (i = 0; i < stripdepth; i++) {
04130 double iters = unrolls_so_far;
04131 INT j;
04132 for (j = i; j <= depth; j++)
04133 iters *= est_iters[permute_order[j]];
04134 if (iters > LNO_Small_Trip_Count)
04135 ovhd += Loop_Overhead/iters;
04136 unrolls_so_far *= unrolls[permute_order[i]];
04137 }
04138
04139
04140
04141
04142
04143
04144
04145 INT s;
04146 for (s = 0; s < nstrips; s++) {
04147 double iters = unrolls_so_far;
04148 for (INT i = stripdepth; i <= depth; i++) {
04149 INT ss;
04150 for (ss = s-1; ss >= 0; ss--) {
04151 if (iloop[ss] == permute_order[i])
04152 break;
04153 }
04154 iters *= (ss >= 0) ? stripsz[ss] : est_iters[permute_order[i]];
04155 }
04156 if (iters > LNO_Small_Trip_Count)
04157 ovhd += Loop_Overhead/iters;
04158 }
04159
04160
04161
04162
04163
04164 for (i = stripdepth; i <= depth; i++) {
04165 double iters = unrolls_so_far;
04166 INT j;
04167 for (j = i; j <= depth; j++) {
04168 INT ss;
04169 for (ss = nstrips-1; ss >= 0; ss--) {
04170 if (iloop[ss] == permute_order[j])
04171 break;
04172 }
04173 iters *= (ss >= 0) ? stripsz[ss] : est_iters[permute_order[i]];
04174 }
04175 if (iters > LNO_Small_Trip_Count)
04176 ovhd += Loop_Overhead/iters;
04177 unrolls_so_far *= unrolls[permute_order[i]];
04178 }
04179
04180 return ovhd;
04181 }
04182
04183 BOOL Do_Copying(const ARRAY_REF* ,
04184 INT ,
04185 INT ,
04186 const INT*
04187 )
04188 {
04189 return FALSE;
04190 }
04191
04192
04193
04194
04195
04196
04197
04198
04199
04200
04201 static BOOL Has_Outer_Reuse_In_SNL(const INT* loops,
04202 INT nloops,
04203 INT depth,
04204 INT nloops_in_snl,
04205 const INT* available_order,
04206 INT available_depth,
04207 const BOOL* originally_has_reuse)
04208 {
04209 if (depth + 1 == nloops)
04210 return FALSE;
04211
04212 BOOL outer_reuse[MAX_DEPTH];
04213
04214
04215 INT i;
04216 for (i = 0; i <= depth; i++)
04217 outer_reuse[i] = originally_has_reuse[i];
04218
04219 for (i = 0; i <= available_depth - nloops_in_snl; i++)
04220 outer_reuse[available_order[i]] = 0;
04221
04222 for (i = 0; i < nloops; i++)
04223 outer_reuse[loops[i]] = 0;
04224
04225 for (i = 0; i <= depth; i++)
04226 if (outer_reuse[i])
04227 return TRUE;
04228 return FALSE;
04229 }
04230
04231
04232
04233
04234
04235
04236
04237
04238 static BOOL Fits_In_The_Cache(const ARRAY_REF* arl,
04239 const INT* loops,
04240 const INT nloops,
04241 const mINT64* est_iters,
04242 const mINT64* max_iters,
04243 const INT* arl_stripsz,
04244 const INT* unrolls,
04245 const INT depth,
04246 const INT stripdepth,
04247 const INT* permute_order,
04248 BOOL use_tlb,
04249 double* bytes)
04250 {
04251
04252 if (Debug_Cache_Model) {
04253 fprintf(TFile, "+++++ FITS IN THE ");
04254 if (use_tlb)
04255 fprintf(TFile, "TLB ");
04256 else
04257 fprintf(TFile, "CACHE ");
04258 fprintf(TFile, "{");
04259 INT i;
04260 for (i = 0; i < nloops - 1; i++)
04261 fprintf(TFile, "%d,", loops[i]);
04262 fprintf(TFile, "%d}:", loops[nloops - 1]);
04263 fprintf(TFile, "+++++\n");
04264 }
04265
04266 if (use_tlb && !Cur_Mhd->TLB_Valid()) {
04267 *bytes = 0;
04268 return TRUE;
04269 }
04270 INT i;
04271 for (i = 0; i < nloops; i++)
04272 if (max_iters[loops[i]] == UNBOUNDED_ITERS)
04273 return FALSE;
04274 int* iters = (int*) alloca(sizeof(int)*nloops);
04275 for (i = 0; i < nloops; i++) {
04276
04277
04278
04279
04280 INT ii = loops[i];
04281 iters[i] = est_iters[ii];
04282 if (arl_stripsz[ii] > 1) {
04283 iters[i] = Divceil(iters[i], arl_stripsz[ii]);
04284 iters[i] = MAX(iters[i], 2);
04285 }
04286 }
04287 COMPUTE_FOOTPRINT_RVAL F1;
04288 F1 = Compute_Footprint(arl, nloops, loops, arl_stripsz, est_iters, max_iters,
04289 unrolls, depth, stripdepth, permute_order, -1, iters[0], use_tlb, -1);
04290 double cache_bytes = F1.AllFormula() == NULL ? 0.0 :
04291 F1.AllFormula()->Eval(nloops - 1, &iters[1]);
04292 *bytes = cache_bytes;
04293
04294 if (Debug_Cache_Model) {
04295 fprintf(TFile, "+++++ END FITS IN THE ");
04296 if (use_tlb)
04297 fprintf(TFile, "TLB ");
04298 else
04299 fprintf(TFile, "CACHE ");
04300 fprintf(TFile, "{");
04301 INT i;
04302 for (i = 0; i < nloops - 1; i++)
04303 fprintf(TFile, "%d,", loops[i]);
04304 fprintf(TFile, "%d}:", loops[nloops - 1]);
04305 fprintf(TFile, "+++++\n");
04306 }
04307 #ifndef _FIX_CACHE_MODEL_
04308 return (cache_bytes <= Cur_Mhd->Effective_Size) ? TRUE : FALSE;
04309 #else
04310
04311
04312
04313
04314
04315
04316
04317
04318 double eff_size = use_tlb ?
04319 Cur_Mhd->TLB_Entries * Cur_Mhd->Page_Size :
04320 Cur_Mhd->Effective_Size;
04321 return (cache_bytes <= eff_size) ? TRUE : FALSE;
04322 #endif // _FIX_CACHE_MODEL_
04323
04324 }
04325
04326
04327
04328
04329
04330
04331
04332
04333
04334
04335
04336
04337
04338
04339
04340
04341
04342
04343
04344
04345
04346
04347
04348
04349
04350
04351
04352
04353
04354
04355
04356
04357
04358
04359
04360
04361
04362
04363
04364
04365
04366 static FORMULA* Compute_Miss_Bytes(const ARRAY_REF* arl,
04367 INT nloops,
04368 const INT* loops,
04369 const mINT64* est_iters,
04370 const mINT64* max_iters,
04371 const INT* arl_stripsz,
04372 const INT* unrolls,
04373 INT depth,
04374 INT stripdepth,
04375 const INT* permute_order,
04376 INT mhd_level,
04377 INT64 v1,
04378 BOOL using_tlb,
04379 INT read_sreg,
04380 INT write_sreg,
04381 INT first_free_sreg,
04382 double* rbytes_per_iter,
04383 double* wbytes_per_iter)
04384 {
04385 double size;
04386 double eff_size;
04387 INT line_size;
04388 double basic_middle_penalty;
04389 double additional_middle_penalty;
04390
04391 *rbytes_per_iter = 0.0;
04392 *wbytes_per_iter = 0.0;
04393
04394 if (using_tlb) {
04395 size = Cur_Mhd->TLB_Entries * Cur_Mhd->Page_Size;
04396 eff_size = Cur_Mhd->TLB_Entries * Cur_Mhd->Page_Size;
04397 line_size = Cur_Mhd->Page_Size;
04398 basic_middle_penalty = BASIC_MIDDLE_PENALTY_TLB;
04399 additional_middle_penalty = ADDITIONAL_MIDDLE_PENALTY_TLB;
04400 }
04401 else {
04402 BOOL do_copying = Do_Copying(arl, mhd_level, nloops, loops);
04403 size = Cur_Mhd->Size;
04404 eff_size = do_copying ? 0.9 * Cur_Mhd->Size : Cur_Mhd->Effective_Size;
04405 line_size = Cur_Mhd->Line_Size;
04406 if (Cur_Mhd->Type == MHD_TYPE_MEM) {
04407 basic_middle_penalty = BASIC_MIDDLE_PENALTY_MEM;
04408 additional_middle_penalty = ADDITIONAL_MIDDLE_PENALTY_MEM;
04409 }
04410 else {
04411 basic_middle_penalty = BASIC_MIDDLE_PENALTY;
04412 additional_middle_penalty = ADDITIONAL_MIDDLE_PENALTY;
04413 }
04414 }
04415
04416 if (nloops == 1) {
04417 COMPUTE_FOOTPRINT_RVAL F1;
04418
04419 F1 = Compute_Footprint(arl, nloops, loops, arl_stripsz,
04420 est_iters, max_iters,
04421 unrolls, depth, stripdepth, permute_order, -1,
04422 v1, using_tlb, nloops > 1 ? loops[0] : -1);
04423 if (F1.RFormula != NULL)
04424 *rbytes_per_iter = F1.RFormula->Eval(0, (const double*)NULL) / v1;
04425 if (F1.WFormula != NULL)
04426 *wbytes_per_iter = F1.WFormula->Eval(0, (const double*)NULL) / v1;
04427 if (Debug_Cache_Model >= 2) {
04428 fprintf(TFile, "Compute_Miss_Bytes returns one deep with r=%g,w=%g\n",
04429 *rbytes_per_iter, *wbytes_per_iter);
04430 }
04431 return NULL;
04432 }
04433
04434
04435
04436
04437
04438 COMPUTE_FOOTPRINT_RVAL* F =
04439 CXX_NEW_ARRAY(COMPUTE_FOOTPRINT_RVAL, nloops, &LNO_local_pool);
04440 INT lp;
04441 for (lp = 0; lp < nloops; lp++) {
04442 F[lp] = Compute_Footprint(arl, nloops-lp, loops+lp, arl_stripsz,
04443 est_iters, max_iters,
04444 unrolls, depth, stripdepth, permute_order, lp-1,
04445 v1, using_tlb, lp == 0 ? loops[0] : -1);
04446
04447 if (Debug_Cache_Model >= 2) {
04448 fprintf(TFile, "<tlb=%d> F[%d] = ", using_tlb, lp);
04449 F[lp].Print(TFile);
04450 }
04451
04452 if (lp == 0 && F[0].AllFormula() == NULL) {
04453 if (Debug_Cache_Model >= 2)
04454 fprintf(TFile, "Compute_Miss_Bytes F1 found no data -- return 0s\n");
04455 CXX_DELETE_ARRAY(F, &LNO_local_pool);
04456 return NULL;
04457 }
04458 }
04459
04460
04461
04462
04463 INT fr_sreg0 = first_free_sreg;
04464 INT fw_sreg0 = first_free_sreg + LNO_MAX_DO_LOOP_DEPTH;
04465 INT frw_sreg0 = first_free_sreg + 2*LNO_MAX_DO_LOOP_DEPTH;
04466 INT newdata_sreg0 = first_free_sreg + 3*LNO_MAX_DO_LOOP_DEPTH;
04467 INT requirements_sreg0 = first_free_sreg + 4*LNO_MAX_DO_LOOP_DEPTH;
04468
04469 FORMULA* initialization = NULL;
04470
04471 for (lp = 0; lp < nloops; lp++) {
04472 FORMULA* newf = FORMULA::Comma3(FORMULA::Set0(fr_sreg0+lp, F[lp].RFormula),
04473 FORMULA::Set0(fw_sreg0+lp, F[lp].WFormula),
04474 FORMULA::Set0(frw_sreg0+lp,
04475 FORMULA::Add(FORMULA::Use(fr_sreg0+lp),
04476 FORMULA::Use(fw_sreg0+lp))));
04477 if (initialization == NULL)
04478 initialization = newf;
04479 else
04480 initialization = FORMULA::Comma(initialization, newf);
04481 }
04482
04483
04484
04485 for (lp = 0; lp < nloops-1; lp++) {
04486 if (lp == 0 && v1 <= 1)
04487 continue;
04488
04489
04490
04491
04492 FORMULA* nk_m_1 = (lp == 0) ? FORMULA::Const(v1-1) :
04493 FORMULA::Sub(FORMULA::Var(lp-1),
04494 FORMULA::Const(1));
04495
04496 FORMULA* fnewdata = FORMULA::Div(FORMULA::Sub(FORMULA::Use(frw_sreg0+lp),
04497 FORMULA::Use(frw_sreg0+lp+1)),
04498 nk_m_1);
04499
04500
04501
04502
04503 FORMULA* cond = FORMULA::Lt(FORMULA::Use(frw_sreg0+lp), FORMULA::Const(1));
04504 if (lp > 0)
04505 cond = FORMULA::Or(cond,
04506 FORMULA::Le(FORMULA::Var(lp-1), FORMULA::Const(1)));
04507 fnewdata = FORMULA::Cond(cond, FORMULA::Const(0), fnewdata);
04508
04509
04510
04511 FORMULA* requirements = FORMULA::Use(frw_sreg0+lp+1);
04512 if (F[lp].D > 2)
04513 requirements = FORMULA::Add(requirements,
04514 FORMULA::Mul(FORMULA::Use(newdata_sreg0+lp),
04515 FORMULA::Const(F[lp].D-1)));
04516 else if (F[lp].D == 2)
04517 requirements = FORMULA::Add(requirements,FORMULA::Use(newdata_sreg0+lp));
04518
04519 initialization = FORMULA::Comma3(initialization,
04520 FORMULA::Set(newdata_sreg0+lp, fnewdata),
04521 FORMULA::Set(requirements_sreg0+lp,
04522 requirements));
04523 }
04524
04525
04526
04527
04528
04529
04530
04531
04532
04533
04534
04535
04536
04537
04538
04539
04540
04541 FORMULA* reuse_lost_rbytes;
04542 FORMULA* reuse_lost_wbytes;
04543 {
04544 FORMULA* frac_rnum = FORMULA::Use(fr_sreg0);
04545 FORMULA* frac_wnum = FORMULA::Use(fw_sreg0);
04546 FORMULA* frac_denom = FORMULA::Use(frw_sreg0);
04547 FORMULA* factor = FORMULA::Var(0);
04548 INT lp;
04549 for (lp = 1; lp < nloops; lp++) {
04550 frac_rnum = FORMULA::Add(frac_rnum,
04551 FORMULA::Mul(factor->Duplicate(),
04552 FORMULA::Use(fr_sreg0+lp)));
04553 frac_wnum = FORMULA::Add(frac_wnum,
04554 FORMULA::Mul(factor->Duplicate(),
04555 FORMULA::Use(fw_sreg0+lp)));
04556 frac_denom = FORMULA::Add(frac_denom,
04557 FORMULA::Mul(factor->Duplicate(),
04558 FORMULA::Use(frw_sreg0+lp)));
04559 factor = FORMULA::Mul(factor, FORMULA::Var(lp));
04560 }
04561 FORMULA* frac_reads = FORMULA::Div(frac_rnum,
04562 frac_denom->Duplicate());
04563 FORMULA* frac_writes = FORMULA::Div(frac_wnum, frac_denom);
04564 reuse_lost_rbytes = FORMULA::Mul(FORMULA::Div(FORMULA::Use(frw_sreg0),
04565 FORMULA::Const(v1)),
04566 frac_reads);
04567 reuse_lost_wbytes = FORMULA::Mul(FORMULA::Div(FORMULA::Use(frw_sreg0),
04568 FORMULA::Const(v1)),
04569 frac_writes);
04570 }
04571
04572
04573
04574
04575
04576
04577
04578
04579
04580 for (lp = 0; lp < nloops-1; lp++) {
04581 if (lp == 0 && v1 <= 1)
04582 continue;
04583
04584
04585
04586
04587
04588 FORMULA* reuse = FORMULA::Max(FORMULA::Sub(FORMULA::Use(frw_sreg0+lp+1),
04589 FORMULA::Use(newdata_sreg0+lp)),
04590 FORMULA::Const(line_size));
04591
04592
04593 INT llp;
04594 for (llp = 1; llp <= lp; llp++)
04595 reuse = FORMULA::Mul(reuse, FORMULA::Var(llp-1));
04596
04597
04598
04599 double c1 = basic_middle_penalty / size;
04600 double c2 = additional_middle_penalty / size;
04601
04602 FORMULA* frac_reuse_lost1 = NULL;
04603 FORMULA* frac_reuse_lost2 = NULL;
04604
04605 if (c1)
04606 frac_reuse_lost1 = FORMULA::Mul(FORMULA::Use(requirements_sreg0+lp),
04607 FORMULA::Const(c1));
04608 else
04609 frac_reuse_lost1 = FORMULA::Const(0);
04610
04611 if (c2)
04612 frac_reuse_lost2 =
04613 FORMULA::Max(FORMULA::Const(0),
04614 FORMULA::Mul(FORMULA::Sub(FORMULA::Use(requirements_sreg0+lp),
04615 FORMULA::Const(eff_size)),
04616 FORMULA::Const(c2)));
04617 else
04618 frac_reuse_lost2 = FORMULA::Const(0);
04619
04620 FORMULA* reuse_lost = FORMULA::Mul(FORMULA::Add(frac_reuse_lost1,
04621 frac_reuse_lost2), reuse);
04622
04623
04624
04625
04626 FORMULA* frac_reads = FORMULA::Div(FORMULA::Use(fr_sreg0+lp+1),
04627 FORMULA::Use(frw_sreg0+lp+1));
04628 FORMULA* frac_writes = FORMULA::Div(FORMULA::Use(fw_sreg0+lp+1),
04629 FORMULA::Use(frw_sreg0+lp+1));
04630
04631
04632
04633
04634 FORMULA* reuse_lost_rbytes_lp = FORMULA::Mul(reuse_lost->Duplicate(),
04635 frac_reads);
04636 FORMULA* reuse_lost_wbytes_lp = FORMULA::Mul(reuse_lost,
04637 frac_writes);
04638
04639
04640
04641
04642
04643 FORMULA* cond = FORMULA::Lt(FORMULA::Use(frw_sreg0+lp), FORMULA::Const(1));
04644 if (lp > 0)
04645 cond = FORMULA::Or(cond,
04646 FORMULA::Le(FORMULA::Var(lp-1), FORMULA::Const(1)));
04647 reuse_lost_rbytes_lp = FORMULA::Cond(cond->Duplicate(), FORMULA::Const(0),
04648 reuse_lost_rbytes_lp);
04649 reuse_lost_wbytes_lp = FORMULA::Cond(cond, FORMULA::Const(0),
04650 reuse_lost_wbytes_lp);
04651
04652 reuse_lost_rbytes = FORMULA::Add(reuse_lost_rbytes,reuse_lost_rbytes_lp);
04653
04654 reuse_lost_wbytes = FORMULA::Add(reuse_lost_wbytes,reuse_lost_wbytes_lp);
04655 }
04656
04657
04658
04659
04660 FORMULA* variters = FORMULA::Var(0);
04661 INT ii;
04662 for (ii = 1; ii < nloops - 1; ii++)
04663 variters = FORMULA::Mul(variters, FORMULA::Var(ii));
04664
04665 reuse_lost_rbytes = FORMULA::Set(read_sreg,
04666 FORMULA::Div(reuse_lost_rbytes, variters->Duplicate()));
04667 reuse_lost_wbytes = FORMULA::Set(write_sreg,
04668 FORMULA::Div(reuse_lost_wbytes, variters));
04669
04670 return FORMULA::Comma3(initialization, reuse_lost_rbytes, reuse_lost_wbytes);
04671 }
04672
04673 static INT64 Estimate_Middle_Iters(INT loop,
04674 INT nloops,
04675 const mINT64* est_iters,
04676 const mINT64* max_iters,
04677 const INT* arl_stripsz)
04678 {
04679
04680
04681
04682 INT64 v1 = est_iters[loop];
04683 if (arl_stripsz[loop] > 1) {
04684 v1 = Divceil(INT(v1), arl_stripsz[loop]);
04685 v1 = MAX(v1, 2);
04686 }
04687 if (max_iters[loop] == UNBOUNDED_ITERS) {
04688
04689
04690
04691
04692
04693
04694
04695
04696
04697
04698
04699
04700
04701
04702
04703
04704
04705
04706
04707
04708 v1 = MAX(v1, 2*Nominal_Blocksize[nloops]);
04709 v1 = MAX(v1, 50);
04710 }
04711 return v1;
04712 }
04713
04714
04715
04716
04717
04718
04719
04720
04721 static FORMULA*
04722 Compute_Actual_Miss_Bytes(const ARRAY_REF* arl,
04723 INT nloops,
04724 const INT* loops,
04725 const mINT64* est_iters,
04726 const mINT64* max_iters,
04727 const INT* unrolls,
04728 const INT* arl_stripsz,
04729 INT depth,
04730 INT ninners,
04731 INT ntiles,
04732 INT stripdepth,
04733 const INT* permute_order,
04734 const INT* available_order,
04735 INT available_depth,
04736 INT mhd_level,
04737 INT64 v1,
04738 BOOL using_tlb,
04739 const BOOL* originally_has_reuse,
04740 INT read_sreg,
04741 INT write_sreg,
04742 INT first_free_sreg,
04743 double* rbytes_per_iter_const,
04744 double* wbytes_per_iter_const)
04745 {
04746 if (Debug_Cache_Model >= 2) {
04747 fprintf(TFile, "\n+++++ COMPUTING ACTUAL MISS BYTES FOR ");
04748 if (using_tlb)
04749 fprintf(TFile, "TLB ");
04750 else
04751 fprintf(TFile, "CACHE ");
04752 fprintf(TFile, "{");
04753 INT i;
04754 for (i = 0; i < nloops; i++) {
04755 fprintf(TFile, "%d", loops[i]);
04756 if (i < nloops - 1)
04757 fprintf(TFile, ",");
04758 }
04759 fprintf(TFile, "}\n");
04760 }
04761
04762 double bytes = 0.0;
04763
04764 FmtAssert(nloops <= available_depth+1,
04765 ("Bad nloops=%d avaliable_depth=%d", nloops, available_depth));
04766
04767
04768
04769
04770
04771
04772
04773
04774
04775
04776 if (nloops == ntiles+ninners ||
04777 !Has_Outer_Reuse_In_SNL(loops, nloops, depth, ninners+ntiles,
04778 available_order, available_depth, originally_has_reuse) ||
04779 !Fits_In_The_Cache(arl, loops, nloops, est_iters,
04780 max_iters, arl_stripsz, unrolls, depth, stripdepth,
04781 permute_order, using_tlb, &bytes)
04782 #ifdef _FIX_CACHE_MODEL_
04783 || using_tlb
04784 #endif
04785 ) {
04786
04787
04788
04789
04790
04791
04792
04793
04794
04795
04796
04797
04798
04799
04800
04801
04802 if (Debug_Cache_Model)
04803 fprintf(TFile,
04804 "HAS NO OUTER REUSE OR DOES NOT FIT IN CACHE. NORMAL CASE.\n\n");
04805 if (Debug_Cache_Model)
04806 fprintf(TFile, "+++++ COMPUTING MISS BYTES. NORMAL CASE. +++++\n");
04807 FORMULA* fp = Compute_Miss_Bytes(arl, nloops, loops, est_iters, max_iters,
04808 arl_stripsz, unrolls, depth, stripdepth, permute_order,
04809 mhd_level, v1, using_tlb, read_sreg, write_sreg, first_free_sreg,
04810 rbytes_per_iter_const, wbytes_per_iter_const);
04811 if (Debug_Cache_Model)
04812 fprintf(TFile, "+++++ END COMPUTING MISS BYTES. NORMAL CASE. +++++\n\n");
04813 return fp;
04814 }
04815
04816
04817
04818
04819
04820 INT* future_order = (INT*) alloca(sizeof(INT)*(available_depth+1));
04821 INT ao = 0;
04822 INT i;
04823 for (i = 0; i < nloops; i++)
04824 future_order[available_depth+1-nloops+i] = loops[i];
04825 for (i = 0; i <= available_depth; i++) {
04826 INT j;
04827 for (j = 0; j < nloops; j++) {
04828 if (loops[j] == available_order[i])
04829 break;
04830 }
04831 if (j >= nloops)
04832 future_order[ao++] = available_order[i];
04833 }
04834 FmtAssert(ao + nloops == available_depth + 1, ("Bug"));
04835 INT lcnt;
04836 for (lcnt = nloops + 1; lcnt <= ninners+ntiles; lcnt++) {
04837 if (Debug_Cache_Model >= 2)
04838 fprintf(TFile, "ATTEMPTING TO ADD LOOP {%d} TO SNL\n",
04839 future_order[available_depth + 1 - lcnt]);
04840
04841 if (!Has_Outer_Reuse_In_SNL(&future_order[available_depth+1-lcnt], lcnt,
04842 depth, ninners+ntiles, available_order, available_depth,
04843 originally_has_reuse)) {
04844 if (Debug_Cache_Model)
04845 fprintf(TFile, "NO OUTER USE ON SNL. SEARCH COMPLETE\n\n");
04846 break;
04847 }
04848 if (!Fits_In_The_Cache(arl, &future_order[available_depth+1-lcnt], lcnt,
04849 est_iters, max_iters, arl_stripsz, unrolls, depth, stripdepth,
04850 permute_order, using_tlb, &bytes)) {
04851 if (Debug_Cache_Model)
04852 fprintf(TFile, "DOES NOT FIT IN CACHE. SEARCH COMPLETE\n\n");
04853 break;
04854 }
04855 }
04856 if (lcnt == ninners + ntiles + 1)
04857 lcnt = nloops;
04858
04859 INT64 v1new = Estimate_Middle_Iters(future_order[available_depth+1-lcnt],
04860 lcnt, est_iters, max_iters, arl_stripsz);
04861 if (Debug_Cache_Model)
04862 fprintf(TFile, "+++++ COMPUTING MISS BYTES. EXTENDED CASE. +++++\n");
04863 FORMULA* rval =
04864 Compute_Miss_Bytes(arl, lcnt, &future_order[available_depth+1-lcnt],
04865 est_iters, max_iters, arl_stripsz, unrolls,
04866 depth, stripdepth, permute_order,
04867 mhd_level, v1new, using_tlb,
04868 read_sreg, write_sreg, first_free_sreg,
04869 rbytes_per_iter_const, wbytes_per_iter_const);
04870
04871 if (Debug_Cache_Model)
04872 fprintf(TFile, "+++++ END COMPUTING MISS BYTES. EXTENDED CASE. +++++\n\n");
04873 if (rval != NULL) {
04874
04875
04876 double* iters = (double*) alloca(sizeof(double)*lcnt);
04877 INT i;
04878 for (i = 1; i < lcnt; i++) {
04879 INT loop = future_order[available_depth+1-lcnt+i];
04880 if (unrolls[loop] <= 1)
04881 iters[i-1] = est_iters[loop];
04882 else
04883 iters[i-1] = (est_iters[loop] + unrolls[loop] - 1)/unrolls[loop];
04884 }
04885 rval->Eval(lcnt-1, iters);
04886 *rbytes_per_iter_const =
04887 FORMULA::Use(read_sreg)->Eval(0, (const double *)NULL);
04888 *wbytes_per_iter_const =
04889 FORMULA::Use(write_sreg)->Eval(0, (const double *)NULL);
04890 if (Debug_Cache_Model >= 2) {
04891 fprintf(TFile, "+++++ EXTENDED ACTUAL MISS BYTES RESULTS\n");
04892 fprintf(TFile, " Read Bytes Per Iteration %g\n",
04893 *rbytes_per_iter_const);
04894 fprintf(TFile, " Write Bytes Per Iteration %g\n",
04895 *wbytes_per_iter_const);
04896 fprintf(TFile, "+++++ END EXTENDED ACTUAL MISS BYTES RESULTS\n\n");
04897 }
04898 }
04899
04900 return NULL;
04901 }
04902
04903
04904
04905
04906
04907
04908
04909
04910
04911
04912
04913
04914
04915
04916
04917
04918
04919
04920
04921
04922
04923
04924
04925
04926
04927
04928
04929
04930
04931
04932
04933
04934
04935
04936
04937
04938
04939
04940
04941
04942
04943
04944
04945
04946
04947
04948
04949
04950
04951
04952
04953
04954
04955
04956
04957
04958
04959
04960
04961
04962
04963
04964
04965
04966
04967
04968
04969
04970
04971
04972
04973
04974
04975
04976
04977
04978
04979
04980
04981
04982
04983
04984
04985
04986
04987
04988
04989
04990
04991
04992
04993
04994
04995
04996
04997
04998
04999
05000
05001
05002
05003
05004
05005
05006
05007
05008
05009
05010 static void Nest_Model(const ARRAY_REF* arl,
05011 INT nloops,
05012 const INT* loops,
05013 const mINT64* est_iters,
05014 const mINT64* max_iters,
05015 const INT* arl_stripsz,
05016 INT depth,
05017 INT ninners,
05018 INT ntiles,
05019 INT stripdepth,
05020 const INT* permute_order,
05021 const INT* available_order,
05022 const INT available_depth,
05023 const INT* unrolls,
05024 INT iters_inside,
05025 INT mhd_level,
05026 const INT* required_blocksize,
05027 BOOL cache_fits_outside,
05028 BOOL tlb_fits_outside,
05029 double machine_cycles,
05030 double* c_cpi,
05031 double* o_cpi,
05032 INT* bsz,
05033 const BOOL* originally_has_reuse,
05034 INT64 u_nest_number)
05035 {
05036
05037
05038
05039
05040
05041
05042
05043
05044 if (Debug_Cache_Model) {
05045 fprintf(TFile, "\n+++++ NEST MODEL ");
05046 fprintf(TFile, "{");
05047 INT i;
05048 for (i = 0; i < nloops - 1; i++)
05049 fprintf(TFile, "%d,", loops[i]);
05050 fprintf(TFile, "%d}: #%lld ", loops[nloops - 1], u_nest_number);
05051 fprintf(TFile, "+++++\n");
05052 }
05053
05054 Is_True(!(cache_fits_outside && tlb_fits_outside),
05055 ("Nest_Model() shouldn't have these conditions--answer suboptimal"));
05056
05057 double rcache_const = 0.0;
05058 double wcache_const = 0.0;
05059 double rtlb_const = 0.0;
05060 double wtlb_const = 0.0;
05061
05062 FORMULA* tlb_initialization = NULL;
05063 FORMULA* cache_initialization = NULL;
05064
05065 INT rcache_sreg = 1;
05066 INT wcache_sreg = 2;
05067 INT ccpi_hidable_sreg = 3;
05068 INT ccpi_nonhidable_sreg = 4;
05069 INT rtlb_sreg = 5;
05070 INT wtlb_sreg = 6;
05071 INT tcpi_sreg = 7;
05072 INT first_free_sreg = 10;
05073
05074 double cache_clean_cpb =
05075 double(Cur_Mhd->Clean_Miss_Penalty)/Cur_Mhd->Line_Size;
05076 double cache_dirty_cpb =
05077 double(Cur_Mhd->Dirty_Miss_Penalty)/Cur_Mhd->Line_Size;
05078 double cache_dirty_cpb_nonhidable =
05079 (cache_dirty_cpb-cache_clean_cpb)*
05080 Cur_Mhd->Pct_Excess_Writes_Nonhidable/100.0;
05081 double cache_dirty_cpb_hidable =
05082 cache_dirty_cpb - cache_dirty_cpb_nonhidable;
05083 double tlb_clean_cpb =
05084 double(Cur_Mhd->TLB_Clean_Miss_Penalty)/Cur_Mhd->Page_Size;
05085 double tlb_dirty_cpb =
05086 double(Cur_Mhd->TLB_Dirty_Miss_Penalty)/Cur_Mhd->Page_Size;
05087
05088 INT64 v1 = Estimate_Middle_Iters(loops[0], nloops, est_iters,
05089 max_iters, arl_stripsz);
05090
05091 if (Debug_Cache_Model)
05092 fprintf(TFile, "USING MIDDLE LOOP ESTIMATE OF %lld ITERATIONS\n", v1);
05093
05094 if (!cache_fits_outside)
05095 cache_initialization = Compute_Actual_Miss_Bytes(
05096 arl, nloops, loops, est_iters, max_iters, unrolls, arl_stripsz,
05097 depth, ninners, ntiles, stripdepth, permute_order,
05098 available_order, available_depth, mhd_level, v1,
05099 FALSE, originally_has_reuse, rcache_sreg, wcache_sreg, first_free_sreg,
05100 &rcache_const, &wcache_const);
05101
05102 if (!tlb_fits_outside)
05103 tlb_initialization = Compute_Actual_Miss_Bytes(
05104 arl, nloops, loops, est_iters, max_iters, unrolls, arl_stripsz,
05105 depth, ninners, ntiles, stripdepth, permute_order,
05106 available_order, available_depth, mhd_level, v1,
05107 TRUE, originally_has_reuse, rtlb_sreg, wtlb_sreg, first_free_sreg,
05108 &rtlb_const, &wtlb_const);
05109
05110
05111
05112
05113
05114
05115
05116
05117
05118
05119
05120
05121
05122
05123
05124
05125
05126
05127
05128
05129
05130
05131
05132
05133 double counts0 = 1.0 - Cur_Mhd->Load_Op_Overlap_1;
05134 double countsm = 1.0 - Cur_Mhd->Load_Op_Overlap_2;
05135
05136 if (nloops == 1) {
05137 FmtAssert(cache_initialization == NULL && tlb_initialization == NULL,
05138 ("Broken Compute_Actual_Miss_Penalty()"));
05139
05140 double overhead_const = double(Loop_Overhead) / (v1 * iters_inside);
05141 double cache_cpi_const_hidable = 0.0;
05142 double cache_cpi_const_nonhidable = 0.0;
05143 double tlb_cpi_const = 0.0;
05144
05145 if (rcache_const || wcache_const) {
05146 cache_cpi_const_hidable =
05147 (rcache_const * cache_clean_cpb +
05148 wcache_const * cache_dirty_cpb_hidable) /
05149 (Cur_Mhd->Typical_Outstanding * iters_inside);
05150 cache_cpi_const_nonhidable =
05151 (wcache_const * cache_dirty_cpb_nonhidable) /
05152 (Cur_Mhd->Typical_Outstanding * iters_inside);
05153 }
05154 if (rtlb_const || wtlb_const) {
05155 tlb_cpi_const = (rtlb_const * tlb_clean_cpb +
05156 wtlb_const * tlb_dirty_cpb) / iters_inside;
05157 if (Mhd.TLB_NoBlocking_Model && machine_cycles) {
05158 double cache_cycles =
05159 cache_cpi_const_hidable + cache_cpi_const_nonhidable;
05160 if (cache_cycles < machine_cycles)
05161
05162
05163 tlb_cpi_const *= cache_cycles / machine_cycles;
05164 }
05165 }
05166 if (cache_cpi_const_hidable < machine_cycles) {
05167 cache_cpi_const_hidable *= counts0 +
05168 (cache_cpi_const_hidable/machine_cycles)*(countsm - counts0)/2;
05169 }
05170 else {
05171 cache_cpi_const_hidable = (counts0 + countsm)/2 * machine_cycles +
05172 (cache_cpi_const_hidable - machine_cycles);
05173 }
05174 *c_cpi = cache_cpi_const_hidable + cache_cpi_const_nonhidable +
05175 Mhd.TLB_Trustworthiness/100.0*tlb_cpi_const;
05176 *o_cpi = overhead_const;
05177 bsz[0] = 0;
05178 }
05179 else {
05180 FORMULA* overhead = FORMULA::Const(double(Loop_Overhead) / v1);
05181 INT ii;
05182 for (ii = 0; ii < nloops - 1; ii++) {
05183 overhead = FORMULA::Div(FORMULA::Add(overhead,
05184 FORMULA::Const(Loop_Overhead)),
05185 FORMULA::Var(ii));
05186 }
05187
05188
05189
05190
05191
05192
05193
05194
05195 if (cache_initialization == NULL) {
05196 FORMULA* rset = FORMULA::Set(rcache_sreg, FORMULA::Const(rcache_const));
05197 FORMULA* wset = FORMULA::Set(wcache_sreg, FORMULA::Const(wcache_const));
05198 cache_initialization = FORMULA::Comma(rset, wset);
05199 }
05200 if (tlb_initialization == NULL) {
05201 FORMULA* rset = FORMULA::Set(rtlb_sreg, FORMULA::Const(rtlb_const));
05202 FORMULA* wset = FORMULA::Set(wtlb_sreg, FORMULA::Const(wtlb_const));
05203 tlb_initialization = FORMULA::Comma(rset, wset);
05204 }
05205
05206 FORMULA* rcache_cycles = FORMULA::Mul(FORMULA::Use(rcache_sreg),
05207 FORMULA::Const(cache_clean_cpb));
05208 FORMULA* wcache_cycles_hidable =
05209 FORMULA::Mul(FORMULA::Use(wcache_sreg),
05210 FORMULA::Const(cache_dirty_cpb_hidable));
05211 FORMULA* wcache_cycles_nonhidable =
05212 FORMULA::Mul(FORMULA::Use(wcache_sreg),
05213 FORMULA::Const(cache_dirty_cpb_nonhidable));
05214 FORMULA* cache_cpi_hidable = FORMULA::Add(rcache_cycles,
05215 wcache_cycles_hidable);
05216 FORMULA* cache_cpi_nonhidable = wcache_cycles_nonhidable;
05217
05218 FORMULA* tlb_cpi =
05219 FORMULA::Add(FORMULA::Mul(FORMULA::Use(rtlb_sreg),
05220 FORMULA::Const(Mhd.TLB_Trustworthiness/100.0*tlb_clean_cpb)),
05221 FORMULA::Mul(FORMULA::Use(wtlb_sreg),
05222 FORMULA::Const(Mhd.TLB_Trustworthiness/100.0*tlb_dirty_cpb)));
05223
05224 if (Cur_Mhd->Typical_Outstanding > 1.0) {
05225 cache_cpi_hidable = FORMULA::Div(cache_cpi_hidable,
05226 FORMULA::Const(Cur_Mhd->Typical_Outstanding));
05227 cache_cpi_nonhidable = FORMULA::Div(cache_cpi_nonhidable,
05228 FORMULA::Const(Cur_Mhd->Typical_Outstanding));
05229 }
05230
05231 FORMULA* c_and_t_set = FORMULA::Comma5(
05232 cache_initialization,
05233 tlb_initialization,
05234 FORMULA::Set(ccpi_hidable_sreg, cache_cpi_hidable),
05235 FORMULA::Set(ccpi_nonhidable_sreg, cache_cpi_nonhidable),
05236 FORMULA::Set(tcpi_sreg, tlb_cpi));
05237 FORMULA* cpi_compute;
05238
05239 double mci = machine_cycles * iters_inside;
05240
05241 if (Cur_Mhd->Load_Op_Overlap_1 < 0.00001) {
05242
05243 cpi_compute = FORMULA::Add(FORMULA::Add(FORMULA::Use(ccpi_hidable_sreg),
05244 FORMULA::Use(tcpi_sreg)),
05245 overhead);
05246 }
05247 else {
05248
05249
05250 FORMULA* ctest = FORMULA::Lt(FORMULA::Use(ccpi_hidable_sreg),
05251 FORMULA::Const(mci));
05252
05253
05254
05255 FORMULA* lhs_x = FORMULA::Div(FORMULA::Use(ccpi_hidable_sreg),
05256 FORMULA::Const(mci));
05257 FORMULA* lhs_prop = FORMULA::Add(FORMULA::Const(counts0),
05258 FORMULA::Mul(lhs_x, FORMULA::Const((countsm - counts0)/2)));
05259 FORMULA* lhs = FORMULA::Mul(lhs_prop,
05260 FORMULA::Use(ccpi_hidable_sreg));
05261 lhs = FORMULA::Add(lhs, FORMULA::Use(tcpi_sreg));
05262
05263
05264
05265 FORMULA* rhs = FORMULA::Mul(FORMULA::Const((counts0 + countsm)/2),
05266 FORMULA::Const(mci));
05267 FORMULA* rhs2 = FORMULA::Sub(FORMULA::Use(ccpi_hidable_sreg),
05268 FORMULA::Const(mci));
05269
05270 rhs = FORMULA::Add(rhs, rhs2);
05271 rhs = FORMULA::Add(rhs, FORMULA::Use(tcpi_sreg));
05272
05273 cpi_compute = FORMULA::Add(FORMULA::Cond(ctest, lhs, rhs), overhead);
05274 }
05275
05276 cpi_compute = FORMULA::Add(cpi_compute,
05277 FORMULA::Use(ccpi_nonhidable_sreg));
05278
05279 FORMULA* total_cpi = FORMULA::Comma(c_and_t_set, cpi_compute);
05280
05281 if (Debug_Cache_Model >= 3) {
05282 fprintf(TFile, "+++++ USING TOTAL CPI FORMULA: \n");
05283 total_cpi->Print(TFile);
05284 fprintf(TFile, "\n+++++ END TOTAL CPI FORMULA\n\n");
05285 }
05286
05287 mINT64* adj_max_iters = (mINT64*) alloca(sizeof(mINT64)*(depth+1));
05288 INT i;
05289 for (i = 0; i <= depth; i++) {
05290
05291 adj_max_iters[i] = MIN(max_iters[i], MAX_BLOCKSIZE/unrolls[i]);
05292 if (arl_stripsz[i] > 1)
05293 adj_max_iters[i] = adj_max_iters[i] / arl_stripsz[i];
05294 }
05295
05296 if (Debug_Cache_Model)
05297 fprintf(TFile, "+++++ TRYING BLOCKSIZES \n");
05298
05299 RSolve_Go(total_cpi, nloops-1, loops+1, adj_max_iters,
05300 unrolls, required_blocksize, bsz+1);
05301
05302 if (Debug_Cache_Model) {
05303 fprintf(TFile, " SELECTED BLOCKSIZES:");
05304 fprintf(TFile, "(");
05305 for (i = 1; i < nloops; i++) {
05306 fprintf(TFile, "%d", bsz[i]);
05307 if (i < nloops - 1)
05308 fprintf(TFile, ",");
05309 }
05310 fprintf(TFile, ")\n");
05311 }
05312 if (Debug_Cache_Model)
05313 fprintf(TFile, "+++++ END TRYING BLOCKSIZES\n\n");
05314
05315 if (Debug_Cache_Model >= 2) {
05316 fprintf(TFile, "+++++ OVERHEAD CPI FORMULA: \n");
05317 overhead->Print(TFile);
05318 fprintf(TFile, "\n+++++ END OVERHEAD CPI FORMULA\n\n");
05319 }
05320
05321 *o_cpi = overhead->Eval(nloops-1, bsz+1) / iters_inside;
05322 *c_cpi = total_cpi->Eval(nloops-1, bsz+1) / iters_inside - *o_cpi;
05323
05324
05325
05326
05327
05328
05329
05330
05331
05332 bsz[0] = -1;
05333 INT stripped_count = 0;
05334 for (i = 1; i < nloops; i++) {
05335 INT ii = loops[i];
05336 if (bsz[i] >= adj_max_iters[ii])
05337 bsz[i] = 0;
05338 else {
05339 INT ssz = arl_stripsz[ii] > 1 ? arl_stripsz[ii] : 1;
05340 ssz *= unrolls[ii];
05341 FmtAssert(MAX_LCM % ssz == 0, ("Impossible block size"));
05342 INT new_lcm = MAX_LCM / ssz;
05343 INT new_bsz;
05344 for (new_bsz = bsz[i]; new_lcm % new_bsz; new_bsz--)
05345 continue;
05346 if (new_bsz > 1) {
05347 bsz[i] = new_bsz*ssz;
05348 stripped_count++;
05349 } else {
05350 bsz[i] = 0;
05351 }
05352 }
05353 }
05354 if (stripped_count == 0)
05355 bsz[0] = 0;
05356 }
05357
05358 if (Debug_Cache_Model) {
05359 fprintf(TFile, "++++ RESULTS FOR NEST MODEL #%lld\n", u_nest_number);
05360 fprintf(TFile, " Cache %.4g cpi\n", *c_cpi);
05361 fprintf(TFile, " Overhead %.4g cpi\n", *o_cpi);
05362 fprintf(TFile, " Total %.4g cpi\n", *c_cpi + *o_cpi);
05363 fprintf(TFile, " Iterations Inside %d\n", iters_inside);
05364 if (nloops > 1) {
05365 fprintf(TFile, " Blocksizes = ");
05366 fprintf(TFile, " (");
05367 INT i;
05368 for (i = 1; i < nloops; i++) {
05369 fprintf(TFile, "%d", bsz[i]);
05370 if (i < nloops - 1)
05371 fprintf(TFile, ",");
05372 }
05373 fprintf(TFile, ")\n");
05374 }
05375 fprintf(TFile, "++++ END RESULTS FOR NEST MODEL\n\n");
05376 }
05377
05378 if (Debug_Cache_Model) {
05379 fprintf(TFile, "+++++ END NEST MODEL ");
05380 fprintf(TFile, "{");
05381 INT i;
05382 for (i = 0; i < nloops - 1; i++)
05383 fprintf(TFile, "%d,", loops[i]);
05384 fprintf(TFile, "%d}: ", loops[nloops - 1]);
05385 fprintf(TFile, "#%lld +++++\n\n", u_nest_number);
05386 }
05387 }
05388
05389 typedef HASH_TABLE<const ARRAY_REF_NODE*,INT> CONST_TAB;
05390
05391
05392
05393
05394
05395
05396
05397
05398
05399
05400
05401
05402
05403
05404
05405
05406
05407
05408
05409
05410
05411
05412
05413
05414
05415
05416 static void Has_Reuse(const ARRAY_REF* arl,
05417 const INT* outertiles,
05418 BOOL* has_reuse,
05419 INT first,
05420 INT depth)
05421 {
05422 BOOL* used = (BOOL*) alloca(sizeof(BOOL)*(depth+1));
05423 INT i;
05424 for (i = first; i <= depth; i++) {
05425 has_reuse[i] = FALSE;
05426 used[i] = FALSE;
05427
05428 }
05429
05430 MEM_POOL_Push(&LNO_local_pool);
05431
05432 CONST_TAB* const_tab =
05433 CXX_NEW(CONST_TAB(103, &LNO_local_pool), &LNO_local_pool);
05434
05435
05436
05437
05438
05439
05440
05441 INT ix;
05442 for (ix = 0; ix < arl->Elements(); ix++) {
05443 ARRAY_REF_CONST_ITER iter(arl->Array_Ref_List(ix));
05444 for (const ARRAY_REF_NODE* n = iter.First();
05445 !iter.Is_Empty(); n = iter.Next()) {
05446 ACCESS_ARRAY* aa = n->Array;
05447 WN* wn = n->Wn;
05448
05449 if (aa->Too_Messy)
05450 continue;
05451
05452
05453
05454 BOOL constant = TRUE;
05455 INT j;
05456 for (j = aa->Num_Vec()-1; j >= 0; j--) {
05457 ACCESS_VECTOR* av = aa->Dim(j);
05458 INT ncl = av->Non_Const_Loops();
05459 for (INT ii = first; ii <= depth; ii++) {
05460 if (av->Loop_Coeff(ii) || ii < ncl) {
05461 used[ii] = TRUE;
05462 constant = FALSE;
05463 }
05464 }
05465 }
05466 if (constant) {
05467 const_tab->Enter(n,1);
05468 continue;
05469 }
05470
05471
05472
05473 BOOL normal = wn && aa->Num_Vec() == WN_num_dim(wn);
05474 INT i;
05475 for (i = first; i <= depth; i++) {
05476 if (has_reuse[i] || outertiles[i])
05477 continue;
05478
05479 INT j = aa->Num_Vec()-2;
05480
05481
05482
05483
05484
05485 if (normal) {
05486 if (LNO_Cache_Model_Edge_Effects)
05487 j--;
05488 else {
05489 WN* wn1 = WN_array_dim(wn, WN_num_dim(wn)-1);
05490 INT linesz = Cur_Mhd->TLB_Valid() ? Cur_Mhd->Page_Size :
05491 Cur_Mhd->Line_Size;
05492 if (WN_operator(wn1) == OPR_INTCONST &&
05493 n->Element_Size() * WN_const_val(wn1) < linesz)
05494 j--;
05495 }
05496 }
05497 INT jj;
05498 for (jj = j ; jj >= 0; jj--) {
05499 if (aa->Dim(jj)->Loop_Coeff(i) ||
05500 i < aa->Dim(jj)->Non_Const_Loops()) {
05501 if (used[i] == FALSE) {
05502 DevWarn("Has_Locality has bug (self-corrected)");
05503 used[i] = TRUE;
05504 }
05505 break;
05506 }
05507 }
05508 if (jj < 0) {
05509
05510
05511 BOOL reuse = TRUE;
05512 if (j >= 0) {
05513 INT jj;
05514 for (jj = aa->Num_Vec()-1; reuse && jj > j; jj--) {
05515 INT coeff = aa->Dim(jj)->Loop_Coeff(i);
05516 if (coeff > Happy_Coefficient)
05517 reuse = FALSE;
05518 else if (normal) {
05519 WN* wn1 = WN_array_dim(wn, WN_num_dim(wn)-1);
05520 if (WN_operator(wn1) == OPR_INTCONST &&
05521 coeff >= WN_const_val(wn1))
05522 reuse = FALSE;
05523 }
05524 }
05525 }
05526 has_reuse[i] = reuse;
05527 }
05528 }
05529 }
05530 }
05531
05532
05533
05534
05535
05536
05537 for (i = first; i <= depth; i++) {
05538 if (used[i] == FALSE && !outertiles[i]) {
05539 has_reuse[i] = TRUE;
05540 continue;
05541 }
05542 if (has_reuse[i] || outertiles[i])
05543 continue;
05544
05545
05546
05547 INT ix;
05548 for (ix = 0; !has_reuse[i] && ix < arl->Elements(); ix++) {
05549 ARRAY_REF_CONST_ITER iter(arl->Array_Ref_List(ix));
05550 for (const ARRAY_REF_NODE* n = iter.First();
05551 !has_reuse[i] && !iter.Is_Empty(); n = iter.Next()) {
05552 ACCESS_ARRAY* aa = n->Array;
05553 if (aa->Too_Messy || const_tab->Find(n) || aa->Num_Vec() == 1)
05554 continue;
05555
05556 BOOL seen = FALSE;
05557 ARRAY_REF_CONST_ITER iter2(arl->Array_Ref_List(ix));
05558 for (const ARRAY_REF_NODE* n2 = iter2.First();
05559 !iter2.Is_Empty(); n2 = iter2.Next()) {
05560 if (seen == FALSE) {
05561 if (n2 == n)
05562 seen = TRUE;
05563 continue;
05564 }
05565
05566 ACCESS_ARRAY* aa2 = n2->Array;
05567 if (aa2->Too_Messy || const_tab->Find(n2) ||
05568 aa2->Num_Vec() != aa->Num_Vec())
05569 continue;
05570 INT j;
05571 for (j = aa->Num_Vec()-2; j >= 0; j--) {
05572 INT c1 = aa->Dim(j)->Loop_Coeff(i);
05573 INT c2 = aa2->Dim(j)->Loop_Coeff(i);
05574 INT cc1 = aa->Dim(j)->Const_Offset;
05575 INT cc2 = aa2->Dim(j)->Const_Offset;
05576 if (c1 != 0 && c1 == c2 && cc1 != cc2 && (cc1 - cc2)%c1 == 0)
05577 has_reuse[i] = TRUE;
05578 }
05579 }
05580 }
05581 }
05582 }
05583
05584 if (Debug_Cache_Model >= 3) {
05585 fprintf(TFile, "Loops having reuse <before pruning>: (");
05586 INT has_reuse_total = 0;
05587 INT i;
05588 for (i = first; i <= depth; i++)
05589 if (has_reuse[i])
05590 has_reuse_total++;
05591 INT has_reuse_count = 0;
05592 for (i = first; i <= depth; i++) {
05593 if (has_reuse[i]) {
05594 fprintf(TFile, "%d", i);
05595 if (++has_reuse_count < has_reuse_total)
05596 fprintf(TFile, ",");
05597 }
05598 }
05599 fprintf(TFile, ")\n\n");
05600 }
05601
05602 CXX_DELETE(const_tab, &LNO_local_pool);
05603 MEM_POOL_Pop(&LNO_local_pool);
05604 }
05605
05606 static INT64 Exact_Iteration_Count(WN* loop)
05607 {
05608 INT64 rval = -1;
05609 DO_LOOP_INFO* dli = Get_Do_Loop_Info(loop);
05610
05611 if (dli->LB->Num_Vec() == 1 && dli->UB->Num_Vec() == 1 &&
05612 dli->Step->Is_Const() && dli->Step->Const_Offset) {
05613 INT stp = dli->Step->Const_Offset;
05614 ACCESS_VECTOR* lb = dli->LB->Dim(0);
05615 ACCESS_VECTOR* ub = dli->UB->Dim(0);
05616
05617 ACCESS_VECTOR *av = Add(lb, ub, &LNO_local_pool);
05618 if (av->Is_Const()) {
05619 rval = stp > 0 ? av->Const_Offset+1 : -av->Const_Offset-1;
05620 if (rval < 0)
05621 rval = 0;
05622 if (stp < 0)
05623 stp = -stp;
05624 rval = (rval + stp - 1)/stp;
05625 }
05626 CXX_DELETE(av, &LNO_local_pool);
05627 }
05628
05629 return rval;
05630 }
05631
05632
05633
05634
05635
05636
05637
05638
05639 #define MAX_TILE_LOOPS 4
05640
05641 static void Limit_Reused_Loops(const ARRAY_REF* arl,
05642 const INT* unrolls,
05643 BOOL* has_reuse,
05644 INT first,
05645 INT depth,
05646 const INT64* est_iters,
05647 const INT64* max_iters)
05648 {
05649
05650 INT reuse_count = 0;
05651 INT i;
05652 for (i = first; i <= depth; i++)
05653 if (has_reuse[i])
05654 reuse_count++;
05655 if (reuse_count <= MAX_TILE_LOOPS)
05656 return;
05657
05658
05659 INT reuse_indices[MAX_DEPTH];
05660 for (i = 0; i < first; i++)
05661 reuse_indices[i] = -1;
05662 for (i = first; i <= depth; i++)
05663 reuse_indices[i] = has_reuse[i] ? i : -1;
05664
05665
05666
05667 double footprint_sizes[MAX_DEPTH];
05668 for (i = 0; i < first; i++)
05669 footprint_sizes[i] = (double) 0;
05670 INT loops[1];
05671 INT arl_stripsz[MAX_DEPTH];
05672 for (i = 0; i <= depth; i++) {
05673 arl_stripsz[i] = 1;
05674 }
05675 INT permute_order[MAX_DEPTH];
05676 for (i = first; i <= depth; i++) {
05677 loops[0] = i;
05678 INT j;
05679 for (j = 0; j < first; j++)
05680 permute_order[j] = j;
05681 for (j = first; j < depth; j++)
05682 permute_order[j] = j < i ? j : j + 1;
05683 permute_order[depth] = i;
05684 COMPUTE_FOOTPRINT_RVAL Fmla = Compute_Footprint(arl, 1, loops, arl_stripsz,
05685 est_iters, max_iters, unrolls, depth, depth + 1, permute_order,
05686 -1, est_iters[i], FALSE, -1);
05687 double footprint = Fmla.AllFormula() == NULL ? 0 :
05688 Fmla.AllFormula()->Eval(0, (const double *) NULL);
05689 footprint /= est_iters[i];
05690 footprint_sizes[i] = MAX(footprint, 0.0);
05691 }
05692
05693
05694 INT j = 0;
05695 for (i = first; i <= depth; i++) {
05696 if (reuse_indices[i] >= 0) {
05697 reuse_indices[j] = reuse_indices[i];
05698 footprint_sizes[j++] = footprint_sizes[i];
05699 }
05700 }
05701 FmtAssert(j == reuse_count, ("Mistake counting reused loops."));
05702 for (i = 0; i < MAX_TILE_LOOPS; i++) {
05703 for (j = i + 1; j < reuse_count; j++) {
05704 if (footprint_sizes[j] < footprint_sizes[i]) {
05705 INT reuse_index = reuse_indices[i];
05706 reuse_indices[i] = reuse_indices[j];
05707 reuse_indices[j] = reuse_index;
05708 double footprint_size = footprint_sizes[i];
05709 footprint_sizes[i] = footprint_sizes[j];
05710 footprint_sizes[j] = footprint_size;
05711 }
05712 }
05713 }
05714 for (i = MAX_TILE_LOOPS + 1; i <= depth; i++) {
05715 reuse_indices[i] = -1;
05716 footprint_sizes[i] = 0.0;
05717 }
05718
05719
05720 for (i = first; i <= depth; i++)
05721 has_reuse[i] = FALSE;
05722 for (i = 0; i < MAX_TILE_LOOPS; i++)
05723 has_reuse[reuse_indices[i]] = TRUE;
05724 }
05725
05726
05727
05728
05729
05730
05731
05732
05733
05734
05735
05736 static INT Update_Available_Order(INT depth,
05737 const INT* permute_order,
05738 INT nstrips,
05739 INT stripdepth,
05740 const INT* iloop,
05741 INT* available_order)
05742 {
05743 Is_True(Is_Permutation_Vector(permute_order, depth+1),
05744 ("Not a permutation"));
05745
05746 if (nstrips == 0)
05747 stripdepth = depth;
05748
05749 Is_True((nstrips == 0) == (stripdepth == depth), ("Bad stripdepth"));
05750
05751
05752 INT i;
05753 for (i = 0; i < stripdepth; i++)
05754 available_order[i] = permute_order[i];
05755
05756 if (stripdepth > depth)
05757 return depth;
05758
05759
05760
05761 INT available_depth = stripdepth-1;
05762 INT s;
05763 for (s = 0; s < nstrips; s++) {
05764 if (Is_In_Array(iloop[s], available_order, available_depth+1))
05765 return available_depth;
05766 else
05767 available_order[++available_depth] = iloop[s];
05768 }
05769
05770
05771
05772 INT middle_loop = permute_order[stripdepth];
05773 if (!Is_In_Array(middle_loop, available_order, available_depth+1))
05774 available_order[++available_depth] = middle_loop;
05775
05776 #ifdef Is_True_On
05777 for (i = 0; i <= available_depth; i++)
05778 FmtAssert(available_order[i] >= 0 && available_order[i] <= depth,
05779 ("Bad available order entry"));
05780 #endif
05781 return available_depth;
05782 }
05783
05784
05785
05786
05787
05788
05789
05790
05791
05792
05793
05794
05795
05796
05797
05798
05799
05800
05801
05802
05803
05804
05805
05806
05807
05808
05809
05810
05811
05812
05813
05814
05815
05816
05817
05818
05819
05820
05821
05822
05823
05824
05825
05826
05827
05828
05829
05830
05831
05832
05833
05834
05835
05836
05837
05838
05839
05840
05841
05842
05843
05844
05845
05846
05847
05848
05849
05850
05851
05852
05853
05854
05855
05856
05857
05858
05859
05860
05861
05862
05863
05864
05865
05866
05867
05868
05869 static void One_Cache_Model(const ARRAY_REF* arl,
05870 INT depth,
05871 INT* stripdepth,
05872 INT* nstrips,
05873 INT* iloop,
05874 INT* stripsz,
05875 INT* striplevel,
05876 INT* permute_order,
05877 INT* available_order,
05878 INT* available_depth,
05879 const INT* inners,
05880 INT ninners,
05881 const INT* tiles,
05882 INT ntiles,
05883 const INT* unrolls,
05884 const INT* outertiles,
05885 const INT64* est_iters,
05886 const INT64* max_iters,
05887 BOOL blocking_disabled,
05888 const INT* required_blocksize,
05889 INT mhd_level,
05890 double machine_cycles,
05891 double old_doverhead_best,
05892 double* cycles_per_iter,
05893 double* doverhead_best)
05894 {
05895 INT i;
05896 INT s;
05897 MEM_POOL_Push(&LNO_local_pool);
05898
05899 #if Is_True_On
05900 {
05901
05902
05903
05904
05905
05906 INT counts[MAX_DEPTH+1];
05907 for (i = 0; i <= depth; i++)
05908 counts[i] = 0;
05909 for (i = 0; i < ninners; i++) {
05910 FmtAssert(inners[i] >= 0 && inners[i] <= depth, ("sanity check dies"));
05911 counts[inners[i]]++;
05912 }
05913 for (i = 0; i < ntiles; i++) {
05914 FmtAssert(tiles[i] >= 0 && tiles[i] <= depth, ("sanity check dies"));
05915 counts[tiles[i]]++;
05916 }
05917 for (i = 0; i <= depth; i++)
05918 FmtAssert(counts[i] == 0 || counts[i] == 1, ("sanity check dies"));
05919 for (i = 0; i <= *available_depth; i++)
05920 if (counts[available_order[i]])
05921 counts[available_order[i]]--;
05922 for (i = 0; i <= depth; i++)
05923 FmtAssert(counts[i] == 0, ("sanity check dies"));
05924 }
05925 #endif
05926
05927 if (Debug_Cache_Model) {
05928 if (mhd_level == 0)
05929 fprintf(stdout, "\n");
05930 fprintf(stdout, "Memory Level #%d. Required inner #%d.\n",
05931 mhd_level, available_order[*available_depth]);
05932 }
05933
05934 Rtry_Count = 0;
05935
05936 Happy_Coefficient = MAX(10, 3*LNO_Outer_Unroll);
05937 Happy_Coefficient = MAX(Happy_Coefficient, 3*LNO_Outer_Unroll_Max);
05938 Happy_Coefficient = MAX(Happy_Coefficient, 3*LNO_Outer_Unroll_Prod_Max);
05939 Happy_Coefficient = MIN(Happy_Coefficient, 30);
05940
05941 Cur_Mhd = NULL;
05942 if (mhd_level >= 0 && mhd_level < MHD_MAX_LEVELS) {
05943 Cur_Mhd = &Mhd.L[mhd_level];
05944 if (!Cur_Mhd->Valid())
05945 Cur_Mhd = NULL;
05946 }
05947
05948 if (Debug_Cache_Model) {
05949 fprintf(TFile,
05950 "\n***** L[%d] CACHE MODEL FOR REQUIRED INNER LOOP %d *****\n",
05951 mhd_level, available_order[*available_depth]);
05952 }
05953
05954
05955
05956
05957
05958
05959
05960
05961
05962 INT* arl_stripsz = (INT*) alloca(sizeof(INT)*(depth+1));
05963 for (i = 0; i <= depth; i++) {
05964
05965 arl_stripsz[i] = 0;
05966 INT ii;
05967 for (ii = 0; ii <= *available_depth; ii++) {
05968 if (available_order[ii] == i) {
05969 arl_stripsz[i] = 1;
05970
05971 for (s = 0; s < *nstrips; s++) {
05972 if (iloop[s] == i) {
05973 FmtAssert(stripsz[s] % unrolls[i] == 0, ("impossible"));
05974 FmtAssert(stripsz[s] >= unrolls[i], ("impossible"));
05975 arl_stripsz[i] = stripsz[s] / unrolls[i];
05976 break;
05977 }
05978 }
05979 break;
05980 }
05981 }
05982 }
05983
05984
05985
05986
05987
05988
05989 INT iters_inside = 1;
05990 INT d;
05991 for (d = 0; d <= depth; d++) {
05992 INT iters = arl_stripsz[d] ? arl_stripsz[d] : est_iters[d];
05993
05994 iters_inside *= iters * unrolls[d];
05995
05996 FmtAssert(iters && unrolls[d], ("iters, unrolls cannot possibly be zero"));
05997 FmtAssert(MAX_LCM % unrolls[d] == 0,
05998 ("Unrolling factor too large for cache model: %d", unrolls[d]));
05999 }
06000
06001 UINT64 u_best = 0;
06002 double dcache_best = 0.0;
06003 double dovhd_best = 0.0;
06004 INT uloops_best = 0;
06005 INT* rsz_best = (INT*) alloca(sizeof(INT)*(depth+1));
06006
06007 if (depth >= MAX_DEPTH || Cur_Mhd == NULL) {
06008
06009 if (depth >= MAX_DEPTH)
06010 DevWarn("loop depth is %d >= %d -- can't cache model", depth, MAX_DEPTH);
06011 *cycles_per_iter = Compute_Do_Overhead(depth, *stripdepth, *nstrips, iloop,
06012 stripsz, unrolls, est_iters, permute_order);
06013 if (Debug_Cache_Model >= 2)
06014 fprintf(TFile,
06015 "***** END ONE CACHE MODEL ovhd-only ovhd = %g *****\n",
06016 *cycles_per_iter);
06017 *doverhead_best = *cycles_per_iter;
06018 goto one_cache_model_return_point;
06019 }
06020
06021 {
06022
06023 MAT<FRAC>::Set_Default_Pool(&LNO_local_pool);
06024 FORMULA::Fpool = &LNO_local_pool;
06025
06026
06027
06028
06029
06030
06031 BOOL has_reuse[MAX_DEPTH];
06032 Has_Reuse(arl, outertiles, has_reuse, 0, depth);
06033 Limit_Reused_Loops(arl, unrolls, has_reuse, depth - ninners + 1, depth,
06034 est_iters, max_iters);
06035
06036 {
06037
06038 INT i;
06039 for (i = 0; i <= depth; i++) {
06040 if (has_reuse[i] == TRUE)
06041 break;
06042 }
06043 if (i > depth)
06044 blocking_disabled = TRUE;
06045 }
06046
06047
06048
06049
06050
06051
06052
06053
06054
06055
06056
06057
06058
06059
06060
06061
06062
06063 INT krequired_inner = -1;
06064
06065 for (i = 0; i < ninners; i++) {
06066 if (inners[i] == available_order[*available_depth]) {
06067 FmtAssert(krequired_inner == -1, ("Impossible"));
06068 krequired_inner = i;
06069 }
06070 }
06071 if (krequired_inner == -1) {
06072 for (i = 0; i < ntiles; i++) {
06073 if (tiles[i] == available_order[*available_depth]) {
06074 FmtAssert(krequired_inner == -1, ("Impossible"));
06075 krequired_inner = i + ninners;
06076 }
06077 }
06078 }
06079 if (krequired_inner == -1) {
06080
06081 DevWarn("Couldn't find required inner loop!");
06082 *cycles_per_iter = Compute_Do_Overhead(depth, *stripdepth, *nstrips,
06083 iloop, stripsz, unrolls, est_iters, permute_order);
06084 if (Debug_Cache_Model >= 2)
06085 fprintf(TFile,
06086 "***** END ONE CACHE MODEL (missing inner) ovhd = %g *****\n",
06087 *cycles_per_iter);
06088 *doverhead_best = *cycles_per_iter;
06089 goto one_cache_model_return_point;
06090 }
06091
06092 Is_True(Is_Permutation_Vector(permute_order,depth+1),("Bad permutation"));
06093
06094
06095
06096
06097
06098 INT kk = 0;
06099 INT snl_loops[MAX_DEPTH];
06100 for (i = *available_depth + 1 - ninners - ntiles;i<=*available_depth;i++)
06101 snl_loops[kk++] = available_order[i];
06102 INT snl_nloops = ninners + ntiles;
06103 FmtAssert(snl_nloops == kk, ("Indexing problem detected."));
06104 double cache_bytes = (double) 0.0;
06105 BOOL fits = Fits_In_The_Cache(arl, snl_loops, snl_nloops, est_iters,
06106 max_iters, arl_stripsz, unrolls, depth, *stripdepth, permute_order,
06107 FALSE, &cache_bytes);
06108 if (Debug_Cache_Model >= 3) {
06109 if (fits)
06110 fprintf(TFile, "FITS IN THE CACHE. %g BYTES\n\n", cache_bytes);
06111 else
06112 fprintf(TFile, "DOES NOT FIT IN THE CACHE. %g BYTES\n\n",
06113 cache_bytes);
06114 }
06115 double tlb_bytes = (double) 0.0;
06116 BOOL tlb_fits = Fits_In_The_Cache(arl, snl_loops, snl_nloops, est_iters,
06117 max_iters, arl_stripsz, unrolls, depth, *stripdepth, permute_order, TRUE,
06118 &tlb_bytes);
06119 if (Debug_Cache_Model >= 3) {
06120 double pages = Cur_Mhd->TLB_Valid()
06121 ? tlb_bytes/Cur_Mhd->Page_Size : tlb_bytes;
06122 if (tlb_fits)
06123 fprintf(TFile, "FITS IN THE TLB. %g PAGES\n\n", pages);
06124 else
06125 fprintf(TFile, "DOES NOT FIT IN THE TLB. %g PAGES\n\n", pages);
06126 }
06127 if (fits && tlb_fits) {
06128 *cycles_per_iter = Compute_Do_Overhead(depth, *stripdepth, *nstrips,
06129 iloop, stripsz, unrolls, est_iters,
06130 permute_order);
06131 *doverhead_best = *cycles_per_iter;
06132 goto one_cache_model_return_point;
06133 }
06134
06135
06136
06137
06138 {
06139
06140
06141
06142
06143 double cs = !fits ? Cur_Mhd->Effective_Size/160.0 :
06144 Cur_Mhd->TLB_Entries * Cur_Mhd->Page_Size;
06145 cs /= iters_inside;
06146
06147
06148
06149
06150
06151
06152
06153
06154
06155
06156
06157
06158
06159
06160 Nominal_Blocksize[0] = 1000;
06161 Nominal_Blocksize[1] = 500;
06162 INT l;
06163 for (l = 2; l <= ntiles+ninners; l++) {
06164
06165 double nblock = l == 2 ? cs : l == 3 ? sqrt(cs) : pow(cs, 1.0/(l-1));
06166 INT i;
06167 for (i = 1; i < 20; i++) {
06168 if (double(1 << i) > nblock)
06169 break;
06170 }
06171 Nominal_Blocksize[l] = MIN(1 << (i-1), (MAX_BLOCKSIZE>>1));
06172 }
06173 }
06174
06175 INT required_blocksize_mask = 0;
06176 if (!blocking_disabled) {
06177 for (i = 0; i < ninners; i++)
06178 if (required_blocksize[inners[i]] >= 0)
06179 required_blocksize_mask |= (1<<i);
06180 for (i = 0; i < ntiles; i++)
06181 if (required_blocksize[tiles[i]] >= 0)
06182 required_blocksize_mask |= (1<<(i+ninners));
06183 }
06184
06185 BOOL bestseen = FALSE;
06186 UINT64 ninners_mask = (UINT64(1) << ninners) - 1;
06187
06188
06189 UINT64 u = 0;
06190 INT uloops = 0;
06191 double dcache = 0.0;
06192 double doverhead = 0.0;
06193 INT* rsz = (INT*) alloca(sizeof(INT)*(depth+1));
06194
06195
06196 for (UINT64 k = 1; k < (UINT64(1) << (ninners+ntiles)); k++) {
06197 if (k > ninners_mask) {
06198
06199
06200
06201
06202 INT kk;
06203 for (kk = ninners+ntiles-1; kk >= ninners; kk--)
06204 if (k && (1 << kk))
06205 break;
06206
06207 k |= ((1 << kk) - 1);
06208
06209
06210
06211 if (has_reuse[tiles[kk-ninners]] == FALSE &&
06212 required_blocksize_mask != (1<<krequired_inner) &&
06213 !(required_blocksize_mask & (1<<kk)))
06214 continue;
06215 }
06216 else {
06217
06218
06219
06220
06221 INT kk;
06222 for (kk = 0; kk < ninners; kk++)
06223 if ((k&(1<<kk)) &&
06224 inners[kk] != available_order[*available_depth] &&
06225
06226 has_reuse[inners[kk]] == FALSE &&
06227
06228 required_blocksize_mask != (1<<krequired_inner) &&
06229
06230 !(required_blocksize_mask & (1<<kk)))
06231 break;
06232 if (kk < ninners)
06233 continue;
06234 }
06235
06236
06237
06238
06239
06240 if (!(k & (1<<krequired_inner)))
06241 continue;
06242
06243 if (blocking_disabled && k != (1<<krequired_inner))
06244 continue;
06245
06246 if ((required_blocksize_mask & k) != required_blocksize_mask)
06247 continue;
06248
06249
06250
06251
06252
06253
06254
06255
06256
06257 for (i = 0; i <= depth; i++) {
06258 if (i != available_order[*available_depth] &&
06259 arl_stripsz[i] == 0 && k&(1<<i))
06260 break;
06261 }
06262 if (i <= depth)
06263 continue;
06264
06265
06266
06267 u = 0;
06268 uloops = 0;
06269 INT kk;
06270 for (kk = 0; kk < ninners+ntiles; kk++) {
06271 if (k & (UINT64(1)<<kk)) {
06272 if (kk < ninners)
06273 u |= (UINT64(1)<<inners[kk]);
06274 else
06275 u |= (UINT64(1)<<tiles[kk-ninners]);
06276 uloops++;
06277 }
06278 }
06279
06280
06281
06282
06283 INT lloops[MAX_DEPTH+1];
06284 INT nnloops = 0;
06285 INT i;
06286 for (i = 0; i <= *available_depth; i++) {
06287 INT ii = available_order[i];
06288 if (u & (UINT64(1) << ii))
06289 lloops[nnloops++] = ii;
06290 }
06291 FmtAssert(nnloops == uloops, ("Bug in cache_model.cxx"));
06292
06293 Nest_Model(arl, nnloops, lloops, est_iters, max_iters,
06294 arl_stripsz, depth, ninners, ntiles, *stripdepth, permute_order,
06295 available_order, *available_depth,
06296 unrolls, iters_inside, mhd_level, required_blocksize,
06297 fits, tlb_fits, machine_cycles,&dcache,&doverhead, rsz, has_reuse, u);
06298
06299
06300 if (!bestseen || (dcache_best+dovhd_best) > (dcache+doverhead)) {
06301 if (Debug_Cache_Model)
06302 fprintf(stdout, "*");
06303 bestseen = TRUE;
06304 dcache_best = dcache;
06305 dovhd_best = doverhead;
06306 uloops_best = uloops;
06307 u_best = u;
06308 INT i;
06309 for (i = 0; i < uloops_best; i++)
06310 rsz_best[i] = rsz[i];
06311 } else {
06312 if (Debug_Cache_Model)
06313 fprintf(stdout, " ");
06314 }
06315 if (Debug_Cache_Model) {
06316 fprintf(stdout, "{");
06317 for (i = 0; i < nnloops - 1; i++)
06318 fprintf(stdout, "%d,", lloops[i]);
06319 fprintf(stdout, "%d}:", lloops[nnloops - 1]);
06320 for (i = 0; i < 2 * (ninners + ntiles - nnloops) + 1; i++)
06321 fprintf(stdout, " ");
06322 fprintf(stdout, " Cache: %8.4g LpOver: %8.4g", dcache, doverhead);
06323 INT last_block_index = 0;
06324 for (i = 1; i < nnloops; i++)
06325 if (rsz[i] != 0)
06326 last_block_index = i;
06327 if (last_block_index > 0) {
06328 fprintf(stdout, " Blk: [");
06329 for (i = 1; i < nnloops; i++) {
06330 if (rsz[i] != 0) {
06331 fprintf(stdout, "%d=%d", lloops[i], rsz[i]);
06332 if (i != last_block_index)
06333 fprintf(stdout, ",");
06334 }
06335 }
06336 fprintf(stdout, "]\n");
06337 } else {
06338 fprintf(stdout, "\n");
06339 }
06340 }
06341 }
06342
06343
06344
06345
06346
06347
06348
06349
06350 INT strips_this_level = 0;
06351 INT lp = 0;
06352 INT* blocksize = (INT*) alloca(sizeof(INT)*depth);
06353 for (i = 0; i <= *available_depth; i++) {
06354 INT ii = available_order[i];
06355 if (u_best & (UINT64(1) << ii)) {
06356 INT blksz = rsz_best[lp++];
06357 if (blksz > 1) {
06358 blocksize[ii] = blksz;
06359 if (LNO_Blocking_Size)
06360 blocksize[ii] = MIN(blocksize[ii], LNO_Blocking_Size);
06361 strips_this_level++;
06362 }
06363 else
06364 blocksize[ii] = -1;
06365 }
06366 else
06367 blocksize[ii] = 0;
06368 }
06369
06370
06371
06372
06373
06374
06375
06376
06377
06378
06379
06380
06381 Is_True(Is_Permutation_Vector(permute_order,depth+1),("not permutation"));
06382 INT oldstripdepth = *stripdepth;
06383 for (i = *stripdepth-1; i >= 0; i--) {
06384 INT ii = permute_order[i];
06385 if (blocksize[ii] != 0) {
06386 INT j;
06387 for (j = i; j < *stripdepth-1; j++)
06388 permute_order[j] = permute_order[j+1];
06389 permute_order[*stripdepth-1] = ii;
06390 (*stripdepth)--;
06391 }
06392 }
06393 if (strips_this_level == 0)
06394 *stripdepth = oldstripdepth;
06395 Is_True(Is_Permutation_Vector(permute_order,depth+1),("not permutation"));
06396
06397
06398
06399
06400
06401
06402
06403
06404
06405
06406
06407 *doverhead_best = Compute_Do_Overhead(depth, *stripdepth, *nstrips,
06408 iloop, stripsz, unrolls, est_iters,
06409 permute_order);
06410 *cycles_per_iter = dcache_best + *doverhead_best;
06411
06412
06413
06414 for (i = *available_depth; i >= 0; i--) {
06415 INT ii = available_order[i];
06416 if (blocksize[ii] > 0) {
06417 for (s = *nstrips - 1; s >= 0; s--) {
06418 iloop[s+1] = iloop[s];
06419 stripsz[s+1] = stripsz[s];
06420 striplevel[s+1] = striplevel[s];
06421 }
06422 iloop[0] = ii;
06423 stripsz[0] = blocksize[ii];
06424 striplevel[0] = mhd_level + 1;
06425 (*nstrips)++;
06426 }
06427 }
06428
06429 *available_depth = Update_Available_Order(depth, permute_order, *nstrips,
06430 *stripdepth, iloop, available_order);
06431
06432 if (u_best == 0) {
06433
06434
06435
06436
06437
06438
06439
06440
06441
06442 *cycles_per_iter = old_doverhead_best;
06443 *doverhead_best = *cycles_per_iter;
06444 }
06445
06446 }
06447 one_cache_model_return_point:
06448
06449 if (Debug_Cache_Model) {
06450 fprintf(TFile, "+++++ RESULTS FOR L[%d] CACHE MODEL\n", mhd_level);
06451 fprintf(TFile, " Required Inner Loop %d\n",
06452 available_order[*available_depth]);
06453 fprintf(TFile, " Best Nest Model was {");
06454 INT i;
06455 for (i = 0; i <= depth; i++) {
06456 fprintf(TFile, "%d", permute_order[i]);
06457 if (i < depth)
06458 fprintf(TFile, ",");
06459 }
06460 fprintf(TFile, "}: #%lld\n", u_best);
06461 fprintf(TFile, " New Available Order is {");
06462 for (i = 0; i <= *available_depth; i++) {
06463 fprintf(TFile, "%d", available_order[i]);
06464 if (i < depth)
06465 fprintf(TFile, ",");
06466 }
06467 fprintf(TFile, "}\n");
06468 fprintf(TFile, " Cache %.4g cpi\n", dcache_best);
06469 fprintf(TFile, " Overhead %.4g cpi\n", dovhd_best);
06470 fprintf(TFile, " Total %.4g cpi\n", dcache_best + dovhd_best);
06471 fprintf(TFile, " True Overhead %.4g cpi\n", *doverhead_best);
06472 fprintf(TFile, " True Total %.4g cpi\n", *cycles_per_iter);
06473 if (uloops_best > 1) {
06474 fprintf(TFile, " Blocksizes = ");
06475 fprintf(TFile, "(");
06476 INT i;
06477 for (i = 1; i < uloops_best; i++) {
06478 fprintf(TFile, "%d", rsz_best[i]);
06479 if (i < uloops_best - 1)
06480 fprintf(TFile, ",");
06481 }
06482 fprintf(TFile, ")\n");
06483 }
06484 fprintf(TFile, " Strips must be outside loop %d\n", *stripdepth);
06485 fprintf(TFile, " List of stripped loops: \n");
06486 if (striplevel > 0) {
06487 INT i;
06488 for (i = 0; i < *nstrips; i++)
06489 fprintf(TFile, " Loop = %d, Strip Size = %d Strip Level = %d\n",
06490 iloop[i], stripsz[i], striplevel[i]);
06491 } else {
06492 for (INT i = 0; i < *nstrips; i++)
06493 fprintf(TFile, " Loop = %d, Strip Size = %d\n",
06494 iloop[i], stripsz[i]);
06495 }
06496 fprintf(TFile, "+++++ END RESULTS FOR L[%d] CACHE MODEL\n", mhd_level);
06497 fprintf(TFile,
06498 "\n***** END L[%d] CACHE MODEL FOR REQUIRED INNER LOOP %d *****\n\n",
06499 mhd_level, available_order[*available_depth]);
06500 }
06501 MEM_POOL_Pop(&LNO_local_pool);
06502 }
06503
06504
06505
06506
06507
06508
06509
06510
06511
06512
06513
06514
06515
06516
06517
06518
06519 void Cache_Model(ARRAY_REF* arl,
06520 INT depth,
06521 INT required_inner,
06522 const INT* inners,
06523 INT ninners,
06524 const INT* tiles,
06525 INT ntiles,
06526 const INT* unrolls,
06527 const DOLOOP_STACK*stack,
06528 BOOL blocking_disabled,
06529 const INT* required_blocksize,
06530 double machine_cycles,
06531 INT num_refs,
06532
06533 INT* new_order,
06534 INT* nstrips,
06535 INT* stripdepth,
06536 INT* iloop,
06537 INT* stripsz,
06538 INT* striplevel,
06539 double* cycles_per_iter,
06540 double* doverhead_best)
06541 {
06542 INT i, s;
06543 INT outertiles[MAX_DEPTH];
06544
06545 FmtAssert(new_order[depth] == -1 || required_inner == new_order[depth],
06546 ("Bad new_order/required_inner input"));
06547
06548 if (LNO_Cache_Model == 1)
06549 Max_Different_Blocksizes = 1;
06550
06551 if (Get_Trace(TP_LNOPT, TT_LNO_CACHE_MODEL_DEBUG))
06552 Debug_Cache_Model = CM_DEBUGGING_LEVEL;
06553
06554 INT mhd_level = (LNO_Cache_Model == 0) ? -1 : Mhd.First();
06555 INT* available_order = (INT*) alloca(sizeof(INT)*(depth+1));
06556 INT available_depth = depth;
06557
06558 new_order[depth] = required_inner;
06559 for (i = 0; i <= depth; i++) {
06560 BOOL already_ordered = FALSE;
06561 INT first_free_slot = -1;
06562 INT j;
06563 for (j = 0; j <= depth; j++) {
06564 if (new_order[j] == i) {
06565 already_ordered = TRUE;
06566 break;
06567 }
06568 else if (first_free_slot == -1 && new_order[j] == -1)
06569 first_free_slot = j;
06570 }
06571 if (already_ordered == FALSE) {
06572 FmtAssert(first_free_slot >= 0, ("Impossible"));
06573 new_order[first_free_slot] = i;
06574 }
06575 }
06576 for (i = 0; i <= depth; i++)
06577 available_order[i] = new_order[i];
06578
06579 *nstrips = 0;
06580 *stripdepth = depth+1;
06581
06582 Loop_Overhead = Mhd.Loop_Overhead_Base +
06583 num_refs * Mhd.Loop_Overhead_Memref;
06584
06585 if (Debug_Cache_Model) {
06586 fprintf(TFile, "*** CACHE MODEL (REQUIRED INNER LOOP=%d, ", required_inner);
06587 fprintf(TFile, "LOOP OVERHEAD=%d) ***\n\n", Loop_Overhead);
06588 Mhd.Print(TFile);
06589 if (Debug_Cache_Model >= 2) {
06590 fprintf(TFile, "ARRAY REFERENCE LIST:\n");
06591 arl->Print(TFile);
06592 fprintf(TFile, "END ARRAY REFERENCE LIST\n");
06593 }
06594 }
06595
06596 mINT64* est_iters = (mINT64*) alloca(sizeof(mINT64)*(depth+1));
06597 mINT64* max_iters = (mINT64*) alloca(sizeof(mINT64)*(depth+1));
06598
06599 for (i = 0; i <= depth; i++) {
06600 WN* wn_loop = stack->Bottom_nth(i);
06601 DO_LOOP_INFO* dli = Get_Do_Loop_Info(wn_loop);
06602 outertiles[i] = dli->Is_Outer_Lego_Tile;
06603 WN* wn_outer_tile = Outer_Tile(wn_loop, Du_Mgr);
06604 if (wn_outer_tile != NULL) {
06605 INT j;
06606 for (j = 0; j < i; j++) {
06607 WN* wn_tile_loop = stack->Bottom_nth(j);
06608 if (wn_tile_loop == wn_outer_tile)
06609 break;
06610 }
06611 FmtAssert(j < i, ("Could not find tile loop"));
06612 outertiles[j] = TRUE;
06613 }
06614 }
06615
06616 for (i = 0; i <= depth; i++) {
06617 WN* loop = stack->Bottom_nth(i);
06618 DO_LOOP_INFO* dli = Get_Do_Loop_Info(loop);
06619
06620 INT64 exact_iters = Exact_Iteration_Count(loop);
06621
06622
06623 extern INT64 Get_Good_Num_Iters (DO_LOOP_INFO *dli);
06624 INT64 est = Get_Good_Num_Iters(dli);
06625 if (exact_iters != -1 && est != exact_iters) {
06626 DevWarn("Est_Num_Iterations=%lld, expected %lld",
06627 dli->Est_Num_Iterations, exact_iters);
06628 est = exact_iters;
06629 }
06630 est_iters[i] = (est + unrolls[i] - 1) / unrolls[i];
06631 Is_True(est_iters[i] >= 0, ("Bug: est_iters=%lld", est_iters[i]));
06632 if (est_iters[i] < 2)
06633 est_iters[i] = 1;
06634
06635 INT64 max1 = dli->Est_Max_Iterations_Index == -1 ? UNBOUNDED_ITERS :
06636 (dli->Est_Max_Iterations_Index + unrolls[i] - 1) / unrolls[i];
06637 INT64 max2 = dli->Num_Iterations_Symbolic ? UNBOUNDED_ITERS :
06638 exact_iters != -1 ? est_iters[i] : est_iters[i]*2;
06639 INT64 max3 = dli->Is_Inner_Tile && dli->Tile_Size > 0
06640 ? dli->Tile_Size : UNBOUNDED_ITERS;
06641 max_iters[i] = MIN(MIN(max1, max2), max3);
06642 if (max_iters[i] < 2)
06643 max_iters[i] = 1;
06644 }
06645
06646 INT total_unrolls = 1;
06647 INT* rqd_blksz = (INT*)alloca(sizeof(INT)*(depth+1));
06648 for (i = 0; i <= depth; i++) {
06649 total_unrolls *= unrolls[i];
06650 INT sz = required_blocksize[i*MHD_MAX_LEVELS + mhd_level];
06651 if (sz > 1) {
06652 INT iters_inside = unrolls[i];
06653 INT s;
06654 for (s = 0; s < *nstrips; s++) {
06655 if (iloop[s] == i) {
06656 iters_inside = stripsz[s];
06657 break;
06658 }
06659 }
06660 rqd_blksz[i] = (sz + iters_inside - 1)/iters_inside;
06661 }
06662 else
06663 rqd_blksz[i] = sz;
06664 }
06665
06666 One_Cache_Model(arl, depth, stripdepth, nstrips, iloop,
06667 stripsz, striplevel, new_order,
06668 available_order, &available_depth,
06669 inners, ninners, tiles, ntiles,
06670 unrolls, outertiles, est_iters,
06671 max_iters, blocking_disabled,
06672 rqd_blksz, mhd_level, machine_cycles,
06673 0.0, cycles_per_iter, doverhead_best);
06674
06675 if (Debug_Cache_Model) {
06676 double cache_cycles = *cycles_per_iter - *doverhead_best;
06677 double overhead_cycles = *doverhead_best;
06678 fprintf(stdout, " BEST:");
06679 INT i;
06680 for (i = 0; i < 2 * (ninners + ntiles) - 3; i++)
06681 fprintf(stdout, " ");
06682 fprintf(stdout, " Cache: %8.4g DoOver: %8.4g\n", cache_cycles,
06683 overhead_cycles);
06684 }
06685
06686 while ((mhd_level = Mhd.Next(mhd_level)) != -1) {
06687
06688
06689 double n_cpi;
06690 double n_doverhead_best;
06691 INT n_ninners;
06692 INT n_ntiles;
06693 INT n_inners[LNO_MAX_DO_LOOP_DEPTH];
06694 INT n_tiles[LNO_MAX_DO_LOOP_DEPTH];
06695
06696
06697
06698
06699
06700
06701
06702
06703
06704
06705
06706
06707 n_ntiles = 0;
06708 n_ninners = 0;
06709 INT i;
06710 for (i = available_depth; i >= 0; i--) {
06711 if (i != available_order[available_depth] &&
06712 Is_In_Array(i, tiles, ntiles) &&
06713 Is_In_Array(i, available_order, available_depth+1))
06714 n_tiles[n_ntiles++] = i;
06715 }
06716
06717
06718 if (n_ntiles == 1) {
06719 INT ii;
06720 for (ii = 0; ii < ninners; ii++) {
06721 if (inners[ii] != available_order[available_depth] &&
06722 Is_In_Array(inners[ii], available_order, available_depth+1) &&
06723 !Is_In_Array(inners[ii], iloop, *nstrips))
06724 break;
06725 }
06726 if (ii >= ninners)
06727 n_ntiles = 0;
06728 }
06729
06730
06731 for (i = available_depth; i >= 0; i--) {
06732 if (Is_In_Array(i, available_order, available_depth+1) &&
06733 (Is_In_Array(i, tiles, ntiles) || Is_In_Array(i, inners, ninners)) &&
06734 !Is_In_Array(i, n_tiles, n_ntiles))
06735 n_inners[n_ninners++] = i;
06736 }
06737
06738 for (i = 0; i <= depth; i++) {
06739 INT sz = required_blocksize[i*MHD_MAX_LEVELS + mhd_level];
06740 rqd_blksz[i] = sz > 1 ? (sz + unrolls[i] - 1)/unrolls[i] : sz;
06741 }
06742
06743 One_Cache_Model(arl, depth, stripdepth, nstrips, iloop,
06744 stripsz, striplevel, new_order, available_order, &available_depth,
06745 n_inners, n_ninners, n_tiles, n_ntiles, unrolls, outertiles,
06746 est_iters, max_iters, blocking_disabled, rqd_blksz, mhd_level,
06747 machine_cycles, *doverhead_best, &n_cpi, &n_doverhead_best);
06748
06749 if (Debug_Cache_Model) {
06750 double overhead_cycles = n_doverhead_best - *doverhead_best;
06751 double cache_cycles = n_cpi - n_doverhead_best;
06752 fprintf(stdout, " BEST:");
06753 INT i;
06754 for (i = 0; i < 2 * (ninners + ntiles) - 3; i++)
06755 fprintf(stdout, " ");
06756 fprintf(stdout, " Cache: %8.4g DoOver: %8.4g\n", cache_cycles,
06757 overhead_cycles);
06758 }
06759
06760
06761 *cycles_per_iter += n_cpi - *doverhead_best;
06762 *doverhead_best = n_doverhead_best;
06763
06764 }
06765
06766
06767
06768
06769
06770 for (s = 0; s < *nstrips; s++) {
06771 INT i;
06772 for (i = 0; i <= depth; i++)
06773 if (new_order[i] == iloop[s])
06774 break;
06775 iloop[s] = i;
06776 }
06777
06778 if (depth != required_inner && arl->Num_Bad()) {
06779 INT nbodies = 1;
06780 INT i;
06781 for (i = 0; i <= depth; i++)
06782 nbodies *= unrolls[i];
06783 double bias = BAD_REF_CYCLE_FIRST_BIAS +
06784 (arl->Num_Bad()-1.0) * BAD_REF_CYCLE_INCREMENTAL_BIAS;
06785 if (Debug_Cache_Model) {
06786 fprintf(stdout, "Adding bad reference bias: %8.4g\n", bias / nbodies);
06787 }
06788 *cycles_per_iter += bias / nbodies;
06789 }
06790 if (Debug_Cache_Model) {
06791 fprintf(TFile, "*** END CACHE MODEL (REQUIRED INNER LOOP=%d, ",
06792 required_inner);
06793 fprintf(TFile, "LOOP OVERHEAD=%d) ***\n\n", Loop_Overhead);
06794 }
06795 }