From 2f842fbf21af79846ec0bce8ade94bdd58d34398 Mon Sep 17 00:00:00 2001 From: Protonu Basu Date: Thu, 7 Jun 2018 14:53:17 -0700 Subject: [PATCH] Add support for strided tensors This commit is to start support for strided tensors. I made changes to percolate a vector in TensorInfo down to emitCudaKernel to allow codegen to cast strided tensors. This required changes to an unit test to expect the correct cast. --- .test_tc_mapper_output.txt.swp | Bin 0 -> 16384 bytes tc/core/cuda/cuda_tc_executor.cc | 5 ++++- tc/core/polyhedral/cuda/codegen.cc | 26 ++++++++++++++++++------- tc/core/polyhedral/cuda/codegen.h | 3 ++- tc/core/polyhedral/cuda/mapped_scop.cc | 7 ++++--- tc/core/polyhedral/cuda/mapped_scop.h | 4 +++- test/cuda/test_tc_mapper.cc | 4 ++-- 7 files changed, 34 insertions(+), 15 deletions(-) create mode 100644 .test_tc_mapper_output.txt.swp diff --git a/.test_tc_mapper_output.txt.swp b/.test_tc_mapper_output.txt.swp new file mode 100644 index 0000000000000000000000000000000000000000..da018f2969f26a221cefaf8aa16e28f72ea0b4a1 GIT binary patch literal 16384 zcmeHOU5p!76}~`I0<@H$s*nfLljMP!jqM%V`{Q_bL$_|4I%L<&t`m@~Wi+0-vmP>@ zYt4-JCn2Fc@gr5df#?erm4FHXr94#>Bq9nGPdp$bgj7(4P+veuAfBM`-I=l1_9mN6 zva298qkQqq{XO@)=bky|-r1&hvARIhxe12reT;cuInsFUH`fop++eKZi%@u7nX~I< z@pap9d@Vn{YcX~$X>h#Gy+HUC(eC)Xg?mSM!Fb4nP!CPLZFD-^*F`t%bi-V@8SdU_ znwv6^GO$Mm9%RKrUXz7S96v@M{J@1h)=P_}45SRC45SRC45SRC45SRC45SSFKQIt> z?qi?BK;E7Vrkeb|ZRGbq$rS|;$U8TZ|5lPeo}^zM$$v-kTuy#p7|Bl_)1Q=ql!26i zl!26il!26il!26il!26il!26ilz{_az_1v5812?&z>o9)Ui|<2hZy@F@HB7{_$crZ z;L3X$y9^Y7L%^>dWb8HIRp7_K5#YnXuODFSd7uT@fC>EjJ&gSbcn09Whk(Dlo3Wn( zUjsf5_`niS0FD4}y^FCofLDR<0AB_wzzp!W!;JkD2!Y3df4-Bk>%cnj81Rq#8G9M{ zAz%YJ;7^Abdl9gK0`TU2jC~h)9{3bMz;Cfx@m1hF@BnZfn*ZQ%QzqX8$C)(^tR@J7oY04MNTA@5Sk<+FpOVb(^@~5VY6quruGtKhE<YD1P2Pd&4FPi*GF>f>#U{NU|UAwM;lXrkG*4Ea$b-{fXD6uz`krQ#|H z-&u1!!-e6B^peL~nd=68nHyHkbGIr6vJ~5J1Fp_QI-83pup&BJD=ow4R+5`!27JkP z){T%a@V4-`Ml<6oH^{>Wmv!;DE6huQQka~|5dM=YLX?ZL>Mb841xm4yp;A91G+XDL zrf>KV)OCZQ-e7upwGCCYxf!}!i}r)-fd@d$?%OfhXoHF z=eF?qN`Dy-b?A7ZZ`9F9lFye!;DpXPulZrifZq`MU7obMqwKuvij9>^d;`W|_g0_m z%1>2S&{xRUwv^B;mxq(e2WnB1g|+FZS9VuUsxiG>bPTJ#}XJ zOr#4n1)g(TrIT}1)%10TZ}bwVnjd&(7Pl#kJ})*bD+qnKjaVkGFhXT9LknuJdWMD) z<2F?3tt&N^t}wQY*M%v^MjKAp0^Q_zP$|Q8E*mmAM^&3@zA2 z2^)`P_R0C>#p>eOGNlY3IKvHEzOYEqwMuofw*a_bhi%Kn-?PtK7EFUwIM _f{zn zg$zQnebL@gb-3hNS2XcXh>CM>-SLIjM$|(A53}7)jy2}`*D5V{Jx}V+VDrhnX#_mS z7DdR*bXJHp7upwz;JDH=o};lu&@m?C|D%X`k0TbA@qh1p{w2iv&j9~IjQA% z=ZNh)z)^q!KS5mo3h*>=0ayVZ1&#vu1Aj-1{|DeT;6>mEz!!iepavWQ-b9@Lec;=` zw}7t$j{>hF#(x(0G;kPr2{HbcfIlO)e*^d<@H!9x4)9CF_rCzH0iOUK0Uie45Bv_} zdk**p&;h0aIp$+PdZi4c45SR)n1Ke-()`NGnJ0*PwtU1A-^2owZ@^1X3a3ic4k#3% z;i`MH_jt?uSxanqY`FYjN24zx+0pc1NAcT=Y9EK#qlI_8v3j*rZ|&f{wGpqoI8iEU z`CYv3DQsyqDz$~B>iILPB>!qI-r16VDYXP#i*waFJW81^_q~L;R7P{9b^yOydfu`3 zU27o1qA_Y}bdt=1y4p}ec^fC@u~Re$-}jbV_~y4NsYU@V;yY3hok2K= z66R;*4R#2N@f}?MlJv|ea-QK&Nv9sdD`QD7Y9n7L$x-+mt#BVcd8oq`3TLIaJy;XKZ+j$B5y(}G>Pjv!Mc^Wez^m3~ZgU1v z08;Zqsar{E#9P1YWjQKY8Z$meO&Q=1G|`kz^b?a7nzDs{A{x;3wa6#cby^pWwIj6B zrzY@E&KhO{2OjYaBCTXhW7Mg|(N3R#Bqh||Afp$DU159^Q|Qt0%5fQuo#3A3*boPQ zYwDQ1CyuTMTb{XL_#LH-fNIS{@YFVfOBjW=`DAiIX|A#pj!n8=x$wk%rJ`Rvv#c-7 zpQ|mSomo1?#>Xk%ULaY3PGzXqtc>prM0lb@2-HkYvM(h0oD6rv*b1VOEja4VWp3pl zC<0K&;~+xz=L!$}d26~6=$7cp^9*fs+P1a9>->3ju$rs~8fBF@M3ps4Q3VxeU0WS4 zW8Ap4nC7X8nSOdyrN{nuvJ^s2&R|hI#`85J33v*Tt;I>UPuMcCWZXJ199u^W>**Z3 zQ3_Uyl`%3+m6YgSrH_$fs8lAUsj895!P4tu_T;|4hTJSESr4|x!d+e+w@}ohak3yF zZ+Bd5r_9QZJ?!;R{WPRP?;JezXS*(kBz#M$6E*_sWJNLhh$YPzwZg=yK}+g~rrdFx zgjra;xI7SnQy{v&ne4hza>dI`4?5g*Y{!hYE_!*}0o`5QVMX??+u6r&?>vWV51+%u zt=^Sv+m%^Q#@pN5zV&J0K}mV1rxLw!Mti~;7+WB}%b6`SJPX@pu2dCx$ literal 0 HcmV?d00001 diff --git a/tc/core/cuda/cuda_tc_executor.cc b/tc/core/cuda/cuda_tc_executor.cc index 72a1350ad..1ebb2047b 100644 --- a/tc/core/cuda/cuda_tc_executor.cc +++ b/tc/core/cuda/cuda_tc_executor.cc @@ -93,13 +93,16 @@ CudaCompilationResult CudaBackend::compileWithTcMapper( auto parameters = mappedScop->scop().getParameterValues(); auto specializedName = specializeKernelName(tcName, parameters); + auto inputsInfo = makeTensorInfoVector(inputs); + // This updates the launch bounds with the actual result from compilation // with tightening of launch_bounds. What you get is not necessarily what // you asked for, the autotuner should adapt to that. std::string source; Grid grid; Block block; - std::tie(source, grid, block) = mappedScop->codegen(specializedName); + std::tie(source, grid, block) = + mappedScop->codegen(specializedName, inputsInfo); LOG_IF(INFO, FLAGS_dump_cuda) << "generatedCuda: " << source << "\n" << "grid: " << grid << " block: " << block; diff --git a/tc/core/polyhedral/cuda/codegen.cc b/tc/core/polyhedral/cuda/codegen.cc index ee1643984..8b5677322 100644 --- a/tc/core/polyhedral/cuda/codegen.cc +++ b/tc/core/polyhedral/cuda/codegen.cc @@ -183,7 +183,8 @@ void emitTensorView( stringstream& ss, Halide::OutputImageParam p, const map& paramValues, - bool constInput = false) { + bool constInput = false, + const TensorInfo* tinfo = NULL) { WS ws; stringstream ssViewType; for (int i = 1; i < p.dimensions(); ++i) { // Skip the outermost dimension @@ -191,7 +192,14 @@ void emitTensorView( extent = Halide::Internal::substitute(paramValues, extent); CHECK(extent.defined()) << "Undefined extent on input/output tensor. Forward bounds inference should have set these\n"; - ssViewType << "[" << extent << "]"; + // TODO: Handle non-unit stride in the innermost dimension + if (tinfo && tinfo->strides.size() == p.dimensions() && + tinfo->strides[p.dimensions() - 1] == 1 && + tinfo->strides[i - 1] != (tinfo->shape[i] * tinfo->strides[i])) { + ssViewType << "[" << tinfo->strides[i - 1] << "]"; + } else { + ssViewType << "[" << extent << "]"; + } } ss << ws.tab(); ss << (constInput ? "const " : "") << p.type() << " (*" << p.name() << ")" @@ -216,9 +224,12 @@ void emitTensorViews( void emitTensorViews( stringstream& ss, const vector& params, - const map& paramValues) { - for (auto p : params) { - emitTensorView(ss, p, paramValues, true); + const map& paramValues, + const std::vector& inputsInfo = std::vector{}) { + for (size_t i = 0; i < params.size(); ++i) { + inputsInfo.size() + ? emitTensorView(ss, params[i], paramValues, true, &inputsInfo[i]) + : emitTensorView(ss, params[i], paramValues, true); } } @@ -738,7 +749,8 @@ std::unordered_set gatherReadOnlySet( string emitCudaKernel( const std::string& specializedName, - const MappedScop& mscop) { + const MappedScop& mscop, + const std::vector& inputsInfo) { // Expecting a schedule with domain root and context first child. CHECK(mscop.schedule()->elemAs()); CHECK( @@ -755,7 +767,7 @@ string emitCudaKernel( emitKernelSignature(ss, specializedName, scop); emitThreadIdInit(ss, mscop); emitTensorViews(ss, scop.halide.outputs, paramValues); - emitTensorViews(ss, scop.halide.inputs, paramValues); + emitTensorViews(ss, scop.halide.inputs, paramValues, inputsInfo); emitTmpDecl(ss, scop); emitPromotedArrayViewsHalide(ss, scop); NodeInfoMapType nodeInfoMap; diff --git a/tc/core/polyhedral/cuda/codegen.h b/tc/core/polyhedral/cuda/codegen.h index ff3631d92..bd94f1bd3 100644 --- a/tc/core/polyhedral/cuda/codegen.h +++ b/tc/core/polyhedral/cuda/codegen.h @@ -145,7 +145,8 @@ struct CodegenStatementContext : CodegenContext { std::string emitCudaKernel( const std::string& specializedName, - const MappedScop& scop); + const MappedScop& scop, + const std::vector& inputsInfo = std::vector{}); } // namespace polyhedral } // namespace tc diff --git a/tc/core/polyhedral/cuda/mapped_scop.cc b/tc/core/polyhedral/cuda/mapped_scop.cc index e0dc474ae..1efb03c0b 100644 --- a/tc/core/polyhedral/cuda/mapped_scop.cc +++ b/tc/core/polyhedral/cuda/mapped_scop.cc @@ -910,7 +910,8 @@ std::unique_ptr makeSpecializedMappedScop( // the context of the original scop as top-level // context node in schedule tree. std::tuple MappedScop::codegen( - const std::string& specializedName) const { + const std::string& specializedName, + const std::vector& inputsInfo) const { validate(schedule()); auto mappedScopForCodegen = makeSpecializedMappedScop(*this); @@ -927,8 +928,8 @@ std::tuple MappedScop::codegen( code << code::cuda::cubBlockReduce; } code << "extern \"C\" {" << std::endl - << emitCudaKernel(specializedName, *mappedScopForCodegen) << "}" - << std::endl; + << emitCudaKernel(specializedName, *mappedScopForCodegen, inputsInfo) + << "}" << std::endl; return std::make_tuple( code.str(), diff --git a/tc/core/polyhedral/cuda/mapped_scop.h b/tc/core/polyhedral/cuda/mapped_scop.h index 169b4f138..5af792df9 100644 --- a/tc/core/polyhedral/cuda/mapped_scop.h +++ b/tc/core/polyhedral/cuda/mapped_scop.h @@ -115,7 +115,9 @@ class MappedScop { // Generate CUDA code at the current state of transformation provided a // name for the generated function. std::tuple codegen( - const std::string& specializedName) const; + const std::string& specializedName, + const std::vector& inputsInfo = + std::vector{}) const; // Accessors.. // Const accessor to schedule of underlying Scop. diff --git a/test/cuda/test_tc_mapper.cc b/test/cuda/test_tc_mapper.cc index e89756aea..3aedafa29 100644 --- a/test/cuda/test_tc_mapper.cc +++ b/test/cuda/test_tc_mapper.cc @@ -326,8 +326,8 @@ def tensoraddstrided(float(N, M) I0_view, float(N, M) I1_view) -> (O) { auto res = Check(TC, name, options, inputs, checkFun); // This test should be modified when strided tensors are handled std::string expected = - "const float32 (*I0_view)[64] = " - "reinterpret_cast(pI0_view)"; + "const float32 (*I0_view)[128] = " + "reinterpret_cast(pI0_view)"; ASSERT_NE(std::string::npos, res.second.find(expected)) << "In resulting code:\n" << res.second << "\nfound unexpected: " << expected;