From ff1ed36fced4ba9c648d6d6dfebd484eed09e79d Mon Sep 17 00:00:00 2001
From: Protonu Basu
Date: Thu, 7 Jun 2018 14:53:17 -0700
Subject: [PATCH] Add support for strided tensors

This commit starts support for strided tensors. I made changes to
percolate a vector of TensorInfo down to emitCudaKernel, allowing
codegen to emit the correct casts for strided input tensors. This
required updating a unit test to expect the correct cast.
---
 tc/core/cuda/cuda_tc_executor.cc       |   5 +-
 tc/core/polyhedral/cuda/codegen.cc     |  26 ++-
 tc/core/polyhedral/cuda/codegen.h      |   3 +-
 tc/core/polyhedral/cuda/mapped_scop.cc |   7 +-
 tc/core/polyhedral/cuda/mapped_scop.h  |   4 +-
 test/cuda/test_tc_mapper.cc            |   4 +-
 test_tc_mapper_output.txt              | 226 +++++++++++++++++++++++++
 7 files changed, 260 insertions(+), 15 deletions(-)
 create mode 100644 test_tc_mapper_output.txt
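Note (illustration, not part of the patch): the updated unit test below
expects the cast for a strided input to use the parent tensor's row
pitch rather than the view's logical extent. A minimal standalone C++
sketch of that addressing rule, with illustrative names only:

  #include <cassert>
  #include <vector>

  int main() {
    // A 64x64 view into a row-major 64x128 parent tensor:
    // shape = {64, 64}, strides = {128, 1}.
    std::vector<float> parent(64 * 128, 0.0f);
    float* pI0_view = parent.data();

    // Contiguous cast (old behavior): float (*)[64] would place rows 64
    // floats apart and address the wrong parent elements.
    // Strided cast (this patch): float (*)[128] places row n at
    // pI0_view + n * strides[0].
    float(*I0_view)[128] = reinterpret_cast<float(*)[128]>(pI0_view);

    I0_view[2][3] = 1.0f;
    assert(&I0_view[2][3] == &parent[2 * 128 + 3]);
    return 0;
  }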
diff --git a/tc/core/cuda/cuda_tc_executor.cc b/tc/core/cuda/cuda_tc_executor.cc
index 72a1350ad..1ebb2047b 100644
--- a/tc/core/cuda/cuda_tc_executor.cc
+++ b/tc/core/cuda/cuda_tc_executor.cc
@@ -93,13 +93,16 @@ CudaCompilationResult CudaBackend::compileWithTcMapper(
   auto parameters = mappedScop->scop().getParameterValues();
   auto specializedName = specializeKernelName(tcName, parameters);
 
+  auto inputsInfo = makeTensorInfoVector(inputs);
+
   // This updates the launch bounds with the actual result from compilation
   // with tightening of launch_bounds. What you get is not necessarily what
   // you asked for, the autotuner should adapt to that.
   std::string source;
   Grid grid;
   Block block;
-  std::tie(source, grid, block) = mappedScop->codegen(specializedName);
+  std::tie(source, grid, block) =
+      mappedScop->codegen(specializedName, inputsInfo);
 
   LOG_IF(INFO, FLAGS_dump_cuda) << "generatedCuda: " << source << "\n"
                                 << "grid: " << grid << " block: " << block;
diff --git a/tc/core/polyhedral/cuda/codegen.cc b/tc/core/polyhedral/cuda/codegen.cc
index ee1643984..8b5677322 100644
--- a/tc/core/polyhedral/cuda/codegen.cc
+++ b/tc/core/polyhedral/cuda/codegen.cc
@@ -183,7 +183,8 @@ void emitTensorView(
     stringstream& ss,
     Halide::OutputImageParam p,
     const map<string, Halide::Expr>& paramValues,
-    bool constInput = false) {
+    bool constInput = false,
+    const TensorInfo* tinfo = NULL) {
   WS ws;
   stringstream ssViewType;
   for (int i = 1; i < p.dimensions(); ++i) { // Skip the outermost dimension
@@ -191,7 +192,14 @@ void emitTensorView(
     extent = Halide::Internal::substitute(paramValues, extent);
     CHECK(extent.defined())
         << "Undefined extent on input/output tensor. Forward bounds inference should have set these\n";
-    ssViewType << "[" << extent << "]";
+    // TODO: Handle non-unit stride in the innermost dimension
+    if (tinfo && tinfo->strides.size() == p.dimensions() &&
+        tinfo->strides[p.dimensions() - 1] == 1 &&
+        tinfo->strides[i - 1] != (tinfo->shape[i] * tinfo->strides[i])) {
+      ssViewType << "[" << tinfo->strides[i - 1] << "]";
+    } else {
+      ssViewType << "[" << extent << "]";
+    }
   }
   ss << ws.tab();
   ss << (constInput ? "const " : "") << p.type() << " (*" << p.name() << ")"
@@ -216,9 +224,12 @@ void emitTensorViews(
 void emitTensorViews(
     stringstream& ss,
     const vector<Halide::ImageParam>& params,
-    const map<string, Halide::Expr>& paramValues) {
-  for (auto p : params) {
-    emitTensorView(ss, p, paramValues, true);
+    const map<string, Halide::Expr>& paramValues,
+    const std::vector<TensorInfo>& inputsInfo = std::vector<TensorInfo>{}) {
+  for (size_t i = 0; i < params.size(); ++i) {
+    inputsInfo.size()
+        ? emitTensorView(ss, params[i], paramValues, true, &inputsInfo[i])
+        : emitTensorView(ss, params[i], paramValues, true);
   }
 }
 
@@ -738,7 +749,8 @@ std::unordered_set<isl::id, isl::IslIdIslHash> gatherReadOnlySet(
 
 string emitCudaKernel(
     const std::string& specializedName,
-    const MappedScop& mscop) {
+    const MappedScop& mscop,
+    const std::vector<TensorInfo>& inputsInfo) {
   // Expecting a schedule with domain root and context first child.
  CHECK(mscop.schedule()->elemAs<detail::ScheduleTreeElemDomain>());
   CHECK(
@@ -755,7 +767,7 @@ string emitCudaKernel(
   emitKernelSignature(ss, specializedName, scop);
   emitThreadIdInit(ss, mscop);
   emitTensorViews(ss, scop.halide.outputs, paramValues);
-  emitTensorViews(ss, scop.halide.inputs, paramValues);
+  emitTensorViews(ss, scop.halide.inputs, paramValues, inputsInfo);
   emitTmpDecl(ss, scop);
   emitPromotedArrayViewsHalide(ss, scop);
   NodeInfoMapType nodeInfoMap;
diff --git a/tc/core/polyhedral/cuda/codegen.h b/tc/core/polyhedral/cuda/codegen.h
index ff3631d92..bd94f1bd3 100644
--- a/tc/core/polyhedral/cuda/codegen.h
+++ b/tc/core/polyhedral/cuda/codegen.h
@@ -145,7 +145,8 @@ struct CodegenStatementContext : CodegenContext {
 
 std::string emitCudaKernel(
     const std::string& specializedName,
-    const MappedScop& scop);
+    const MappedScop& scop,
+    const std::vector<TensorInfo>& inputsInfo = std::vector<TensorInfo>{});
 
 } // namespace polyhedral
 } // namespace tc
diff --git a/tc/core/polyhedral/cuda/mapped_scop.cc b/tc/core/polyhedral/cuda/mapped_scop.cc
index e0dc474ae..1efb03c0b 100644
--- a/tc/core/polyhedral/cuda/mapped_scop.cc
+++ b/tc/core/polyhedral/cuda/mapped_scop.cc
@@ -910,7 +910,8 @@ std::unique_ptr<MappedScop> makeSpecializedMappedScop(
 // the context of the original scop as top-level
 // context node in schedule tree.
 std::tuple<std::string, tc::Grid, tc::Block> MappedScop::codegen(
-    const std::string& specializedName) const {
+    const std::string& specializedName,
+    const std::vector<TensorInfo>& inputsInfo) const {
   validate(schedule());
 
   auto mappedScopForCodegen = makeSpecializedMappedScop(*this);
@@ -927,8 +928,8 @@ std::tuple<std::string, tc::Grid, tc::Block> MappedScop::codegen(
     code << code::cuda::cubBlockReduce;
   }
   code << "extern \"C\" {" << std::endl
-       << emitCudaKernel(specializedName, *mappedScopForCodegen) << "}"
-       << std::endl;
+       << emitCudaKernel(specializedName, *mappedScopForCodegen, inputsInfo)
+       << "}" << std::endl;
 
   return std::make_tuple(
       code.str(),
diff --git a/tc/core/polyhedral/cuda/mapped_scop.h b/tc/core/polyhedral/cuda/mapped_scop.h
index 169b4f138..5af792df9 100644
--- a/tc/core/polyhedral/cuda/mapped_scop.h
+++ b/tc/core/polyhedral/cuda/mapped_scop.h
@@ -115,7 +115,9 @@ class MappedScop {
   // Generate CUDA code at the current state of transformation provided a
   // name for the generated function.
   std::tuple<std::string, tc::Grid, tc::Block> codegen(
-      const std::string& specializedName) const;
+      const std::string& specializedName,
+      const std::vector<TensorInfo>& inputsInfo =
+          std::vector<TensorInfo>{}) const;
 
   // Accessors..
   // Const accessor to schedule of underlying Scop.
diff --git a/test/cuda/test_tc_mapper.cc b/test/cuda/test_tc_mapper.cc
index e89756aea..3aedafa29 100644
--- a/test/cuda/test_tc_mapper.cc
+++ b/test/cuda/test_tc_mapper.cc
@@ -326,8 +326,8 @@ def tensoraddstrided(float(N, M) I0_view, float(N, M) I1_view) -> (O) {
   auto res = Check(TC, name, options, inputs, checkFun);
   // This test should be modified when strided tensors are handled
   std::string expected =
-      "const float32 (*I0_view)[64] = "
-      "reinterpret_cast<const float32 (*)[64]>(pI0_view)";
+      "const float32 (*I0_view)[128] = "
+      "reinterpret_cast<const float32 (*)[128]>(pI0_view)";
   ASSERT_NE(std::string::npos, res.second.find(expected))
       << "In resulting code:\n"
       << res.second << "\nfound unexpected: " << expected;
diff --git a/test_tc_mapper_output.txt b/test_tc_mapper_output.txt
new file mode 100644
index 000000000..24a7ee836
--- /dev/null
+++ b/test_tc_mapper_output.txt
@@ -0,0 +1,226 @@
+Note: Google Test filter = *Strided*
+[==========] Running 1 test from 1 test case.
+[----------] Global test environment set-up.
+[----------] 1 test from TcCudaMapperTest +[ RUN ] TcCudaMapperTest.TensorAddStrided +WARNING: +Reduction without initialization. If O is not pre-initialized before calling the TC function, consider using the !-suffixed reduction operator +=! instead of +=: + +def tensoraddstrided(float(N, M) I0_view, float(N, M) I1_view) -> (O) { + O(n, m) += I0_view(n, m) + I1_view(n, m) + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~... <--- HERE +} + +WARNING: +Reduction without initialization. If O is not pre-initialized before calling the TC function, consider using the !-suffixed reduction operator +=! instead of +=: + +def tensoraddstrided(float(N, M) I0_view, float(N, M) I1_view) -> (O) { + O(n, m) += I0_view(n, m) + I1_view(n, m) + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~... <--- HERE +} + +I0607 13:02:54.070823 21973 cuda_tc_executor.cc:82] tc::CudaMappingOptions::makeNaiveMappingOptions() + .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident) + .outerScheduleAllowSkewing(false) + .outerSchedulePositiveOrthant(true) + .intraTileScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident) + .intraTileScheduleAllowSkewing(false) + .intraTileSchedulePositiveOrthant(true) + .fixParametersBeforeScheduling(false) + .tile(32, 32, 32) + .unroll(1) + .tileImperfectlyNested(false) + .matchLibraryCalls(false) + .mapToThreads(32, 8) + .mapToBlocks(256, 256) + .useSharedMemory(false) + .usePrivateMemory(false) + .unrollCopyShared(false) + .useReadOnlyCache(false); +I0607 13:02:54.072165 21973 cuda_tc_executor.cc:83] original schedule: +domain( + [M, N] -> { S_0[O_s1_n, O_s1_m] : 0 <= O_s1_n < N and 0 <= O_s1_m < M }) + band(n(1) permutable(0) coincident(0) unroll(0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n)] } + ----------------------------------------------------------------------- + band(n(1) permutable(0) coincident(0) unroll(0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m)] } + ----------------------------------------------------------------------- +I0607 13:02:54.075304 21973 scop.cc:400] After scheduling: +domain( + [M, N] -> { S_0[O_s1_n, O_s1_m] : 0 <= O_s1_n < N and 0 <= O_s1_m < M }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n)] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m)] } + ----------------------------------------------------------------------- +I0607 13:02:54.075870 21973 scop.cc:454] After tiling outer: +domain( + [M, N] -> { S_0[O_s1_n, O_s1_m] : 0 <= O_s1_n < N and 0 <= O_s1_m < M }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(floor((O_s1_n)/32))] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(floor((O_s1_m)/32))] } + ----------------------------------------------------------------------- + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n - 32*floor((O_s1_n)/32))] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m - 
32*floor((O_s1_m)/32))] } + ----------------------------------------------------------------------- +I0607 13:02:54.078128 21973 mapped_scop.cc:1021] After mapping to threads: +domain( + [M, N] -> { S_0[O_s1_n, O_s1_m] : M = 64 and N = 64 and 0 <= O_s1_n <= 63 and 0 <= O_s1_m <= 63 }) + context([M, N, t1, t0, t2, b2, b1, b0] -> { [] : t2 = 0 and b2 = 0 and 0 <= t1 <= 7 and 0 <= t0 <= 31 and 0 <= b1 <= 255 and 0 <= b0 <= 255 }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(floor((O_s1_n)/32))] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(floor((O_s1_m)/32))] } + ----------------------------------------------------------------------- + mapping_filter(ids(t1, t0, ) + [M, N, t0, t1] -> { S_0[O_s1_n, O_s1_m] : (-t1 + O_s1_n) mod 8 = 0 and (-t0 + O_s1_m) mod 32 = 0 and 0 <= t0 <= 31 and 0 <= t1 <= 7 }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n - 32*floor((O_s1_n)/32))] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m - 32*floor((O_s1_m)/32))] } + ----------------------------------------------------------------------- + thread_specific() +I0607 13:02:54.079393 21973 schedule_transforms.cc:391] Resizing scales to 2 entries: 32 32 32 +I0607 13:02:54.079439 21973 mapped_scop.cc:1029] After mapping to blocks: +domain( + [M, N] -> { S_0[O_s1_n, O_s1_m] : M = 64 and N = 64 and 0 <= O_s1_n <= 63 and 0 <= O_s1_m <= 63 }) + context([M, N, t1, t0, t2, b2, b1, b0] -> { [] : t2 = 0 and b2 = 0 and 0 <= t1 <= 7 and 0 <= t0 <= 31 and 0 <= b1 <= 255 and 0 <= b0 <= 255 }) + mapping_filter(ids(b1, b0, ) + [M, N, b0, b1] -> { S_0[O_s1_n, O_s1_m] : -31 - 32b1 + O_s1_m <= 8192*floor((O_s1_m)/8192) <= -32b1 + O_s1_m and -31 - 32b0 + O_s1_n <= 8192*floor((O_s1_n)/8192) <= -32b0 + O_s1_n }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_n)/32))] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_m)/32))] } + ----------------------------------------------------------------------- + mapping_filter(ids(t1, t0, ) + [M, N, t0, t1] -> { S_0[O_s1_n, O_s1_m] : (-t1 + O_s1_n) mod 8 = 0 and (-t0 + O_s1_m) mod 32 = 0 and 0 <= t0 <= 31 and 0 <= t1 <= 7 }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n - 32*floor((O_s1_n)/32))] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m - 32*floor((O_s1_m)/32))] } + ----------------------------------------------------------------------- + thread_specific() +I0607 13:02:54.079643 21973 mapped_scop.cc:1083] After outerBlockInnerThread strategy: +domain( + [M, N] -> { S_0[O_s1_n, O_s1_m] : M = 64 and N = 64 and 0 <= O_s1_n <= 63 and 0 <= O_s1_m <= 63 }) + context([M, N, t1, t0, t2, b2, b1, b0] -> { [] : t2 = 0 and b2 = 0 and 0 <= t1 <= 7 and 0 <= t0 <= 31 and 0 <= b1 <= 255 and 0 <= b0 <= 255 }) + mapping_filter(ids(b1, b0, ) + [M, N, b0, b1] -> { S_0[O_s1_n, O_s1_m] : -31 - 
32b1 + O_s1_m <= 8192*floor((O_s1_m)/8192) <= -32b1 + O_s1_m and -31 - 32b0 + O_s1_n <= 8192*floor((O_s1_n)/8192) <= -32b0 + O_s1_n }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_n)/32))] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_m)/32))] } + ----------------------------------------------------------------------- + mapping_filter(ids(t1, t0, ) + [M, N, t0, t1] -> { S_0[O_s1_n, O_s1_m] : (-t1 + O_s1_n) mod 8 = 0 and (-t0 + O_s1_m) mod 32 = 0 and 0 <= t0 <= 31 and 0 <= t1 <= 7 }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n - 32*floor((O_s1_n)/32))] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m - 32*floor((O_s1_m)/32))] } + ----------------------------------------------------------------------- + thread_specific() +I0607 13:02:54.079829 21973 cuda_tc_executor.cc:90] Mapped schedule: +domain( + [M, N] -> { S_0[O_s1_n, O_s1_m] : M = 64 and N = 64 and 0 <= O_s1_n <= 63 and 0 <= O_s1_m <= 63 }) + context([M, N, t1, t0, t2, b2, b1, b0] -> { [] : t2 = 0 and b2 = 0 and 0 <= t1 <= 7 and 0 <= t0 <= 31 and 0 <= b1 <= 255 and 0 <= b0 <= 255 }) + mapping_filter(ids(b1, b0, ) + [M, N, b0, b1] -> { S_0[O_s1_n, O_s1_m] : -31 - 32b1 + O_s1_m <= 8192*floor((O_s1_m)/8192) <= -32b1 + O_s1_m and -31 - 32b0 + O_s1_n <= 8192*floor((O_s1_n)/8192) <= -32b0 + O_s1_n }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_n)/32))] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_m)/32))] } + ----------------------------------------------------------------------- + mapping_filter(ids(t1, t0, ) + [M, N, t0, t1] -> { S_0[O_s1_n, O_s1_m] : (-t1 + O_s1_n) mod 8 = 0 and (-t0 + O_s1_m) mod 32 = 0 and 0 <= t0 <= 31 and 0 <= t1 <= 7 }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n - 32*floor((O_s1_n)/32))] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m - 32*floor((O_s1_m)/32))] } + ----------------------------------------------------------------------- + thread_specific() +I0607 13:02:54.091660 21973 mapped_scop.cc:900] Codegen with tightened bounds [blocks:CudaDim(2, 2, 1) @0x7ffefab63f90, threads:CudaDim(32, 8, 1) @0x7ffefab63fd0] for tree: +domain( + [M, N] -> { S_0[O_s1_n, O_s1_m] : M = 64 and N = 64 and 0 <= O_s1_n <= 63 and 0 <= O_s1_m <= 63 }) + context([M, N, t1, t0, t2, b2, b1, b0] -> { [] : M = 64 and N = 64 and t2 = 0 and b2 = 0 and 0 <= t1 <= 7 and 0 <= t0 <= 31 and 0 <= b1 <= 1 and 0 <= b0 <= 1 }) + mapping_filter(ids(b1, b0, ) + [M, N, b0, b1] -> { S_0[O_s1_n, O_s1_m] : -31 - 32b1 + O_s1_m <= 8192*floor((O_s1_m)/8192) <= -32b1 + O_s1_m and -31 - 32b0 + O_s1_n <= 8192*floor((O_s1_n)/8192) <= -32b0 + O_s1_n }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { 
S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_n)/32))] }
+ -----------------------------------------------------------------------
+ | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_m)/32))] }
+ -----------------------------------------------------------------------
+ mapping_filter(ids(t1, t0, )
+ [M, N, t0, t1] -> { S_0[O_s1_n, O_s1_m] : (-t1 + O_s1_n) mod 8 = 0 and (-t0 + O_s1_m) mod 32 = 0 and 0 <= t0 <= 31 and 0 <= t1 <= 7 })
+ band(n(2) permutable(1) coincident(1, 1) unroll(0, 0)
+ -----------------------------------------------------------------------
+ | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n - 32*floor((O_s1_n)/32))] }
+ -----------------------------------------------------------------------
+ | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m - 32*floor((O_s1_m)/32))] }
+ -----------------------------------------------------------------------
+ thread_specific()
+I0607 13:02:54.130249 21973 cuda_rtc.cc:58] NVRTC function source:
+
+template <typename T> inline __device__ T floord(T n, T d) {
+  return n < 0 ? - (-n + d - 1)/d : n / d;
+}
+#define if_then_else(cond,a,b) ((cond) ? (a) : (b))
+
+// Halide type handling
+typedef int int32;
+typedef long int64;
+typedef float float32;
+typedef double float64;
+
+#define inff __int_as_float(0x7f800000)
+#define inf __longlong_as_double(0x7ff0000000000000LL)
+
+// Before CUDA 9, syncwarp is a noop since warps are always synchronized.
+#if __CUDACC_VER_MAJOR__ < 9
+__device__ void __syncwarp(unsigned mask = 0xFFFFFFFF) {}
+#endif
+
+extern "C" {
+__global__ void tensoraddstrided_64_64(int32 M, int32 N, float32* pO, const float32* pI0_view, const float32* pI1_view) {
+  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
+  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
+  float32 (*O)[64] = reinterpret_cast<float32 (*)[64]>(pO);
+  const float32 (*I0_view)[128] = reinterpret_cast<const float32 (*)[128]>(pI0_view);
+  const float32 (*I1_view)[128] = reinterpret_cast<const float32 (*)[128]>(pI1_view);
+  for (int c2 = t1; c2 <= 31; c2 += 8) {
+    O[(32 * b0 + c2)][(t0 + 32 * b1)] = (O[(32 * b0 + c2)][(t0 + 32 * b1)] + (I0_view[(32 * b0 + c2)][(t0 + 32 * b1)] + I1_view[(32 * b0 + c2)][(t0 + 32 * b1)]));
+  }
+}
+}
+I0607 13:02:54.348301 21973 cuda_tc_executor.cc:64] [COMPILE] Compiling with host JIT compiler took: 218ms
+WARNING:
+Reduction without initialization. If O is not pre-initialized before calling the TC function, consider using the !-suffixed reduction operator +=! instead of +=:
+
+def tensoraddstrided(float(N, M) I0_view, float(N, M) I1_view) -> (O) {
+  O(n, m) += I0_view(n, m) + I1_view(n, m)
+  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~... <--- HERE
+}
+
+[ OK ] TcCudaMapperTest.TensorAddStrided (297 ms)
+[----------] 1 test from TcCudaMapperTest (297 ms total)
+
+[----------] Global test environment tear-down
+[==========] 1 test from 1 test case ran. (298 ms total)
+[ PASSED ] 1 test.
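
Note (illustration, not part of the patch): the per-dimension rule that
the new emitTensorView logic applies can be restated as a standalone
helper. This sketch assumes TensorInfo-style shape/strides vectors with
the outermost dimension first, keeps the patch's restriction to a unit
innermost stride, and uses a hypothetical name (viewExtent):

  #include <cstdint>
  #include <vector>

  // Extent printed for view dimension i (i = 1 .. ndim-1; dimension 0 is
  // never part of the cast type).
  int64_t viewExtent(
      const std::vector<int64_t>& shape,
      const std::vector<int64_t>& strides,
      size_t i) {
    // Restriction carried over from the patch's TODO: the innermost
    // stride must be 1.
    bool unitInnermost = !strides.empty() && strides.back() == 1;
    // Dimension i-1 is non-contiguous if its stride is not the product
    // of the inner extent and inner stride.
    bool nonContiguous = strides[i - 1] != shape[i] * strides[i];
    return (unitInnermost && nonContiguous) ? strides[i - 1] : shape[i];
  }

  int main() {
    // The view from the test: 64x64 with strides {128, 1} prints "[128]",
    // matching the reinterpret_cast expected in test_tc_mapper.cc.
    std::vector<int64_t> shape = {64, 64};
    std::vector<int64_t> strides = {128, 1};
    return viewExtent(shape, strides, 1) == 128 ? 0 : 1;
  }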