From 2f842fbf21af79846ec0bce8ade94bdd58d34398 Mon Sep 17 00:00:00 2001
From: Protonu Basu <protonu@devfair017.maas>
Date: Thu, 7 Jun 2018 14:53:17 -0700
Subject: [PATCH] Add support for strided tensors

This commit starts adding support for strided tensors. It percolates a
vector of TensorInfo down to emitCudaKernel so that codegen can emit the
correct cast for strided input tensors. This required changing a unit
test to expect the correct cast.
---
 .test_tc_mapper_output.txt.swp         | Bin 0 -> 16384 bytes
 tc/core/cuda/cuda_tc_executor.cc       |   5 ++++-
 tc/core/polyhedral/cuda/codegen.cc     |  26 ++++++++++++++++++-------
 tc/core/polyhedral/cuda/codegen.h      |   3 ++-
 tc/core/polyhedral/cuda/mapped_scop.cc |   7 ++++---
 tc/core/polyhedral/cuda/mapped_scop.h  |   4 +++-
 test/cuda/test_tc_mapper.cc            |   4 ++--
 7 files changed, 34 insertions(+), 15 deletions(-)
 create mode 100644 .test_tc_mapper_output.txt.swp
diff --git a/.test_tc_mapper_output.txt.swp b/.test_tc_mapper_output.txt.swp
new file mode 100644
index 0000000000000000000000000000000000000000..da018f2969f26a221cefaf8aa16e28f72ea0b4a1
GIT binary patch
literal 16384
zcmeHOU5p!76}~`I0<@H$s*nfLljMP!jqM%V`{Q_bL$_|4I%L<&t`m@~Wi+0-vmP>@
zYt4-JCn2Fc@gr5df#?erm4FHXr94#>Bq9nGPdp$bgj7(4P+veuAfBM`-I=l1_9mN6
zva298qkQqq{XO@)=bky|-r1&hvARIhxe12reT;cuInsFUH`fop++eKZi%@u7nX~I<
z@pap9d@Vn{YcX~$X>h#Gy+HUC(eC)Xg?mSM!Fb4nP!CPLZFD-^*F`t%bi-V@8SdU_
znwv6^GO$Mm9%RKrUXz7S96v@M{J@1h)=P_}45SRC45SRC45SRC45SRC45SSFKQIt>
z?qi?BK;E7Vrkeb|ZRGbq$rS|;$U8TZ|5lPeo}^zM$$v-kTuy#p7|Bl_)1Q=ql!26i
zl!26il!26il!26il!26il!26ilz{_az_1v5812?&z>o9)Ui|<2hZy@F@HB7{_$crZ
z;L3X$y9^Y7L%^>dWb8HIRp7_K5#YnXuODFSd7uT@fC>EjJ&gSbcn09Whk(Dlo3Wn(
zUjsf5_`niS0FD4}y^FCofLDR<0AB_wzzp!W!;JkD2!Y3df4-Bk>%cnj81Rq#8G9M{
zAz%YJ;7^Abdl9gK0`TU2jC~h)9{3bMz;Cfx@m1hF@BnZfn<HNYW`Vzgr*8t1uNBGJ
z4KBj<S!yg~XmORYr|Al<=vt$u2YKDgQ1si5G8#(!Kv^ml#%xyzUrCbG@nS($SLrH~
zt!4M;irTsz$-JGCc?Tz1)9@_CQwl2K1CVw%G@33~@+z6a@l40!UZ~_VgoLi=3)fY&
z3~4IHv_Irz(b68dx#>*ZQ%QzqX8$C)(^<Kd{+CIk*|~T*cr;6kgy|nhYa?xeuByxu
zZNu>tR@J7oY04MNTA@5Sk<+FpOVb(^@~5VY6quruGtKhE<isk?+adQUbX><<BYa21
z{qnugl6Y+=Yk2?WV~;u-o11;_{eM^+irU1eHfUN;8v?V%t*+DvsSEpQGxlsXhV)^7
zB}irkrg6Y>YD1P2Pd&4FPi*GF>f>#U{NU|UAwM;lXrkG*4Ea$b-{fXD6uz`krQ#|H
z-&u1!!-e6B^peL~nd=68nHyHkbGIr6vJ~5J1Fp_QI-83pup&BJD=ow4R+5`!27JkP
z){T%a@V4-`Ml<6oH^{>Wmv!;DE6huQQka~|5dM=YLX?ZL>Mb841xm4yp;A91G+XDL
zrf>KV)OCZQ-e7upwGCCYxf!}!i}<p0RM_|po==*S6{7ZKA8>r)-fd@d$?%OfhXoHF
z=eF?qN`Dy-b?A7ZZ`9F9lFye!;DpXPulZrifZq`MU7obMqwKuvij9>^d;`W|_g0_m
z%1>2S&{xRUwv^B;mxq(e<t3j7++XL#O5bTo`i)*hdysFa+)ZU0Dm5&`-sRfyM%7A%
z;%=H-SS2~=3bc9wpXq|IS|^lW@0QDL;}TyqK>2WnB1g|+FZS9VuUsxiG>bPTJ#}XJ
zOr#4n1)g(TrIT}1)%10TZ}bwVnjd&(7Pl#kJ})*bD+qnKjaVkGFhXT9LknuJdWMD)
z<2F?3tt&N^t}wQY*M%v^MjKAp0^Q_zP$|Q8E*mmAM^&3@<OJl2knkaMwx7=}YDU^h
zCYFYDLc$iOdWGz+7d4rION+0bEqE!=s3iYbHt5<=dDO<zPWl#DL<Z~z27U|zU>zA2
z2^)`P_R0C>#p>eOGNlY3IKvHEzOYEqwMuofw*a_bhi%Kn-?PtK7EFUwIM&#9_f{zn
zg$zQnebL@gb-3hNS2XcXh>CM>-SLIjM$|(A53}7)jy2}`*D5V{Jx}V+VDrhnX#_mS
z7DdR*bXJHp7upwz;JDH=o};lu&@m?C|D%X`k0TbA@qh1p{w2iv&j9~IjQ<Mo8Q>A%
z=ZNh)z)^q!KS5mo3h*>=0ayVZ1&#vu1Aj-1{|DeT;6>mEz!!iepavWQ-b9@Lec;=`
zw}7t$j{>hF#(x(0G;kPr2{HbcfIlO)e*^d<@H!9x4)9CF_rCzH0iOUK0Uie45Bv_}
zdk**p&;h0aIp$+PdZi4c45SR)n1Ke-()`NGnJ0*PwtU1A-^2owZ@^1X3a3ic4k#3%
z;i`MH_jt?uSxanqY`FYjN24zx+0pc1NAcT=Y9EK#qlI_8v3j*rZ|&f{wGpqoI8iEU
z`CYv3DQsyqDz$~B>iILPB>!qI-r16VDYXP#i*waFJW81^_q~L;R7P{9b^yOydfu`3
zU27o1qA_Y}bdt=1y4p}ec^fC@u~Re$-}jb<PLh&%;Cm>V_~y4NsYU@V;yY3hok2K=
z66R;*4R#2N@f}?MlJv|ea-QK&Nv9sdD`QD7Y9n7L$x-+mt#BVcd8oq`3TLIaJy;<w
zd0Of8YRdR}v`{k2J83nJP~t)LrL3t9Ig>XKZ+j$B5y(}G>Pjv!Mc^Wez^m3~ZgU1v
z08;Zqsar{E#9P1YWjQKY8Z$meO&Q=1G|`kz^b?a7nzDs{A{x;3wa6#cby^pWwIj6B
zrzY@E&KhO{2OjYaBCTXhW7Mg|(N3R#Bqh||Afp$DU159^Q|Qt0%5fQuo#3A3*boPQ
zYwDQ1CyuTMTb{XL_#LH-fNIS{@YFVfOBjW=`DAiIX|A#pj!n8=x$wk%rJ`Rvv#c-7
zpQ|mSomo1?#>Xk%ULaY3PGzXqtc>prM0lb@2-HkYvM(h0oD6rv*b1VOEja4VWp3pl
zC<0K&;~+xz=L!$}d26~6=$7cp^9*fs+P1a9>->3ju$rs~8fBF@M3ps4Q3VxeU0WS4
zW8Ap4nC7X8nSOdyrN{nuvJ^s2&R|hI#`85J33v*Tt;I>UPuMcCWZXJ199u^W>**Z3
zQ3_Uyl`%3+m6YgSrH_$fs8lAUsj895!P4tu_T;|4hTJSESr4|x!d+e+w@}ohak3yF
zZ+Bd5r_9QZJ?!;R{WPRP?;JezXS*(kBz#M$6E*_sWJNLhh$YPzwZg=yK}+g~rrdFx
zgjra;xI7SnQy{v&ne4hza>dI`4?5g*Y{!hYE_!*}0o`5QVMX??+u6r&?>vWV51+%u
zt=^Sv+m%^Q#@pN5zV&J0K}mV1rxLw!Mti~;7+WB}%b6`SJPX@<GC8$UsVyujNox}w
zULjMyQIn1SC=+I}mm3{-p+NLX+dk|<dq|m<CtW>pu<a`M)^t0H*A)kwuOUu8rFgwU
K+mm%yckI6>2dCx$

literal 0
HcmV?d00001

diff --git a/tc/core/cuda/cuda_tc_executor.cc b/tc/core/cuda/cuda_tc_executor.cc
index 72a1350ad..1ebb2047b 100644
--- a/tc/core/cuda/cuda_tc_executor.cc
+++ b/tc/core/cuda/cuda_tc_executor.cc
@@ -93,13 +93,16 @@ CudaCompilationResult CudaBackend::compileWithTcMapper(
   auto parameters = mappedScop->scop().getParameterValues();
   auto specializedName = specializeKernelName(tcName, parameters);
 
+  auto inputsInfo = makeTensorInfoVector(inputs);
+
   // This updates the launch bounds with the actual result from compilation
   // with tightening of launch_bounds. What you get is not necessarily what
   // you asked for, the autotuner should adapt to that.
   std::string source;
   Grid grid;
   Block block;
-  std::tie(source, grid, block) = mappedScop->codegen(specializedName);
+  std::tie(source, grid, block) =
+      mappedScop->codegen(specializedName, inputsInfo);
   LOG_IF(INFO, FLAGS_dump_cuda) << "generatedCuda: " << source << "\n"
                                 << "grid: " << grid << " block: " << block;
 
diff --git a/tc/core/polyhedral/cuda/codegen.cc b/tc/core/polyhedral/cuda/codegen.cc
index ee1643984..8b5677322 100644
--- a/tc/core/polyhedral/cuda/codegen.cc
+++ b/tc/core/polyhedral/cuda/codegen.cc
@@ -183,7 +183,8 @@ void emitTensorView(
     stringstream& ss,
     Halide::OutputImageParam p,
     const map<string, Halide::Expr>& paramValues,
-    bool constInput = false) {
+    bool constInput = false,
+    const TensorInfo* tinfo = NULL) {
   WS ws;
   stringstream ssViewType;
   for (int i = 1; i < p.dimensions(); ++i) { // Skip the outermost dimension
@@ -191,7 +192,14 @@ void emitTensorView(
     extent = Halide::Internal::substitute(paramValues, extent);
     CHECK(extent.defined())
         << "Undefined extent on input/output tensor. Forward bounds inference should have set these\n";
-    ssViewType << "[" << extent << "]";
+    // TODO: Handle non-unit stride in the innermost dimension
+    if (tinfo && tinfo->strides.size() == p.dimensions() &&
+        tinfo->strides[p.dimensions() - 1] == 1 &&
+        tinfo->strides[i - 1] != (tinfo->shape[i] * tinfo->strides[i])) {
+      ssViewType << "[" << tinfo->strides[i - 1] << "]";
+    } else {
+      ssViewType << "[" << extent << "]";
+    }
   }
   ss << ws.tab();
   ss << (constInput ? "const " : "") << p.type() << " (*" << p.name() << ")"
@@ -216,9 +224,12 @@ void emitTensorViews(
 void emitTensorViews(
     stringstream& ss,
     const vector<Halide::ImageParam>& params,
-    const map<string, Halide::Expr>& paramValues) {
-  for (auto p : params) {
-    emitTensorView(ss, p, paramValues, true);
+    const map<string, Halide::Expr>& paramValues,
+    const std::vector<TensorInfo>& inputsInfo = std::vector<TensorInfo>{}) {
+  for (size_t i = 0; i < params.size(); ++i) {
+    inputsInfo.size()
+        ? emitTensorView(ss, params[i], paramValues, true, &inputsInfo[i])
+        : emitTensorView(ss, params[i], paramValues, true);
   }
 }
 
@@ -738,7 +749,8 @@ std::unordered_set<isl::id, isl::IslIdIslHash> gatherReadOnlySet(
 
 string emitCudaKernel(
     const std::string& specializedName,
-    const MappedScop& mscop) {
+    const MappedScop& mscop,
+    const std::vector<TensorInfo>& inputsInfo) {
   // Expecting a schedule with domain root and context first child.
   CHECK(mscop.schedule()->elemAs<detail::ScheduleTreeElemDomain>());
   CHECK(
@@ -755,7 +767,7 @@ string emitCudaKernel(
   emitKernelSignature(ss, specializedName, scop);
   emitThreadIdInit(ss, mscop);
   emitTensorViews(ss, scop.halide.outputs, paramValues);
-  emitTensorViews(ss, scop.halide.inputs, paramValues);
+  emitTensorViews(ss, scop.halide.inputs, paramValues, inputsInfo);
   emitTmpDecl(ss, scop);
   emitPromotedArrayViewsHalide(ss, scop);
   NodeInfoMapType nodeInfoMap;
diff --git a/tc/core/polyhedral/cuda/codegen.h b/tc/core/polyhedral/cuda/codegen.h
index ff3631d92..bd94f1bd3 100644
--- a/tc/core/polyhedral/cuda/codegen.h
+++ b/tc/core/polyhedral/cuda/codegen.h
@@ -145,7 +145,8 @@ struct CodegenStatementContext : CodegenContext {
 
 std::string emitCudaKernel(
     const std::string& specializedName,
-    const MappedScop& scop);
+    const MappedScop& scop,
+    const std::vector<TensorInfo>& inputsInfo = std::vector<TensorInfo>{});
 
 } // namespace polyhedral
 } // namespace tc
diff --git a/tc/core/polyhedral/cuda/mapped_scop.cc b/tc/core/polyhedral/cuda/mapped_scop.cc
index e0dc474ae..1efb03c0b 100644
--- a/tc/core/polyhedral/cuda/mapped_scop.cc
+++ b/tc/core/polyhedral/cuda/mapped_scop.cc
@@ -910,7 +910,8 @@ std::unique_ptr<MappedScop> makeSpecializedMappedScop(
 // the context of the original scop as top-level
 // context node in schedule tree.
 std::tuple<std::string, tc::Grid, tc::Block> MappedScop::codegen(
-    const std::string& specializedName) const {
+    const std::string& specializedName,
+    const std::vector<TensorInfo>& inputsInfo) const {
   validate(schedule());
 
   auto mappedScopForCodegen = makeSpecializedMappedScop(*this);
@@ -927,8 +928,8 @@ std::tuple<std::string, tc::Grid, tc::Block> MappedScop::codegen(
     code << code::cuda::cubBlockReduce;
   }
   code << "extern \"C\" {" << std::endl
-       << emitCudaKernel(specializedName, *mappedScopForCodegen) << "}"
-       << std::endl;
+       << emitCudaKernel(specializedName, *mappedScopForCodegen, inputsInfo)
+       << "}" << std::endl;
 
   return std::make_tuple(
       code.str(),
diff --git a/tc/core/polyhedral/cuda/mapped_scop.h b/tc/core/polyhedral/cuda/mapped_scop.h
index 169b4f138..5af792df9 100644
--- a/tc/core/polyhedral/cuda/mapped_scop.h
+++ b/tc/core/polyhedral/cuda/mapped_scop.h
@@ -115,7 +115,9 @@ class MappedScop {
   // Generate CUDA code at the current state of transformation provided a
   // name for the generated function.
   std::tuple<std::string, tc::Grid, tc::Block> codegen(
-      const std::string& specializedName) const;
+      const std::string& specializedName,
+      const std::vector<TensorInfo>& inputsInfo =
+          std::vector<TensorInfo>{}) const;
 
   // Accessors..
   // Const accessor to schedule of underlying Scop.
diff --git a/test/cuda/test_tc_mapper.cc b/test/cuda/test_tc_mapper.cc
index e89756aea..3aedafa29 100644
--- a/test/cuda/test_tc_mapper.cc
+++ b/test/cuda/test_tc_mapper.cc
@@ -326,8 +326,8 @@ def tensoraddstrided(float(N, M) I0_view, float(N, M) I1_view) -> (O) {
   auto res = Check(TC, name, options, inputs, checkFun);
   // This test should be modified  when strided tensors are handled
   std::string expected =
-      "const float32 (*I0_view)[64] = "
-      "reinterpret_cast<const float32 (*)[64]>(pI0_view)";
+      "const float32 (*I0_view)[128] = "
+      "reinterpret_cast<const float32 (*)[128]>(pI0_view)";
   ASSERT_NE(std::string::npos, res.second.find(expected))
       << "In resulting code:\n"
       << res.second << "\nfound unexpected: " << expected;