From ff1ed36fced4ba9c648d6d6dfebd484eed09e79d Mon Sep 17 00:00:00 2001
From: Protonu Basu
Date: Thu, 7 Jun 2018 14:53:17 -0700
Subject: [PATCH] Add support for strided tensors

This commit starts support for strided tensors. I made changes to
percolate a vector of TensorInfo down to emitCudaKernel, allowing
codegen to emit the correct casts for strided input tensors. This
required updating a unit test to expect the correct cast.
---
 tc/core/cuda/cuda_tc_executor.cc       |   5 +-
 tc/core/polyhedral/cuda/codegen.cc     |  26 ++-
 tc/core/polyhedral/cuda/codegen.h      |   3 +-
 tc/core/polyhedral/cuda/mapped_scop.cc |   7 +-
 tc/core/polyhedral/cuda/mapped_scop.h  |   4 +-
 test/cuda/test_tc_mapper.cc            |   4 +-
 test_tc_mapper_output.txt              | 226 +++++++++++++++++++++++++
 7 files changed, 260 insertions(+), 15 deletions(-)
 create mode 100644 test_tc_mapper_output.txt
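Note (illustration, not part of the patch): the updated unit test below
expects the cast for a strided input to use the parent tensor's row
pitch rather than the view's logical extent. A minimal standalone C++
sketch of that addressing rule, with illustrative names only:

  #include <cassert>
  #include <vector>

  int main() {
    // A 64x64 view into a row-major 64x128 parent tensor:
    // shape = {64, 64}, strides = {128, 1}.
    std::vector<float> parent(64 * 128, 0.0f);
    float* pI0_view = parent.data();

    // Contiguous cast (old behavior): float (*)[64] would place rows 64
    // floats apart and address the wrong parent elements.
    // Strided cast (this patch): float (*)[128] places row n at
    // pI0_view + n * strides[0].
    float(*I0_view)[128] = reinterpret_cast<float(*)[128]>(pI0_view);

    I0_view[2][3] = 1.0f;
    assert(&I0_view[2][3] == &parent[2 * 128 + 3]);
    return 0;
  }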
diff --git a/tc/core/cuda/cuda_tc_executor.cc b/tc/core/cuda/cuda_tc_executor.cc
index 72a1350ad..1ebb2047b 100644
--- a/tc/core/cuda/cuda_tc_executor.cc
+++ b/tc/core/cuda/cuda_tc_executor.cc
@@ -93,13 +93,16 @@ CudaCompilationResult CudaBackend::compileWithTcMapper(
   auto parameters = mappedScop->scop().getParameterValues();
   auto specializedName = specializeKernelName(tcName, parameters);
 
+  auto inputsInfo = makeTensorInfoVector(inputs);
+
   // This updates the launch bounds with the actual result from compilation
   // with tightening of launch_bounds. What you get is not necessarily what
   // you asked for, the autotuner should adapt to that.
   std::string source;
   Grid grid;
   Block block;
-  std::tie(source, grid, block) = mappedScop->codegen(specializedName);
+  std::tie(source, grid, block) =
+      mappedScop->codegen(specializedName, inputsInfo);
 
   LOG_IF(INFO, FLAGS_dump_cuda) << "generatedCuda: " << source << "\n"
                                 << "grid: " << grid << " block: " << block;
diff --git a/tc/core/polyhedral/cuda/codegen.cc b/tc/core/polyhedral/cuda/codegen.cc
index ee1643984..8b5677322 100644
--- a/tc/core/polyhedral/cuda/codegen.cc
+++ b/tc/core/polyhedral/cuda/codegen.cc
@@ -183,7 +183,8 @@ void emitTensorView(
     stringstream& ss,
     Halide::OutputImageParam p,
     const map<string, Halide::Expr>& paramValues,
-    bool constInput = false) {
+    bool constInput = false,
+    const TensorInfo* tinfo = NULL) {
   WS ws;
   stringstream ssViewType;
   for (int i = 1; i < p.dimensions(); ++i) { // Skip the outermost dimension
@@ -191,7 +192,14 @@ void emitTensorView(
     extent = Halide::Internal::substitute(paramValues, extent);
     CHECK(extent.defined())
         << "Undefined extent on input/output tensor. Forward bounds inference should have set these\n";
-    ssViewType << "[" << extent << "]";
+    // TODO: Handle non-unit stride in the innermost dimension
+    if (tinfo && tinfo->strides.size() == p.dimensions() &&
+        tinfo->strides[p.dimensions() - 1] == 1 &&
+        tinfo->strides[i - 1] != (tinfo->shape[i] * tinfo->strides[i])) {
+      ssViewType << "[" << tinfo->strides[i - 1] << "]";
+    } else {
+      ssViewType << "[" << extent << "]";
+    }
   }
   ss << ws.tab();
   ss << (constInput ? "const " : "") << p.type() << " (*" << p.name() << ")"
@@ -216,9 +224,12 @@ void emitTensorViews(
 void emitTensorViews(
     stringstream& ss,
     const vector<Halide::ImageParam>& params,
-    const map<string, Halide::Expr>& paramValues) {
-  for (auto p : params) {
-    emitTensorView(ss, p, paramValues, true);
+    const map<string, Halide::Expr>& paramValues,
+    const std::vector<TensorInfo>& inputsInfo = std::vector<TensorInfo>{}) {
+  for (size_t i = 0; i < params.size(); ++i) {
+    inputsInfo.size()
+        ? emitTensorView(ss, params[i], paramValues, true, &inputsInfo[i])
+        : emitTensorView(ss, params[i], paramValues, true);
   }
 }
 
@@ -738,7 +749,8 @@ std::unordered_set<isl::id, isl::IslIdIslHash> gatherReadOnlySet(
 
 string emitCudaKernel(
     const std::string& specializedName,
-    const MappedScop& mscop) {
+    const MappedScop& mscop,
+    const std::vector<TensorInfo>& inputsInfo) {
   // Expecting a schedule with domain root and context first child.
  CHECK(mscop.schedule()->elemAs<detail::ScheduleTreeElemDomain>());
   CHECK(
@@ -755,7 +767,7 @@ string emitCudaKernel(
   emitKernelSignature(ss, specializedName, scop);
   emitThreadIdInit(ss, mscop);
   emitTensorViews(ss, scop.halide.outputs, paramValues);
-  emitTensorViews(ss, scop.halide.inputs, paramValues);
+  emitTensorViews(ss, scop.halide.inputs, paramValues, inputsInfo);
   emitTmpDecl(ss, scop);
   emitPromotedArrayViewsHalide(ss, scop);
   NodeInfoMapType nodeInfoMap;
diff --git a/tc/core/polyhedral/cuda/codegen.h b/tc/core/polyhedral/cuda/codegen.h
index ff3631d92..bd94f1bd3 100644
--- a/tc/core/polyhedral/cuda/codegen.h
+++ b/tc/core/polyhedral/cuda/codegen.h
@@ -145,7 +145,8 @@ struct CodegenStatementContext : CodegenContext {
 
 std::string emitCudaKernel(
     const std::string& specializedName,
-    const MappedScop& scop);
+    const MappedScop& scop,
+    const std::vector<TensorInfo>& inputsInfo = std::vector<TensorInfo>{});
 
 } // namespace polyhedral
 } // namespace tc
diff --git a/tc/core/polyhedral/cuda/mapped_scop.cc b/tc/core/polyhedral/cuda/mapped_scop.cc
index e0dc474ae..1efb03c0b 100644
--- a/tc/core/polyhedral/cuda/mapped_scop.cc
+++ b/tc/core/polyhedral/cuda/mapped_scop.cc
@@ -910,7 +910,8 @@ std::unique_ptr<MappedScop> makeSpecializedMappedScop(
 // the context of the original scop as top-level
 // context node in schedule tree.
 std::tuple<std::string, tc::Grid, tc::Block> MappedScop::codegen(
-    const std::string& specializedName) const {
+    const std::string& specializedName,
+    const std::vector<TensorInfo>& inputsInfo) const {
   validate(schedule());
 
   auto mappedScopForCodegen = makeSpecializedMappedScop(*this);
@@ -927,8 +928,8 @@ std::tuple<std::string, tc::Grid, tc::Block> MappedScop::codegen(
     code << code::cuda::cubBlockReduce;
   }
   code << "extern \"C\" {" << std::endl
-       << emitCudaKernel(specializedName, *mappedScopForCodegen) << "}"
-       << std::endl;
+       << emitCudaKernel(specializedName, *mappedScopForCodegen, inputsInfo)
+       << "}" << std::endl;
 
   return std::make_tuple(
       code.str(),
diff --git a/tc/core/polyhedral/cuda/mapped_scop.h b/tc/core/polyhedral/cuda/mapped_scop.h
index 169b4f138..5af792df9 100644
--- a/tc/core/polyhedral/cuda/mapped_scop.h
+++ b/tc/core/polyhedral/cuda/mapped_scop.h
@@ -115,7 +115,9 @@ class MappedScop {
   // Generate CUDA code at the current state of transformation provided a
   // name for the generated function.
   std::tuple<std::string, tc::Grid, tc::Block> codegen(
-      const std::string& specializedName) const;
+      const std::string& specializedName,
+      const std::vector<TensorInfo>& inputsInfo =
+          std::vector<TensorInfo>{}) const;
 
   // Accessors..
   // Const accessor to schedule of underlying Scop.
diff --git a/test/cuda/test_tc_mapper.cc b/test/cuda/test_tc_mapper.cc
index e89756aea..3aedafa29 100644
--- a/test/cuda/test_tc_mapper.cc
+++ b/test/cuda/test_tc_mapper.cc
@@ -326,8 +326,8 @@ def tensoraddstrided(float(N, M) I0_view, float(N, M) I1_view) -> (O) {
   auto res = Check(TC, name, options, inputs, checkFun);
   // This test should be modified when strided tensors are handled
   std::string expected =
-      "const float32 (*I0_view)[64] = "
-      "reinterpret_cast<const float32 (*)[64]>(pI0_view)";
+      "const float32 (*I0_view)[128] = "
+      "reinterpret_cast<const float32 (*)[128]>(pI0_view)";
   ASSERT_NE(std::string::npos, res.second.find(expected))
       << "In resulting code:\n"
       << res.second << "\nfound unexpected: " << expected;
diff --git a/test_tc_mapper_output.txt b/test_tc_mapper_output.txt
new file mode 100644
index 000000000..24a7ee836
--- /dev/null
+++ b/test_tc_mapper_output.txt
@@ -0,0 +1,226 @@
+Note: Google Test filter = *Strided*
+[==========] Running 1 test from 1 test case.
+[----------] Global test environment set-up.
+[----------] 1 test from TcCudaMapperTest +[ RUN ] TcCudaMapperTest.TensorAddStrided +WARNING: +Reduction without initialization. If O is not pre-initialized before calling the TC function, consider using the !-suffixed reduction operator +=! instead of +=: + +def tensoraddstrided(float(N, M) I0_view, float(N, M) I1_view) -> (O) { + O(n, m) += I0_view(n, m) + I1_view(n, m) + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~... <--- HERE +} + +WARNING: +Reduction without initialization. If O is not pre-initialized before calling the TC function, consider using the !-suffixed reduction operator +=! instead of +=: + +def tensoraddstrided(float(N, M) I0_view, float(N, M) I1_view) -> (O) { + O(n, m) += I0_view(n, m) + I1_view(n, m) + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~... <--- HERE +} + +I0607 13:02:54.070823 21973 cuda_tc_executor.cc:82] tc::CudaMappingOptions::makeNaiveMappingOptions() + .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident) + .outerScheduleAllowSkewing(false) + .outerSchedulePositiveOrthant(true) + .intraTileScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident) + .intraTileScheduleAllowSkewing(false) + .intraTileSchedulePositiveOrthant(true) + .fixParametersBeforeScheduling(false) + .tile(32, 32, 32) + .unroll(1) + .tileImperfectlyNested(false) + .matchLibraryCalls(false) + .mapToThreads(32, 8) + .mapToBlocks(256, 256) + .useSharedMemory(false) + .usePrivateMemory(false) + .unrollCopyShared(false) + .useReadOnlyCache(false); +I0607 13:02:54.072165 21973 cuda_tc_executor.cc:83] original schedule: +domain( + [M, N] -> { S_0[O_s1_n, O_s1_m] : 0 <= O_s1_n < N and 0 <= O_s1_m < M }) + band(n(1) permutable(0) coincident(0) unroll(0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n)] } + ----------------------------------------------------------------------- + band(n(1) permutable(0) coincident(0) unroll(0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m)] } + ----------------------------------------------------------------------- +I0607 13:02:54.075304 21973 scop.cc:400] After scheduling: +domain( + [M, N] -> { S_0[O_s1_n, O_s1_m] : 0 <= O_s1_n < N and 0 <= O_s1_m < M }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n)] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m)] } + ----------------------------------------------------------------------- +I0607 13:02:54.075870 21973 scop.cc:454] After tiling outer: +domain( + [M, N] -> { S_0[O_s1_n, O_s1_m] : 0 <= O_s1_n < N and 0 <= O_s1_m < M }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(floor((O_s1_n)/32))] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(floor((O_s1_m)/32))] } + ----------------------------------------------------------------------- + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n - 32*floor((O_s1_n)/32))] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m - 
32*floor((O_s1_m)/32))] } + ----------------------------------------------------------------------- +I0607 13:02:54.078128 21973 mapped_scop.cc:1021] After mapping to threads: +domain( + [M, N] -> { S_0[O_s1_n, O_s1_m] : M = 64 and N = 64 and 0 <= O_s1_n <= 63 and 0 <= O_s1_m <= 63 }) + context([M, N, t1, t0, t2, b2, b1, b0] -> { [] : t2 = 0 and b2 = 0 and 0 <= t1 <= 7 and 0 <= t0 <= 31 and 0 <= b1 <= 255 and 0 <= b0 <= 255 }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(floor((O_s1_n)/32))] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(floor((O_s1_m)/32))] } + ----------------------------------------------------------------------- + mapping_filter(ids(t1, t0, ) + [M, N, t0, t1] -> { S_0[O_s1_n, O_s1_m] : (-t1 + O_s1_n) mod 8 = 0 and (-t0 + O_s1_m) mod 32 = 0 and 0 <= t0 <= 31 and 0 <= t1 <= 7 }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n - 32*floor((O_s1_n)/32))] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m - 32*floor((O_s1_m)/32))] } + ----------------------------------------------------------------------- + thread_specific() +I0607 13:02:54.079393 21973 schedule_transforms.cc:391] Resizing scales to 2 entries: 32 32 32 +I0607 13:02:54.079439 21973 mapped_scop.cc:1029] After mapping to blocks: +domain( + [M, N] -> { S_0[O_s1_n, O_s1_m] : M = 64 and N = 64 and 0 <= O_s1_n <= 63 and 0 <= O_s1_m <= 63 }) + context([M, N, t1, t0, t2, b2, b1, b0] -> { [] : t2 = 0 and b2 = 0 and 0 <= t1 <= 7 and 0 <= t0 <= 31 and 0 <= b1 <= 255 and 0 <= b0 <= 255 }) + mapping_filter(ids(b1, b0, ) + [M, N, b0, b1] -> { S_0[O_s1_n, O_s1_m] : -31 - 32b1 + O_s1_m <= 8192*floor((O_s1_m)/8192) <= -32b1 + O_s1_m and -31 - 32b0 + O_s1_n <= 8192*floor((O_s1_n)/8192) <= -32b0 + O_s1_n }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_n)/32))] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_m)/32))] } + ----------------------------------------------------------------------- + mapping_filter(ids(t1, t0, ) + [M, N, t0, t1] -> { S_0[O_s1_n, O_s1_m] : (-t1 + O_s1_n) mod 8 = 0 and (-t0 + O_s1_m) mod 32 = 0 and 0 <= t0 <= 31 and 0 <= t1 <= 7 }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n - 32*floor((O_s1_n)/32))] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m - 32*floor((O_s1_m)/32))] } + ----------------------------------------------------------------------- + thread_specific() +I0607 13:02:54.079643 21973 mapped_scop.cc:1083] After outerBlockInnerThread strategy: +domain( + [M, N] -> { S_0[O_s1_n, O_s1_m] : M = 64 and N = 64 and 0 <= O_s1_n <= 63 and 0 <= O_s1_m <= 63 }) + context([M, N, t1, t0, t2, b2, b1, b0] -> { [] : t2 = 0 and b2 = 0 and 0 <= t1 <= 7 and 0 <= t0 <= 31 and 0 <= b1 <= 255 and 0 <= b0 <= 255 }) + mapping_filter(ids(b1, b0, ) + [M, N, b0, b1] -> { S_0[O_s1_n, O_s1_m] : -31 - 
32b1 + O_s1_m <= 8192*floor((O_s1_m)/8192) <= -32b1 + O_s1_m and -31 - 32b0 + O_s1_n <= 8192*floor((O_s1_n)/8192) <= -32b0 + O_s1_n }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_n)/32))] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_m)/32))] } + ----------------------------------------------------------------------- + mapping_filter(ids(t1, t0, ) + [M, N, t0, t1] -> { S_0[O_s1_n, O_s1_m] : (-t1 + O_s1_n) mod 8 = 0 and (-t0 + O_s1_m) mod 32 = 0 and 0 <= t0 <= 31 and 0 <= t1 <= 7 }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n - 32*floor((O_s1_n)/32))] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m - 32*floor((O_s1_m)/32))] } + ----------------------------------------------------------------------- + thread_specific() +I0607 13:02:54.079829 21973 cuda_tc_executor.cc:90] Mapped schedule: +domain( + [M, N] -> { S_0[O_s1_n, O_s1_m] : M = 64 and N = 64 and 0 <= O_s1_n <= 63 and 0 <= O_s1_m <= 63 }) + context([M, N, t1, t0, t2, b2, b1, b0] -> { [] : t2 = 0 and b2 = 0 and 0 <= t1 <= 7 and 0 <= t0 <= 31 and 0 <= b1 <= 255 and 0 <= b0 <= 255 }) + mapping_filter(ids(b1, b0, ) + [M, N, b0, b1] -> { S_0[O_s1_n, O_s1_m] : -31 - 32b1 + O_s1_m <= 8192*floor((O_s1_m)/8192) <= -32b1 + O_s1_m and -31 - 32b0 + O_s1_n <= 8192*floor((O_s1_n)/8192) <= -32b0 + O_s1_n }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_n)/32))] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_m)/32))] } + ----------------------------------------------------------------------- + mapping_filter(ids(t1, t0, ) + [M, N, t0, t1] -> { S_0[O_s1_n, O_s1_m] : (-t1 + O_s1_n) mod 8 = 0 and (-t0 + O_s1_m) mod 32 = 0 and 0 <= t0 <= 31 and 0 <= t1 <= 7 }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n - 32*floor((O_s1_n)/32))] } + ----------------------------------------------------------------------- + | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m - 32*floor((O_s1_m)/32))] } + ----------------------------------------------------------------------- + thread_specific() +I0607 13:02:54.091660 21973 mapped_scop.cc:900] Codegen with tightened bounds [blocks:CudaDim(2, 2, 1) @0x7ffefab63f90, threads:CudaDim(32, 8, 1) @0x7ffefab63fd0] for tree: +domain( + [M, N] -> { S_0[O_s1_n, O_s1_m] : M = 64 and N = 64 and 0 <= O_s1_n <= 63 and 0 <= O_s1_m <= 63 }) + context([M, N, t1, t0, t2, b2, b1, b0] -> { [] : M = 64 and N = 64 and t2 = 0 and b2 = 0 and 0 <= t1 <= 7 and 0 <= t0 <= 31 and 0 <= b1 <= 1 and 0 <= b0 <= 1 }) + mapping_filter(ids(b1, b0, ) + [M, N, b0, b1] -> { S_0[O_s1_n, O_s1_m] : -31 - 32b1 + O_s1_m <= 8192*floor((O_s1_m)/8192) <= -32b1 + O_s1_m and -31 - 32b0 + O_s1_n <= 8192*floor((O_s1_n)/8192) <= -32b0 + O_s1_n }) + band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) + ----------------------------------------------------------------------- + | [M, N] -> { 
S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_n)/32))] }
+ -----------------------------------------------------------------------
+ | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_m)/32))] }
+ -----------------------------------------------------------------------
+ mapping_filter(ids(t1, t0, )
+ [M, N, t0, t1] -> { S_0[O_s1_n, O_s1_m] : (-t1 + O_s1_n) mod 8 = 0 and (-t0 + O_s1_m) mod 32 = 0 and 0 <= t0 <= 31 and 0 <= t1 <= 7 })
+ band(n(2) permutable(1) coincident(1, 1) unroll(0, 0)
+ -----------------------------------------------------------------------
+ | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n - 32*floor((O_s1_n)/32))] }
+ -----------------------------------------------------------------------
+ | [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m - 32*floor((O_s1_m)/32))] }
+ -----------------------------------------------------------------------
+ thread_specific()
+I0607 13:02:54.130249 21973 cuda_rtc.cc:58] NVRTC function source:
+
+template <typename T> inline __device__ T floord(T n, T d) {
+  return n < 0 ? - (-n + d - 1)/d : n / d;
+}
+#define if_then_else(cond,a,b) ((cond) ? (a) : (b))
+
+// Halide type handling
+typedef int int32;
+typedef long int64;
+typedef float float32;
+typedef double float64;
+
+#define inff __int_as_float(0x7f800000)
+#define inf __longlong_as_double(0x7ff0000000000000LL)
+
+// Before CUDA 9, syncwarp is a noop since warps are always synchronized.
+#if __CUDACC_VER_MAJOR__ < 9
+__device__ void __syncwarp(unsigned mask = 0xFFFFFFFF) {}
+#endif
+
+extern "C" {
+__global__ void tensoraddstrided_64_64(int32 M, int32 N, float32* pO, const float32* pI0_view, const float32* pI1_view) {
+  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
+  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
+  float32 (*O)[64] = reinterpret_cast<float32 (*)[64]>(pO);
+  const float32 (*I0_view)[128] = reinterpret_cast<const float32 (*)[128]>(pI0_view);
+  const float32 (*I1_view)[128] = reinterpret_cast<const float32 (*)[128]>(pI1_view);
+  for (int c2 = t1; c2 <= 31; c2 += 8) {
+    O[(32 * b0 + c2)][(t0 + 32 * b1)] = (O[(32 * b0 + c2)][(t0 + 32 * b1)] + (I0_view[(32 * b0 + c2)][(t0 + 32 * b1)] + I1_view[(32 * b0 + c2)][(t0 + 32 * b1)]));
+  }
+}
+}
+I0607 13:02:54.348301 21973 cuda_tc_executor.cc:64] [COMPILE] Compiling with host JIT compiler took: 218ms
+WARNING:
+Reduction without initialization. If O is not pre-initialized before calling the TC function, consider using the !-suffixed reduction operator +=! instead of +=:
+
+def tensoraddstrided(float(N, M) I0_view, float(N, M) I1_view) -> (O) {
+  O(n, m) += I0_view(n, m) + I1_view(n, m)
+  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~... <--- HERE
+}
+
+[ OK ] TcCudaMapperTest.TensorAddStrided (297 ms)
+[----------] 1 test from TcCudaMapperTest (297 ms total)
+
+[----------] Global test environment tear-down
+[==========] 1 test from 1 test case ran. (298 ms total)
+[ PASSED ] 1 test.
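
Note (illustration, not part of the patch): the per-dimension rule that
the new emitTensorView logic applies can be restated as a standalone
helper. This sketch assumes TensorInfo-style shape/strides vectors with
the outermost dimension first, keeps the patch's restriction to a unit
innermost stride, and uses a hypothetical name (viewExtent):

  #include <cstdint>
  #include <vector>

  // Extent printed for view dimension i (i = 1 .. ndim-1; dimension 0 is
  // never part of the cast type).
  int64_t viewExtent(
      const std::vector<int64_t>& shape,
      const std::vector<int64_t>& strides,
      size_t i) {
    // Restriction carried over from the patch's TODO: the innermost
    // stride must be 1.
    bool unitInnermost = !strides.empty() && strides.back() == 1;
    // Dimension i-1 is non-contiguous if its stride is not the product
    // of the inner extent and inner stride.
    bool nonContiguous = strides[i - 1] != shape[i] * strides[i];
    return (unitInnermost && nonContiguous) ? strides[i - 1] : shape[i];
  }

  int main() {
    // The view from the test: 64x64 with strides {128, 1} prints "[128]",
    // matching the reinterpret_cast expected in test_tc_mapper.cc.
    std::vector<int64_t> shape = {64, 64};
    std::vector<int64_t> strides = {128, 1};
    return viewExtent(shape, strides, 1) == 128 ? 0 : 1;
  }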