Distcoalesce2 (#42)

fschlimb · tkarna · web-flow · commit 2a940b1ab338 · 2023-08-28T16:29:57.000+02:00
* enable distcoalesce
* fix update_halo for general subview case
* updating imex sha

---------

Co-authored-by: Tuomas Karna <tuomas.karna@intel.com>
diff --git a/imex_version.txt b/imex_version.txt
@@ -1 +1 @@
-89b5d56c4774ddb82ab8f896c3d977c6edae267b
+571f54577e2301c70033fef9a05b8a96fa841d2b
diff --git a/src/idtr.cpp b/src/idtr.cpp
@@ -464,16 +464,21 @@ void _idtr_update_halo(DTypeId ddpttype, int64_t ndims, int64_t *ownedOff,
   auto ownedRows = ownedShape[0];
   auto ownedRowEnd = ownedRowStart + ownedRows;
   // all remaining dims are treated as one large column
-  auto ownedCols = std::accumulate(&ownedShape[1], &ownedShape[ndims], 1,
+  auto ownedTotCols = std::accumulate(&ownedShape[1], &ownedShape[ndims], 1,
+                                      std::multiplies<int64_t>());
+  auto bbTotCols = std::accumulate(&bbShape[1], &bbShape[ndims], 1,
                                    std::multiplies<int64_t>());
 
   // find local elements to send to next workers (destination leftHalo)
   // and previous workers (destination rightHalo)
   std::vector<int> lSendOff(nworkers, 0), rSendOff(nworkers, 0);
   std::vector<int> lSendSize(nworkers, 0), rSendSize(nworkers, 0);
 
-  // use send buffer if owned data is strided
-  bool bufferizeSend = !is_contiguous(ownedShape, ownedStride, ndims);
+  // use send buffer if owned data is strided or sending a subview
+  bool bufferizeSend = (!is_contiguous(ownedShape, ownedStride, ndims) ||
+                        bbTotCols != ownedTotCols);
+
+  // assert(!bufferizeSend);
   std::vector<int64_t> lBufferStart(nworkers * ndims, 0);
   std::vector<int64_t> lBufferSize(nworkers * ndims, 0);
   std::vector<int64_t> rBufferStart(nworkers * ndims, 0);
@@ -495,9 +500,9 @@ void _idtr_update_halo(DTypeId ddpttype, int64_t ndims, int64_t *ownedOff,
       auto globalRowStart = std::max(ownedRowStart, bRowStart);
       auto globalRowEnd = std::min(ownedRowEnd, bRowEnd);
       auto localRowStart = globalRowStart - ownedRowStart;
-      auto localStart = (int)(localRowStart)*ownedCols;
+      auto localStart = (int)(localRowStart)*ownedTotCols;
       auto nRows = globalRowEnd - globalRowStart;
-      auto nSend = (int)(nRows)*ownedCols;
+      auto nSend = (int)(nRows)*bbTotCols;
 
       if (i < myWorkerIndex) {
         // target is rightHalo
@@ -506,8 +511,8 @@ void _idtr_update_halo(DTypeId ddpttype, int64_t ndims, int64_t *ownedOff,
           rBufferStart[i * ndims] = localRowStart;
           rBufferSize[i * ndims] = nRows;
           for (auto j = 1; j < ndims; ++j) {
-            rBufferStart[i * ndims + j] = ownedOff[j];
-            rBufferSize[i * ndims + j] = ownedShape[j];
+            rBufferStart[i * ndims + j] = bbOff[j];
+            rBufferSize[i * ndims + j] = bbShape[j];
           }
         } else {
           rSendOff[i] = localStart;
@@ -521,8 +526,8 @@ void _idtr_update_halo(DTypeId ddpttype, int64_t ndims, int64_t *ownedOff,
           lBufferStart[i * ndims] = localRowStart;
           lBufferSize[i * ndims] = nRows;
           for (auto j = 1; j < ndims; ++j) {
-            lBufferStart[i * ndims + j] = ownedOff[j];
-            lBufferSize[i * ndims + j] = ownedShape[j];
+            lBufferStart[i * ndims + j] = bbOff[j];
+            lBufferSize[i * ndims + j] = bbShape[j];
           }
         } else {
           lSendOff[i] = localStart;
diff --git a/src/jit/mlir.cpp b/src/jit/mlir.cpp
@@ -431,7 +431,7 @@ JIT::createExecutionEngine(::mlir::ModuleOp &module) {
 static const char *pass_pipeline =
     getenv("DDPT_PASSES") ? getenv("DDPT_PASSES")
                           : "func.func(ptensor-dist),"
-                            // "func.func(dist-coalesce)," FIXME
+                            "func.func(dist-coalesce),"
                             "convert-dist-to-standard,"
                             "convert-ptensor-to-linalg,"
                             "canonicalize,"
@@ -452,7 +452,7 @@ static const char *pass_pipeline =
                             "func.func(linalg-detensorize),"
                             "func.func(tensor-bufferize),"
                             "func.func(finalizing-bufferize),"
-                            // "func.func(buffer-deallocation)," FIXME
+                            "func.func(buffer-deallocation),"
                             // "imex-remove-temporaries," FIXME
                             "func.func(convert-linalg-to-parallel-loops),"
                             "func.func(scf-parallel-loop-fusion),"
@@ -491,9 +491,10 @@ JIT::JIT()
   }
   // some verbosity
   if (_verbose) {
-    std::cerr << "pass pipeline: " << pass_pipeline << std::endl;
+    std::cerr << "DDPT_PASSES=\"" << pass_pipeline << "\"" << std::endl;
     // _pm.enableStatistics();
-    _pm.enableTiming();
+    if (_verbose > 2)
+      _pm.enableTiming();
     // if(_verbose > 1)
     //   _pm.dump();
     if (_verbose > 3)
diff --git a/test/test_setget.py b/test/test_setget.py
@@ -26,7 +26,7 @@ def doit(aapi):
     def test_setitem2(self):
         def doit(aapi):
             a = aapi.ones((16, 16), aapi.float64)
-            b = aapi.zeros((16, 16), aapi.float64)
+            b = aapi.fromfunction(lambda i, j: 10 * i + j, (16, 16), dtype=aapi.float64)
             a[1:8, 0:6] = b[0:7, 0:6]
             return a
 
@@ -35,12 +35,31 @@ def doit(aapi):
     def test_setitem3(self):
         def doit(aapi):
             a = aapi.ones((16, 16), aapi.float64)
-            b = aapi.zeros((16, 16), aapi.float64)
+            b = aapi.fromfunction(lambda i, j: 10 * i + j, (16, 16), dtype=aapi.float64)
             a[7:16:3, 4:10:2] = b[4:7, 10:16:2]
             return a
 
         assert runAndCompare(doit)
 
+    def test_setitem4(self):
+        # Note: test halo update without send buffer
+        def doit(aapi):
+            a = aapi.ones((16, 16), aapi.float64)
+            b = aapi.fromfunction(lambda i, j: 10 * i + j, (16, 16), dtype=aapi.float64)
+            a[7:16:3, 0:16] = b[4:7, 0:16]
+            return a
+
+        assert runAndCompare(doit)
+
+    def test_setitem5(self):
+        # Note: test assignment to one full local part
+        def doit(aapi):
+            a = aapi.fromfunction(lambda i, j: 10 * i + j, (16, 16), dtype=aapi.int64)
+            a[0:10, 4:11] = a[0:10, 4:11]
+            return a
+
+        assert runAndCompare(doit)
+
     def test_colon(self):
         a = dt.ones((16, 16), dt.float64)
         b = dt.zeros((16, 16), dt.float64)
diff --git a/test/utils.py b/test/utils.py
@@ -1,5 +1,8 @@
 import numpy
 import ddptensor
+from ddptensor.numpy import fromfunction
+
+ddptensor.fromfunction = fromfunction
 
 
 def runAndCompare(func, do_gather=True):

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-89b5d56c4774ddb82ab8f896c3d977c6edae267b`
	`1`	`+571f54577e2301c70033fef9a05b8a96fa841d2b`